mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-20 18:56:39 +02:00
wc: improve avx2 API
* src/wc.c: Use "#include <...>" for files not in the current dir. Include "wc.h" instead of declaring wc_lines_avx2 by hand. (wc_lines): New API, with no file name (no longer needed) and with a return struct rather than arg pointers. All uses changed. Use avx2_supported directly instead of using a function pointer. Exploit C99-style declarations after statements. Multiply by 15 rather than dividing; it’s faster and more accurate and cannot overflow here. (wc): Simplify based on wc_lines API change. * src/wc.h: New file. * src/wc_avx2.c: Include it, to check API better. (wc_lines_avx2): Use new API. All uses changed. Exploit C99. Make locals more local.
This commit is contained in:
118
src/wc.c
118
src/wc.c
@@ -25,16 +25,18 @@
|
||||
#include <sys/types.h>
|
||||
#include <uchar.h>
|
||||
|
||||
#include <assure.h>
|
||||
#include <argmatch.h>
|
||||
#include <argv-iter.h>
|
||||
#include <fadvise.h>
|
||||
#include <physmem.h>
|
||||
#include <readtokens0.h>
|
||||
#include <safe-read.h>
|
||||
#include <stat-size.h>
|
||||
#include <xbinary-io.h>
|
||||
|
||||
#include "system.h"
|
||||
#include "assure.h"
|
||||
#include "argmatch.h"
|
||||
#include "argv-iter.h"
|
||||
#include "fadvise.h"
|
||||
#include "physmem.h"
|
||||
#include "readtokens0.h"
|
||||
#include "safe-read.h"
|
||||
#include "stat-size.h"
|
||||
#include "xbinary-io.h"
|
||||
#include "wc.h"
|
||||
|
||||
/* The official name of this program (e.g., no 'g' prefix). */
|
||||
#define PROGRAM_NAME "wc"
|
||||
@@ -46,13 +48,6 @@
|
||||
/* Size of atomic reads. */
|
||||
#define BUFFER_SIZE (16 * 1024)
|
||||
|
||||
#ifdef USE_AVX2_WC_LINECOUNT
|
||||
/* From wc_avx2.c */
|
||||
extern bool
|
||||
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||
uintmax_t *bytes_out);
|
||||
#endif
|
||||
|
||||
static bool wc_isprint[UCHAR_MAX + 1];
|
||||
static bool wc_isspace[UCHAR_MAX + 1];
|
||||
|
||||
@@ -253,51 +248,44 @@ write_counts (uintmax_t lines,
|
||||
putchar ('\n');
|
||||
}
|
||||
|
||||
static bool
|
||||
wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
|
||||
/* Read FD and return a summary. */
|
||||
static struct wc_lines
|
||||
wc_lines (int fd)
|
||||
{
|
||||
size_t bytes_read;
|
||||
uintmax_t lines, bytes;
|
||||
char buf[BUFFER_SIZE + 1];
|
||||
#ifdef USE_AVX2_WC_LINECOUNT
|
||||
static signed char use_avx2;
|
||||
if (!use_avx2)
|
||||
use_avx2 = avx2_supported () ? 1 : -1;
|
||||
if (0 < use_avx2)
|
||||
return wc_lines_avx2 (fd);
|
||||
#endif
|
||||
|
||||
uintmax_t lines = 0, bytes = 0;
|
||||
bool long_lines = false;
|
||||
|
||||
if (!lines_out || !bytes_out)
|
||||
while (true)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
lines = bytes = 0;
|
||||
|
||||
while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
|
||||
{
|
||||
|
||||
if (bytes_read == SAFE_READ_ERROR)
|
||||
{
|
||||
error (0, errno, "%s", quotef (file));
|
||||
return false;
|
||||
}
|
||||
char buf[BUFFER_SIZE + 1];
|
||||
size_t bytes_read = safe_read (fd, buf, BUFFER_SIZE);
|
||||
if (! (0 < bytes_read && bytes_read <= BUFFER_SIZE))
|
||||
return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
|
||||
|
||||
bytes += bytes_read;
|
||||
|
||||
char *p = buf;
|
||||
char *end = buf + bytes_read;
|
||||
uintmax_t plines = lines;
|
||||
idx_t buflines = 0;
|
||||
|
||||
if (! long_lines)
|
||||
{
|
||||
/* Avoid function call overhead for shorter lines. */
|
||||
while (p != end)
|
||||
lines += *p++ == '\n';
|
||||
for (char *p = buf; p < end; p++)
|
||||
buflines += *p == '\n';
|
||||
}
|
||||
else
|
||||
{
|
||||
/* rawmemchr is more efficient with longer lines. */
|
||||
*end = '\n';
|
||||
while ((p = rawmemchr (p, '\n')) < end)
|
||||
{
|
||||
++p;
|
||||
++lines;
|
||||
}
|
||||
for (char *p = buf; (p = rawmemchr (p, '\n')) < end; p++)
|
||||
buflines++;
|
||||
}
|
||||
|
||||
/* If the average line length in the block is >= 15, then use
|
||||
@@ -306,16 +294,9 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
|
||||
FIXME: This line length was determined in 2015, on both
|
||||
x86_64 and ppc64, but it's worth re-evaluating in future with
|
||||
newer compilers, CPUs, or memchr() implementations etc. */
|
||||
if (lines - plines <= bytes_read / 15)
|
||||
long_lines = true;
|
||||
else
|
||||
long_lines = false;
|
||||
long_lines = 15 * buflines <= bytes_read;
|
||||
lines += buflines;
|
||||
}
|
||||
|
||||
*bytes_out = bytes;
|
||||
*lines_out = lines;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Count words. FILE_X is the name of the file (or null for standard
|
||||
@@ -325,7 +306,7 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
|
||||
static bool
|
||||
wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
{
|
||||
bool ok = true;
|
||||
int err = 0;
|
||||
char buf[BUFFER_SIZE + 1];
|
||||
size_t bytes_read;
|
||||
uintmax_t lines, words, chars, bytes, linelength;
|
||||
@@ -412,8 +393,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
{
|
||||
if (bytes_read == SAFE_READ_ERROR)
|
||||
{
|
||||
error (0, errno, "%s", quotef (file));
|
||||
ok = false;
|
||||
err = errno;
|
||||
break;
|
||||
}
|
||||
bytes += bytes_read;
|
||||
@@ -422,18 +402,12 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
}
|
||||
else if (!count_chars && !count_complicated)
|
||||
{
|
||||
#ifdef USE_AVX2_WC_LINECOUNT
|
||||
static bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *);
|
||||
if (!wc_lines_p)
|
||||
wc_lines_p = avx2_supported () ? wc_lines_avx2 : wc_lines;
|
||||
#else
|
||||
bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *)
|
||||
= wc_lines;
|
||||
#endif
|
||||
|
||||
/* Use a separate loop when counting only lines or lines and bytes --
|
||||
but not chars or words. */
|
||||
ok = wc_lines_p (file, fd, &lines, &bytes);
|
||||
struct wc_lines w = wc_lines (fd);
|
||||
err = w.err;
|
||||
lines = w.lines;
|
||||
bytes = w.bytes;
|
||||
}
|
||||
else if (MB_CUR_MAX > 1)
|
||||
{
|
||||
@@ -449,8 +423,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
char const *p;
|
||||
if (bytes_read == SAFE_READ_ERROR)
|
||||
{
|
||||
error (0, errno, "%s", quotef (file));
|
||||
ok = false;
|
||||
err = errno;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -583,8 +556,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
char const *p = buf;
|
||||
if (bytes_read == SAFE_READ_ERROR)
|
||||
{
|
||||
error (0, errno, "%s", quotef (file));
|
||||
ok = false;
|
||||
err = errno;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -646,7 +618,9 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
|
||||
if (linelength > max_line_length)
|
||||
max_line_length = linelength;
|
||||
|
||||
return ok;
|
||||
if (err)
|
||||
error (0, err, "%s", quotef (file));
|
||||
return !err;
|
||||
}
|
||||
|
||||
static bool
|
||||
|
||||
3
src/wc.h
Normal file
3
src/wc.h
Normal file
@@ -0,0 +1,3 @@
|
||||
#include <stdint.h>
|
||||
struct wc_lines { int err; uintmax_t lines; uintmax_t bytes; };
|
||||
struct wc_lines wc_lines_avx2 (int);
|
||||
@@ -16,6 +16,8 @@
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include "wc.h"
|
||||
|
||||
#include "system.h"
|
||||
#include "safe-read.h"
|
||||
|
||||
@@ -26,62 +28,43 @@
|
||||
so there is no single bytes in the optimal case. */
|
||||
#define BUFSIZE (16320)
|
||||
|
||||
extern bool
|
||||
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||
uintmax_t *bytes_out);
|
||||
|
||||
extern bool
|
||||
wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||
uintmax_t *bytes_out)
|
||||
/* Read FD and return a summary. */
|
||||
struct wc_lines
|
||||
wc_lines_avx2 (int fd)
|
||||
{
|
||||
__m256i accumulator;
|
||||
__m256i accumulator2;
|
||||
__m256i zeroes;
|
||||
__m256i endlines;
|
||||
__m256i avx_buf[BUFSIZE / sizeof (__m256i)];
|
||||
__m256i *datap;
|
||||
uintmax_t lines = 0;
|
||||
uintmax_t bytes = 0;
|
||||
size_t bytes_read = 0;
|
||||
|
||||
__m256i
|
||||
zeroes = _mm256_setzero_si256 (),
|
||||
endlines = _mm256_set1_epi8 ('\n');
|
||||
|
||||
if (!lines_out || !bytes_out)
|
||||
return false;
|
||||
|
||||
/* Using two parallel accumulators gave a good performance increase.
|
||||
Adding a third gave no additional benefit, at least on an
|
||||
Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
|
||||
execution engines it would be a win. */
|
||||
accumulator = _mm256_setzero_si256 ();
|
||||
accumulator2 = _mm256_setzero_si256 ();
|
||||
zeroes = _mm256_setzero_si256 ();
|
||||
endlines = _mm256_set1_epi8 ('\n');
|
||||
|
||||
while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
|
||||
while (true)
|
||||
{
|
||||
__m256i to_match;
|
||||
__m256i to_match2;
|
||||
__m256i matches;
|
||||
__m256i matches2;
|
||||
/* Using two parallel accumulators gave a good performance increase.
|
||||
Adding a third gave no additional benefit, at least on an
|
||||
Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
|
||||
execution engines it would be a win. */
|
||||
__m256i
|
||||
accumulator = _mm256_setzero_si256 (),
|
||||
accumulator2 = _mm256_setzero_si256 (),
|
||||
avx_buf[BUFSIZE / sizeof (__m256i)];
|
||||
|
||||
if (bytes_read == SAFE_READ_ERROR)
|
||||
{
|
||||
error (0, errno, "%s", quotef (file));
|
||||
return false;
|
||||
}
|
||||
size_t bytes_read = safe_read (fd, avx_buf, sizeof avx_buf);
|
||||
if (! (0 < bytes_read && bytes_read <= sizeof avx_buf))
|
||||
return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
|
||||
|
||||
bytes += bytes_read;
|
||||
|
||||
datap = avx_buf;
|
||||
char *end = ((char *)avx_buf) + bytes_read;
|
||||
__m256i *datap = avx_buf;
|
||||
|
||||
while (bytes_read >= 64)
|
||||
{
|
||||
to_match = _mm256_load_si256 (datap);
|
||||
to_match2 = _mm256_load_si256 (datap + 1);
|
||||
__m256i
|
||||
to_match = _mm256_load_si256 (datap),
|
||||
to_match2 = _mm256_load_si256 (datap + 1),
|
||||
matches = _mm256_cmpeq_epi8 (to_match, endlines),
|
||||
matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
|
||||
|
||||
matches = _mm256_cmpeq_epi8 (to_match, endlines);
|
||||
matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
|
||||
/* Compare will set each 8 bit integer in the register to 0xFF
|
||||
on match. When we subtract it the 8 bit accumulators
|
||||
will underflow, so this is equal to adding 1. */
|
||||
@@ -92,30 +75,22 @@ wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
|
||||
bytes_read -= 64;
|
||||
}
|
||||
|
||||
/* Horizontally add all 8 bit integers in the register,
|
||||
and then reset it */
|
||||
/* Horizontally add all 8 bit integers in the register. */
|
||||
accumulator = _mm256_sad_epu8 (accumulator, zeroes);
|
||||
lines += _mm256_extract_epi16 (accumulator, 0)
|
||||
+ _mm256_extract_epi16 (accumulator, 4)
|
||||
+ _mm256_extract_epi16 (accumulator, 8)
|
||||
+ _mm256_extract_epi16 (accumulator, 12);
|
||||
accumulator = _mm256_setzero_si256 ();
|
||||
|
||||
accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
|
||||
lines += _mm256_extract_epi16 (accumulator2, 0)
|
||||
+ _mm256_extract_epi16 (accumulator2, 4)
|
||||
+ _mm256_extract_epi16 (accumulator2, 8)
|
||||
+ _mm256_extract_epi16 (accumulator2, 12);
|
||||
accumulator2 = _mm256_setzero_si256 ();
|
||||
|
||||
/* Finish up any left over bytes */
|
||||
char *p = (char *)datap;
|
||||
while (p != end)
|
||||
lines += *p++ == '\n';
|
||||
char *end = (char *) datap + bytes_read;
|
||||
for (char *p = (char *) datap; p < end; p++)
|
||||
lines += *p == '\n';
|
||||
}
|
||||
|
||||
*lines_out = lines;
|
||||
*bytes_out = bytes;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user