wc: improve avx2 API

* src/wc.c: Use "#include <...>" for files not in the current dir. Include "wc.h" instead of declaring wc_lines_avx2 by hand. (wc_lines): New API, with no file name (no longer needed) and with a return struct rather than arg pointers. All uses changed. Use avx2_supported directly instead of using a function pointer. Exploit C99-style declarations after statements. Multiply by 15 rather than dividing; it’s faster and more accurate and cannot overflow here. (wc): Simplify based on wc_lines API change. * src/wc.h: New file. * src/wc_avx2.c: Include it, to check API better. (wc_lines_avx2): Use new API. All uses changed. Exploit C99. Make locals more local.
2026-04-20 18:56:39 +02:00 · 2023-09-23 13:38:08 -07:00
parent 769ace51e8
commit 8d41285fe4
3 changed files with 79 additions and 127 deletions
--- a/src/wc.c
+++ b/src/wc.c
@@ -25,16 +25,18 @@
 #include <sys/types.h>
 #include <uchar.h>

+#include <assure.h>
+#include <argmatch.h>
+#include <argv-iter.h>
+#include <fadvise.h>
+#include <physmem.h>
+#include <readtokens0.h>
+#include <safe-read.h>
+#include <stat-size.h>
+#include <xbinary-io.h>
+
 #include "system.h"
-#include "assure.h"
-#include "argmatch.h"
-#include "argv-iter.h"
-#include "fadvise.h"
-#include "physmem.h"
-#include "readtokens0.h"
-#include "safe-read.h"
-#include "stat-size.h"
-#include "xbinary-io.h"
+#include "wc.h"

 /* The official name of this program (e.g., no 'g' prefix).  */
 #define PROGRAM_NAME "wc"
@@ -46,13 +48,6 @@
 /* Size of atomic reads. */
 #define BUFFER_SIZE (16 * 1024)

-#ifdef USE_AVX2_WC_LINECOUNT
-/* From wc_avx2.c */
-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
-               uintmax_t *bytes_out);
-#endif
-
 static bool wc_isprint[UCHAR_MAX + 1];
 static bool wc_isspace[UCHAR_MAX + 1];

@@ -253,51 +248,44 @@ write_counts (uintmax_t lines,
  putchar ('\n');
 }

-static bool
-wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+/* Read FD and return a summary.  */
+static struct wc_lines
+wc_lines (int fd)
 {
-  size_t bytes_read;
-  uintmax_t lines, bytes;
-  char buf[BUFFER_SIZE + 1];
+#ifdef USE_AVX2_WC_LINECOUNT
+  static signed char use_avx2;
+  if (!use_avx2)
+    use_avx2 = avx2_supported () ? 1 : -1;
+  if (0 < use_avx2)
+    return wc_lines_avx2 (fd);
+#endif
+
+  uintmax_t lines = 0, bytes = 0;
  bool long_lines = false;

-  if (!lines_out || !bytes_out)
+  while (true)
    {
-      return false;
-    }
-
-  lines = bytes = 0;
-
-  while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
-    {
-
-      if (bytes_read == SAFE_READ_ERROR)
-        {
-          error (0, errno, "%s", quotef (file));
-          return false;
-        }
+      char buf[BUFFER_SIZE + 1];
+      size_t bytes_read = safe_read (fd, buf, BUFFER_SIZE);
+      if (! (0 < bytes_read && bytes_read <= BUFFER_SIZE))
+        return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };

      bytes += bytes_read;
-
-      char *p = buf;
      char *end = buf + bytes_read;
-      uintmax_t plines = lines;
+      idx_t buflines = 0;

      if (! long_lines)
        {
          /* Avoid function call overhead for shorter lines.  */
-          while (p != end)
-            lines += *p++ == '\n';
+          for (char *p = buf; p < end; p++)
+            buflines += *p == '\n';
        }
      else
        {
          /* rawmemchr is more efficient with longer lines.  */
          *end = '\n';
-          while ((p = rawmemchr (p, '\n')) < end)
-            {
-              ++p;
-              ++lines;
-            }
+          for (char *p = buf; (p = rawmemchr (p, '\n')) < end; p++)
+            buflines++;
        }

      /* If the average line length in the block is >= 15, then use
@@ -306,16 +294,9 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
          FIXME: This line length was determined in 2015, on both
          x86_64 and ppc64, but it's worth re-evaluating in future with
          newer compilers, CPUs, or memchr() implementations etc.  */
-      if (lines - plines <= bytes_read / 15)
-        long_lines = true;
-      else
-        long_lines = false;
+      long_lines = 15 * buflines <= bytes_read;
+      lines += buflines;
    }
-
-  *bytes_out = bytes;
-  *lines_out = lines;
-
-  return true;
 }

 /* Count words.  FILE_X is the name of the file (or null for standard
@@ -325,7 +306,7 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
 static bool
 wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
 {
-  bool ok = true;
+  int err = 0;
  char buf[BUFFER_SIZE + 1];
  size_t bytes_read;
  uintmax_t lines, words, chars, bytes, linelength;
@@ -412,8 +393,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
            {
              if (bytes_read == SAFE_READ_ERROR)
                {
-                  error (0, errno, "%s", quotef (file));
-                  ok = false;
+                  err = errno;
                  break;
                }
              bytes += bytes_read;
@@ -422,18 +402,12 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
    }
  else if (!count_chars && !count_complicated)
    {
-#ifdef USE_AVX2_WC_LINECOUNT
-      static bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *);
-      if (!wc_lines_p)
-        wc_lines_p = avx2_supported () ? wc_lines_avx2 : wc_lines;
-#else
-      bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *)
-        = wc_lines;
-#endif
-
      /* Use a separate loop when counting only lines or lines and bytes --
         but not chars or words.  */
-      ok = wc_lines_p (file, fd, &lines, &bytes);
+      struct wc_lines w = wc_lines (fd);
+      err = w.err;
+      lines = w.lines;
+      bytes = w.bytes;
    }
  else if (MB_CUR_MAX > 1)
    {
@@ -449,8 +423,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
          char const *p;
          if (bytes_read == SAFE_READ_ERROR)
            {
-              error (0, errno, "%s", quotef (file));
-              ok = false;
+              err = errno;
              break;
            }

@@ -583,8 +556,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
          char const *p = buf;
          if (bytes_read == SAFE_READ_ERROR)
            {
-              error (0, errno, "%s", quotef (file));
-              ok = false;
+              err = errno;
              break;
            }

@@ -646,7 +618,9 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
  if (linelength > max_line_length)
    max_line_length = linelength;

-  return ok;
+  if (err)
+    error (0, err, "%s", quotef (file));
+  return !err;
 }

 static bool
--- a/src/wc.h
+++ b/src/wc.h
@@ -0,0 +1,3 @@
+#include <stdint.h>
+struct wc_lines { int err; uintmax_t lines; uintmax_t bytes; };
+struct wc_lines wc_lines_avx2 (int);
--- a/src/wc_avx2.c
+++ b/src/wc_avx2.c
@@ -16,6 +16,8 @@

 #include <config.h>

+#include "wc.h"
+
 #include "system.h"
 #include "safe-read.h"

@@ -26,62 +28,43 @@
   so there is no single bytes in the optimal case. */
 #define BUFSIZE (16320)

-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
-               uintmax_t *bytes_out);
-
-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
-               uintmax_t *bytes_out)
+/* Read FD and return a summary.  */
+struct wc_lines
+wc_lines_avx2 (int fd)
 {
-  __m256i accumulator;
-  __m256i accumulator2;
-  __m256i zeroes;
-  __m256i endlines;
-  __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
-  __m256i *datap;
  uintmax_t lines = 0;
  uintmax_t bytes = 0;
-  size_t bytes_read = 0;

+  __m256i
+    zeroes = _mm256_setzero_si256 (),
+    endlines = _mm256_set1_epi8 ('\n');

-  if (!lines_out || !bytes_out)
-    return false;
-
-  /* Using two parallel accumulators gave a good performance increase.
-     Adding a third gave no additional benefit, at least on an
-     Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
-     execution engines it would be a win. */
-  accumulator = _mm256_setzero_si256 ();
-  accumulator2 = _mm256_setzero_si256 ();
-  zeroes = _mm256_setzero_si256 ();
-  endlines = _mm256_set1_epi8 ('\n');
-
-  while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
+  while (true)
    {
-      __m256i to_match;
-      __m256i to_match2;
-      __m256i matches;
-      __m256i matches2;
+      /* Using two parallel accumulators gave a good performance increase.
+         Adding a third gave no additional benefit, at least on an
+         Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
+         execution engines it would be a win. */
+      __m256i
+        accumulator = _mm256_setzero_si256 (),
+        accumulator2 = _mm256_setzero_si256 (),
+        avx_buf[BUFSIZE / sizeof (__m256i)];

-      if (bytes_read == SAFE_READ_ERROR)
-        {
-          error (0, errno, "%s", quotef (file));
-          return false;
-        }
+      size_t bytes_read = safe_read (fd, avx_buf, sizeof avx_buf);
+      if (! (0 < bytes_read && bytes_read <= sizeof avx_buf))
+        return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };

      bytes += bytes_read;
-
-      datap = avx_buf;
-      char *end = ((char *)avx_buf) + bytes_read;
+      __m256i *datap = avx_buf;

      while (bytes_read >= 64)
        {
-          to_match = _mm256_load_si256 (datap);
-          to_match2 = _mm256_load_si256 (datap + 1);
+          __m256i
+            to_match = _mm256_load_si256 (datap),
+            to_match2 = _mm256_load_si256 (datap + 1),
+            matches = _mm256_cmpeq_epi8 (to_match, endlines),
+            matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);

-          matches = _mm256_cmpeq_epi8 (to_match, endlines);
-          matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
          /* Compare will set each 8 bit integer in the register to 0xFF
             on match.  When we subtract it the 8 bit accumulators
             will underflow, so this is equal to adding 1. */
@@ -92,30 +75,22 @@ wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
          bytes_read -= 64;
        }

-      /* Horizontally add all 8 bit integers in the register,
-         and then reset it */
+      /* Horizontally add all 8 bit integers in the register.  */
      accumulator = _mm256_sad_epu8 (accumulator, zeroes);
      lines +=   _mm256_extract_epi16 (accumulator, 0)
               + _mm256_extract_epi16 (accumulator, 4)
               + _mm256_extract_epi16 (accumulator, 8)
               + _mm256_extract_epi16 (accumulator, 12);
-      accumulator = _mm256_setzero_si256 ();

      accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
      lines +=   _mm256_extract_epi16 (accumulator2, 0)
               + _mm256_extract_epi16 (accumulator2, 4)
               + _mm256_extract_epi16 (accumulator2, 8)
               + _mm256_extract_epi16 (accumulator2, 12);
-      accumulator2 = _mm256_setzero_si256 ();

      /* Finish up any left over bytes */
-      char *p = (char *)datap;
-      while (p != end)
-        lines += *p++ == '\n';
+      char *end = (char *) datap + bytes_read;
+      for (char *p = (char *) datap; p < end; p++)
+        lines += *p == '\n';
    }
-
-  *lines_out = lines;
-  *bytes_out = bytes;
-
-  return true;
 }