mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-11 06:34:40 +02:00
cut: faster utf8 processing
TODO: improve to use bounded memory where possible
This commit is contained in:
131
src/cut.c
131
src/cut.c
@@ -122,6 +122,12 @@ static bool trim_outer_whitespace;
|
||||
/* If true, default the output delimiter to a single space. */
|
||||
static bool space_output_delimiter_default;
|
||||
|
||||
/* Buffer for record-oriented UTF-8 field splitting. */
|
||||
static char *line_buffer;
|
||||
|
||||
/* The number of bytes allocated for LINE_BUFFER. */
|
||||
static idx_t line_bufsize;
|
||||
|
||||
enum whitespace_option
|
||||
{
|
||||
WHITESPACE_OPTION_TRIMMED
|
||||
@@ -293,6 +299,26 @@ single_byte_field_delim_ok (void)
|
||||
&& (MB_CUR_MAX <= 1 || to_uchar (delim_bytes[0]) < 0x30);
|
||||
}
|
||||
|
||||
/* Return true if the current charset is UTF-8. */
|
||||
static bool
|
||||
is_utf8_charset (void)
|
||||
{
|
||||
static int is_utf8 = -1;
|
||||
if (is_utf8 == -1)
|
||||
{
|
||||
char32_t w;
|
||||
mbstate_t mbs = {0};
|
||||
is_utf8 = mbrtoc32 (&w, "\xe2\x9f\xb8", 3, &mbs) == 3 && w == 0x27F8;
|
||||
}
|
||||
return is_utf8;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
utf8_field_delim_ok (void)
|
||||
{
|
||||
return ! delim_mcel.err && is_utf8_charset ();
|
||||
}
|
||||
|
||||
static inline bool
|
||||
field_delim_eq (mcel_t g)
|
||||
{
|
||||
@@ -471,6 +497,44 @@ scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser,
|
||||
}
|
||||
}
|
||||
|
||||
/* Return a pointer to the next field delimiter in the UTF-8 record BUF,
|
||||
searching LEN bytes. Return NULL if none is found. DELIM_BYTES must
|
||||
represent a valid UTF-8 character.
|
||||
Like mbsmbchr() in numfmt but handles NUL bytes. */
|
||||
ATTRIBUTE_PURE
|
||||
static char *
|
||||
find_utf8_field_delim (char const *buf, size_t len)
|
||||
{
|
||||
unsigned char delim_0 = delim_bytes[0];
|
||||
if (delim_0 < 0x80)
|
||||
return memchr ((void *) buf, delim_0, len);
|
||||
|
||||
#if MEMMEM_IS_FASTER /* Surprisingly not on glibc-2.42 */
|
||||
return memmem (buf, len, delim_bytes, delim_length);
|
||||
#else
|
||||
char const *p = buf;
|
||||
char const *end = buf + len;
|
||||
|
||||
while (p < end)
|
||||
{
|
||||
char const *nul = memchr (p, '\0', end - p);
|
||||
if (!nul)
|
||||
return (char *) strstr (p, delim_bytes);
|
||||
|
||||
if (p < nul)
|
||||
{
|
||||
char *match = strstr (p, delim_bytes);
|
||||
if (match)
|
||||
return match;
|
||||
}
|
||||
|
||||
p = nul + 1;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void
|
||||
reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field,
|
||||
bool *have_pending_line, struct mbfield_parser *parser)
|
||||
@@ -711,6 +775,72 @@ cut_fields_mb (FILE *stream)
|
||||
cut_fields_mb_any (stream, false);
|
||||
}
|
||||
|
||||
/* Read from STREAM, printing to standard output any selected fields,
|
||||
using UTF-8-aware byte searches for the field delimiter. */
|
||||
|
||||
static void
|
||||
cut_fields_mb_utf8 (FILE *stream)
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
size_t line_bufsize_s = line_bufsize;
|
||||
ssize_t len = getndelim2 (&line_buffer, &line_bufsize_s, 0,
|
||||
GETNLINE_NO_LIMIT, line_delim, EOF, stream);
|
||||
line_bufsize = line_bufsize_s;
|
||||
|
||||
if (len < 0)
|
||||
return;
|
||||
|
||||
size_t n_bytes = len;
|
||||
bool have_line_delim = 0 < n_bytes && line_buffer[n_bytes - 1] == line_delim;
|
||||
size_t data_len = n_bytes - have_line_delim;
|
||||
char *field_start = line_buffer;
|
||||
char *field_end = find_utf8_field_delim (field_start, data_len);
|
||||
|
||||
if (!field_end)
|
||||
{
|
||||
if (!suppress_non_delimited)
|
||||
{
|
||||
write_bytes (line_buffer, data_len);
|
||||
write_line_delim ();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
uintmax_t field_idx = 1;
|
||||
bool found_any_selected_field = false;
|
||||
current_rp = frp;
|
||||
|
||||
while (true)
|
||||
{
|
||||
size_t field_len = (field_end
|
||||
? field_end - field_start
|
||||
: line_buffer + data_len - field_start);
|
||||
|
||||
if (print_kth (field_idx))
|
||||
{
|
||||
if (found_any_selected_field)
|
||||
write_bytes (output_delimiter_string, output_delimiter_length);
|
||||
write_bytes (field_start, field_len);
|
||||
found_any_selected_field = true;
|
||||
}
|
||||
|
||||
if (!field_end)
|
||||
break;
|
||||
|
||||
next_item (&field_idx);
|
||||
field_start = field_end + delim_length;
|
||||
field_end = find_utf8_field_delim (field_start,
|
||||
line_buffer + data_len
|
||||
- field_start);
|
||||
}
|
||||
|
||||
if (found_any_selected_field
|
||||
|| !(suppress_non_delimited && field_idx == 1))
|
||||
write_line_delim ();
|
||||
}
|
||||
}
|
||||
|
||||
/* Read from STREAM, printing to standard output any selected fields,
|
||||
using runs of whitespace as the field delimiter. */
|
||||
|
||||
@@ -1098,6 +1228,7 @@ main (int argc, char **argv)
|
||||
case CUT_MODE_FIELDS:
|
||||
cut_stream = whitespace_delimited ? cut_fields_ws
|
||||
: single_byte_field_delim_ok () ? cut_fields
|
||||
: utf8_field_delim_ok () ? cut_fields_mb_utf8
|
||||
: cut_fields_mb;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -315,6 +315,12 @@ if ($mb_locale ne 'C')
|
||||
['mb-delim-5', '-d', "\xa9", '-f2', # Different from coreutils-i18n
|
||||
{IN=>"A\xc3\xa9B\xa9C\n"}, {OUT=>"C\n"}, # (we don't split valid chars)
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-delim-6', '-d', "\xc3\xa9", '-f1,3',
|
||||
{IN=>"a\xc3\xa9b\xc3\xa9c"}, {OUT=>"a\xc3\xa9c\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-delim-7', '-d', "\xc3\xa9", '-f2',
|
||||
{IN=>"a\0b\xc3\xa9c\n"}, {OUT=>"c\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},
|
||||
|
||||
Reference in New Issue
Block a user