mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-02-10 17:31:56 +02:00
fold: move multi-byte character reading to a module
* gl/modules/mbbuf: New file. * gl/lib/mbbuf.c: Likewise. * gl/lib/mbbuf.h: Likewise. * gl/local.mk (EXTRA_DIST): Add the new files. * bootstrap.conf (gnulib_modules): Add mbbuf. * src/fold.c: Include mbbuf.h. (fold_file): Use the mbbuf functions instead of calling fread and handling the input buffer ourselves. * cfg.mk (exclude_file_name_regexp--sc_preprocessor_indentation) (exclude_file_name_regexp--sc_GPL_version): Match gl/lib/mbbuf.c and gl/lib/mbbuf.h.
This commit is contained in:
@@ -169,6 +169,7 @@ gnulib_modules="
|
||||
maintainer-makefile
|
||||
malloc-gnu
|
||||
manywarnings
|
||||
mbbuf
|
||||
mbrlen
|
||||
mbrtoc32
|
||||
mbrtowc
|
||||
|
||||
3
cfg.mk
3
cfg.mk
@@ -938,7 +938,7 @@ exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
|
||||
$(tbi_1)|$(tbi_2)|$(tbi_3)
|
||||
|
||||
exclude_file_name_regexp--sc_preprocessor_indentation = \
|
||||
^(gl/lib/rand-isaac\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll)
|
||||
^(gl/lib/(rand-isaac|mbbuf)\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll)
|
||||
exclude_file_name_regexp--sc_prohibit_stat_st_blocks = \
|
||||
^(src/system\.h|tests/du/2g\.sh)$$
|
||||
|
||||
@@ -999,3 +999,4 @@ csiwl_2 = kno,ois,afile,whats,hda,indx,ot,nam,ist
|
||||
codespell_ignore_words_list = $(csiwl_1),$(csiwl_2)
|
||||
exclude_file_name_regexp--sc_codespell = \
|
||||
^(THANKS\.in|tests/pr/.*(F|tn?|l(o|m|i)|bl))$$
|
||||
exclude_file_name_regexp--sc_GPL_version = ^(gl/lib/mbbuf\.[hc])$$
|
||||
|
||||
22
gl/lib/mbbuf.c
Normal file
22
gl/lib/mbbuf.c
Normal file
@@ -0,0 +1,22 @@
|
||||
/* Buffering for multi-byte characters.
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Written by Collin Funk. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#define MBBUF_INLINE _GL_EXTERN_INLINE
|
||||
#include "mbbuf.h"
|
||||
117
gl/lib/mbbuf.h
Normal file
117
gl/lib/mbbuf.h
Normal file
@@ -0,0 +1,117 @@
|
||||
/* Buffering for multi-byte characters.
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 2.1 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Written by Collin Funk. */
|
||||
|
||||
#ifndef _MBBUF_H
|
||||
#define _MBBUF_H 1
|
||||
|
||||
#ifndef _GL_INLINE_HEADER_BEGIN
|
||||
# error "Please include config.h first."
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#include "mcel.h"
|
||||
#include "idx.h"
|
||||
|
||||
_GL_INLINE_HEADER_BEGIN
|
||||
#ifndef MBBUF_INLINE
|
||||
# define MBBUF_INLINE _GL_INLINE
|
||||
#endif
|
||||
|
||||
/* End of file. */
|
||||
#define MBBUF_EOF UINT32_MAX
|
||||
|
||||
/* MBBUF_EOF should not be a valid character. */
|
||||
static_assert (MCEL_CHAR_MAX < MBBUF_EOF);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char *buffer; /* Input buffer, supplied by the caller of mbbuf_init. */
|
||||
FILE *fp; /* Input file stream that BUFFER is filled from. */
|
||||
idx_t size; /* Number of bytes allocated for BUFFER. */
|
||||
idx_t length; /* Number of bytes at the start of BUFFER holding data. */
|
||||
idx_t offset; /* Index in BUFFER of the next byte to be scanned. */
|
||||
} mbbuf_t;
|
||||
|
||||
/* Initialize MBBUF with an allocated BUFFER of SIZE bytes and a file stream
|
||||
FP open for reading. SIZE must be greater than or equal to MCEL_LEN_MAX.
|
||||
*/
|
||||
MBBUF_INLINE void
|
||||
mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp)
|
||||
{
|
||||
if (size < MCEL_LEN_MAX)
|
||||
unreachable ();
|
||||
mbbuf->buffer = buffer;
|
||||
mbbuf->fp = fp;
|
||||
mbbuf->size = size;
|
||||
mbbuf->length = 0;
|
||||
mbbuf->offset = 0;
|
||||
}
|
||||
|
||||
/* Get the next character in the buffer, filling it from FP if necessary.
|
||||
If an invalid multi-byte character is seen, we assume the program wants to
|
||||
fall back to the read byte. */
|
||||
MBBUF_INLINE mcel_t
|
||||
mbbuf_get_char (mbbuf_t *mbbuf)
|
||||
{
|
||||
idx_t available = mbbuf->length - mbbuf->offset;
|
||||
/* Check if we need to fill the input buffer. */
|
||||
if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp))
|
||||
{
|
||||
idx_t start;
|
||||
if (!(0 < available))
|
||||
start = 0;
|
||||
else
|
||||
{
|
||||
memmove (mbbuf->buffer, mbbuf->buffer + mbbuf->offset, available);
|
||||
start = available;
|
||||
}
|
||||
mbbuf->length = fread (mbbuf->buffer + start, 1, mbbuf->size - start,
|
||||
mbbuf->fp) + start;
|
||||
mbbuf->offset = 0;
|
||||
available = mbbuf->length - mbbuf->offset;
|
||||
}
|
||||
if (available <= 0)
|
||||
return (mcel_t) { .ch = MBBUF_EOF };
|
||||
mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset,
|
||||
mbbuf->buffer + mbbuf->length);
|
||||
if (! g.err)
|
||||
mbbuf->offset += g.len;
|
||||
else
|
||||
{
|
||||
/* Assume the program will emit the byte, but keep the error flag. */
|
||||
g.ch = mbbuf->buffer[mbbuf->offset++];
|
||||
g.len = 1;
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
/* Returns a pointer to the first byte in the previously read character from
|
||||
mbbuf_get_char. */
|
||||
MBBUF_INLINE char *
|
||||
mbbuf_char_offset (mbbuf_t *mbbuf, mcel_t g)
|
||||
{
|
||||
if (mbbuf->offset < g.len)
|
||||
unreachable ();
|
||||
return mbbuf->buffer + (mbbuf->offset - g.len);
|
||||
}
|
||||
|
||||
_GL_INLINE_HEADER_END
|
||||
|
||||
#endif
|
||||
@@ -30,6 +30,8 @@ gl/lib/fd-reopen.c \
|
||||
gl/lib/fd-reopen.h \
|
||||
gl/lib/heap.c \
|
||||
gl/lib/heap.h \
|
||||
gl/lib/mbbuf.c \
|
||||
gl/lib/mbbuf.h \
|
||||
gl/lib/rand-isaac.c \
|
||||
gl/lib/rand-isaac.h \
|
||||
gl/lib/randint.c \
|
||||
@@ -65,6 +67,7 @@ gl/modules/fadvise-tests \
|
||||
gl/modules/fd-reopen \
|
||||
gl/modules/heap \
|
||||
gl/modules/link-tests.diff \
|
||||
gl/modules/mbbuf \
|
||||
gl/modules/randint \
|
||||
gl/modules/randperm \
|
||||
gl/modules/randread \
|
||||
|
||||
27
gl/modules/mbbuf
Normal file
27
gl/modules/mbbuf
Normal file
@@ -0,0 +1,27 @@
|
||||
Description:
|
||||
Buffering for multi-byte characters.
|
||||
|
||||
Files:
|
||||
lib/mbbuf.c
|
||||
lib/mbbuf.h
|
||||
|
||||
Depends-on:
|
||||
c99
|
||||
extern-inline
|
||||
idx
|
||||
mcel
|
||||
stddef-h
|
||||
|
||||
configure.ac:
|
||||
|
||||
Makefile.am:
|
||||
lib_SOURCES += mbbuf.c mbbuf.h
|
||||
|
||||
Include:
|
||||
"mbbuf.h"
|
||||
|
||||
License:
|
||||
LGPLv2+
|
||||
|
||||
Maintainer:
|
||||
all
|
||||
167
src/fold.c
167
src/fold.c
@@ -27,6 +27,7 @@
|
||||
#include "fadvise.h"
|
||||
#include "ioblksize.h"
|
||||
#include "mcel.h"
|
||||
#include "mbbuf.h"
|
||||
#include "xdectoint.h"
|
||||
|
||||
#define TAB_WIDTH 8
|
||||
@@ -153,8 +154,7 @@ fold_file (char const *filename, size_t width)
|
||||
idx_t offset_out = 0; /* Index in 'line_out' for next char. */
|
||||
static char line_out[IO_BUFSIZE];
|
||||
static char line_in[IO_BUFSIZE];
|
||||
static size_t offset_in = 0;
|
||||
static size_t length_in = 0;
|
||||
mbbuf_t mbbuf;
|
||||
int saved_errno;
|
||||
|
||||
if (streq (filename, "-"))
|
||||
@@ -172,116 +172,87 @@ fold_file (char const *filename, size_t width)
|
||||
}
|
||||
|
||||
fadvise (istream, FADVISE_SEQUENTIAL);
|
||||
mbbuf_init (&mbbuf, line_in, sizeof line_in, istream);
|
||||
|
||||
while (0 < (length_in = fread (line_in + offset_in, 1,
|
||||
sizeof line_in - offset_in, istream))
|
||||
|| 0 < offset_in)
|
||||
mcel_t g;
|
||||
while ((g = mbbuf_get_char (&mbbuf)).ch != MBBUF_EOF)
|
||||
{
|
||||
char *p = line_in;
|
||||
char *lim = p + length_in + offset_in;
|
||||
mcel_t g;
|
||||
for (; p < lim; p += g.len)
|
||||
if (g.ch == '\n')
|
||||
{
|
||||
g = mcel_scan (p, lim);
|
||||
if (g.err)
|
||||
write_out (line_out, offset_out, /*newline=*/ true);
|
||||
column = offset_out = 0;
|
||||
continue;
|
||||
}
|
||||
rescan:
|
||||
column = adjust_column (column, g);
|
||||
|
||||
if (column > width)
|
||||
{
|
||||
/* This character would make the line too long.
|
||||
Print the line plus a newline, and make this character
|
||||
start the next line. */
|
||||
if (break_spaces)
|
||||
{
|
||||
/* Replace the character with the byte if it cannot be a
|
||||
truncated multibyte sequence. */
|
||||
if (!(lim - p <= MCEL_LEN_MAX) || length_in == 0)
|
||||
g.ch = p[0];
|
||||
else
|
||||
int space_length = 0;
|
||||
idx_t logical_end = offset_out;
|
||||
char *logical_p = line_out;
|
||||
char *logical_lim = logical_p + logical_end;
|
||||
|
||||
for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
|
||||
{
|
||||
/* It may be a truncated multibyte sequence. Move it to the
|
||||
front of the input buffer. */
|
||||
memmove (line_in, p, lim - p);
|
||||
offset_in = lim - p;
|
||||
goto next_line;
|
||||
g2 = mcel_scan (logical_p, logical_lim);
|
||||
if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
|
||||
{
|
||||
space_length = g2.len;
|
||||
logical_end = logical_p - line_out;
|
||||
}
|
||||
}
|
||||
|
||||
if (space_length)
|
||||
{
|
||||
logical_end += space_length;
|
||||
/* Found a blank. Don't output the part after it. */
|
||||
write_out (line_out, logical_end, /*newline=*/ true);
|
||||
/* Move the remainder to the beginning of the next line.
|
||||
The areas being copied here might overlap. */
|
||||
memmove (line_out, line_out + logical_end,
|
||||
offset_out - logical_end);
|
||||
offset_out -= logical_end;
|
||||
column = 0;
|
||||
char *printed_p = line_out;
|
||||
char *printed_lim = printed_p + offset_out;
|
||||
for (mcel_t g2; printed_p < printed_lim;
|
||||
printed_p += g2.len)
|
||||
{
|
||||
g2 = mcel_scan (printed_p, printed_lim);
|
||||
column = adjust_column (column, g2);
|
||||
}
|
||||
goto rescan;
|
||||
}
|
||||
}
|
||||
if (g.ch == '\n')
|
||||
|
||||
if (offset_out == 0)
|
||||
{
|
||||
write_out (line_out, offset_out, /*newline=*/ true);
|
||||
column = offset_out = 0;
|
||||
memcpy (line_out, mbbuf_char_offset (&mbbuf, g), g.len);
|
||||
offset_out += g.len;
|
||||
continue;
|
||||
}
|
||||
rescan:
|
||||
column = adjust_column (column, g);
|
||||
|
||||
if (column > width)
|
||||
{
|
||||
/* This character would make the line too long.
|
||||
Print the line plus a newline, and make this character
|
||||
start the next line. */
|
||||
if (break_spaces)
|
||||
{
|
||||
int space_length = 0;
|
||||
idx_t logical_end = offset_out;
|
||||
char *logical_p = line_out;
|
||||
char *logical_lim = logical_p + logical_end;
|
||||
|
||||
for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
|
||||
{
|
||||
g2 = mcel_scan (logical_p, logical_lim);
|
||||
if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
|
||||
{
|
||||
space_length = g2.len;
|
||||
logical_end = logical_p - line_out;
|
||||
}
|
||||
}
|
||||
|
||||
if (space_length)
|
||||
{
|
||||
logical_end += space_length;
|
||||
/* Found a blank. Don't output the part after it. */
|
||||
write_out (line_out, logical_end, /*newline=*/ true);
|
||||
/* Move the remainder to the beginning of the next line.
|
||||
The areas being copied here might overlap. */
|
||||
memmove (line_out, line_out + logical_end,
|
||||
offset_out - logical_end);
|
||||
offset_out -= logical_end;
|
||||
column = 0;
|
||||
char *printed_p = line_out;
|
||||
char *printed_lim = printed_p + offset_out;
|
||||
for (mcel_t g2; printed_p < printed_lim;
|
||||
printed_p += g2.len)
|
||||
{
|
||||
g2 = mcel_scan (printed_p, printed_lim);
|
||||
column = adjust_column (column, g2);
|
||||
}
|
||||
goto rescan;
|
||||
}
|
||||
}
|
||||
|
||||
if (offset_out == 0)
|
||||
{
|
||||
memcpy (line_out, p, g.len);
|
||||
offset_out += g.len;
|
||||
continue;
|
||||
}
|
||||
|
||||
write_out (line_out, offset_out, /*newline=*/ true);
|
||||
column = offset_out = 0;
|
||||
goto rescan;
|
||||
}
|
||||
|
||||
/* This can occur if we have read characters with a width of
|
||||
zero. */
|
||||
if (sizeof line_out <= offset_out + g.len)
|
||||
{
|
||||
write_out (line_out, offset_out, /*newline=*/ false);
|
||||
offset_out = 0;
|
||||
}
|
||||
|
||||
memcpy (line_out + offset_out, p, g.len);
|
||||
offset_out += g.len;
|
||||
write_out (line_out, offset_out, /*newline=*/ true);
|
||||
column = offset_out = 0;
|
||||
goto rescan;
|
||||
}
|
||||
if (feof (istream))
|
||||
break;
|
||||
|
||||
/* We read a full buffer of complete characters. */
|
||||
offset_in = 0;
|
||||
/* This can occur if we have read characters with a width of
|
||||
zero. */
|
||||
if (sizeof line_out <= offset_out + g.len)
|
||||
{
|
||||
write_out (line_out, offset_out, /*newline=*/ false);
|
||||
offset_out = 0;
|
||||
}
|
||||
|
||||
next_line:;
|
||||
memcpy (line_out + offset_out, mbbuf_char_offset (&mbbuf, g), g.len);
|
||||
offset_out += g.len;
|
||||
}
|
||||
|
||||
saved_errno = errno;
|
||||
|
||||
Reference in New Issue
Block a user