1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-19 18:26:32 +02:00

split: support split -n on larger pipe input

* bootstrap.conf (gnulib_modules): Add free-posix, tmpfile.
* src/split.c (copy_to_tmpfile): New function.
(input_file_size): Use it to split larger files when sizes cannot
easily be determined via fstat or lseek.  See Bug#61386#235.
* tests/split/l-chunk.sh: Mark tests of /dev/zero as
very expensive since they exhaust /tmp.
This commit is contained in:
Paul Eggert
2023-03-07 12:58:12 -08:00
parent 35ac97e0d6
commit bb9dbcbbfd
5 changed files with 71 additions and 42 deletions

3
NEWS
View File

@@ -145,6 +145,9 @@ GNU coreutils NEWS -*- outline -*-
split now accepts options like '-n SIZE' that exceed machine integer
range, when they can be implemented as if they were infinity.
split -n now accepts piped input even when not in round-robin mode,
by first copying input to a temporary file to determine its size.
wc now accepts the --total={auto,never,always,only} option
to give explicit control over when the total is output.

View File

@@ -103,6 +103,7 @@ gnulib_modules="
fnmatch-gnu
fopen-safer
fprintftime
free-posix
freopen
freopen-safer
fseeko
@@ -270,6 +271,7 @@ gnulib_modules="
time_rz
timer-time
timespec
tmpfile
tzset
uname
unicodeio

View File

@@ -3409,8 +3409,10 @@ are not split even if they overlap a partition, the files written
can be larger or smaller than the partition size, and even empty
if a line/record is so long as to completely overlap the partition.
For @samp{r} mode, the size of @var{input} is irrelevant,
and so can be a pipe for example.
When the input is a pipe or some other special file where the size
cannot easily be determined, there is no trouble for @samp{r} mode
because the size of the input is irrelevant. For other modes, such an
input is first copied to a temporary file to determine its size.
@item -a @var{length}
@itemx --suffix-length=@var{length}

View File

@@ -275,6 +275,39 @@ CHUNKS may be:\n\
exit (status);
}
/* Copy the data in FD to a temporary file, then make that file FD.
   Use BUF, of size BUFSIZE, to copy.  Return the number of
   bytes copied, or -1 (setting errno) on error.

   This function does not rewind FD after the copy; a caller that
   wants to read the copied data back must reposition FD itself.

   On any failure after the temporary stream has been created, the
   stream is closed before returning so that it is not leaked, and
   errno still describes the failure that caused the return.  */
static off_t
copy_to_tmpfile (int fd, char *buf, idx_t bufsize)
{
  FILE *tmp = tmpfile ();
  if (!tmp)
    return -1;

  off_t copied = 0;
  off_t r;

  while (0 < (r = read (fd, buf, bufsize)))
    {
      /* A short write means the temporary file could not take the data.  */
      if (fwrite (buf, 1, r, tmp) != r)
        goto fail;

      /* The running total must remain representable as off_t.  */
      if (INT_ADD_WRAPV (copied, r, &copied))
        {
          errno = EOVERFLOW;
          goto fail;
        }
    }

  /* The loop ends with R == 0 at end of input, or R < 0 on read error.  */
  if (r < 0)
    goto fail;

  /* Make FD refer to the temporary file.  FD then keeps the file open
     after the stream itself is closed below.  */
  if (dup2 (fileno (tmp), fd) < 0)
    goto fail;

  /* fclose flushes any data still buffered in the stream, so its
     failure means the copy may be incomplete.  The stream is gone
     after fclose whether or not it succeeds, so do not close it again.  */
  if (fclose (tmp) < 0)
    return -1;

  return copied;

 fail:
  {
    /* Release the temporary stream without clobbering the errno value
       that explains the failure.  */
    int saved_errno = errno;
    fclose (tmp);
    errno = saved_errno;
    return -1;
  }
}
/* Return the number of bytes that can be read from FD with status ST.
Store up to the first BUFSIZE bytes of the file's data into BUF,
and advance the file position by the number of bytes read. On
@@ -293,49 +326,35 @@ input_file_size (int fd, struct stat const *st, char *buf, idx_t bufsize)
}
while (size < bufsize);
off_t cur = lseek (fd, 0, SEEK_CUR);
if (cur < 0)
off_t cur, end;
if ((usable_st_size (st) && st->st_size < size)
|| (cur = lseek (fd, 0, SEEK_CUR)) < 0
|| cur < size /* E.g., /dev/zero on GNU/Linux. */
|| (end = lseek (fd, 0, SEEK_END)) < 0)
{
if (errno == ESPIPE)
errno = 0; /* Suppress confusing seek error. */
return cur;
}
off_t end;
if (usable_st_size (st))
end = st->st_size;
else
{
end = lseek (fd, 0, SEEK_END);
char *tmpbuf = xmalloc (bufsize);
end = copy_to_tmpfile (fd, tmpbuf, bufsize);
free (tmpbuf);
if (end < 0)
return end;
if (end == OFF_T_MAX)
goto overflow; /* E.g., /dev/zero on GNU/Hurd. */
if (cur < end)
{
off_t cur1 = lseek (fd, cur, SEEK_SET);
if (cur1 < 0)
return cur1;
}
cur = 0;
}
/* Report overflow if we filled the buffer from a file with more
bytes than stat or lseek reports. This can happen with mutating
(e.g., /proc) files that are larger than the input block size.
FIXME: Handle this properly, e.g., by copying the growing file's
data into the first output file, and then splitting that output
file (which should not grow) into the other output files. */
if (end < size)
goto overflow;
if (end == OFF_T_MAX /* E.g., /dev/zero on GNU/Hurd. */
|| (cur < end && INT_ADD_WRAPV (size, end - cur, &size)))
{
errno = EOVERFLOW;
return -1;
}
if (cur < end && INT_ADD_WRAPV (size, end - cur, &size))
goto overflow;
if (cur < end)
{
off_t r = lseek (fd, cur, SEEK_SET);
if (r < 0)
return r;
}
return size;
overflow:
errno = EOVERFLOW;
return -1;
}
/* Compute the next sequential output file name and store it into the

View File

@@ -37,12 +37,15 @@ rm x??
# 'split' should reject any attempt to create an infinitely
# long output file.
returns_ 1 split -n l/2 /dev/zero || fail=1
rm x??
# This test is very expensive as it runs out of /tmp space.
if test "${RUN_VERY_EXPENSIVE_TESTS+set}" = set; then
returns_ 1 split -n l/2 /dev/zero || fail=1
rm x??
# Repeat the above, but with 1/2, not l/2:
returns_ 1 split -n 1/2 /dev/zero || fail=1
rm x??
# Repeat the above, but with 1/2, not l/2:
returns_ 1 split -n 1/2 /dev/zero || fail=1
rm x??
fi
# Ensure --elide-empty-files is honored
split -e -n l/10 /dev/null || fail=1