1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-18 17:56:54 +02:00

split: split more evenly with -n

* src/split.c (bytes_split): New arg REM_BYTES.
Use this to split more evenly.  All callers changed.
(lines_chunk_split, bytes_chunk_extract):
Be consistent with new bytes_split.
* tests/split/b-chunk.sh, tests/split/l-chunk.sh: Test new behavior.
This commit is contained in:
Paul Eggert
2023-03-04 11:41:03 -08:00
parent 0d997e18b9
commit fb6fc7f3ce
5 changed files with 67 additions and 52 deletions

5
NEWS
View File

@@ -106,6 +106,11 @@ GNU coreutils NEWS -*- outline -*-
internal errors it would exit with status 1, which was less distinguishable
from errors from the invoked command.
'split -n N' now splits more evenly when the input size is not a
multiple of N, by creating N output files whose sizes differ by at
most 1 byte. Formerly, it did this only when the input size was
less than N.
'stat -c %s' now prints sizes as unsigned, consistent with 'ls'.
** New Features

View File

@@ -3393,8 +3393,8 @@ r/@var{n} like @samp{l} but use round robin distribution
r/@var{k}/@var{n} likewise but output only @var{k}th of @var{n} to stdout
@end example
Any excess bytes remaining after dividing the @var{input}
into @var{n} chunks, are assigned to the last chunk.
If the input size is not a multiple of @var{n}, early output files are
one byte longer than later output files, to make up the difference.
Any excess bytes appearing after the initial calculation are discarded
(except when using @samp{r} mode).
@@ -3402,8 +3402,8 @@ All @var{n} files are created even if there are fewer than @var{n} lines,
or the @var{input} is truncated.
For @samp{l} mode, chunks are approximately @var{input} size / @var{n}.
The @var{input} is partitioned into @var{n} equal sized portions, with
the last assigned any excess. If a line @emph{starts} within a partition
Although the @var{input} is still partitioned as before into @var{n} regions
of approximately equal size, if a line @emph{starts} within a partition
it is written completely to the corresponding file. Since lines or records
are not split even if they overlap a partition, the files written
can be larger or smaller than the partition size, and even empty

View File

@@ -619,21 +619,23 @@ cwrite (bool new_file_flag, char const *bp, size_t bytes)
}
/* Split into pieces of exactly N_BYTES bytes.
However, the first REM_BYTES pieces should be 1 byte longer.
Use buffer BUF, whose size is BUFSIZE.
BUF contains the first INITIAL_READ input bytes. */
static void
bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
bytes_split (uintmax_t n_bytes, uintmax_t rem_bytes,
char *buf, size_t bufsize, size_t initial_read,
uintmax_t max_files)
{
size_t n_read;
bool new_file_flag = true;
bool filter_ok = true;
uintmax_t to_write = n_bytes;
uintmax_t opened = 0;
bool eof;
uintmax_t to_write = n_bytes + (0 < rem_bytes);
bool eof = ! to_write;
do
while (! eof)
{
if (initial_read != SIZE_MAX)
{
@@ -646,7 +648,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
if (! filter_ok
&& lseek (STDIN_FILENO, to_write, SEEK_CUR) != -1)
{
to_write = n_bytes;
to_write = n_bytes + (opened + 1 < rem_bytes);
new_file_flag = true;
}
@@ -656,7 +658,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
eof = n_read == 0;
}
char *bp_out = buf;
while (to_write <= n_read)
while (0 < to_write && to_write <= n_read)
{
if (filter_ok || new_file_flag)
filter_ok = cwrite (new_file_flag, bp_out, to_write);
@@ -671,7 +673,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
}
bp_out += to_write;
n_read -= to_write;
to_write = n_bytes;
to_write = n_bytes + (opened < rem_bytes);
}
if (n_read != 0)
{
@@ -687,7 +689,6 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
to_write -= n_read;
}
}
while (! eof);
/* Ensure NUMBER files are created, which truncates
any existing files or notifies any consumers on fifos.
@@ -864,19 +865,20 @@ static void
lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
size_t initial_read, off_t file_size)
{
assert (n && k <= n && n <= file_size);
assert (n && k <= n);
const off_t chunk_size = file_size / n;
uintmax_t rem_bytes = file_size % n;
off_t chunk_size = file_size / n;
uintmax_t chunk_no = 1;
off_t chunk_end = chunk_size;
off_t chunk_end = chunk_size + (0 < rem_bytes);
off_t n_written = 0;
bool new_file_flag = true;
bool chunk_truncated = false;
if (k > 1)
if (k > 1 && 0 < file_size)
{
/* Start reading 1 byte before kth chunk of file. */
off_t start = (k - 1) * chunk_size - 1;
off_t start = (k - 1) * chunk_size + MIN (k - 1, rem_bytes) - 1;
if (start < initial_read)
{
memmove (buf, buf + start, initial_read - start);
@@ -890,7 +892,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
}
n_written = start;
chunk_no = k - 1;
chunk_end = chunk_no * chunk_size;
chunk_end = start + 1;
}
while (n_written < file_size)
@@ -904,13 +906,13 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
}
else
{
n_read = safe_read (STDIN_FILENO, buf, bufsize);
n_read = safe_read (STDIN_FILENO, buf,
MIN (bufsize, file_size - n_written));
if (n_read == SAFE_READ_ERROR)
die (EXIT_FAILURE, errno, "%s", quotef (infile));
}
if (n_read == 0)
break; /* eof. */
n_read = MIN (n_read, file_size - n_written);
chunk_truncated = false;
eob = buf + n_read;
@@ -956,13 +958,10 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
chunk_truncated = true;
break;
}
chunk_no++;
if (k && chunk_no > k)
if (k == chunk_no)
return;
if (chunk_no == n)
chunk_end = file_size; /* >= chunk_size. */
else
chunk_end += chunk_size;
chunk_end += chunk_size + (chunk_no < rem_bytes);
chunk_no++;
if (chunk_end <= n_written)
{
if (! k)
@@ -994,10 +993,10 @@ bytes_chunk_extract (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
off_t start;
off_t end;
assert (k && n && k <= n && n <= file_size);
assert (0 < k && k <= n);
start = (k - 1) * (file_size / n);
end = (k == n) ? file_size : k * (file_size / n);
start = (k - 1) * (file_size / n) + MIN (k - 1, file_size % n);
end = k == n ? file_size : k * (file_size / n) + MIN (k, file_size % n);
if (start < initial_read)
{
@@ -1607,9 +1606,6 @@ main (int argc, char **argv)
_("invalid number of chunks"),
quote (umaxtostr (n_units, buffer)));
}
/* increase file_size to n_units here, so that we still process
any input data, and create empty files for the rest. */
file_size = MAX (file_size, n_units);
}
/* When filtering, closure of one pipe must not terminate the process,
@@ -1632,7 +1628,7 @@ main (int argc, char **argv)
break;
case type_bytes:
bytes_split (n_units, buf, in_blk_size, SIZE_MAX, 0);
bytes_split (n_units, 0, buf, in_blk_size, SIZE_MAX, 0);
break;
case type_byteslines:
@@ -1641,8 +1637,8 @@ main (int argc, char **argv)
case type_chunk_bytes:
if (k_units == 0)
bytes_split (file_size / n_units, buf, in_blk_size, initial_read,
n_units);
bytes_split (file_size / n_units, file_size % n_units,
buf, in_blk_size, initial_read, n_units);
else
bytes_chunk_extract (k_units, n_units, buf, in_blk_size, initial_read,
file_size);

View File

@@ -25,6 +25,24 @@ split -n 10 /dev/null || fail=1
test "$(stat -c %s x* | uniq -c | sed 's/^ *//; s/ /x/')" = "10x0" || fail=1
rm -f x??
printf 'abc' > abc || framework_failure_
printf 'a' > exp-a || framework_failure_
printf 'b' > exp-b || framework_failure_
printf 'c' > exp-c || framework_failure_
printf 'ab' > exp-ab || framework_failure_
split -n 4 abc || fail=1
compare exp-a xaa || fail=1
compare exp-b xab || fail=1
compare exp-c xac || fail=1
compare /dev/null xad || fail=1
test ! -f xae || fail=1
rm -f x??
split -n 2 abc || fail=1
compare exp-ab xaa || fail=1
compare exp-c xab || fail=1
test ! -f xac || fail=1
rm -f x??
# When extracting K of N where N > file size
# no data is extracted, and no files are written
split -n 2/3 /dev/null || fail=1
@@ -35,9 +53,9 @@ split -e -n 10 /dev/null || fail=1
returns_ 1 stat x?? 2>/dev/null || fail=1
printf '1\n2\n3\n4\n5\n' > input || framework_failure_
printf '1\n2' > exp-1 || framework_failure_
printf '\n3\n' > exp-2 || framework_failure_
printf '4\n5\n' > exp-3 || framework_failure_
printf '1\n2\n' > exp-1 || framework_failure_
printf '3\n4' > exp-2 || framework_failure_
printf '\n5\n' > exp-3 || framework_failure_
for file in input /proc/version /sys/kernel/profiling; do
test -f $file || continue

View File

@@ -59,11 +59,11 @@ sed "s/': .*/'/" < err.t > err || framework_failure_
compare exp err || fail=1
printf '%s' "\
14 16 09 15 16 10
14 16 16 08 16 10
14 08 08 10 14 08 08 10
06 08 08 02 06 08 08 02 06 08 08 10
06 08 02 06 08 00 08 02 06 08 02 06 08 00 10
06 00 08 00 02 06 00 02 06 00 08 00 01 07 00 02 06 00 08 00 02 16
08 06 08 08 08 08 08 02 06 08 08 02
06 08 08 02 06 08 02 06 08 02 06 08 00 08 02
06 02 06 02 06 02 06 02 06 02 06 02 06 02 06 00 08 00 02 06 00 02
" > exp || framework_failure_
sed 's/00 *//g' exp > exp.elide_empty || framework_failure_
@@ -120,17 +120,13 @@ test "$DEBUGGING" && test "$VERBOSE" && set -x
# Check extraction of particular chunks
> out
printf '1\n12345\n' > exp
split -n l/13/15 in > out
split -n l/13/15 in > out &&
compare /dev/null out || fail=1
printf '1\n12345\n' > exp || framework_failure_
split -n l/14/15 in > out &&
compare exp out || fail=1
> out
printf '' > exp
split -n l/14/15 in > out
compare exp out || fail=1
> out
printf '1\n12345\n1\n' > exp
split -n l/15/15 in > out
printf '1\n' > exp || framework_failure_
split -n l/15/15 in > out &&
compare exp out || fail=1
# test input with no \n at end