mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-21 03:12:48 +02:00
Reservoir sampling optimizes selecting K random lines from large or unknown-sized input: http://en.wikipedia.org/wiki/Reservoir_sampling Note this also avoids reading any input when -n0 is specified. * src/shuf.c (main): Use reservoir-sampling when the number of output lines is known, and the input size is large or unknown. (input_size): A new function to get the input size for regular files. (read_input_reservoir_sampling): New function to read lines from input, keeping only K lines in memory, replacing lines with decreasing prob. (write_permuted_output_reservoir): New function to output reservoir. * tests/misc/shuf-reservoir.sh: An expensive_ test using valgrind to exercise the reservoir-sampling code. * tests/local.mk: Reference new test. * NEWS: Mention the improvement.
70 lines
2.4 KiB
Bash
Executable File
70 lines
2.4 KiB
Bash
Executable File
#!/bin/sh
|
|
# Exercise shuf's reservoir-sampling code
|
|
# NOTE:
|
|
# These tests do not check valid randomness,
|
|
# they just check memory allocation related code.
|
|
|
|
# Copyright (C) 2013 Free Software Foundation, Inc.
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
# Load the shared test framework.  "${srcdir=.}" assigns "." to srcdir when
# it is unset, so the script can also be run from the top of the tree.
. "${srcdir=.}/tests/init.sh"
# The helpers below are not defined in this file (presumably they come from
# the framework sourced above -- confirm there).
# NOTE(review): path_prepend_ presumably puts ./src at the front of PATH so
# that the freshly built shuf is the one under test.
path_prepend_ ./src
print_ver_ shuf
# NOTE(review): these two look like prerequisite gates (expensive-test opt-in
# and valgrind availability); they must run before any test body below.
expensive_
require_valgrind_
|
|
|
|
# Run "shuf" with a specific number of input lines and output lines,
# then check that the output contains the expected number of lines.
|
|
run_shuf_n()
{
  # Usage: run_shuf_n INPUT_LINES OUTPUT_LINES
  #   $1 - number of lines fed to shuf on stdin (produced by "seq $1")
  #   $2 - number of lines requested from shuf with -n
  # Returns 0 when shuf exits successfully under valgrind and its output
  # holds the expected count of distinct, all-numeric lines; 1 otherwise.
  # Calls framework_failure_ if the counting pipelines themselves fail.
  # Side effects: writes out_<$1>_<$2> in the current directory and sets the
  # (global) variables assigned below.
  INPUT_LINES="$1"
  OUTPUT_LINES="$2"
  OUT_FILE="out_${INPUT_LINES}_${OUTPUT_LINES}"

  # Critical memory-related bugs will cause a segfault here
  # (with varying numbers of input/output lines)
  seq "$INPUT_LINES" | valgrind --leak-check=full --error-exitcode=1 \
    shuf -n "$OUTPUT_LINES" -o "$OUT_FILE" || return 1

  # shuf cannot output more lines than it was given, so the expected count
  # is the smaller of the two arguments.
  EXPECTED_LINES="$OUTPUT_LINES"
  test "$INPUT_LINES" -lt "$OUTPUT_LINES" && EXPECTED_LINES="$INPUT_LINES"

  # There is no sure way to verify shuffled output (as it is random).
  # Ensure we have the correct number of all-numeric, non-duplicated lines.
  GOOD_LINES=$(grep '^[0-9][0-9]*$' "$OUT_FILE" |
                 sort -un | wc -l) || framework_failure_
  # Deliberately not named LINES: bash manages a variable of that name
  # itself (terminal height) and may overwrite it.
  TOTAL_LINES=$(wc -l < "$OUT_FILE") || framework_failure_

  # Both counts must match: GOOD_LINES catches duplicated or non-numeric
  # lines, TOTAL_LINES catches extra lines of any kind.
  test "$EXPECTED_LINES" -eq "$GOOD_LINES" || return 1
  test "$EXPECTED_LINES" -eq "$TOTAL_LINES" || return 1

  return 0
}
|
|
|
|
# Test multiple combinations of input lines and output lines
# (e.g. a small number of input lines with a large number of output lines,
# and vice versa).  Also, each reservoir allocation uses a 1024-line batch,
# so test 1023/1024/1025 and related values.
TEST_LINES="0 1 5 1023 1024 1025 3071 3072 3073"

# $TEST_LINES is intentionally unquoted so that it word-splits into the
# individual counts; every (input, output) pair is exercised.
for IN_N in $TEST_LINES; do
  for OUT_N in $TEST_LINES; do
    # Record the failure but keep going, so one run reports every failing
    # combination.
    run_shuf_n "$IN_N" "$OUT_N" || {
      fail=1
      echo "shuf-reservoir-sampling failed with IN_N=$IN_N OUT_N=$OUT_N" >&2
    }
  done
done

# NOTE(review): Exit is not defined in this file and $fail is only ever set
# to 1 here; presumably the test framework provides Exit and the initial
# value of fail -- confirm.
Exit $fail
|