* bootstrap.conf (gnulib_modules): Add findprog-in.
* src/sort.c: Include findprog.h.
(pipe_child): Look up the compress_program in $PATH and report errors
such as ENOENT or EACCES before invoking posix_spawnp.  This avoids
inconsistency on systems that emulate posix_spawn through fork/exec,
as they would otherwise treat such a failure as a generic failure and
fail the sort, rather than continuing without compression.
/* sort - sort lines of text (with all kinds of options).
   Copyright (C) 1988-2025 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.

   Written December 1988 by Mike Haertel.
   The author may be reached (Email) at the address mike@gnu.ai.mit.edu,
   or (US mail) as Mike Haertel c/o Free Software Foundation.

   Ørn E. Hansen added NLS support in 1997.  */

#include <config.h>

#include <ctype.h>
#include <getopt.h>
#include <pthread.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <spawn.h>
#include "system.h"
#include "argmatch.h"
#include "assure.h"
#include "c-ctype.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "findprog.h"
#include "flexmember.h"
#include "hard-locale.h"
#include "hash.h"
#include "heap.h"
#include "ignore-value.h"
#include "mbswidth.h"
#include "nproc.h"
#include "physmem.h"
#include "posixver.h"
#include "quote.h"
#include "randread.h"
#include "readtokens0.h"
#include "stdlib--.h"
#include "strnumcmp.h"
#include "xmemcoll.h"
#include "xnanosleep.h"
#include "xstrtol.h"
#include "xstrtol-error.h"

#ifndef RLIMIT_DATA
struct rlimit { size_t rlim_cur; };
# define getrlimit(Resource, Rlp) (-1)
#endif

/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "sort"

#define AUTHORS \
  proper_name ("Mike Haertel"), \
  proper_name ("Paul Eggert")

#if HAVE_LANGINFO_CODESET
# include <langinfo.h>
#endif

/* Use SA_NOCLDSTOP as a proxy for whether the sigaction machinery is
   present.  */
#ifndef SA_NOCLDSTOP
# define SA_NOCLDSTOP 0
/* No sigprocmask.  Always 'return' zero.  */
# define sigprocmask(How, Set, Oset) (0)
# define sigset_t int
# if ! HAVE_SIGINTERRUPT
#  define siginterrupt(sig, flag) /* empty */
# endif
#endif

#if !defined OPEN_MAX && defined NR_OPEN
# define OPEN_MAX NR_OPEN
#endif
#if !defined OPEN_MAX
# define OPEN_MAX 20
#endif

#define UCHAR_LIM (UCHAR_MAX + 1)

#ifndef DEFAULT_TMPDIR
# define DEFAULT_TMPDIR "/tmp"
#endif

/* Maximum number of lines to merge every time a NODE is taken from
   the merge queue.  Node is at LEVEL in the binary merge tree,
   and is responsible for merging TOTAL lines.  */
#define MAX_MERGE(total, level) (((total) >> (2 * ((level) + 1))) + 1)
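/* Worked example (illustrative numbers, not from the original sources):
   a node at level 1 responsible for TOTAL = 1000000 lines merges at most
   (1000000 >> 4) + 1 = 62501 lines each time it is taken from the queue;
   at level 2 the cap drops to (1000000 >> 6) + 1 = 15626.  */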

/* Heuristic value for the number of lines for which it is worth creating
   a subthread, during an internal merge sort.  I.e., it is a small number
   of "average" lines for which sorting via two threads is faster than
   sorting via one on an "average" system.  On a dual-core 2.0 GHz i686
   system with 3GB of RAM and 2MB of L2 cache, a file containing 128K
   lines of gensort -a output is sorted slightly faster with --parallel=2
   than with --parallel=1.  By contrast, using --parallel=1 is about 10%
   faster than using --parallel=2 with a 64K-line input.  */
enum { SUBTHREAD_LINES_HEURISTIC = 128 * 1024 };
static_assert (4 <= SUBTHREAD_LINES_HEURISTIC);

/* The number of threads after which there are
   diminishing performance gains.  */
enum { DEFAULT_MAX_THREADS = 8 };

/* Exit statuses.  */
enum
{
  /* POSIX says to exit with status 1 if invoked with -c and the
     input is not properly sorted.  */
  SORT_OUT_OF_ORDER = 1,

  /* POSIX says any other irregular exit must exit with a status
     code greater than 1.  */
  SORT_FAILURE = 2
};

enum
{
  /* The number of times we should try to spawn a compression process
     (we retry if the posix_spawnp call fails with EAGAIN).  We don't _need_
     to compress temp files, this is just to reduce file system access, so
     this number can be small.  Each retry doubles in duration.  */
  MAX_TRIES_COMPRESS = 4,

  /* The number of times we should try to spawn a decompression process.
     If we can't spawn a decompression process, we can't sort, so this
     number should be big.  Each retry doubles in duration.  */
  MAX_TRIES_DECOMPRESS = 9
};

enum
{
  /* Level of the end-of-merge node, one level above the root.  */
  MERGE_END = 0,

  /* Level of the root node in merge tree.  */
  MERGE_ROOT = 1
};

/* The representation of the decimal point in the current locale.  */
static char decimal_point;

/* Thousands separator; if outside char range, there is no separator.  */
static int thousands_sep;
/* We currently ignore multi-byte grouping chars.  */
static bool thousands_sep_ignored;

/* Nonzero if the corresponding locales are hard.  */
static bool hard_LC_COLLATE;
#if HAVE_NL_LANGINFO
static bool hard_LC_TIME;
#endif

#define NONZERO(x) ((x) != 0)

/* The kind of blanks for '-b' to skip in various options.  */
enum blanktype { bl_start, bl_end, bl_both };

/* The character marking end of line.  Default to \n.  */
static char eolchar = '\n';

/* Lines are held in memory as counted strings.  */
struct line
{
  char *text;            /* Text of the line.  */
  size_t length;         /* Length including final newline.  */
  char *keybeg;          /* Start of first key.  */
  char *keylim;          /* Limit of first key.  */
};

/* Input buffers.  */
struct buffer
{
  char *buf;             /* Dynamically allocated buffer,
                            partitioned into 3 regions:
                            - input data;
                            - unused area;
                            - an array of lines, in reverse order.  */
  size_t used;           /* Number of bytes used for input data.  */
  size_t nlines;         /* Number of lines in the line array.  */
  idx_t alloc;           /* Number of bytes allocated.  */
  size_t left;           /* Number of bytes left from previous reads.  */
  size_t line_bytes;     /* Number of bytes to reserve for each line.  */
  bool eof;              /* An EOF has been read.  */
};

/* Sort key.  */
struct keyfield
{
  size_t sword;          /* Zero-origin 'word' to start at.  */
  size_t schar;          /* Additional characters to skip.  */
  size_t eword;          /* Zero-origin last 'word' of key.  */
  size_t echar;          /* Additional characters in field.  */
  bool const *ignore;    /* Boolean array of characters to ignore.  */
  char const *translate; /* Translation applied to characters.  */
  bool skipsblanks;      /* Skip leading blanks when finding start.  */
  bool skipeblanks;      /* Skip leading blanks when finding end.  */
  bool numeric;          /* Flag for numeric comparison.  Handle
                            strings of digits with optional decimal
                            point, but no exponential notation.  */
  bool random;           /* Sort by random hash of key.  */
  bool general_numeric;  /* Flag for general, numeric comparison.
                            Handle numbers in exponential notation.  */
  bool human_numeric;    /* Flag for sorting by human readable
                            units with either SI or IEC prefixes.  */
  bool month;            /* Flag for comparison by month name.  */
  bool reverse;          /* Reverse the sense of comparison.  */
  bool version;          /* Sort by version number.  */
  bool traditional_used; /* Traditional key option format is used.  */
  struct keyfield *next; /* Next keyfield to try.  */
};

struct month
{
  char const *name;
  int val;
};

/* Binary merge tree node.  */
struct merge_node
{
  struct line *lo;              /* Lines to merge from LO child node.  */
  struct line *hi;              /* Lines to merge from HI child node.  */
  struct line *end_lo;          /* End of available lines from LO.  */
  struct line *end_hi;          /* End of available lines from HI.  */
  struct line **dest;           /* Pointer to destination of merge.  */
  size_t nlo;                   /* Total Lines remaining from LO.  */
  size_t nhi;                   /* Total lines remaining from HI.  */
  struct merge_node *parent;    /* Parent node.  */
  struct merge_node *lo_child;  /* LO child node.  */
  struct merge_node *hi_child;  /* HI child node.  */
  unsigned int level;           /* Level in merge tree.  */
  bool queued;                  /* Node is already in heap.  */
  pthread_mutex_t lock;         /* Lock for node operations.  */
};

/* Priority queue of merge nodes.  */
struct merge_node_queue
{
  struct heap *priority_queue;  /* Priority queue of merge tree nodes.  */
  pthread_mutex_t mutex;        /* Lock for queue operations.  */
  pthread_cond_t cond;          /* Conditional wait for empty queue to populate
                                   when popping.  */
};

/* Used to implement --unique (-u).  */
static struct line saved_line;

/* FIXME: None of these tables work with multibyte character sets.
   Also, there are many other bugs when handling multibyte characters.
   One way to fix this is to rewrite 'sort' to use wide characters
   internally, but doing this with good performance is a bit
   tricky.  */

/* Table of blanks.  */
static bool blanks[UCHAR_LIM];

/* Table of non-printing characters.  */
static bool nonprinting[UCHAR_LIM];

/* Table of non-dictionary characters (not letters, digits, or blanks).  */
static bool nondictionary[UCHAR_LIM];

/* Translation table folding lower case to upper.  */
static char fold_toupper[UCHAR_LIM];

#define MONTHS_PER_YEAR 12

/* Table mapping month names to integers.
   Alphabetic order allows binary search.  */
static struct month monthtab[] =
{
  {"APR", 4},
  {"AUG", 8},
  {"DEC", 12},
  {"FEB", 2},
  {"JAN", 1},
  {"JUL", 7},
  {"JUN", 6},
  {"MAR", 3},
  {"MAY", 5},
  {"NOV", 11},
  {"OCT", 10},
  {"SEP", 9}
};

/* During the merge phase, the number of files to merge at once.  */
#define NMERGE_DEFAULT 16

/* Minimum size for a merge or check buffer.  */
#define MIN_MERGE_BUFFER_SIZE (2 + sizeof (struct line))

/* Minimum sort size; the code might not work with smaller sizes.  */
#define MIN_SORT_SIZE (nmerge * MIN_MERGE_BUFFER_SIZE)

/* The number of bytes needed for a merge or check buffer, which can
   function relatively efficiently even if it holds only one line.  If
   a longer line is seen, this value is increased.  */
static size_t merge_buffer_size = MAX (MIN_MERGE_BUFFER_SIZE, 256 * 1024);

/* The approximate maximum number of bytes of main memory to use, as
   specified by the user.  Zero if the user has not specified a size.  */
static size_t sort_size;

/* The initial allocation factor for non-regular files.
   This is used, e.g., when reading from a pipe.
   Don't make it too big, since it is multiplied by ~130 to
   obtain the size of the actual buffer sort will allocate.
   Also, there may be 8 threads all doing this at the same time.  */
#define INPUT_FILE_SIZE_GUESS (128 * 1024)

/* Array of directory names in which any temporary files are to be created.  */
static char const **temp_dirs;

/* Number of temporary directory names used.  */
static idx_t temp_dir_count;

/* Number of allocated slots in temp_dirs.  */
static idx_t temp_dir_alloc;

/* Flag to reverse the order of all comparisons.  */
static bool reverse;

/* Flag for stable sort.  This turns off the last ditch bytewise
   comparison of lines, and instead leaves lines in the same order
   they were read if all keys compare equal.  */
static bool stable;

/* An int value outside char range.  */
enum { NON_CHAR = CHAR_MAX + 1 };

/* If TAB has this value, blanks separate fields.  */
enum { TAB_DEFAULT = CHAR_MAX + 1 };

/* Tab character separating fields.  If TAB_DEFAULT, then fields are
   separated by the empty string between a non-blank character and a blank
   character.  */
static int tab = TAB_DEFAULT;
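/* Illustration (example input, not from the original sources): with the
   default separator the line "foo  bar" has two fields, "foo" and "  bar";
   the blanks before "bar" belong to the second field.  */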

/* Flag to remove consecutive duplicate lines from the output.
   Only the last of a sequence of equal lines will be output.  */
static bool unique;

/* Nonzero if any of the input files are the standard input.  */
static bool have_read_stdin;

/* List of key field comparisons to be tried.  */
static struct keyfield *keylist;

/* Program used to (de)compress temp files.  Must accept -d.  */
static char const *compress_program;

/* Annotate the output with extra info to aid the user.  */
static bool debug;

/* Maximum number of files to merge in one go.  If more than this
   number are present, temp files will be used.  */
static unsigned int nmerge = NMERGE_DEFAULT;

/* Whether SIGPIPE had the default disposition at startup.  */
static bool default_SIGPIPE;

/* The list of temporary files.  */
struct tempnode
{
  struct tempnode *volatile next;
  pid_t pid;     /* The subprocess PID; undefined if state == UNCOMPRESSED.  */
  char state;
  char name[FLEXIBLE_ARRAY_MEMBER];
};
static struct tempnode *volatile temphead;
static struct tempnode *volatile *temptail = &temphead;

/* Clean up any remaining temporary files.  */

static void
cleanup (void)
{
  struct tempnode const *node;

  for (node = temphead; node; node = node->next)
    unlink (node->name);
  temphead = nullptr;
}

/* Handle interrupts and hangups.  */

static void
sighandler (int sig)
{
  if (! SA_NOCLDSTOP)
    signal (sig, SIG_IGN);

  cleanup ();

  signal (sig, SIG_DFL);
  raise (sig);
}

/* Report MESSAGE for FILE, then clean up and exit.
   If FILE is null, it represents standard output.  */

static void
sort_die (char const *message, char const *file)
{
  /* If we got EPIPE writing to stdout (from a previous fwrite() or fclose()
     and SIGPIPE was originally SIG_DFL, mimic standard SIGPIPE behavior.  */
  if (errno == EPIPE && !file && default_SIGPIPE)
    sighandler (SIGPIPE);

  error (SORT_FAILURE, errno, "%s: %s", message,
         quotef (file ? file : _("standard output")));
}

void
usage (int status)
{
  if (status != EXIT_SUCCESS)
    emit_try_help ();
  else
    {
      printf (_("\
Usage: %s [OPTION]... [FILE]...\n\
  or:  %s [OPTION]... --files0-from=F\n\
"),
              program_name, program_name);
      fputs (_("\
Write sorted concatenation of all FILE(s) to standard output.\n\
"), stdout);

      emit_stdin_note ();
      emit_mandatory_arg_note ();

      fputs (_("\
Ordering options:\n\
\n\
"), stdout);
      fputs (_("\
  -b, --ignore-leading-blanks  ignore leading blanks\n\
  -d, --dictionary-order      consider only blanks and alphanumeric characters\
\n\
  -f, --ignore-case           fold lower case to upper case characters\n\
"), stdout);
      fputs (_("\
  -g, --general-numeric-sort  compare according to general numerical value\n\
  -i, --ignore-nonprinting    consider only printable characters\n\
  -M, --month-sort            compare (unknown) < 'JAN' < ... < 'DEC'\n\
"), stdout);
      fputs (_("\
  -h, --human-numeric-sort    compare human readable numbers (e.g., 2K 1G)\n\
"), stdout);
      fputs (_("\
  -n, --numeric-sort          compare according to string numerical value;\n\
                                see full documentation for supported strings\n\
"), stdout);
      fputs (_("\
  -R, --random-sort           shuffle, but group identical keys.  See shuf(1)\n\
      --random-source=FILE    get random bytes from FILE\n\
  -r, --reverse               reverse the result of comparisons\n\
"), stdout);
      fputs (_("\
      --sort=WORD             sort according to WORD:\n\
                                general-numeric -g, human-numeric -h, month -M,\
\n\
                                numeric -n, random -R, version -V\n\
  -V, --version-sort          natural sort of (version) numbers within text\n\
\n\
"), stdout);
      fputs (_("\
Other options:\n\
\n\
"), stdout);
      fputs (_("\
      --batch-size=NMERGE   merge at most NMERGE inputs at once;\n\
                            for more use temp files\n\
"), stdout);
      fputs (_("\
  -c, --check, --check=diagnose-first  check for sorted input; do not sort\n\
  -C, --check=quiet, --check=silent  like -c, but do not report first bad line\
\n\
      --compress-program=PROG  compress temporaries with PROG;\n\
                              decompress them with PROG -d\n\
"), stdout);
      fputs (_("\
      --debug               annotate the part of the line used to sort, and\n\
                              warn about questionable usage to standard error\n\
      --files0-from=F       read input from the files specified by\n\
                            NUL-terminated names in file F;\n\
                            If F is - then read names from standard input\n\
"), stdout);
      fputs (_("\
  -k, --key=KEYDEF          sort via a key; KEYDEF gives location and type\n\
  -m, --merge               merge already sorted files; do not sort\n\
"), stdout);
      fputs (_("\
  -o, --output=FILE         write result to FILE instead of standard output\n\
  -s, --stable              stabilize sort by disabling last-resort comparison\
\n\
  -S, --buffer-size=SIZE    use SIZE for main memory buffer\n\
"), stdout);
      printf (_("\
  -t, --field-separator=SEP  use SEP instead of non-blank to blank transition\n\
  -T, --temporary-directory=DIR  use DIR for temporaries, not $TMPDIR or %s;\n\
                              multiple options specify multiple directories\n\
      --parallel=N          change the number of sorts run concurrently to N\n\
  -u, --unique              output only the first of lines with equal keys;\n\
                              with -c, check for strict ordering\n\
"), DEFAULT_TMPDIR);
      fputs (_("\
  -z, --zero-terminated     line delimiter is NUL, not newline\n\
"), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
      fputs (_("\
\n\
KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, where F is a\n\
field number and C a character position in the field; both are origin 1, and\n\
the stop position defaults to the line's end.  If neither -t nor -b is in\n\
effect, characters in a field are counted from the beginning of the preceding\n\
whitespace.  OPTS is one or more single-letter ordering options [bdfgiMhnRrV],\
\n\
which override global ordering options for that key.  If no key is given, use\n\
the entire line as the key.  Use --debug to diagnose incorrect key usage.\n\
\n\
SIZE may be followed by the following multiplicative suffixes:\n\
"), stdout);
      fputs (_("\
% 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y, R, Q.\
\n\n\
*** WARNING ***\n\
The locale specified by the environment affects sort order.\n\
Set LC_ALL=C to get the traditional sort order that uses\n\
native byte values.\n\
"), stdout );
      emit_ancillary_info (PROGRAM_NAME);
    }

  exit (status);
}

/* For long options that have no equivalent short option, use a
   non-character as a pseudo short option, starting with CHAR_MAX + 1.  */
enum
{
  CHECK_OPTION = CHAR_MAX + 1,
  COMPRESS_PROGRAM_OPTION,
  DEBUG_PROGRAM_OPTION,
  FILES0_FROM_OPTION,
  NMERGE_OPTION,
  RANDOM_SOURCE_OPTION,
  SORT_OPTION,
  PARALLEL_OPTION
};

static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";

static struct option const long_options[] =
{
  {"ignore-leading-blanks", no_argument, nullptr, 'b'},
  {"check", optional_argument, nullptr, CHECK_OPTION},
  {"compress-program", required_argument, nullptr, COMPRESS_PROGRAM_OPTION},
  {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION},
  {"dictionary-order", no_argument, nullptr, 'd'},
  {"ignore-case", no_argument, nullptr, 'f'},
  {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION},
  {"general-numeric-sort", no_argument, nullptr, 'g'},
  {"ignore-nonprinting", no_argument, nullptr, 'i'},
  {"key", required_argument, nullptr, 'k'},
  {"merge", no_argument, nullptr, 'm'},
  {"month-sort", no_argument, nullptr, 'M'},
  {"numeric-sort", no_argument, nullptr, 'n'},
  {"human-numeric-sort", no_argument, nullptr, 'h'},
  {"version-sort", no_argument, nullptr, 'V'},
  {"random-sort", no_argument, nullptr, 'R'},
  {"random-source", required_argument, nullptr, RANDOM_SOURCE_OPTION},
  {"sort", required_argument, nullptr, SORT_OPTION},
  {"output", required_argument, nullptr, 'o'},
  {"reverse", no_argument, nullptr, 'r'},
  {"stable", no_argument, nullptr, 's'},
  {"batch-size", required_argument, nullptr, NMERGE_OPTION},
  {"buffer-size", required_argument, nullptr, 'S'},
  {"field-separator", required_argument, nullptr, 't'},
  {"temporary-directory", required_argument, nullptr, 'T'},
  {"unique", no_argument, nullptr, 'u'},
  {"zero-terminated", no_argument, nullptr, 'z'},
  {"parallel", required_argument, nullptr, PARALLEL_OPTION},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0},
};

#define CHECK_TABLE \
  _ct_("quiet",          'C') \
  _ct_("silent",         'C') \
  _ct_("diagnose-first", 'c')

static char const *const check_args[] =
{
#define _ct_(_s, _c) _s,
  CHECK_TABLE nullptr
#undef _ct_
};
static char const check_types[] =
{
#define _ct_(_s, _c) _c,
  CHECK_TABLE
#undef _ct_
};

#define SORT_TABLE \
  _st_("general-numeric", 'g') \
  _st_("human-numeric",   'h') \
  _st_("month",           'M') \
  _st_("numeric",         'n') \
  _st_("random",          'R') \
  _st_("version",         'V')

static char const *const sort_args[] =
{
#define _st_(_s, _c) _s,
  SORT_TABLE nullptr
#undef _st_
};
static char const sort_types[] =
{
#define _st_(_s, _c) _c,
  SORT_TABLE
#undef _st_
};

/* The set of signals that are caught.  */
static sigset_t caught_signals;

/* Critical section status.  */
struct cs_status
{
  bool valid;
  sigset_t sigs;
};

/* Enter a critical section.  */
static void
cs_enter (struct cs_status *status)
{
  int ret = pthread_sigmask (SIG_BLOCK, &caught_signals, &status->sigs);
  status->valid = ret == 0;
}

/* Leave a critical section.  */
static void
cs_leave (struct cs_status const *status)
{
  if (status->valid)
    {
      /* Ignore failure when restoring the signal mask.  */
      pthread_sigmask (SIG_SETMASK, &status->sigs, nullptr);
    }
}

/* Possible states for a temp file.  If compressed, the file's status
   is unreaped or reaped, depending on whether 'sort' has waited for
   the subprocess to finish.  */
enum { UNCOMPRESSED, UNREAPED, REAPED };

/* A file to be sorted.  */
struct sortfile
{
  /* The file's name.  */
  char const *name;

  /* Non-null if this is a temporary file, in which case NAME == TEMP->name.  */
  struct tempnode *temp;
};

/* Map PIDs of unreaped subprocesses to their struct tempnode objects.  */
static Hash_table *proctab;

enum { INIT_PROCTAB_SIZE = 47 };

static size_t
proctab_hasher (void const *entry, size_t tabsize)
{
  struct tempnode const *node = entry;
  return node->pid % tabsize;
}

static bool
proctab_comparator (void const *e1, void const *e2)
{
  struct tempnode const *n1 = e1;
  struct tempnode const *n2 = e2;
  return n1->pid == n2->pid;
}

/* The number of unreaped child processes.  */
static pid_t nprocs;

static bool delete_proc (pid_t);

/* If PID is positive, wait for the child process with that PID to
   exit, and assume that PID has already been removed from the process
   table.  If PID is 0 or -1, clean up some child that has exited (by
   waiting for it, and removing it from the proc table) and return the
   child's process ID.  However, if PID is 0 and no children have
   exited, return 0 without waiting.  */

static pid_t
reap (pid_t pid)
{
  int status;
  pid_t cpid = waitpid ((pid ? pid : -1), &status, (pid ? 0 : WNOHANG));

  if (cpid < 0)
    error (SORT_FAILURE, errno, _("waiting for %s [-d]"),
           quoteaf (compress_program));
  else if (0 < cpid && (0 < pid || delete_proc (cpid)))
    {
      if (! WIFEXITED (status) || WEXITSTATUS (status))
        error (SORT_FAILURE, 0, _("%s [-d] terminated abnormally"),
               quoteaf (compress_program));
      --nprocs;
    }

  return cpid;
}

/* TEMP represents a new process; add it to the process table.  Create
   the process table the first time it's called.  */

static void
register_proc (struct tempnode *temp)
{
  if (! proctab)
    {
      proctab = hash_initialize (INIT_PROCTAB_SIZE, nullptr,
                                 proctab_hasher,
                                 proctab_comparator,
                                 nullptr);
      if (! proctab)
        xalloc_die ();
    }

  temp->state = UNREAPED;

  if (! hash_insert (proctab, temp))
    xalloc_die ();
}

/* If PID is in the process table, remove it and return true.
   Otherwise, return false.  */

static bool
delete_proc (pid_t pid)
{
  struct tempnode test;

  test.pid = pid;
  struct tempnode *node = hash_remove (proctab, &test);
  if (! node)
    return false;
  node->state = REAPED;
  return true;
}

/* Remove PID from the process table, and wait for it to exit if it
   hasn't already.  */

static void
wait_proc (pid_t pid)
{
  if (delete_proc (pid))
    reap (pid);
}

/* Reap any exited children.  Do not block; reap only those that have
   already exited.  */

static void
reap_exited (void)
{
  while (0 < nprocs && reap (0))
    continue;
}

/* Reap at least one exited child, waiting if necessary.  */

static void
reap_some (void)
{
  reap (-1);
  reap_exited ();
}

/* Reap all children, waiting if necessary.  */

static void
reap_all (void)
{
  while (0 < nprocs)
    reap (-1);
}

/* Cleanup actions to take when exiting.  */

static void
exit_cleanup (void)
{
  if (temphead)
    {
      /* Clean up any remaining temporary files in a critical section so
         that a signal handler does not try to clean them too.  */
      struct cs_status cs;
      cs_enter (&cs);
      cleanup ();
      cs_leave (&cs);
    }

  close_stdout ();
}

/* Create a new temporary file, returning its newly allocated tempnode.
   Store into *PFD the file descriptor open for writing.
   If the creation fails, return nullptr and store -1 into *PFD if the
   failure is due to file descriptor exhaustion and
   SURVIVE_FD_EXHAUSTION; otherwise, die.  */

static struct tempnode *
create_temp_file (int *pfd, bool survive_fd_exhaustion)
{
  static char const slashbase[] = "/sortXXXXXX";
  static idx_t temp_dir_index;
  int fd;
  int saved_errno;
  char const *temp_dir = temp_dirs[temp_dir_index];
  size_t len = strlen (temp_dir);
  struct tempnode *node =
    xmalloc (FLEXSIZEOF (struct tempnode, name, len + sizeof slashbase));
  char *file = node->name;
  struct cs_status cs;

  memcpy (file, temp_dir, len);
  memcpy (file + len, slashbase, sizeof slashbase);
  node->next = nullptr;
  if (++temp_dir_index == temp_dir_count)
    temp_dir_index = 0;

  /* Create the temporary file in a critical section, to avoid races.  */
  cs_enter (&cs);
  fd = mkostemp (file, O_CLOEXEC);
  if (0 <= fd)
    {
      *temptail = node;
      temptail = &node->next;
    }
  saved_errno = errno;
  cs_leave (&cs);
  errno = saved_errno;

  if (fd < 0)
    {
      if (! (survive_fd_exhaustion && errno == EMFILE))
        error (SORT_FAILURE, errno, _("cannot create temporary file in %s"),
               quoteaf (temp_dir));
      free (node);
      node = nullptr;
    }

  *pfd = fd;
  return node;
}

/* Return a pointer to stdout status, or nullptr on failure.  */

static struct stat *
get_outstatus (void)
{
  static int outstat_errno;
  static struct stat outstat;
  if (outstat_errno == 0)
    outstat_errno = fstat (STDOUT_FILENO, &outstat) == 0 ? -1 : errno;
  return outstat_errno < 0 ? &outstat : nullptr;
}

/* Return a stream for FILE, opened with mode HOW.  If HOW is "w",
   the file is already open on standard output, and needs to be
   truncated unless FILE is null.  When opening for input, "-"
   means standard input.  To avoid confusion, do not return file
   descriptors STDIN_FILENO, STDOUT_FILENO, or STDERR_FILENO when
   opening an ordinary FILE.  Return nullptr if unsuccessful.

   Use fadvise to specify an access pattern for input files.
   There are a few hints we could possibly provide,
   and after careful testing it was decided that
   specifying FADVISE_SEQUENTIAL was not detrimental
   to any cases.  On Linux 2.6.31, this option doubles
   the size of read ahead performed and thus was seen to
   benefit these cases:
     Merging
     Sorting with a smaller internal buffer
     Reading from faster flash devices

   In _addition_ one could also specify other hints...

   FADVISE_WILLNEED was tested, but Linux 2.6.31
   at least uses that to _synchronously_ prepopulate the cache
   with the specified range.  While sort does need to
   read all of its input before outputting, a synchronous
   read of the whole file up front precludes any processing
   that sort could do in parallel with the system doing
   read ahead of the data.  This was seen to have negative effects
   in a couple of cases:
     Merging
     Sorting with a smaller internal buffer
   This option was seen to shorten the runtime for sort
   on a multicore system with lots of RAM and other processes
   competing for CPU.  It could be argued that more explicit
   scheduling hints with 'nice' et. al. are more appropriate
   for this situation.

   FADVISE_NOREUSE is a possibility as it could lower
   the priority of input data in the cache as sort will
   only need to process it once.  However its functionality
   has changed over Linux kernel versions and as of 2.6.31
   it does nothing and thus we can't depend on what it might
   do in future.

   FADVISE_DONTNEED is not appropriate for user specified
   input files, but for temp files we do want to drop the
   cache immediately after processing.  This is done implicitly
   however when the files are unlinked.  */

static FILE *
stream_open (char const *file, char const *how)
{
  FILE *fp;

  if (*how == 'r')
    {
      if (streq (file, "-"))
        {
          have_read_stdin = true;
          fp = stdin;
        }
      else
        {
          int fd = open (file, O_RDONLY | O_CLOEXEC);
          fp = fd < 0 ? nullptr : fdopen (fd, how);
        }
      fadvise (fp, FADVISE_SEQUENTIAL);
    }
  else if (*how == 'w')
    {
      if (file && ftruncate (STDOUT_FILENO, 0) != 0)
        {
          int ftruncate_errno = errno;
          struct stat *outst = get_outstatus ();
          if (!outst || S_ISREG (outst->st_mode) || S_TYPEISSHM (outst))
            error (SORT_FAILURE, ftruncate_errno, _("%s: error truncating"),
                   quotef (file));
        }
      fp = stdout;
    }
  else
    affirm (!"unexpected mode passed to stream_open");

  return fp;
}

/* Same as stream_open, except always return a non-null value; die on
   failure.  */

static FILE *
xfopen (char const *file, char const *how)
{
  FILE *fp = stream_open (file, how);
  if (!fp)
    sort_die (_("open failed"), file);
  return fp;
}

/* Close FP, whose name is FILE, and report any errors.  */

static void
xfclose (FILE *fp, char const *file)
{
  switch (fileno (fp))
    {
    case STDIN_FILENO:
      /* Allow reading stdin from tty more than once.  */
      clearerr (fp);
      break;

    case STDOUT_FILENO:
      /* Don't close stdout just yet.  close_stdout does that.  */
      if (fflush (fp) != 0)
        sort_die (_("fflush failed"), file);
      break;

    default:
      if (fclose (fp) != 0)
        sort_die (_("close failed"), file);
      break;
    }
}

/* Move OLDFD to NEWFD.  If OLDFD != NEWFD, NEWFD is not close-on-exec.  */

static void
move_fd (int oldfd, int newfd)
{
  if (oldfd != newfd)
    {
      /* These should never fail for our usage.  */
      ignore_value (dup2 (oldfd, newfd));
      ignore_value (close (oldfd));
    }
}

/* Setup ACTION to move OLDFD to NEWFD.  If OLDFD != NEWFD, NEWFD is not
   close-on-exec.  Returns 0 if successful, or an error number otherwise.  */

static int
posix_spawn_file_actions_move_fd (posix_spawn_file_actions_t *actions,
                                  int oldfd, int newfd)
{
  int result = 0;
  if (oldfd != newfd)
    {
      result = posix_spawn_file_actions_adddup2 (actions, oldfd, newfd);
      if (result == 0)
        result = posix_spawn_file_actions_addclose (actions, oldfd);
    }
  return result;
}

/* Execute COMPRESS_PROGRAM in a child process.  The child processes pid is
   stored in PD.  The TRIES parameter specifies how many times to try to create
   a child process before giving up.  Return 0 on success, or an error number
   otherwise.  */

static int
pipe_child (pid_t *pid, int pipefds[2], int tempfd, bool decompress,
            size_t tries)
{
  char const *resolved_compress_program;
  char *compress_program_to_free;
  struct tempnode *saved_temphead;
  double wait_retry = 0.25;
  struct cs_status cs;
  int result;
  posix_spawnattr_t attr;
  posix_spawn_file_actions_t actions;

  /* Lookup the program before we spawn, so that we consistently
     handle access issues to COMPRESS_PROGRAM, because on some
     implementations/emulations of posix_spawn we get only a
     generic (fatal) error from the child in that case.  */
  resolved_compress_program =
    find_in_given_path (compress_program, getenv ("PATH"), NULL, false);
  if (resolved_compress_program == NULL)
    return errno;
  compress_program_to_free = nullptr;
  if (resolved_compress_program != compress_program)
    compress_program_to_free = (char *) resolved_compress_program;

  if ((result = posix_spawnattr_init (&attr)))
    return result;
  if ((result = posix_spawnattr_setflags (&attr, POSIX_SPAWN_USEVFORK))
      || (result = posix_spawn_file_actions_init (&actions)))
    {
      posix_spawnattr_destroy (&attr);
      free (compress_program_to_free);
      return result;
    }

  if (pipe2 (pipefds, O_CLOEXEC) < 0)
    {
      int saved_errno = errno;
      posix_spawnattr_destroy (&attr);
      posix_spawn_file_actions_destroy (&actions);
      free (compress_program_to_free);
      return saved_errno;
    }

  if ((result = posix_spawn_file_actions_addclose (&actions, STDIN_FILENO))
      || (result = posix_spawn_file_actions_addclose (&actions, STDOUT_FILENO))
      || (decompress
          ? ((result = posix_spawn_file_actions_addclose (&actions,
                                                          pipefds[0]))
             || (result = posix_spawn_file_actions_move_fd (&actions, tempfd,
                                                            STDIN_FILENO))
             || (result = posix_spawn_file_actions_move_fd (&actions,
                                                            pipefds[1],
                                                            STDOUT_FILENO)))
          : ((result = posix_spawn_file_actions_addclose (&actions,
                                                          pipefds[1]))
             || (result = posix_spawn_file_actions_move_fd (&actions, tempfd,
                                                            STDOUT_FILENO))
             || (result = posix_spawn_file_actions_move_fd (&actions,
                                                            pipefds[0],
                                                            STDIN_FILENO)))))
    {
      close (pipefds[0]);
      close (pipefds[1]);
      posix_spawnattr_destroy (&attr);
      posix_spawn_file_actions_destroy (&actions);
      free (compress_program_to_free);
      return result;
    }

  char const *const argv[] =
    {
      resolved_compress_program,
      decompress ? "-d" : nullptr,
      nullptr
    };

  /* At least NMERGE + 1 subprocesses are needed.  More could be created, but
     uncontrolled subprocess generation can hurt performance significantly.
     Allow at most NMERGE + 2 subprocesses, on the theory that there
     may be some useful parallelism by letting compression for the
     previous merge finish (1 subprocess) in parallel with the current
     merge (NMERGE + 1 subprocesses).  */

  if (nmerge + 1 < nprocs)
    reap_some ();

  while (tries--)
    {
      /* This is so the child process won't delete our temp files
         if it receives a signal before exec-ing.  */
      cs_enter (&cs);
      saved_temphead = temphead;
      temphead = nullptr;

      result = posix_spawnp (pid, resolved_compress_program, &actions, &attr,
                             (char * const *) argv, environ);

      temphead = saved_temphead;

      cs_leave (&cs);

      if (result != EAGAIN)
        break;
      else
        {
          /* [v]fork/clone are indicating resource constraints,
             so back-off for a while and retry.  */
          xnanosleep (wait_retry);
          wait_retry *= 2;
          reap_exited ();
        }
    }

  posix_spawnattr_destroy (&attr);
  posix_spawn_file_actions_destroy (&actions);
  free (compress_program_to_free);

  if (result)
    {
      close (pipefds[0]);
      close (pipefds[1]);
    }
  else
    ++nprocs;

  return result;
}

/* Create a temporary file and, if asked for, start a compressor
   to that file.  Set *PFP to the file handle and return
   the address of the new temp node.  If the creation
   fails, return nullptr if the failure is due to file descriptor
   exhaustion and SURVIVE_FD_EXHAUSTION; otherwise, die.  */

static struct tempnode *
maybe_create_temp (FILE **pfp, bool survive_fd_exhaustion)
{
  int tempfd;
  struct tempnode *node = create_temp_file (&tempfd, survive_fd_exhaustion);
  if (! node)
    return nullptr;

  node->state = UNCOMPRESSED;

  if (compress_program)
    {
      int pipefds[2];
      static int last_result = 0;

      int result = pipe_child (&node->pid, pipefds, tempfd, false,
                               MAX_TRIES_COMPRESS);

      if (result)
        {
          if (result != last_result)
            error (0, result, _("could not run compress program %s"),
                   quoteaf (compress_program));
        }
      else
        {
          close (tempfd);
          close (pipefds[0]);
          tempfd = pipefds[1];

          register_proc (node);
        }

      last_result = result;
    }

  *pfp = fdopen (tempfd, "w");
  if (! *pfp)
    sort_die (_("couldn't create temporary file"), node->name);

  return node;
}

/* Create a temporary file and, if asked for, start a compressor
   to that file.  Set *PFP to the file handle and return the address
   of the new temp node.  Die on failure.  */

static struct tempnode *
create_temp (FILE **pfp)
{
  return maybe_create_temp (pfp, false);
}

/* Open a compressed temp file and start a decompression process through
   which to filter the input.  Return nullptr (setting errno to
   EMFILE) if we ran out of file descriptors, and die on any other
   kind of failure.  */

static FILE *
open_temp (struct tempnode *temp)
{
  int tempfd, pipefds[2];
  FILE *fp = nullptr;

  if (temp->state == UNREAPED)
    wait_proc (temp->pid);

  tempfd = open (temp->name, O_RDONLY);
  if (tempfd < 0)
    return nullptr;

  pid_t child;
  int result = pipe_child (&child, pipefds, tempfd, true,
                           MAX_TRIES_DECOMPRESS);

  if (result)
    {
      if (result != EMFILE)
        error (SORT_FAILURE, result, _("could not run compress program %s -d"),
               quoteaf (compress_program));
      close (tempfd);
      errno = EMFILE;
    }
  else
    {
      temp->pid = child;
      register_proc (temp);
      close (tempfd);
      close (pipefds[1]);

      fp = fdopen (pipefds[0], "r");
      if (! fp)
        {
          int saved_errno = errno;
          close (pipefds[0]);
          errno = saved_errno;
        }
    }

  return fp;
}

/* Append DIR to the array of temporary directory names.  */
static void
add_temp_dir (char const *dir)
{
  if (temp_dir_count == temp_dir_alloc)
    temp_dirs = xpalloc (temp_dirs, &temp_dir_alloc, 1, -1, sizeof *temp_dirs);

  temp_dirs[temp_dir_count++] = dir;
}

/* Remove NAME from the list of temporary files.  */

static void
zaptemp (char const *name)
{
  struct tempnode *volatile *pnode;
  struct tempnode *node;
  struct tempnode *next;
  int unlink_status;
  int unlink_errno = 0;
  struct cs_status cs;

  for (pnode = &temphead; (node = *pnode)->name != name; pnode = &node->next)
    continue;

  if (node->state == UNREAPED)
    wait_proc (node->pid);

  /* Unlink the temporary file in a critical section to avoid races.  */
  next = node->next;
  cs_enter (&cs);
  unlink_status = unlink (name);
  unlink_errno = errno;
  *pnode = next;
  cs_leave (&cs);

  if (unlink_status != 0)
    error (0, unlink_errno, _("warning: cannot remove: %s"), quotef (name));
  if (! next)
    temptail = pnode;
  free (node);
}

#if HAVE_NL_LANGINFO

static int
struct_month_cmp (void const *m1, void const *m2)
{
  struct month const *month1 = m1;
  struct month const *month2 = m2;
  return strcmp (month1->name, month2->name);
}

#endif

/* Initialize the character class tables.  */

static void
inittables (void)
{
  size_t i;

  for (i = 0; i < UCHAR_LIM; ++i)
    {
      blanks[i] = i == '\n' || isblank (i);
      nondictionary[i] = ! blanks[i] && ! isalnum (i);
      nonprinting[i] = ! isprint (i);
      fold_toupper[i] = toupper (i);
    }

#if HAVE_NL_LANGINFO
  /* If we're not in the "C" locale, read different names for months.  */
  if (hard_LC_TIME)
    {
      for (i = 0; i < MONTHS_PER_YEAR; i++)
        {
          char const *s;
          size_t s_len;
          size_t j, k;
          char *name;

          s = nl_langinfo (ABMON_1 + i);
          s_len = strlen (s);
          monthtab[i].name = name = xmalloc (s_len + 1);
          monthtab[i].val = i + 1;

          for (j = k = 0; j < s_len; j++)
            if (! isblank (to_uchar (s[j])))
              name[k++] = fold_toupper[to_uchar (s[j])];
          name[k] = '\0';
        }
      qsort (monthtab, MONTHS_PER_YEAR, sizeof *monthtab, struct_month_cmp);
    }
#endif
}

/* Specify how many inputs may be merged at once.
   This may be set on the command-line with the
   --batch-size option.  */
static void
specify_nmerge (int oi, char c, char const *s)
{
  uintmax_t n;
  struct rlimit rlimit;
  enum strtol_error e = xstrtoumax (s, nullptr, 10, &n, "");

  /* Try to find out how many file descriptors we'll be able
     to open.  We need at least nmerge + 3 (STDIN_FILENO,
     STDOUT_FILENO and STDERR_FILENO).  */
  unsigned int max_nmerge = ((getrlimit (RLIMIT_NOFILE, &rlimit) == 0
                              ? rlimit.rlim_cur
                              : OPEN_MAX)
                             - 3);

  if (e == LONGINT_OK)
    {
      nmerge = n;
      if (nmerge != n)
        e = LONGINT_OVERFLOW;
      else
        {
          if (nmerge < 2)
            {
              error (0, 0, _("invalid --%s argument %s"),
                     long_options[oi].name, quote (s));
              error (SORT_FAILURE, 0,
                     _("minimum --%s argument is %s"),
                     long_options[oi].name, quote ("2"));
            }
          else if (max_nmerge < nmerge)
            {
              e = LONGINT_OVERFLOW;
            }
          else
            return;
        }
    }

  if (e == LONGINT_OVERFLOW)
    {
      error (0, 0, _("--%s argument %s too large"),
             long_options[oi].name, quote (s));
      error (SORT_FAILURE, 0,
             _("maximum --%s argument with current rlimit is %u"),
             long_options[oi].name, max_nmerge);
    }
  else
    xstrtol_fatal (e, oi, c, long_options, s);
}
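/* Example of the limit computed above (illustrative): with a RLIMIT_NOFILE
   soft limit of 1024, max_nmerge is 1021, so --batch-size arguments above
   1021 get the "argument too large" diagnostic.  */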

/* Specify the amount of main memory to use when sorting.  */
static void
specify_sort_size (int oi, char c, char const *s)
{
  uintmax_t n;
  char *suffix;
  enum strtol_error e = xstrtoumax (s, &suffix, 10, &n, "EgGkKmMPQRtTYZ");

  /* The default unit is KiB.  */
  if (e == LONGINT_OK && c_isdigit (suffix[-1]) && ckd_mul (&n, n, 1024))
    e = LONGINT_OVERFLOW;

  /* A 'b' suffix means bytes; a '%' suffix means percent of memory.  */
  if (e == LONGINT_INVALID_SUFFIX_CHAR && c_isdigit (suffix[-1]) && ! suffix[1])
    switch (suffix[0])
      {
      case 'b':
        e = LONGINT_OK;
        break;

      case '%':
        {
          double mem = physmem_total () * n / 100;

          /* Use "<", not "<=", to avoid problems with rounding.  */
          if (mem < UINTMAX_MAX)
            {
              n = mem;
              e = LONGINT_OK;
            }
          else
            e = LONGINT_OVERFLOW;
        }
        break;
      }

  if (e == LONGINT_OK)
    {
      /* If multiple sort sizes are specified, take the maximum, so
         that option order does not matter.  */
      if (n < sort_size)
        return;

      sort_size = n;
      if (sort_size == n)
        {
          sort_size = MAX (sort_size, MIN_SORT_SIZE);
          return;
        }

      e = LONGINT_OVERFLOW;
    }

  xstrtol_fatal (e, oi, c, long_options, s);
}
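/* Worked example (illustrative): "--buffer-size=30%" on a machine whose
   physmem_total () reports 8 GiB yields n = 8 GiB * 30 / 100, about 2.4 GiB;
   a bare "-S 64" means 64 KiB, since the default unit is KiB.  */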

/* Specify the number of threads to spawn during internal sort.  */
static size_t
specify_nthreads (int oi, char c, char const *s)
{
  uintmax_t nthreads;
  enum strtol_error e = xstrtoumax (s, nullptr, 10, &nthreads, "");
  if (e == LONGINT_OVERFLOW)
    return SIZE_MAX;
  if (e != LONGINT_OK)
    xstrtol_fatal (e, oi, c, long_options, s);
  if (SIZE_MAX < nthreads)
    nthreads = SIZE_MAX;
  if (nthreads == 0)
    error (SORT_FAILURE, 0, _("number in parallel must be nonzero"));
  return nthreads;
}

/* Return the default sort size.  */
static size_t
default_sort_size (void)
{
  /* Let SIZE be MEM, but no more than the maximum object size,
     total memory, or system resource limits.  Don't bother to check
     for values like RLIM_INFINITY since in practice they are not much
     less than SIZE_MAX.  */
  size_t size = SIZE_MAX;
  struct rlimit rlimit;
  if (getrlimit (RLIMIT_DATA, &rlimit) == 0 && rlimit.rlim_cur < size)
    size = rlimit.rlim_cur;
#ifdef RLIMIT_AS
  if (getrlimit (RLIMIT_AS, &rlimit) == 0 && rlimit.rlim_cur < size)
    size = rlimit.rlim_cur;
#endif

  /* Leave a large safety margin for the above limits, as failure can
     occur when they are exceeded.  */
  size /= 2;

#ifdef RLIMIT_RSS
  /* Leave a 1/16 margin for RSS to leave room for code, stack, etc.
     Exceeding RSS is not fatal, but can be quite slow.  */
  if (getrlimit (RLIMIT_RSS, &rlimit) == 0 && rlimit.rlim_cur / 16 * 15 < size)
    size = rlimit.rlim_cur / 16 * 15;
#endif

  /* Let MEM be available memory or 1/8 of total memory, whichever
     is greater.  */
  double avail = physmem_available ();
  double total = physmem_total ();
  double mem = MAX (avail, total / 8);

  /* Leave a 1/4 margin for physical memory.  */
  if (total * 0.75 < size)
    size = total * 0.75;

  /* Return the minimum of MEM and SIZE, but no less than
     MIN_SORT_SIZE.  Avoid the MIN macro here, as it is not quite
     right when only one argument is floating point.  */
  if (mem < size)
    size = mem;
  return MAX (size, MIN_SORT_SIZE);
}

/* Return the sort buffer size to use with the input files identified
   by FPS and FILES, which are alternate names of the same files.
   NFILES gives the number of input files; NFPS may be less.  Assume
   that each input line requires LINE_BYTES extra bytes' worth of line
   information.  Do not exceed the size bound specified by the user
   (or a default size bound, if the user does not specify one).  */

static size_t
sort_buffer_size (FILE *const *fps, size_t nfps,
                  char *const *files, size_t nfiles,
                  size_t line_bytes)
{
  /* A bound on the input size.  If zero, the bound hasn't been
     determined yet.  */
  static size_t size_bound;

  /* In the worst case, each input byte is a newline.  */
  size_t worst_case_per_input_byte = line_bytes + 1;

  /* Keep enough room for one extra input line and an extra byte.
     This extra room might be needed when preparing to read EOF.  */
  size_t size = worst_case_per_input_byte + 1;

  for (size_t i = 0; i < nfiles; i++)
    {
      struct stat st;
      off_t file_size;
      size_t worst_case;

      if ((i < nfps ? fstat (fileno (fps[i]), &st)
           : streq (files[i], "-") ? fstat (STDIN_FILENO, &st)
           : stat (files[i], &st))
          != 0)
        sort_die (_("stat failed"), files[i]);

      if (usable_st_size (&st) && 0 < st.st_size)
        file_size = st.st_size;
      else
        {
          /* The file has unknown size.  If the user specified a sort
             buffer size, use that; otherwise, guess the size.  */
          if (sort_size)
            return sort_size;
          file_size = INPUT_FILE_SIZE_GUESS;
        }

      if (! size_bound)
        {
          size_bound = sort_size;
          if (! size_bound)
            size_bound = default_sort_size ();
        }

      /* Add the amount of memory needed to represent the worst case
         where the input consists entirely of newlines followed by a
         single non-newline.  Check for overflow.  */
      worst_case = file_size * worst_case_per_input_byte + 1;
      if (file_size != worst_case / worst_case_per_input_byte
          || size_bound - size <= worst_case)
        return size_bound;
      size += worst_case;
    }

  return size;
}

/* Initialize BUF.  Reserve LINE_BYTES bytes for each line; LINE_BYTES
   must be at least sizeof (struct line).  Allocate ALLOC bytes
   initially.  */

static void
initbuf (struct buffer *buf, size_t line_bytes, size_t alloc)
{
  /* Ensure that the line array is properly aligned.  If the desired
     size cannot be allocated, repeatedly halve it until allocation
     succeeds.  The smaller allocation may hurt overall performance,
     but that's better than failing.  */
  while (true)
    {
      alloc += sizeof (struct line) - alloc % sizeof (struct line);
      buf->buf = malloc (alloc);
      if (buf->buf)
        break;
      alloc /= 2;
      if (alloc <= line_bytes + 1)
        xalloc_die ();
    }

  buf->line_bytes = line_bytes;
  buf->alloc = alloc;
  buf->used = buf->left = buf->nlines = 0;
  buf->eof = false;
}

/* Return one past the limit of the line array.  */

static inline struct line *
buffer_linelim (struct buffer const *buf)
{
  void *linelim = buf->buf + buf->alloc;
  return linelim;
}

/* Return a pointer to the first character of the field specified
   by KEY in LINE.  */

static char *
begfield (struct line const *line, struct keyfield const *key)
{
  char *ptr = line->text, *lim = ptr + line->length - 1;
  size_t sword = key->sword;
  size_t schar = key->schar;

  /* The leading field separator itself is included in a field when -t
     is absent.  */

  if (tab != TAB_DEFAULT)
    while (ptr < lim && sword--)
      {
        while (ptr < lim && *ptr != tab)
          ++ptr;
        if (ptr < lim)
          ++ptr;
      }
  else
    while (ptr < lim && sword--)
      {
        while (ptr < lim && blanks[to_uchar (*ptr)])
          ++ptr;
        while (ptr < lim && !blanks[to_uchar (*ptr)])
          ++ptr;
      }

  /* If we're ignoring leading blanks when computing the Start
     of the field, skip past them here.  */
  if (key->skipsblanks)
    while (ptr < lim && blanks[to_uchar (*ptr)])
      ++ptr;

  /* Advance PTR by SCHAR (if possible), but no further than LIM.  */
  size_t remaining_bytes = lim - ptr;
  if (schar < remaining_bytes)
    ptr += schar;
  else
    ptr = lim;

  return ptr;
}
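/* Illustration (hypothetical input): with "-t : -k 3.2", sword is 2 and
   schar is 1, so for the line "a:bb:ccc" begfield skips two ':' delimiters
   and then one more character, returning a pointer to the second 'c'.  */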

/* Return the limit of (a pointer to the first character after) the field
   in LINE specified by KEY.  */

ATTRIBUTE_PURE
static char *
limfield (struct line const *line, struct keyfield const *key)
{
  char *ptr = line->text, *lim = ptr + line->length - 1;
  size_t eword = key->eword, echar = key->echar;

  if (echar == 0)
    eword++; /* Skip all of end field.  */

  /* Move PTR past EWORD fields or to one past the last byte on LINE,
     whichever comes first.  If there are more than EWORD fields, leave
     PTR pointing at the beginning of the field having zero-based index,
     EWORD.  If a delimiter character was specified (via -t), then that
     'beginning' is the first character following the delimiting TAB.
     Otherwise, leave PTR pointing at the first 'blank' character after
     the preceding field.  */
  if (tab != TAB_DEFAULT)
    while (ptr < lim && eword--)
      {
        while (ptr < lim && *ptr != tab)
          ++ptr;
        if (ptr < lim && (eword || echar))
          ++ptr;
      }
  else
    while (ptr < lim && eword--)
      {
        while (ptr < lim && blanks[to_uchar (*ptr)])
          ++ptr;
        while (ptr < lim && !blanks[to_uchar (*ptr)])
          ++ptr;
      }

#ifdef POSIX_UNSPECIFIED
  /* The following block of code makes GNU sort incompatible with
     standard Unix sort, so it's ifdef'd out for now.
     The POSIX spec isn't clear on how to interpret this.
     FIXME: request clarification.

     From: kwzh@gnu.ai.mit.edu (Karl Heuer)
     Date: Thu, 30 May 96 12:20:41 -0400
     [Translated to POSIX 1003.1-2001 terminology by Paul Eggert.]

     [...]I believe I've found another bug in 'sort'.

     $ cat /tmp/sort.in
     a b c 2 d
     pq rs 1 t
     $ textutils-1.15/src/sort -k1.7,1.7 </tmp/sort.in
     a b c 2 d
     pq rs 1 t
     $ /bin/sort -k1.7,1.7 </tmp/sort.in
     pq rs 1 t
     a b c 2 d

     Unix sort produced the answer I expected: sort on the single character
     in column 7.  GNU sort produced different results, because it disagrees
     on the interpretation of the key-end spec "M.N".  Unix sort reads this
     as "skip M-1 fields, then N-1 characters"; but GNU sort wants it to mean
     "skip M-1 fields, then either N-1 characters or the rest of the current
     field, whichever comes first".  This extra clause applies only to
     key-ends, not key-starts.
     */

  /* Make LIM point to the end of (one byte past) the current field.  */
  if (tab != TAB_DEFAULT)
    {
      char *newlim;
      newlim = memchr (ptr, tab, lim - ptr);
      if (newlim)
        lim = newlim;
    }
  else
    {
      char *newlim;
      newlim = ptr;
      while (newlim < lim && blanks[to_uchar (*newlim)])
        ++newlim;
      while (newlim < lim && !blanks[to_uchar (*newlim)])
        ++newlim;
      lim = newlim;
    }
#endif

  if (echar != 0) /* We need to skip over a portion of the end field.  */
    {
      /* If we're ignoring leading blanks when computing the End
         of the field, skip past them here.  */
      if (key->skipeblanks)
        while (ptr < lim && blanks[to_uchar (*ptr)])
          ++ptr;

      /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
      size_t remaining_bytes = lim - ptr;
      if (echar < remaining_bytes)
        ptr += echar;
      else
        ptr = lim;
    }

  return ptr;
}
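/* Illustration (hypothetical input): for "-k 2,2" with blank-separated
   fields, eword is 1 and echar is 0, so eword is bumped to 2 and limfield
   skips two fields; on the line "aa bb cc" it returns a pointer to the
   blank just before "cc", one past the end of the " bb" field.  */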
|
|
|
|
/* Fill BUF reading from FP, moving buf->left bytes from the end
|
|
of buf->buf to the beginning first. If EOF is reached and the
|
|
file wasn't terminated by a newline, supply one. Set up BUF's line
|
|
table too. FILE is the name of the file corresponding to FP.
|
|
Return true if some input was read. */
|
|
|
|
static bool
|
|
fillbuf (struct buffer *buf, FILE *fp, char const *file)
|
|
{
|
|
struct keyfield const *key = keylist;
|
|
char eol = eolchar;
|
|
size_t line_bytes = buf->line_bytes;
|
|
size_t mergesize = merge_buffer_size - MIN_MERGE_BUFFER_SIZE;
|
|
|
|
if (buf->eof)
|
|
return false;
|
|
|
|
if (buf->used != buf->left)
|
|
{
|
|
memmove (buf->buf, buf->buf + buf->used - buf->left, buf->left);
|
|
buf->used = buf->left;
|
|
buf->nlines = 0;
|
|
}
|
|
|
|
while (true)
|
|
{
|
|
char *ptr = buf->buf + buf->used;
|
|
struct line *linelim = buffer_linelim (buf);
|
|
struct line *line = linelim - buf->nlines;
|
|
size_t avail = (char *) linelim - buf->nlines * line_bytes - ptr;
|
|
char *line_start = buf->nlines ? line->text + line->length : buf->buf;
|
|
|
|
while (line_bytes + 1 < avail)
|
|
{
|
|
/* Read as many bytes as possible, but do not read so many
|
|
bytes that there might not be enough room for the
|
|
corresponding line array. The worst case is when the
|
|
rest of the input file consists entirely of newlines,
|
|
except that the last byte is not a newline. */
|
|
size_t readsize = (avail - 1) / (line_bytes + 1);
|
|
size_t bytes_read = fread (ptr, 1, readsize, fp);
|
|
char *ptrlim = ptr + bytes_read;
|
|
char *p;
|
|
avail -= bytes_read;
|
|
|
|
if (bytes_read != readsize)
|
|
{
|
|
if (ferror (fp))
|
|
sort_die (_("read failed"), file);
|
|
if (feof (fp))
|
|
{
|
|
buf->eof = true;
|
|
if (buf->buf == ptrlim)
|
|
return false;
|
|
if (line_start != ptrlim && ptrlim[-1] != eol)
|
|
*ptrlim++ = eol;
|
|
}
|
|
}
|
|
|
|
/* Find and record each line in the just-read input. */
|
|
while ((p = memchr (ptr, eol, ptrlim - ptr)))
|
|
{
|
|
/* Delimit the line with NUL. This eliminates the need to
|
|
temporarily replace the last byte with NUL when calling
|
|
xmemcoll, which increases performance. */
|
|
*p = '\0';
|
|
ptr = p + 1;
|
|
line--;
|
|
line->text = line_start;
|
|
line->length = ptr - line_start;
|
|
mergesize = MAX (mergesize, line->length);
|
|
avail -= line_bytes;
|
|
|
|
if (key)
|
|
{
|
|
/* Precompute the position of the first key for
|
|
efficiency. */
|
|
line->keylim = (key->eword == SIZE_MAX
|
|
? p
|
|
: limfield (line, key));
|
|
|
|
if (key->sword != SIZE_MAX)
|
|
line->keybeg = begfield (line, key);
|
|
else
|
|
{
|
|
if (key->skipsblanks)
|
|
while (blanks[to_uchar (*line_start)])
|
|
line_start++;
|
|
line->keybeg = line_start;
|
|
}
|
|
}
|
|
|
|
line_start = ptr;
|
|
}
|
|
|
|
ptr = ptrlim;
|
|
if (buf->eof)
|
|
break;
|
|
}
|
|
|
|
buf->used = ptr - buf->buf;
|
|
buf->nlines = buffer_linelim (buf) - line;
|
|
if (buf->nlines != 0)
|
|
{
|
|
buf->left = ptr - line_start;
|
|
merge_buffer_size = mergesize + MIN_MERGE_BUFFER_SIZE;
|
|
return true;
|
|
}
|
|
|
|
{
|
|
/* The current input line is too long to fit in the buffer.
|
|
Increase the buffer size and try again, keeping it properly
|
|
aligned. */
|
|
idx_t line_alloc = buf->alloc / sizeof (struct line);
|
|
buf->buf = xpalloc (buf->buf, &line_alloc, 1, -1, sizeof (struct line));
|
|
buf->alloc = line_alloc * sizeof (struct line);
|
|
}
|
|
}
|
|
}
|
|
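/* A standalone sketch (not compiled into sort) of the sizing argument in the
   fillbuf loop above: text bytes accumulate at the front of the buffer while
   the 'struct line' table grows down from the end, and in the worst case
   every byte read except the last is a newline, so a read of R bytes may
   need R line records plus one appended eol byte.  Hence
   R = (avail - 1) / (line_bytes + 1).  The numbers below are made up.  */
#if 0
#include <stdio.h>

int
main (void)
{
  size_t avail = 1000;      /* free bytes between the text and the line table */
  size_t line_bytes = 32;   /* stand-in for sizeof (struct line) */
  size_t readsize = (avail - 1) / (line_bytes + 1);

  /* Worst case: READSIZE - 1 newlines, a final non-newline byte, and an
     appended eol, giving READSIZE lines and READSIZE + 1 text bytes.  */
  size_t worst_case = readsize * (line_bytes + 1) + 1;
  printf ("read %zu bytes; worst case uses %zu of %zu available bytes\n",
          readsize, worst_case, avail);
  return 0;
}
#endif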
|
|
/* Table that maps characters to order-of-magnitude values. */
|
|
static char const unit_order[UCHAR_LIM] =
|
|
{
|
|
#if ! ('K' == 75 && 'M' == 77 && 'G' == 71 && 'T' == 84 && 'P' == 80 \
|
|
&& 'E' == 69 && 'Z' == 90 && 'Y' == 89 && 'R' == 82 && 'Q' == 81 \
|
|
&& 'k' == 107)
|
|
/* This initializer syntax works on all C99 hosts. For now, use
|
|
it only on non-ASCII hosts, to ease the pain of porting to
|
|
pre-C99 ASCII hosts. */
|
|
['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
|
|
['R']=9, ['Q']=10,
|
|
['k']=1,
|
|
#else
|
|
/* Generate the following table with this command:
|
|
perl -e 'my %a=(k=>1,K=>1,M=>2,G=>3,T=>4,P=>5,E=>6,Z=>7,Y=>8,R=>9,Q=>10);
|
|
foreach my $i (0..255) {my $c=chr($i); $a{$c} ||= 0;print "$a{$c}, "}'\
|
|
|fmt */
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3,
|
|
0, 0, 0, 1, 0, 2, 0, 0, 5, 10, 9, 0, 4, 0, 0, 0, 0, 8, 7, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
#endif
|
|
};
|
|
|
|
/* Traverse the number given as *number, consisting only of digits,
   thousands_sep, and decimal_point chars.  Return the highest digit
   found in the number, or '\0' if no digit has been found.  Upon return,
   *number points at the character that immediately follows the number.  */
|
|
static char
|
|
traverse_raw_number (char const **number)
|
|
{
|
|
char const *p = *number;
|
|
char ch;
|
|
char max_digit = '\0';
|
|
bool ends_with_thousands_sep = false;
|
|
|
|
/* Scan to end of number.
|
|
Decimals or separators not followed by digits stop the scan.
|
|
Numbers ending in decimals or separators are thus considered
|
|
to be lacking in units.
|
|
FIXME: add support for multibyte thousands_sep and decimal_point. */
|
|
|
|
while (c_isdigit (ch = *p++))
|
|
{
|
|
if (max_digit < ch)
|
|
max_digit = ch;
|
|
|
|
      /* Allow skipping only one occurrence of thousands_sep, to avoid
         finding the unit in the next column when thousands_sep matches
         a blank and is used as the column delimiter.  */
|
|
ends_with_thousands_sep = (*p == thousands_sep);
|
|
if (ends_with_thousands_sep)
|
|
++p;
|
|
}
|
|
|
|
if (ends_with_thousands_sep)
|
|
{
|
|
/* thousands_sep not followed by digit is not allowed. */
|
|
*number = p - 2;
|
|
return max_digit;
|
|
}
|
|
|
|
if (ch == decimal_point)
|
|
while (c_isdigit (ch = *p++))
|
|
if (max_digit < ch)
|
|
max_digit = ch;
|
|
|
|
*number = p - 1;
|
|
return max_digit;
|
|
}
|
|
|
|
/* Return an integer that represents the order of magnitude of the
|
|
unit following the number. The number may contain thousands
|
|
separators and a decimal point, but it may not contain leading blanks.
|
|
Negative numbers get negative orders; zero numbers have a zero order. */
|
|
|
|
ATTRIBUTE_PURE
|
|
static int
|
|
find_unit_order (char const *number)
|
|
{
|
|
bool minus_sign = (*number == '-');
|
|
char const *p = number + minus_sign;
|
|
char max_digit = traverse_raw_number (&p);
|
|
if ('0' < max_digit)
|
|
{
|
|
unsigned char ch = *p;
|
|
int order = unit_order[ch];
|
|
return (minus_sign ? -order : order);
|
|
}
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
/* Compare numbers A and B ending in units with SI or IEC prefixes
|
|
<none/unknown> < K/k < M < G < T < P < E < Z < Y < R < Q */
|
|
|
|
ATTRIBUTE_PURE
|
|
static int
|
|
human_numcompare (char const *a, char const *b)
|
|
{
|
|
while (blanks[to_uchar (*a)])
|
|
a++;
|
|
while (blanks[to_uchar (*b)])
|
|
b++;
|
|
|
|
int diff = find_unit_order (a) - find_unit_order (b);
|
|
return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep));
|
|
}
|
|
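/* A standalone sketch (not compiled into sort) of the two-stage -h comparison
   implemented by human_numcompare above: order first by the magnitude of the
   SI/IEC suffix, and only then by the numeric value.  The helpers below are
   hypothetical simplifications; sort itself uses unit_order and strnumcmp,
   not strtod.  */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Map a suffix letter to its order of magnitude: K/k=1, M=2, ..., Q=10.  */
static int
suffix_order (char const *s)
{
  char const *units = "KMGTPEZYRQ";
  size_t n = strspn (s, "-.0123456789");
  char c = s[n] == 'k' ? 'K' : s[n];
  char const *p = c ? strchr (units, c) : NULL;
  return p ? (int) (p - units) + 1 : 0;
}

static int
human_cmp (char const *a, char const *b)
{
  int diff = suffix_order (a) - suffix_order (b);
  if (diff)
    return diff;
  double x = strtod (a, NULL), y = strtod (b, NULL);
  return (x > y) - (x < y);
}

int
main (void)
{
  printf ("%d\n", human_cmp ("2K", "1M"));   /* negative: 2K sorts before 1M */
  printf ("%d\n", human_cmp ("3G", "2G"));   /* positive: 2G sorts before 3G */
  return 0;
}
#endif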
|
|
/* Compare strings A and B as numbers without explicitly converting them to
|
|
machine numbers. Comparatively slow for short strings, but asymptotically
|
|
hideously fast. */
|
|
|
|
ATTRIBUTE_PURE
|
|
static int
|
|
numcompare (char const *a, char const *b)
|
|
{
|
|
while (blanks[to_uchar (*a)])
|
|
a++;
|
|
while (blanks[to_uchar (*b)])
|
|
b++;
|
|
|
|
return strnumcmp (a, b, decimal_point, thousands_sep);
|
|
}
|
|
|
|
static int
|
|
nan_compare (long double a, long double b)
|
|
{
|
|
char buf[2][sizeof "-nan""()" + CHAR_BIT * sizeof a];
|
|
snprintf (buf[0], sizeof buf[0], "%Lf", a);
|
|
snprintf (buf[1], sizeof buf[1], "%Lf", b);
|
|
return strcmp (buf[0], buf[1]);
|
|
}
|
|
|
|
static int
|
|
general_numcompare (char const *sa, char const *sb)
|
|
{
|
|
/* FIXME: maybe add option to try expensive FP conversion
|
|
only if A and B can't be compared more cheaply/accurately. */
|
|
|
|
char *ea;
|
|
char *eb;
|
|
long double a = strtold (sa, &ea);
|
|
long double b = strtold (sb, &eb);
|
|
|
|
/* Put conversion errors at the start of the collating sequence. */
|
|
if (sa == ea)
|
|
return sb == eb ? 0 : -1;
|
|
if (sb == eb)
|
|
return 1;
|
|
|
|
/* Sort numbers in the usual way, where -0 == +0. Put NaNs after
|
|
conversion errors but before numbers; sort them by internal
|
|
bit-pattern, for lack of a more portable alternative. */
|
|
return (a < b ? -1
|
|
: a > b ? 1
|
|
: a == b ? 0
|
|
: b == b ? -1
|
|
: a == a ? 1
|
|
: nan_compare (a, b));
|
|
}
|
|
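/* A standalone sketch (not compiled into sort) of how the comparison chain in
   general_numcompare above classifies NaNs: every ordered comparison with a
   NaN is false, so the "b == b" and "a == a" tests detect which operand is
   the NaN.  The values below are only for demonstration.  */
#if 0
#include <math.h>
#include <stdio.h>

static int
chain (long double a, long double b)
{
  return (a < b ? -1
          : a > b ? 1
          : a == b ? 0
          : b == b ? -1   /* only A is NaN: it sorts before numbers */
          : a == a ? 1    /* only B is NaN: it sorts before A */
          : 0);           /* both NaN; sort breaks the tie with nan_compare */
}

int
main (void)
{
  printf ("%d\n", chain (NAN, 1.0L));   /* -1 */
  printf ("%d\n", chain (1.0L, NAN));   /*  1 */
  printf ("%d\n", chain (NAN, NAN));    /*  0 in this sketch */
  return 0;
}
#endif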
|
|
/* Return an integer in 1..12 for the month name MONTH.
   Return 0 if the name is not recognized.  */
|
|
|
|
static int
|
|
getmonth (char const *month, char **ea)
|
|
{
|
|
size_t lo = 0;
|
|
size_t hi = MONTHS_PER_YEAR;
|
|
|
|
while (blanks[to_uchar (*month)])
|
|
month++;
|
|
|
|
do
|
|
{
|
|
size_t ix = (lo + hi) / 2;
|
|
char const *m = month;
|
|
char const *n = monthtab[ix].name;
|
|
|
|
for (;; m++, n++)
|
|
{
|
|
if (!*n)
|
|
{
|
|
if (ea)
|
|
*ea = (char *) m;
|
|
return monthtab[ix].val;
|
|
}
|
|
if (to_uchar (fold_toupper[to_uchar (*m)]) < to_uchar (*n))
|
|
{
|
|
hi = ix;
|
|
break;
|
|
}
|
|
else if (to_uchar (fold_toupper[to_uchar (*m)]) > to_uchar (*n))
|
|
{
|
|
lo = ix + 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
while (lo < hi);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* When using the OpenSSL implementation, dynamically link only if -R.
|
|
This saves startup time in the usual (sans -R) case. */
|
|
|
|
#if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
|
|
/* In the typical case where md5.h does not #undef HAVE_OPENSSL_MD5,
|
|
trick md5.h into declaring and using pointers to functions not functions.
|
|
This causes the compiler's -lcrypto option to have no effect,
|
|
as sort.o no longer uses any crypto symbols statically. */
|
|
|
|
# if 14 <= __GNUC__
|
|
# pragma GCC diagnostic push
|
|
# pragma GCC diagnostic ignored "-Wmissing-variable-declarations"
|
|
# endif
|
|
# define MD5_Init (*ptr_MD5_Init)
|
|
# define MD5_Update (*ptr_MD5_Update)
|
|
# define MD5_Final (*ptr_MD5_Final)
|
|
#endif
|
|
|
|
#include "md5.h"
|
|
|
|
#if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
|
|
# if 14 <= __GNUC__
|
|
# pragma GCC diagnostic pop
|
|
# endif
|
|
# include <dlfcn.h>
|
|
|
|
/* Diagnose a dynamic linking failure. */
|
|
static void
|
|
link_failure (void)
|
|
{
|
|
error (SORT_FAILURE, 0, "%s", dlerror ());
|
|
}
|
|
|
|
/* Return a function pointer in HANDLE for SYMBOL. */
|
|
static void *
|
|
symbol_address (void *handle, char const *symbol)
|
|
{
|
|
void *address = dlsym (handle, symbol);
|
|
if (!address)
|
|
link_failure ();
|
|
return address;
|
|
}
|
|
#endif
|
|
|
|
/* Dynamically link the crypto library, if it needs linking. */
|
|
static void
|
|
link_libcrypto (void)
|
|
{
|
|
#if DLOPEN_LIBCRYPTO && HAVE_OPENSSL_MD5
|
|
void *handle = dlopen (LIBCRYPTO_SONAME, RTLD_LAZY | RTLD_GLOBAL);
|
|
if (!handle)
|
|
link_failure ();
|
|
ptr_MD5_Init = symbol_address (handle, "MD5_Init");
|
|
ptr_MD5_Update = symbol_address (handle, "MD5_Update");
|
|
ptr_MD5_Final = symbol_address (handle, "MD5_Final");
|
|
#endif
|
|
}
|
|
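/* A standalone sketch (not compiled into sort) of the lazy-linking pattern
   used above: a macro rewrites an ordinary function declaration into the
   declaration of a function pointer, which dlopen/dlsym fill in at run time.
   libm and "cos" are stand-ins for libcrypto and MD5_Init; the library name
   below is glibc-specific and purely illustrative.  Link with -ldl.  */
#if 0
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>

#define my_cos (*ptr_my_cos)
double my_cos (double);   /* expands to: double (*ptr_my_cos) (double); */

int
main (void)
{
  void *handle = dlopen ("libm.so.6", RTLD_LAZY);
  if (!handle)
    {
      fprintf (stderr, "%s\n", dlerror ());
      return EXIT_FAILURE;
    }
  ptr_my_cos = (double (*) (double)) dlsym (handle, "cos");
  if (!ptr_my_cos)
    {
      fprintf (stderr, "%s\n", dlerror ());
      return EXIT_FAILURE;
    }
  printf ("cos (0) = %g\n", my_cos (0));   /* the call goes through the pointer */
  return EXIT_SUCCESS;
}
#endif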
|
|
/* A randomly chosen MD5 state, used for random comparison. */
|
|
static struct md5_ctx random_md5_state;
|
|
|
|
/* Initialize the randomly chosen MD5 state. */
|
|
|
|
static void
|
|
random_md5_state_init (char const *random_source)
|
|
{
|
|
unsigned char buf[MD5_DIGEST_SIZE];
|
|
struct randread_source *r = randread_new (random_source, sizeof buf);
|
|
if (! r)
|
|
sort_die (_("open failed"), random_source ? random_source : "getrandom");
|
|
randread (r, buf, sizeof buf);
|
|
if (randread_free (r) != 0)
|
|
sort_die (_("close failed"), random_source);
|
|
link_libcrypto ();
|
|
md5_init_ctx (&random_md5_state);
|
|
md5_process_bytes (buf, sizeof buf, &random_md5_state);
|
|
}
|
|
|
|
/* This is like strxfrm, except it reports any error and exits. */
|
|
|
|
static size_t
|
|
xstrxfrm (char *restrict dest, char const *restrict src, size_t destsize)
|
|
{
|
|
errno = 0;
|
|
size_t translated_size = strxfrm (dest, src, destsize);
|
|
|
|
if (errno)
|
|
{
|
|
error (0, errno, _("string transformation failed"));
|
|
error (0, 0, _("set LC_ALL='C' to work around the problem"));
|
|
error (SORT_FAILURE, 0,
|
|
_("the original string was %s"),
|
|
quotearg_n_style (0, locale_quoting_style, src));
|
|
}
|
|
|
|
return translated_size;
|
|
}
|
|
|
|
/* Compare the keys TEXTA (of length LENA) and TEXTB (of length LENB)
|
|
using one or more random hash functions. TEXTA[LENA] and
|
|
TEXTB[LENB] must be zero. */
|
|
|
|
static int
|
|
compare_random (char *restrict texta, size_t lena,
|
|
char *restrict textb, size_t lenb)
|
|
{
|
|
/* XFRM_DIFF records the equivalent of memcmp on the transformed
|
|
data. This is used to break ties if there is a checksum
|
|
collision, and this is good enough given the astronomically low
|
|
probability of a collision. */
|
|
int xfrm_diff = 0;
|
|
|
|
char stackbuf[4000];
|
|
char *buf = stackbuf;
|
|
size_t bufsize = sizeof stackbuf;
|
|
void *allocated = nullptr;
|
|
uint32_t dig[2][MD5_DIGEST_SIZE / sizeof (uint32_t)];
|
|
struct md5_ctx s[2];
|
|
s[0] = s[1] = random_md5_state;
|
|
|
|
if (hard_LC_COLLATE)
|
|
{
|
|
char const *lima = texta + lena;
|
|
char const *limb = textb + lenb;
|
|
|
|
while (true)
|
|
{
|
|
          /* Transform the text into the basis of comparison, so that byte
             strings that would otherwise be considered to be equal are
             considered equal here even if their bytes differ.

             Each time through this loop, transform one
             null-terminated string's worth from TEXTA or from TEXTB
             or both.  That way, there's no need to store the
             transformation of the whole line, if it contains many
             null-terminated strings.  */
|
|
|
|
/* Store the transformed data into a big-enough buffer. */
|
|
|
|
/* A 3X size guess avoids the overhead of calling strxfrm
|
|
twice on typical implementations. Don't worry about
|
|
size_t overflow, as the guess need not be correct. */
|
|
size_t guess_bufsize = 3 * (lena + lenb) + 2;
|
|
if (bufsize < guess_bufsize)
|
|
{
|
|
bufsize = MAX (guess_bufsize, bufsize * 3 / 2);
|
|
free (allocated);
|
|
buf = allocated = malloc (bufsize);
|
|
if (! buf)
|
|
{
|
|
buf = stackbuf;
|
|
bufsize = sizeof stackbuf;
|
|
}
|
|
}
|
|
|
|
size_t sizea =
|
|
(texta < lima ? xstrxfrm (buf, texta, bufsize) + 1 : 0);
|
|
bool a_fits = sizea <= bufsize;
|
|
size_t sizeb =
|
|
(textb < limb
|
|
? (xstrxfrm ((a_fits ? buf + sizea : nullptr), textb,
|
|
(a_fits ? bufsize - sizea : 0))
|
|
+ 1)
|
|
: 0);
|
|
|
|
if (! (a_fits && sizea + sizeb <= bufsize))
|
|
{
|
|
bufsize = sizea + sizeb;
|
|
if (bufsize < SIZE_MAX / 3)
|
|
bufsize = bufsize * 3 / 2;
|
|
free (allocated);
|
|
buf = allocated = xmalloc (bufsize);
|
|
if (texta < lima)
|
|
strxfrm (buf, texta, sizea);
|
|
if (textb < limb)
|
|
strxfrm (buf + sizea, textb, sizeb);
|
|
}
|
|
|
|
/* Advance past NULs to the next part of each input string,
|
|
exiting the loop if both strings are exhausted. When
|
|
exiting the loop, prepare to finish off the tiebreaker
|
|
comparison properly. */
|
|
if (texta < lima)
|
|
texta += strlen (texta) + 1;
|
|
if (textb < limb)
|
|
textb += strlen (textb) + 1;
|
|
if (! (texta < lima || textb < limb))
|
|
{
|
|
lena = sizea; texta = buf;
|
|
lenb = sizeb; textb = buf + sizea;
|
|
break;
|
|
}
|
|
|
|
/* Accumulate the transformed data in the corresponding
|
|
checksums. */
|
|
md5_process_bytes (buf, sizea, &s[0]);
|
|
md5_process_bytes (buf + sizea, sizeb, &s[1]);
|
|
|
|
/* Update the tiebreaker comparison of the transformed data. */
|
|
if (! xfrm_diff)
|
|
{
|
|
xfrm_diff = memcmp (buf, buf + sizea, MIN (sizea, sizeb));
|
|
if (! xfrm_diff)
|
|
xfrm_diff = _GL_CMP (sizea, sizeb);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Compute and compare the checksums. */
|
|
md5_process_bytes (texta, lena, &s[0]); md5_finish_ctx (&s[0], dig[0]);
|
|
md5_process_bytes (textb, lenb, &s[1]); md5_finish_ctx (&s[1], dig[1]);
|
|
int diff = memcmp (dig[0], dig[1], sizeof dig[0]);
|
|
|
|
/* Fall back on the tiebreaker if the checksums collide. */
|
|
if (! diff)
|
|
{
|
|
if (! xfrm_diff)
|
|
{
|
|
xfrm_diff = memcmp (texta, textb, MIN (lena, lenb));
|
|
if (! xfrm_diff)
|
|
xfrm_diff = _GL_CMP (lena, lenb);
|
|
}
|
|
|
|
diff = xfrm_diff;
|
|
}
|
|
|
|
free (allocated);
|
|
|
|
return diff;
|
|
}
|
|
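/* A standalone sketch (not compiled into sort) of the idea behind
   compare_random above: hash each key together with a per-run random seed
   and compare the digests, so the order changes from run to run while equal
   keys always stay together.  A seeded FNV-1a-style hash stands in for the
   seeded MD5 state used by sort; everything here is hypothetical.  */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
seeded_hash (uint64_t seed, char const *s)
{
  uint64_t h = seed ^ 0xcbf29ce484222325ULL;           /* FNV offset basis */
  for (; *s; s++)
    h = (h ^ (unsigned char) *s) * 1099511628211ULL;   /* FNV prime */
  return h;
}

static int
random_cmp (uint64_t seed, char const *a, char const *b)
{
  uint64_t ha = seeded_hash (seed, a), hb = seeded_hash (seed, b);
  if (ha != hb)
    return ha < hb ? -1 : 1;
  return strcmp (a, b);    /* tiebreak, like sort's xfrm_diff fallback */
}

int
main (void)
{
  uint64_t seed = 0x243f6a8885a308d3ULL;   /* sort gets this from randread */
  printf ("%d\n", random_cmp (seed, "apple", "banana"));
  printf ("%d\n", random_cmp (seed, "apple", "apple"));   /* always 0 */
  return 0;
}
#endif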
|
|
/* Return the printable width of the block of memory starting at
|
|
TEXT and ending just before LIM, counting each tab as one byte.
|
|
FIXME: Should we generally be counting non printable chars? */
|
|
|
|
static size_t
|
|
debug_width (char const *text, char const *lim)
|
|
{
|
|
size_t width = mbsnwidth (text, lim - text, 0);
|
|
while (text < lim)
|
|
width += (*text++ == '\t');
|
|
return width;
|
|
}
|
|
|
|
/* For debug mode, "underline" a key at the
|
|
specified offset and screen width. */
|
|
|
|
static void
|
|
mark_key (size_t offset, size_t width)
|
|
{
|
|
while (offset--)
|
|
putchar (' ');
|
|
|
|
if (!width)
|
|
printf (_("^ no match for key\n"));
|
|
else
|
|
{
|
|
do
|
|
putchar ('_');
|
|
while (--width);
|
|
|
|
putchar ('\n');
|
|
}
|
|
}
|
|
|
|
/* Return true if KEY is a numeric key. */
|
|
|
|
static inline bool
|
|
key_numeric (struct keyfield const *key)
|
|
{
|
|
return key->numeric || key->general_numeric || key->human_numeric;
|
|
}
|
|
|
|
/* For LINE, output a debugging line that underlines KEY in LINE.
|
|
If KEY is null, underline the whole line. */
|
|
|
|
static void
|
|
debug_key (struct line const *line, struct keyfield const *key)
|
|
{
|
|
char *text = line->text;
|
|
char *beg = text;
|
|
char *lim = text + line->length - 1;
|
|
|
|
if (key)
|
|
{
|
|
if (key->sword != SIZE_MAX)
|
|
beg = begfield (line, key);
|
|
if (key->eword != SIZE_MAX)
|
|
{
|
|
lim = limfield (line, key);
|
|
/* Treat field ends before field starts as empty fields. */
|
|
lim = MAX (beg, lim);
|
|
}
|
|
|
|
if ((key->skipsblanks && key->sword == SIZE_MAX)
|
|
|| key->month || key_numeric (key))
|
|
{
|
|
char saved = *lim;
|
|
*lim = '\0';
|
|
|
|
while (blanks[to_uchar (*beg)])
|
|
beg++;
|
|
|
|
char *tighter_lim = beg;
|
|
|
|
if (lim < beg)
|
|
tighter_lim = lim;
|
|
else if (key->month)
|
|
getmonth (beg, &tighter_lim);
|
|
else if (key->general_numeric)
|
|
ignore_value (strtold (beg, &tighter_lim));
|
|
else if (key->numeric || key->human_numeric)
|
|
{
|
|
char const *p = beg + (beg < lim && *beg == '-');
|
|
char max_digit = traverse_raw_number (&p);
|
|
if ('0' <= max_digit)
|
|
{
|
|
unsigned char ch = *p;
|
|
tighter_lim = (char *) p
|
|
+ (key->human_numeric && unit_order[ch]);
|
|
}
|
|
}
|
|
else
|
|
tighter_lim = lim;
|
|
|
|
*lim = saved;
|
|
lim = tighter_lim;
|
|
}
|
|
}
|
|
|
|
size_t offset = debug_width (text, beg);
|
|
size_t width = debug_width (beg, lim);
|
|
mark_key (offset, width);
|
|
}
|
|
|
|
/* Debug LINE by underlining its keys. */
|
|
|
|
static void
|
|
debug_line (struct line const *line)
|
|
{
|
|
struct keyfield const *key = keylist;
|
|
|
|
do
|
|
debug_key (line, key);
|
|
while (key && ((key = key->next) || ! (unique || stable)));
|
|
}
|
|
|
|
/* Return true if no sorting options were specified for KEY. */
|
|
|
|
static bool
|
|
default_key_compare (struct keyfield const *key)
|
|
{
|
|
return ! (key->ignore
|
|
|| key->translate
|
|
|| key->skipsblanks
|
|
|| key->skipeblanks
|
|
|| key_numeric (key)
|
|
|| key->month
|
|
|| key->version
|
|
|| key->random
|
|
/* || key->reverse */
|
|
);
|
|
}
|
|
|
|
/* Convert a key to the short options used to specify it. */
|
|
|
|
static void
|
|
key_to_opts (struct keyfield const *key, char *opts)
|
|
{
|
|
if (key->skipsblanks || key->skipeblanks)
|
|
*opts++ = 'b';/* either disables global -b */
|
|
if (key->ignore == nondictionary)
|
|
*opts++ = 'd';
|
|
if (key->translate)
|
|
*opts++ = 'f';
|
|
if (key->general_numeric)
|
|
*opts++ = 'g';
|
|
if (key->human_numeric)
|
|
*opts++ = 'h';
|
|
if (key->ignore == nonprinting)
|
|
*opts++ = 'i';
|
|
if (key->month)
|
|
*opts++ = 'M';
|
|
if (key->numeric)
|
|
*opts++ = 'n';
|
|
if (key->random)
|
|
*opts++ = 'R';
|
|
if (key->reverse)
|
|
*opts++ = 'r';
|
|
if (key->version)
|
|
*opts++ = 'V';
|
|
*opts = '\0';
|
|
}
|
|
|
|
/* Output data independent key warnings to stderr. */
|
|
|
|
static void
|
|
key_warnings (struct keyfield const *gkey, bool gkey_only)
|
|
{
|
|
struct keyfield const *key;
|
|
struct keyfield ugkey = *gkey;
|
|
unsigned long keynum = 1;
|
|
bool basic_numeric_field = false;
|
|
bool general_numeric_field = false;
|
|
bool basic_numeric_field_span = false;
|
|
bool general_numeric_field_span = false;
|
|
|
|
for (key = keylist; key; key = key->next, keynum++)
|
|
{
|
|
if (key_numeric (key))
|
|
{
|
|
if (key->general_numeric)
|
|
general_numeric_field = true;
|
|
else
|
|
basic_numeric_field = true;
|
|
}
|
|
|
|
if (key->traditional_used)
|
|
{
|
|
size_t sword = key->sword;
|
|
size_t eword = key->eword;
|
|
char tmp[INT_BUFSIZE_BOUND (uintmax_t)];
|
|
/* obsolescent syntax +A.x -B.y is equivalent to:
|
|
-k A+1.x+1,B.y (when y = 0)
|
|
-k A+1.x+1,B+1.y (when y > 0) */
|
|
char obuf[INT_BUFSIZE_BOUND (sword) * 2 + 4]; /* +# -# */
|
|
char nbuf[INT_BUFSIZE_BOUND (sword) * 2 + 5]; /* -k #,# */
|
|
char *po = obuf;
|
|
char *pn = nbuf;
|
|
|
|
if (sword == SIZE_MAX)
|
|
sword++;
|
|
|
|
po = stpcpy (stpcpy (po, "+"), umaxtostr (sword, tmp));
|
|
pn = stpcpy (stpcpy (pn, "-k "), umaxtostr (sword + 1, tmp));
|
|
if (key->eword != SIZE_MAX)
|
|
{
|
|
stpcpy (stpcpy (po, " -"), umaxtostr (eword + 1, tmp));
|
|
stpcpy (stpcpy (pn, ","),
|
|
umaxtostr (eword + 1
|
|
+ (key->echar == SIZE_MAX), tmp));
|
|
}
|
|
error (0, 0, _("obsolescent key %s used; consider %s instead"),
|
|
quote_n (0, obuf), quote_n (1, nbuf));
|
|
}
|
|
|
|
/* Warn about field specs that will never match. */
|
|
bool zero_width = key->sword != SIZE_MAX && key->eword < key->sword;
|
|
if (zero_width)
|
|
error (0, 0, _("key %lu has zero width and will be ignored"), keynum);
|
|
|
|
/* Warn about significant leading blanks. */
|
|
bool implicit_skip = key_numeric (key) || key->month;
|
|
bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */
|
|
if (!zero_width && !gkey_only && tab == TAB_DEFAULT && !line_offset
|
|
&& ((!key->skipsblanks && !implicit_skip)
|
|
|| (!key->skipsblanks && key->schar)
|
|
|| (!key->skipeblanks && key->echar)))
|
|
error (0, 0, _("leading blanks are significant in key %lu; "
|
|
"consider also specifying 'b'"), keynum);
|
|
|
|
/* Warn about numeric comparisons spanning fields,
|
|
as field delimiters could be interpreted as part
|
|
of the number (maybe only in other locales). */
|
|
if (!gkey_only && key_numeric (key))
|
|
{
|
|
size_t sword = key->sword + 1;
|
|
size_t eword = key->eword + 1;
|
|
if (!sword)
|
|
sword++;
|
|
if (!eword || sword < eword)
|
|
{
|
|
error (0, 0, _("key %lu is numeric and spans multiple fields"),
|
|
keynum);
|
|
if (key->general_numeric)
|
|
general_numeric_field_span = true;
|
|
else
|
|
basic_numeric_field_span = true;
|
|
}
|
|
}
|
|
|
|
/* Flag global options not copied or specified in any key. */
|
|
if (ugkey.ignore && (ugkey.ignore == key->ignore))
|
|
ugkey.ignore = nullptr;
|
|
if (ugkey.translate && (ugkey.translate == key->translate))
|
|
ugkey.translate = nullptr;
|
|
ugkey.skipsblanks &= !key->skipsblanks;
|
|
ugkey.skipeblanks &= !key->skipeblanks;
|
|
ugkey.month &= !key->month;
|
|
ugkey.numeric &= !key->numeric;
|
|
ugkey.general_numeric &= !key->general_numeric;
|
|
ugkey.human_numeric &= !key->human_numeric;
|
|
ugkey.random &= !key->random;
|
|
ugkey.version &= !key->version;
|
|
ugkey.reverse &= !key->reverse;
|
|
}
|
|
|
|
/* Explicitly warn if field delimiters in this locale
|
|
don't constrain numbers. */
|
|
bool number_locale_warned = false;
|
|
if (basic_numeric_field_span)
|
|
{
|
|
if (tab == TAB_DEFAULT
|
|
? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))
|
|
: tab == thousands_sep)
|
|
{
|
|
error (0, 0,
|
|
_("field separator %s is treated as a "
|
|
"group separator in numbers"),
|
|
quote (((char []) {thousands_sep, 0})));
|
|
number_locale_warned = true;
|
|
}
|
|
}
|
|
if (basic_numeric_field_span || general_numeric_field_span)
|
|
{
|
|
if (tab == TAB_DEFAULT
|
|
? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))
|
|
: tab == decimal_point)
|
|
{
|
|
error (0, 0,
|
|
_("field separator %s is treated as a "
|
|
"decimal point in numbers"),
|
|
quote (((char []) {decimal_point, 0})));
|
|
number_locale_warned = true;
|
|
}
|
|
else if (tab == '-')
|
|
{
|
|
error (0, 0,
|
|
_("field separator %s is treated as a "
|
|
"minus sign in numbers"),
|
|
quote (((char []) {tab, 0})));
|
|
}
|
|
else if (general_numeric_field_span && tab == '+')
|
|
{
|
|
error (0, 0,
|
|
_("field separator %s is treated as a "
|
|
"plus sign in numbers"),
|
|
quote (((char []) {tab, 0})));
|
|
}
|
|
}
|
|
|
|
/* Explicitly indicate the decimal point used in this locale,
|
|
as it suggests that robust scripts need to consider
|
|
setting the locale when comparing numbers. */
|
|
if ((basic_numeric_field || general_numeric_field) && ! number_locale_warned)
|
|
{
|
|
error (0, 0,
|
|
_("numbers use %s as a decimal point in this locale"),
|
|
quote (((char []) {decimal_point, 0})));
|
|
|
|
}
|
|
|
|
if (basic_numeric_field && thousands_sep_ignored)
|
|
{
|
|
error (0, 0,
|
|
_("the multi-byte number group separator "
|
|
"in this locale is not supported"));
|
|
}
|
|
|
|
/* Warn about ignored global options flagged above.
|
|
This clears all flags if UGKEY is the only one in the list. */
|
|
if (!default_key_compare (&ugkey)
|
|
|| (ugkey.reverse && (stable || unique) && keylist))
|
|
{
|
|
bool ugkey_reverse = ugkey.reverse;
|
|
if (!(stable || unique))
|
|
ugkey.reverse = false;
|
|
/* The following is too big, but guaranteed to be "big enough". */
|
|
char opts[sizeof short_options];
|
|
key_to_opts (&ugkey, opts);
|
|
error (0, 0,
|
|
ngettext ("option '-%s' is ignored",
|
|
"options '-%s' are ignored",
|
|
select_plural (strlen (opts))), opts);
|
|
ugkey.reverse = ugkey_reverse;
|
|
}
|
|
if (ugkey.reverse && !(stable || unique) && keylist)
|
|
error (0, 0, _("option '-r' only applies to last-resort comparison"));
|
|
}
|
|
|
|
/* Return either the sense of DIFF or its reverse, depending on REVERSED.
|
|
If REVERSED, do not simply negate DIFF as that can mishandle INT_MIN. */
|
|
|
|
static int
|
|
diff_reversed (int diff, bool reversed)
|
|
{
|
|
return reversed ? _GL_CMP (0, diff) : diff;
|
|
}
|
|
|
|
/* Compare two lines A and B trying every key in sequence until there
|
|
are no more keys or a difference is found. */
|
|
|
|
static int
|
|
keycompare (struct line const *a, struct line const *b)
|
|
{
|
|
struct keyfield *key = keylist;
|
|
|
|
/* For the first iteration only, the key positions have been
|
|
precomputed for us. */
|
|
char *texta = a->keybeg;
|
|
char *textb = b->keybeg;
|
|
char *lima = a->keylim;
|
|
char *limb = b->keylim;
|
|
|
|
int diff;
|
|
|
|
while (true)
|
|
{
|
|
char const *translate = key->translate;
|
|
bool const *ignore = key->ignore;
|
|
|
|
/* Treat field ends before field starts as empty fields. */
|
|
lima = MAX (texta, lima);
|
|
limb = MAX (textb, limb);
|
|
|
|
/* Find the lengths. */
|
|
size_t lena = lima - texta;
|
|
size_t lenb = limb - textb;
|
|
|
|
if (hard_LC_COLLATE || key_numeric (key)
|
|
|| key->month || key->random || key->version)
|
|
{
|
|
/* Ordinarily use the keys in-place, temporarily null-terminated. */
|
|
char *ta = texta;
|
|
char *tb = textb;
|
|
size_t tlena = lena;
|
|
size_t tlenb = lenb;
|
|
char enda = ta[tlena];
|
|
char endb = tb[tlenb];
|
|
|
|
void *allocated = nullptr;
|
|
char stackbuf[4000];
|
|
|
|
if (ignore || translate)
|
|
{
|
|
/* Compute with copies of the keys, which are the result of
|
|
translating or ignoring characters, and which need their
|
|
own storage. */
|
|
|
|
size_t i;
|
|
|
|
/* Allocate space for copies. */
|
|
size_t size = lena + 1 + lenb + 1;
|
|
if (size <= sizeof stackbuf)
|
|
ta = stackbuf;
|
|
else
|
|
ta = allocated = xmalloc (size);
|
|
tb = ta + lena + 1;
|
|
|
|
/* Put into each copy a version of the key in which the
|
|
requested characters are ignored or translated. */
|
|
for (tlena = i = 0; i < lena; i++)
|
|
if (! (ignore && ignore[to_uchar (texta[i])]))
|
|
ta[tlena++] = (translate
|
|
? translate[to_uchar (texta[i])]
|
|
: texta[i]);
|
|
|
|
for (tlenb = i = 0; i < lenb; i++)
|
|
if (! (ignore && ignore[to_uchar (textb[i])]))
|
|
tb[tlenb++] = (translate
|
|
? translate[to_uchar (textb[i])]
|
|
: textb[i]);
|
|
}
|
|
|
|
ta[tlena] = '\0';
|
|
tb[tlenb] = '\0';
|
|
|
|
if (key->numeric)
|
|
diff = numcompare (ta, tb);
|
|
else if (key->general_numeric)
|
|
diff = general_numcompare (ta, tb);
|
|
else if (key->human_numeric)
|
|
diff = human_numcompare (ta, tb);
|
|
else if (key->month)
|
|
diff = getmonth (ta, nullptr) - getmonth (tb, nullptr);
|
|
else if (key->random)
|
|
diff = compare_random (ta, tlena, tb, tlenb);
|
|
else if (key->version)
|
|
diff = filenvercmp (ta, tlena, tb, tlenb);
|
|
else
|
|
{
|
|
/* Locale-dependent string sorting. This is slower than
|
|
C-locale sorting, which is implemented below. */
|
|
if (tlena == 0)
|
|
diff = - NONZERO (tlenb);
|
|
else if (tlenb == 0)
|
|
diff = 1;
|
|
else
|
|
diff = xmemcoll0 (ta, tlena + 1, tb, tlenb + 1);
|
|
}
|
|
|
|
ta[tlena] = enda;
|
|
tb[tlenb] = endb;
|
|
|
|
free (allocated);
|
|
}
|
|
else if (ignore)
|
|
{
|
|
#define CMP_WITH_IGNORE(A, B) \
|
|
do \
|
|
{ \
|
|
while (true) \
|
|
{ \
|
|
while (texta < lima && ignore[to_uchar (*texta)]) \
|
|
++texta; \
|
|
while (textb < limb && ignore[to_uchar (*textb)]) \
|
|
++textb; \
|
|
if (! (texta < lima && textb < limb)) \
|
|
{ \
|
|
diff = (texta < lima) - (textb < limb); \
|
|
break; \
|
|
} \
|
|
diff = to_uchar (A) - to_uchar (B); \
|
|
if (diff) \
|
|
break; \
|
|
++texta; \
|
|
++textb; \
|
|
} \
|
|
\
|
|
} \
|
|
while (0)
|
|
|
|
if (translate)
|
|
CMP_WITH_IGNORE (translate[to_uchar (*texta)],
|
|
translate[to_uchar (*textb)]);
|
|
else
|
|
CMP_WITH_IGNORE (*texta, *textb);
|
|
}
|
|
else
|
|
{
|
|
size_t lenmin = MIN (lena, lenb);
|
|
if (lenmin == 0)
|
|
diff = 0;
|
|
else if (translate)
|
|
{
|
|
size_t i = 0;
|
|
do
|
|
{
|
|
diff = (to_uchar (translate[to_uchar (texta[i])])
|
|
- to_uchar (translate[to_uchar (textb[i])]));
|
|
if (diff)
|
|
break;
|
|
i++;
|
|
}
|
|
while (i < lenmin);
|
|
}
|
|
else
|
|
diff = memcmp (texta, textb, lenmin);
|
|
|
|
if (! diff)
|
|
diff = _GL_CMP (lena, lenb);
|
|
}
|
|
|
|
if (diff)
|
|
break;
|
|
|
|
key = key->next;
|
|
if (! key)
|
|
return 0;
|
|
|
|
/* Find the beginning and limit of the next field. */
|
|
if (key->eword != SIZE_MAX)
|
|
lima = limfield (a, key), limb = limfield (b, key);
|
|
else
|
|
lima = a->text + a->length - 1, limb = b->text + b->length - 1;
|
|
|
|
if (key->sword != SIZE_MAX)
|
|
texta = begfield (a, key), textb = begfield (b, key);
|
|
else
|
|
{
|
|
texta = a->text, textb = b->text;
|
|
if (key->skipsblanks)
|
|
{
|
|
while (texta < lima && blanks[to_uchar (*texta)])
|
|
++texta;
|
|
while (textb < limb && blanks[to_uchar (*textb)])
|
|
++textb;
|
|
}
|
|
}
|
|
}
|
|
|
|
return diff_reversed (diff, key->reverse);
|
|
}
|
|
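/* A standalone sketch (not compiled into sort) of the control flow in
   keycompare above: walk the key list until some key yields a nonzero
   comparison and apply only that key's reverse flag, negating via the sign
   (as diff_reversed does) rather than with a plain minus.  The struct and
   data below are hypothetical.  */
#if 0
#include <stdio.h>
#include <string.h>

struct demo_key { int field; int reverse; };

static int
chain_cmp (char const **a, char const **b,
           struct demo_key const *keys, int nkeys)
{
  for (int i = 0; i < nkeys; i++)
    {
      int diff = strcmp (a[keys[i].field], b[keys[i].field]);
      if (diff)
        return keys[i].reverse ? (diff < 0) - (0 < diff) : diff;
    }
  return 0;   /* all keys equal; sort would fall back to the whole line */
}

int
main (void)
{
  char const *rowa[] = { "smith", "alice" };
  char const *rowb[] = { "smith", "bob" };
  struct demo_key keys[] = { { 0, 0 }, { 1, 1 } };   /* roughly -k1,1 -k2,2r */
  printf ("%d\n", chain_cmp (rowa, rowb, keys, 2));  /* 1: "bob" row sorts first */
  return 0;
}
#endif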
|
|
/* Compare two lines A and B, returning negative, zero, or positive
|
|
depending on whether A compares less than, equal to, or greater than B. */
|
|
|
|
static int
|
|
compare (struct line const *a, struct line const *b)
|
|
{
|
|
int diff;
|
|
size_t alen, blen;
|
|
|
|
/* First try to compare on the specified keys (if any).
|
|
The only two cases with no key at all are unadorned sort,
|
|
and unadorned sort -r. */
|
|
if (keylist)
|
|
{
|
|
diff = keycompare (a, b);
|
|
if (diff || unique || stable)
|
|
return diff;
|
|
}
|
|
|
|
/* If the keys all compare equal (or no keys were specified)
|
|
fall through to the default comparison. */
|
|
alen = a->length - 1, blen = b->length - 1;
|
|
|
|
if (alen == 0)
|
|
diff = - NONZERO (blen);
|
|
else if (blen == 0)
|
|
diff = 1;
|
|
else if (hard_LC_COLLATE)
|
|
{
|
|
/* xmemcoll0 is a performance enhancement as
|
|
it will not unconditionally write '\0' after the
|
|
passed in buffers, which was seen to give around
|
|
a 3% increase in performance for short lines. */
|
|
diff = xmemcoll0 (a->text, alen + 1, b->text, blen + 1);
|
|
}
|
|
else
|
|
{
|
|
diff = memcmp (a->text, b->text, MIN (alen, blen));
|
|
if (!diff)
|
|
diff = _GL_CMP (alen, blen);
|
|
}
|
|
|
|
return diff_reversed (diff, reverse);
|
|
}
|
|
|
|
/* Write LINE to output stream FP; the output file's name is
|
|
OUTPUT_FILE if OUTPUT_FILE is non-null, and is the standard output
|
|
otherwise. If debugging is enabled and FP is standard output,
|
|
append some debugging information. */
|
|
|
|
static void
|
|
write_line (struct line const *line, FILE *fp, char const *output_file)
|
|
{
|
|
char *buf = line->text;
|
|
size_t n_bytes = line->length;
|
|
char *ebuf = buf + n_bytes;
|
|
|
|
if (!output_file && debug)
|
|
{
|
|
/* Convert TAB to '>' and EOL to \n, and then output debugging info. */
|
|
char const *c = buf;
|
|
|
|
while (c < ebuf)
|
|
{
|
|
char wc = *c++;
|
|
if (wc == '\t')
|
|
wc = '>';
|
|
else if (c == ebuf)
|
|
wc = '\n';
|
|
if (fputc (wc, fp) == EOF)
|
|
sort_die (_("write failed"), output_file);
|
|
}
|
|
|
|
debug_line (line);
|
|
}
|
|
else
|
|
{
|
|
ebuf[-1] = eolchar;
|
|
if (fwrite (buf, 1, n_bytes, fp) != n_bytes)
|
|
sort_die (_("write failed"), output_file);
|
|
ebuf[-1] = '\0';
|
|
}
|
|
}
|
|
|
|
/* Check that the lines read from FILE_NAME come in order. Return
|
|
true if they are in order. If CHECKONLY == 'c', also print a
|
|
diagnostic (FILE_NAME, line number, contents of line) to stderr if
|
|
they are not in order. */
|
|
|
|
static bool
|
|
check (char const *file_name, char checkonly)
|
|
{
|
|
FILE *fp = xfopen (file_name, "r");
|
|
struct buffer buf; /* Input buffer. */
|
|
struct line temp; /* Copy of previous line. */
|
|
size_t alloc = 0;
|
|
uintmax_t line_number = 0;
|
|
struct keyfield const *key = keylist;
|
|
bool nonunique = ! unique;
|
|
bool ordered = true;
|
|
|
|
initbuf (&buf, sizeof (struct line),
|
|
MAX (merge_buffer_size, sort_size));
|
|
temp.text = nullptr;
|
|
|
|
while (fillbuf (&buf, fp, file_name))
|
|
{
|
|
struct line const *line = buffer_linelim (&buf);
|
|
struct line const *linebase = line - buf.nlines;
|
|
|
|
/* Make sure the line saved from the old buffer contents is
|
|
less than or equal to the first line of the new buffer. */
|
|
if (alloc && nonunique <= compare (&temp, line - 1))
|
|
{
|
|
found_disorder:
|
|
{
|
|
if (checkonly == 'c')
|
|
{
|
|
struct line const *disorder_line = line - 1;
|
|
uintmax_t disorder_line_number =
|
|
buffer_linelim (&buf) - disorder_line + line_number;
|
|
fprintf (stderr, _("%s: %s:%ju: disorder: "),
|
|
program_name, file_name, disorder_line_number);
|
|
write_line (disorder_line, stderr, _("standard error"));
|
|
}
|
|
|
|
ordered = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Compare each line in the buffer with its successor. */
|
|
while (linebase < --line)
|
|
if (nonunique <= compare (line, line - 1))
|
|
goto found_disorder;
|
|
|
|
line_number += buf.nlines;
|
|
|
|
/* Save the last line of the buffer. */
|
|
if (alloc < line->length)
|
|
{
|
|
do
|
|
{
|
|
alloc *= 2;
|
|
if (! alloc)
|
|
{
|
|
alloc = line->length;
|
|
break;
|
|
}
|
|
}
|
|
while (alloc < line->length);
|
|
|
|
free (temp.text);
|
|
temp.text = xmalloc (alloc);
|
|
}
|
|
memcpy (temp.text, line->text, line->length);
|
|
temp.length = line->length;
|
|
if (key)
|
|
{
|
|
temp.keybeg = temp.text + (line->keybeg - line->text);
|
|
temp.keylim = temp.text + (line->keylim - line->text);
|
|
}
|
|
}
|
|
|
|
xfclose (fp, file_name);
|
|
free (buf.buf);
|
|
free (temp.text);
|
|
return ordered;
|
|
}
|
|
|
|
/* Open FILES (there are NFILES of them) and store the resulting array
|
|
of stream pointers into (*PFPS). Allocate the array. Return the
|
|
number of successfully opened files, setting errno if this value is
|
|
less than NFILES. */
|
|
|
|
static size_t
|
|
open_input_files (struct sortfile *files, size_t nfiles, FILE ***pfps)
|
|
{
|
|
FILE **fps = *pfps = xnmalloc (nfiles, sizeof *fps);
|
|
int i;
|
|
|
|
/* Open as many input files as we can. */
|
|
for (i = 0; i < nfiles; i++)
|
|
{
|
|
fps[i] = (files[i].temp && files[i].temp->state != UNCOMPRESSED
|
|
? open_temp (files[i].temp)
|
|
: stream_open (files[i].name, "r"));
|
|
if (!fps[i])
|
|
break;
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
/* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
|
|
files (all of which are at the start of the FILES array), and
|
|
NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
|
|
   FPS is the vector of open streams corresponding to the files.
|
|
Close input and output streams before returning.
|
|
OUTPUT_FILE gives the name of the output file. If it is null,
|
|
the output file is standard output. */
|
|
|
|
static void
|
|
mergefps (struct sortfile *files, size_t ntemps, size_t nfiles,
|
|
FILE *ofp, char const *output_file, FILE **fps)
|
|
{
|
|
struct buffer *buffer = xnmalloc (nfiles, sizeof *buffer);
|
|
/* Input buffers for each file. */
|
|
struct line saved; /* Saved line storage for unique check. */
|
|
struct line const *savedline = nullptr;
|
|
/* &saved if there is a saved line. */
|
|
size_t savealloc = 0; /* Size allocated for the saved line. */
|
|
struct line const **cur = xnmalloc (nfiles, sizeof *cur);
|
|
/* Current line in each line table. */
|
|
struct line const **base = xnmalloc (nfiles, sizeof *base);
|
|
/* Base of each line table. */
|
|
size_t *ord = xnmalloc (nfiles, sizeof *ord);
|
|
/* Table representing a permutation of fps,
|
|
such that cur[ord[0]] is the smallest line
|
|
and will be next output. */
|
|
size_t i;
|
|
size_t j;
|
|
size_t t;
|
|
struct keyfield const *key = keylist;
|
|
saved.text = nullptr;
|
|
|
|
/* Read initial lines from each input file. */
|
|
for (i = 0; i < nfiles; )
|
|
{
|
|
initbuf (&buffer[i], sizeof (struct line),
|
|
MAX (merge_buffer_size, sort_size / nfiles));
|
|
if (fillbuf (&buffer[i], fps[i], files[i].name))
|
|
{
|
|
struct line const *linelim = buffer_linelim (&buffer[i]);
|
|
cur[i] = linelim - 1;
|
|
base[i] = linelim - buffer[i].nlines;
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
/* fps[i] is empty; eliminate it from future consideration. */
|
|
xfclose (fps[i], files[i].name);
|
|
if (i < ntemps)
|
|
{
|
|
ntemps--;
|
|
zaptemp (files[i].name);
|
|
}
|
|
free (buffer[i].buf);
|
|
--nfiles;
|
|
for (j = i; j < nfiles; ++j)
|
|
{
|
|
files[j] = files[j + 1];
|
|
fps[j] = fps[j + 1];
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Set up the ord table according to comparisons among input lines.
|
|
Since this only reorders two items if one is strictly greater than
|
|
the other, it is stable. */
|
|
for (i = 0; i < nfiles; ++i)
|
|
ord[i] = i;
|
|
for (i = 1; i < nfiles; ++i)
|
|
if (0 < compare (cur[ord[i - 1]], cur[ord[i]]))
|
|
t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
|
|
|
|
/* Repeatedly output the smallest line until no input remains. */
|
|
while (nfiles)
|
|
{
|
|
struct line const *smallest = cur[ord[0]];
|
|
|
|
/* If uniquified output is turned on, output only the first of
|
|
an identical series of lines. */
|
|
if (unique)
|
|
{
|
|
if (savedline && compare (savedline, smallest))
|
|
{
|
|
savedline = nullptr;
|
|
write_line (&saved, ofp, output_file);
|
|
}
|
|
if (!savedline)
|
|
{
|
|
savedline = &saved;
|
|
if (savealloc < smallest->length)
|
|
{
|
|
do
|
|
if (! savealloc)
|
|
{
|
|
savealloc = smallest->length;
|
|
break;
|
|
}
|
|
while ((savealloc *= 2) < smallest->length);
|
|
|
|
free (saved.text);
|
|
saved.text = xmalloc (savealloc);
|
|
}
|
|
saved.length = smallest->length;
|
|
memcpy (saved.text, smallest->text, saved.length);
|
|
if (key)
|
|
{
|
|
saved.keybeg =
|
|
saved.text + (smallest->keybeg - smallest->text);
|
|
saved.keylim =
|
|
saved.text + (smallest->keylim - smallest->text);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
write_line (smallest, ofp, output_file);
|
|
|
|
/* Check if we need to read more lines into memory. */
|
|
if (base[ord[0]] < smallest)
|
|
cur[ord[0]] = smallest - 1;
|
|
else
|
|
{
|
|
if (fillbuf (&buffer[ord[0]], fps[ord[0]], files[ord[0]].name))
|
|
{
|
|
struct line const *linelim = buffer_linelim (&buffer[ord[0]]);
|
|
cur[ord[0]] = linelim - 1;
|
|
base[ord[0]] = linelim - buffer[ord[0]].nlines;
|
|
}
|
|
else
|
|
{
|
|
/* We reached EOF on fps[ord[0]]. */
|
|
for (i = 1; i < nfiles; ++i)
|
|
if (ord[i] > ord[0])
|
|
--ord[i];
|
|
--nfiles;
|
|
xfclose (fps[ord[0]], files[ord[0]].name);
|
|
if (ord[0] < ntemps)
|
|
{
|
|
ntemps--;
|
|
zaptemp (files[ord[0]].name);
|
|
}
|
|
free (buffer[ord[0]].buf);
|
|
for (i = ord[0]; i < nfiles; ++i)
|
|
{
|
|
fps[i] = fps[i + 1];
|
|
files[i] = files[i + 1];
|
|
buffer[i] = buffer[i + 1];
|
|
cur[i] = cur[i + 1];
|
|
base[i] = base[i + 1];
|
|
}
|
|
for (i = 0; i < nfiles; ++i)
|
|
ord[i] = ord[i + 1];
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* The new line just read in may be larger than other lines
|
|
already in main memory; push it back in the queue until we
|
|
encounter a line larger than it. Optimize for the common
|
|
case where the new line is smallest. */
|
|
{
|
|
size_t lo = 1;
|
|
size_t hi = nfiles;
|
|
size_t probe = lo;
|
|
size_t ord0 = ord[0];
|
|
size_t count_of_smaller_lines;
|
|
|
|
while (lo < hi)
|
|
{
|
|
int cmp = compare (cur[ord0], cur[ord[probe]]);
|
|
if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
|
|
hi = probe;
|
|
else
|
|
lo = probe + 1;
|
|
probe = (lo + hi) / 2;
|
|
}
|
|
|
|
count_of_smaller_lines = lo - 1;
|
|
for (j = 0; j < count_of_smaller_lines; j++)
|
|
ord[j] = ord[j + 1];
|
|
ord[count_of_smaller_lines] = ord0;
|
|
}
|
|
}
|
|
|
|
if (unique && savedline)
|
|
{
|
|
write_line (&saved, ofp, output_file);
|
|
free (saved.text);
|
|
}
|
|
|
|
xfclose (ofp, output_file);
|
|
free (fps);
|
|
free (buffer);
|
|
free (ord);
|
|
free (base);
|
|
free (cur);
|
|
}
|
|
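/* A standalone sketch (not compiled into sort) of the ord[] maintenance in
   mergefps above: ord[0] names the stream whose current line is smallest;
   once that stream advances, binary-search its new rank among the others
   (ties going to the lower stream index, which keeps the merge stable) and
   rotate it into place.  Plain ints stand in for the current lines.  */
#if 0
#include <stdio.h>

static void
reinsert (size_t *ord, int const *key, size_t n)
{
  size_t lo = 1, hi = n, probe = 1, ord0 = ord[0];

  while (lo < hi)
    {
      int cmp = (key[ord0] > key[ord[probe]]) - (key[ord0] < key[ord[probe]]);
      if (cmp < 0 || (cmp == 0 && ord0 < ord[probe]))
        hi = probe;
      else
        lo = probe + 1;
      probe = (lo + hi) / 2;
    }

  for (size_t j = 0; j + 1 < lo; j++)   /* rotate smaller entries down */
    ord[j] = ord[j + 1];
  ord[lo - 1] = ord0;
}

int
main (void)
{
  int key[] = { 42, 7, 30 };    /* current line of streams 0, 1, 2 */
  size_t ord[] = { 0, 1, 2 };   /* stream 0 just advanced to key 42 */
  reinsert (ord, key, 3);
  printf ("%zu %zu %zu\n", ord[0], ord[1], ord[2]);   /* 1 2 0 */
  return 0;
}
#endif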
|
|
/* Merge lines from FILES onto OFP. NTEMPS is the number of temporary
|
|
files (all of which are at the start of the FILES array), and
|
|
NFILES is the number of files; 0 <= NTEMPS <= NFILES <= NMERGE.
|
|
Close input and output files before returning.
|
|
OUTPUT_FILE gives the name of the output file.
|
|
|
|
Return the number of files successfully merged. This number can be
|
|
less than NFILES if we ran low on file descriptors, but in this
|
|
case it is never less than 2. */
|
|
|
|
static size_t
|
|
mergefiles (struct sortfile *files, size_t ntemps, size_t nfiles,
|
|
FILE *ofp, char const *output_file)
|
|
{
|
|
FILE **fps;
|
|
size_t nopened = open_input_files (files, nfiles, &fps);
|
|
if (nopened < nfiles && nopened < 2)
|
|
sort_die (_("open failed"), files[nopened].name);
|
|
mergefps (files, ntemps, nopened, ofp, output_file, fps);
|
|
return nopened;
|
|
}
|
|
|
|
/* Merge into T (of size NLINES) the two sorted arrays of lines
|
|
LO (with NLINES / 2 members), and
|
|
T - (NLINES / 2) (with NLINES - NLINES / 2 members).
|
|
T and LO point just past their respective arrays, and the arrays
|
|
are in reverse order. NLINES must be at least 2. */
|
|
|
|
static void
|
|
mergelines (struct line *restrict t, size_t nlines,
|
|
struct line const *restrict lo)
|
|
{
|
|
size_t nlo = nlines / 2;
|
|
size_t nhi = nlines - nlo;
|
|
struct line *hi = t - nlo;
|
|
|
|
while (true)
|
|
if (compare (lo - 1, hi - 1) <= 0)
|
|
{
|
|
*--t = *--lo;
|
|
if (! --nlo)
|
|
{
|
|
/* HI must equal T now, and there is no need to copy from
|
|
HI to T. */
|
|
return;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*--t = *--hi;
|
|
if (! --nhi)
|
|
{
|
|
do
|
|
*--t = *--lo;
|
|
while (--nlo);
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Sort the array LINES with NLINES members, using TEMP for temporary space.
|
|
Do this all within one thread. NLINES must be at least 2.
|
|
If TO_TEMP, put the sorted output into TEMP, and TEMP is as large as LINES.
|
|
Otherwise the sort is in-place and TEMP is half-sized.
|
|
The input and output arrays are in reverse order, and LINES and
|
|
TEMP point just past the end of their respective arrays.
|
|
|
|
Use a recursive divide-and-conquer algorithm, in the style
|
|
suggested by Knuth volume 3 (2nd edition), exercise 5.2.4-23. Use
|
|
the optimization suggested by exercise 5.2.4-10; this requires room
|
|
for only 1.5*N lines, rather than the usual 2*N lines. Knuth
|
|
writes that this memory optimization was originally published by
|
|
D. A. Bell, Comp J. 1 (1958), 75. */
|
|
|
|
static void
|
|
sequential_sort (struct line *restrict lines, size_t nlines,
|
|
struct line *restrict temp, bool to_temp)
|
|
{
|
|
if (nlines == 2)
|
|
{
|
|
/* Declare 'swap' as int, not bool, to work around a bug
|
|
<https://lists.gnu.org/r/bug-coreutils/2005-10/msg00086.html>
|
|
in the IBM xlc 6.0.0.0 compiler in 64-bit mode. */
|
|
int swap = (0 < compare (&lines[-1], &lines[-2]));
|
|
if (to_temp)
|
|
{
|
|
temp[-1] = lines[-1 - swap];
|
|
temp[-2] = lines[-2 + swap];
|
|
}
|
|
else if (swap)
|
|
{
|
|
temp[-1] = lines[-1];
|
|
lines[-1] = lines[-2];
|
|
lines[-2] = temp[-1];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
size_t nlo = nlines / 2;
|
|
size_t nhi = nlines - nlo;
|
|
struct line *lo = lines;
|
|
struct line *hi = lines - nlo;
|
|
|
|
sequential_sort (hi, nhi, temp - (to_temp ? nlo : 0), to_temp);
|
|
if (1 < nlo)
|
|
sequential_sort (lo, nlo, temp, !to_temp);
|
|
else if (!to_temp)
|
|
temp[-1] = lo[-1];
|
|
|
|
struct line *dest;
|
|
struct line const *sorted_lo;
|
|
if (to_temp)
|
|
{
|
|
dest = temp;
|
|
sorted_lo = lines;
|
|
}
|
|
else
|
|
{
|
|
dest = lines;
|
|
sorted_lo = temp;
|
|
}
|
|
mergelines (dest, nlines, sorted_lo);
|
|
}
|
|
}
|
|
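/* A standalone sketch (not compiled into sort) of the memory point made in
   the comment above: a top-down merge sort that needs only n/2 elements of
   scratch space, because only the low half is copied out before merging.
   This is a simplified variant on ints; sequential_sort avoids even that
   copy by alternating which array each level's result lands in.  */
#if 0
#include <stdio.h>
#include <string.h>

static void
msort (int *a, size_t n, int *tmp)   /* tmp needs room for only n/2 ints */
{
  if (n < 2)
    return;
  size_t nlo = n / 2;
  msort (a, nlo, tmp);
  msort (a + nlo, n - nlo, tmp);

  memcpy (tmp, a, nlo * sizeof *a);  /* copy out the low half only */
  size_t i = 0, j = nlo, k = 0;
  while (i < nlo && j < n)
    a[k++] = tmp[i] <= a[j] ? tmp[i++] : a[j++];
  while (i < nlo)
    a[k++] = tmp[i++];               /* leftover low half */
  /* Leftover high-half elements are already in their final positions.  */
}

int
main (void)
{
  int a[] = { 5, 3, 8, 1, 9, 2 };
  int tmp[3];
  msort (a, 6, tmp);
  for (size_t i = 0; i < 6; i++)
    printf ("%d ", a[i]);            /* 1 2 3 5 8 9 */
  putchar ('\n');
  return 0;
}
#endif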
|
|
static struct merge_node *init_node (struct merge_node *restrict,
|
|
struct merge_node *restrict,
|
|
struct line *, size_t, size_t, bool);
|
|
|
|
|
|
/* Create and return a merge tree for NTHREADS threads, sorting NLINES
|
|
lines, with destination DEST. */
|
|
static struct merge_node *
|
|
merge_tree_init (size_t nthreads, size_t nlines, struct line *dest)
|
|
{
|
|
struct merge_node *merge_tree = xmalloc (2 * sizeof *merge_tree * nthreads);
|
|
|
|
struct merge_node *root = merge_tree;
|
|
root->lo = root->hi = root->end_lo = root->end_hi = nullptr;
|
|
root->dest = nullptr;
|
|
root->nlo = root->nhi = nlines;
|
|
root->parent = nullptr;
|
|
root->level = MERGE_END;
|
|
root->queued = false;
|
|
pthread_mutex_init (&root->lock, nullptr);
|
|
|
|
init_node (root, root + 1, dest, nthreads, nlines, false);
|
|
return merge_tree;
|
|
}
|
|
|
|
/* Destroy the merge tree. */
|
|
static void
|
|
merge_tree_destroy (size_t nthreads, struct merge_node *merge_tree)
|
|
{
|
|
size_t n_nodes = nthreads * 2;
|
|
struct merge_node *node = merge_tree;
|
|
|
|
while (n_nodes--)
|
|
{
|
|
pthread_mutex_destroy (&node->lock);
|
|
node++;
|
|
}
|
|
|
|
free (merge_tree);
|
|
}
|
|
|
|
/* Initialize a merge tree node and its descendants. The node's
|
|
parent is PARENT. The node and its descendants are taken from the
|
|
array of nodes NODE_POOL. Their destination starts at DEST; they
|
|
will consume NTHREADS threads. The total number of sort lines is
|
|
TOTAL_LINES. IS_LO_CHILD is true if the node is the low child of
|
|
its parent. */
|
|
|
|
static struct merge_node *
|
|
init_node (struct merge_node *restrict parent,
|
|
struct merge_node *restrict node_pool,
|
|
struct line *dest, size_t nthreads,
|
|
size_t total_lines, bool is_lo_child)
|
|
{
|
|
size_t nlines = (is_lo_child ? parent->nlo : parent->nhi);
|
|
size_t nlo = nlines / 2;
|
|
size_t nhi = nlines - nlo;
|
|
struct line *lo = dest - total_lines;
|
|
struct line *hi = lo - nlo;
|
|
struct line **parent_end = (is_lo_child ? &parent->end_lo : &parent->end_hi);
|
|
|
|
struct merge_node *node = node_pool++;
|
|
node->lo = node->end_lo = lo;
|
|
node->hi = node->end_hi = hi;
|
|
node->dest = parent_end;
|
|
node->nlo = nlo;
|
|
node->nhi = nhi;
|
|
node->parent = parent;
|
|
node->level = parent->level + 1;
|
|
node->queued = false;
|
|
pthread_mutex_init (&node->lock, nullptr);
|
|
|
|
if (nthreads > 1)
|
|
{
|
|
size_t lo_threads = nthreads / 2;
|
|
size_t hi_threads = nthreads - lo_threads;
|
|
node->lo_child = node_pool;
|
|
node_pool = init_node (node, node_pool, lo, lo_threads,
|
|
total_lines, true);
|
|
node->hi_child = node_pool;
|
|
node_pool = init_node (node, node_pool, hi, hi_threads,
|
|
total_lines, false);
|
|
}
|
|
else
|
|
{
|
|
node->lo_child = nullptr;
|
|
node->hi_child = nullptr;
|
|
}
|
|
return node_pool;
|
|
}
|
|
|
|
|
|
/* Compare two merge nodes A and B for priority. */
|
|
|
|
static int
|
|
compare_nodes (void const *a, void const *b)
|
|
{
|
|
struct merge_node const *nodea = a;
|
|
struct merge_node const *nodeb = b;
|
|
if (nodea->level == nodeb->level)
|
|
return (nodea->nlo + nodea->nhi) < (nodeb->nlo + nodeb->nhi);
|
|
return nodea->level < nodeb->level;
|
|
}
|
|
|
|
/* Lock a merge tree NODE. */
|
|
|
|
static inline void
|
|
lock_node (struct merge_node *node)
|
|
{
|
|
pthread_mutex_lock (&node->lock);
|
|
}
|
|
|
|
/* Unlock a merge tree NODE. */
|
|
|
|
static inline void
|
|
unlock_node (struct merge_node *node)
|
|
{
|
|
pthread_mutex_unlock (&node->lock);
|
|
}
|
|
|
|
/* Destroy merge QUEUE. */
|
|
|
|
static void
|
|
queue_destroy (struct merge_node_queue *queue)
|
|
{
|
|
heap_free (queue->priority_queue);
|
|
pthread_cond_destroy (&queue->cond);
|
|
pthread_mutex_destroy (&queue->mutex);
|
|
}
|
|
|
|
/* Initialize merge QUEUE, allocating space suitable for a maximum of
|
|
NTHREADS threads. */
|
|
|
|
static void
|
|
queue_init (struct merge_node_queue *queue, size_t nthreads)
|
|
{
|
|
/* Though it's highly unlikely all nodes are in the heap at the same
|
|
time, the heap should accommodate all of them. Counting a null
|
|
dummy head for the heap, reserve 2 * NTHREADS nodes. */
|
|
queue->priority_queue = heap_alloc (compare_nodes, 2 * nthreads);
|
|
pthread_mutex_init (&queue->mutex, nullptr);
|
|
pthread_cond_init (&queue->cond, nullptr);
|
|
}
|
|
|
|
/* Insert NODE into QUEUE. The caller either holds a lock on NODE, or
|
|
does not need to lock NODE. */
|
|
|
|
static void
|
|
queue_insert (struct merge_node_queue *queue, struct merge_node *node)
|
|
{
|
|
pthread_mutex_lock (&queue->mutex);
|
|
heap_insert (queue->priority_queue, node);
|
|
node->queued = true;
|
|
pthread_cond_signal (&queue->cond);
|
|
pthread_mutex_unlock (&queue->mutex);
|
|
}
|
|
|
|
/* Pop the top node off the priority QUEUE, lock the node, return it. */
|
|
|
|
static struct merge_node *
|
|
queue_pop (struct merge_node_queue *queue)
|
|
{
|
|
struct merge_node *node;
|
|
pthread_mutex_lock (&queue->mutex);
|
|
while (! (node = heap_remove_top (queue->priority_queue)))
|
|
pthread_cond_wait (&queue->cond, &queue->mutex);
|
|
pthread_mutex_unlock (&queue->mutex);
|
|
lock_node (node);
|
|
node->queued = false;
|
|
return node;
|
|
}
|
|
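/* A standalone sketch (not compiled into sort) of the blocking-pop idiom in
   queue_pop above: take the mutex, wait on the condition variable in a loop
   until an item is available, and rely on the producer's signal (as in
   queue_insert) to wake a waiter.  A single int slot stands in for the
   priority queue.  Build with -pthread.  */
#if 0
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int slot_full;
static int slot;

static void
put (int v)
{
  pthread_mutex_lock (&mtx);
  slot = v;
  slot_full = 1;
  pthread_cond_signal (&cond);   /* like queue_insert */
  pthread_mutex_unlock (&mtx);
}

static int
get (void)
{
  pthread_mutex_lock (&mtx);
  while (!slot_full)             /* the loop guards against spurious wakeups */
    pthread_cond_wait (&cond, &mtx);
  int v = slot;
  slot_full = 0;
  pthread_mutex_unlock (&mtx);
  return v;
}

static void *
producer (void *arg)
{
  (void) arg;
  put (42);
  return NULL;
}

int
main (void)
{
  pthread_t t;
  pthread_create (&t, NULL, producer, NULL);
  printf ("%d\n", get ());       /* blocks until the producer has run */
  pthread_join (t, NULL);
  return 0;
}
#endif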
|
|
/* Output LINE to TFP, unless -u is specified and the line compares
|
|
equal to the previous line. TEMP_OUTPUT is the name of TFP, or
|
|
is null if TFP is standard output.
|
|
|
|
This function does not save the line for comparison later, so it is
|
|
appropriate only for internal sort. */
|
|
|
|
static void
|
|
write_unique (struct line const *line, FILE *tfp, char const *temp_output)
|
|
{
|
|
if (unique)
|
|
{
|
|
if (saved_line.text && ! compare (line, &saved_line))
|
|
return;
|
|
saved_line = *line;
|
|
}
|
|
|
|
write_line (line, tfp, temp_output);
|
|
}
|
|
|
|
/* Merge the lines currently available to a NODE in the binary
|
|
merge tree. Merge a number of lines appropriate for this merge
|
|
level, assuming TOTAL_LINES is the total number of lines.
|
|
|
|
If merging at the top level, send output to TFP. TEMP_OUTPUT is
|
|
the name of TFP, or is null if TFP is standard output. */
|
|
|
|
static void
|
|
mergelines_node (struct merge_node *restrict node, size_t total_lines,
|
|
FILE *tfp, char const *temp_output)
|
|
{
|
|
struct line *lo_orig = node->lo;
|
|
struct line *hi_orig = node->hi;
|
|
size_t to_merge = MAX_MERGE (total_lines, node->level);
|
|
size_t merged_lo;
|
|
size_t merged_hi;
|
|
|
|
if (node->level > MERGE_ROOT)
|
|
{
|
|
/* Merge to destination buffer. */
|
|
struct line *dest = *node->dest;
|
|
while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
|
|
if (compare (node->lo - 1, node->hi - 1) <= 0)
|
|
*--dest = *--node->lo;
|
|
else
|
|
*--dest = *--node->hi;
|
|
|
|
merged_lo = lo_orig - node->lo;
|
|
merged_hi = hi_orig - node->hi;
|
|
|
|
if (node->nhi == merged_hi)
|
|
while (node->lo != node->end_lo && to_merge--)
|
|
*--dest = *--node->lo;
|
|
else if (node->nlo == merged_lo)
|
|
while (node->hi != node->end_hi && to_merge--)
|
|
*--dest = *--node->hi;
|
|
*node->dest = dest;
|
|
}
|
|
else
|
|
{
|
|
/* Merge directly to output. */
|
|
while (node->lo != node->end_lo && node->hi != node->end_hi && to_merge--)
|
|
{
|
|
if (compare (node->lo - 1, node->hi - 1) <= 0)
|
|
write_unique (--node->lo, tfp, temp_output);
|
|
else
|
|
write_unique (--node->hi, tfp, temp_output);
|
|
}
|
|
|
|
merged_lo = lo_orig - node->lo;
|
|
merged_hi = hi_orig - node->hi;
|
|
|
|
if (node->nhi == merged_hi)
|
|
{
|
|
while (node->lo != node->end_lo && to_merge--)
|
|
write_unique (--node->lo, tfp, temp_output);
|
|
}
|
|
else if (node->nlo == merged_lo)
|
|
{
|
|
while (node->hi != node->end_hi && to_merge--)
|
|
write_unique (--node->hi, tfp, temp_output);
|
|
}
|
|
}
|
|
|
|
/* Update NODE. */
|
|
merged_lo = lo_orig - node->lo;
|
|
merged_hi = hi_orig - node->hi;
|
|
node->nlo -= merged_lo;
|
|
node->nhi -= merged_hi;
|
|
}
|
|
|
|
/* Into QUEUE, insert NODE if it is not already queued, and if one of
|
|
NODE's children has available lines and the other either has
|
|
available lines or has exhausted its lines. */
|
|
|
|
static void
|
|
queue_check_insert (struct merge_node_queue *queue, struct merge_node *node)
|
|
{
|
|
if (! node->queued)
|
|
{
|
|
bool lo_avail = (node->lo - node->end_lo) != 0;
|
|
bool hi_avail = (node->hi - node->end_hi) != 0;
|
|
if (lo_avail ? hi_avail || ! node->nhi : hi_avail && ! node->nlo)
|
|
queue_insert (queue, node);
|
|
}
|
|
}
|
|
|
|
/* Into QUEUE, insert NODE's parent if the parent can now be worked on. */
|
|
|
|
static void
|
|
queue_check_insert_parent (struct merge_node_queue *queue,
|
|
struct merge_node *node)
|
|
{
|
|
if (node->level > MERGE_ROOT)
|
|
{
|
|
lock_node (node->parent);
|
|
queue_check_insert (queue, node->parent);
|
|
unlock_node (node->parent);
|
|
}
|
|
else if (node->nlo + node->nhi == 0)
|
|
{
|
|
/* If the MERGE_ROOT NODE has finished merging, insert the
|
|
MERGE_END node. */
|
|
queue_insert (queue, node->parent);
|
|
}
|
|
}
|
|
|
|
/* Repeatedly pop QUEUE for a node with lines to merge, and merge at least
|
|
some of those lines, until the MERGE_END node is popped.
|
|
TOTAL_LINES is the total number of lines. If merging at the top
|
|
level, send output to TFP. TEMP_OUTPUT is the name of TFP, or is
|
|
null if TFP is standard output. */
|
|
|
|
static void
|
|
merge_loop (struct merge_node_queue *queue,
|
|
size_t total_lines, FILE *tfp, char const *temp_output)
|
|
{
|
|
while (true)
|
|
{
|
|
struct merge_node *node = queue_pop (queue);
|
|
|
|
if (node->level == MERGE_END)
|
|
{
|
|
unlock_node (node);
|
|
/* Reinsert so other threads can pop it. */
|
|
queue_insert (queue, node);
|
|
break;
|
|
}
|
|
mergelines_node (node, total_lines, tfp, temp_output);
|
|
queue_check_insert (queue, node);
|
|
queue_check_insert_parent (queue, node);
|
|
|
|
unlock_node (node);
|
|
}
|
|
}
|
|
|
|
|
|
static void sortlines (struct line *restrict, size_t, size_t,
|
|
struct merge_node *, struct merge_node_queue *,
|
|
FILE *, char const *);
|
|
|
|
/* Thread arguments for sortlines_thread. */
|
|
|
|
struct thread_args
|
|
{
|
|
/* Source, i.e., the array of lines to sort. This points just past
|
|
the end of the array. */
|
|
struct line *lines;
|
|
|
|
/* Number of threads to use. If 0 or 1, sort single-threaded. */
|
|
size_t nthreads;
|
|
|
|
/* Number of lines in LINES and DEST. */
|
|
size_t const total_lines;
|
|
|
|
  /* Merge node.  Lines from this node and this node's sibling will be merged
|
|
to this node's parent. */
|
|
struct merge_node *const node;
|
|
|
|
/* The priority queue controlling available work for the entire
|
|
internal sort. */
|
|
struct merge_node_queue *const queue;
|
|
|
|
/* If at the top level, the file to output to, and the file's name.
|
|
If the file is standard output, the file's name is null. */
|
|
FILE *tfp;
|
|
char const *output_temp;
|
|
};
|
|
|
|
/* Like sortlines, except with a signature acceptable to pthread_create. */
|
|
|
|
static void *
|
|
sortlines_thread (void *data)
|
|
{
|
|
struct thread_args const *args = data;
|
|
sortlines (args->lines, args->nthreads, args->total_lines,
|
|
args->node, args->queue, args->tfp,
|
|
args->output_temp);
|
|
return nullptr;
|
|
}
|
|
|
|
/* Sort lines, possibly in parallel.  The arguments are as in struct
   thread_args above.

   The algorithm has three phases: node creation, sequential sort,
   and binary merge.

   During node creation, sortlines recursively visits each node in the
   binary merge tree and creates a NODE structure corresponding to all the
   future line merging NODE is responsible for.  For each call to
   sortlines, half the available threads are assigned to each recursive
   call, until a leaf node having only 1 available thread is reached.

   Each leaf node then performs two sequential sorts, one on each half of
   the lines it is responsible for.  It records in its NODE structure that
   there are two sorted sublists available to merge from, and inserts its
   NODE into the priority queue.

   The binary merge phase then begins.  Each thread drops into a loop
   where the thread retrieves a NODE from the priority queue, merges lines
   available to that NODE, and potentially inserts NODE or its parent back
   into the queue if there are sufficient available lines for them to
   merge.  This continues until all lines at all nodes of the merge tree
   have been merged. */
|
|
|
|
static void
|
|
sortlines (struct line *restrict lines, size_t nthreads,
|
|
size_t total_lines, struct merge_node *node,
|
|
struct merge_node_queue *queue, FILE *tfp, char const *temp_output)
|
|
{
|
|
size_t nlines = node->nlo + node->nhi;
|
|
|
|
/* Calculate thread arguments. */
|
|
size_t lo_threads = nthreads / 2;
|
|
size_t hi_threads = nthreads - lo_threads;
|
|
pthread_t thread;
|
|
struct thread_args args = {lines, lo_threads, total_lines,
|
|
node->lo_child, queue, tfp, temp_output};
|
|
|
|
if (nthreads > 1 && SUBTHREAD_LINES_HEURISTIC <= nlines
|
|
&& pthread_create (&thread, nullptr, sortlines_thread, &args) == 0)
|
|
{
|
|
sortlines (lines - node->nlo, hi_threads, total_lines,
|
|
node->hi_child, queue, tfp, temp_output);
|
|
pthread_join (thread, nullptr);
|
|
}
|
|
else
|
|
{
|
|
/* Nthreads = 1, this is a leaf NODE, or pthread_create failed.
|
|
Sort with 1 thread. */
|
|
size_t nlo = node->nlo;
|
|
size_t nhi = node->nhi;
|
|
struct line *temp = lines - total_lines;
|
|
if (1 < nhi)
|
|
sequential_sort (lines - nlo, nhi, temp - nlo / 2, false);
|
|
if (1 < nlo)
|
|
sequential_sort (lines, nlo, temp, false);
|
|
|
|
/* Update merge NODE. No need to lock yet. */
|
|
node->lo = lines;
|
|
node->hi = lines - nlo;
|
|
node->end_lo = lines - nlo;
|
|
node->end_hi = lines - nlo - nhi;
|
|
|
|
queue_insert (queue, node);
|
|
merge_loop (queue, total_lines, tfp, temp_output);
|
|
}
|
|
}
|
|
|
|
/* Scan through FILES[NTEMPS .. NFILES-1] looking for files that are
   the same as OUTFILE.  If found, replace each with the same
   temporary copy that can be merged into OUTFILE without destroying
   OUTFILE before it is completely read.  This temporary copy does not
   count as a merge temp, so don't worry about incrementing NTEMPS in
   the caller; final cleanup will remove it, not zaptemp.

   This test ensures that an otherwise-erroneous use like
   "sort -m -o FILE ... FILE ..." copies FILE before writing to it.
   It's not clear that POSIX requires this nicety.
   Detect common error cases, but don't try to catch obscure cases like
   "cat ... FILE ... | sort -m -o FILE"
   where traditional "sort" doesn't copy the input and where
   people should know that they're getting into trouble anyway.
   Catching these obscure cases would slow down performance in
   common cases.  */

static void
avoid_trashing_input (struct sortfile *files, size_t ntemps,
                      size_t nfiles, char const *outfile)
{
  struct tempnode *tempcopy = nullptr;

  for (size_t i = ntemps; i < nfiles; i++)
    {
      bool is_stdin = streq (files[i].name, "-");
      bool same;
      struct stat instat;

      if (outfile && streq (outfile, files[i].name) && !is_stdin)
        same = true;
      else
        {
          struct stat *outst = get_outstatus ();
          if (!outst)
            break;

          same = (((is_stdin
                    ? fstat (STDIN_FILENO, &instat)
                    : stat (files[i].name, &instat))
                   == 0)
                  && psame_inode (&instat, outst));
        }

      if (same)
        {
          if (! tempcopy)
            {
              FILE *tftp;
              tempcopy = create_temp (&tftp);
              mergefiles (&files[i], 0, 1, tftp, tempcopy->name);
            }

          files[i].name = tempcopy->name;
          files[i].temp = tempcopy;
        }
    }
}

/* Scan the input files to ensure all are accessible.
   Otherwise exit with a diagnostic.

   This will catch common issues with permissions etc.
   but will fail to notice issues where you can open but not read,
   like when a directory is specified on some systems.
   Catching these obscure cases could slow down performance in
   common cases.  */

static void
check_inputs (char *const *files, size_t nfiles)
{
  for (size_t i = 0; i < nfiles; i++)
    {
      if (streq (files[i], "-"))
        continue;

      if (euidaccess (files[i], R_OK) != 0)
        sort_die (_("cannot read"), files[i]);
    }
}

/* Ensure a specified output file can be created or written to,
   and point stdout to it.  Do not truncate the file.
   Exit with a diagnostic on failure.  */

static void
check_output (char const *outfile)
{
  if (outfile)
    {
      int oflags = O_WRONLY | O_BINARY | O_CLOEXEC | O_CREAT;
      int outfd = open (outfile, oflags, MODE_RW_UGO);
      if (outfd < 0)
        sort_die (_("open failed"), outfile);
      move_fd (outfd, STDOUT_FILENO);
    }
}

/* Merge the input FILES.  NTEMPS is the number of files at the
   start of FILES that are temporary; it is zero at the top level.
   NFILES is the total number of files.  Put the output in
   OUTPUT_FILE; a null OUTPUT_FILE stands for standard output.  */

static void
merge (struct sortfile *files, size_t ntemps, size_t nfiles,
       char const *output_file)
{
  while (nmerge < nfiles)
    {
      /* Number of input files processed so far.  */
      size_t in;

      /* Number of output files generated so far.  */
      size_t out;

      /* nfiles % NMERGE; this counts input files that are left over
         after all full-sized merges have been done.  */
      size_t remainder;

      /* Number of easily-available slots at the next loop iteration.  */
      size_t cheap_slots;

      /* Do as many NMERGE-size merges as possible.  In the case that
         nmerge is bogus, increment by the maximum number of file
         descriptors allowed.  */
      for (out = in = 0; nmerge <= nfiles - in; out++)
        {
          FILE *tfp;
          struct tempnode *temp = create_temp (&tfp);
          size_t num_merged = mergefiles (&files[in], MIN (ntemps, nmerge),
                                          nmerge, tfp, temp->name);
          ntemps -= MIN (ntemps, num_merged);
          files[out].name = temp->name;
          files[out].temp = temp;
          in += num_merged;
        }

      remainder = nfiles - in;
      cheap_slots = nmerge - out % nmerge;

      if (cheap_slots < remainder)
        {
          /* So many files remain that they can't all be put into the last
             NMERGE-sized output window.  Do one more merge.  Merge as few
             files as possible, to avoid needless I/O.  */
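          /* Merging NSHORTMERGE inputs into one temporary leaves
             CHEAP_SLOTS - 1 of them, which (together with the OUT outputs
             already produced) exactly fill the last NMERGE-sized window.  */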
          size_t nshortmerge = remainder - cheap_slots + 1;
          FILE *tfp;
          struct tempnode *temp = create_temp (&tfp);
          size_t num_merged = mergefiles (&files[in], MIN (ntemps, nshortmerge),
                                          nshortmerge, tfp, temp->name);
          ntemps -= MIN (ntemps, num_merged);
          files[out].name = temp->name;
          files[out++].temp = temp;
          in += num_merged;
        }

      /* Put the remaining input files into the last NMERGE-sized output
         window, so they will be merged in the next pass.  */
      memmove (&files[out], &files[in], (nfiles - in) * sizeof *files);
      ntemps += out;
      nfiles -= in - out;
    }

  avoid_trashing_input (files, ntemps, nfiles, output_file);

  /* We aren't guaranteed that this final mergefiles will work, therefore we
     try to merge into the output, and then merge as much as we can into a
     temp file if we can't.  Repeat.  */

  while (true)
    {
      /* Merge directly into the output file if possible.  */
      FILE **fps;
      size_t nopened = open_input_files (files, nfiles, &fps);

      if (nopened == nfiles)
        {
          FILE *ofp = stream_open (output_file, "w");
          if (ofp)
            {
              mergefps (files, ntemps, nfiles, ofp, output_file, fps);
              break;
            }
          if (errno != EMFILE || nopened <= 2)
            sort_die (_("open failed"), output_file);
        }
      else if (nopened <= 2)
        sort_die (_("open failed"), files[nopened].name);

      /* We ran out of file descriptors.  Close one of the input
         files, to gain a file descriptor.  Then create a temporary
         file with our spare file descriptor.  Retry if that failed
         (e.g., some other process could open a file between the time
         we closed and tried to create).  */
      FILE *tfp;
      struct tempnode *temp;
      do
        {
          nopened--;
          xfclose (fps[nopened], files[nopened].name);
          temp = maybe_create_temp (&tfp, ! (nopened <= 2));
        }
      while (!temp);

      /* Merge into the newly allocated temporary.  */
      mergefps (&files[0], MIN (ntemps, nopened), nopened, tfp, temp->name,
                fps);
      ntemps -= MIN (ntemps, nopened);
      files[0].name = temp->name;
      files[0].temp = temp;

      memmove (&files[1], &files[nopened], (nfiles - nopened) * sizeof *files);
      ntemps++;
      nfiles -= nopened - 1;
    }
}

/* Sort NFILES FILES onto OUTPUT_FILE.  Use at most NTHREADS threads.  */

static void
sort (char *const *files, size_t nfiles, char const *output_file,
      size_t nthreads)
{
  struct buffer buf;
  size_t ntemps = 0;
  bool output_file_created = false;

  buf.alloc = 0;

  while (nfiles)
    {
      char const *temp_output;
      char const *file = *files;
      FILE *fp = xfopen (file, "r");
      FILE *tfp;

      size_t bytes_per_line;
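      /* With P > 1 threads the loop below sets BYTES_PER_LINE to
         (1 + ceil (log2 (P))) * sizeof (struct line); single-threaded
         sorting budgets 1.5 * sizeof (struct line) instead.  This is the
         per-line overhead assumed when sizing the input buffer.  */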
      if (nthreads > 1)
        {
          /* Get log P.  */
          size_t tmp = 1;
          size_t mult = 1;
          while (tmp < nthreads)
            {
              tmp *= 2;
              mult++;
            }
          bytes_per_line = (mult * sizeof (struct line));
        }
      else
        bytes_per_line = sizeof (struct line) * 3 / 2;

      if (! buf.alloc)
        initbuf (&buf, bytes_per_line,
                 sort_buffer_size (&fp, 1, files, nfiles, bytes_per_line));
      buf.eof = false;
      files++;
      nfiles--;

      while (fillbuf (&buf, fp, file))
        {
          struct line *line;

          if (buf.eof && nfiles
              && (bytes_per_line + 1
                  < (buf.alloc - buf.used - bytes_per_line * buf.nlines)))
            {
              /* End of file, but there is more input and buffer room.
                 Concatenate the next input file; this is faster in
                 the usual case.  */
              buf.left = buf.used;
              break;
            }

          saved_line.text = nullptr;
          line = buffer_linelim (&buf);
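          /* If this buffer load holds the complete remaining input and no
             temporary files exist, the sorted result can be written
             directly to the output file rather than to a temporary.  */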
          if (buf.eof && !nfiles && !ntemps && !buf.left)
            {
              xfclose (fp, file);
              tfp = xfopen (output_file, "w");
              temp_output = output_file;
              output_file_created = true;
            }
          else
            {
              ++ntemps;
              temp_output = create_temp (&tfp)->name;
            }
          if (1 < buf.nlines)
            {
              struct merge_node_queue queue;
              queue_init (&queue, nthreads);
              struct merge_node *merge_tree =
                merge_tree_init (nthreads, buf.nlines, line);

              sortlines (line, nthreads, buf.nlines, merge_tree + 1,
                         &queue, tfp, temp_output);

              merge_tree_destroy (nthreads, merge_tree);
              queue_destroy (&queue);
            }
          else
            write_unique (line - 1, tfp, temp_output);

          xfclose (tfp, temp_output);

          if (output_file_created)
            goto finish;
        }
      xfclose (fp, file);
    }

 finish:
  free (buf.buf);

  if (! output_file_created)
    {
      struct tempnode *node = temphead;
      struct sortfile *tempfiles = xnmalloc (ntemps, sizeof *tempfiles);
      for (size_t i = 0; node; i++)
        {
          tempfiles[i].name = node->name;
          tempfiles[i].temp = node;
          node = node->next;
        }
      merge (tempfiles, ntemps, ntemps, output_file);
      free (tempfiles);
    }

  reap_all ();
}

/* Insert a malloc'd copy of key KEY_ARG at the end of the key list.  */

static void
insertkey (struct keyfield *key_arg)
{
  struct keyfield **p;
  struct keyfield *key = xmemdup (key_arg, sizeof *key);

  for (p = &keylist; *p; p = &(*p)->next)
    continue;
  *p = key;
  key->next = nullptr;
}

/* Report a bad field specification SPEC, with extra info MSGID.  */

static void
badfieldspec (char const *spec, char const *msgid)
{
  error (SORT_FAILURE, 0, _("%s: invalid field specification %s"),
         _(msgid), quote (spec));
}

/* Report incompatible options.  */

static void
incompatible_options (char const *opts)
{
  error (SORT_FAILURE, 0, _("options '-%s' are incompatible"), (opts));
}

/* Check compatibility of ordering options.  */

static void
check_ordering_compatibility (void)
{
  struct keyfield *key;

  for (key = keylist; key; key = key->next)
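    /* -V, -R and -d/-i are grouped as a single ordering below, so
       combinations among those three are not flagged as incompatible;
       each of them does conflict with -n, -g, -h and -M.  */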
    if (1 < (key->numeric + key->general_numeric + key->human_numeric
             + key->month + (key->version | key->random | !!key->ignore)))
      {
        /* The following is too big, but guaranteed to be "big enough".  */
        char opts[sizeof short_options];
        /* Clear flags we're not interested in.  */
        key->skipsblanks = key->skipeblanks = key->reverse = false;
        key_to_opts (key, opts);
        incompatible_options (opts);
      }
}

/* Parse the leading integer in STRING and store the resulting value
   (which must fit into size_t) into *VAL.  Return the address of the
   suffix after the integer.  If the value is too large, silently
   substitute SIZE_MAX.  If MSGID is null, return nullptr after
   failure; otherwise, report MSGID and exit on failure.  */

static char const *
parse_field_count (char const *string, size_t *val, char const *msgid)
{
  char *suffix;
  uintmax_t n;

  switch (xstrtoumax (string, &suffix, 10, &n, ""))
    {
    case LONGINT_OK:
    case LONGINT_INVALID_SUFFIX_CHAR:
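      /* If N does not fit in size_t the assignment truncates it, the
         equality test below fails, and control falls through to the
         overflow cases.  */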
      *val = n;
      if (*val == n)
        break;
      FALLTHROUGH;
    case LONGINT_OVERFLOW:
    case LONGINT_OVERFLOW | LONGINT_INVALID_SUFFIX_CHAR:
      *val = SIZE_MAX;
      break;

    case LONGINT_INVALID:
      if (msgid)
        error (SORT_FAILURE, 0, _("%s: invalid count at start of %s"),
               _(msgid), quote (string));
      return nullptr;
    }

  return suffix;
}

/* Set the ordering options for KEY specified in S.
   Return the address of the first character in S that
   is not a valid ordering option.
   BLANKTYPE is the kind of blanks that 'b' should skip.  */

static char *
set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype)
{
  while (*s)
    {
      switch (*s)
        {
        case 'b':
          if (blanktype == bl_start || blanktype == bl_both)
            key->skipsblanks = true;
          if (blanktype == bl_end || blanktype == bl_both)
            key->skipeblanks = true;
          break;
        case 'd':
          key->ignore = nondictionary;
          break;
        case 'f':
          key->translate = fold_toupper;
          break;
        case 'g':
          key->general_numeric = true;
          break;
        case 'h':
          key->human_numeric = true;
          break;
        case 'i':
          /* Option order should not matter, so don't let -i override
             -d.  -d implies -i, but -i does not imply -d.  */
          if (! key->ignore)
            key->ignore = nonprinting;
          break;
        case 'M':
          key->month = true;
          break;
        case 'n':
          key->numeric = true;
          break;
        case 'R':
          key->random = true;
          break;
        case 'r':
          key->reverse = true;
          break;
        case 'V':
          key->version = true;
          break;
        default:
          return (char *) s;
        }
      ++s;
    }
  return (char *) s;
}

/* Initialize KEY.  */

static struct keyfield *
key_init (struct keyfield *key)
{
  memset (key, 0, sizeof *key);
  key->eword = SIZE_MAX;
  return key;
}

int
main (int argc, char **argv)
{
  struct keyfield *key;
  struct keyfield key_buf;
  struct keyfield gkey;
  bool gkey_only = false;
  char const *s;
  int c = 0;
  char checkonly = 0;
  bool mergeonly = false;
  char *random_source = nullptr;
  bool need_random = false;
  size_t nthreads = 0;
  size_t nfiles = 0;
  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
  int posix_ver = posix2_version ();
  bool traditional_usage = ! (200112 <= posix_ver && posix_ver < 200809);
  char **files;
  char *files_from = nullptr;
  struct Tokens tok;
  char const *outfile = nullptr;
  bool locale_ok;

  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  locale_ok = !! setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  initialize_exit_failure (SORT_FAILURE);

  hard_LC_COLLATE = hard_locale (LC_COLLATE);
#if HAVE_NL_LANGINFO
  hard_LC_TIME = hard_locale (LC_TIME);
#endif

  /* Get locale's representation of the decimal point.  */
  {
    struct lconv const *locale = localeconv ();

    /* If the locale doesn't define a decimal point, or if the decimal
       point is multibyte, use the C locale's decimal point.  FIXME:
       add support for multibyte decimal points.  */
    decimal_point = locale->decimal_point[0];
    if (! decimal_point || locale->decimal_point[1])
      decimal_point = '.';

    /* FIXME: add support for multibyte thousands separators.  */
    thousands_sep = locale->thousands_sep[0];
    if (thousands_sep && locale->thousands_sep[1])
      thousands_sep_ignored = true;
    if (! thousands_sep || locale->thousands_sep[1])
      thousands_sep = NON_CHAR;
  }

  have_read_stdin = false;
  inittables ();

  {
    size_t i;
    static int const sig[] =
      {
        /* The usual suspects.  */
        SIGALRM, SIGHUP, SIGINT, SIGQUIT, SIGTERM,
#ifdef SIGPOLL
        SIGPOLL,
#endif
#ifdef SIGPROF
        SIGPROF,
#endif
#ifdef SIGVTALRM
        SIGVTALRM,
#endif
#ifdef SIGXCPU
        SIGXCPU,
#endif
#ifdef SIGXFSZ
        SIGXFSZ,
#endif
      };
    enum { nsigs = countof (sig) };

#if SA_NOCLDSTOP
    struct sigaction act;

    sigemptyset (&caught_signals);
    for (i = 0; i < nsigs; i++)
      {
        sigaction (sig[i], nullptr, &act);
        if (act.sa_handler != SIG_IGN)
          sigaddset (&caught_signals, sig[i]);
      }

    act.sa_handler = sighandler;
    act.sa_mask = caught_signals;
    act.sa_flags = 0;

    for (i = 0; i < nsigs; i++)
      if (sigismember (&caught_signals, sig[i]))
        sigaction (sig[i], &act, nullptr);
#else
    for (i = 0; i < nsigs; i++)
      if (signal (sig[i], SIG_IGN) != SIG_IGN)
        {
          signal (sig[i], sighandler);
          siginterrupt (sig[i], 1);
        }
#endif
  }
  signal (SIGCHLD, SIG_DFL); /* Don't inherit CHLD handling from parent.  */

  /* Ignore SIGPIPE so write failures are reported via EPIPE errno.
     For stdout, sort_die() will reraise SIGPIPE if it was originally SIG_DFL.
     For compression pipes, sort_die() will exit with SORT_FAILURE.  */
  default_SIGPIPE = (signal (SIGPIPE, SIG_IGN) == SIG_DFL);

  /* The signal mask is known, so it is safe to invoke exit_cleanup.  */
  atexit (exit_cleanup);

  key_init (&gkey);
  gkey.sword = SIZE_MAX;

  files = xnmalloc (argc, sizeof *files);

  while (true)
    {
      /* Parse an operand as a file after "--" was seen; or if
         pedantic and a file was seen, unless the POSIX version
         is not 1003.1-2001 and -c was not seen and the operand is
         "-o FILE" or "-oFILE".  */
      int oi = -1;

      if (c == -1
          || (posixly_correct && nfiles != 0
              && ! (traditional_usage
                    && ! checkonly
                    && optind != argc
                    && argv[optind][0] == '-' && argv[optind][1] == 'o'
                    && (argv[optind][2] || optind + 1 != argc)))
          || ((c = getopt_long (argc, argv, short_options,
                                long_options, &oi))
              == -1))
        {
          if (argc <= optind)
            break;
          files[nfiles++] = argv[optind++];
        }
      else switch (c)
        {
        case 1:
          key = nullptr;
          if (optarg[0] == '+')
            {
              bool minus_pos_usage = (optind != argc && argv[optind][0] == '-'
                                      && c_isdigit (argv[optind][1]));
              traditional_usage |= minus_pos_usage && !posixly_correct;
              if (traditional_usage)
                {
                  /* Treat +POS1 [-POS2] as a key if possible; but silently
                     treat an operand as a file if it is not a valid +POS1.  */
                  key = key_init (&key_buf);
                  s = parse_field_count (optarg + 1, &key->sword, nullptr);
                  if (s && *s == '.')
                    s = parse_field_count (s + 1, &key->schar, nullptr);
                  if (! (key->sword || key->schar))
                    key->sword = SIZE_MAX;
                  if (! s || *set_ordering (s, key, bl_start))
                    key = nullptr;
                  else
                    {
                      if (minus_pos_usage)
                        {
                          char const *optarg1 = argv[optind++];
                          s = parse_field_count (optarg1 + 1, &key->eword,
                                              N_("invalid number after '-'"));
                          if (*s == '.')
                            s = parse_field_count (s + 1, &key->echar,
                                              N_("invalid number after '.'"));
                          if (!key->echar && key->eword)
                            {
                              /* obsolescent syntax +A.x -B.y is equivalent to:
                                 -k A+1.x+1,B.y   (when y = 0)
                                 -k A+1.x+1,B+1.y (when y > 0)
                                 So eword is decremented as in the -k case
                                 only when the end field (B) is specified and
                                 echar (y) is 0.  */
                              key->eword--;
                            }
                          if (*set_ordering (s, key, bl_end))
                            badfieldspec (optarg1,
                                       N_("stray character in field spec"));
                        }
                      key->traditional_used = true;
                      insertkey (key);
                    }
                }
            }
          if (! key)
            files[nfiles++] = optarg;
          break;

        case SORT_OPTION:
          c = XARGMATCH ("--sort", optarg, sort_args, sort_types);
          FALLTHROUGH;
        case 'b':
        case 'd':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'M':
        case 'n':
        case 'r':
        case 'R':
        case 'V':
          {
            char str[2];
            str[0] = c;
            str[1] = '\0';
            set_ordering (str, &gkey, bl_both);
          }
          break;

        case CHECK_OPTION:
          c = (optarg
               ? XARGMATCH ("--check", optarg, check_args, check_types)
               : 'c');
          FALLTHROUGH;
        case 'c':
        case 'C':
          if (checkonly && checkonly != c)
            incompatible_options ("cC");
          checkonly = c;
          break;

        case COMPRESS_PROGRAM_OPTION:
          if (compress_program && !streq (compress_program, optarg))
            error (SORT_FAILURE, 0, _("multiple compress programs specified"));
          compress_program = optarg;
          break;

        case DEBUG_PROGRAM_OPTION:
          debug = true;
          break;

        case FILES0_FROM_OPTION:
          files_from = optarg;
          break;

        case 'k':
          key = key_init (&key_buf);

          /* Get POS1.  */
          s = parse_field_count (optarg, &key->sword,
                                 N_("invalid number at field start"));
          if (! key->sword--)
            {
              /* Provoke with 'sort -k0' */
              badfieldspec (optarg, N_("field number is zero"));
            }
          if (*s == '.')
            {
              s = parse_field_count (s + 1, &key->schar,
                                     N_("invalid number after '.'"));
              if (! key->schar--)
                {
                  /* Provoke with 'sort -k1.0' */
                  badfieldspec (optarg, N_("character offset is zero"));
                }
            }
          if (! (key->sword || key->schar))
            key->sword = SIZE_MAX;
          s = set_ordering (s, key, bl_start);
          if (*s != ',')
            {
              key->eword = SIZE_MAX;
              key->echar = 0;
            }
          else
            {
              /* Get POS2.  */
              s = parse_field_count (s + 1, &key->eword,
                                     N_("invalid number after ','"));
              if (! key->eword--)
                {
                  /* Provoke with 'sort -k1,0' */
                  badfieldspec (optarg, N_("field number is zero"));
                }
              if (*s == '.')
                {
                  s = parse_field_count (s + 1, &key->echar,
                                         N_("invalid number after '.'"));
                }
              s = set_ordering (s, key, bl_end);
            }
          if (*s)
            badfieldspec (optarg, N_("stray character in field spec"));
          insertkey (key);
          break;

        case 'm':
          mergeonly = true;
          break;

        case NMERGE_OPTION:
          specify_nmerge (oi, c, optarg);
          break;

        case 'o':
          if (outfile && !streq (outfile, optarg))
            error (SORT_FAILURE, 0, _("multiple output files specified"));
          outfile = optarg;
          break;

        case RANDOM_SOURCE_OPTION:
          if (random_source && !streq (random_source, optarg))
            error (SORT_FAILURE, 0, _("multiple random sources specified"));
          random_source = optarg;
          break;

        case 's':
          stable = true;
          break;

        case 'S':
          specify_sort_size (oi, c, optarg);
          break;

        case 't':
          {
            char newtab = optarg[0];
            if (! newtab)
              error (SORT_FAILURE, 0, _("empty tab"));
            if (optarg[1])
              {
                if (streq (optarg, "\\0"))
                  newtab = '\0';
                else
                  {
                    /* Provoke with 'sort -txx'.  Complain about
                       "multi-character tab" instead of "multibyte tab", so
                       that the diagnostic's wording does not need to be
                       changed once multibyte characters are supported.  */
                    error (SORT_FAILURE, 0, _("multi-character tab %s"),
                           quote (optarg));
                  }
              }
            if (tab != TAB_DEFAULT && tab != newtab)
              error (SORT_FAILURE, 0, _("incompatible tabs"));
            tab = newtab;
          }
          break;

        case 'T':
          add_temp_dir (optarg);
          break;

        case PARALLEL_OPTION:
          nthreads = specify_nthreads (oi, c, optarg);
          break;

        case 'u':
          unique = true;
          break;

        case 'y':
          /* Accept and ignore e.g. -y0 for compatibility with Solaris 2.x
             through Solaris 7.  It is also accepted by many non-Solaris
             "sort" implementations, e.g., AIX 5.2, HP-UX 11i v2, IRIX 6.5.
             -y is marked as obsolete starting with Solaris 8 (1999), but is
             still accepted as of Solaris 10 prerelease (2004).

             Solaris 2.5.1 "sort -y 100" reads the input file "100", but
             emulate Solaris 8 and 9 "sort -y 100" which ignores the "100",
             and which in general ignores the argument after "-y" if it
             consists entirely of digits (it can even be empty).  */
          if (optarg == argv[optind - 1])
            {
              char const *p;
              for (p = optarg; c_isdigit (*p); p++)
                continue;
              optind -= (*p != '\0');
            }
          break;

        case 'z':
          eolchar = 0;
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (SORT_FAILURE);
        }
    }

  if (files_from)
    {
      /* When using --files0-from=F, you may not specify any files
         on the command-line.  */
      if (nfiles)
        {
          error (0, 0, _("extra operand %s"), quoteaf (files[0]));
          fprintf (stderr, "%s\n",
                   _("file operands cannot be combined with --files0-from"));
          usage (SORT_FAILURE);
        }

      FILE *stream = xfopen (files_from, "r");

      readtokens0_init (&tok);

      if (! readtokens0 (stream, &tok))
        error (SORT_FAILURE, 0, _("cannot read file names from %s"),
               quoteaf (files_from));
      xfclose (stream, files_from);

      if (tok.n_tok)
        {
          free (files);
          files = tok.tok;
          nfiles = tok.n_tok;
          for (size_t i = 0; i < nfiles; i++)
            {
              if (streq (files[i], "-"))
                error (SORT_FAILURE, 0, _("when reading file names from "
                                          "standard input, "
                                          "no file name of %s allowed"),
                       quoteaf (files[i]));
              else if (files[i][0] == '\0')
                {
                  /* Using the standard 'filename:line-number:' prefix here is
                     not totally appropriate, since NUL is the separator,
                     not NL, but it might be better than nothing.  */
                  unsigned long int file_number = i + 1;
                  error (SORT_FAILURE, 0,
                         _("%s:%lu: invalid zero-length file name"),
                         quotef (files_from), file_number);
                }
            }
        }
      else
        error (SORT_FAILURE, 0, _("no input from %s"),
               quoteaf (files_from));
    }

  /* Inheritance of global options to individual keys.  */
  for (key = keylist; key; key = key->next)
    {
      if (default_key_compare (key) && !key->reverse)
        {
          key->ignore = gkey.ignore;
          key->translate = gkey.translate;
          key->skipsblanks = gkey.skipsblanks;
          key->skipeblanks = gkey.skipeblanks;
          key->month = gkey.month;
          key->numeric = gkey.numeric;
          key->general_numeric = gkey.general_numeric;
          key->human_numeric = gkey.human_numeric;
          key->version = gkey.version;
          key->random = gkey.random;
          key->reverse = gkey.reverse;
        }

      need_random |= key->random;
    }

  if (!keylist && !default_key_compare (&gkey))
    {
      gkey_only = true;
      insertkey (&gkey);
      need_random |= gkey.random;
    }

  check_ordering_compatibility ();

  if (debug)
    {
      if (checkonly || outfile)
        {
          static char opts[] = "X --debug";
          opts[0] = (checkonly ? checkonly : 'o');
          incompatible_options (opts);
        }

      /* Always output the locale in debug mode, since this
         is such a common source of confusion.  */

      /* OpenBSD can only set some categories with LC_ALL above,
         so set LC_COLLATE explicitly to flag errors.  */
      if (locale_ok)
        locale_ok = !! setlocale (LC_COLLATE, "");
      if (! locale_ok)
        error (0, 0, "%s", _("failed to set locale"));
      if (hard_LC_COLLATE)
        error (0, 0, _("text ordering performed using %s sorting rules"),
               quote (setlocale (LC_COLLATE, nullptr)));
      else
        error (0, 0, "%s",
               _("text ordering performed using simple byte comparison"));

      key_warnings (&gkey, gkey_only);
    }

  reverse = gkey.reverse;

  if (need_random)
    random_md5_state_init (random_source);

  if (temp_dir_count == 0)
    {
      char const *tmp_dir = getenv ("TMPDIR");
      add_temp_dir (tmp_dir ? tmp_dir : DEFAULT_TMPDIR);
    }

  if (nfiles == 0)
    {
      nfiles = 1;
      free (files);
      files = xmalloc (sizeof *files);
      *files = (char *) "-";
    }

  /* Need to re-check that we meet the minimum requirement for memory
     usage with the final value for NMERGE.  */
  if (0 < sort_size)
    sort_size = MAX (sort_size, MIN_SORT_SIZE);

  if (checkonly)
    {
      if (nfiles > 1)
        error (SORT_FAILURE, 0, _("extra operand %s not allowed with -%c"),
               quoteaf (files[1]), checkonly);

      if (outfile)
        {
          static char opts[] = {0, 'o', 0};
          opts[0] = checkonly;
          incompatible_options (opts);
        }

      /* POSIX requires that sort return 1 IFF invoked with -c or -C and the
         input is not properly sorted.  */
      exit (check (files[0], checkonly) ? EXIT_SUCCESS : SORT_OUT_OF_ORDER);
    }

  /* Check all inputs are accessible, or exit immediately.  */
  check_inputs (files, nfiles);

  /* Check output is writable, or exit immediately.  */
  check_output (outfile);

  if (mergeonly)
    {
      struct sortfile *sortfiles = xcalloc (nfiles, sizeof *sortfiles);

      for (size_t i = 0; i < nfiles; ++i)
        sortfiles[i].name = files[i];

      merge (sortfiles, 0, nfiles, outfile);
    }
  else
    {
      if (!nthreads)
        {
          unsigned long int np = num_processors (NPROC_CURRENT_OVERRIDABLE);
          nthreads = MIN (np, DEFAULT_MAX_THREADS);
        }

      /* Avoid integer overflow later.  */
      size_t nthreads_max = SIZE_MAX / (2 * sizeof (struct merge_node));
      nthreads = MIN (nthreads, nthreads_max);

      sort (files, nfiles, outfile, nthreads);
    }

  if (have_read_stdin && fclose (stdin) == EOF)
    sort_die (_("close failed"), "-");

  main_exit (EXIT_SUCCESS);
}