mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-19 10:15:48 +02:00
join: new options: --check-order and --nocheck-order.
* src/join.c: Support --check-order and --nocheck-order. New variables check_input_order, seen_unpairable and issued_disorder_warning[]. For --check-order, verify that the input files are in sorted order. For the default case, check the order only if there are unpairable lines. (join): Perform ordering checks after reaching EOF on either input. (usage): Mention --check-order and --nocheck-order. (dupline): Save a copy of the previously-read input line so that we can detect disorder on the input. (get_line): Temporarily save a copy of the previous line (by calling dupline) and check relative ordering (by calling checkorder) before returning the newly-read line. (getseq, join): Tell get_line which file we are reading from. (advance_seq): New function, factoring out some of the code commonly surrounding calls to getseq. (checkorder): New function. Verifies that a pair of consecutive input lines are in sorted order. * doc/coreutils.texi (join invocation): Document the new options --check-order and --nocheck-order. * tests/join/Test.pm (tv): Added tests for --check-order and --nocheck-order. * NEWS: Mention this new feature.
This commit is contained in:
committed by
Jim Meyering
parent
4242d4f5c4
commit
a1e715698a
5
NEWS
5
NEWS
@@ -16,6 +16,11 @@ GNU coreutils NEWS -*- outline -*-
|
||||
rather than reporting the invalid string format.
|
||||
[bug introduced in coreutils-6.0]
|
||||
|
||||
** New features
|
||||
|
||||
join now verifies that the inputs are in sorted order. This check can
|
||||
be turned off with the --nocheck-order option.
|
||||
|
||||
** Improvements
|
||||
|
||||
ls --color no longer outputs unnecessary escape sequences
|
||||
|
||||
@@ -5149,10 +5149,10 @@ sort a file on its default join field, but if you select a non-default
|
||||
locale, join field, separator, or comparison options, then you should
|
||||
do so consistently between @command{join} and @command{sort}.
|
||||
|
||||
As a @acronym{GNU} extension, if the input has no unpairable lines the
|
||||
sort order can be any order that considers two fields to be equal if and
|
||||
only if the sort comparison described above considers them to be equal.
|
||||
For example:
|
||||
If the input has no unpairable lines, a @acronym{GNU} extension is
|
||||
available; the sort order can be any order that considers two fields
|
||||
to be equal if and only if the sort comparison described above
|
||||
considers them to be equal. For example:
|
||||
|
||||
@example
|
||||
$ cat file1
|
||||
@@ -5169,6 +5169,19 @@ c c1 c2
|
||||
b b1 b2
|
||||
@end example
|
||||
|
||||
If the @option{--check-order} option is given, unsorted inputs will
|
||||
cause a fatal error message. If the option @option{--nocheck-order}
|
||||
is given, unsorted inputs will never cause an error message. If
|
||||
neither of these options is given, wrongly sorted inputs are diagnosed
|
||||
only if an input file is found to contain unpairable lines. If an
|
||||
input file is diagnosed as being unsorted, the @command{join} command
|
||||
will exit with a nonzero status (and the output should not be used).
|
||||
|
||||
Forcing @command{join} to process wrongly sorted input files
|
||||
containing unpairable lines by specifying @option{--nocheck-order} is
|
||||
not guaranteed to produce any particular output. The output will
|
||||
probably not correspond with whatever you hoped it would be.
|
||||
|
||||
The defaults are:
|
||||
@itemize
|
||||
@item the join field is the first field in each line;
|
||||
@@ -5188,6 +5201,12 @@ The program accepts the following options. Also see @ref{Common options}.
|
||||
Print a line for each unpairable line in file @var{file-number} (either
|
||||
@samp{1} or @samp{2}), in addition to the normal output.
|
||||
|
||||
@item --check-order
|
||||
Fail with an error message if either input file is wrongly ordered.
|
||||
|
||||
@item --nocheck-order
|
||||
Do not check that both input files are in sorted order. This is the default.
|
||||
|
||||
@item -e @var{string}
|
||||
@opindex -e
|
||||
Replace those output fields that are missing in the input with
|
||||
|
||||
216
src/join.c
216
src/join.c
@@ -1,5 +1,5 @@
|
||||
/* join - join lines of two files on a common field
|
||||
Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
|
||||
Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2;
|
||||
/* If nonzero, print pairable lines. */
|
||||
static bool print_pairables;
|
||||
|
||||
/* If nonzero, we have seen at least one unpairable line. */
|
||||
static bool seen_unpairable;
|
||||
|
||||
/* If nonzero, we have warned about disorder in that file. */
|
||||
static bool issued_disorder_warning[2];
|
||||
|
||||
/* Empty output field filler. */
|
||||
static char const *empty_filler;
|
||||
|
||||
@@ -108,9 +114,26 @@ static struct outlist *outlist_end = &outlist_head;
|
||||
tab character whose value (when cast to unsigned char) equals TAB. */
|
||||
static int tab = -1;
|
||||
|
||||
/* If nonzero, check that the input is correctly ordered. */
|
||||
static enum
|
||||
{
|
||||
CHECK_ORDER_DEFAULT,
|
||||
CHECK_ORDER_ENABLED,
|
||||
CHECK_ORDER_DISABLED
|
||||
} check_input_order;
|
||||
|
||||
enum
|
||||
{
|
||||
CHECK_ORDER_OPTION = CHAR_MAX + 1,
|
||||
NOCHECK_ORDER_OPTION
|
||||
};
|
||||
|
||||
|
||||
static struct option const longopts[] =
|
||||
{
|
||||
{"ignore-case", no_argument, NULL, 'i'},
|
||||
{"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
|
||||
{"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
|
||||
{GETOPT_HELP_OPTION_DECL},
|
||||
{GETOPT_VERSION_OPTION_DECL},
|
||||
{NULL, 0, NULL, 0}
|
||||
@@ -122,6 +145,9 @@ static struct line uni_blank;
|
||||
/* If nonzero, ignore case when comparing join fields. */
|
||||
static bool ignore_case;
|
||||
|
||||
|
||||
static void checkorder (const struct line *, const struct line *, int);
|
||||
|
||||
void
|
||||
usage (int status)
|
||||
{
|
||||
@@ -153,6 +179,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
|
||||
-v FILENUM like -a FILENUM, but suppress joined output lines\n\
|
||||
-1 FIELD join on this FIELD of file 1\n\
|
||||
-2 FIELD join on this FIELD of file 2\n\
|
||||
--check-order check that the input is correctly sorted, even\n\
|
||||
if all input lines are pairable\n\
|
||||
--nocheck-order do not check that the input is correctly sorted\n\
|
||||
"), stdout);
|
||||
fputs (HELP_OPTION_DESCRIPTION, stdout);
|
||||
fputs (VERSION_OPTION_DESCRIPTION, stdout);
|
||||
@@ -167,6 +196,8 @@ separated by CHAR.\n\
|
||||
\n\
|
||||
Important: FILE1 and FILE2 must be sorted on the join fields.\n\
|
||||
E.g., use `sort -k 1b,1' if `join' has no options.\n\
|
||||
If the input is not sorted and some lines cannot be joined, a\n\
|
||||
warning message will be given.\n\
|
||||
"), stdout);
|
||||
emit_bug_reporting_address ();
|
||||
}
|
||||
@@ -228,12 +259,49 @@ xfields (struct line *line)
|
||||
extract_field (line, ptr, lim - ptr);
|
||||
}
|
||||
|
||||
static struct line *
|
||||
dup_line (const struct line *old)
|
||||
{
|
||||
struct line *newline = xmalloc (sizeof *newline);
|
||||
size_t i;
|
||||
|
||||
/* Duplicate the buffer. */
|
||||
initbuffer (&newline->buf);
|
||||
newline->buf.buffer = xmalloc (old->buf.size);
|
||||
newline->buf.size = old->buf.size;
|
||||
memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length);
|
||||
newline->buf.length = old->buf.length;
|
||||
|
||||
/* Duplicate the field positions. */
|
||||
newline->fields = xnmalloc (old->nfields_allocated, sizeof *newline->fields);
|
||||
newline->nfields = old->nfields;
|
||||
newline->nfields_allocated = old->nfields_allocated;
|
||||
|
||||
for (i = 0; i < old->nfields; i++)
|
||||
{
|
||||
newline->fields[i].len = old->fields[i].len;
|
||||
newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg
|
||||
- old->buf.buffer);
|
||||
}
|
||||
return newline;
|
||||
}
|
||||
|
||||
static void
|
||||
freeline (struct line *line)
|
||||
{
|
||||
free (line->fields);
|
||||
free (line->buf.buffer);
|
||||
line->buf.buffer = NULL;
|
||||
}
|
||||
|
||||
/* Read a line from FP into LINE and split it into fields.
|
||||
Return true if successful. */
|
||||
|
||||
static bool
|
||||
get_line (FILE *fp, struct line *line)
|
||||
get_line (FILE *fp, struct line *line, int which)
|
||||
{
|
||||
static struct line *prevline[2];
|
||||
|
||||
initbuffer (&line->buf);
|
||||
|
||||
if (! readlinebuffer (&line->buf, fp))
|
||||
@@ -249,15 +317,14 @@ get_line (FILE *fp, struct line *line)
|
||||
line->nfields = 0;
|
||||
line->fields = NULL;
|
||||
xfields (line);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
freeline (struct line *line)
|
||||
{
|
||||
free (line->fields);
|
||||
free (line->buf.buffer);
|
||||
line->buf.buffer = NULL;
|
||||
if (prevline[which - 1])
|
||||
{
|
||||
checkorder (prevline[which - 1], line, which);
|
||||
freeline (prevline[which - 1]);
|
||||
}
|
||||
prevline[which - 1] = dup_line (line);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -271,12 +338,12 @@ initseq (struct seq *seq)
|
||||
/* Read a line from FP and add it to SEQ. Return true if successful. */
|
||||
|
||||
static bool
|
||||
getseq (FILE *fp, struct seq *seq)
|
||||
getseq (FILE *fp, struct seq *seq, int whichfile)
|
||||
{
|
||||
if (seq->count == seq->alloc)
|
||||
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
|
||||
|
||||
if (get_line (fp, &seq->lines[seq->count]))
|
||||
if (get_line (fp, &seq->lines[seq->count], whichfile))
|
||||
{
|
||||
++seq->count;
|
||||
return true;
|
||||
@@ -284,6 +351,20 @@ getseq (FILE *fp, struct seq *seq)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Read a line from FP and add it to SEQ, as the first item if FIRST is
|
||||
* true, else as the next.
|
||||
*/
|
||||
static bool
|
||||
advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
freeline (&seq->lines[0]);
|
||||
seq->count = 0;
|
||||
}
|
||||
return getseq (fp, seq, whichfile);
|
||||
}
|
||||
|
||||
static void
|
||||
delseq (struct seq *seq)
|
||||
{
|
||||
@@ -354,6 +435,44 @@ keycmp (struct line const *line1, struct line const *line2)
|
||||
return len1 < len2 ? -1 : len1 != len2;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Check that successive input lines PREV and CURRENT from input file
|
||||
* WHATFILE are presented in order, unless the user may be relying on
|
||||
* the GNU extension that input lines may be out of order if no input
|
||||
* lines are unpairable.
|
||||
*
|
||||
* If the user specified --nocheck-order, the check is not made.
|
||||
* If the user specified --check-order, the problem is fatal.
|
||||
* Otherwise (the default), the message is simply a warning.
|
||||
*
|
||||
* A message is printed at most once per input file.
|
||||
*/
|
||||
static void
|
||||
checkorder (const struct line *prev,
|
||||
const struct line *current,
|
||||
int whatfile)
|
||||
{
|
||||
if (check_input_order != CHECK_ORDER_DISABLED
|
||||
&& ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
|
||||
{
|
||||
if (!issued_disorder_warning[whatfile-1])
|
||||
{
|
||||
if (keycmp (prev, current) > 0)
|
||||
{
|
||||
error ((check_input_order == CHECK_ORDER_ENABLED ? 1 : 0),
|
||||
0, _("File %d is not in sorted order"), whatfile);
|
||||
|
||||
/* If we get to here, the message was just a warning, but we
|
||||
want only to issue it once. */
|
||||
issued_disorder_warning[whatfile-1] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* Print field N of LINE if it exists and is nonempty, otherwise
|
||||
`empty_filler' if it is nonempty. */
|
||||
|
||||
@@ -464,13 +583,13 @@ join (FILE *fp1, FILE *fp2)
|
||||
struct seq seq1, seq2;
|
||||
struct line line;
|
||||
int diff;
|
||||
bool eof1, eof2;
|
||||
bool eof1, eof2, checktail;
|
||||
|
||||
/* Read the first line of each file. */
|
||||
initseq (&seq1);
|
||||
getseq (fp1, &seq1);
|
||||
getseq (fp1, &seq1, 1);
|
||||
initseq (&seq2);
|
||||
getseq (fp2, &seq2);
|
||||
getseq (fp2, &seq2, 2);
|
||||
|
||||
while (seq1.count && seq2.count)
|
||||
{
|
||||
@@ -480,18 +599,16 @@ join (FILE *fp1, FILE *fp2)
|
||||
{
|
||||
if (print_unpairables_1)
|
||||
prjoin (&seq1.lines[0], &uni_blank);
|
||||
freeline (&seq1.lines[0]);
|
||||
seq1.count = 0;
|
||||
getseq (fp1, &seq1);
|
||||
advance_seq (fp1, &seq1, true, 1);
|
||||
seen_unpairable = true;
|
||||
continue;
|
||||
}
|
||||
if (diff > 0)
|
||||
{
|
||||
if (print_unpairables_2)
|
||||
prjoin (&uni_blank, &seq2.lines[0]);
|
||||
freeline (&seq2.lines[0]);
|
||||
seq2.count = 0;
|
||||
getseq (fp2, &seq2);
|
||||
advance_seq (fp2, &seq2, true, 2);
|
||||
seen_unpairable = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -499,7 +616,7 @@ join (FILE *fp1, FILE *fp2)
|
||||
match the current line from file2. */
|
||||
eof1 = false;
|
||||
do
|
||||
if (!getseq (fp1, &seq1))
|
||||
if (!advance_seq (fp1, &seq1, false, 1))
|
||||
{
|
||||
eof1 = true;
|
||||
++seq1.count;
|
||||
@@ -511,7 +628,7 @@ join (FILE *fp1, FILE *fp2)
|
||||
match the current line from file1. */
|
||||
eof2 = false;
|
||||
do
|
||||
if (!getseq (fp2, &seq2))
|
||||
if (!advance_seq (fp2, &seq2, false, 2))
|
||||
{
|
||||
eof2 = true;
|
||||
++seq2.count;
|
||||
@@ -550,25 +667,46 @@ join (FILE *fp1, FILE *fp2)
|
||||
seq2.count = 0;
|
||||
}
|
||||
|
||||
if (print_unpairables_1 && seq1.count)
|
||||
/* If the user did not specify --check-order, and the we read the
|
||||
* tail ends of both inputs to verify that they are in order. We
|
||||
* skip the rest of the tail once we have issued a warning for that
|
||||
* file, unless we actually need to print the unpairable lines.
|
||||
*/
|
||||
if (check_input_order != CHECK_ORDER_DISABLED
|
||||
&& !(issued_disorder_warning[0] && issued_disorder_warning[1]))
|
||||
checktail = true;
|
||||
else
|
||||
checktail = false;
|
||||
|
||||
if ((print_unpairables_1 || checktail) && seq1.count)
|
||||
{
|
||||
prjoin (&seq1.lines[0], &uni_blank);
|
||||
if (print_unpairables_1)
|
||||
prjoin (&seq1.lines[0], &uni_blank);
|
||||
freeline (&seq1.lines[0]);
|
||||
while (get_line (fp1, &line))
|
||||
seen_unpairable = true;
|
||||
while (get_line (fp1, &line, 1))
|
||||
{
|
||||
prjoin (&line, &uni_blank);
|
||||
if (print_unpairables_1)
|
||||
prjoin (&line, &uni_blank);
|
||||
freeline (&line);
|
||||
if (issued_disorder_warning[0] && !print_unpairables_1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (print_unpairables_2 && seq2.count)
|
||||
if ((print_unpairables_2 || checktail) && seq2.count)
|
||||
{
|
||||
prjoin (&uni_blank, &seq2.lines[0]);
|
||||
if (print_unpairables_2)
|
||||
prjoin (&uni_blank, &seq2.lines[0]);
|
||||
freeline (&seq2.lines[0]);
|
||||
while (get_line (fp2, &line))
|
||||
seen_unpairable = true;
|
||||
while (get_line (fp2, &line, 2))
|
||||
{
|
||||
prjoin (&uni_blank, &line);
|
||||
if (print_unpairables_2)
|
||||
prjoin (&uni_blank, &line);
|
||||
freeline (&line);
|
||||
if (issued_disorder_warning[1] && !print_unpairables_2)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -789,6 +927,9 @@ main (int argc, char **argv)
|
||||
atexit (close_stdout);
|
||||
|
||||
print_pairables = true;
|
||||
seen_unpairable = false;
|
||||
issued_disorder_warning[0] = issued_disorder_warning[1] = false;
|
||||
check_input_order = CHECK_ORDER_DEFAULT;
|
||||
|
||||
while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
|
||||
longopts, NULL))
|
||||
@@ -875,6 +1016,14 @@ main (int argc, char **argv)
|
||||
}
|
||||
break;
|
||||
|
||||
case NOCHECK_ORDER_OPTION:
|
||||
check_input_order = CHECK_ORDER_DISABLED;
|
||||
break;
|
||||
|
||||
case CHECK_ORDER_OPTION:
|
||||
check_input_order = CHECK_ORDER_ENABLED;
|
||||
break;
|
||||
|
||||
case 1: /* Non-option argument. */
|
||||
add_file_name (optarg, names, operand_status, joption_count,
|
||||
&nfiles, &prev_optc_status, &optc_status);
|
||||
@@ -935,5 +1084,8 @@ main (int argc, char **argv)
|
||||
if (fclose (fp2) != 0)
|
||||
error (EXIT_FAILURE, errno, "%s", names[1]);
|
||||
|
||||
exit (EXIT_SUCCESS);
|
||||
if (issued_disorder_warning[0] || issued_disorder_warning[1])
|
||||
exit (EXIT_FAILURE);
|
||||
else
|
||||
exit (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Test "join".
|
||||
|
||||
# Copyright (C) 1996, 1999, 2000, 2003, 2004 Free Software Foundation, Inc.
|
||||
# Copyright (C) 1996, 1999-2000, 2003-2004, 2008 Free Software Foundation, Inc.
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
@@ -140,7 +140,38 @@ my @tv = (
|
||||
# FIXME: change this to ensure the diagnostic makes sense
|
||||
['invalid-j', '-j x', {}, "", 1],
|
||||
|
||||
);
|
||||
# With ordering check, inputs in order
|
||||
['chkodr-1', '--check-order',
|
||||
[" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0],
|
||||
|
||||
# Without check, inputs in order
|
||||
['chkodr-2', '--nocheck-order',
|
||||
[" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0],
|
||||
|
||||
# Without check, both inputs out of order (in fact, in reverse order)
|
||||
# but all pairable. Support for this is a GNU extension.
|
||||
['chkodr-3', '--nocheck-order',
|
||||
[" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0],
|
||||
|
||||
# The extension should work without --nocheck-order, since that is the
|
||||
# default.
|
||||
['chkodr-4', '',
|
||||
[" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0],
|
||||
|
||||
# With check, both inputs out of order (in fact, in reverse order)
|
||||
['chkodr-5', '--check-order',
|
||||
[" b 1\n a 2\n", " b Y\n a Z\n"], "", 1],
|
||||
|
||||
# Without order check, both inputs out of order and some lines
|
||||
# unpairable. This is NOT supported by the GNU extension. All that
|
||||
# we really care about for this test is that the return status is
|
||||
# zero, since that is the only way to actually verify that the
|
||||
# --nocheck-order option had any effect. We don't actually want to
|
||||
# guarantee that join produces this output on stdout.
|
||||
['chkodr-6', '--nocheck-order',
|
||||
[" b 1\n a 2\n", " b Y\n c Z\n"], "b 1 Y\n", 0]
|
||||
)
|
||||
;
|
||||
|
||||
|
||||
sub test_vector
|
||||
|
||||
Reference in New Issue
Block a user