1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-19 10:15:48 +02:00

join: new options: --check-order and --nocheck-order.

* src/join.c: Support --check-order and --nocheck-order.
New variables check_input_order, seen_unpairable and
issued_disorder_warning[]. For --check-order, verify that the
input files are in sorted order.  For the default case, check the
order only if there are unpairable lines.
(join): Perform ordering checks after reaching EOF on either
input.
(usage): Mention --check-order and --nocheck-order.
(dup_line): New function.  Save a copy of the previously-read input
line so that we can detect disorder on the input.
(get_line): Temporarily save a copy of the previous line (by
calling dupline) and check relative ordering (by calling
checkorder) before returning the newly-read line.
(getseq, join): Tell get_line which file we are reading from.
(advance_seq): New function, factoring out some of the code
commonly surrounding calls to getseq.
(checkorder): New function.  Verifies that a pair of consecutive
input lines are in sorted order.
* doc/coreutils.texi (join invocation): Document the new options
--check-order and --nocheck-order.
* tests/join/Test.pm (tv): Added tests for --check-order and
--nocheck-order.
* NEWS: Mention this new feature.
This commit is contained in:
James Youngman
2008-02-19 14:13:00 +01:00
committed by Jim Meyering
parent 4242d4f5c4
commit a1e715698a
4 changed files with 245 additions and 38 deletions

5
NEWS
View File

@@ -16,6 +16,11 @@ GNU coreutils NEWS -*- outline -*-
rather than reporting the invalid string format.
[bug introduced in coreutils-6.0]
** New features
join now verifies that the inputs are in sorted order. This check can
be turned off with the --nocheck-order option.
** Improvements
ls --color no longer outputs unnecessary escape sequences

View File

@@ -5149,10 +5149,10 @@ sort a file on its default join field, but if you select a non-default
locale, join field, separator, or comparison options, then you should
do so consistently between @command{join} and @command{sort}.
As a @acronym{GNU} extension, if the input has no unpairable lines the
sort order can be any order that considers two fields to be equal if and
only if the sort comparison described above considers them to be equal.
For example:
If the input has no unpairable lines, a @acronym{GNU} extension is
available; the sort order can be any order that considers two fields
to be equal if and only if the sort comparison described above
considers them to be equal. For example:
@example
$ cat file1
@@ -5169,6 +5169,19 @@ c c1 c2
b b1 b2
@end example
If the @option{--check-order} option is given, unsorted inputs will
cause a fatal error message. If the option @option{--nocheck-order}
is given, unsorted inputs will never cause an error message. If
neither of these options is given, wrongly sorted inputs are diagnosed
only if an input file is found to contain unpairable lines. If an
input file is diagnosed as being unsorted, the @command{join} command
will exit with a nonzero status (and the output should not be used).
Forcing @command{join} to process wrongly sorted input files
containing unpairable lines by specifying @option{--nocheck-order} is
not guaranteed to produce any particular output. The output will
probably not correspond with whatever you hoped it would be.
The defaults are:
@itemize
@item the join field is the first field in each line;
@@ -5188,6 +5201,12 @@ The program accepts the following options. Also see @ref{Common options}.
Print a line for each unpairable line in file @var{file-number} (either
@samp{1} or @samp{2}), in addition to the normal output.
@item --check-order
Fail with an error message if either input file is wrongly ordered.
@item --nocheck-order
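Do not check that both input files are in sorted order. When neither this option nor @option{--check-order} is given, ordering is checked only if an input file is found to contain unpairable lines.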
@item -e @var{string}
@opindex -e
Replace those output fields that are missing in the input with

View File

@@ -1,5 +1,5 @@
/* join - join lines of two files on a common field
Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2;
/* If nonzero, print pairable lines. */
static bool print_pairables;
/* If nonzero, we have seen at least one unpairable line. */
static bool seen_unpairable;
/* If nonzero, we have warned about disorder in that file. */
static bool issued_disorder_warning[2];
/* Empty output field filler. */
static char const *empty_filler;
@@ -108,9 +114,26 @@ static struct outlist *outlist_end = &outlist_head;
tab character whose value (when cast to unsigned char) equals TAB. */
static int tab = -1;
/* Selects how input ordering is verified.  Initialized to
   CHECK_ORDER_DEFAULT in main and changed there by the --check-order
   and --nocheck-order options; consulted by checkorder and by the
   tail-reading code in join.  */
static enum
{
/* Neither option given: disorder is reported as a non-fatal
   diagnostic, and only once an unpairable line has been seen.  */
CHECK_ORDER_DEFAULT,
/* --check-order: disorder is always checked and is a fatal error.  */
CHECK_ORDER_ENABLED,
/* --nocheck-order: ordering is never checked.  */
CHECK_ORDER_DISABLED
} check_input_order;
/* Option codes stored in the `val' member of the longopts entries for
   --check-order and --nocheck-order, which have no single-character
   equivalent.  Numbering starts at CHAR_MAX + 1 so that the codes
   cannot coincide with any character option code.  */
enum
{
CHECK_ORDER_OPTION = CHAR_MAX + 1,
NOCHECK_ORDER_OPTION
};
static struct option const longopts[] =
{
{"ignore-case", no_argument, NULL, 'i'},
{"check-order", no_argument, NULL, CHECK_ORDER_OPTION},
{"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
@@ -122,6 +145,9 @@ static struct line uni_blank;
/* If nonzero, ignore case when comparing join fields. */
static bool ignore_case;
static void checkorder (const struct line *, const struct line *, int);
void
usage (int status)
{
@@ -153,6 +179,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
-v FILENUM like -a FILENUM, but suppress joined output lines\n\
-1 FIELD join on this FIELD of file 1\n\
-2 FIELD join on this FIELD of file 2\n\
--check-order check that the input is correctly sorted, even\n\
if all input lines are pairable\n\
--nocheck-order do not check that the input is correctly sorted\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
@@ -167,6 +196,8 @@ separated by CHAR.\n\
\n\
Important: FILE1 and FILE2 must be sorted on the join fields.\n\
E.g., use `sort -k 1b,1' if `join' has no options.\n\
If the input is not sorted and some lines cannot be joined, a\n\
warning message will be given.\n\
"), stdout);
emit_bug_reporting_address ();
}
@@ -228,12 +259,49 @@ xfields (struct line *line)
extract_field (line, ptr, lim - ptr);
}
/* Return a newly allocated deep copy of OLD: its text buffer is
   duplicated and every field descriptor is rebased so that it points
   into the copy's buffer rather than into OLD's.  The caller owns the
   result.  */
static struct line *
dup_line (const struct line *old)
{
  struct line *copy = xmalloc (sizeof *copy);
  size_t idx;

  /* Clone the text buffer, keeping the same capacity and length.  */
  initbuffer (&copy->buf);
  copy->buf.size = old->buf.size;
  copy->buf.length = old->buf.length;
  copy->buf.buffer = xmalloc (old->buf.size);
  memcpy (copy->buf.buffer, old->buf.buffer, old->buf.length);

  /* Clone the field table with the same capacity.  */
  copy->nfields_allocated = old->nfields_allocated;
  copy->nfields = old->nfields;
  copy->fields = xnmalloc (old->nfields_allocated, sizeof *copy->fields);

  for (idx = 0; idx < old->nfields; idx++)
    {
      /* A field's position is its offset from the start of OLD's
         buffer; apply the same offset within the copy.  */
      copy->fields[idx].beg = (copy->buf.buffer
                               + (old->fields[idx].beg - old->buf.buffer));
      copy->fields[idx].len = old->fields[idx].len;
    }

  return copy;
}
/* Release the storage owned by LINE: its field table and its text
   buffer.  LINE itself is not freed.  Both pointers are reset to NULL
   afterwards so that a stale LINE cannot be dereferenced or freed a
   second time through either member.  */
static void
freeline (struct line *line)
{
  free (line->fields);
  line->fields = NULL;
  free (line->buf.buffer);
  line->buf.buffer = NULL;
}
/* Read a line from FP into LINE and split it into fields.
Return true if successful. */
static bool
get_line (FILE *fp, struct line *line)
get_line (FILE *fp, struct line *line, int which)
{
static struct line *prevline[2];
initbuffer (&line->buf);
if (! readlinebuffer (&line->buf, fp))
@@ -249,15 +317,14 @@ get_line (FILE *fp, struct line *line)
line->nfields = 0;
line->fields = NULL;
xfields (line);
return true;
}
static void
freeline (struct line *line)
{
free (line->fields);
free (line->buf.buffer);
line->buf.buffer = NULL;
if (prevline[which - 1])
{
checkorder (prevline[which - 1], line, which);
freeline (prevline[which - 1]);
}
prevline[which - 1] = dup_line (line);
return true;
}
static void
@@ -271,12 +338,12 @@ initseq (struct seq *seq)
/* Read a line from FP and add it to SEQ. Return true if successful. */
static bool
getseq (FILE *fp, struct seq *seq)
getseq (FILE *fp, struct seq *seq, int whichfile)
{
if (seq->count == seq->alloc)
seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
if (get_line (fp, &seq->lines[seq->count]))
if (get_line (fp, &seq->lines[seq->count], whichfile))
{
++seq->count;
return true;
@@ -284,6 +351,20 @@ getseq (FILE *fp, struct seq *seq)
return false;
}
/* Fetch the next line of input file WHICHFILE from FP into SEQ.
   When FIRST is true, the line at the head of SEQ is released and SEQ
   is emptied beforehand, so the new line becomes its first entry;
   otherwise the new line is appended after the existing ones.
   Return whatever getseq returns.  */
static bool
advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
{
  if (!first)
    return getseq (fp, seq, whichfile);

  /* Discard the current head before reading its replacement.  */
  freeline (&seq->lines[0]);
  seq->count = 0;
  return getseq (fp, seq, whichfile);
}
static void
delseq (struct seq *seq)
{
@@ -354,6 +435,44 @@ keycmp (struct line const *line1, struct line const *line2)
return len1 < len2 ? -1 : len1 != len2;
}
/* Check that successive input lines PREV and CURRENT from input file
* WHATFILE are presented in order, unless the user may be relying on
* the GNU extension that input lines may be out of order if no input
* lines are unpairable.
*
* If the user specified --nocheck-order, the check is not made.
* If the user specified --check-order, the problem is fatal.
* Otherwise (the default), the message is simply a warning.
*
* A message is printed at most once per input file.
*/
static void
checkorder (const struct line *prev,
const struct line *current,
int whatfile)
{
if (check_input_order != CHECK_ORDER_DISABLED
&& ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
{
if (!issued_disorder_warning[whatfile-1])
{
if (keycmp (prev, current) > 0)
{
error ((check_input_order == CHECK_ORDER_ENABLED ? 1 : 0),
0, _("File %d is not in sorted order"), whatfile);
/* If we get to here, the message was just a warning, but we
want only to issue it once. */
issued_disorder_warning[whatfile-1] = true;
}
}
}
}
/* Print field N of LINE if it exists and is nonempty, otherwise
`empty_filler' if it is nonempty. */
@@ -464,13 +583,13 @@ join (FILE *fp1, FILE *fp2)
struct seq seq1, seq2;
struct line line;
int diff;
bool eof1, eof2;
bool eof1, eof2, checktail;
/* Read the first line of each file. */
initseq (&seq1);
getseq (fp1, &seq1);
getseq (fp1, &seq1, 1);
initseq (&seq2);
getseq (fp2, &seq2);
getseq (fp2, &seq2, 2);
while (seq1.count && seq2.count)
{
@@ -480,18 +599,16 @@ join (FILE *fp1, FILE *fp2)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
seq1.count = 0;
getseq (fp1, &seq1);
advance_seq (fp1, &seq1, true, 1);
seen_unpairable = true;
continue;
}
if (diff > 0)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
seq2.count = 0;
getseq (fp2, &seq2);
advance_seq (fp2, &seq2, true, 2);
seen_unpairable = true;
continue;
}
@@ -499,7 +616,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file2. */
eof1 = false;
do
if (!getseq (fp1, &seq1))
if (!advance_seq (fp1, &seq1, false, 1))
{
eof1 = true;
++seq1.count;
@@ -511,7 +628,7 @@ join (FILE *fp1, FILE *fp2)
match the current line from file1. */
eof2 = false;
do
if (!getseq (fp2, &seq2))
if (!advance_seq (fp2, &seq2, false, 2))
{
eof2 = true;
++seq2.count;
@@ -550,25 +667,46 @@ join (FILE *fp1, FILE *fp2)
seq2.count = 0;
}
if (print_unpairables_1 && seq1.count)
/* If the user did not specify --nocheck-order, then we read the
* tail ends of both inputs to verify that they are in order. We
* skip the rest of the tail once we have issued a warning for that
* file, unless we actually need to print the unpairable lines.
*/
if (check_input_order != CHECK_ORDER_DISABLED
&& !(issued_disorder_warning[0] && issued_disorder_warning[1]))
checktail = true;
else
checktail = false;
if ((print_unpairables_1 || checktail) && seq1.count)
{
prjoin (&seq1.lines[0], &uni_blank);
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
while (get_line (fp1, &line))
seen_unpairable = true;
while (get_line (fp1, &line, 1))
{
prjoin (&line, &uni_blank);
if (print_unpairables_1)
prjoin (&line, &uni_blank);
freeline (&line);
if (issued_disorder_warning[0] && !print_unpairables_1)
break;
}
}
if (print_unpairables_2 && seq2.count)
if ((print_unpairables_2 || checktail) && seq2.count)
{
prjoin (&uni_blank, &seq2.lines[0]);
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
while (get_line (fp2, &line))
seen_unpairable = true;
while (get_line (fp2, &line, 2))
{
prjoin (&uni_blank, &line);
if (print_unpairables_2)
prjoin (&uni_blank, &line);
freeline (&line);
if (issued_disorder_warning[1] && !print_unpairables_2)
break;
}
}
@@ -789,6 +927,9 @@ main (int argc, char **argv)
atexit (close_stdout);
print_pairables = true;
seen_unpairable = false;
issued_disorder_warning[0] = issued_disorder_warning[1] = false;
check_input_order = CHECK_ORDER_DEFAULT;
while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
longopts, NULL))
@@ -875,6 +1016,14 @@ main (int argc, char **argv)
}
break;
case NOCHECK_ORDER_OPTION:
check_input_order = CHECK_ORDER_DISABLED;
break;
case CHECK_ORDER_OPTION:
check_input_order = CHECK_ORDER_ENABLED;
break;
case 1: /* Non-option argument. */
add_file_name (optarg, names, operand_status, joption_count,
&nfiles, &prev_optc_status, &optc_status);
@@ -935,5 +1084,8 @@ main (int argc, char **argv)
if (fclose (fp2) != 0)
error (EXIT_FAILURE, errno, "%s", names[1]);
exit (EXIT_SUCCESS);
if (issued_disorder_warning[0] || issued_disorder_warning[1])
exit (EXIT_FAILURE);
else
exit (EXIT_SUCCESS);
}

View File

@@ -1,6 +1,6 @@
# Test "join".
# Copyright (C) 1996, 1999, 2000, 2003, 2004 Free Software Foundation, Inc.
# Copyright (C) 1996, 1999-2000, 2003-2004, 2008 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -140,7 +140,38 @@ my @tv = (
# FIXME: change this to ensure the diagnostic makes sense
['invalid-j', '-j x', {}, "", 1],
);
# With ordering check, inputs in order
['chkodr-1', '--check-order',
[" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0],
# Without check, inputs in order
['chkodr-2', '--nocheck-order',
[" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0],
# Without check, both inputs out of order (in fact, in reverse order)
# but all pairable. Support for this is a GNU extension.
['chkodr-3', '--nocheck-order',
[" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0],
# The extension should work without --nocheck-order, since that is the
# default.
['chkodr-4', '',
[" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0],
# With check, both inputs out of order (in fact, in reverse order)
['chkodr-5', '--check-order',
[" b 1\n a 2\n", " b Y\n a Z\n"], "", 1],
# Without order check, both inputs out of order and some lines
# unpairable. This is NOT supported by the GNU extension. All that
# we really care about for this test is that the return status is
# zero, since that is the only way to actually verify that the
# --nocheck-order option had any effect. We don't actually want to
# guarantee that join produces this output on stdout.
['chkodr-6', '--nocheck-order',
[" b 1\n a 2\n", " b Y\n c Z\n"], "b 1 Y\n", 0]
)
;
sub test_vector