1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-04 03:06:23 +02:00
Files
coreutils/src/join.c
2004-12-08 23:05:13 +00:00

874 lines
20 KiB
C

/* join - join lines of two files on a common field
Copyright (C) 91, 1995-2004 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
Written by Mike Haertel, mike@gnu.ai.mit.edu. */
#include <config.h>
#include <stdio.h>
#include <assert.h>
#include <sys/types.h>
#include <getopt.h>
#include "system.h"
#include "error.h"
#include "hard-locale.h"
#include "linebuffer.h"
#include "memcasecmp.h"
#include "posixver.h"
#include "quote.h"
#include "xmemcoll.h"
#include "xstrtol.h"
/* The official name of this program (e.g., no `g' prefix). */
#define PROGRAM_NAME "join"
#define AUTHORS "Mike Haertel"
#define join system_join
/* An element of the list identifying which fields to print for each
output line. */
struct outlist
{
/* File number: 0, 1, or 2. 0 means use the join field.
1 means use the first file argument, 2 the second. */
int file;
/* Field index (zero-based), specified only when FILE is 1 or 2. */
size_t field;
struct outlist *next;
};
/* A field of a line. */
struct field
{
char *beg; /* First character in field. */
size_t len; /* The length of the field. */
};
/* A line read from an input file. */
struct line
{
struct linebuffer buf; /* The line itself. */
size_t nfields; /* Number of elements in `fields'. */
size_t nfields_allocated; /* Number of elements allocated for `fields'. */
struct field *fields;
};
/* One or more consecutive lines read from a file that all have the
same join field value. */
struct seq
{
size_t count; /* Elements used in `lines'. */
size_t alloc; /* Elements allocated in `lines'. */
struct line *lines;
};
/* The name this program was run with. */
char *program_name;
/* True if the LC_COLLATE locale is hard. */
static bool hard_LC_COLLATE;
/* True if obsolete option usage should be supported. */
static bool obsolete_usage;
/* If nonzero, print unpairable lines in file 1 or 2. */
static bool print_unpairables_1, print_unpairables_2;
/* If nonzero, print pairable lines. */
static bool print_pairables;
/* Empty output field filler. */
static char const *empty_filler;
/* Field to join on. */
static size_t join_field_1, join_field_2;
/* List of fields to print. */
static struct outlist outlist_head;
/* Last element in `outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;
/* Tab character separating fields; if this is NUL fields are separated
by any nonempty string of white space, otherwise by exactly one
tab character. */
static char tab;
/* When using getopt_long_only, no long option can start with
a character that is a short option. */
static struct option const longopts[] =
{
/* These three options are obsolete; see OBSOLETE_LONG_OPTIONS below. */
{"j", required_argument, NULL, 'j'},
{"j1", required_argument, NULL, '1'},
{"j2", required_argument, NULL, '2'},
{"ignore-case", no_argument, NULL, 'i'},
{GETOPT_HELP_OPTION_DECL},
{GETOPT_VERSION_OPTION_DECL},
{NULL, 0, NULL, 0}
};
/* Number of options at the start of longopts that are obsolete. */
enum { OBSOLETE_LONG_OPTIONS = 3 };
/* Used to print non-joining lines */
static struct line uni_blank;
/* If nonzero, ignore case when comparing join fields. */
static bool ignore_case;
/* Get the next option from the argument vector. */
static int
get_option (int argc, char **argv)
{
return (obsolete_usage
? getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts, NULL)
: getopt_long (argc, argv, "a:e:ij:1:2:o:t:v:",
longopts + OBSOLETE_LONG_OPTIONS, NULL));
}
void
usage (int status)
{
if (status != EXIT_SUCCESS)
fprintf (stderr, _("Try `%s --help' for more information.\n"),
program_name);
else
{
printf (_("\
Usage: %s [OPTION]... FILE1 FILE2\n\
"),
program_name);
fputs (_("\
For each pair of input lines with identical join fields, write a line to\n\
standard output. The default join field is the first, delimited\n\
by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
\n\
-a FILENUM print unpairable lines coming from file FILENUM, where\n\
FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
-e EMPTY replace missing input fields with EMPTY\n\
"), stdout);
fputs (_("\
-i, --ignore-case ignore differences in case when comparing fields\n\
-j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
-o FORMAT obey FORMAT while constructing output line\n\
-t CHAR use CHAR as input and output field separator\n\
"), stdout);
fputs (_("\
-v FILENUM like -a FILENUM, but suppress joined output lines\n\
-1 FIELD join on this FIELD of file 1\n\
-2 FIELD join on this FIELD of file 2\n\
"), stdout);
fputs (HELP_OPTION_DESCRIPTION, stdout);
fputs (VERSION_OPTION_DESCRIPTION, stdout);
fputs (_("\
\n\
Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
else fields are separated by CHAR. Any FIELD is a field number counted\n\
from 1. FORMAT is one or more comma or blank separated specifications,\n\
each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
the remaining fields from FILE1, the remaining fields from FILE2, all\n\
separated by CHAR.\n\
\n\
Important: FILE1 and FILE2 must be sorted on the join fields.\n\
"), stdout);
printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
}
exit (status);
}
/* Return true if C is a blank (a default input field separator). */
static inline bool
is_blank (unsigned char c)
{
return ISBLANK (c) != 0;
}
/* Record a field in LINE, with location FIELD and size LEN. */
static void
extract_field (struct line *line, char *field, size_t len)
{
if (line->nfields >= line->nfields_allocated)
{
line->fields = x2nrealloc (line->fields, &line->nfields_allocated,
sizeof (struct field));
}
line->fields[line->nfields].beg = field;
line->fields[line->nfields].len = len;
++(line->nfields);
}
/* Fill in the `fields' structure in LINE. */
static void
xfields (struct line *line)
{
char *ptr = line->buf.buffer;
char const *lim = ptr + line->buf.length - 1;
if (ptr == lim)
return;
if (tab)
{
unsigned char t = tab;
char *sep;
for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
extract_field (line, ptr, sep - ptr);
}
else
{
/* Skip leading blanks before the first field. */
while (is_blank (*ptr))
if (++ptr == lim)
return;
do
{
char *sep;
for (sep = ptr + 1; sep != lim && ! is_blank (*sep); sep++)
continue;
extract_field (line, ptr, sep - ptr);
if (sep == lim)
return;
for (ptr = sep + 1; ptr != lim && is_blank (*ptr); ptr++)
continue;
}
while (ptr != lim);
}
extract_field (line, ptr, lim - ptr);
}
/* Read a line from FP into LINE and split it into fields.
Return true if successful. */
static bool
get_line (FILE *fp, struct line *line)
{
initbuffer (&line->buf);
if (! readlinebuffer (&line->buf, fp))
{
if (ferror (fp))
error (EXIT_FAILURE, errno, _("read error"));
free (line->buf.buffer);
line->buf.buffer = NULL;
return false;
}
line->nfields_allocated = 0;
line->nfields = 0;
line->fields = NULL;
xfields (line);
return true;
}
static void
freeline (struct line *line)
{
free (line->fields);
free (line->buf.buffer);
line->buf.buffer = NULL;
}
static void
initseq (struct seq *seq)
{
seq->count = 0;
seq->alloc = 0;
seq->lines = NULL;
}
/* Read a line from FP and add it to SEQ. Return true if successful. */
static bool
getseq (FILE *fp, struct seq *seq)
{
if (seq->count == seq->alloc)
seq->lines = x2nrealloc (seq->lines, &seq->alloc, sizeof *seq->lines);
if (get_line (fp, &seq->lines[seq->count]))
{
++seq->count;
return true;
}
return false;
}
static void
delseq (struct seq *seq)
{
size_t i;
for (i = 0; i < seq->count; i++)
if (seq->lines[i].buf.buffer)
freeline (&seq->lines[i]);
free (seq->lines);
}
/* Return <0 if the join field in LINE1 compares less than the one in LINE2;
>0 if it compares greater; 0 if it compares equal.
Report an error and exit if the comparison fails. */
static int
keycmp (struct line const *line1, struct line const *line2)
{
/* Start of field to compare in each file. */
char *beg1;
char *beg2;
size_t len1;
size_t len2; /* Length of fields to compare. */
int diff;
if (join_field_1 < line1->nfields)
{
beg1 = line1->fields[join_field_1].beg;
len1 = line1->fields[join_field_1].len;
}
else
{
beg1 = NULL;
len1 = 0;
}
if (join_field_2 < line2->nfields)
{
beg2 = line2->fields[join_field_2].beg;
len2 = line2->fields[join_field_2].len;
}
else
{
beg2 = NULL;
len2 = 0;
}
if (len1 == 0)
return len2 == 0 ? 0 : -1;
if (len2 == 0)
return 1;
if (ignore_case)
{
/* FIXME: ignore_case does not work with NLS (in particular,
with multibyte chars). */
diff = memcasecmp (beg1, beg2, MIN (len1, len2));
}
else
{
if (hard_LC_COLLATE)
return xmemcoll (beg1, len1, beg2, len2);
diff = memcmp (beg1, beg2, MIN (len1, len2));
}
if (diff)
return diff;
return len1 < len2 ? -1 : len1 != len2;
}
/* Print field N of LINE if it exists and is nonempty, otherwise
`empty_filler' if it is nonempty. */
static void
prfield (size_t n, struct line const *line)
{
size_t len;
if (n < line->nfields)
{
len = line->fields[n].len;
if (len)
fwrite (line->fields[n].beg, 1, len, stdout);
else if (empty_filler)
fputs (empty_filler, stdout);
}
else if (empty_filler)
fputs (empty_filler, stdout);
}
/* Print the join of LINE1 and LINE2. */
static void
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
char output_separator = tab ? tab : ' ';
outlist = outlist_head.next;
if (outlist)
{
const struct outlist *o;
o = outlist;
while (1)
{
size_t field;
struct line const *line;
if (o->file == 0)
{
if (line1 == &uni_blank)
{
line = line2;
field = join_field_2;
}
else
{
line = line1;
field = join_field_1;
}
}
else
{
line = (o->file == 1 ? line1 : line2);
field = o->field;
}
prfield (field, line);
o = o->next;
if (o == NULL)
break;
putchar (output_separator);
}
putchar ('\n');
}
else
{
size_t i;
if (line1 == &uni_blank)
{
struct line const *t;
t = line1;
line1 = line2;
line2 = t;
}
prfield (join_field_1, line1);
for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
{
putchar (output_separator);
prfield (i, line1);
}
for (i = join_field_1 + 1; i < line1->nfields; ++i)
{
putchar (output_separator);
prfield (i, line1);
}
for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
{
putchar (output_separator);
prfield (i, line2);
}
for (i = join_field_2 + 1; i < line2->nfields; ++i)
{
putchar (output_separator);
prfield (i, line2);
}
putchar ('\n');
}
}
/* Print the join of the files in FP1 and FP2. */
static void
join (FILE *fp1, FILE *fp2)
{
struct seq seq1, seq2;
struct line line;
int diff;
bool eof1, eof2;
/* Read the first line of each file. */
initseq (&seq1);
getseq (fp1, &seq1);
initseq (&seq2);
getseq (fp2, &seq2);
while (seq1.count && seq2.count)
{
size_t i;
diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
if (diff < 0)
{
if (print_unpairables_1)
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
seq1.count = 0;
getseq (fp1, &seq1);
continue;
}
if (diff > 0)
{
if (print_unpairables_2)
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
seq2.count = 0;
getseq (fp2, &seq2);
continue;
}
/* Keep reading lines from file1 as long as they continue to
match the current line from file2. */
eof1 = false;
do
if (!getseq (fp1, &seq1))
{
eof1 = true;
++seq1.count;
break;
}
while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
/* Keep reading lines from file2 as long as they continue to
match the current line from file1. */
eof2 = false;
do
if (!getseq (fp2, &seq2))
{
eof2 = true;
++seq2.count;
break;
}
while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
if (print_pairables)
{
for (i = 0; i < seq1.count - 1; ++i)
{
size_t j;
for (j = 0; j < seq2.count - 1; ++j)
prjoin (&seq1.lines[i], &seq2.lines[j]);
}
}
for (i = 0; i < seq1.count - 1; ++i)
freeline (&seq1.lines[i]);
if (!eof1)
{
seq1.lines[0] = seq1.lines[seq1.count - 1];
seq1.count = 1;
}
else
seq1.count = 0;
for (i = 0; i < seq2.count - 1; ++i)
freeline (&seq2.lines[i]);
if (!eof2)
{
seq2.lines[0] = seq2.lines[seq2.count - 1];
seq2.count = 1;
}
else
seq2.count = 0;
}
if (print_unpairables_1 && seq1.count)
{
prjoin (&seq1.lines[0], &uni_blank);
freeline (&seq1.lines[0]);
while (get_line (fp1, &line))
{
prjoin (&line, &uni_blank);
freeline (&line);
}
}
if (print_unpairables_2 && seq2.count)
{
prjoin (&uni_blank, &seq2.lines[0]);
freeline (&seq2.lines[0]);
while (get_line (fp2, &line))
{
prjoin (&uni_blank, &line);
freeline (&line);
}
}
delseq (&seq1);
delseq (&seq2);
}
/* Add a field spec for field FIELD of file FILE to `outlist'. */
static void
add_field (int file, size_t field)
{
struct outlist *o;
assert (file == 0 || file == 1 || file == 2);
assert (file != 0 || field == 0);
o = xmalloc (sizeof *o);
o->file = file;
o->field = field;
o->next = NULL;
/* Add to the end of the list so the fields are in the right order. */
outlist_end->next = o;
outlist_end = o;
}
/* Convert a string of decimal digits, STR (the 1-based join field number),
to an integral value. Upon successful conversion, return one less
(the zero-based field number). If it cannot be converted, give a
diagnostic and exit. */
static size_t
string_to_join_field (char const *str, char const *err_msg_fmt)
{
size_t result;
uintmax_t val;
strtol_error s_err = xstrtoumax (str, NULL, 10, &val, "");
if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
{
error (EXIT_FAILURE, 0,
_("value %s is so large that it is not representable"),
quote (str));
}
if (s_err != LONGINT_OK || val == 0)
error (EXIT_FAILURE, 0, err_msg_fmt, quote (str));
result = val - 1;
return result;
}
/* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
If S is valid, return true. Otherwise, give a diagnostic, don't update
*FILE_INDEX or *FIELD_INDEX, and return false. */
static bool
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
bool valid = false;
/* The first character must be 0, 1, or 2. */
switch (s[0])
{
case '0':
if (s[1] == '\0')
{
*file_index = 0;
*field_index = 0;
valid = true;
}
else
{
/* `0' must be all alone -- no `.FIELD'. */
error (0, 0, _("invalid field specifier: `%s'"), s);
}
break;
case '1':
case '2':
if (s[1] == '.' && s[2] != '\0')
{
*field_index
= string_to_join_field (s + 2, _("invalid field number: %s"));
*file_index = s[0] - '0';
valid = true;
}
break;
default:
error (0, 0, _("invalid file number in field spec: `%s'"), s);
break;
}
return valid;
}
/* Add the comma or blank separated field spec(s) in STR to `outlist'.
Return true if successful. */
static bool
add_field_list (char *str)
{
char *p = str;
do
{
int file_index;
size_t field_index;
char const *spec_item = p;
p = strpbrk (p, ", \t");
if (p)
*p++ = 0;
if (! decode_field_spec (spec_item, &file_index, &field_index))
return false;
add_field (file_index, field_index);
}
while (p);
return true;
}
/* Add NAME to the array of input file NAMES; currently there are
*NFILES names in the list. */
static void
add_file_name (char const *name, char const *names[2], int *nfiles)
{
if (*nfiles == 2)
{
error (0, 0, _("extra operand %s"), quote (name));
usage (EXIT_FAILURE);
}
names[(*nfiles)++] = name;
}
int
main (int argc, char **argv)
{
char const *names[2];
FILE *fp1, *fp2;
int optc, prev_optc = 0, nfiles;
initialize_main (&argc, &argv);
program_name = argv[0];
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
hard_LC_COLLATE = hard_locale (LC_COLLATE);
obsolete_usage = (posix2_version () < 200112);
atexit (close_stdout);
nfiles = 0;
print_pairables = true;
while ((optc = get_option (argc, argv)) != -1)
{
long int val;
switch (optc)
{
case 'v':
print_pairables = false;
/* Fall through. */
case 'a':
if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
|| (val != 1 && val != 2))
error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
if (val == 1)
print_unpairables_1 = true;
else
print_unpairables_2 = true;
break;
case 'e':
empty_filler = optarg;
break;
case 'i':
ignore_case = true;
break;
case '1':
join_field_1 =
string_to_join_field (optarg,
_("invalid field number for file 1: `%s'"));
break;
case '2':
join_field_2 =
string_to_join_field (optarg,
_("invalid field number for file 2: `%s'"));
break;
case 'j':
join_field_1 = join_field_2 =
string_to_join_field (optarg,
_("invalid field number: `%s'"));
break;
case 'o':
if (! add_field_list (optarg))
exit (EXIT_FAILURE);
break;
case 't':
tab = *optarg;
break;
case 1: /* Non-option argument. */
if (prev_optc == 'o' && optind <= argc - 2)
{
if (! add_field_list (optarg))
exit (EXIT_FAILURE);
/* Might be continuation of args to -o. */
continue; /* Don't change `prev_optc'. */
}
add_file_name (optarg, names, &nfiles);
break;
case_GETOPT_HELP_CHAR;
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
default:
usage (EXIT_FAILURE);
}
prev_optc = optc;
}
if (! obsolete_usage)
while (optind < argc)
add_file_name (argv[optind++], names, &nfiles);
if (nfiles != 2)
{
if (nfiles == 0)
error (0, 0, _("missing operand"));
else
error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
usage (EXIT_FAILURE);
}
fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
if (!fp1)
error (EXIT_FAILURE, errno, "%s", names[0]);
fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
if (!fp2)
error (EXIT_FAILURE, errno, "%s", names[1]);
if (fp1 == fp2)
error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
join (fp1, fp2);
if (fp1 != stdin && fclose (fp1) == EOF)
error (EXIT_FAILURE, errno, "%s", names[0]);
if (fp2 != stdin && fclose (fp2) == EOF)
error (EXIT_FAILURE, errno, "%s", names[1]);
if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
error (EXIT_FAILURE, errno, "-");
exit (EXIT_SUCCESS);
}