1
0
mirror of git://git.sv.gnu.org/coreutils.git synced 2026-04-20 02:36:16 +02:00

(utf8_wctomb): New function.

(print_unicode_char): Pass the Unicode character to iconv in UTF-8
format instead of in UCS-4 with platform dependent endianness.
This commit is contained in:
Jim Meyering
2000-03-17 19:32:27 +00:00
parent 4587e728aa
commit 03bc0dd04d

View File

@@ -28,6 +28,7 @@
#endif
#include <stdio.h>
#include <string.h>
#include <errno.h>
#ifndef errno
@@ -36,12 +37,6 @@ extern int errno;
#if HAVE_ICONV
# include <iconv.h>
/* Name of UCS-4 encoding with machine dependent endianness and alignment. */
# ifdef _LIBICONV_VERSION
# define UCS4_NAME "UCS-4-INTERNAL"
# else
# define UCS4_NAME "INTERNAL"
# endif
#endif
#include <error.h>
@@ -55,72 +50,141 @@ extern int errno;
#include "unicodeio.h"
/* Use md5.h for its nice detection of unsigned 32-bit type. */
#include "md5.h"
#undef uint32_t
#define uint32_t md5_uint32
/* When we pass a Unicode character to iconv(), we must pass it in a
suitable encoding. The standardized Unicode encodings are
UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
UCS-2 supports only characters up to \U0000FFFF.
UTF-16 and variants support only characters up to \U0010FFFF.
UTF-7 is way too complex and not supported by glibc-2.1.
UCS-4 specification leaves doubts about endianness and byte order
mark. glibc currently interprets it as big endian without byte order
mark, but this is not backed by an RFC.
So we use UTF-8. It supports characters up to \U7FFFFFFF and is
unambiguously defined. */
/* Encode the Unicode code point WC as UTF-8 into the buffer R, which
   must have room for up to 6 bytes.
   Return the length of the encoded sequence (1 to 6), or -1 when WC is
   larger than 0x7fffffff; in that case R is left untouched. */
static int
utf8_wctomb (unsigned char *r, unsigned int wc)
{
  /* Lead-byte prefix for a sequence of the given length (index 0 unused).
     A one-byte sequence carries no prefix bits at all. */
  static const unsigned char lead_prefix[7] =
    { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  int len;
  int pos;

  if (wc > 0x7fffffff)
    return -1;

  /* Pick the shortest sequence able to hold all significant bits. */
  if (wc < 0x80)
    len = 1;
  else if (wc < 0x800)
    len = 2;
  else if (wc < 0x10000)
    len = 3;
  else if (wc < 0x200000)
    len = 4;
  else if (wc < 0x4000000)
    len = 5;
  else
    len = 6;

  /* Fill the continuation bytes from the last one backwards; each one
     takes the low six bits that are still pending. */
  for (pos = len - 1; pos > 0; pos--)
    {
      r[pos] = 0x80 | (wc & 0x3f);
      wc >>= 6;
    }

  /* Whatever bits remain fit next to the length prefix in the lead byte. */
  r[0] = lead_prefix[len] | wc;

  return len;
}
/* Encoding name for UTF-8, used both as the source encoding handed to
   iconv_open() and as the string the locale's charset name is compared
   against.
   Luckily, the encoding's name is platform independent. */
#define UTF8_NAME "UTF-8"
/* Outputs the Unicode character CODE to the output stream STREAM.
Assumes that the locale doesn't change between two calls.

On a conversion failure, or when CODE cannot be encoded, the function
reports the problem through error () with a nonzero status argument.

NOTE(review): as shown here, the body appears to interleave the
pre-change lines (ucs4_to_local, UCS4_NAME, in) with the post-change
lines (utf8_to_local, UTF8_NAME, inbuf, count) of a diff whose +/-
markers were lost.  The #if/#endif directives and the braces do not
balance as written, and several statements appear twice.  Confirm
against the real file before relying on the comments below. */
void
print_unicode_char (FILE *stream, unsigned int code)
{
#if HAVE_ICONV
/* Becomes 1 once the one-time setup block below has run. */
static int initialized;
static iconv_t ucs4_to_local;
/* Nonzero when the locale's charset name equals UTF8_NAME. */
static int is_utf8;
#if HAVE_ICONV
/* Conversion descriptor from UTF8_NAME to the locale's charset
(or to "ASCII" when that cannot be opened). */
static iconv_t utf8_to_local;
#endif
uint32_t in;
char outbuf[25];
const char *inptr;
size_t inbytesleft;
char *outptr;
size_t outbytesleft;
size_t res;
/* Holds the UTF-8 encoding of CODE; COUNT is its length in bytes. */
char inbuf[6];
int count;
if (!initialized)
{
extern const char *locale_charset (void);
const char *charset = locale_charset ();
ucs4_to_local = (charset != NULL
? iconv_open (charset, UCS4_NAME)
: (iconv_t)(-1));
if (ucs4_to_local == (iconv_t)(-1))
is_utf8 = (charset != NULL && !strcmp (charset, UTF8_NAME));
#if HAVE_ICONV
if (!is_utf8)
{
/* For an unknown encoding, assume ASCII. */
ucs4_to_local = iconv_open ("ASCII", UCS4_NAME);
if (ucs4_to_local == (iconv_t)(-1))
error (1, 0, _("cannot output U+%04X: iconv function not usable"),
code);
/* Open a converter to the locale's charset; a NULL charset is treated
like a failed iconv_open. */
utf8_to_local = (charset != NULL
? iconv_open (charset, UTF8_NAME)
: (iconv_t)(-1));
if (utf8_to_local == (iconv_t)(-1))
{
/* For an unknown encoding, assume ASCII. */
utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
if (utf8_to_local == (iconv_t)(-1))
error (1, 0,
_("cannot output U+%04X: iconv function not usable"),
code);
}
}
#endif
initialized = 1;
}
in = code;
inptr = (char *) &in;
inbytesleft = sizeof (in);
outptr = outbuf;
outbytesleft = sizeof (outbuf);
/* Convert the character to UTF-8. */
count = utf8_wctomb ((unsigned char *) inbuf, code);
/* A negative count means utf8_wctomb rejected CODE. */
if (count < 0)
error (1, 0, _("U+%04X: character out of range"), code);
/* Convert the character. */
res = iconv (ucs4_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft);
if (inbytesleft > 0 || res == (size_t)(-1))
error (1, res == (size_t)(-1) ? errno : 0,
_("cannot convert U+%04X to local character set"), code);
/* When the locale charset is UTF-8, the encoded bytes are written
directly, without going through iconv. */
if (is_utf8)
{
fwrite (inbuf, 1, count, stream);
}
else
{
#if HAVE_ICONV
char outbuf[25];
const char *inptr;
size_t inbytesleft;
char *outptr;
size_t outbytesleft;
size_t res;
/* Avoid glibc-2.1 bug. */
inptr = inbuf;
inbytesleft = count;
outptr = outbuf;
outbytesleft = sizeof (outbuf);
/* Convert the character from UTF-8 to the locale's charset. */
res = iconv (utf8_to_local, &inptr, &inbytesleft, &outptr, &outbytesleft);
/* Unconsumed input or a (size_t)(-1) result is treated as failure; errno
is passed to error () only in the latter case. */
if (inbytesleft > 0 || res == (size_t)(-1))
error (1, res == (size_t)(-1) ? errno : 0,
_("cannot convert U+%04X to local character set"), code);
/* Avoid glibc-2.1 bug. */
# if defined _LIBICONV_VERSION || !(__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
/* Get back to the initial shift state. */
res = iconv (ucs4_to_local, NULL, NULL, &outptr, &outbytesleft);
if (res == (size_t)(-1))
error (1, errno, _("cannot convert U+%04X to local character set"), code);
/* Get back to the initial shift state. */
res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
if (res == (size_t)(-1))
error (1, errno, _("cannot convert U+%04X to local character set"),
code);
# endif
/* Write out however many bytes iconv produced in outbuf. */
fwrite (outbuf, 1, outptr - outbuf, stream);
fwrite (outbuf, 1, outptr - outbuf, stream);
#else
error (1, 0, _("cannot output U+%04X: iconv function not available"), code);
error (1, 0, _("cannot output U+%04X: iconv function not available"),
code);
#endif
}
}