mirror of
git://git.sv.gnu.org/coreutils.git
synced 2026-04-18 09:46:33 +02:00
printf: with \U, support all valid unicode points
Previously this was restricted to the C99 universal character subset, which restricted most values <= 0x9F, as that simplifies the C lexer. However printf(1) doesn't need this restriction. Note also the bash builtin printf already supports all values <= 0x9F. * src/printf.c (main): Relax the restriction on points <= 0x9F. * doc/coreutils.texi (printf invocation): Adjust description. * tests/misc/printf-cov.pl: Adjust accordingly. Add new cases. * NEWS: Mention the change in behavior. Reported at https://bugs.debian.org/1022857
This commit is contained in:
4
NEWS
4
NEWS
@@ -41,6 +41,10 @@ GNU coreutils NEWS -*- outline -*-
|
||||
reverting to the behavior in coreutils-9.0 and earlier.
|
||||
This behavior is now documented.
|
||||
|
||||
printf's \uNNNN and \UNNNNNNNN Unicode escape syntax now supports all valid
|
||||
Unicode code points. Previously it was restricted to the C
|
||||
universal character subset, which restricted most points <= 0x9F.
|
||||
|
||||
runcon now exits with status 125 for internal errors. Previously upon
|
||||
internal errors it would exit with status 1, which was less distinguishable
|
||||
from errors from the invoked command.
|
||||
|
||||
@@ -13209,16 +13209,16 @@ For example, @samp{printf '\400'} is equivalent to @samp{printf '\0'}.
|
||||
@cindex Unicode
|
||||
@cindex ISO/IEC 10646
|
||||
@vindex LC_CTYPE
|
||||
@command{printf} interprets two character syntaxes introduced in
|
||||
ISO C 99:
|
||||
@samp{\u} for 16-bit Unicode (ISO/IEC 10646)
|
||||
characters, specified as
|
||||
@command{printf} interprets two syntaxes for specifying Unicode
|
||||
(ISO/IEC 10646) characters.
|
||||
@samp{\u} for 16-bit Unicode characters, specified as
|
||||
four hexadecimal digits @var{hhhh}, and @samp{\U} for 32-bit Unicode
|
||||
characters, specified as eight hexadecimal digits @var{hhhhhhhh}.
|
||||
@command{printf} outputs the Unicode characters
|
||||
according to the @env{LC_CTYPE} locale. Unicode characters in the ranges
|
||||
U+0000@dots{}U+009F, U+D800@dots{}U+DFFF cannot be specified by this syntax,
|
||||
except for U+0024 ($), U+0040 (@@), and U+0060 (@`).
|
||||
according to the @env{LC_CTYPE} locale. Unicode characters in the range
|
||||
U+D800@dots{}U+DFFF cannot be specified by this syntax.
|
||||
This syntax fully supports the universal character subset
|
||||
introduced in ISO C 99.
|
||||
|
||||
The processing of @samp{\u} and @samp{\U} requires a full-featured
|
||||
@code{iconv} facility. It is activated on systems with glibc 2.2 (or newer),
|
||||
|
||||
11
src/printf.c
11
src/printf.c
@@ -298,14 +298,9 @@ print_esc (char const *escstart, bool octal_0)
|
||||
uni_value = uni_value * 16 + hextobin (*p);
|
||||
}
|
||||
|
||||
/* A universal character name shall not specify a character short
|
||||
identifier in the range 00000000 through 00000020, 0000007F through
|
||||
0000009F, or 0000D800 through 0000DFFF inclusive. A universal
|
||||
character name shall not designate a character in the required
|
||||
character set. */
|
||||
if ((uni_value <= 0x9f
|
||||
&& uni_value != 0x24 && uni_value != 0x40 && uni_value != 0x60)
|
||||
|| (uni_value >= 0xd800 && uni_value <= 0xdfff))
|
||||
/* Error for invalid code points 0000D800 through 0000DFFF inclusive.
|
||||
Note print_unicode_char() would print the literal \u.. in this case. */
|
||||
if (uni_value >= 0xd800 && uni_value <= 0xdfff)
|
||||
die (EXIT_FAILURE, 0, _("invalid universal character name \\%c%0*x"),
|
||||
esc_char, (esc_char == 'u' ? 4 : 8), uni_value);
|
||||
|
||||
|
||||
@@ -66,9 +66,14 @@ my @Tests =
|
||||
['esc', q('\xaa\0377'), {OUT=>"\xaa\0377"}],
|
||||
['esc-bad-hex', q('\x'), {EXIT=>1},
|
||||
{ERR=>"$prog: missing hexadecimal number in escape\n"}],
|
||||
# ['u4', q('\u09ac'), {OUT=>"\xe0a6ac"}],
|
||||
['u-invalid', q('\u0000'), {EXIT=>1},
|
||||
{ERR=>"$prog: invalid universal character name \\u0000\n"}],
|
||||
['u-bad-hex', q('\u00'), {EXIT=>1},
|
||||
{ERR=>"$prog: missing hexadecimal number in escape\n"}],
|
||||
['U-bad-hex', q('\U0000'), {EXIT=>1},
|
||||
{ERR=>"$prog: missing hexadecimal number in escape\n"}],
|
||||
['u4', q('\u0030'), {OUT=>"0"}],
|
||||
['U8', q('\U00000030'), {OUT=>"0"}],
|
||||
['u-invalid', q('\ud800'), {EXIT=>1},
|
||||
{ERR=>"$prog: invalid universal character name \\ud800\n"}],
|
||||
['u-missing', q('\u'), {EXIT=>1},
|
||||
{ERR=>"$prog: missing hexadecimal number in escape\n"}],
|
||||
['d-invalid', '%d no-num', {OUT=>'0'}, {EXIT=>1},
|
||||
|
||||
Reference in New Issue
Block a user