Magellan Linux

Annotation of /trunk/coreutils/patches-5.3.0/coreutils-5.3.0-i18n.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (hide annotations) (download)
Thu Oct 13 21:17:16 2005 UTC (18 years, 7 months ago) by niro
File size: 102675 byte(s)
patch set for coreutils-5.3.0

1 niro 44 http://www.openi18n.org/download/utildev/coreutils-5.3.0-i18n-0.1.patch
2    
3     --- coreutils/lib/linebuffer.h
4     +++ coreutils/lib/linebuffer.h
5     @@ -22,6 +22,11 @@
6    
7     # include <stdio.h>
8    
9     +/* Get mbstate_t. */
10     +# if HAVE_WCHAR_H
11     +# include <wchar.h>
12     +# endif
13     +
14     /* A `struct linebuffer' holds a line of text. */
15    
16     struct linebuffer
17     @@ -29,6 +34,9 @@ struct linebuffer
18     size_t size; /* Allocated. */
19     size_t length; /* Used. */
20     char *buffer;
21     +# if HAVE_WCHAR_H
22     + mbstate_t state;
23     +# endif
24     };
25    
26     /* Initialize linebuffer LINEBUFFER for use. */
27     --- coreutils/src/cut.c
28     +++ coreutils/src/cut.c
29     @@ -29,6 +29,12 @@
30     #include <assert.h>
31     #include <getopt.h>
32     #include <sys/types.h>
33     +
34     +/* Get mbstate_t, mbrtowc(). */
35     +#if HAVE_WCHAR_H
36     +# include <wchar.h>
37     +#endif
38     +
39     #include "system.h"
40    
41     #include "error.h"
42     @@ -37,6 +43,13 @@
43     #include "quote.h"
44     #include "xstrndup.h"
45    
46     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
47     + installation; work around this configuration error. */
48     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
49     +# undef MB_LEN_MAX
50     +# define MB_LEN_MAX 16
51     +#endif
52     +
53     /* The official name of this program (e.g., no `g' prefix). */
54     #define PROGRAM_NAME "cut"
55    
56     @@ -73,6 +86,54 @@ struct range_pair
57     size_t hi;
58     };
59    
60     +/* Refill the buffer BUF. */
61     +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
62     + do \
63     + { \
64     + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
65     + { \
66     + memmove (BUF, BUFPOS, BUFLEN); \
67     + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
68     + BUFPOS = BUF; \
69     + } \
70     + } \
71     + while (0)
72     +
73     +/* Get wide character which starts at BUFPOS. If the byte sequence is
74     + not valid as a character, CONVFAIL is 1. Otherwise 0. */
75     +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
76     + do \
77     + { \
78     + wchar_t tmp; \
79     + mbstate_t state_bak; \
80     + \
81     + if (BUFLEN < 1) \
82     + { \
83     + WC = WEOF; \
84     + break; \
85     + } \
86     + \
87     + /* Get a wide character. */ \
88     + CONVFAIL = 0; \
89     + state_bak = STATE; \
90     + MBLENGTH = mbrtowc (&tmp, BUFPOS, BUFLEN, &STATE); \
91     + WC = tmp; \
92     + \
93     + switch (MBLENGTH) \
94     + { \
95     + case (size_t)-1: \
96     + case (size_t)-2: \
97     + ++CONVFAIL; \
98     + STATE = state_bak; \
99     + /* Fall through. */ \
100     + \
101     + case 0: \
102     + MBLENGTH = 1; \
103     + break; \
104     + } \
105     + } \
106     + while (0)
107     +
108     /* This buffer is used to support the semantics of the -s option
109     (or lack of same) when the specified field list includes (does
110     not include) the first field. In both of those cases, the entire
111     @@ -85,7 +146,7 @@ static char *field_1_buffer;
112     /* The number of bytes allocated for FIELD_1_BUFFER. */
113     static size_t field_1_bufsize;
114    
115     -/* The largest field or byte index used as an endpoint of a closed
116     +/* The largest field, character or byte index used as an endpoint of a closed
117     or degenerate range specification; this doesn't include the starting
118     index of right-open-ended ranges. For example, with either range spec
119     `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
120     @@ -97,10 +158,11 @@ static size_t eol_range_start;
121    
122     /* This is a bit vector.
123     In byte mode, which bytes to output.
124     + In character mode, which characters to output.
125     In field mode, which DELIM-separated fields to output.
126     - Both bytes and fields are numbered starting with 1,
127     + Bytes, characters and fields are numbered starting with 1,
128     so the zeroth bit of this array is unused.
129     - A field or byte K has been selected if
130     + A byte, character or field K has been selected if
131     (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
132     || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
133     static unsigned char *printable_field;
134     @@ -109,9 +171,12 @@ enum operating_mode
135     {
136     undefined_mode,
137    
138     - /* Output characters that are in the given bytes. */
139     + /* Output bytes that are at the given positions. */
140     byte_mode,
141    
142     + /* Output characters that are at the given positions. */
143     + character_mode,
144     +
145     /* Output the given delimeter-separated fields. */
146     field_mode
147     };
148     @@ -121,6 +186,13 @@ char *program_name;
149    
150     static enum operating_mode operating_mode;
151    
152     +/* If true, when in byte mode, don't split multibyte characters. */
153     +static bool byte_mode_character_aware;
154     +
155     +/* If true, use the single-byte-locale functions even when
156     + this program runs in a multibyte locale. */
157     +static bool force_singlebyte_mode;
158     +
159     /* If true do not output lines containing no delimeter characters.
160     Otherwise, all such lines are printed. This option is valid only
161     with field mode. */
162     @@ -132,6 +204,9 @@ static bool complement;
163    
164     /* The delimeter character for field mode. */
165     static unsigned char delim;
166     +#if HAVE_WCHAR_H
167     +static wchar_t wcdelim;
168     +#endif
169    
170     /* True if the --output-delimiter=STRING option was specified. */
171     static bool output_delimiter_specified;
172     @@ -205,7 +280,7 @@ Mandatory arguments to long options are
173     -f, --fields=LIST select only these fields; also print any line\n\
174     that contains no delimiter character, unless\n\
175     the -s option is specified\n\
176     - -n (ignored)\n\
177     + -n with -b: don't split multibyte characters\n\
178     "), stdout);
179     fputs (_("\
180     --complement complement the set of selected bytes, characters\n\
181     @@ -360,7 +435,7 @@ set_fields (const char *fieldstr)
182     in_digits = false;
183     /* Starting a range. */
184     if (dash_found)
185     - FATAL_ERROR (_("invalid byte or field list"));
186     + FATAL_ERROR (_("invalid byte, character or field list"));
187     dash_found = true;
188     fieldstr++;
189    
190     @@ -385,14 +460,16 @@ set_fields (const char *fieldstr)
191     if (value == 0)
192     {
193     /* `n-'. From `initial' to end of line. */
194     - eol_range_start = initial;
195     + if(eol_range_start == 0 ||
196     + (eol_range_start != 0 && eol_range_start > initial))
197     + eol_range_start = initial;
198     field_found = true;
199     }
200     else
201     {
202     /* `m-n' or `-n' (1-n). */
203     if (value < initial)
204     - FATAL_ERROR (_("invalid byte or field list"));
205     + FATAL_ERROR (_("invalid byte, character or field list"));
206    
207     /* Is there already a range going to end of line? */
208     if (eol_range_start != 0)
209     @@ -478,7 +555,7 @@ set_fields (const char *fieldstr)
210     fieldstr++;
211     }
212     else
213     - FATAL_ERROR (_("invalid byte or field list"));
214     + FATAL_ERROR (_("invalid byte, character or field list"));
215     }
216    
217     max_range_endpoint = 0;
218     @@ -571,6 +648,81 @@ cut_bytes (FILE *stream)
219     }
220     }
221    
222     +#if HAVE_MBRTOWC
223     +/* This function is used in the following two cases.
224     +
225     + 1. Read from the stream STREAM, printing to standard output any selected
226     + characters.
227     +
228     + 2. Read from stream STREAM, printing to standard output any selected bytes,
229     + without splitting multibyte characters. */
230     +
231     +static void
232     +cut_characters_or_cut_bytes_no_split (FILE *stream)
233     +{
234     + size_t idx; /* Number of bytes or characters in the line so far. */
235     + /* Whether to begin printing delimiters between ranges for the current line.
236     + Set after we've begun printing data corresponding to the first range. */
237     + bool print_delimiter;
238     +
239     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
240     + char *bufpos; /* Next read position of BUF. */
241     + size_t buflen; /* The length of the byte sequence in buf. */
242     + wint_t wc; /* A gotten wide character. */
243     + size_t mblength; /* The byte size of a multibyte character which shows
244     + as same character as WC. */
245     + mbstate_t state; /* State of the stream. */
246     + int convfail; /* 1, when conversion is failed. Otherwise 0. */
247     +
248     +
249     + idx = 0;
250     + print_delimiter = false;
251     + buflen = 0;
252     + bufpos = buf;
253     + memset (&state, '\0', sizeof(mbstate_t));
254     +
255     + while (1)
256     + {
257     + REFILL_BUFFER (buf, bufpos, buflen, stream);
258     +
259     + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
260     +
261     + if (wc == WEOF)
262     + {
263     + if (idx > 0)
264     + putchar ('\n');
265     + break;
266     + }
267     + else if (wc == L'\n')
268     + {
269     + putchar ('\n');
270     + idx = 0;
271     + print_delimiter = false;
272     + }
273     + else
274     + {
275     + bool range_start;
276     + bool *rs = output_delimiter_specified ? &range_start : NULL;
277     +
278     + idx += (operating_mode == byte_mode) ? mblength : 1;
279     + if (print_kth (idx, rs))
280     + {
281     + if (rs && *rs && print_delimiter)
282     + {
283     + fwrite (output_delimiter_string, sizeof (char),
284     + output_delimiter_length, stdout);
285     + }
286     + print_delimiter = true;
287     + fwrite (bufpos, mblength, sizeof(char), stdout);
288     + }
289     + }
290     +
291     + buflen -= mblength;
292     + bufpos += mblength;
293     + }
294     +}
295     +#endif
296     +
297     /* Read from stream STREAM, printing to standard output any selected fields. */
298    
299     static void
300     @@ -692,13 +844,190 @@ cut_fields (FILE *stream)
301     }
302     }
303    
304     +#if HAVE_MBRTOWC
305     +static void
306     +cut_fields_mb (FILE *stream)
307     +{
308     + int c;
309     + size_t field_idx = 1;
310     + bool found_any_selected_field = false;
311     + bool buffer_first_field;
312     + int empty_input;
313     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
314     + char *bufpos; /* Next read position of BUF. */
315     + size_t buflen; /* The length of the byte sequence in buf. */
316     + wint_t wc = 0; /* A gotten wide character. */
317     + size_t mblength; /* The byte size of a multibyte character which shows
318     + as same character as WC. */
319     + mbstate_t state; /* State of the stream. */
320     + int convfail; /* 1, when conversion is failed. Otherwise 0. */
321     +
322     + bufpos = buf;
323     + buflen = 0;
324     + memset (&state, '\0', sizeof(mbstate_t));
325     +
326     + c = getc (stream);
327     + empty_input = (c == EOF);
328     + if (c != EOF)
329     + ungetc (c, stream);
330     + else
331     + wc = WEOF;
332     +
333     + /* To support the semantics of the -s flag, we may have to buffer
334     + all of the first field to determine whether it is `delimited.'
335     + But that is unnecessary if all non-delimited lines must be printed
336     + and the first field has been selected, or if non-delimited lines
337     + must be suppressed and the first field has *not* been selected.
338     + That is because a non-delimited line has exactly one field. */
339     + buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
340     +
341     + while (1)
342     + {
343     + if (field_idx == 1 && buffer_first_field)
344     + {
345     + size_t n_bytes = 0;
346     +
347     + while (1)
348     + {
349     + REFILL_BUFFER (buf, bufpos, buflen, stream);
350     +
351     + GET_NEXT_WC_FROM_BUFFER
352     + (wc, bufpos, buflen, mblength, state, convfail);
353     +
354     + if (wc == WEOF)
355     + break;
356     +
357     + field_1_buffer = xrealloc (field_1_buffer, n_bytes + mblength);
358     + memcpy (field_1_buffer + n_bytes, bufpos, mblength);
359     + n_bytes += mblength;
360     + buflen -= mblength;
361     + bufpos += mblength;
362     +
363     + if (!convfail && (wc == L'\n' || wc == wcdelim))
364     + break;
365     + }
366     +
367     + if (wc == WEOF)
368     + break;
369     +
370     + /* If the first field extends to the end of line (it is not
371     + delimited) and we are printing all non-delimited lines,
372     + print this one. */
373     + if (convfail || (!convfail && wc != wcdelim))
374     + {
375     + if (suppress_non_delimited)
376     + {
377     + /* Empty. */
378     + }
379     + else
380     + {
381     + fwrite (field_1_buffer, sizeof (char), n_bytes, stdout);
382     + /* Make sure the output line is newline terminated. */
383     + if (convfail || (!convfail && wc != L'\n'))
384     + putchar ('\n');
385     + }
386     + continue;
387     + }
388     +
389     + if (print_kth (1, NULL))
390     + {
391     + /* Print the field, but not the trailing delimiter. */
392     + fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout);
393     + found_any_selected_field = true;
394     + }
395     + ++field_idx;
396     + }
397     +
398     + if (wc != WEOF)
399     + {
400     + if (print_kth (field_idx, NULL))
401     + {
402     + if (found_any_selected_field)
403     + {
404     + fwrite (output_delimiter_string, sizeof (char),
405     + output_delimiter_length, stdout);
406     + }
407     + found_any_selected_field = true;
408     + }
409     +
410     + while (1)
411     + {
412     + REFILL_BUFFER (buf, bufpos, buflen, stream);
413     +
414     + GET_NEXT_WC_FROM_BUFFER
415     + (wc, bufpos, buflen, mblength, state, convfail);
416     +
417     + if (wc == WEOF)
418     + break;
419     + else if (!convfail && (wc == wcdelim || wc == L'\n'))
420     + {
421     + buflen -= mblength;
422     + bufpos += mblength;
423     + break;
424     + }
425     +
426     + if (print_kth (field_idx, NULL))
427     + fwrite (bufpos, mblength, sizeof(char), stdout);
428     +
429     + buflen -= mblength;
430     + bufpos += mblength;
431     + }
432     + }
433     +
434     + if ((!convfail || wc == L'\n') && buflen < 1)
435     + wc = WEOF;
436     +
437     + if (!convfail && wc == wcdelim)
438     + ++field_idx;
439     + else if (wc == WEOF || (!convfail && wc == L'\n'))
440     + {
441     + if (found_any_selected_field
442     + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
443     + putchar ('\n');
444     + if (wc == WEOF)
445     + break;
446     + field_idx = 1;
447     + found_any_selected_field = false;
448     + }
449     + }
450     +}
451     +#endif
452     +
453     static void
454     cut_stream (FILE *stream)
455     {
456     - if (operating_mode == byte_mode)
457     - cut_bytes (stream);
458     +#if HAVE_MBRTOWC
459     + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
460     + {
461     + switch (operating_mode)
462     + {
463     + case byte_mode:
464     + if (byte_mode_character_aware)
465     + cut_characters_or_cut_bytes_no_split (stream);
466     + else
467     + cut_bytes (stream);
468     + break;
469     +
470     + case character_mode:
471     + cut_characters_or_cut_bytes_no_split (stream);
472     + break;
473     +
474     + case field_mode:
475     + cut_fields_mb (stream);
476     + break;
477     +
478     + default:
479     + abort ();
480     + }
481     + }
482     else
483     - cut_fields (stream);
484     +#endif
485     + {
486     + if (operating_mode == field_mode)
487     + cut_fields (stream);
488     + else
489     + cut_bytes (stream);
490     + }
491     }
492    
493     /* Process file FILE to standard output.
494     @@ -748,6 +1077,8 @@ main (int argc, char **argv)
495     bool ok;
496     bool delim_specified = false;
497     char *spec_list_string IF_LINT(= NULL);
498     + char mbdelim[MB_LEN_MAX + 1];
499     + size_t delimlen = 0;
500    
501     initialize_main (&argc, &argv);
502     program_name = argv[0];
503     @@ -770,7 +1101,6 @@ main (int argc, char **argv)
504     switch (optc)
505     {
506     case 'b':
507     - case 'c':
508     /* Build the byte list. */
509     if (operating_mode != undefined_mode)
510     FATAL_ERROR (_("only one type of list may be specified"));
511     @@ -778,6 +1108,14 @@ main (int argc, char **argv)
512     spec_list_string = optarg;
513     break;
514    
515     + case 'c':
516     + /* Build the character list. */
517     + if (operating_mode != undefined_mode)
518     + FATAL_ERROR (_("only one type of list may be specified"));
519     + operating_mode = character_mode;
520     + spec_list_string = optarg;
521     + break;
522     +
523     case 'f':
524     /* Build the field list. */
525     if (operating_mode != undefined_mode)
526     @@ -789,9 +1127,32 @@ main (int argc, char **argv)
527     case 'd':
528     /* New delimiter. */
529     /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
530     - if (optarg[0] != '\0' && optarg[1] != '\0')
531     - FATAL_ERROR (_("the delimiter must be a single character"));
532     - delim = optarg[0];
533     +#if HAVE_MBRTOWC
534     + if(MB_CUR_MAX > 1)
535     + {
536     + mbstate_t state;
537     +
538     + memset (&state, '\0', sizeof(mbstate_t));
539     + delimlen = mbrtowc (&wcdelim, optarg, MB_LEN_MAX, &state);
540     +
541     + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
542     + force_singlebyte_mode = true;
543     + else
544     + {
545     + delimlen = (delimlen < 1) ? 1 : delimlen;
546     + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
547     + FATAL_ERROR (_("the delimiter must be a single character"));
548     + memcpy (mbdelim, optarg, delimlen); mbdelim[delimlen] = '\0';
549     + }
550     + }
551     +
552     + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
553     +#endif
554     + {
555     + if (optarg[0] != '\0' && optarg[1] != '\0')
556     + FATAL_ERROR (_("the delimiter must be a single character"));
557     + delim = (unsigned char) optarg[0];
558     + }
559     delim_specified = true;
560     break;
561    
562     @@ -805,6 +1166,7 @@ main (int argc, char **argv)
563     break;
564    
565     case 'n':
566     + byte_mode_character_aware = true;
567     break;
568    
569     case 's':
570     @@ -827,7 +1189,7 @@ main (int argc, char **argv)
571     if (operating_mode == undefined_mode)
572     FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
573    
574     - if (delim != '\0' && operating_mode != field_mode)
575     + if (delim_specified && operating_mode != field_mode)
576     FATAL_ERROR (_("an input delimiter may be specified only\
577     when operating on fields"));
578    
579     @@ -854,15 +1216,34 @@ main (int argc, char **argv)
580     }
581    
582     if (!delim_specified)
583     - delim = '\t';
584     + {
585     + delim = '\t';
586     +#ifdef HAVE_MBRTOWC
587     + wcdelim = L'\t';
588     + mbdelim[0] = '\t';
589     + mbdelim[1] = '\0';
590     + delimlen = 1;
591     +#endif
592     + }
593    
594     if (output_delimiter_string == NULL)
595     {
596     - static char dummy[2];
597     - dummy[0] = delim;
598     - dummy[1] = '\0';
599     - output_delimiter_string = dummy;
600     - output_delimiter_length = 1;
601     +#ifdef HAVE_MBRTOWC
602     + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
603     + {
604     + output_delimiter_string = xstrdup(mbdelim);
605     + output_delimiter_length = delimlen;
606     + }
607     +
608     + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
609     +#endif
610     + {
611     + static char dummy[2];
612     + dummy[0] = delim;
613     + dummy[1] = '\0';
614     + output_delimiter_string = dummy;
615     + output_delimiter_length = 1;
616     + }
617     }
618    
619     if (optind == argc)
620     --- coreutils/src/expand.c
621     +++ coreutils/src/expand.c
622     @@ -38,12 +38,32 @@
623     #include <stdio.h>
624     #include <getopt.h>
625     #include <sys/types.h>
626     +
627     +/* Get mbstate_t, mbrtowc, wcwidth. */
628     +#if HAVE_WCHAR_H
629     +# include <wchar.h>
630     +#endif
631     +#if HAVE_WCTYPE_H
632     +# include <wctype.h>
633     +#endif
634     +
635     #include "system.h"
636     #include "error.h"
637     #include "posixver.h"
638     #include "quote.h"
639     #include "xstrndup.h"
640    
641     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
642     + installation; work around this configuration error. */
643     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
644     +# define MB_LEN_MAX 16
645     +#endif
646     +
647     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
648     +#if HAVE_MBRTOWC && defined mbstate_t
649     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
650     +#endif
651     +
652     /* The official name of this program (e.g., no `g' prefix). */
653     #define PROGRAM_NAME "expand"
654    
655     @@ -352,8 +372,9 @@ expand (void)
656     }
657     else
658     {
659     - column++;
660     + if (!ISCNTRL (c))
661     + column++;
662     - if (!column)
663     + if (column >= UINTMAX_MAX)
664     error (EXIT_FAILURE, 0, _("input line is too long"));
665     }
666    
667     @@ -370,6 +391,163 @@ expand (void)
668     }
669     }
670    
671     +#if HAVE_MBRTOWC && HAVE_WCTYPE_H
672     +static void
673     +expand_multibyte (void)
674     +{
675     + /* Input stream. */
676     + FILE *fp = next_file (NULL);
677     +
678     + mbstate_t i_state = { 0 }; /* Current shift state of the input stream; starts in the initial state. */
679     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
680     + char *bufpos = buf; /* Next read position of BUF. */
681     + size_t buflen = 0; /* The length of the byte sequence in buf. */
682     +
683     + if (!fp)
684     + return;
685     +
686     + /* Binary I/O will preserve the original EOL style (DOS/Unix) of files. */
687     + SET_BINARY2 (fileno (fp), STDOUT_FILENO);
688     +
689     + for (;;)
690     + {
691     + /* Input character, or EOF. */
692     + wint_t wc;
693     +
694     + /* If true, perform translations. */
695     + bool convert = true;
696     +
697     +
698     + /* The following variables have valid values only when CONVERT
699     + is true: */
700     +
701     + /* Column of next input character. */
702     + uintmax_t column = 0;
703     +
704     + /* Index in TAB_LIST of next tab stop to examine. */
705     + size_t tab_index = 0;
706     +
707     +
708     + /* Convert a line of text. */
709     +
710     + do
711     + {
712     + wchar_t w;
713     + size_t mblength; /* The byte size of a multibyte character
714     + which shows as same character as WC. */
715     + mbstate_t i_state_bak; /* Back up the I_STATE. */
716     +
717     + /* Fill buffer */
718     + if (buflen < MB_LEN_MAX)
719     + {
720     + if (!feof(fp) && !ferror(fp)) {
721     + if (buflen > 0) memmove(buf, bufpos, buflen);
722     + buflen += fread(buf + buflen, sizeof(char), BUFSIZ, fp);
723     + bufpos = buf;
724     + }
725     + }
726     +
727     + if (buflen < 1) {
728     + /* Move to the next file */
729     + if (feof(fp) || ferror(fp)) {
730     + fp = next_file(fp);
731     + }
732     + if (!fp)
733     + return;
734     + memset (&i_state, '\0', sizeof (mbstate_t));
735     + SET_BINARY2 (fileno (fp), STDOUT_FILENO);
736     + continue;
737     + }
738     +
739     + i_state_bak = i_state;
740     + mblength = mbrtowc (&w, bufpos, buflen, &i_state);
741     + wc = w;
742     +
743     + if (mblength == (size_t) -1 || mblength == (size_t) -2) {
744     + i_state = i_state_bak;
745     + wc = L'\0';
746     + column += convert;
747     + mblength = 1;
748     + }
749     +
750     + if (convert)
751     + {
752     + if (wc == L'\t')
753     + {
754     + /* Column the next input tab stop is on. */
755     + uintmax_t next_tab_column;
756     +
757     + if (tab_size)
758     + next_tab_column = column + (tab_size - column % tab_size);
759     + else
760     + for (;;)
761     + if (tab_index == first_free_tab)
762     + {
763     + next_tab_column = column + 1;
764     + break;
765     + }
766     + else
767     + {
768     + uintmax_t tab = tab_list[tab_index++];
769     + if (column < tab)
770     + {
771     + next_tab_column = tab;
772     + break;
773     + }
774     + }
775     +
776     + if (next_tab_column < column)
777     + error (EXIT_FAILURE, 0, _("input line is too long"));
778     +
779     + while (++column < next_tab_column)
780     + if (putchar (' ') < 0)
781     + error (EXIT_FAILURE, errno, _("write error"));
782     +
783     + *bufpos = ' ';
784     + }
785     + else if (wc == L'\b')
786     + {
787     + /* Go back one column, and force recalculation of the
788     + next tab stop. */
789     + column -= !!column;
790     + tab_index -= !!tab_index;
791     + }
792     + else
793     + {
794     + if (!iswcntrl (wc))
795     + {
796     + int width = wcwidth (wc);
797     + if (width > 0) {
798     + if (column > (column + width))
799     + error (EXIT_FAILURE, 0, _("input line is too long"));
800     + column += width;
801     + }
802     + }
803     + }
804     +
805     + convert &= convert_entire_line | iswblank (wc);
806     + }
807     +
808     + if (mblength)
809     + {
810     + if (fwrite (bufpos, sizeof(char), mblength, stdout) < mblength)
811     + error (EXIT_FAILURE, errno, _("write error"));
812     + }
813     + else
814     + {
815     + if (putchar('\0'))
816     + error (EXIT_FAILURE, errno, _("write error"));
817     + mblength = 1;
818     + }
819     +
820     + buflen -= mblength;
821     + bufpos += mblength;
822     + }
823     + while (wc != L'\n');
824     + }
825     +}
826     +#endif
827     +
828     int
829     main (int argc, char **argv)
830     {
831     @@ -446,7 +624,12 @@ main (int argc, char **argv)
832    
833     file_list = (optind < argc ? &argv[optind] : stdin_argv);
834    
835     - expand ();
836     +#if HAVE_MBRTOWC
837     + if (MB_CUR_MAX > 1)
838     + expand_multibyte ();
839     + else
840     +#endif
841     + expand ();
842    
843     if (have_read_stdin && fclose (stdin) != 0)
844     error (EXIT_FAILURE, errno, "-");
845     --- coreutils/src/fold.c
846     +++ coreutils/src/fold.c
847     @@ -23,6 +23,19 @@
848     #include <getopt.h>
849     #include <sys/types.h>
850    
851     +/* Get MB_CUR_MAX. */
852     +#include <stdlib.h>
853     +
854     +/* Get mbrtowc, mbstate_t, wcwidth(). */
855     +#if HAVE_WCHAR_H
856     +# include <wchar.h>
857     +#endif
858     +
859     +/* Get iswprint(), iswctype(), wctype(). */
860     +#if HAVE_WCTYPE_H
861     +# include <wctype.h>
862     +#endif
863     +
864     #include "system.h"
865     #include "error.h"
866     #include "posixver.h"
867     @@ -30,14 +43,57 @@
868    
869     #define TAB_WIDTH 8
870    
871     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
872     + installation; work around this configuration error. */
873     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
874     +# undef MB_LEN_MAX
875     +# define MB_LEN_MAX 16
876     +#endif
877     +
878     +#ifndef HAVE_DECL_WCWIDTH
879     +"this configure-time declaration test was not run"
880     +#endif
881     +#if !HAVE_DECL_WCWIDTH
882     +extern int wcwidth ();
883     +#endif
884     +
885     +/* If wcwidth() doesn't exist, assume all printable characters have
886     + width 1. */
887     +#if !defined wcwidth && !HAVE_WCWIDTH
888     +# define wcwidth(wc) ((wc) == 0 ? 0 : iswprint (wc) ? 1 : -1)
889     +#endif
890     +
891     /* The official name of this program (e.g., no `g' prefix). */
892     #define PROGRAM_NAME "fold"
893    
894     #define AUTHORS "David MacKenzie"
895    
896     +#define FATAL_ERROR(Message) \
897     +do \
898     +{ \
899     + error (0, 0, (Message)); \
900     + usage (2); \
901     +} \
902     +while (0)
903     +
904     +enum operating_mode
905     +{
906     + /* Fold texts by columns that are at the given positions. */
907     + column_mode,
908     +
909     + /* Fold texts by bytes that are at the given positions. */
910     + byte_mode,
911     +
912     + /* Fold texts by characters that are at the given positions. */
913     + character_mode,
914     +};
915     +
916     /* The name this program was run with. */
917     char *program_name;
918    
919     +/* The current operating mode. (Default: column_mode) */
920     +static enum operating_mode operating_mode;
921     +
922     /* If nonzero, try to break on whitespace. */
923     static bool break_spaces;
924    
925     @@ -47,9 +103,15 @@ static bool count_bytes;
926     /* If nonzero, at least one of the files we read was standard input. */
927     static bool have_read_stdin;
928    
929     +/* wide character class `blank' */
930     +#if HAVE_MBRTOWC
931     +wctype_t blank_type;
932     +#endif
933     +
934     static struct option const longopts[] =
935     {
936     {"bytes", no_argument, NULL, 'b'},
937     + {"characters", no_argument, NULL, 'c'},
938     {"spaces", no_argument, NULL, 's'},
939     {"width", required_argument, NULL, 'w'},
940     {GETOPT_HELP_OPTION_DECL},
941     @@ -79,6 +141,7 @@ Mandatory arguments to long options are
942     "), stdout);
943     fputs (_("\
944     -b, --bytes count bytes rather than columns\n\
945     + -c, --characters count characters rather than columns\n\
946     -s, --spaces break at spaces\n\
947     -w, --width=WIDTH use WIDTH columns instead of 80\n\
948     "), stdout);
949     @@ -96,7 +159,7 @@ Mandatory arguments to long options are
950     static size_t
951     adjust_column (size_t column, char c)
952     {
953     - if (!count_bytes)
954     + if (operating_mode != byte_mode)
955     {
956     if (c == '\b')
957     {
958     @@ -115,14 +178,9 @@ adjust_column (size_t column, char c)
959     return column;
960     }
961    
962     -/* Fold file FILENAME, or standard input if FILENAME is "-",
963     - to stdout, with maximum line length WIDTH.
964     - Return true if successful. */
965     -
966     -static bool
967     -fold_file (char *filename, size_t width)
968     +static int
969     +fold_text (FILE *istream, int width)
970     {
971     - FILE *istream;
972     register int c;
973     size_t column = 0; /* Screen column where next char will go. */
974     size_t offset_out = 0; /* Index in `line_out' for next char. */
975     @@ -130,20 +188,6 @@ fold_file (char *filename, size_t width)
976     static size_t allocated_out = 0;
977     int saved_errno;
978    
979     - if (STREQ (filename, "-"))
980     - {
981     - istream = stdin;
982     - have_read_stdin = true;
983     - }
984     - else
985     - istream = fopen (filename, "r");
986     -
987     - if (istream == NULL)
988     - {
989     - error (0, errno, "%s", filename);
990     - return false;
991     - }
992     -
993     while ((c = getc (istream)) != EOF)
994     {
995     if (offset_out + 1 >= allocated_out)
996     @@ -221,6 +265,233 @@ fold_file (char *filename, size_t width)
997     if (offset_out)
998     fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
999    
1000     + return saved_errno;
1001     +}
1002     +
1003     +#if HAVE_MBRTOWC
1004     +static void
1005     +fold_multibyte_text (FILE *istream, int width)
1006     +{
1007     + int i;
1008     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
1009     + size_t buflen; /* The length of the byte sequence in buf. */
1010     + char *bufpos; /* Next read position of BUF. */
1011     + wint_t wc; /* A gotten wide character. */
1012     + wchar_t tmp;
1013     + size_t mblength; /* The byte size of a multibyte character which shows
1014     + as same character as WC. */
1015     + mbstate_t state, state_bak; /* State of the stream. */
1016     + int convfail; /* 1, when conversion is failed. Otherwise 0. */
1017     +
1018     + char *line_out = NULL;
1019     + size_t offset_out = 0; /* Index in `line_out' for next char. */
1020     + size_t allocated_out = 1024;
1021     +
1022     + int increment;
1023     + size_t column = 0;
1024     +
1025     + size_t last_blank_pos;
1026     + size_t last_blank_column;
1027     + int is_blank_seen;
1028     + int last_blank_increment;
1029     + int is_bs_following_last_blank;
1030     + size_t bs_following_last_blank_num;
1031     + int is_cr_after_last_blank;
1032     +
1033     +
1034     +#define CLEAR_FLAGS \
1035     + do \
1036     + { \
1037     + last_blank_pos = 0; \
1038     + last_blank_column = 0; \
1039     + is_blank_seen = 0; \
1040     + is_bs_following_last_blank = 0; \
1041     + bs_following_last_blank_num = 0; \
1042     + is_cr_after_last_blank = 0; \
1043     + } \
1044     + while (0)
1045     +
1046     +#define START_NEW_LINE \
1047     + do \
1048     + { \
1049     + putchar ('\n'); \
1050     + column = 0; \
1051     + offset_out = 0; \
1052     + CLEAR_FLAGS; \
1053     + } \
1054     + while (0)
1055     +
1056     + CLEAR_FLAGS;
1057     +
1058     + memset (&state, '\0', sizeof(mbstate_t));
1059     + line_out = xmalloc (allocated_out);
1060     +
1061     + buflen = fread (buf, sizeof(char), BUFSIZ, istream);
1062     + bufpos = buf;
1063     +
1064     + for (;; bufpos += mblength, buflen -= mblength)
1065     + {
1066     + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1067     + {
1068     + memmove (buf, bufpos, buflen);
1069     + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1070     + bufpos = buf;
1071     + }
1072     +
1073     + if (buflen < 1)
1074     + break;
1075     +
1076     + /* Get a wide character. */
1077     + convfail = 0;
1078     + state_bak = state;
1079     + mblength = mbrtowc (&tmp, bufpos, buflen, &state);
1080     + wc = tmp;
1081     +
1082     + switch (mblength)
1083     + {
1084     + case (size_t)-1:
1085     + case (size_t)-2:
1086     + convfail++;
1087     + state = state_bak;
1088     + /* Fall through. */
1089     +
1090     + case 0:
1091     + mblength = 1;
1092     + break;
1093     + }
1094     +
1095     + if (!convfail && wc == L'\n')
1096     + {
1097     + /* Always end the line, even when nothing is buffered, so that empty input lines are preserved. */
1098     + {
1099     + fwrite (line_out, sizeof(char), offset_out, stdout);
1100     + START_NEW_LINE;
1101     + }
1102     + continue;
1103     + }
1104     +
1105     +rescan:
1106     + if (operating_mode == byte_mode) /* byte mode */
1107     + increment = mblength;
1108     + else if (operating_mode == character_mode) /* character mode */
1109     + increment = 1;
1110     + else /* column mode */
1111     + {
1112     + if (convfail)
1113     + increment = 1;
1114     + else
1115     + {
1116     + switch (wc)
1117     + {
1118     + case L'\b':
1119     + increment = (column > 0) ? -1 : 0;
1120     + break;
1121     +
1122     + case L'\r':
1123     + increment = -1 * column;
1124     + break;
1125     +
1126     + case L'\t':
1127     + increment = 8 - column % 8;
1128     + break;
1129     +
1130     + default:
1131     + increment = wcwidth (wc);
1132     + increment = (increment < 0) ? 0 : increment;
1133     + }
1134     + }
1135     + }
1136     +
1137     + if (column + increment > width && break_spaces && last_blank_pos)
1138     + {
1139     + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1140     + putchar ('\n');
1141     +
1142     + offset_out = offset_out - last_blank_pos;
1143     + column = column - last_blank_column + ((is_cr_after_last_blank)
1144     + ? last_blank_increment : bs_following_last_blank_num);
1145     + memmove (line_out, line_out + last_blank_pos, offset_out);
1146     + CLEAR_FLAGS;
1147     + goto rescan;
1148     + }
1149     +
1150     + if (column + increment > width && column != 0)
1151     + {
1152     + fwrite (line_out, sizeof(char), offset_out, stdout);
1153     + START_NEW_LINE;
1154     + goto rescan;
1155     + }
1156     +
1157     + if (allocated_out < offset_out + mblength)
1158     + line_out = x2nrealloc (line_out, &allocated_out, sizeof *line_out);
1159     +
1160     + for (i = 0; i < mblength; i++)
1161     + {
1162     + *(line_out + offset_out) = *(bufpos + i);
1163     + ++offset_out;
1164     + }
1165     +
1166     + column += increment;
1167     +
1168     + if (is_blank_seen && !convfail && wc == L'\r')
1169     + is_cr_after_last_blank = 1;
1170     +
1171     + if (is_bs_following_last_blank && !convfail && wc == L'\b')
1172     + ++bs_following_last_blank_num;
1173     + else
1174     + is_bs_following_last_blank = 0;
1175     +
1176     + if (break_spaces && !convfail && iswctype (wc, blank_type))
1177     + {
1178     + last_blank_pos = offset_out;
1179     + last_blank_column = column;
1180     + is_blank_seen = 1;
1181     + last_blank_increment = increment;
1182     + is_bs_following_last_blank = 1;
1183     + bs_following_last_blank_num = 0;
1184     + is_cr_after_last_blank = 0;
1185     + }
1186     + }
1187     +
1188     + if (offset_out)
1189     + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1190     +
1191     + free(line_out);
1192     +}
1193     +#endif
1194     +
1195     +/* Fold file FILENAME, or standard input if FILENAME is "-",
1196     + to stdout, with maximum line length WIDTH.
1197     + Return true if successful. */
1198     +
1199     +static bool
1200     +fold_file (char *filename, int width)
1201     +{
1202     + FILE *istream;
1203     + int saved_errno = 0; /* Only fold_text sets this; keep it defined on the multibyte path. */
1204     +
1205     + if (STREQ (filename, "-"))
1206     + {
1207     + istream = stdin;
1208     + have_read_stdin = true;
1209     + }
1210     + else
1211     + istream = fopen (filename, "r");
1212     +
1213     + if (istream == NULL)
1214     + {
1215     + error (0, errno, "%s", filename);
1216     + return false;
1217     + }
1218     +
1219     + /* Define how ISTREAM is being folded. */
1220     +#if HAVE_MBRTOWC
1221     + if (MB_CUR_MAX > 1)
1222     + fold_multibyte_text (istream, width);
1223     + else
1224     +#endif
1225     + saved_errno = fold_text (istream, width);
1226     +
1227     if (ferror (istream))
1228     {
1229     error (0, saved_errno, "%s", filename);
1230     @@ -253,6 +524,10 @@ main (int argc, char **argv)
1231    
1232     atexit (close_stdout);
1233    
1234     +#if HAVE_MBRTOWC
1235     + blank_type = wctype ("blank");
1236     +#endif
1237     + operating_mode = column_mode;
1238     break_spaces = count_bytes = have_read_stdin = false;
1239    
1240     /* Turn any numeric options into -w options. */
1241     @@ -280,12 +555,23 @@ main (int argc, char **argv)
1242     }
1243     }
1244    
1245     - while ((optc = getopt_long (argc, argv, "bsw:", longopts, NULL)) != -1)
1246     + while ((optc = getopt_long (argc, argv, "bcsw:", longopts, NULL)) != -1)
1247     {
1248     switch (optc)
1249     {
1250     + case 0:
1251     + break;
1252     +
1253     case 'b': /* Count bytes rather than columns. */
1254     - count_bytes = true;
1255     + if (operating_mode != column_mode)
1256     + FATAL_ERROR (_("only one way of folding may be specified"));
1257     + operating_mode = byte_mode;
1258     + break;
1259     +
1260     + case 'c': /* Count characters rather than columns. */
1261     + if (operating_mode != column_mode)
1262     + FATAL_ERROR (_("only one way of folding may be specified"));
1263     + operating_mode = character_mode;
1264     break;
1265    
1266     case 's': /* Break at word boundaries. */
1267     --- coreutils/src/join.c
1268     +++ coreutils/src/join.c
1269     @@ -24,6 +24,16 @@
1270     #include <sys/types.h>
1271     #include <getopt.h>
1272    
1273     +/* Get mbstate_t, mbrtowc, wcrtomb, mbsinit. */
1274     +#if HAVE_WCHAR_H
1275     +# include <wchar.h>
1276     +#endif
1277     +
1278     +/* Get iswblank, towupper. */
1279     +#if HAVE_WCTYPE_H
1280     +# include <wctype.h>
1281     +#endif
1282     +
1283     #include "system.h"
1284     #include "error.h"
1285     #include "hard-locale.h"
1286     @@ -34,6 +44,11 @@
1287     #include "xmemcoll.h"
1288     #include "xstrtol.h"
1289    
1290     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1291     +#if HAVE_MBRTOWC && defined mbstate_t
1292     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1293     +#endif
1294     +
1295     /* The official name of this program (e.g., no `g' prefix). */
1296     #define PROGRAM_NAME "join"
1297    
1298     @@ -110,7 +125,10 @@ static struct outlist *outlist_end = &ou
1299     /* Tab character separating fields; if this is NUL fields are separated
1300     by any nonempty string of white space, otherwise by exactly one
1301     tab character. */
1302     -static char tab;
1303     +static char *tab = NULL;
1304     +
1305     +/* The number of bytes used for tab. */
1306     +static size_t tablen = 0;
1307    
1308     /* When using getopt_long_only, no long option can start with
1309     a character that is a short option. */
1310     @@ -233,7 +251,7 @@ xfields (struct line *line)
1311    
1312     if (tab)
1313     {
1314     - unsigned char t = tab;
1315     + unsigned char t = tab[0];
1316     char *sep;
1317     for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1318     extract_field (line, ptr, sep - ptr);
1319     @@ -262,6 +280,133 @@ xfields (struct line *line)
1320     extract_field (line, ptr, lim - ptr);
1321     }
1322    
1323     +#if HAVE_MBRTOWC
1324     +static void
1325     +xfields_multibyte (struct line *line)
1326     +{
1327     + int i;
1328     + char *ptr0 = line->buf.buffer;
1329     + char *ptr;
1330     + char *lim;
1331     + wchar_t wc = 0;
1332     + size_t mblength;
1333     + mbstate_t state, state_bak;
1334     +
1335     + memset (&state, 0, sizeof (mbstate_t));
1336     +
1337     + ptr = ptr0;
1338     + lim = ptr0 + line->buf.length - 1;
1339     +
1340     + if (tab == NULL)
1341     + {
1342     + /* Skip leading blanks before the first field. */
1343     + while (ptr < lim)
1344     + {
1345     + state_bak = state;
1346     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1347     +
1348     + if (mblength == (size_t) -1 || mblength == (size_t) -2)
1349     + {
1350     + mblength = 1;
1351     + state = state_bak;
1352     + break;
1353     + }
1354     + mblength = (mblength < 1) ? 1 : mblength;
1355     +
1356     + if (!iswblank (wc))
1357     + break;
1358     + ptr += mblength;
1359     + }
1360     + }
1361     +
1362     + for (i = 0; ptr < lim; ++i)
1363     + {
1364     + if (tab != NULL)
1365     + {
1366     + char *beg = ptr;
1367     + while (ptr < lim)
1368     + {
1369     + state_bak = state;
1370     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1371     +
1372     + if (mblength == (size_t) -1 || mblength == (size_t) -2)
1373     + {
1374     + mblength = 1;
1375     + state = state_bak;
1376     + }
1377     + mblength = (mblength < 1) ? 1 : mblength;
1378     +
1379     + if (mblength == tablen && !memcmp (ptr, tab, mblength))
1380     + break;
1381     + else
1382     + {
1383     + ptr += mblength;
1384     + continue;
1385     + }
1386     + }
1387     +
1388     + extract_field (line, beg, ptr - beg);
1389     + if (ptr < lim)
1390     + ptr += mblength;
1391     + }
1392     + else
1393     + {
1394     + char *beg = ptr;
1395     + while (ptr < lim)
1396     + {
1397     + state_bak = state;
1398     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1399     +
1400     + if (mblength == (size_t) -1 || mblength == (size_t) -2)
1401     + {
1402     + mblength = 1;
1403     + state = state_bak;
1404     + }
1405     + mblength = (mblength < 1) ? 1 : mblength;
1406     +
1407     + if (iswblank (wc))
1408     + break;
1409     + else
1410     + {
1411     + ptr += mblength;
1412     + continue;
1413     + }
1414     + }
1415     +
1416     + extract_field (line, beg, ptr - beg);
1417     + if (ptr < lim)
1418     + ptr += mblength;
1419     + }
1420     + }
1421     +
1422     + if (ptr != ptr0)
1423     + {
1424     + mblength = mbrtowc (&wc, ptr - mblength, mblength, &state);
1425     + wc = (mbsinit (&state) && *(ptr - mblength) == '\0') ? L'\0' : wc;
1426     + if (tab != NULL)
1427     + {
1428     + if (mblength == (size_t) -1 || mblength == (size_t) -2)
1429     + mblength = 1;
1430     +
1431     + if (mblength == tablen && !memcmp (ptr - mblength, tab, mblength))
1432     + /* Add one more (empty) field because the last character of
1433     + the line was a delimiter. */
1434     + extract_field (line, NULL, 0);
1435     + }
1436     + else
1437     + {
1438     + if (mblength != (size_t) -1 && mblength != (size_t) -2)
1439     + {
1440     + if (iswblank (wc))
1441     + /* Add one more (empty) field because the last character of
1442     + the line was a delimiter. */
1443     + extract_field (line, NULL, 0);
1444     + }
1445     + }
1446     + }
1447     +}
1448     +#endif
1449     +
1450     /* Read a line from FP into LINE and split it into fields.
1451     Return true if successful. */
1452    
1453     @@ -282,7 +427,13 @@ get_line (FILE *fp, struct line *line)
1454     line->nfields_allocated = 0;
1455     line->nfields = 0;
1456     line->fields = NULL;
1457     - xfields (line);
1458     +
1459     +#if HAVE_MBRTOWC
1460     + if (MB_CUR_MAX > 1)
1461     + xfields_multibyte (line);
1462     + else
1463     +#endif
1464     + xfields (line);
1465     return true;
1466     }
1467    
1468     @@ -336,56 +487,115 @@ static int
1469     keycmp (struct line const *line1, struct line const *line2)
1470     {
1471     /* Start of field to compare in each file. */
1472     - char *beg1;
1473     - char *beg2;
1474     -
1475     - size_t len1;
1476     - size_t len2; /* Length of fields to compare. */
1477     + char *beg[2];
1478     + char *copy[2];
1479     + size_t len[2]; /* Length of fields to compare. */
1480     int diff;
1481     + int i, j;
1482    
1483     if (join_field_1 < line1->nfields)
1484     {
1485     - beg1 = line1->fields[join_field_1].beg;
1486     - len1 = line1->fields[join_field_1].len;
1487     + beg[0] = line1->fields[join_field_1].beg;
1488     + len[0] = line1->fields[join_field_1].len;
1489     }
1490     else
1491     {
1492     - beg1 = NULL;
1493     - len1 = 0;
1494     + beg[0] = NULL;
1495     + len[0] = 0;
1496     }
1497    
1498     if (join_field_2 < line2->nfields)
1499     {
1500     - beg2 = line2->fields[join_field_2].beg;
1501     - len2 = line2->fields[join_field_2].len;
1502     + beg[1] = line2->fields[join_field_2].beg;
1503     + len[1] = line2->fields[join_field_2].len;
1504     }
1505     else
1506     {
1507     - beg2 = NULL;
1508     - len2 = 0;
1509     + beg[1] = NULL;
1510     + len[1] = 0;
1511     }
1512    
1513     - if (len1 == 0)
1514     - return len2 == 0 ? 0 : -1;
1515     - if (len2 == 0)
1516     + if (len[0] == 0)
1517     + return len[1] == 0 ? 0 : -1;
1518     + if (len[1] == 0)
1519     return 1;
1520    
1521     if (ignore_case)
1522     {
1523     - /* FIXME: ignore_case does not work with NLS (in particular,
1524     - with multibyte chars). */
1525     - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1526     +#ifdef HAVE_MBRTOWC
1527     + if (MB_CUR_MAX > 1)
1528     + {
1529     + size_t mblength;
1530     + wchar_t wc, uwc;
1531     + mbstate_t state, state_bak;
1532     +
1533     + memset (&state, '\0', sizeof (mbstate_t));
1534     +
1535     + for (i = 0; i < 2; i++)
1536     + {
1537     + copy[i] = alloca (len[i] + 1);
1538     +
1539     + for (j = 0; j < len[i];) /* Convert the whole field: xmemcoll reads len[i] bytes. */
1540     + {
1541     + state_bak = state;
1542     + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1543     +
1544     + switch (mblength)
1545     + {
1546     + case (size_t) -1:
1547     + case (size_t) -2:
1548     + state = state_bak;
1549     + /* Fall through */
1550     + case 0:
1551     + mblength = 1;
1552     + break;
1553     +
1554     + default:
1555     + uwc = towupper (wc);
1556     +
1557     + if (uwc != wc)
1558     + {
1559     + mbstate_t state_wc;
1560     +
1561     + memset (&state_wc, '\0', sizeof (mbstate_t));
1562     + wcrtomb (copy[i] + j, uwc, &state_wc);
1563     + }
1564     + else
1565     + memcpy (copy[i] + j, beg[i] + j, mblength);
1566     + }
1567     + j += mblength;
1568     + }
1569     + copy[i][j] = '\0';
1570     + }
1571     + return xmemcoll (copy[0], len[0], copy[1], len[1]);
1572     + }
1573     +#endif
1574     + if (hard_LC_COLLATE)
1575     + {
1576     + for (i = 0; i < 2; i++)
1577     + {
1578     + copy[i] = alloca (len[i] + 1);
1579     +
1580     + for (j = 0; j < len[i]; j++) /* Convert the whole field: xmemcoll reads len[i] bytes. */
1581     + copy[i][j] = toupper (beg[i][j]);
1582     +
1583     + copy[i][j] = '\0';
1584     + }
1585     + return xmemcoll (copy[0], len[0], copy[1], len[1]);
1586     + }
1587     + else
1588     + diff = memcasecmp (beg[0], beg[1], MIN (len[0], len[1]));
1589     }
1590     else
1591     {
1592     if (hard_LC_COLLATE)
1593     - return xmemcoll (beg1, len1, beg2, len2);
1594     - diff = memcmp (beg1, beg2, MIN (len1, len2));
1595     + return xmemcoll (beg[0], len[0], beg[1], len[1]);
1596     + diff = memcmp (beg[0], beg[1], MIN (len[0], len[1]));
1597     }
1598    
1599     if (diff)
1600     return diff;
1601     - return len1 < len2 ? -1 : len1 != len2;
1602     + return len[0] < len[1] ? -1 : len[0] != len[1];
1603     }
1604    
1605     /* Print field N of LINE if it exists and is nonempty, otherwise
1606     @@ -414,7 +624,8 @@ static void
1607     prjoin (struct line const *line1, struct line const *line2)
1608     {
1609     const struct outlist *outlist;
1610     - char output_separator = tab ? tab : ' ';
1611     + char *output_separator = tab ? tab : " ";
1612     + size_t output_separator_len = tab ? tablen : 1;
1613    
1614     outlist = outlist_head.next;
1615     if (outlist)
1616     @@ -449,7 +660,7 @@ prjoin (struct line const *line1, struct
1617     o = o->next;
1618     if (o == NULL)
1619     break;
1620     - putchar (output_separator);
1621     + fwrite (output_separator, 1, output_separator_len, stdout);
1622     }
1623     putchar ('\n');
1624     }
1625     @@ -467,23 +678,23 @@ prjoin (struct line const *line1, struct
1626     prfield (join_field_1, line1);
1627     for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
1628     {
1629     - putchar (output_separator);
1630     + fwrite (output_separator, 1, output_separator_len, stdout);
1631     prfield (i, line1);
1632     }
1633     for (i = join_field_1 + 1; i < line1->nfields; ++i)
1634     {
1635     - putchar (output_separator);
1636     + fwrite (output_separator, 1, output_separator_len, stdout);
1637     prfield (i, line1);
1638     }
1639    
1640     for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
1641     {
1642     - putchar (output_separator);
1643     + fwrite (output_separator, 1, output_separator_len, stdout);
1644     prfield (i, line2);
1645     }
1646     for (i = join_field_2 + 1; i < line2->nfields; ++i)
1647     {
1648     - putchar (output_separator);
1649     + fwrite (output_separator, 1, output_separator_len, stdout);
1650     prfield (i, line2);
1651     }
1652     putchar ('\n');
1653     @@ -814,7 +1025,21 @@ main (int argc, char **argv)
1654     break;
1655    
1656     case 't':
1657     - tab = *optarg;
1658     + tab = xstrdup (optarg);
1659     +#if HAVE_MBRTOWC
1660     + if (MB_CUR_MAX > 1)
1661     + {
1662     + mbstate_t state;
1663     +
1664     + memset (&state, 0, sizeof (mbstate_t));
1665     + tablen = mbrtowc (NULL, optarg, strlen (optarg), &state);
1666     + if (tablen == (size_t) 0
1667     + || tablen == (size_t) -1 || tablen == (size_t) -2)
1668     + tablen = 1;
1669     + }
1670     + else
1671     +#endif
1672     + tablen = 1;
1673     break;
1674    
1675     case 1: /* Non-option argument. */
1676     --- coreutils/src/pr.c
1677     +++ coreutils/src/pr.c
1678     @@ -314,6 +314,32 @@
1679     #include <stdio.h>
1680     #include <getopt.h>
1681     #include <sys/types.h>
1682     +
1683     +/* Get MB_LEN_MAX. */
1684     +#include <limits.h>
1685     +/* Work around GCC installations that incorrectly define MB_LEN_MAX to be 1. */
1686     +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1687     +# undef MB_LEN_MAX
1688     +# define MB_LEN_MAX 16
1689     +#endif
1690     +
1691     +/* Get MB_CUR_MAX. */
1692     +#include <stdlib.h>
1693     +
1694     +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1695     +/* Get mbstate_t, mbrtowc(), wcwidth(). */
1696     +#if HAVE_WCHAR_H
1697     +# include <wchar.h>
1698     +#endif
1699     +
1700     +/* Get iswprint(). -- for wcwidth(). */
1701     +#if HAVE_WCTYPE_H
1702     +# include <wctype.h>
1703     +#endif
1704     +#if !defined iswprint && !HAVE_ISWPRINT
1705     +# define iswprint(wc) 1
1706     +#endif
1707     +
1708     #include "system.h"
1709     #include "error.h"
1710     #include "hard-locale.h"
1711     @@ -321,6 +347,18 @@
1712     #include "posixver.h"
1713     #include "xstrtol.h"
1714    
1715     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1716     +#if HAVE_MBRTOWC && defined mbstate_t
1717     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1718     +#endif
1719     +
1720     +#ifndef HAVE_DECL_WCWIDTH
1721     +"this configure-time declaration test was not run"
1722     +#endif
1723     +#if !HAVE_DECL_WCWIDTH
1724     +extern int wcwidth ();
1725     +#endif
1726     +
1727     #if ! (HAVE_DECL_STRTOUMAX || defined strtoumax)
1728     uintmax_t strtoumax ();
1729     #endif
1730     @@ -416,8 +454,21 @@ struct COLUMN
1731     typedef struct COLUMN COLUMN;
1732    
1733     #define NULLCOL (COLUMN *)0
1734     +
1735     +/* Function pointers to switch between the functions for single byte locales
1736     + and those for multibyte locales. If multibyte functions do not exist on your system,
1737     + these pointers always point to the functions for single byte locales. */
1738     +static void (*print_char) (char c);
1739     +static int (*char_to_clump) (char c);
1740     +
1741     +/* Functions for single byte locale. */
1742     +static void print_char_single (char c);
1743     +static int char_to_clump_single (char c);
1744     +
1745     +/* Functions for multibyte locale. */
1746     +static void print_char_multi (char c);
1747     +static int char_to_clump_multi (char c);
1748    
1749     -static int char_to_clump (char c);
1750     static bool read_line (COLUMN *p);
1751     static bool print_page (void);
1752     static bool print_stored (COLUMN *p);
1753     @@ -427,6 +478,7 @@ static void print_header (void);
1754     static void pad_across_to (int position);
1755     static void add_line_number (COLUMN *p);
1756     static void getoptarg (char *arg, char switch_char, char *character,
1757     + int *character_length, int *character_width,
1758     int *number);
1759     void usage (int status);
1760     static void print_files (int number_of_files, char **av);
1761     @@ -441,7 +493,6 @@ static void store_char (char c);
1762     static void pad_down (int lines);
1763     static void read_rest_of_line (COLUMN *p);
1764     static void skip_read (COLUMN *p, int column_number);
1765     -static void print_char (char c);
1766     static void cleanup (void);
1767     static void print_sep_string (void);
1768     static void separator_string (const char *optarg_S);
1769     @@ -456,7 +507,7 @@ static COLUMN *column_vector;
1770     we store the leftmost columns contiguously in buff.
1771     To print a line from buff, get the index of the first character
1772     from line_vector[i], and print up to line_vector[i + 1]. */
1773     -static char *buff;
1774     +static unsigned char *buff;
1775    
1776     /* Index of the position in buff where the next character
1777     will be stored. */
1778     @@ -560,7 +611,7 @@ static int chars_per_column;
1779     static bool untabify_input = false;
1780    
1781     /* (-e) The input tab character. */
1782     -static char input_tab_char = '\t';
1783     +static char input_tab_char[MB_LEN_MAX] = "\t";
1784    
1785     /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1786     where the leftmost column is 1. */
1787     @@ -570,7 +621,10 @@ static int chars_per_input_tab = 8;
1788     static bool tabify_output = false;
1789    
1790     /* (-i) The output tab character. */
1791     -static char output_tab_char = '\t';
1792     +static char output_tab_char[MB_LEN_MAX] = "\t";
1793     +
1794     +/* (-i) The byte length of output tab character. */
1795     +static int output_tab_char_length = 1;
1796    
1797     /* (-i) The width of the output tab. */
1798     static int chars_per_output_tab = 8;
1799     @@ -644,7 +698,13 @@ static int power_10;
1800     static bool numbered_lines = false;
1801    
1802     /* (-n) Character which follows each line number. */
1803     -static char number_separator = '\t';
1804     +static char number_separator[MB_LEN_MAX] = "\t";
1805     +
1806     +/* (-n) The byte length of the character which follows each line number. */
1807     +static int number_separator_length = 1;
1808     +
1809     +/* (-n) The character width of the character which follows each line number. */
1810     +static int number_separator_width = 0;
1811    
1812     /* (-n) line counting starts with 1st line of input file (not with 1st
1813     line of 1st page printed). */
1814     @@ -697,6 +757,7 @@ static bool use_col_separator = false;
1815     -a|COLUMN|-m is a `space' and with the -J option a `tab'. */
1816     static char *col_sep_string = "";
1817     static int col_sep_length = 0;
1818     +static int col_sep_width = 0;
1819     static char *column_separator = " ";
1820     static char *line_separator = "\t";
1821    
1822     @@ -840,6 +901,13 @@ separator_string (const char *optarg_S)
1823     col_sep_length = (int) strlen (optarg_S);
1824     col_sep_string = xmalloc (col_sep_length + 1);
1825     strcpy (col_sep_string, optarg_S);
1826     +
1827     +#if HAVE_MBRTOWC
1828     + if (MB_CUR_MAX > 1)
1829     + col_sep_width = mbswidth (col_sep_string, 0);
1830     + else
1831     +#endif
1832     + col_sep_width = col_sep_length;
1833     }
1834    
1835     int
1836     @@ -864,6 +932,21 @@ main (int argc, char **argv)
1837    
1838     atexit (close_stdout);
1839    
1840     +/* Define which functions are used, the ones for single byte locale or the ones
1841     + for multibyte locale. */
1842     +#if HAVE_MBRTOWC
1843     + if (MB_CUR_MAX > 1)
1844     + {
1845     + print_char = print_char_multi;
1846     + char_to_clump = char_to_clump_multi;
1847     + }
1848     + else
1849     +#endif
1850     + {
1851     + print_char = print_char_single;
1852     + char_to_clump = char_to_clump_single;
1853     + }
1854     +
1855     n_files = 0;
1856     file_names = (argc > 1
1857     ? xmalloc ((argc - 1) * sizeof (char *))
1858     @@ -938,8 +1021,12 @@ main (int argc, char **argv)
1859     break;
1860     case 'e':
1861     if (optarg)
1862     - getoptarg (optarg, 'e', &input_tab_char,
1863     - &chars_per_input_tab);
1864     + {
1865     + int dummy_length, dummy_width;
1866     +
1867     + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1868     + &dummy_width, &chars_per_input_tab);
1869     + }
1870     /* Could check tab width > 0. */
1871     untabify_input = true;
1872     break;
1873     @@ -952,8 +1039,12 @@ main (int argc, char **argv)
1874     break;
1875     case 'i':
1876     if (optarg)
1877     - getoptarg (optarg, 'i', &output_tab_char,
1878     - &chars_per_output_tab);
1879     + {
1880     + int dummy_width;
1881     +
1882     + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1883     + &dummy_width, &chars_per_output_tab);
1884     + }
1885     /* Could check tab width > 0. */
1886     tabify_output = true;
1887     break;
1888     @@ -980,8 +1071,8 @@ main (int argc, char **argv)
1889     case 'n':
1890     numbered_lines = true;
1891     if (optarg)
1892     - getoptarg (optarg, 'n', &number_separator,
1893     - &chars_per_number);
1894     + getoptarg (optarg, 'n', number_separator, &number_separator_length,
1895     + &number_separator_width, &chars_per_number);
1896     break;
1897     case 'N':
1898     skip_count = false;
1899     @@ -1020,7 +1111,7 @@ main (int argc, char **argv)
1900     old_s = false;
1901     /* Reset an additional input of -s, -S dominates -s */
1902     col_sep_string = "";
1903     - col_sep_length = 0;
1904     + col_sep_length = col_sep_width = 0;
1905     use_col_separator = true;
1906     if (optarg)
1907     separator_string (optarg);
1908     @@ -1169,10 +1260,45 @@ main (int argc, char **argv)
1909     a number. */
1910    
1911     static void
1912     -getoptarg (char *arg, char switch_char, char *character, int *number)
1913     +getoptarg (char *arg, char switch_char, char *character, int *character_length,
1914     + int *character_width, int *number)
1915     {
1916     if (!ISDIGIT (*arg))
1917     - *character = *arg++;
1918     + {
1919     +#ifdef HAVE_MBRTOWC
1920     + if (MB_CUR_MAX > 1) /* for multibyte locale. */
1921     + {
1922     + wchar_t wc;
1923     + size_t mblength;
1924     + int width;
1925     + mbstate_t state = {'\0'};
1926     +
1927     + mblength = mbrtowc (&wc, arg, strlen (arg), &state);
1928     +
1929     + if (mblength == (size_t) -1 || mblength == (size_t) -2)
1930     + {
1931     + *character_length = 1;
1932     + *character_width = 1;
1933     + }
1934     + else
1935     + {
1936     + *character_length = (mblength < 1) ? 1 : mblength;
1937     + width = wcwidth (wc);
1938     + *character_width = (width < 0) ? 0 : width;
1939     + }
1940     +
1941     + strncpy (character, arg, *character_length);
1942     + arg += *character_length;
1943     + }
1944     + else /* for single byte locale. */
1945     +#endif
1946     + {
1947     + *character = *arg++;
1948     + *character_length = 1;
1949     + *character_width = 1;
1950     + }
1951     + }
1952     +
1953     if (*arg)
1954     {
1955     long int tmp_long;
1956     @@ -1237,7 +1363,7 @@ init_parameters (int number_of_files)
1957     else
1958     col_sep_string = column_separator;
1959    
1960     - col_sep_length = 1;
1961     + col_sep_length = col_sep_width = 1;
1962     use_col_separator = true;
1963     }
1964     /* It's rather pointless to define a TAB separator with column
1965     @@ -1269,11 +1395,11 @@ init_parameters (int number_of_files)
1966     TAB_WIDTH (chars_per_input_tab, chars_per_number); */
1967    
1968     /* Estimate chars_per_text without any margin and keep it constant. */
1969     - if (number_separator == '\t')
1970     + if (number_separator[0] == '\t')
1971     number_width = chars_per_number +
1972     TAB_WIDTH (chars_per_default_tab, chars_per_number);
1973     else
1974     - number_width = chars_per_number + 1;
1975     + number_width = chars_per_number + number_separator_width;
1976    
1977     /* The number is part of the column width unless we are
1978     printing files in parallel. */
1979     @@ -1288,7 +1414,7 @@ init_parameters (int number_of_files)
1980     }
1981    
1982     chars_per_column = (chars_per_line - chars_used_by_number -
1983     - (columns - 1) * col_sep_length) / columns;
1984     + (columns - 1) * col_sep_width) / columns;
1985    
1986     if (chars_per_column < 1)
1987     error (EXIT_FAILURE, 0, _("page width too narrow"));
1988     @@ -1416,7 +1542,7 @@ init_funcs (void)
1989    
1990     /* Enlarge p->start_position of first column to use the same form of
1991     padding_not_printed with all columns. */
1992     - h = h + col_sep_length;
1993     + h = h + col_sep_width;
1994    
1995     /* This loop takes care of all but the rightmost column. */
1996    
1997     @@ -1450,7 +1576,7 @@ init_funcs (void)
1998     }
1999     else
2000     {
2001     - h = h_next + col_sep_length;
2002     + h = h_next + col_sep_width;
2003     h_next = h + chars_per_column;
2004     }
2005     }
2006     @@ -1734,9 +1860,9 @@ static void
2007     align_column (COLUMN *p)
2008     {
2009     padding_not_printed = p->start_position;
2010     - if (padding_not_printed - col_sep_length > 0)
2011     + if (padding_not_printed - col_sep_width > 0)
2012     {
2013     - pad_across_to (padding_not_printed - col_sep_length);
2014     + pad_across_to (padding_not_printed - col_sep_width);
2015     padding_not_printed = ANYWHERE;
2016     }
2017    
2018     @@ -2010,13 +2136,13 @@ store_char (char c)
2019     /* May be too generous. */
2020     buff = x2nrealloc (buff, &buff_allocated, sizeof *buff);
2021     }
2022     - buff[buff_current++] = c;
2023     + buff[buff_current++] = (unsigned char) c;
2024     }
2025    
2026     static void
2027     add_line_number (COLUMN *p)
2028     {
2029     - int i;
2030     + int i, j;
2031     char *s;
2032     int left_cut;
2033    
2034     @@ -2039,22 +2165,24 @@ add_line_number (COLUMN *p)
2035     /* Tabification is assumed for multiple columns, also for n-separators,
2036     but `default n-separator = TAB' hasn't been given priority over
2037     equal column_width also specified by POSIX. */
2038     - if (number_separator == '\t')
2039     + if (number_separator[0] == '\t')
2040     {
2041     i = number_width - chars_per_number;
2042     while (i-- > 0)
2043     (p->char_func) (' ');
2044     }
2045     else
2046     - (p->char_func) (number_separator);
2047     + for (j = 0; j < number_separator_length; j++)
2048     + (p->char_func) (number_separator[j]);
2049     }
2050     else
2051     /* To comply with POSIX, we avoid any expansion of default TAB
2052     separator with a single column output. No column_width requirement
2053     has to be considered. */
2054     {
2055     - (p->char_func) (number_separator);
2056     - if (number_separator == '\t')
2057     + for (j = 0; j < number_separator_length; j++)
2058     + (p->char_func) (number_separator[j]);
2059     + if (number_separator[0] == '\t')
2060     output_position = POS_AFTER_TAB (chars_per_output_tab,
2061     output_position);
2062     }
2063     @@ -2215,7 +2343,7 @@ print_white_space (void)
2064     while (goal - h_old > 1
2065     && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2066     {
2067     - putchar (output_tab_char);
2068     + fwrite (output_tab_char, 1, output_tab_char_length, stdout);
2069     h_old = h_new;
2070     }
2071     while (++h_old <= goal)
2072     @@ -2235,6 +2363,7 @@ print_sep_string ()
2073     {
2074     char *s;
2075     int l = col_sep_length;
2076     + int not_space_flag;
2077    
2078     s = col_sep_string;
2079    
2080     @@ -2248,6 +2377,7 @@ print_sep_string ()
2081     {
2082     for (; separators_not_printed > 0; --separators_not_printed)
2083     {
2084     + not_space_flag = 0;
2085     while (l-- > 0)
2086     {
2087     /* 3 types of sep_strings: spaces only, spaces and chars,
2088     @@ -2261,12 +2391,15 @@ print_sep_string ()
2089     }
2090     else
2091     {
2092     + not_space_flag = 1;
2093     if (spaces_not_printed > 0)
2094     print_white_space ();
2095     putchar (*s++);
2096     - ++output_position;
2097     }
2098     }
2099     + if (not_space_flag)
2100     + output_position += col_sep_width;
2101     +
2102     /* sep_string ends with some spaces */
2103     if (spaces_not_printed > 0)
2104     print_white_space ();
2105     @@ -2293,8 +2426,9 @@ print_clump (COLUMN *p, int n, char *clu
2106     a nonspace is encountered, call print_white_space() to print the
2107     required number of tabs and spaces. */
2108    
2109     +
2110     static void
2111     -print_char (char c)
2112     +print_char_single (char c)
2113     {
2114     if (tabify_output)
2115     {
2116     @@ -2318,6 +2452,75 @@ print_char (char c)
2117     putchar (c);
2118     }
2119    
2120     +#ifdef HAVE_MBRTOWC
2121     +static void
2122     +print_char_multi (char c)
2123     +{
2124     + static size_t mbc_pos = 0;
2125     + static unsigned char mbc[MB_LEN_MAX] = {'\0'};
2126     + static mbstate_t state = {'\0'};
2127     + mbstate_t state_bak;
2128     + wchar_t wc;
2129     + unsigned char uc = (unsigned char) c;
2130     + size_t mblength;
2131     + int width;
2132     +
2133     + if (tabify_output)
2134     + {
2135     + state_bak = state;
2136     + mbc[mbc_pos++] = uc;
2137     + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2138     +
2139     + while (mbc_pos > 0)
2140     + {
2141     + switch (mblength)
2142     + {
2143     + case (size_t) -2:
2144     + state = state_bak;
2145     + return;
2146     +
2147     + case (size_t) -1:
2148     + state = state_bak;
2149     + ++output_position;
2150     + putchar (mbc[0]);
2151     + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2152     + --mbc_pos;
2153     + break;
2154     +
2155     + case 0:
2156     + mblength = 1;
2157     +
2158     + default:
2159     + if (wc == L' ')
2160     + {
2161     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2162     + --mbc_pos;
2163     + ++spaces_not_printed;
2164     + return;
2165     + }
2166     + else if (spaces_not_printed > 0)
2167     + print_white_space ();
2168     +
2169     + /* Nonprintables are assumed to have width 0, except L'\b'. */
2170     + if ((width = wcwidth (wc)) < 1)
2171     + {
2172     + if (wc == L'\b')
2173     + --output_position;
2174     + }
2175     + else
2176     + output_position += width;
2177     +
2178     + fwrite (mbc, 1, mblength, stdout);
2179     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2180     + mbc_pos -= mblength;
2181     + }
2182     + }
2183     + return;
2184     + }
2185     + putchar (uc);
2186     +}
2187     +#endif
2188     +
2189     /* Skip to page PAGE before printing.
2190     PAGE may be larger than total number of pages. */
2191    
2192     @@ -2498,9 +2701,9 @@ read_line (COLUMN *p)
2193     align_empty_cols = false;
2194     }
2195    
2196     - if (padding_not_printed - col_sep_length > 0)
2197     + if (padding_not_printed - col_sep_width > 0)
2198     {
2199     - pad_across_to (padding_not_printed - col_sep_length);
2200     + pad_across_to (padding_not_printed - col_sep_width);
2201     padding_not_printed = ANYWHERE;
2202     }
2203    
2204     @@ -2601,9 +2804,9 @@ print_stored (COLUMN *p)
2205     }
2206     }
2207    
2208     - if (padding_not_printed - col_sep_length > 0)
2209     + if (padding_not_printed - col_sep_width > 0)
2210     {
2211     - pad_across_to (padding_not_printed - col_sep_length);
2212     + pad_across_to (padding_not_printed - col_sep_width);
2213     padding_not_printed = ANYWHERE;
2214     }
2215    
2216     @@ -2616,8 +2819,8 @@ print_stored (COLUMN *p)
2217     if (spaces_not_printed == 0)
2218     {
2219     output_position = p->start_position + end_vector[line];
2220     - if (p->start_position - col_sep_length == chars_per_margin)
2221     - output_position -= col_sep_length;
2222     + if (p->start_position - col_sep_width == chars_per_margin)
2223     + output_position -= col_sep_width;
2224     }
2225    
2226     return true;
2227     @@ -2635,8 +2838,9 @@ print_stored (COLUMN *p)
2228     characters in clump_buff. (e.g, the width of '\b' is -1, while the
2229     number of characters is 1.) */
2230    
2231     +
2232     static int
2233     -char_to_clump (char c)
2234     +char_to_clump_single (char c)
2235     {
2236     unsigned char uc = c;
2237     register char *s = clump_buff;
2238     @@ -2646,10 +2850,10 @@ char_to_clump (char c)
2239     int chars;
2240     int chars_per_c = 8;
2241    
2242     - if (c == input_tab_char)
2243     + if (c == input_tab_char[0])
2244     chars_per_c = chars_per_input_tab;
2245    
2246     - if (c == input_tab_char || c == '\t')
2247     + if (c == input_tab_char[0] || c == '\t')
2248     {
2249     width = TAB_WIDTH (chars_per_c, input_position);
2250    
2251     @@ -2720,6 +2924,155 @@ char_to_clump (char c)
2252     return chars;
2253     }
2254    
2255     +#ifdef HAVE_MBRTOWC
2256     +static int
2257     +char_to_clump_multi (char c)
2258     +{
2259     + static size_t mbc_pos = 0;
2260     + static unsigned char mbc[MB_LEN_MAX] = {'\0'};
2261     + static mbstate_t state = {'\0'};
2262     + mbstate_t state_bak;
2263     + wchar_t wc;
2264     + unsigned char uc = (unsigned char) c;
2265     + size_t mblength;
2266     + int wc_width;
2267     + register char *s = clump_buff;
2268     + register int i, j;
2269     + char esc_buff[4];
2270     + int width;
2271     + int chars;
2272     + int chars_per_c = 8;
2273     + /* Append byte C to the pending multibyte character in mbc; once it converts (or is invalid), expand it into clump_buff, add its width to input_position and return the number of bytes stored (0 while the sequence is still incomplete). */
2274     + state_bak = state;
2275     + mbc[mbc_pos++] = uc;
2276     + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2277     +
2278     + width = 0;
2279     + chars = 0;
2280     + while (mbc_pos > 0)
2281     + {
2282     + switch (mblength)
2283     + {
2284     + case (size_t) -2:
2285     + state = state_bak;
2286     + return 0;
2287     +
2288     + case (size_t) -1:
2289     + state = state_bak;
2290     + mblength = 1;
2291     +
2292     + if (use_esc_sequence || use_cntrl_prefix)
2293     + {
2294     + width += 4; /* accumulate, like every other branch, so earlier output in this call is still counted */
2295     + chars += 4;
2296     + *s++ = '\\';
2297     + sprintf (esc_buff, "%03o", mbc[0]);
2298     + for (i = 0; i <= 2; ++i)
2299     + *s++ = (int) esc_buff[i];
2300     + }
2301     + else
2302     + {
2303     + width += 1;
2304     + chars += 1;
2305     + *s++ = mbc[0];
2306     + }
2307     + break;
2308     +
2309     + case 0:
2310     + mblength = 1;
2311     + /* Fall through */
2312     +
2313     + default:
2314     + if (memcmp (mbc, input_tab_char, mblength) == 0)
2315     + chars_per_c = chars_per_input_tab;
2316     +
2317     + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2318     + {
2319     + int width_inc;
2320     +
2321     + width_inc = TAB_WIDTH (chars_per_c, input_position);
2322     + width += width_inc;
2323     +
2324     + if (untabify_input)
2325     + {
2326     + for (i = width_inc; i; --i)
2327     + *s++ = ' ';
2328     + chars += width_inc;
2329     + }
2330     + else
2331     + {
2332     + for (i = 0; i < mblength; i++)
2333     + *s++ = mbc[i];
2334     + chars += mblength;
2335     + }
2336     + }
2337     + else if ((wc_width = wcwidth (wc)) < 1)
2338     + {
2339     + if (use_esc_sequence)
2340     + {
2341     + for (i = 0; i < mblength; i++)
2342     + {
2343     + width += 4;
2344     + chars += 4;
2345     + *s++ = '\\';
2346     + sprintf (esc_buff, "%03o", mbc[i]); /* escape each byte of the character, not the last byte repeatedly */
2347     + for (j = 0; j <= 2; ++j)
2348     + *s++ = (int) esc_buff[j];
2349     + }
2350     + }
2351     + else if (use_cntrl_prefix)
2352     + {
2353     + if (wc < 0200)
2354     + {
2355     + width += 2;
2356     + chars += 2;
2357     + *s++ = '^';
2358     + *s++ = wc ^ 0100;
2359     + }
2360     + else
2361     + {
2362     + for (i = 0; i < mblength; i++)
2363     + {
2364     + width += 4;
2365     + chars += 4;
2366     + *s++ = '\\';
2367     + sprintf (esc_buff, "%03o", mbc[i]); /* escape each byte of the character, not the last byte repeatedly */
2368     + for (j = 0; j <= 2; ++j)
2369     + *s++ = (int) esc_buff[j];
2370     + }
2371     + }
2372     + }
2373     + else if (wc == L'\b')
2374     + {
2375     + width += -1;
2376     + chars += 1;
2377     + *s++ = c;
2378     + }
2379     + else
2380     + {
2381     + width += 0;
2382     + chars += mblength;
2383     + for (i = 0; i < mblength; i++)
2384     + *s++ = mbc[i];
2385     + }
2386     + }
2387     + else
2388     + {
2389     + width += wc_width;
2390     + chars += mblength;
2391     + for (i = 0; i < mblength; i++)
2392     + *s++ = mbc[i];
2393     + }
2394     + }
2395     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2396     + mbc_pos -= mblength;
2397     + }
2398     +
2399     + input_position += width;
2400     + return chars;
2401     +}
2402     +#endif
2403     +
2404     /* We've just printed some files and need to clean up things before
2405     looking for more options and printing the next batch of files.
2406    
2407     --- coreutils/src/sort.c
2408     +++ coreutils/src/sort.c
2409     @@ -27,6 +27,19 @@
2410     #include <sys/types.h>
2411     #include <signal.h>
2412     #include <stdio.h>
2413     +#include <assert.h>
2414     +
2415     +/* Get mbstate_t, mbrtowc(), wcrtomb(). */
2416     +#if HAVE_WCHAR_H
2417     +# include <wchar.h>
2418     +#endif
2419     +
2420     +/* Get iswprint(), iswctype(), towupper(). */
2421     +#if HAVE_WCTYPE_H
2422     +# include <wctype.h>
2423     +wctype_t blank_type; /* = wctype ("blank"); */
2424     +#endif
2425     +
2426     #include "system.h"
2427     #include "error.h"
2428     #include "hard-locale.h"
2429     @@ -46,6 +59,17 @@ struct rlimit { size_t rlim_cur; };
2430     # define getrlimit(Resource, Rlp) (-1)
2431     #endif
2432    
2433     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
2434     + installation; work around this configuration error. */
2435     +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
2436     +# define MB_LEN_MAX 16
2437     +#endif
2438     +
2439     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
2440     +#if HAVE_MBRTOWC && defined mbstate_t
2441     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
2442     +#endif
2443     +
2444     /* The official name of this program (e.g., no `g' prefix). */
2445     #define PROGRAM_NAME "sort"
2446    
2447     @@ -91,14 +115,38 @@ static char decimal_point;
2448     /* Thousands separator; if CHAR_MAX + 1, then there isn't one. */
2449     static int thousands_sep;
2450    
2451     +static int force_general_numcompare = 0;
2452     +
2453     /* Nonzero if the corresponding locales are hard. */
2454     static bool hard_LC_COLLATE;
2455     -#if HAVE_NL_LANGINFO
2456     +#if HAVE_LANGINFO_CODESET
2457     static bool hard_LC_TIME;
2458     #endif
2459    
2460     #define NONZERO(x) ((x) != 0)
2461    
2462     +/* Set MBLENGTH to the byte length of the multibyte character at PTR (bounded by LIM). Invalid or incomplete sequences restore STATE and count as one byte; a NUL also counts as one byte. */
2463     +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2464     + do \
2465     + { \
2466     + wchar_t wc; \
2467     + mbstate_t state_bak; \
2468     + \
2469     + state_bak = STATE; \
2470     + MBLENGTH = mbrtowc (&wc, (PTR), (LIM) - (PTR), &STATE); \
2471     + \
2472     + switch (MBLENGTH) \
2473     + { \
2474     + case (size_t)-1: \
2475     + case (size_t)-2: \
2476     + STATE = state_bak; \
2477     + /* Fall through. */ \
2478     + case 0: \
2479     + MBLENGTH = 1; \
2480     + } \
2481     + } \
2482     + while (0)
2483     +
2484     /* The kind of blanks for '-b' to skip in various options. */
2485     enum blanktype { bl_start, bl_end, bl_both };
2486    
2487     @@ -235,13 +283,22 @@ static bool reverse;
2488     they were read if all keys compare equal. */
2489     static bool stable;
2490    
2491     -/* If TAB has this value, blanks separate fields. */
2492     +/* If TAB has this value, blanks separate fields.
2493     enum { TAB_DEFAULT = CHAR_MAX + 1 };
2494     +*/
2495    
2496     /* Tab character separating fields. If TAB_DEFAULT, then fields are
2497     separated by the empty string between a non-blank character and a blank
2498     - character. */
2499     + character.
2500     static int tab = TAB_DEFAULT;
2501     +*/
2502     +
2503     +/* Tab character separating fields. If tab_default is true, fields are
2504     + separated by the empty string between a non-whitespace character and a
2505     + whitespace character. */
2506     +static bool tab_default = true;
2507     +static unsigned char tab[MB_LEN_MAX + 1];
2508     +static size_t tab_length = 1;
2509    
2510     /* Flag to remove consecutive duplicate lines from the output.
2511     Only the last of a sequence of equal lines will be output. */
2512     @@ -386,6 +443,43 @@ struct tempnode
2513     static struct tempnode *volatile temphead;
2514     static struct tempnode *volatile *temptail = &temphead;
2515    
2516     +/* Function pointers. */
2517     +static char *
2518     +(* begfield) (const struct line *line, const struct keyfield *key);
2519     +
2520     +static char *
2521     +(* limfield) (const struct line *line, const struct keyfield *key);
2522     +
2523     +static int
2524     +(*getmonth) (const char *s, size_t len);
2525     +
2526     +static int
2527     +(* keycompare) (const struct line *a, const struct line *b);
2528     +
2529     +/* Return nonzero if the multibyte character at STR belongs to the class held in blank_type.
2530     + Set *LENGTH to its byte length (1 for an invalid, incomplete or NUL character). NOTE(review): always offers MB_LEN_MAX bytes to mbrtowc -- confirm callers guarantee that many readable bytes or a terminator. */
2531     +#if HAVE_MBRTOWC
2532     +static int
2533     +ismbblank (const char *str, size_t *length)
2534     +{
2535     + size_t mblength;
2536     + wchar_t wc;
2537     + mbstate_t state;
2538     +
2539     + memset (&state, '\0', sizeof(mbstate_t));
2540     + mblength = mbrtowc (&wc, str, MB_LEN_MAX, &state);
2541     +
2542     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
2543     + {
2544     + *length = 1;
2545     + return 0;
2546     + }
2547     +
2548     + *length = (mblength < 1) ? 1 : mblength;
2549     + return (iswctype (wc, blank_type));
2550     +}
2551     +#endif
2552     +
2553     /* Clean up any remaining temporary files. */
2554    
2555     static void
2556     @@ -535,7 +629,7 @@ zaptemp (const char *name)
2557     free (node);
2558     }
2559    
2560     -#if HAVE_NL_LANGINFO
2561     +#if HAVE_LANGINFO_CODESET
2562    
2563     static int
2564     struct_month_cmp (const void *m1, const void *m2)
2565     @@ -562,7 +656,7 @@ inittables (void)
2566     fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i);
2567     }
2568    
2569     -#if HAVE_NL_LANGINFO
2570     +#if HAVE_LANGINFO_CODESET
2571     /* If we're not in the "C" locale, read different names for months. */
2572     if (hard_LC_TIME)
2573     {
2574     @@ -588,6 +682,71 @@ inittables (void)
2575     #endif
2576     }
2577    
2578     +#if HAVE_MBRTOWC
2579     +static void
2580     +inittables_mb (void)
2581     +{
2582     + int i, j, k, l;
2583     + char *name, *s;
2584     + size_t s_len, mblength;
2585     + char mbc[MB_LEN_MAX];
2586     + wchar_t wc, pwc;
2587     + mbstate_t state_mb, state_wc;
2588     + /* Fill monthtab with the locale's abbreviated month names, leading blanks skipped and every character upper-cased, then sort the table by name. */
2589     + for (i = 0; i < MONTHS_PER_YEAR; i++)
2590     + {
2591     + s = (char *) nl_langinfo (ABMON_1 + i);
2592     + s_len = strlen (s);
2593     + monthtab[i].name = name = (char *) xmalloc (s_len + 1);
2594     + monthtab[i].val = i + 1;
2595     +
2596     + memset (&state_mb, '\0', sizeof (mbstate_t));
2597     + memset (&state_wc, '\0', sizeof (mbstate_t));
2598     +
2599     + for (j = 0; j < s_len;)
2600     + {
2601     + if (!ismbblank (s + j, &mblength))
2602     + break;
2603     + j += mblength;
2604     + }
2605     +
2606     + for (k = 0; j < s_len;)
2607     + {
2608     + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2609     + /* If conversion fails, free the names allocated so far and fall back to the single-byte tables. */
2610     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
2611     + {
2612     + for (l = 0; l <= i; l++)
2613     + free ((void *)monthtab[l].name);
2614     + inittables();
2615     + return;
2616     + }
2617     + else if (mblength == 0)
2618     + break;
2619     + /* Copy the bytes unchanged when the character is already upper case; otherwise re-encode its upper-case form. */
2620     + pwc = towupper (wc);
2621     + if (pwc == wc)
2622     + {
2623     + memcpy (mbc, s + j, mblength);
2624     + j += mblength;
2625     + }
2626     + else
2627     + {
2628     + j += mblength;
2629     + mblength = wcrtomb (mbc, pwc, &state_wc);
2630     + assert (mblength != (size_t)0 && mblength != (size_t)-1);
2631     + }
2632     +
2633     + for (l = 0; l < mblength; l++)
2634     + name[k++] = mbc[l];
2635     + }
2636     + name[k] = '\0';
2637     + }
2638     + qsort ((void *) monthtab, MONTHS_PER_YEAR,
2639     + sizeof *monthtab, struct_month_cmp);
2640     +}
2641     +#endif
2642     +
2643     /* Specify the amount of main memory to use when sorting. */
2644     static void
2645     specify_sort_size (char const *s)
2646     @@ -798,7 +957,7 @@ buffer_linelim (struct buffer const *buf
2647     by KEY in LINE. */
2648    
2649     static char *
2650     -begfield (const struct line *line, const struct keyfield *key)
2651     +begfield_uni (const struct line *line, const struct keyfield *key)
2652     {
2653     register char *ptr = line->text, *lim = ptr + line->length - 1;
2654     register size_t sword = key->sword;
2655     @@ -808,10 +967,10 @@ begfield (const struct line *line, const
2656     /* The leading field separator itself is included in a field when -t
2657     is absent. */
2658    
2659     - if (tab != TAB_DEFAULT)
2660     + if (!tab_default)
2661     while (ptr < lim && sword--)
2662     {
2663     - while (ptr < lim && *ptr != tab)
2664     + while (ptr < lim && *ptr != tab[0])
2665     ++ptr;
2666     if (ptr < lim)
2667     ++ptr;
2668     @@ -839,11 +998,70 @@ begfield (const struct line *line, const
2669     return ptr;
2670     }
2671    
2672     +#if HAVE_MBRTOWC
2673     +static char *
2674     +begfield_mb (const struct line *line, const struct keyfield *key)
2675     +{
2676     + int i;
2677     + char *ptr = line->text, *lim = ptr + line->length - 1;
2678     + size_t sword = key->sword;
2679     + size_t schar = key->schar;
2680     + size_t mblength;
2681     + mbstate_t state;
2682     +
2683     + memset (&state, '\0', sizeof(mbstate_t));
2684     +
2685     + if (!tab_default)
2686     + while (ptr < lim && sword--)
2687     + {
2688     + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2689     + {
2690     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2691     + ptr += mblength;
2692     + }
2693     + if (ptr < lim)
2694     + {
2695     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2696     + ptr += mblength;
2697     + }
2698     + }
2699     + else
2700     + while (ptr < lim && sword--)
2701     + {
2702     + while (ptr < lim && ismbblank (ptr, &mblength))
2703     + ptr += mblength;
2704     + if (ptr < lim)
2705     + {
2706     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2707     + ptr += mblength;
2708     + }
2709     + while (ptr < lim && !ismbblank (ptr, &mblength))
2710     + ptr += mblength;
2711     + }
2712     +
2713     + if (key->skipsblanks)
2714     + while (ptr < lim && ismbblank (ptr, &mblength))
2715     + ptr += mblength;
2716     +
2717     + for (i = 0; i < schar; i++)
2718     + {
2719     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2720     +
2721     + if (ptr + mblength > lim)
2722     + break;
2723     + else
2724     + ptr += mblength;
2725     + }
2726     +
2727     + return ptr;
2728     +}
2729     +#endif
2730     +
2731     /* Return the limit of (a pointer to the first character after) the field
2732     in LINE specified by KEY. */
2733    
2734     static char *
2735     -limfield (const struct line *line, const struct keyfield *key)
2736     +limfield_uni (const struct line *line, const struct keyfield *key)
2737     {
2738     register char *ptr = line->text, *lim = ptr + line->length - 1;
2739     register size_t eword = key->eword, echar = key->echar;
2740     @@ -856,10 +1074,10 @@ limfield (const struct line *line, const
2741     `beginning' is the first character following the delimiting TAB.
2742     Otherwise, leave PTR pointing at the first `blank' character after
2743     the preceding field. */
2744     - if (tab != TAB_DEFAULT)
2745     + if (!tab_default)
2746     while (ptr < lim && eword--)
2747     {
2748     - while (ptr < lim && *ptr != tab)
2749     + while (ptr < lim && *ptr != tab[0])
2750     ++ptr;
2751     if (ptr < lim && (eword | echar))
2752     ++ptr;
2753     @@ -905,7 +1123,7 @@ limfield (const struct line *line, const
2754     */
2755    
2756     /* Make LIM point to the end of (one byte past) the current field. */
2757     - if (tab != TAB_DEFAULT)
2758     + if (!tab_default)
2759     {
2760     char *newlim;
2761     newlim = memchr (ptr, tab, lim - ptr);
2762     @@ -941,6 +1159,107 @@ limfield (const struct line *line, const
2763     return ptr;
2764     }
2765    
2766     +#if HAVE_MBRTOWC
2767     +static char *
2768     +limfield_mb (const struct line *line, const struct keyfield *key)
2769     +{
2770     + char *ptr = line->text, *lim = ptr + line->length - 1;
2771     + size_t eword = key->eword, echar = key->echar;
2772     + int i;
2773     + size_t mblength;
2774     + mbstate_t state;
2775     + /* Multibyte variant of limfield_uni: return the limit of (a pointer to the first character after) the field in LINE specified by KEY. */
2776     + memset (&state, '\0', sizeof(mbstate_t));
2777     +
2778     + if (!tab_default)
2779     + while (ptr < lim && eword--)
2780     + {
2781     + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2782     + {
2783     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2784     + ptr += mblength;
2785     + }
2786     + if (ptr < lim && (eword | echar)) /* as in limfield_uni: leave PTR on the separator that ends the last field */
2787     + {
2788     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2789     + ptr += mblength;
2790     + }
2791     + }
2792     + else
2793     + while (ptr < lim && eword--)
2794     + {
2795     + while (ptr < lim && ismbblank (ptr, &mblength))
2796     + ptr += mblength;
2797     + if (ptr < lim)
2798     + {
2799     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2800     + ptr += mblength;
2801     + }
2802     + while (ptr < lim && !ismbblank (ptr, &mblength))
2803     + ptr += mblength;
2804     + }
2805     +
2806     +# ifdef POSIX_UNSPECIFIED
2807     +
2808     + /* Make LIM point to the end of (one byte past) the current field. */
2809     + if (!tab_default)
2810     + {
2811     + char *p;
2812     +
2813     + /* Scan forward from PTR and stop LIM at the first separator. */
2814     + for (p = ptr; p < lim;)
2815     + {
2816     + if (memcmp (p, tab, tab_length) == 0)
2817     + {
2818     + lim = p;
2819     + break;
2820     + }
2821     +
2822     + GET_BYTELEN_OF_CHAR (lim, p, mblength, state);
2823     + p += mblength;
2824     + }
2825     + }
2826     + else
2827     + {
2828     + char *newlim;
2829     + newlim = ptr;
2830     +
2831     + while (newlim < lim && ismbblank (newlim, &mblength))
2832     + newlim += mblength;
2833     + if (newlim < lim)
2834     + {
2835     + GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
2836     + newlim += mblength;
2837     + }
2838     + while (newlim < lim && !ismbblank (newlim, &mblength))
2839     + newlim += mblength;
2840     + lim = newlim;
2841     + }
2842     +# endif
2843     +
2844     + /* If we're skipping leading blanks, don't start counting characters
2845     + until after skipping past any leading blanks. */
2846     + if (key->skipeblanks)
2847     + while (ptr < lim && ismbblank (ptr, &mblength))
2848     + ptr += mblength;
2849     +
2850     + memset (&state, '\0', sizeof(mbstate_t));
2851     +
2852     + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2853     + for (i = 0; i < echar; i++)
2854     + {
2855     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2856     +
2857     + if (ptr + mblength > lim)
2858     + break;
2859     + else
2860     + ptr += mblength;
2861     + }
2862     +
2863     + return ptr;
2864     +}
2865     +#endif
2866     +
2867     /* Fill BUF reading from FP, moving buf->left bytes from the end
2868     of buf->buf to the beginning first. If EOF is reached and the
2869     file wasn't terminated by a newline, supply one. Set up BUF's line
2870     @@ -1023,8 +1342,22 @@ fillbuf (struct buffer *buf, register FI
2871     else
2872     {
2873     if (key->skipsblanks)
2874     - while (blanks[to_uchar (*line_start)])
2875     - line_start++;
2876     + {
2877     +#if HAVE_MBRTOWC
2878     + if (MB_CUR_MAX > 1)
2879     + {
2880     + size_t mblength;
2881     +
2882     + while (ismbblank (line_start, &mblength))
2883     + line_start += mblength;
2884     + }
2885     + else
2886     +#endif
2887     + {
2888     + while (blanks[to_uchar (*line_start)])
2889     + line_start++;
2890     + }
2891     + }
2892     line->keybeg = line_start;
2893     }
2894     }
2895     @@ -1130,10 +1463,27 @@ numcompare (register const char *a, regi
2896     size_t log_a;
2897     size_t log_b;
2898    
2899     - while (blanks[to_uchar (tmpa = *a)])
2900     - a++;
2901     - while (blanks[to_uchar (tmpb = *b)])
2902     - b++;
2903     +#if HAVE_MBRTOWC
2904     + if (MB_CUR_MAX > 1)
2905     + {
2906     + size_t mblength;
2907     +
2908     + while (ismbblank (a, &mblength))
2909     + a += mblength;
2910     + while (ismbblank (b, &mblength))
2911     + b += mblength;
2912     +
2913     + tmpa = *a;
2914     + tmpb = *b;
2915     + }
2916     + else
2917     +#endif
2918     + {
2919     + while (blanks[to_uchar (tmpa = *a)])
2920     + a++;
2921     + while (blanks[to_uchar (tmpb = *b)])
2922     + b++;
2923     + }
2924    
2925     if (tmpa == NEGATION_SIGN)
2926     {
2927     @@ -1263,15 +1613,59 @@ general_numcompare (const char *sa, cons
2928     /* FIXME: maybe add option to try expensive FP conversion
2929     only if A and B can't be compared more cheaply/accurately. */
2930    
2931     - char *ea;
2932     - char *eb;
2933     - double a = strtod (sa, &ea);
2934     - double b = strtod (sb, &eb);
2935     + char *bufa, *ea;
2936     + char *bufb, *eb;
2937     + double a;
2938     + double b;
2939     +
2940     + char *p;
2941     + struct lconv *lconvp = localeconv ();
2942     + size_t thousands_sep_len = strlen (lconvp->thousands_sep);
2943     +
2944     + bufa = (char *) xmalloc (strlen (sa) + 1);
2945     + bufb = (char *) xmalloc (strlen (sb) + 1);
2946     + strcpy (bufa, sa);
2947     + strcpy (bufb, sb);
2948     +
2949     + if (force_general_numcompare)
2950     + {
2951     + while (1)
2952     + {
2953     + a = strtod (bufa, &ea);
2954     + if (memcmp (ea, lconvp->thousands_sep, thousands_sep_len) == 0)
2955     + {
2956     + for (p = ea; *(p + thousands_sep_len) != '\0'; p++)
2957     + *p = *(p + thousands_sep_len);
2958     + *p = '\0';
2959     + continue;
2960     + }
2961     + break;
2962     + }
2963     +
2964     + while (1)
2965     + {
2966     + b = strtod (bufb, &eb);
2967     + if (memcmp (eb, lconvp->thousands_sep, thousands_sep_len) == 0)
2968     + {
2969     + for (p = eb; *(p + thousands_sep_len) != '\0'; p++)
2970     + *p = *(p + thousands_sep_len);
2971     + *p = '\0';
2972     + continue;
2973     + }
2974     + break;
2975     + }
2976     + }
2977     + else
2978     + {
2979     + a = strtod (bufa, &ea);
2980     + b = strtod (bufb, &eb);
2981     + }
2982     +
2983    
2984     /* Put conversion errors at the start of the collating sequence. */
2985     - if (sa == ea)
2986     - return sb == eb ? 0 : -1;
2987     - if (sb == eb)
2988     + if (bufa == ea)
2989     + return bufb == eb ? 0 : -1;
2990     + if (bufb == eb)
2991     return 1;
2992    
2993     /* Sort numbers in the usual way, where -0 == +0. Put NaNs after
2994     @@ -1289,7 +1683,7 @@ general_numcompare (const char *sa, cons
2995     Return 0 if the name in S is not recognized. */
2996    
2997     static int
2998     -getmonth (char const *month, size_t len)
2999     +getmonth_uni (char const *month, size_t len)
3000     {
3001     size_t lo = 0;
3002     size_t hi = MONTHS_PER_YEAR;
3003     @@ -1331,11 +1725,79 @@ getmonth (char const *month, size_t len)
3004     return 0;
3005     }
3006    
3007     +#if HAVE_MBRTOWC
3008     +static int
3009     +getmonth_mb (char const *s, size_t len)
3010     +{
3011     + char *month;
3012     + register size_t i;
3013     + register int lo = 0, hi = MONTHS_PER_YEAR, result;
3014     + char *tmp;
3015     + size_t wclength, mblength;
3016     + const char **pp;
3017     + const wchar_t **wpp;
3018     + wchar_t *month_wcs;
3019     + mbstate_t state;
3020     + /* Skip leading blanks in the LEN bytes at S, upper-case the text up to the next blank, and return the val of the monthtab entry whose name is a prefix of it, or 0 if there is none. */
3021     + while (len > 0 && ismbblank (s, &mblength))
3022     + {
3023     + s += mblength;
3024     + len -= mblength;
3025     + }
3026     +
3027     + if (len == 0)
3028     + return 0;
3029     +
3030     + month = (char *) alloca (len + 1);
3031     +
3032     + tmp = (char *) alloca (len + 1);
3033     + memcpy (tmp, s, len);
3034     + tmp[len] = '\0';
3035     + pp = (const char **)&tmp;
3036     + month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
3037     + memset (&state, '\0', sizeof(mbstate_t));
3038     +
3039     + wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
3040     + assert (wclength != (size_t)-1 && *pp == NULL); /* (size_t)-1 is the error value; a length of 1 is a valid one-character key */
3041     +
3042     + for (i = 0; i < wclength; i++)
3043     + {
3044     + month_wcs[i] = towupper(month_wcs[i]);
3045     + if (iswctype (month_wcs[i], blank_type))
3046     + {
3047     + month_wcs[i] = L'\0';
3048     + break;
3049     + }
3050     + }
3051     +
3052     + wpp = (const wchar_t **)&month_wcs;
3053     +
3054     + mblength = wcsrtombs (month, wpp, len + 1, &state);
3055     + assert (mblength != (size_t)-1 && *wpp == NULL);
3056     +
3057     + do
3058     + {
3059     + int ix = (lo + hi) / 2;
3060     +
3061     + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3062     + hi = ix;
3063     + else
3064     + lo = ix;
3065     + }
3066     + while (hi - lo > 1);
3067     +
3068     + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3069     + ? monthtab[lo].val : 0);
3070     +
3071     + return result;
3072     +}
3073     +#endif
3074     +
3075     /* Compare two lines A and B trying every key in sequence until there
3076     are no more keys or a difference is found. */
3077    
3078     static int
3079     -keycompare (const struct line *a, const struct line *b)
3080     +keycompare_uni (const struct line *a, const struct line *b)
3081     {
3082     struct keyfield const *key = keylist;
3083    
3084     @@ -1499,11 +1961,188 @@ keycompare (const struct line *a, const
3085    
3086     return 0;
3087    
3088     - greater:
3089     +greater:
3090     + diff = 1;
3091     +not_equal:
3092     + return key->reverse ? -diff : diff;
3093     +}
3094     +
3095     +#if HAVE_MBRTOWC /* keycompare_mb is the keycompare that main installs when MB_CUR_MAX > 1.  */
3096     +static int
3097     +keycompare_mb (const struct line *a, const struct line *b)
3098     +{
3099     + struct keyfield *key = keylist;
3100     +
3101     + /* For the first iteration only, the key positions have been
3102     + precomputed for us. */
3103     + char *texta = a->keybeg;
3104     + char *textb = b->keybeg;
3105     + char *lima = a->keylim;
3106     + char *limb = b->keylim;
3107     +
3108     + size_t mblength_a, mblength_b;
3109     + wchar_t wc_a, wc_b;
3110     + mbstate_t state_a, state_b;
3111     +
3112     + int diff;
3113     +
3114     + memset (&state_a, '\0', sizeof(mbstate_t));
3115     + memset (&state_b, '\0', sizeof(mbstate_t));
3116     + /* Try each key in turn until one yields a nonzero DIFF or the key list ends.  */
3117     + for (;;)
3118     + {
3119     + register char const *translate = key->translate;
3120     + register bool const *ignore = key->ignore;
3121     +
3122     + /* Find the lengths. */
3123     + size_t lena = lima <= texta ? 0 : lima - texta;
3124     + size_t lenb = limb <= textb ? 0 : limb - textb;
3125     +
3126     + /* Actually compare the fields. */
3127     + if (key->numeric | key->general_numeric)
3128     + {
3129     + char savea = *lima, saveb = *limb;
3130     + /* Temporarily NUL-terminate both fields for the numeric comparison.  */
3131     + *lima = *limb = '\0';
3132     + if (force_general_numcompare)
3133     + diff = general_numcompare (texta, textb);
3134     + else
3135     + diff = ((key->numeric ? numcompare : general_numcompare)
3136     + (texta, textb));
3137     + *lima = savea, *limb = saveb;
3138     + }
3139     + else if (key->month)
3140     + diff = getmonth (texta, lena) - getmonth (textb, lenb);
3141     + else
3142     + {
3143     + if (ignore || translate)
3144     + {
3145     + char buf[4000];
3146     + size_t size = lena + 1 + lenb + 1;
3147     + char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
3148     + char *copy_b = copy_a + lena + 1;
3149     + size_t new_len_a, new_len_b;
3150     + size_t i, j;
3151     +
3152     + /* Ignore and/or translate chars before comparing. */
3153     +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3154     + do \
3155     + { \
3156     + wchar_t uwc; \
3157     + char mbc[MB_LEN_MAX]; \
3158     + mbstate_t state_wc; \
3159     + \
3160     + for (NEW_LEN = i = 0; i < LEN;) \
3161     + { \
3162     + mbstate_t state_bak; \
3163     + \
3164     + state_bak = STATE; \
3165     + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3166     + \
3167     + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3168     + || MBLENGTH == 0) \
3169     + { \
3170     + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3171     + STATE = state_bak; \
3172     + if (!ignore) \
3173     + COPY[NEW_LEN++] = TEXT[i]; \
3174     + i++; continue; /* always consume the byte, or the loop never ends */ \
3175     + } \
3176     + /* Skip characters excluded by the nonprinting / nondictionary tables. */ \
3177     + if (ignore) \
3178     + { \
3179     + if ((ignore == nonprinting && !iswprint (WC)) \
3180     + || (ignore == nondictionary \
3181     + && !iswalnum (WC) && !iswctype (WC, blank_type))) \
3182     + { \
3183     + i += MBLENGTH; \
3184     + continue; \
3185     + } \
3186     + } \
3187     + \
3188     + if (translate) \
3189     + { \
3190     + /* WC is a wchar_t, so the wide-character mapping is required. */ \
3191     + uwc = towupper(WC); \
3192     + if (WC == uwc) \
3193     + { \
3194     + memcpy (mbc, TEXT + i, MBLENGTH); \
3195     + i += MBLENGTH; \
3196     + } \
3197     + else \
3198     + { \
3199     + i += MBLENGTH; \
3200     + WC = uwc; \
3201     + memset (&state_wc, '\0', sizeof (mbstate_t)); \
3202     + /* NOTE(review): COPY is sized from LEN; confirm the upper-case form never needs more bytes. */ \
3203     + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3204     + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3205     + } \
3206     + \
3207     + for (j = 0; j < MBLENGTH; j++) \
3208     + COPY[NEW_LEN++] = mbc[j]; \
3209     + } \
3210     + else \
3211     + for (j = 0; j < MBLENGTH; j++) \
3212     + COPY[NEW_LEN++] = TEXT[i++]; \
3213     + } \
3214     + COPY[NEW_LEN] = '\0'; \
3215     + } \
3216     + while (0)
3217     +
3218     + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3219     + wc_a, mblength_a, state_a);
3220     + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3221     + wc_b, mblength_b, state_b);
3222     + diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
3223     + /* Free the scratch copy only if it did not fit in BUF.  */
3224     + if (sizeof buf < size)
3225     + free (copy_a);
3226     + }
3227     + else if (lena == 0)
3228     + diff = - NONZERO (lenb);
3229     + else if (lenb == 0)
3230     + goto greater;
3231     + else
3232     + diff = xmemcoll (texta, lena, textb, lenb);
3233     + }
3234     +
3235     + if (diff)
3236     + goto not_equal;
3237     +
3238     + key = key->next;
3239     + if (! key)
3240     + break;
3241     +
3242     + /* Find the beginning and limit of the next field. */
3243     + if (key->eword != SIZE_MAX)
3244     + lima = limfield (a, key), limb = limfield (b, key);
3245     + else
3246     + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3247     +
3248     + if (key->sword != SIZE_MAX)
3249     + texta = begfield (a, key), textb = begfield (b, key);
3250     + else
3251     + {
3252     + texta = a->text, textb = b->text;
3253     + if (key->skipsblanks)
3254     + {
3255     + while (texta < lima && ismbblank (texta, &mblength_a))
3256     + texta += mblength_a;
3257     + while (textb < limb && ismbblank (textb, &mblength_b))
3258     + textb += mblength_b;
3259     + }
3260     + }
3261     + }
3262     +
3263     + return 0;
3264     +
3265     +greater:
3266     diff = 1;
3267     - not_equal:
3268     +not_equal:
3269     return key->reverse ? -diff : diff;
3270     }
3271     +#endif
3272    
3273     /* Compare two lines A and B, returning negative, zero, or positive
3274     depending on whether A compares less than, equal to, or greater than B. */
3275     @@ -2243,6 +2882,11 @@ set_ordering (register const char *s, st
3276     break;
3277     case 'M':
3278     key->month = true;
3279     +#if HAVE_MBRTOWC
3280     + if (strcmp (setlocale (LC_CTYPE, NULL), setlocale (LC_TIME, NULL)))
3281     + error (0, 0, _("As LC_TIME differs from LC_CTYPE, the results may be strange."));
3282     + inittables_mb ();
3283     +#endif
3284     break;
3285     case 'n':
3286     key->numeric = true;
3287     @@ -2296,7 +2940,7 @@ main (int argc, char **argv)
3288     atexit (close_stdout);
3289    
3290     hard_LC_COLLATE = hard_locale (LC_COLLATE);
3291     -#if HAVE_NL_LANGINFO
3292     +#if HAVE_LANGINFO_CODESET
3293     hard_LC_TIME = hard_locale (LC_TIME);
3294     #endif
3295    
3296     @@ -2309,14 +2953,40 @@ main (int argc, char **argv)
3297     add support for multibyte decimal points. */
3298     decimal_point = locale->decimal_point[0];
3299     if (! decimal_point || locale->decimal_point[1])
3300     - decimal_point = '.';
3301     + {
3302     + decimal_point = '.';
3303     + if (locale->decimal_point[0] && locale->decimal_point[1])
3304     + force_general_numcompare = 1;
3305     + }
3306    
3307     /* FIXME: add support for multibyte thousands separators. */
3308     thousands_sep = *locale->thousands_sep;
3309     if (! thousands_sep || locale->thousands_sep[1])
3310     - thousands_sep = CHAR_MAX + 1;
3311     + {
3312     + thousands_sep = CHAR_MAX + 1;
3313     + if (locale->thousands_sep[0] && locale->thousands_sep[1])
3314     + force_general_numcompare = 1;
3315     + }
3316     }
3317    
3318     +#if HAVE_MBRTOWC
3319     + if (MB_CUR_MAX > 1)
3320     + {
3321     + blank_type = wctype ("blank");
3322     + begfield = begfield_mb;
3323     + limfield = limfield_mb;
3324     + getmonth = getmonth_mb;
3325     + keycompare = keycompare_mb;
3326     + }
3327     + else
3328     +#endif
3329     + {
3330     + begfield = begfield_uni;
3331     + limfield = limfield_uni;
3332     + keycompare = keycompare_uni;
3333     + getmonth = getmonth_uni;
3334     + }
3335     +
3336     have_read_stdin = false;
3337     inittables ();
3338    
3339     @@ -2514,28 +3184,48 @@ main (int argc, char **argv)
3340     break;
3341    
3342     case 't':
3343     - {
3344     - char newtab = optarg[0];
3345     - if (! newtab)
3346     + {
3347     + if (! optarg[0])
3348     error (SORT_FAILURE, 0, _("empty tab"));
3349     +
3350     + strncpy (tab, optarg, MB_LEN_MAX);
3351     +
3352     if (optarg[1])
3353     {
3354     - if (STREQ (optarg, "\\0"))
3355     - newtab = '\0';
3356     + if (strcmp (optarg, "\\0") == 0)
3357     + tab[0] = '\0';
3358     else
3359     {
3360     - /* Provoke with `sort -txx'. Complain about
3361     - "multi-character tab" instead of "multibyte tab", so
3362     - that the diagnostic's wording does not need to be
3363     - changed once multibyte characters are supported. */
3364     - error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
3365     - optarg);
3366     +#if HAVE_MBRTOWC
3367     + if (MB_CUR_MAX > 1)
3368     + {
3369     + wchar_t wc;
3370     + mbstate_t state;
3371     +
3372     + memset (&state, '\0', sizeof (mbstate_t));
3373     + tab_length = mbrtowc (&wc, tab, MB_LEN_MAX, &state);
3374     + tab_length = (tab_length == (size_t)-1
3375     + || tab_length == (size_t)-2
3376     + || tab_length == 0) ? 1 : tab_length;
3377     + if (optarg[tab_length])
3378     + {
3379     + error (SORT_FAILURE, 0,
3380     + _("multi-character tab `%s'"), optarg);
3381     + }
3382     + }
3383     + else
3384     +#endif
3385     + {
3386     + error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
3387     + optarg);
3388     + }
3389     }
3390     }
3391     - if (tab != TAB_DEFAULT && tab != newtab)
3392     + if (!tab_default)
3393     error (SORT_FAILURE, 0, _("incompatible tabs"));
3394     - tab = newtab;
3395     + tab_default = false;
3396     }
3397     +
3398     break;
3399    
3400     case 'T':
3401     --- coreutils/src/unexpand.c
3402     +++ coreutils/src/unexpand.c
3403     @@ -39,12 +39,35 @@
3404     #include <stdio.h>
3405     #include <getopt.h>
3406     #include <sys/types.h>
3407     +
3408     +/* Get mbstate_t, mbrtowc(), wcwidth() */
3409     +#if HAVE_WCHAR_H
3410     +# include <wchar.h>
3411     +#endif
3412     +/* Get iswblank */
3413     +#if HAVE_WCTYPE_H
3414     +# include <wctype.h>
3415     +#endif
3416     +
3417     +
3418     +/* A sentinel value that's placed at the end of the list of tab stops.
3419     + * This value must be a large number, but not so large that adding the
3420     + * length of a line to it would cause the column variable to overflow. */
3421     +#define TAB_STOP_SENTINEL INT_MAX
3422     +
3423     #include "system.h"
3424     #include "error.h"
3425     #include "posixver.h"
3426     #include "quote.h"
3427     #include "xstrndup.h"
3428    
3429     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3430     + installation; work around this configuration error. */
3431     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3432     +# undef MB_LEN_MAX
3433     +# define MB_LEN_MAX 16
3434     +#endif
3435     +
3436     /* The official name of this program (e.g., no `g' prefix). */
3437     #define PROGRAM_NAME "unexpand"
3438    
3439     @@ -460,6 +483,236 @@ unexpand (void)
3440     }
3441     }
3442    
3443     +#if HAVE_MBRTOWC && HAVE_WCTYPE_H /* NOTE(review): the call in main is guarded by HAVE_MBRTOWC alone.  */
3444     +static void
3445     +unexpand_multibyte (void)
3446     +{
3447     + /* Input stream. */
3448     + FILE *fp = next_file (NULL);
3449     + /* Multibyte variant of unexpand (), called from main when MB_CUR_MAX > 1.  */
3450     + mbstate_t i_state; /* Current shift state of the input stream. */
3451     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3452     + char *bufpos; /* Next read position of BUF. */
3453     + size_t buflen = 0; /* The length of the byte sequence in buf. */
3454     +
3455     + /* The array of pending blanks. In non-POSIX locales, blanks can
3456     + include characters other than spaces, so the blanks must be
3457     + stored, not merely counted. */
3458     + char *pending_blank;
3459     +
3460     + if (!fp)
3461     + return;
3462     +
3463     + /* Binary I/O will preserve the original EOL style (DOS/Unix) of files. */
3464     + SET_BINARY2 (fileno (fp), STDOUT_FILENO);
3465     +
3466     + /* The worst case is a non-blank character, then one blank, then a
3467     + tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
3468     + allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
3469     + pending_blank = xmalloc (max_column_width);
3470     +
3471     + memset (&i_state, '\0', sizeof(mbstate_t));
3472     +
3473     + for (;;)
3474     + {
3475     + /* The wide character most recently read; initialized because the loop test below can run before any read. */
3476     + wint_t wc = L'\0';
3477     +
3478     + /* If true, perform translations. */
3479     + bool convert = true;
3480     +
3481     + /* The following variables have valid values only when CONVERT
3482     + is true: */
3483     +
3484     + /* Column of next input character. */
3485     + uintmax_t column = 0;
3486     +
3487     + /* Column the next input tab stop is on. */
3488     + uintmax_t next_tab_column = 0;
3489     +
3490     + /* Index in TAB_LIST of next tab stop to examine. */
3491     + size_t tab_index = 0;
3492     +
3493     + /* If true, the first pending blank came just before a tab stop. */
3494     + bool one_blank_before_tab_stop = false;
3495     +
3496     + /* If true, the previous input character was a blank. This is
3497     + initially true, since initial strings of blanks are treated
3498     + as if the line was preceded by a blank. */
3499     + bool prev_blank = true;
3500     +
3501     + /* Number of pending columns of blanks. */
3502     + size_t pending = 0;
3503     +
3504     + /* Convert a line of text. */
3505     + do
3506     + {
3507     + wchar_t w;
3508     + size_t mblength; /* The byte size of a multibyte character
3509     + which shows as same character as WC. */
3510     + mbstate_t i_state_bak; /* Back up the I_STATE. */
3511     +
3512     + /* Refill BUF once fewer than MB_LEN_MAX bytes remain, moving the unread tail to the front. */
3513     + if (buflen < MB_LEN_MAX)
3514     + {
3515     + if (!feof(fp) && !ferror(fp)) {
3516     + if (buflen > 0) memmove(buf, bufpos, buflen);
3517     + buflen += fread(buf + buflen, sizeof(char), BUFSIZ, fp);
3518     + bufpos = buf;
3519     + }
3520     + }
3521     +
3522     + if (buflen < 1) {
3523     + /* Move to the next file */
3524     + if (feof(fp) || ferror(fp)) {
3525     + fp = next_file(fp);
3526     + }
3527     + if (!fp) {
3528     + if (pending)
3529     + {
3530     + if (fwrite (pending_blank, 1, pending, stdout) != pending)
3531     + error (EXIT_FAILURE, errno, _("write error"));
3532     + }
3533     + free (pending_blank);
3534     + return;
3535     + }
3536     + SET_BINARY2 (fileno (fp), STDOUT_FILENO);
3537     + continue;
3538     + }
3539     +
3540     + i_state_bak = i_state;
3541     + mblength = mbrtowc (&w, bufpos, buflen, &i_state);
3542     + wc = w;
3543     + /* An invalid or incomplete sequence is passed through as one byte occupying one column. */
3544     + if (mblength == (size_t) -1 || mblength == (size_t) -2) {
3545     + i_state = i_state_bak;
3546     + wc = L'\0';
3547     + column += convert;
3548     + mblength = 1;
3549     + }
3550     +
3551     + if (convert)
3552     + {
3553     + bool blank = iswblank (wc);
3554     +
3555     + if (blank)
3556     + {
3557     + if (next_tab_column <= column)
3558     + {
3559     + if (tab_size)
3560     + next_tab_column =
3561     + column + (tab_size - column % tab_size);
3562     + else
3563     + for (;;)
3564     + if (tab_index == first_free_tab)
3565     + {
3566     + convert = false;
3567     + break;
3568     + }
3569     + else
3570     + {
3571     + uintmax_t tab = tab_list[tab_index++];
3572     + if (column < tab)
3573     + {
3574     + next_tab_column = tab;
3575     + break;
3576     + }
3577     + }
3578     + }
3579     +
3580     + if (convert)
3581     + {
3582     + if (next_tab_column < column)
3583     + error (EXIT_FAILURE, 0, _("input line is too long"));
3584     +
3585     + if (wc == L'\t')
3586     + {
3587     + column = next_tab_column;
3588     +
3589     + /* Discard pending blanks, unless it was a single
3590     + blank just before the previous tab stop. */
3591     + if (! (pending == 1 && one_blank_before_tab_stop))
3592     + {
3593     + pending = 0;
3594     + one_blank_before_tab_stop = false;
3595     + }
3596     + }
3597     + else
3598     + {
3599     + column++;
3600     +
3601     + if (! (prev_blank && column == next_tab_column))
3602     + {
3603     + /* It is not yet known whether the pending blanks
3604     + will be replaced by tabs. */
3605     + if (column == next_tab_column)
3606     + one_blank_before_tab_stop = true;
3607     + pending_blank[pending++] = ' ';
3608     + prev_blank = true;
3609     + buflen -= mblength;
3610     + bufpos += mblength;
3611     + continue;
3612     + }
3613     +
3614     + /* Replace the pending blanks by a tab or two. NOTE(review): only the first byte at BUFPOS is overwritten; confirm behaviour when the blank has MBLENGTH > 1. */
3615     + pending_blank[0] = *bufpos = '\t';
3616     + pending = one_blank_before_tab_stop;
3617     + }
3618     + }
3619     + }
3620     + else if (wc == L'\b')
3621     + {
3622     + /* Go back one column, and force recalculation of the
3623     + next tab stop. */
3624     + column -= !!column;
3625     + next_tab_column = column;
3626     + tab_index -= !!tab_index;
3627     + }
3628     + else
3629     + {
3630     + if (!iswcntrl (wc))
3631     + {
3632     + int width = wcwidth (wc);
3633     + if (width > 0) {
3634     + if (column > (column + width))
3635     + error (EXIT_FAILURE, 0, _("input line is too long"));
3636     + column += width;
3637     + }
3638     + }
3639     + }
3640     +
3641     + if (pending)
3642     + {
3643     + if (fwrite (pending_blank, 1, pending, stdout) != pending)
3644     + error (EXIT_FAILURE, errno, _("write error"));
3645     + pending = 0;
3646     + one_blank_before_tab_stop = false;
3647     + }
3648     +
3649     + prev_blank = blank;
3650     + convert &= convert_entire_line | blank;
3651     + }
3652     + /* Emit the character's bytes; mbrtowc returned 0 for a NUL, which is written explicitly. */
3653     + if (mblength)
3654     + {
3655     + if (fwrite (bufpos, sizeof(char), mblength, stdout) < mblength)
3656     + error (EXIT_FAILURE, errno, _("write error"));
3657     + }
3658     + else
3659     + {
3660     + if (putchar('\0'))
3661     + error (EXIT_FAILURE, errno, _("write error"));
3662     + mblength = 1;
3663     + }
3664     +
3665     + buflen -= mblength;
3666     + bufpos += mblength;
3667     + }
3668     + while (wc != L'\n');
3669     + }
3670     +}
3671     +#endif
3672     +
3673     int
3674     main (int argc, char **argv)
3675     {
3676     @@ -548,7 +801,12 @@ main (int argc, char **argv)
3677    
3678     file_list = (optind < argc ? &argv[optind] : stdin_argv);
3679    
3680     - unexpand ();
3681     +#if HAVE_MBRTOWC
3682     + if (MB_CUR_MAX > 1)
3683     + unexpand_multibyte ();
3684     + else
3685     +#endif
3686     + unexpand ();
3687    
3688     if (have_read_stdin && fclose (stdin) != 0)
3689     error (EXIT_FAILURE, errno, "-");
3690     --- coreutils/src/uniq.c
3691     +++ coreutils/src/uniq.c
3692     @@ -23,6 +23,16 @@
3693     #include <getopt.h>
3694     #include <sys/types.h>
3695    
3696     +/* Get mbstate_t, mbrtowc(), wcrtomb() */
3697     +#if HAVE_WCHAR_H
3698     +# include <wchar.h>
3699     +#endif
3700     +
3701     +/* Get iswctype(), wctype(), towupper(). */
3702     +#if HAVE_WCTYPE_H
3703     +# include <wctype.h>
3704     +#endif
3705     +
3706     #include "system.h"
3707     #include "argmatch.h"
3708     #include "linebuffer.h"
3709     @@ -34,6 +44,13 @@
3710     #include "xstrtol.h"
3711     #include "memcasecmp.h"
3712    
3713     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3714     + installation; work around this configuration error. */
3715     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3716     +# undef MB_LEN_MAX
3717     +# define MB_LEN_MAX 16
3718     +#endif
3719     +
3720     /* The official name of this program (e.g., no `g' prefix). */
3721     #define PROGRAM_NAME "uniq"
3722    
3723     @@ -109,6 +126,12 @@ static enum delimit_method const delimit
3724     /* Select whether/how to delimit groups of duplicate lines. */
3725     static enum delimit_method delimit_groups;
3726    
3727     +/* Function pointers. */
3728     +static char * (*find_field) (struct linebuffer *line);
3729     +
3730     +/* The "blank" wide-character class, as returned by wctype ("blank"). */
3731     +wctype_t blank_type;
3732     +
3733     static struct option const longopts[] =
3734     {
3735     {"count", no_argument, NULL, 'c'},
3736     @@ -189,7 +212,7 @@ size_opt (char const *opt, char const *m
3737     return a pointer to the beginning of the line's field to be compared. */
3738    
3739     static char *
3740     -find_field (const struct linebuffer *line)
3741     +find_field_uni (struct linebuffer *line)
3742     {
3743     register size_t count;
3744     register char *lp = line->buffer;
3745     @@ -210,6 +233,83 @@ find_field (const struct linebuffer *lin
3746     return lp + i;
3747     }
3748    
3749     +#if HAVE_MBRTOWC
3750     +/* Decode the multibyte character at LP + POS (at most SIZE - POS bytes) into WC and store its byte length in MBLENGTH.  On an invalid or incomplete sequence, restore *STATEP, make CONVFAIL nonzero and report a length of 1; a NUL character is also reported with length 1.  */
3751     +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3752     + do \
3753     + { \
3754     + mbstate_t state_bak; \
3755     + \
3756     + CONVFAIL = 0; \
3757     + state_bak = *STATEP; \
3758     + \
3759     + MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
3760     + \
3761     + switch (MBLENGTH) \
3762     + { \
3763     + case (size_t)-2: \
3764     + case (size_t)-1: \
3765     + *STATEP = state_bak; \
3766     + CONVFAIL++; \
3767     + /* Fall through: a failed conversion is also counted as one byte. */ \
3768     + case 0: \
3769     + MBLENGTH = 1; \
3770     + } \
3771     + } \
3772     + while (0)
3773     +/* Multibyte version of find_field_uni: return a pointer into LINE's buffer past the first skip_fields fields and the following skip_chars characters.  LINE->state is advanced by the decoding.  */
3774     +static char *
3775     +find_field_multi (struct linebuffer *line)
3776     +{
3777     + size_t count;
3778     + char *lp = line->buffer;
3779     + size_t size = line->length - 1;
3780     + size_t pos;
3781     + size_t mblength;
3782     + wchar_t wc;
3783     + mbstate_t *statep;
3784     + int convfail;
3785     +
3786     + pos = 0;
3787     + statep = &(line->state);
3788     +
3789     + /* Skip fields: each is a run of blanks followed by a run of non-blanks. */
3790     + for (count = 0; count < skip_fields && pos < size; count++)
3791     + {
3792     + while (pos < size)
3793     + {
3794     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3795     + /* An undecodable byte counts as a non-blank. */
3796     + if (convfail || !iswctype (wc, blank_type))
3797     + {
3798     + pos += mblength;
3799     + break;
3800     + }
3801     + pos += mblength;
3802     + }
3803     +
3804     + while (pos < size)
3805     + {
3806     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3807     +
3808     + if (!convfail && iswctype (wc, blank_type))
3809     + break;
3810     +
3811     + pos += mblength;
3812     + }
3813     + }
3814     +
3815     + /* Skip characters (not bytes). */
3816     + for (count = 0; count < skip_chars && pos < size; count++)
3817     + {
3818     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3819     + pos += mblength;
3820     + }
3821     +
3822     + return lp + pos;
3823     +}
3824     +#endif
3825     +
3826     /* Return false if two strings OLD and NEW match, true if not.
3827     OLD and NEW point not to the beginnings of the lines
3828     but rather to the beginnings of the fields to compare.
3829     @@ -234,6 +334,73 @@ different (char *old, char *new, size_t
3830     return oldlen != newlen || memcmp (old, new, oldlen);
3831     }
3832    
3833     +#if HAVE_MBRTOWC /* different_multi: multibyte counterpart of different (), used by check_file when MB_CUR_MAX > 1.  */
3834     +static int
3835     +different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3836     +{
3837     + size_t i, j, k, chars;
3838     + const char *str[2];
3839     + char *copy[2];
3840     + size_t len[2];
3841     + mbstate_t state[2];
3842     + size_t mblength;
3843     + wchar_t wc, uwc;
3844     + mbstate_t state_bak;
3845     + /* Returns the xmemcoll result for the first check_chars characters of OLD and NEW (nonzero means different).  */
3846     + str[0] = old;
3847     + str[1] = new;
3848     + len[0] = oldlen;
3849     + len[1] = newlen;
3850     + state[0] = oldstate;
3851     + state[1] = newstate;
3852     +
3853     + for (i = 0; i < 2; i++)
3854     + {
3855     + copy[i] = alloca (len[i] + 1);
3856     + /* J indexes the input and K the output; K never exceeds J, so COPY cannot overflow.  */
3857     + for (j = 0, k = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3858     + {
3859     + state_bak = state[i];
3860     + mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3861     +
3862     + switch (mblength)
3863     + {
3864     + case (size_t)-1:
3865     + case (size_t)-2:
3866     + state[i] = state_bak;
3867     + /* Fall through */
3868     + case 0:
3869     + mblength = 1;
3870     + copy[i][k++] = str[i][j]; /* keep the undecodable or NUL byte verbatim */
3871     + break;
3872     + default:
3873     + {
3874     + char mbc[MB_LEN_MAX]; /* receives the upper-case form, if one is used */
3875     + char const *src = str[i] + j;
3876     + size_t srclen = mblength;
3877     + if (ignore_case && (uwc = towupper (wc)) != wc)
3878     + {
3879     + mbstate_t state_wc;
3880     + size_t n;
3881     + memset (&state_wc, '\0', sizeof(mbstate_t));
3882     + n = wcrtomb (mbc, uwc, &state_wc);
3883     + if (n <= mblength) /* use it only if it fits; this also rejects (size_t)-1 */
3884     + src = mbc, srclen = n;
3885     + }
3886     + memcpy (copy[i] + k, src, srclen);
3887     + k += srclen;
3888     + }
3889     + }
3890     + j += mblength;
3891     + }
3892     + copy[i][k] = '\0';
3893     + len[i] = k;
3894     + }
3895     +
3896     + return xmemcoll (copy[0], len[0], copy[1], len[1]);
3897     +}
3898     +#endif
3899     +
3900     /* Output the line in linebuffer LINE to stream STREAM
3901     provided that the switches say it should be output.
3902     MATCH is true if the line matches the previous line.
3903     @@ -297,15 +464,42 @@ check_file (const char *infile, const ch
3904     {
3905     char *prevfield IF_LINT (= NULL);
3906     size_t prevlen IF_LINT (= 0);
3907     +#if HAVE_MBRTOWC
3908     + mbstate_t prevstate;
3909     +
3910     + memset (&prevstate, '\0', sizeof (mbstate_t));
3911     +#endif
3912    
3913     while (!feof (istream))
3914     {
3915     char *thisfield;
3916     size_t thislen;
3917     +#if HAVE_MBRTOWC
3918     + mbstate_t thisstate;
3919     +#endif
3920     if (readlinebuffer (thisline, istream) == 0)
3921     break;
3922     thisfield = find_field (thisline);
3923     thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3924     +#if HAVE_MBRTOWC
3925     + if (MB_CUR_MAX > 1)
3926     + {
3927     + thisstate = thisline->state;
3928     +
3929     + if (prevline->length == 0 || different_multi
3930     + (thisfield, prevfield, thislen, prevlen, thisstate, prevstate))
3931     + {
3932     + fwrite (thisline->buffer, sizeof (char),
3933     + thisline->length, ostream);
3934     +
3935     + SWAP_LINES (prevline, thisline);
3936     + prevfield = thisfield;
3937     + prevlen = thislen;
3938     + prevstate = thisstate;
3939     + }
3940     + }
3941     + else
3942     +#endif
3943     if (prevline->length == 0
3944     || different (thisfield, prevfield, thislen, prevlen))
3945     {
3946     @@ -324,17 +518,26 @@ check_file (const char *infile, const ch
3947     size_t prevlen;
3948     uintmax_t match_count = 0;
3949     bool first_delimiter = true;
3950     +#if HAVE_MBRTOWC
3951     + mbstate_t prevstate;
3952     +#endif
3953    
3954     if (readlinebuffer (prevline, istream) == 0)
3955     goto closefiles;
3956     prevfield = find_field (prevline);
3957     prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
3958     +#if HAVE_MBRTOWC
3959     + prevstate = prevline->state;
3960     +#endif
3961    
3962     while (!feof (istream))
3963     {
3964     bool match;
3965     char *thisfield;
3966     size_t thislen;
3967     +#if HAVE_MBRTOWC
3968     + mbstate_t thisstate;
3969     +#endif
3970     if (readlinebuffer (thisline, istream) == 0)
3971     {
3972     if (ferror (istream))
3973     @@ -343,6 +546,15 @@ check_file (const char *infile, const ch
3974     }
3975     thisfield = find_field (thisline);
3976     thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3977     +#if HAVE_MBRTOWC
3978     + if (MB_CUR_MAX > 1)
3979     + {
3980     + thisstate = thisline->state;
3981     + match = !different_multi (thisfield, prevfield,
3982     + thislen, prevlen, thisstate, prevstate);
3983     + }
3984     + else
3985     +#endif
3986     match = !different (thisfield, prevfield, thislen, prevlen);
3987     match_count += match;
3988    
3989     @@ -375,6 +587,9 @@ check_file (const char *infile, const ch
3990     SWAP_LINES (prevline, thisline);
3991     prevfield = thisfield;
3992     prevlen = thislen;
3993     +#if HAVE_MBRTOWC
3994     + prevstate = thisstate;
3995     +#endif
3996     if (!match)
3997     match_count = 0;
3998     }
3999     @@ -420,6 +635,18 @@ main (int argc, char **argv)
4000    
4001     atexit (close_stdout);
4002    
4003     +#if HAVE_MBRTOWC
4004     + if (MB_CUR_MAX > 1)
4005     + {
4006     + find_field = find_field_multi;
4007     + blank_type = wctype ("blank");
4008     + }
4009     + else
4010     +#endif
4011     + {
4012     + find_field = find_field_uni;
4013     + }
4014     +
4015     skip_chars = 0;
4016     skip_fields = 0;
4017     check_chars = SIZE_MAX;