Magellan Linux

Annotation of /trunk/coreutils/patches-6.9/coreutils-6.9-i18n.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 223 - (hide annotations) (download)
Sat Jun 23 13:51:30 2007 UTC (16 years, 11 months ago) by niro
File size: 103622 byte(s)
-rev bump to 6.9

1 niro 223 Submitted by: Matt Burgess (matthew at linuxfromscratch.org)
2     Date: 2007-04-07
3     Initial Package Version: 6.9
4     Upstream Status: Rejected
5     Origin: Based on coreutils-5.93-i18n-2.patch by Alexander Patrakov
6     Description: This patch fixes various problems with multibyte character support.
7     LSB >= 2.0 tests for features added by this patch, but only Coreutils-5.2.1 plus
8     http://www.linuxfromscratch.org/~alexander/patches/coreutils-5.2.1-i18n_fixes-1.patch
9     actually pass the Li18nux2000-level1 testsuite.
10    
11     diff -Naur coreutils-6.9.orig/lib/linebuffer.h coreutils-6.9/lib/linebuffer.h
12     --- coreutils-6.9.orig/lib/linebuffer.h 2005-05-14 06:03:58.000000000 +0000
13     +++ coreutils-6.9/lib/linebuffer.h 2007-04-07 16:59:55.000000000 +0000
14     @@ -22,6 +22,11 @@
15    
16     # include <stdio.h>
17    
18     +/* Get mbstate_t. */
19     +# if HAVE_WCHAR_H
20     +# include <wchar.h>
21     +# endif
22     +
23     /* A `struct linebuffer' holds a line of text. */
24    
25     struct linebuffer
26     @@ -29,6 +34,9 @@
27     size_t size; /* Allocated. */
28     size_t length; /* Used. */
29     char *buffer;
30     +# if HAVE_WCHAR_H
31     + mbstate_t state;
32     +# endif
33     };
34    
35     /* Initialize linebuffer LINEBUFFER for use. */
36     diff -Naur coreutils-6.9.orig/src/cut.c coreutils-6.9/src/cut.c
37     --- coreutils-6.9.orig/src/cut.c 2007-03-18 21:36:43.000000000 +0000
38     +++ coreutils-6.9/src/cut.c 2007-04-07 16:59:55.000000000 +0000
39     @@ -29,6 +29,11 @@
40     #include <assert.h>
41     #include <getopt.h>
42     #include <sys/types.h>
43     +
44     +/* Get mbstate_t, mbrtowc(). */
45     +#if HAVE_WCHAR_H
46     +# include <wchar.h>
47     +#endif
48     #include "system.h"
49    
50     #include "error.h"
51     @@ -37,6 +42,18 @@
52     #include "quote.h"
53     #include "xstrndup.h"
54    
55     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
56     + installation; work around this configuration error. */
57     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
58     +# undef MB_LEN_MAX
59     +# define MB_LEN_MAX 16
60     +#endif
61     +
62     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
63     +#if HAVE_MBRTOWC && defined mbstate_t
64     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
65     +#endif
66     +
67     /* The official name of this program (e.g., no `g' prefix). */
68     #define PROGRAM_NAME "cut"
69    
70     @@ -67,6 +84,52 @@
71     } \
72     while (0)
73    
74     +/* Refill the buffer BUF to get a multibyte character. */
75     +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
76     + do \
77     + { \
78     + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
79     + { \
80     + memmove (BUF, BUFPOS, BUFLEN); \
81     + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
82     + BUFPOS = BUF; \
83     + } \
84     + } \
85     + while (0)
86     +
87     +/* Get the wide character at BUFPOS; BUFPOS itself is not advanced.
88     + If the byte sequence is not a valid character, CONVFAIL is 1. Otherwise 0. */
89     +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
90     + do \
91     + { \
92     + mbstate_t state_bak; \
93     + \
94     + if (BUFLEN < 1) \
95     + { \
96     + WC = WEOF; \
97     + break; \
98     + } \
99     + \
100     + /* Get a wide character. */ \
101     + CONVFAIL = 0; \
102     + state_bak = STATE; \
103     + MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
104     + \
105     + switch (MBLENGTH) \
106     + { \
107     + case (size_t)-1: \
108     + case (size_t)-2: \
109     + CONVFAIL++; \
110     + STATE = state_bak; \
111     + /* Fall through. */ \
112     + \
113     + case 0: \
114     + MBLENGTH = 1; \
115     + break; \
116     + } \
117     + } \
118     + while (0)
119     +
120     struct range_pair
121     {
122     size_t lo;
123     @@ -85,7 +148,7 @@
124     /* The number of bytes allocated for FIELD_1_BUFFER. */
125     static size_t field_1_bufsize;
126    
127     -/* The largest field or byte index used as an endpoint of a closed
128     +/* The largest byte, character or field index used as an endpoint of a closed
129     or degenerate range specification; this doesn't include the starting
130     index of right-open-ended ranges. For example, with either range spec
131     `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
132     @@ -97,10 +160,11 @@
133    
134     /* This is a bit vector.
135     In byte mode, which bytes to output.
136     + In character mode, which characters to output.
137     In field mode, which DELIM-separated fields to output.
138     - Both bytes and fields are numbered starting with 1,
139     + Bytes, characters and fields are numbered starting with 1,
140     so the zeroth bit of this array is unused.
141     - A field or byte K has been selected if
142     + A byte, character or field K has been selected if
143     (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
144     || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
145     static unsigned char *printable_field;
146     @@ -109,9 +173,12 @@
147     {
148     undefined_mode,
149    
150     - /* Output characters that are in the given bytes. */
151     + /* Output bytes that are at the given positions. */
152     byte_mode,
153    
154     + /* Output characters that are at the given positions. */
155     + character_mode,
156     +
157     /* Output the given delimeter-separated fields. */
158     field_mode
159     };
160     @@ -121,6 +188,13 @@
161    
162     static enum operating_mode operating_mode;
163    
164     +/* If nonzero, when in byte mode, don't split multibyte characters. */
165     +static int byte_mode_character_aware;
166     +
167     +/* If nonzero, use the single-byte functions even when
168     + this program runs in a multibyte locale. */
169     +static int force_singlebyte_mode;
170     +
171     /* If true do not output lines containing no delimeter characters.
172     Otherwise, all such lines are printed. This option is valid only
173     with field mode. */
174     @@ -132,6 +206,9 @@
175    
176     /* The delimeter character for field mode. */
177     static unsigned char delim;
178     +#if HAVE_WCHAR_H
179     +static wchar_t wcdelim;
180     +#endif
181    
182     /* True if the --output-delimiter=STRING option was specified. */
183     static bool output_delimiter_specified;
184     @@ -205,7 +282,7 @@
185     -f, --fields=LIST select only these fields; also print any line\n\
186     that contains no delimiter character, unless\n\
187     the -s option is specified\n\
188     - -n (ignored)\n\
189     + -n with -b: don't split multibyte characters\n\
190     "), stdout);
191     fputs (_("\
192     --complement complement the set of selected bytes, characters\n\
193     @@ -362,7 +439,7 @@
194     in_digits = false;
195     /* Starting a range. */
196     if (dash_found)
197     - FATAL_ERROR (_("invalid byte or field list"));
198     + FATAL_ERROR (_("invalid byte, character or field list"));
199     dash_found = true;
200     fieldstr++;
201    
202     @@ -387,14 +464,16 @@
203     if (value == 0)
204     {
205     /* `n-'. From `initial' to end of line. */
206     - eol_range_start = initial;
207     + if (eol_range_start == 0 ||
208     + (eol_range_start != 0 && eol_range_start > initial))
209     + eol_range_start = initial;
210     field_found = true;
211     }
212     else
213     {
214     /* `m-n' or `-n' (1-n). */
215     if (value < initial)
216     - FATAL_ERROR (_("invalid byte or field list"));
217     + FATAL_ERROR (_("invalid byte, character or field list"));
218    
219     /* Is there already a range going to end of line? */
220     if (eol_range_start != 0)
221     @@ -467,6 +546,9 @@
222     if (operating_mode == byte_mode)
223     error (0, 0,
224     _("byte offset %s is too large"), quote (bad_num));
225     + else if (operating_mode == character_mode)
226     + error (0, 0,
227     + _("character offset %s is too large"), quote (bad_num));
228     else
229     error (0, 0,
230     _("field number %s is too large"), quote (bad_num));
231     @@ -477,7 +559,7 @@
232     fieldstr++;
233     }
234     else
235     - FATAL_ERROR (_("invalid byte or field list"));
236     + FATAL_ERROR (_("invalid byte, character or field list"));
237     }
238    
239     max_range_endpoint = 0;
240     @@ -570,6 +652,63 @@
241     }
242     }
243    
244     +#if HAVE_MBRTOWC
245     +/* This function is used in the following cases.
246     +
247     + 1. Read from the stream STREAM, printing to standard output any selected
248     + characters.
249     +
250     + 2. Read from stream STREAM, printing to standard output any selected bytes,
251     + without splitting multibyte characters. */
252     +
253     +static void
254     +cut_characters_or_cut_bytes_no_split (FILE *stream)
255     +{
256     + int idx; /* number of bytes or characters in the line so far. */
257     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
258     + char *bufpos; /* Next read position of BUF. */
259     + size_t buflen; /* The length of the byte sequence in buf. */
260     + wint_t wc; /* A gotten wide character. */
261     + size_t mblength; /* The byte size of a multibyte character which shows
262     + as same character as WC. */
263     + mbstate_t state; /* State of the stream. */
264     + int convfail; /* 1, when conversion is failed. Otherwise 0. */
265     +
266     + idx = 0;
267     + buflen = 0;
268     + bufpos = buf;
269     + memset (&state, '\0', sizeof(mbstate_t));
270     +
271     + while (1)
272     + {
273     + REFILL_BUFFER (buf, bufpos, buflen, stream);
274     +
275     + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
276     +
277     + if (wc == WEOF)
278     + {
279     + if (idx > 0)
280     + putchar ('\n');
281     + break;
282     + }
283     + else if (wc == L'\n')
284     + {
285     + putchar ('\n');
286     + idx = 0;
287     + }
288     + else
289     + {
290     + idx += (operating_mode == byte_mode) ? mblength : 1;
291     + if (print_kth (idx, NULL))
292     + fwrite (bufpos, mblength, sizeof(char), stdout);
293     + }
294     +
295     + buflen -= mblength;
296     + bufpos += mblength;
297     + }
298     +}
299     +#endif
300     +
301     /* Read from stream STREAM, printing to standard output any selected fields. */
302    
303     static void
304     @@ -692,13 +831,192 @@
305     }
306     }
307    
308     +#if HAVE_MBRTOWC /* cut_fields_mb: counterpart of cut_fields for multibyte locales. */
309     +static void
310     +cut_fields_mb (FILE *stream)
311     +{
312     + int c;
313     + unsigned int field_idx;
314     + int found_any_selected_field;
315     + int buffer_first_field;
316     + int empty_input;
317     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
318     + char *bufpos; /* Next read position of BUF. */
319     + size_t buflen; /* The length of the byte sequence in buf. */
320     + wint_t wc = 0; /* The wide character just read. */
321     + size_t mblength; /* The number of bytes at BUFPOS that make up WC
322     + (1 when the conversion fails). */
323     + mbstate_t state; /* State of the stream. */
324     + int convfail = 0; /* 1 when the last conversion failed. Otherwise 0. */
325     +
326     + found_any_selected_field = 0;
327     + field_idx = 1;
328     + bufpos = buf;
329     + buflen = 0;
330     + memset (&state, '\0', sizeof(mbstate_t));
331     +
332     + c = getc (stream);
333     + empty_input = (c == EOF);
334     + if (c != EOF)
335     + ungetc (c, stream);
336     + else
337     + wc = WEOF;
338     +
339     + /* To support the semantics of the -s flag, we may have to buffer
340     + all of the first field to determine whether it is `delimited.'
341     + But that is unnecessary if all non-delimited lines must be printed
342     + and the first field has been selected, or if non-delimited lines
343     + must be suppressed and the first field has *not* been selected.
344     + That is because a non-delimited line has exactly one field. */
345     + buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
346     +
347     + while (1)
348     + {
349     + if (field_idx == 1 && buffer_first_field)
350     + {
351     + int len = 0;
352     +
353     + while (1)
354     + {
355     + REFILL_BUFFER (buf, bufpos, buflen, stream);
356     +
357     + GET_NEXT_WC_FROM_BUFFER
358     + (wc, bufpos, buflen, mblength, state, convfail);
359     +
360     + if (wc == WEOF)
361     + break;
362     +
363     + field_1_buffer = xrealloc (field_1_buffer, len + mblength);
364     + memcpy (field_1_buffer + len, bufpos, mblength);
365     + len += mblength;
366     + buflen -= mblength;
367     + bufpos += mblength;
368     +
369     + if (!convfail && (wc == L'\n' || wc == wcdelim))
370     + break;
371     + }
372     +
373     + if (wc == WEOF)
374     + break;
375     +
376     + /* If the first field extends to the end of line (it is not
377     + delimited) and we are printing all non-delimited lines,
378     + print this one. */
379     + if (convfail || (!convfail && wc != wcdelim))
380     + {
381     + if (suppress_non_delimited)
382     + {
383     + /* Empty. */
384     + }
385     + else
386     + {
387     + fwrite (field_1_buffer, sizeof (char), len, stdout);
388     + /* Make sure the output line is newline terminated. */
389     + if (convfail || (!convfail && wc != L'\n'))
390     + putchar ('\n');
391     + }
392     + continue;
393     + }
394     +
395     + if (print_kth (1, NULL))
396     + {
397     + /* Print the field, but not the trailing MBLENGTH-byte delimiter. */
398     + fwrite (field_1_buffer, sizeof (char), len - mblength, stdout);
399     + found_any_selected_field = 1;
400     + }
401     + ++field_idx;
402     + }
403     +
404     + if (wc != WEOF)
405     + {
406     + if (print_kth (field_idx, NULL))
407     + {
408     + if (found_any_selected_field)
409     + {
410     + fwrite (output_delimiter_string, sizeof (char),
411     + output_delimiter_length, stdout);
412     + }
413     + found_any_selected_field = 1;
414     + }
415     +
416     + while (1)
417     + {
418     + REFILL_BUFFER (buf, bufpos, buflen, stream);
419     +
420     + GET_NEXT_WC_FROM_BUFFER
421     + (wc, bufpos, buflen, mblength, state, convfail);
422     +
423     + if (wc == WEOF)
424     + break;
425     + else if (!convfail && (wc == wcdelim || wc == L'\n'))
426     + {
427     + buflen -= mblength;
428     + bufpos += mblength;
429     + break;
430     + }
431     +
432     + if (print_kth (field_idx, NULL))
433     + fwrite (bufpos, mblength, sizeof(char), stdout);
434     +
435     + buflen -= mblength;
436     + bufpos += mblength;
437     + }
438     + }
439     +
440     + if ((!convfail || wc == L'\n') && buflen < 1)
441     + wc = WEOF;
442     +
443     + if (!convfail && wc == wcdelim)
444     + ++field_idx;
445     + else if (wc == WEOF || (!convfail && wc == L'\n'))
446     + {
447     + if (found_any_selected_field
448     + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
449     + putchar ('\n');
450     + if (wc == WEOF)
451     + break;
452     + field_idx = 1;
453     + found_any_selected_field = 0;
454     + }
455     + }
456     +}
457     +#endif
458     +
459     static void
460     cut_stream (FILE *stream)
461     {
462     - if (operating_mode == byte_mode)
463     - cut_bytes (stream);
464     +#if HAVE_MBRTOWC
465     + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
466     + {
467     + switch (operating_mode)
468     + {
469     + case byte_mode:
470     + if (byte_mode_character_aware)
471     + cut_characters_or_cut_bytes_no_split (stream);
472     + else
473     + cut_bytes (stream);
474     + break;
475     +
476     + case character_mode:
477     + cut_characters_or_cut_bytes_no_split (stream);
478     + break;
479     +
480     + case field_mode:
481     + cut_fields_mb (stream);
482     + break;
483     +
484     + default:
485     + abort ();
486     + }
487     + }
488     else
489     - cut_fields (stream);
490     +#endif
491     + {
492     + if (operating_mode == field_mode)
493     + cut_fields (stream);
494     + else
495     + cut_bytes (stream);
496     + }
497     }
498    
499     /* Process file FILE to standard output.
500     @@ -748,6 +1066,8 @@
501     bool ok;
502     bool delim_specified = false;
503     char *spec_list_string IF_LINT(= NULL);
504     + char mbdelim[MB_LEN_MAX + 1];
505     + size_t delimlen = 0;
506    
507     initialize_main (&argc, &argv);
508     program_name = argv[0];
509     @@ -770,7 +1090,6 @@
510     switch (optc)
511     {
512     case 'b':
513     - case 'c':
514     /* Build the byte list. */
515     if (operating_mode != undefined_mode)
516     FATAL_ERROR (_("only one type of list may be specified"));
517     @@ -778,6 +1097,14 @@
518     spec_list_string = optarg;
519     break;
520    
521     + case 'c':
522     + /* Build the character list. */
523     + if (operating_mode != undefined_mode)
524     + FATAL_ERROR (_("only one type of list may be specified"));
525     + operating_mode = character_mode;
526     + spec_list_string = optarg;
527     + break;
528     +
529     case 'f':
530     /* Build the field list. */
531     if (operating_mode != undefined_mode)
532     @@ -789,10 +1116,35 @@
533     case 'd':
534     /* New delimiter. */
535     /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
536     - if (optarg[0] != '\0' && optarg[1] != '\0')
537     - FATAL_ERROR (_("the delimiter must be a single character"));
538     - delim = optarg[0];
539     - delim_specified = true;
540     +#if HAVE_MBRTOWC
541     + {
542     + if(MB_CUR_MAX > 1)
543     + {
544     + mbstate_t state;
545     +
546     + memset (&state, '\0', sizeof(mbstate_t));
547     + delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
548     +
549     + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
550     + ++force_singlebyte_mode;
551     + else
552     + {
553     + delimlen = (delimlen < 1) ? 1 : delimlen;
554     + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
555     + FATAL_ERROR (_("the delimiter must be a single character"));
556     + memcpy (mbdelim, optarg, delimlen);
557     + }
558     + }
559     +
560     + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
561     +#endif
562     + {
563     + if (optarg[0] != '\0' && optarg[1] != '\0')
564     + FATAL_ERROR (_("the delimiter must be a single character"));
565     + delim = (unsigned char) optarg[0];
566     + }
567     + delim_specified = true;
568     + }
569     break;
570    
571     case OUTPUT_DELIMITER_OPTION:
572     @@ -805,6 +1157,7 @@
573     break;
574    
575     case 'n':
576     + byte_mode_character_aware = 1;
577     break;
578    
579     case 's':
580     @@ -827,7 +1180,7 @@
581     if (operating_mode == undefined_mode)
582     FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
583    
584     - if (delim != '\0' && operating_mode != field_mode)
585     + if (delim_specified && operating_mode != field_mode)
586     FATAL_ERROR (_("an input delimiter may be specified only\
587     when operating on fields"));
588    
589     @@ -854,15 +1207,34 @@
590     }
591    
592     if (!delim_specified)
593     - delim = '\t';
594     + {
595     + delim = '\t';
596     +#ifdef HAVE_MBRTOWC
597     + wcdelim = L'\t';
598     + mbdelim[0] = '\t';
599     + mbdelim[1] = '\0';
600     + delimlen = 1;
601     +#endif
602     + }
603    
604     if (output_delimiter_string == NULL)
605     {
606     - static char dummy[2];
607     - dummy[0] = delim;
608     - dummy[1] = '\0';
609     - output_delimiter_string = dummy;
610     - output_delimiter_length = 1;
611     +#ifdef HAVE_MBRTOWC
612     + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
613     + {
614     + output_delimiter_string = xstrdup(mbdelim);
615     + output_delimiter_length = delimlen;
616     + }
617     +
618     + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
619     +#endif
620     + {
621     + static char dummy[2];
622     + dummy[0] = delim;
623     + dummy[1] = '\0';
624     + output_delimiter_string = dummy;
625     + output_delimiter_length = 1;
626     + }
627     }
628    
629     if (optind == argc)
630     diff -Naur coreutils-6.9.orig/src/expand.c coreutils-6.9/src/expand.c
631     --- coreutils-6.9.orig/src/expand.c 2007-03-18 21:36:43.000000000 +0000
632     +++ coreutils-6.9/src/expand.c 2007-04-07 16:59:55.000000000 +0000
633     @@ -38,11 +38,28 @@
634     #include <stdio.h>
635     #include <getopt.h>
636     #include <sys/types.h>
637     +
638     +/* Get mbstate_t, mbrtowc(), wcwidth(). */
639     +#if HAVE_WCHAR_H
640     +# include <wchar.h>
641     +#endif
642     +
643     #include "system.h"
644     #include "error.h"
645     #include "quote.h"
646     #include "xstrndup.h"
647    
648     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
649     + installation; work around this configuration error. */
650     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
651     +# define MB_LEN_MAX 16
652     +#endif
653     +
654     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
655     +#if HAVE_MBRTOWC && defined mbstate_t
656     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
657     +#endif
658     +
659     /* The official name of this program (e.g., no `g' prefix). */
660     #define PROGRAM_NAME "expand"
661    
662     @@ -183,6 +200,7 @@
663     stops = num_start + len - 1;
664     }
665     }
666     +
667     else
668     {
669     error (0, 0, _("tab size contains invalid character(s): %s"),
670     @@ -365,6 +383,142 @@
671     }
672     }
673    
674     +#if HAVE_MBRTOWC /* expand_multibyte: counterpart of expand for multibyte locales. */
675     +static void
676     +expand_multibyte (void)
677     +{
678     + FILE *fp; /* Input stream. */
679     + mbstate_t i_state; /* Current shift state of the input stream. */
680     + mbstate_t i_state_bak; /* Back up the I_STATE. */
681     + mbstate_t o_state; /* Current shift state of the output stream. */
682     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
683     + char *bufpos = buf; /* Next read position of BUF. */
684     + size_t buflen = 0; /* The length of the byte sequence in buf. */
685     + wchar_t wc; /* The wide character just read. */
686     + size_t mblength; /* The number of bytes at BUFPOS that make up WC
687     + (1 when the conversion fails). */
688     + int tab_index = 0; /* Index in `tab_list' of next tabstop. */
689     + int column = 0; /* Column on screen of the next char. */
690     + int next_tab_column; /* Column the next tab stop is on. */
691     + int convert = 1; /* If nonzero, perform translations. */
692     +
693     + fp = next_file ((FILE *) NULL);
694     + if (fp == NULL)
695     + return;
696     +
697     + memset (&o_state, '\0', sizeof(mbstate_t));
698     + memset (&i_state, '\0', sizeof(mbstate_t));
699     +
700     + for (;;)
701     + {
702     + /* Refill the buffer BUF. */
703     + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
704     + {
705     + memmove (buf, bufpos, buflen);
706     + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
707     + bufpos = buf;
708     + }
709     +
710     + /* No character is left in BUF. */
711     + if (buflen < 1)
712     + {
713     + fp = next_file (fp);
714     +
715     + if (fp == NULL)
716     + break; /* No more files. */
717     + else
718     + {
719     + memset (&i_state, '\0', sizeof(mbstate_t));
720     + continue;
721     + }
722     + }
723     +
724     + /* Get a wide character. */
725     + i_state_bak = i_state;
726     + mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
727     +
728     + switch (mblength)
729     + {
730     + case (size_t)-1: /* illegal byte sequence. */
731     + case (size_t)-2:
732     + mblength = 1;
733     + i_state = i_state_bak;
734     + if (convert)
735     + {
736     + ++column;
737     + if (convert_entire_line == 0)
738     + convert = 0;
739     + }
740     + putchar (*bufpos);
741     + break;
742     +
743     + case 0: /* null. */
744     + mblength = 1;
745     + if (convert && convert_entire_line == 0)
746     + convert = 0;
747     + putchar ('\0');
748     + break;
749     +
750     + default:
751     + if (wc == L'\n') /* LF. */
752     + {
753     + tab_index = 0;
754     + column = 0;
755     + convert = 1;
756     + putchar ('\n');
757     + }
758     + else if (wc == L'\t' && convert) /* Tab. */
759     + {
760     + if (tab_size == 0)
761     + {
762     + /* Do not let tab_index == first_free_tab;
763     + stop when it is 1 less. */
764     + while (tab_index < first_free_tab - 1
765     + && column >= tab_list[tab_index])
766     + tab_index++;
767     + next_tab_column = tab_list[tab_index];
768     + if (tab_index < first_free_tab - 1)
769     + tab_index++;
770     + if (column >= next_tab_column)
771     + next_tab_column = column + 1;
772     + }
773     + else
774     + next_tab_column = column + tab_size - column % tab_size;
775     +
776     + while (column < next_tab_column)
777     + {
778     + putchar (' ');
779     + ++column;
780     + }
781     + }
782     + else /* Others. */
783     + {
784     + if (convert)
785     + {
786     + if (wc == L'\b')
787     + {
788     + if (column > 0)
789     + --column;
790     + }
791     + else
792     + {
793     + int width; /* The width of WC. */
794     +
795     + width = wcwidth (wc);
796     + column += (width > 0) ? width : 0;
797     + if (convert_entire_line == 0)
798     + convert = 0;
799     + }
800     + }
801     + fwrite (bufpos, sizeof(char), mblength, stdout);
802     + }
803     + }
804     + buflen -= mblength;
805     + bufpos += mblength;
806     + }
807     +}
808     +#endif
809     +
810     int
811     main (int argc, char **argv)
812     {
813     @@ -429,7 +583,12 @@
814    
815     file_list = (optind < argc ? &argv[optind] : stdin_argv);
816    
817     - expand ();
818     +#if HAVE_MBRTOWC
819     + if (MB_CUR_MAX > 1)
820     + expand_multibyte ();
821     + else
822     +#endif
823     + expand ();
824    
825     if (have_read_stdin && fclose (stdin) != 0)
826     error (EXIT_FAILURE, errno, "-");
827     diff -Naur coreutils-6.9.orig/src/fold.c coreutils-6.9/src/fold.c
828     --- coreutils-6.9.orig/src/fold.c 2007-03-18 21:36:43.000000000 +0000
829     +++ coreutils-6.9/src/fold.c 2007-04-07 16:59:55.000000000 +0000
830     @@ -23,11 +23,33 @@
831     #include <getopt.h>
832     #include <sys/types.h>
833    
834     +/* Get mbstate_t, mbrtowc(), wcwidth(). */
835     +#if HAVE_WCHAR_H
836     +# include <wchar.h>
837     +#endif
838     +
839     +/* Get iswprint(), iswblank(), wcwidth(). */
840     +#if HAVE_WCTYPE_H
841     +# include <wctype.h>
842     +#endif
843     +
844     #include "system.h"
845     #include "error.h"
846     #include "quote.h"
847     #include "xstrtol.h"
848    
849     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
850     + installation; work around this configuration error. */
851     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
852     +# undef MB_LEN_MAX
853     +# define MB_LEN_MAX 16
854     +#endif
855     +
856     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
857     +#if HAVE_MBRTOWC && defined mbstate_t
858     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
859     +#endif
860     +
861     #define TAB_WIDTH 8
862    
863     /* The official name of this program (e.g., no `g' prefix). */
864     @@ -35,23 +57,44 @@
865    
866     #define AUTHORS "David MacKenzie"
867    
868     +#define FATAL_ERROR(Message) \
869     + do \
870     + { \
871     + error (0, 0, (Message)); \
872     + usage (2); \
873     + } \
874     + while (0)
875     +
876     +enum operating_mode
877     +{
878     + /* Fold texts by columns that are at the given positions. */
879     + column_mode,
880     +
881     + /* Fold texts by bytes that are at the given positions. */
882     + byte_mode,
883     +
884     + /* Fold texts by characters that are at the given positions. */
885     + character_mode,
886     +};
887     +
888     /* The name this program was run with. */
889     char *program_name;
890    
891     +/* The current operating mode. (Default: column_mode) */
892     +static enum operating_mode operating_mode;
893     +
894     /* If nonzero, try to break on whitespace. */
895     static bool break_spaces;
896    
897     -/* If nonzero, count bytes, not column positions. */
898     -static bool count_bytes;
899     -
900     /* If nonzero, at least one of the files we read was standard input. */
901     static bool have_read_stdin;
902    
903     -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
904     +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
905    
906     static struct option const longopts[] =
907     {
908     {"bytes", no_argument, NULL, 'b'},
909     + {"characters", no_argument, NULL, 'c'},
910     {"spaces", no_argument, NULL, 's'},
911     {"width", required_argument, NULL, 'w'},
912     {GETOPT_HELP_OPTION_DECL},
913     @@ -81,6 +124,7 @@
914     "), stdout);
915     fputs (_("\
916     -b, --bytes count bytes rather than columns\n\
917     + -c, --characters count characters rather than columns\n\
918     -s, --spaces break at spaces\n\
919     -w, --width=WIDTH use WIDTH columns instead of 80\n\
920     "), stdout);
921     @@ -98,7 +142,7 @@
922     static size_t
923     adjust_column (size_t column, char c)
924     {
925     - if (!count_bytes)
926     + if (operating_mode != byte_mode)
927     {
928     if (c == '\b')
929     {
930     @@ -117,34 +161,14 @@
931     return column;
932     }
933    
934     -/* Fold file FILENAME, or standard input if FILENAME is "-",
935     - to stdout, with maximum line length WIDTH.
936     - Return true if successful. */
937     -
938     -static bool
939     -fold_file (char const *filename, size_t width)
940     +static void
941     +fold_text (FILE *istream, size_t width, int *saved_errno)
942     {
943     - FILE *istream;
944     int c;
945     size_t column = 0; /* Screen column where next char will go. */
946     size_t offset_out = 0; /* Index in `line_out' for next char. */
947     static char *line_out = NULL;
948     static size_t allocated_out = 0;
949     - int saved_errno;
950     -
951     - if (STREQ (filename, "-"))
952     - {
953     - istream = stdin;
954     - have_read_stdin = true;
955     - }
956     - else
957     - istream = fopen (filename, "r");
958     -
959     - if (istream == NULL)
960     - {
961     - error (0, errno, "%s", filename);
962     - return false;
963     - }
964    
965     while ((c = getc (istream)) != EOF)
966     {
967     @@ -172,6 +196,15 @@
968     bool found_blank = false;
969     size_t logical_end = offset_out;
970    
971     + /* If LINE_OUT is empty, store the character
972     + in LINE_OUT anyway, even though the column
973     + is bigger than width. */
974     + if (offset_out == 0)
975     + {
976     + line_out[offset_out++] = c;
977     + continue;
978     + }
979     +
980     /* Look for the last blank. */
981     while (logical_end)
982     {
983     @@ -218,11 +251,225 @@
984     line_out[offset_out++] = c;
985     }
986    
987     - saved_errno = errno;
988     + *saved_errno = errno;
989    
990     if (offset_out)
991     fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
992    
993     + free(line_out);
994     +}
995     +
996     +#if HAVE_MBRTOWC
997     +static void
998     +fold_multibyte_text (FILE *istream, int width, int *saved_errno)
999     +{
1000     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
1001     + size_t buflen = 0; /* The length of the byte sequence in buf. */
1002     + char *bufpos; /* Next read position of BUF. */
1003     + wint_t wc; /* The wide character just converted. */
1004     + size_t mblength; /* The byte length of the multibyte character that
1005     + was converted to WC. */
1006     + mbstate_t state, state_bak; /* State of the stream. */
1007     + int convfail; /* 1 when the conversion failed, otherwise 0. */
1008     +
1009     + char *line_out = NULL;
1010     + size_t offset_out = 0; /* Index in `line_out' for next char. */
1011     + size_t allocated_out = 0;
1012     +
1013     + int increment;
1014     + size_t column = 0;
1015     +
1016     + size_t last_blank_pos;
1017     + size_t last_blank_column;
1018     + int is_blank_seen;
1019     + int last_blank_increment;
1020     + int is_bs_following_last_blank;
1021     + size_t bs_following_last_blank_num;
1022     + int is_cr_after_last_blank;
1023     +
1024     +#define CLEAR_FLAGS \
1025     + do \
1026     + { \
1027     + last_blank_pos = 0; \
1028     + last_blank_column = 0; \
1029     + is_blank_seen = 0; \
1030     + is_bs_following_last_blank = 0; \
1031     + bs_following_last_blank_num = 0; \
1032     + is_cr_after_last_blank = 0; \
1033     + } \
1034     + while (0)
1035     +
1036     +#define START_NEW_LINE \
1037     + do \
1038     + { \
1039     + putchar ('\n'); \
1040     + column = 0; \
1041     + offset_out = 0; \
1042     + CLEAR_FLAGS; \
1043     + } \
1044     + while (0)
1045     +
1046     + CLEAR_FLAGS;
1047     + memset (&state, '\0', sizeof(mbstate_t));
1048     +
1049     + for (;; bufpos += mblength, buflen -= mblength)
1050     + {
1051     + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1052     + {
1053     + memmove (buf, bufpos, buflen);
1054     + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1055     + bufpos = buf;
1056     + }
1057     +
1058     + if (buflen < 1)
1059     + break;
1060     +
1061     + /* Get a wide character. */
1062     + convfail = 0;
1063     + state_bak = state;
1064     + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1065     +
1066     + switch (mblength)
1067     + {
1068     + case (size_t)-1:
1069     + case (size_t)-2:
1070     + convfail++;
1071     + state = state_bak;
1072     + /* Fall through. */
1073     +
1074     + case 0:
1075     + mblength = 1;
1076     + break;
1077     + }
1078     +
1079     +rescan:
1080     + if (operating_mode == byte_mode) /* byte mode */
1081     + increment = mblength;
1082     + else if (operating_mode == character_mode) /* character mode */
1083     + increment = 1;
1084     + else /* column mode */
1085     + {
1086     + if (convfail)
1087     + increment = 1;
1088     + else
1089     + {
1090     + switch (wc)
1091     + {
1092     + case L'\n':
1093     + fwrite (line_out, sizeof(char), offset_out, stdout);
1094     + START_NEW_LINE;
1095     + continue;
1096     +
1097     + case L'\b':
1098     + increment = (column > 0) ? -1 : 0;
1099     + break;
1100     +
1101     + case L'\r':
1102     + increment = -1 * column;
1103     + break;
1104     +
1105     + case L'\t':
1106     + increment = 8 - column % 8;
1107     + break;
1108     +
1109     + default:
1110     + increment = wcwidth (wc);
1111     + increment = (increment < 0) ? 0 : increment;
1112     + }
1113     + }
1114     + }
1115     +
1116     + if (column + increment > width && break_spaces && last_blank_pos)
1117     + {
1118     + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1119     + putchar ('\n');
1120     +
1121     + offset_out = offset_out - last_blank_pos;
1122     + column = column - last_blank_column + ((is_cr_after_last_blank)
1123     + ? last_blank_increment : bs_following_last_blank_num);
1124     + memmove (line_out, line_out + last_blank_pos, offset_out);
1125     + CLEAR_FLAGS;
1126     + goto rescan;
1127     + }
1128     +
1129     + if (column + increment > width && column != 0)
1130     + {
1131     + fwrite (line_out, sizeof(char), offset_out, stdout);
1132     + START_NEW_LINE;
1133     + goto rescan;
1134     + }
1135     +
1136     + if (allocated_out < offset_out + mblength)
1137     + {
1138     + allocated_out += 1024;
1139     + line_out = xrealloc (line_out, allocated_out);
1140     + }
1141     +
1142     + memcpy (line_out + offset_out, bufpos, mblength);
1143     + offset_out += mblength;
1144     + column += increment;
1145     +
1146     + if (is_blank_seen && !convfail && wc == L'\r')
1147     + is_cr_after_last_blank = 1;
1148     +
1149     + if (is_bs_following_last_blank && !convfail && wc == L'\b')
1150     + ++bs_following_last_blank_num;
1151     + else
1152     + is_bs_following_last_blank = 0;
1153     +
1154     + if (break_spaces && !convfail && iswblank (wc))
1155     + {
1156     + last_blank_pos = offset_out;
1157     + last_blank_column = column;
1158     + is_blank_seen = 1;
1159     + last_blank_increment = increment;
1160     + is_bs_following_last_blank = 1;
1161     + bs_following_last_blank_num = 0;
1162     + is_cr_after_last_blank = 0;
1163     + }
1164     + }
1165     +
1166     + *saved_errno = errno;
1167     +
1168     + if (offset_out)
1169     + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1170     +
1171     + free(line_out);
1172     +}
1173     +#endif
1174     +
1175     +/* Fold file FILENAME, or standard input if FILENAME is "-",
1176     + to stdout, with maximum line length WIDTH.
1177     + Return 0 if successful, 1 if an error occurs. */
1178     +
1179     +static int
1180     +fold_file (char *filename, int width)
1181     +{
1182     + FILE *istream;
1183     + int saved_errno;
1184     +
1185     + if (STREQ (filename, "-"))
1186     + {
1187     + istream = stdin;
1188     + have_read_stdin = 1;
1189     + }
1190     + else
1191     + istream = fopen (filename, "r");
1192     +
1193     + if (istream == NULL)
1194     + {
1195     + error (0, errno, "%s", filename);
1196     + return 1;
1197     + }
1198     +
1199     + /* Define how ISTREAM is being folded. */
1200     +#if HAVE_MBRTOWC
1201     + if (MB_CUR_MAX > 1)
1202     + fold_multibyte_text (istream, width, &saved_errno);
1203     + else
1204     +#endif
1205     + fold_text (istream, width, &saved_errno);
1206     +
1207     if (ferror (istream))
1208     {
1209     error (0, saved_errno, "%s", filename);
1210     @@ -255,7 +502,8 @@
1211    
1212     atexit (close_stdout);
1213    
1214     - break_spaces = count_bytes = have_read_stdin = false;
1215     + operating_mode = column_mode;
1216     + break_spaces = have_read_stdin = false;
1217    
1218     while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1219     {
1220     @@ -264,7 +512,15 @@
1221     switch (optc)
1222     {
1223     case 'b': /* Count bytes rather than columns. */
1224     - count_bytes = true;
1225     + if (operating_mode != column_mode)
1226     + FATAL_ERROR (_("only one way of folding may be specified"));
1227     + operating_mode = byte_mode;
1228     + break;
1229     +
1230     + case 'c':
1231     + if (operating_mode != column_mode)
1232     + FATAL_ERROR (_("only one way of folding may be specified"));
1233     + operating_mode = character_mode;
1234     break;
1235    
1236     case 's': /* Break at word boundaries. */
1237     diff -Naur coreutils-6.9.orig/src/join.c coreutils-6.9/src/join.c
1238     --- coreutils-6.9.orig/src/join.c 2007-03-18 21:36:43.000000000 +0000
1239     +++ coreutils-6.9/src/join.c 2007-04-07 16:59:55.000000000 +0000
1240     @@ -23,16 +23,30 @@
1241     #include <sys/types.h>
1242     #include <getopt.h>
1243    
1244     +/* Get mbstate_t, mbrtowc(), wcrtomb(), wcwidth(). */
1245     +#if HAVE_WCHAR_H
1246     +# include <wchar.h>
1247     +#endif
1248     +
1249     +/* Get iswblank(), towupper(). */
1250     +#if HAVE_WCTYPE_H
1251     +# include <wctype.h>
1252     +#endif
1253     +
1254     #include "system.h"
1255     #include "error.h"
1256     #include "hard-locale.h"
1257     #include "linebuffer.h"
1258     -#include "memcasecmp.h"
1259     #include "quote.h"
1260     #include "stdio--.h"
1261     #include "xmemcoll.h"
1262     #include "xstrtol.h"
1263    
1264     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1265     +#if HAVE_MBRTOWC && defined mbstate_t
1266     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1267     +#endif
1268     +
1269     /* The official name of this program (e.g., no `g' prefix). */
1270     #define PROGRAM_NAME "join"
1271    
1272     @@ -104,10 +118,12 @@
1273     /* Last element in `outlist', where a new element can be added. */
1274     static struct outlist *outlist_end = &outlist_head;
1275    
1276     -/* Tab character separating fields. If negative, fields are separated
1277     - by any nonempty string of blanks, otherwise by exactly one
1278     - tab character whose value (when cast to unsigned char) equals TAB. */
1279     -static int tab = -1;
1280     +/* Tab character separating fields. If NULL, fields are separated
1281     + by any nonempty string of blanks. */
1282     +static char *tab = NULL;
1283     +
1284     +/* The number of bytes used for tab. */
1285     +static size_t tablen = 0;
1286    
1287     static struct option const longopts[] =
1288     {
1289     @@ -190,6 +206,8 @@
1290    
1291     /* Fill in the `fields' structure in LINE. */
1292    
1293     +/* This is the single-byte-locale version; see xfields_multibyte. */
1294     +
1295     static void
1296     xfields (struct line *line)
1297     {
1298     @@ -199,10 +217,11 @@
1299     if (ptr == lim)
1300     return;
1301    
1302     - if (0 <= tab)
1303     + if (tab != NULL)
1304     {
1305     + unsigned char t = tab[0];
1306     char *sep;
1307     - for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1308     + for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1309     extract_field (line, ptr, sep - ptr);
1310     }
1311     else
1312     @@ -229,6 +248,148 @@
1313     extract_field (line, ptr, lim - ptr);
1314     }
1315    
1316     +#if HAVE_MBRTOWC
1317     +static void
1318     +xfields_multibyte (struct line *line)
1319     +{
1320     + char *ptr = line->buf.buffer;
1321     + char const *lim = ptr + line->buf.length - 1;
1322     + wchar_t wc = 0;
1323     + size_t mblength = 1;
1324     + mbstate_t state, state_bak;
1325     +
1326     + memset (&state, 0, sizeof (mbstate_t));
1327     +
1328     + if (ptr == lim)
1329     + return;
1330     +
1331     + if (tab != NULL)
1332     + {
1333     + unsigned char t = tab[0];
1334     + char *sep = ptr;
1335     + for (; ptr < lim; ptr = sep + mblength)
1336     + {
1337     + sep = ptr;
1338     + while (sep < lim)
1339     + {
1340     + state_bak = state;
1341     + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1342     +
1343     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1344     + {
1345     + mblength = 1;
1346     + state = state_bak;
1347     + }
1348     + mblength = (mblength < 1) ? 1 : mblength;
1349     +
1350     + if (mblength == tablen && !memcmp (sep, tab, mblength))
1351     + break;
1352     + else
1353     + {
1354     + sep += mblength;
1355     + continue;
1356     + }
1357     + }
1358     +
1359     + if (sep == lim)
1360     + break;
1361     +
1362     + extract_field (line, ptr, sep - ptr);
1363     + }
1364     + }
1365     + else
1366     + {
1367     + /* Skip leading blanks before the first field. */
1368     + while(ptr < lim)
1369     + {
1370     + state_bak = state;
1371     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1372     +
1373     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1374     + {
1375     + mblength = 1;
1376     + state = state_bak;
1377     + break;
1378     + }
1379     + mblength = (mblength < 1) ? 1 : mblength;
1380     +
1381     + if (!iswblank(wc))
1382     + break;
1383     + ptr += mblength;
1384     + }
1385     +
1386     + do
1387     + {
1388     + char *sep;
1389     + state_bak = state;
1390     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1391     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1392     + {
1393     + mblength = 1;
1394     + state = state_bak;
1395     + break;
1396     + }
1397     + mblength = (mblength < 1) ? 1 : mblength;
1398     +
1399     + sep = ptr + mblength;
1400     + while (sep != lim)
1401     + {
1402     + state_bak = state;
1403     + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1404     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1405     + {
1406     + mblength = 1;
1407     + state = state_bak;
1408     + break;
1409     + }
1410     + mblength = (mblength < 1) ? 1 : mblength;
1411     +
1412     + if (iswblank (wc))
1413     + break;
1414     +
1415     + sep += mblength;
1416     + }
1417     +
1418     + extract_field (line, ptr, sep - ptr);
1419     + if (sep == lim)
1420     + return;
1421     +
1422     + state_bak = state;
1423     + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1424     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1425     + {
1426     + mblength = 1;
1427     + state = state_bak;
1428     + break;
1429     + }
1430     + mblength = (mblength < 1) ? 1 : mblength;
1431     +
1432     + ptr = sep + mblength;
1433     + while (ptr != lim)
1434     + {
1435     + state_bak = state;
1436     + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1437     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1438     + {
1439     + mblength = 1;
1440     + state = state_bak;
1441     + break;
1442     + }
1443     + mblength = (mblength < 1) ? 1 : mblength;
1444     +
1445     + if (!iswblank (wc))
1446     + break;
1447     +
1448     + ptr += mblength;
1449     + }
1450     + }
1451     + while (ptr != lim);
1452     + }
1453     +
1454     + extract_field (line, ptr, lim - ptr);
1455     +}
1456     +#endif
1457     +
1458     /* Read a line from FP into LINE and split it into fields.
1459     Return true if successful. */
1460    
1461     @@ -249,6 +410,11 @@
1462     line->nfields_allocated = 0;
1463     line->nfields = 0;
1464     line->fields = NULL;
1465     +#if HAVE_MBRTOWC
1466     + if (MB_CUR_MAX > 1)
1467     + xfields_multibyte (line);
1468     + else
1469     +#endif
1470     xfields (line);
1471     return true;
1472     }
1473     @@ -303,56 +469,114 @@
1474     keycmp (struct line const *line1, struct line const *line2)
1475     {
1476     /* Start of field to compare in each file. */
1477     - char *beg1;
1478     - char *beg2;
1479     -
1480     - size_t len1;
1481     - size_t len2; /* Length of fields to compare. */
1482     + char *beg[2];
1483     + char *copy[2];
1484     + size_t len[2]; /* Length of fields to compare. */
1485     int diff;
1486     + int i, j;
1487    
1488     if (join_field_1 < line1->nfields)
1489     {
1490     - beg1 = line1->fields[join_field_1].beg;
1491     - len1 = line1->fields[join_field_1].len;
1492     + beg[0] = line1->fields[join_field_1].beg;
1493     + len[0] = line1->fields[join_field_1].len;
1494     }
1495     else
1496     {
1497     - beg1 = NULL;
1498     - len1 = 0;
1499     + beg[0] = NULL;
1500     + len[0] = 0;
1501     }
1502    
1503     if (join_field_2 < line2->nfields)
1504     {
1505     - beg2 = line2->fields[join_field_2].beg;
1506     - len2 = line2->fields[join_field_2].len;
1507     + beg[1] = line2->fields[join_field_2].beg;
1508     + len[1] = line2->fields[join_field_2].len;
1509     }
1510     else
1511     {
1512     - beg2 = NULL;
1513     - len2 = 0;
1514     + beg[1] = NULL;
1515     + len[1] = 0;
1516     }
1517    
1518     - if (len1 == 0)
1519     - return len2 == 0 ? 0 : -1;
1520     - if (len2 == 0)
1521     + if (len[0] == 0)
1522     + return len[1] == 0 ? 0 : -1;
1523     + if (len[1] == 0)
1524     return 1;
1525    
1526     if (ignore_case)
1527     {
1528     - /* FIXME: ignore_case does not work with NLS (in particular,
1529     - with multibyte chars). */
1530     - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1531     +#ifdef HAVE_MBRTOWC
1532     + if (MB_CUR_MAX > 1)
1533     + {
1534     + size_t mblength;
1535     + wchar_t wc, uwc;
1536     + mbstate_t state, state_bak;
1537     +
1538     + memset (&state, '\0', sizeof (mbstate_t));
1539     +
1540     + for (i = 0; i < 2; i++)
1541     + {
1542     + copy[i] = alloca (len[i] + 1);
1543     +
1544     + for (j = 0; j < MIN (len[0], len[1]);)
1545     + {
1546     + state_bak = state;
1547     + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1548     +
1549     + switch (mblength)
1550     + {
1551     + case (size_t) -1:
1552     + case (size_t) -2:
1553     + state = state_bak;
1554     + /* Fall through */
1555     + case 0:
1556     + mblength = 1;
1557     + break;
1558     +
1559     + default:
1560     + uwc = towupper (wc);
1561     +
1562     + if (uwc != wc)
1563     + {
1564     + mbstate_t state_wc;
1565     +
1566     + memset (&state_wc, '\0', sizeof (mbstate_t));
1567     + wcrtomb (copy[i] + j, uwc, &state_wc);
1568     + }
1569     + else
1570     + memcpy (copy[i] + j, beg[i] + j, mblength);
1571     + }
1572     + j += mblength;
1573     + }
1574     + copy[i][j] = '\0';
1575     + }
1576     + }
1577     + else
1578     +#endif
1579     + {
1580     + for (i = 0; i < 2; i++)
1581     + {
1582     + copy[i] = alloca (len[i] + 1);
1583     +
1584     + for (j = 0; j < MIN (len[0], len[1]); j++)
1585     + copy[i][j] = toupper (beg[i][j]);
1586     +
1587     + copy[i][j] = '\0';
1588     + }
1589     + }
1590     }
1591     else
1592     {
1593     - if (hard_LC_COLLATE)
1594     - return xmemcoll (beg1, len1, beg2, len2);
1595     - diff = memcmp (beg1, beg2, MIN (len1, len2));
1596     + copy[0] = (unsigned char *) beg[0];
1597     + copy[1] = (unsigned char *) beg[1];
1598     }
1599    
1600     + if (hard_LC_COLLATE)
1601     + return xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1602     + diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1603     +
1604     if (diff)
1605     return diff;
1606     - return len1 < len2 ? -1 : len1 != len2;
1607     + return len[0] - len[1];
1608     }
1609    
1610     /* Print field N of LINE if it exists and is nonempty, otherwise
1611     @@ -377,11 +601,18 @@
1612    
1613     /* Print the join of LINE1 and LINE2. */
1614    
1615     +#define PUT_TAB_CHAR \
1616     + do \
1617     + { \
1618     + (tab != NULL) ? \
1619     + fwrite(tab, sizeof(char), tablen, stdout) : putchar (' '); \
1620     + } \
1621     + while (0)
1622     +
1623     static void
1624     prjoin (struct line const *line1, struct line const *line2)
1625     {
1626     const struct outlist *outlist;
1627     - char output_separator = tab < 0 ? ' ' : tab;
1628    
1629     outlist = outlist_head.next;
1630     if (outlist)
1631     @@ -397,12 +628,12 @@
1632     if (o->file == 0)
1633     {
1634     if (line1 == &uni_blank)
1635     - {
1636     + {
1637     line = line2;
1638     field = join_field_2;
1639     }
1640     else
1641     - {
1642     + {
1643     line = line1;
1644     field = join_field_1;
1645     }
1646     @@ -416,7 +647,7 @@
1647     o = o->next;
1648     if (o == NULL)
1649     break;
1650     - putchar (output_separator);
1651     + PUT_TAB_CHAR;
1652     }
1653     putchar ('\n');
1654     }
1655     @@ -434,23 +665,23 @@
1656     prfield (join_field_1, line1);
1657     for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
1658     {
1659     - putchar (output_separator);
1660     + PUT_TAB_CHAR;
1661     prfield (i, line1);
1662     }
1663     for (i = join_field_1 + 1; i < line1->nfields; ++i)
1664     {
1665     - putchar (output_separator);
1666     + PUT_TAB_CHAR;
1667     prfield (i, line1);
1668     }
1669    
1670     for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
1671     {
1672     - putchar (output_separator);
1673     + PUT_TAB_CHAR;
1674     prfield (i, line2);
1675     }
1676     for (i = join_field_2 + 1; i < line2->nfields; ++i)
1677     {
1678     - putchar (output_separator);
1679     + PUT_TAB_CHAR;
1680     prfield (i, line2);
1681     }
1682     putchar ('\n');
1683     @@ -859,20 +1090,41 @@
1684    
1685     case 't':
1686     {
1687     - unsigned char newtab = optarg[0];
1688     - if (! newtab)
1689     + char *newtab;
1690     + size_t newtablen;
1691     + if (! optarg[0])
1692     error (EXIT_FAILURE, 0, _("empty tab"));
1693     - if (optarg[1])
1694     + newtab = xstrdup (optarg);
1695     +#if HAVE_MBRTOWC
1696     + if (MB_CUR_MAX > 1)
1697     + {
1698     + mbstate_t state;
1699     +
1700     + memset (&state, 0, sizeof (mbstate_t));
1701     + newtablen = mbrtowc (NULL, newtab,
1702     + strnlen (newtab, MB_LEN_MAX),
1703     + &state);
1704     + if (newtablen == (size_t) 0
1705     + || newtablen == (size_t) -1
1706     + || newtablen == (size_t) -2)
1707     + newtablen = 1;
1708     + }
1709     + else
1710     +#endif
1711     + newtablen = 1;
1712     +
1713     + if (newtablen == 1 && newtab[1])
1714     + {
1715     + if (STREQ (newtab, "\\0"))
1716     + newtab[0] = '\0';
1717     + }
1718     + if (tab != NULL && strcmp (tab, newtab))
1719     {
1720     - if (STREQ (optarg, "\\0"))
1721     - newtab = '\0';
1722     - else
1723     - error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1724     - quote (optarg));
1725     + free (newtab);
1726     + error (EXIT_FAILURE, 0, _("incompatible tabs"));
1727     }
1728     - if (0 <= tab && tab != newtab)
1729     - error (EXIT_FAILURE, 0, _("incompatible tabs"));
1730     tab = newtab;
1731     + tablen = newtablen;
1732     }
1733     break;
1734    
1735     diff -Naur coreutils-6.9.orig/src/pr.c coreutils-6.9/src/pr.c
1736     --- coreutils-6.9.orig/src/pr.c 2007-03-18 21:36:43.000000000 +0000
1737     +++ coreutils-6.9/src/pr.c 2007-04-07 16:59:55.000000000 +0000
1738     @@ -313,6 +313,32 @@
1739    
1740     #include <getopt.h>
1741     #include <sys/types.h>
1742     +
1743     +/* Get MB_LEN_MAX. */
1744     +#include <limits.h>
1745     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1746     + installation; work around this configuration error. */
1747     +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1748     +# define MB_LEN_MAX 16
1749     +#endif
1750     +
1751     +/* Get MB_CUR_MAX. */
1752     +#include <stdlib.h>
1753     +
1754     +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1755     +/* Get mbstate_t, mbrtowc(), wcwidth(). */
1756     +#if HAVE_WCHAR_H
1757     +# include <wchar.h>
1758     +#endif
1759     +
1760     +/* Get iswprint(). -- for wcwidth(). */
1761     +#if HAVE_WCTYPE_H
1762     +# include <wctype.h>
1763     +#endif
1764     +#if !defined iswprint && !HAVE_ISWPRINT
1765     +# define iswprint(wc) 1
1766     +#endif
1767     +
1768     #include "system.h"
1769     #include "error.h"
1770     #include "hard-locale.h"
1771     @@ -324,6 +350,18 @@
1772     #include "strftime.h"
1773     #include "xstrtol.h"
1774    
1775     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1776     +#if HAVE_MBRTOWC && defined mbstate_t
1777     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1778     +#endif
1779     +
1780     +#ifndef HAVE_DECL_WCWIDTH
1781     +"this configure-time declaration test was not run"
1782     +#endif
1783     +#if !HAVE_DECL_WCWIDTH
1784     +extern int wcwidth ();
1785     +#endif
1786     +
1787     /* The official name of this program (e.g., no `g' prefix). */
1788     #define PROGRAM_NAME "pr"
1789    
1790     @@ -416,7 +454,20 @@
1791    
1792     #define NULLCOL (COLUMN *)0
1793    
1794     -static int char_to_clump (char c);
1795     +/* Function pointers to switch functions for single byte locale or for
1796     + multibyte locale. If multibyte functions do not exist in your system,
1797     + these pointers always point to the function for single byte locale. */
1798     +static void (*print_char) (char c);
1799     +static int (*char_to_clump) (char c);
1800     +
1801     +/* Functions for single byte locale. */
1802     +static void print_char_single (char c);
1803     +static int char_to_clump_single (char c);
1804     +
1805     +/* Functions for multibyte locale. */
1806     +static void print_char_multi (char c);
1807     +static int char_to_clump_multi (char c);
1808     +
1809     static bool read_line (COLUMN *p);
1810     static bool print_page (void);
1811     static bool print_stored (COLUMN *p);
1812     @@ -426,6 +477,7 @@
1813     static void pad_across_to (int position);
1814     static void add_line_number (COLUMN *p);
1815     static void getoptarg (char *arg, char switch_char, char *character,
1816     + int *character_length, int *character_width,
1817     int *number);
1818     void usage (int status);
1819     static void print_files (int number_of_files, char **av);
1820     @@ -440,7 +492,6 @@
1821     static void pad_down (int lines);
1822     static void read_rest_of_line (COLUMN *p);
1823     static void skip_read (COLUMN *p, int column_number);
1824     -static void print_char (char c);
1825     static void cleanup (void);
1826     static void print_sep_string (void);
1827     static void separator_string (const char *optarg_S);
1828     @@ -455,7 +506,7 @@
1829     we store the leftmost columns contiguously in buff.
1830     To print a line from buff, get the index of the first character
1831     from line_vector[i], and print up to line_vector[i + 1]. */
1832     -static char *buff;
1833     +static unsigned char *buff;
1834    
1835     /* Index of the position in buff where the next character
1836     will be stored. */
1837     @@ -559,7 +610,7 @@
1838     static bool untabify_input = false;
1839    
1840     /* (-e) The input tab character. */
1841     -static char input_tab_char = '\t';
1842     +static char input_tab_char[MB_LEN_MAX] = "\t";
1843    
1844     /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1845     where the leftmost column is 1. */
1846     @@ -569,7 +620,10 @@
1847     static bool tabify_output = false;
1848    
1849     /* (-i) The output tab character. */
1850     -static char output_tab_char = '\t';
1851     +static char output_tab_char[MB_LEN_MAX] = "\t";
1852     +
1853     +/* (-i) The byte length of output tab character. */
1854     +static int output_tab_char_length = 1;
1855    
1856     /* (-i) The width of the output tab. */
1857     static int chars_per_output_tab = 8;
1858     @@ -643,7 +697,13 @@
1859     static bool numbered_lines = false;
1860    
1861     /* (-n) Character which follows each line number. */
1862     -static char number_separator = '\t';
1863     +static char number_separator[MB_LEN_MAX] = "\t";
1864     +
1865     +/* (-n) The byte length of the character which follows each line number. */
1866     +static int number_separator_length = 1;
1867     +
1868     +/* (-n) The character width of the character which follows each line number. */
1869     +static int number_separator_width = 0;
1870    
1871     /* (-n) line counting starts with 1st line of input file (not with 1st
1872     line of 1st page printed). */
1873     @@ -696,6 +756,7 @@
1874     -a|COLUMN|-m is a `space' and with the -J option a `tab'. */
1875     static char *col_sep_string = "";
1876     static int col_sep_length = 0;
1877     +static int col_sep_width = 0;
1878     static char *column_separator = " ";
1879     static char *line_separator = "\t";
1880    
1881     @@ -852,6 +913,13 @@
1882     col_sep_length = (int) strlen (optarg_S);
1883     col_sep_string = xmalloc (col_sep_length + 1);
1884     strcpy (col_sep_string, optarg_S);
1885     +
1886     +#if HAVE_MBRTOWC
1887     + if (MB_CUR_MAX > 1)
1888     + col_sep_width = mbswidth (col_sep_string, 0);
1889     + else
1890     +#endif
1891     + col_sep_width = col_sep_length;
1892     }
1893    
1894     int
1895     @@ -877,6 +945,21 @@
1896    
1897     atexit (close_stdout);
1898    
1899     +/* Define which functions are used, the ones for single byte locale or the ones
1900     + for multibyte locale. */
1901     +#if HAVE_MBRTOWC
1902     + if (MB_CUR_MAX > 1)
1903     + {
1904     + print_char = print_char_multi;
1905     + char_to_clump = char_to_clump_multi;
1906     + }
1907     + else
1908     +#endif
1909     + {
1910     + print_char = print_char_single;
1911     + char_to_clump = char_to_clump_single;
1912     + }
1913     +
1914     n_files = 0;
1915     file_names = (argc > 1
1916     ? xmalloc ((argc - 1) * sizeof (char *))
1917     @@ -949,8 +1032,12 @@
1918     break;
1919     case 'e':
1920     if (optarg)
1921     - getoptarg (optarg, 'e', &input_tab_char,
1922     - &chars_per_input_tab);
1923     + {
1924     + int dummy_length, dummy_width;
1925     +
1926     + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1927     + &dummy_width, &chars_per_input_tab);
1928     + }
1929     /* Could check tab width > 0. */
1930     untabify_input = true;
1931     break;
1932     @@ -963,8 +1050,12 @@
1933     break;
1934     case 'i':
1935     if (optarg)
1936     - getoptarg (optarg, 'i', &output_tab_char,
1937     - &chars_per_output_tab);
1938     + {
1939     + int dummy_width;
1940     +
1941     + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1942     + &dummy_width, &chars_per_output_tab);
1943     + }
1944     /* Could check tab width > 0. */
1945     tabify_output = true;
1946     break;
1947     @@ -991,8 +1082,8 @@
1948     case 'n':
1949     numbered_lines = true;
1950     if (optarg)
1951     - getoptarg (optarg, 'n', &number_separator,
1952     - &chars_per_number);
1953     + getoptarg (optarg, 'n', number_separator, &number_separator_length,
1954     + &number_separator_width, &chars_per_number);
1955     break;
1956     case 'N':
1957     skip_count = false;
1958     @@ -1031,7 +1122,7 @@
1959     old_s = false;
1960     /* Reset an additional input of -s, -S dominates -s */
1961     col_sep_string = "";
1962     - col_sep_length = 0;
1963     + col_sep_length = col_sep_width = 0;
1964     use_col_separator = true;
1965     if (optarg)
1966     separator_string (optarg);
1967     @@ -1188,10 +1279,45 @@
1968     a number. */
1969    
1970     static void
1971     -getoptarg (char *arg, char switch_char, char *character, int *number)
1972     +getoptarg (char *arg, char switch_char, char *character, int *character_length,
1973     + int *character_width, int *number)
1974     {
1975     if (!ISDIGIT (*arg))
1976     - *character = *arg++;
1977     + {
1978     +#ifdef HAVE_MBRTOWC
1979     + if (MB_CUR_MAX > 1) /* for multibyte locale. */
1980     + {
1981     + wchar_t wc;
1982     + size_t mblength;
1983     + int width;
1984     + mbstate_t state = {'\0'};
1985     +
1986     + mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
1987     +
1988     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1989     + {
1990     + *character_length = 1;
1991     + *character_width = 1;
1992     + }
1993     + else
1994     + {
1995     + *character_length = (mblength < 1) ? 1 : mblength;
1996     + width = wcwidth (wc);
1997     + *character_width = (width < 0) ? 0 : width;
1998     + }
1999     +
2000     + strncpy (character, arg, *character_length);
2001     + arg += *character_length;
2002     + }
2003     + else /* for single byte locale. */
2004     +#endif
2005     + {
2006     + *character = *arg++;
2007     + *character_length = 1;
2008     + *character_width = 1;
2009     + }
2010     + }
2011     +
2012     if (*arg)
2013     {
2014     long int tmp_long;
2015     @@ -1256,7 +1382,7 @@
2016     else
2017     col_sep_string = column_separator;
2018    
2019     - col_sep_length = 1;
2020     + col_sep_length = col_sep_width = 1;
2021     use_col_separator = true;
2022     }
2023     /* It's rather pointless to define a TAB separator with column
2024     @@ -1287,11 +1413,11 @@
2025     TAB_WIDTH (chars_per_input_tab, chars_per_number); */
2026    
2027     /* Estimate chars_per_text without any margin and keep it constant. */
2028     - if (number_separator == '\t')
2029     + if (number_separator[0] == '\t')
2030     number_width = chars_per_number +
2031     TAB_WIDTH (chars_per_default_tab, chars_per_number);
2032     else
2033     - number_width = chars_per_number + 1;
2034     + number_width = chars_per_number + number_separator_width;
2035    
2036     /* The number is part of the column width unless we are
2037     printing files in parallel. */
2038     @@ -1306,7 +1432,7 @@
2039     }
2040    
2041     chars_per_column = (chars_per_line - chars_used_by_number -
2042     - (columns - 1) * col_sep_length) / columns;
2043     + (columns - 1) * col_sep_width) / columns;
2044    
2045     if (chars_per_column < 1)
2046     error (EXIT_FAILURE, 0, _("page width too narrow"));
2047     @@ -1431,7 +1557,7 @@
2048    
2049     /* Enlarge p->start_position of first column to use the same form of
2050     padding_not_printed with all columns. */
2051     - h = h + col_sep_length;
2052     + h = h + col_sep_width;
2053    
2054     /* This loop takes care of all but the rightmost column. */
2055    
2056     @@ -1465,7 +1591,7 @@
2057     }
2058     else
2059     {
2060     - h = h_next + col_sep_length;
2061     + h = h_next + col_sep_width;
2062     h_next = h + chars_per_column;
2063     }
2064     }
2065     @@ -1755,9 +1881,9 @@
2066     align_column (COLUMN *p)
2067     {
2068     padding_not_printed = p->start_position;
2069     - if (padding_not_printed - col_sep_length > 0)
2070     + if (padding_not_printed - col_sep_width > 0)
2071     {
2072     - pad_across_to (padding_not_printed - col_sep_length);
2073     + pad_across_to (padding_not_printed - col_sep_width);
2074     padding_not_printed = ANYWHERE;
2075     }
2076    
2077     @@ -2028,13 +2154,13 @@
2078     /* May be too generous. */
2079     buff = X2REALLOC (buff, &buff_allocated);
2080     }
2081     - buff[buff_current++] = c;
2082     + buff[buff_current++] = (unsigned char) c;
2083     }
2084    
2085     static void
2086     add_line_number (COLUMN *p)
2087     {
2088     - int i;
2089     + int i, j;
2090     char *s;
2091     int left_cut;
2092    
2093     @@ -2057,22 +2183,24 @@
2094     /* Tabification is assumed for multiple columns, also for n-separators,
2095     but `default n-separator = TAB' hasn't been given priority over
2096     equal column_width also specified by POSIX. */
2097     - if (number_separator == '\t')
2098     + if (number_separator[0] == '\t')
2099     {
2100     i = number_width - chars_per_number;
2101     while (i-- > 0)
2102     (p->char_func) (' ');
2103     }
2104     else
2105     - (p->char_func) (number_separator);
2106     + for (j = 0; j < number_separator_length; j++)
2107     + (p->char_func) (number_separator[j]);
2108     }
2109     else
2110     /* To comply with POSIX, we avoid any expansion of default TAB
2111     separator with a single column output. No column_width requirement
2112     has to be considered. */
2113     {
2114     - (p->char_func) (number_separator);
2115     - if (number_separator == '\t')
2116     + for (j = 0; j < number_separator_length; j++)
2117     + (p->char_func) (number_separator[j]);
2118     + if (number_separator[0] == '\t')
2119     output_position = POS_AFTER_TAB (chars_per_output_tab,
2120     output_position);
2121     }
2122     @@ -2233,7 +2361,7 @@
2123     while (goal - h_old > 1
2124     && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2125     {
2126     - putchar (output_tab_char);
2127     + fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2128     h_old = h_new;
2129     }
2130     while (++h_old <= goal)
2131     @@ -2253,6 +2381,7 @@
2132     {
2133     char *s;
2134     int l = col_sep_length;
2135     + int not_space_flag;
2136    
2137     s = col_sep_string;
2138    
2139     @@ -2266,6 +2395,7 @@
2140     {
2141     for (; separators_not_printed > 0; --separators_not_printed)
2142     {
2143     + not_space_flag = 0;
2144     while (l-- > 0)
2145     {
2146     /* 3 types of sep_strings: spaces only, spaces and chars,
2147     @@ -2279,12 +2409,15 @@
2148     }
2149     else
2150     {
2151     + not_space_flag = 1;
2152     if (spaces_not_printed > 0)
2153     print_white_space ();
2154     putchar (*s++);
2155     - ++output_position;
2156     }
2157     }
2158     + if (not_space_flag)
2159     + output_position += col_sep_width;
2160     +
2161     /* sep_string ends with some spaces */
2162     if (spaces_not_printed > 0)
2163     print_white_space ();
2164     @@ -2312,7 +2445,7 @@
2165     required number of tabs and spaces. */
2166    
2167     static void
2168     -print_char (char c)
2169     +print_char_single (char c)
2170     {
2171     if (tabify_output)
2172     {
2173     @@ -2336,6 +2469,74 @@
2174     putchar (c);
2175     }
2176    
2177     +#ifdef HAVE_MBRTOWC
2178     +static void /* Multibyte-locale counterpart of print_char_single. */
2179     +print_char_multi (char c) /* C is the next byte of output. */
2180     +{
2181     + static size_t mbc_pos = 0; /* Number of bytes pending in MBC. */
2182     + static unsigned char mbc[MB_LEN_MAX] = {'\0'}; /* Bytes not yet decoded. */
2183     + static mbstate_t state = {'\0'}; /* Shift state at the start of MBC. */
2184     + mbstate_t state_bak;
2185     + wchar_t wc;
2186     + size_t mblength;
2187     + int width;
2188     + /* When output is not tabified, C is written unchanged at the bottom. */
2189     + if (tabify_output)
2190     + {
2191     + state_bak = state;
2192     + mbc[mbc_pos++] = (unsigned char)c;
2193     + /* Decode inside the loop so bytes left after an invalid one are re-examined. */
2194     + while (mbc_pos > 0)
2195     + {
2196     + mblength = mbrtowc (&wc, (const char *) mbc, mbc_pos, &state);
2197     + switch (mblength)
2198     + {
2199     + case (size_t)-2: /* Incomplete character: keep the bytes, wait for more. */
2200     + state = state_bak;
2201     + return;
2202     +
2203     + case (size_t)-1: /* Invalid byte: emit it alone, counted as one column. */
2204     + state = state_bak;
2205     + ++output_position;
2206     + putchar (mbc[0]);
2207     + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2208     + --mbc_pos;
2209     + break;
2210     +
2211     + case 0: /* mbrtowc reports a null wide character as length 0. */
2212     + mblength = 1;
2213     + /* Fall through. */
2214     + default:
2215     + if (wc == L' ')
2216     + {
2217     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2218     + mbc_pos -= mblength;
2219     + ++spaces_not_printed; /* Spaces are deferred, not written here. */
2220     + return;
2221     + }
2222     + else if (spaces_not_printed > 0)
2223     + print_white_space ();
2224     +
2225     + /* Nonprintables are assumed to have width 0, except L'\b'. */
2226     + if ((width = wcwidth (wc)) < 1)
2227     + {
2228     + if (wc == L'\b')
2229     + --output_position;
2230     + }
2231     + else
2232     + output_position += width;
2233     +
2234     + fwrite (mbc, sizeof(char), mblength, stdout);
2235     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2236     + mbc_pos -= mblength;
2237     + }
2238     + }
2239     + return;
2240     + }
2241     + putchar (c);
2242     +}
2243     +#endif
2244     +
2245     /* Skip to page PAGE before printing.
2246     PAGE may be larger than total number of pages. */
2247    
2248     @@ -2516,9 +2717,9 @@
2249     align_empty_cols = false;
2250     }
2251    
2252     - if (padding_not_printed - col_sep_length > 0)
2253     + if (padding_not_printed - col_sep_width > 0)
2254     {
2255     - pad_across_to (padding_not_printed - col_sep_length);
2256     + pad_across_to (padding_not_printed - col_sep_width);
2257     padding_not_printed = ANYWHERE;
2258     }
2259    
2260     @@ -2619,9 +2820,9 @@
2261     }
2262     }
2263    
2264     - if (padding_not_printed - col_sep_length > 0)
2265     + if (padding_not_printed - col_sep_width > 0)
2266     {
2267     - pad_across_to (padding_not_printed - col_sep_length);
2268     + pad_across_to (padding_not_printed - col_sep_width);
2269     padding_not_printed = ANYWHERE;
2270     }
2271    
2272     @@ -2634,8 +2835,8 @@
2273     if (spaces_not_printed == 0)
2274     {
2275     output_position = p->start_position + end_vector[line];
2276     - if (p->start_position - col_sep_length == chars_per_margin)
2277     - output_position -= col_sep_length;
2278     + if (p->start_position - col_sep_width == chars_per_margin)
2279     + output_position -= col_sep_width;
2280     }
2281    
2282     return true;
2283     @@ -2654,7 +2855,7 @@
2284     number of characters is 1.) */
2285    
2286     static int
2287     -char_to_clump (char c)
2288     +char_to_clump_single (char c)
2289     {
2290     unsigned char uc = c;
2291     char *s = clump_buff;
2292     @@ -2664,10 +2865,10 @@
2293     int chars;
2294     int chars_per_c = 8;
2295    
2296     - if (c == input_tab_char)
2297     + if (c == input_tab_char[0])
2298     chars_per_c = chars_per_input_tab;
2299    
2300     - if (c == input_tab_char || c == '\t')
2301     + if (c == input_tab_char[0] || c == '\t')
2302     {
2303     width = TAB_WIDTH (chars_per_c, input_position);
2304    
2305     @@ -2738,6 +2939,154 @@
2306     return chars;
2307     }
2308    
2309     +#ifdef HAVE_MBRTOWC
2310     +static int /* Returns the number of bytes written, starting at clump_buff. */
2311     +char_to_clump_multi (char c) /* C is the next input byte. */
2312     +{
2313     + static size_t mbc_pos = 0; /* Number of bytes pending in MBC. */
2314     + static char mbc[MB_LEN_MAX] = {'\0'}; /* Bytes not yet converted. */
2315     + static mbstate_t state = {'\0'}; /* Shift state at the start of MBC. */
2316     + mbstate_t state_bak;
2317     + wchar_t wc;
2318     + size_t mblength;
2319     + int wc_width;
2320     + register char *s = clump_buff; /* Same type as in char_to_clump_single. */
2321     + register int i, j;
2322     + char esc_buff[4]; /* Three octal digits plus the terminating NUL. */
2323     + int width = 0; /* Columns by which input_position advances. */
2324     + int chars = 0; /* Bytes written through S. */
2325     + int chars_per_c = 8;
2326     +
2327     + state_bak = state;
2328     + mbc[mbc_pos++] = c;
2329     +
2330     + while (mbc_pos > 0)
2331     + {
2332     + /* Decode on every pass: after an invalid byte WC and MBLENGTH are stale. */
2333     + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2334     + switch (mblength)
2335     + {
2336     + case (size_t)-2: /* Incomplete: report what was produced so far. */
2337     + state = state_bak;
2338     + input_position += width;
2339     + return chars;
2340     +
2341     + case (size_t)-1: /* Invalid byte: consume exactly one byte. */
2342     + state = state_bak;
2343     + mblength = 1;
2344     +
2345     + if (use_esc_sequence || use_cntrl_prefix)
2346     + {
2347     + width += 4;
2348     + chars += 4;
2349     + *s++ = '\\';
2350     + sprintf (esc_buff, "%03o", (unsigned char) mbc[0]);
2351     + for (i = 0; i <= 2; ++i)
2352     + *s++ = esc_buff[i];
2353     + }
2354     + else
2355     + {
2356     + width += 1;
2357     + chars += 1;
2358     + *s++ = mbc[0];
2359     + }
2360     + break;
2361     +
2362     + case 0:
2363     + mblength = 1;
2364     + /* Fall through */
2365     +
2366     + default:
2367     + if (memcmp (mbc, input_tab_char, mblength) == 0)
2368     + chars_per_c = chars_per_input_tab;
2369     +
2370     + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2371     + {
2372     + int width_inc;
2373     +
2374     + width_inc = TAB_WIDTH (chars_per_c, input_position);
2375     + width += width_inc;
2376     +
2377     + if (untabify_input)
2378     + {
2379     + for (i = width_inc; i; --i)
2380     + *s++ = ' ';
2381     + chars += width_inc;
2382     + }
2383     + else
2384     + {
2385     + for (i = 0; i < mblength; i++)
2386     + *s++ = mbc[i];
2387     + chars += mblength;
2388     + }
2389     + }
2390     + else if ((wc_width = wcwidth (wc)) < 1)
2391     + {
2392     + if (use_esc_sequence)
2393     + {
2394     + for (i = 0; i < mblength; i++) /* One escape per byte of the character. */
2395     + {
2396     + width += 4;
2397     + chars += 4;
2398     + *s++ = '\\';
2399     + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2400     + for (j = 0; j <= 2; ++j)
2401     + *s++ = esc_buff[j];
2402     + }
2403     + }
2404     + else if (use_cntrl_prefix)
2405     + {
2406     + if (wc < 0200)
2407     + {
2408     + width += 2;
2409     + chars += 2;
2410     + *s++ = '^';
2411     + *s++ = wc ^ 0100;
2412     + }
2413     + else
2414     + {
2415     + for (i = 0; i < mblength; i++) /* One escape per byte of the character. */
2416     + {
2417     + width += 4;
2418     + chars += 4;
2419     + *s++ = '\\';
2420     + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]);
2421     + for (j = 0; j <= 2; ++j)
2422     + *s++ = esc_buff[j];
2423     + }
2424     + }
2425     + }
2426     + else if (wc == L'\b')
2427     + {
2428     + width += -1;
2429     + chars += 1;
2430     + *s++ = c;
2431     + }
2432     + else
2433     + {
2434     + width += 0;
2435     + chars += mblength;
2436     + for (i = 0; i < mblength; i++)
2437     + *s++ = mbc[i];
2438     + }
2439     + }
2440     + else
2441     + {
2442     + width += wc_width;
2443     + chars += mblength;
2444     + for (i = 0; i < mblength; i++)
2445     + *s++ = mbc[i];
2446     + }
2447     + }
2448     + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2449     + mbc_pos -= mblength;
2450     + }
2451     +
2452     + input_position += width;
2453     + return chars;
2454     +}
2455     +#endif
2456     +
2457     /* We've just printed some files and need to clean up things before
2458     looking for more options and printing the next batch of files.
2459    
2460     diff -Naur coreutils-6.9.orig/src/sort.c coreutils-6.9/src/sort.c
2461     --- coreutils-6.9.orig/src/sort.c 2007-03-18 21:36:43.000000000 +0000
2462     +++ coreutils-6.9/src/sort.c 2007-04-07 17:11:06.000000000 +0000
2463     @@ -23,10 +23,18 @@
2464    
2465     #include <config.h>
2466    
2467     +#include <assert.h>
2468     #include <getopt.h>
2469     #include <sys/types.h>
2470     #include <sys/wait.h>
2471     #include <signal.h>
2472     +#if HAVE_WCHAR_H
2473     +# include <wchar.h>
2474     +#endif
2475     +/* Get isw* functions. */
2476     +#if HAVE_WCTYPE_H
2477     +# include <wctype.h>
2478     +#endif
2479     #include "system.h"
2480     #include "argmatch.h"
2481     #include "error.h"
2482     @@ -116,14 +124,38 @@
2483     /* Thousands separator; if -1, then there isn't one. */
2484     static int thousands_sep;
2485    
2486     +static int force_general_numcompare = 0;
2487     +
2488     /* Nonzero if the corresponding locales are hard. */
2489     static bool hard_LC_COLLATE;
2490     -#if HAVE_NL_LANGINFO
2491     +#if HAVE_LANGINFO_CODESET
2492     static bool hard_LC_TIME;
2493     #endif
2494    
2495     #define NONZERO(x) ((x) != 0)
2496    
2497     +/* Store in MBLENGTH the byte length (at least 1) of the multibyte character at PTR, looking no further than LIM. */
2498     +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2499     + do \
2500     + { \
2501     + wchar_t wc; \
2502     + mbstate_t state_bak; \
2503     + /* STATE is put back when the bytes are invalid or incomplete. */ \
2504     + state_bak = (STATE); \
2505     + (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \
2506     + \
2507     + switch (MBLENGTH) \
2508     + { \
2509     + case (size_t)-1: \
2510     + case (size_t)-2: \
2511     + (STATE) = state_bak; \
2512     + /* Fall through. */ \
2513     + case 0: \
2514     + (MBLENGTH) = 1; \
2515     + } \
2516     + } \
2517     + while (0)
2518     +
2519     /* The kind of blanks for '-b' to skip in various options. */
2520     enum blanktype { bl_start, bl_end, bl_both };
2521    
2522     @@ -261,13 +293,11 @@
2523     they were read if all keys compare equal. */
2524     static bool stable;
2525    
2526     -/* If TAB has this value, blanks separate fields. */
2527     -enum { TAB_DEFAULT = CHAR_MAX + 1 };
2528     -
2529     -/* Tab character separating fields. If TAB_DEFAULT, then fields are
2530     +/* Tab character separating fields. If tab_length is 0, then fields are
2531     separated by the empty string between a non-blank character and a blank
2532     character. */
2533     -static int tab = TAB_DEFAULT;
2534     +static char tab[MB_LEN_MAX + 1];
2535     +static size_t tab_length = 0;
2536    
2537     /* Flag to remove consecutive duplicate lines from the output.
2538     Only the last of a sequence of equal lines will be output. */
2539     @@ -639,6 +669,44 @@
2540     update_proc (pid);
2541     }
2542    
2543     +/* Function pointers, each declared with the signature shared by the *_uni and *_mb functions of the same name. NOTE(review): where they are assigned is not shown here - presumably chosen by locale at startup; confirm. */
2544     +static void
2545     +(*inittables) (void); /* inittables_uni or inittables_mb */
2546     +static char *
2547     +(*begfield) (const struct line*, const struct keyfield *); /* begfield_uni or begfield_mb */
2548     +static char *
2549     +(*limfield) (const struct line*, const struct keyfield *); /* limfield_uni or limfield_mb */
2550     +static int
2551     +(*getmonth) (char const *, size_t); /* getmonth_uni or getmonth_mb */
2552     +static int
2553     +(*keycompare) (const struct line *, const struct line *); /* keycompare_uni or keycompare_mb */
2554     +static int
2555     +(*numcompare) (const char *, const char *); /* numcompare_uni or numcompare_mb */
2556     +
2557     +/* Return nonzero if the multibyte character starting at STR (at most LEN bytes
2558     + are examined) is a blank. Store its byte length, never less than 1, in *LENGTH. */
2559     +#if HAVE_MBRTOWC
2560     +static int
2561     +ismbblank (const char *str, size_t len, size_t *length)
2562     +{
2563     + wchar_t wch;
2564     + size_t nbytes;
2565     + mbstate_t ps;
2566     +
2567     + memset (&ps, '\0', sizeof ps);
2568     + nbytes = mbrtowc (&wch, str, len, &ps);
2569     +
2570     + if (nbytes != (size_t)-1 && nbytes != (size_t)-2)
2571     + {
2572     + *length = nbytes ? nbytes : 1;
2573     + return iswblank (wch);
2574     + }
2575     + /* Invalid or incomplete bytes count as one non-blank byte. */
2576     + *length = 1;
2577     + return 0;
2578     +}
2579     +#endif
2580     +
2581     /* Clean up any remaining temporary files. */
2582    
2583     static void
2584     @@ -978,7 +1046,7 @@
2585     free (node);
2586     }
2587    
2588     -#if HAVE_NL_LANGINFO
2589     +#if HAVE_LANGINFO_CODESET
2590    
2591     static int
2592     struct_month_cmp (const void *m1, const void *m2)
2593     @@ -993,7 +1061,7 @@
2594     /* Initialize the character class tables. */
2595    
2596     static void
2597     -inittables (void)
2598     +inittables_uni (void)
2599     {
2600     size_t i;
2601    
2602     @@ -1005,7 +1073,7 @@
2603     fold_toupper[i] = toupper (i);
2604     }
2605    
2606     -#if HAVE_NL_LANGINFO
2607     +#if HAVE_LANGINFO_CODESET
2608     /* If we're not in the "C" locale, read different names for months. */
2609     if (hard_LC_TIME)
2610     {
2611     @@ -1031,6 +1099,64 @@
2612     #endif
2613     }
2614    
2615     +#if HAVE_MBRTOWC
2616     +static void /* Multibyte variant: fill monthtab from the locale's month abbreviations. */
2617     +inittables_mb (void)
2618     +{
2619     + int i, j, k, l;
2620     + char *name, *s;
2621     + size_t s_len, mblength;
2622     + char mbc[MB_LEN_MAX];
2623     + wchar_t wc, pwc;
2624     + mbstate_t state_mb, state_wc;
2625     +
2626     + for (i = 0; i < MONTHS_PER_YEAR; i++)
2627     + {
2628     + s = (char *) nl_langinfo (ABMON_1 + i);
2629     + s_len = strlen (s);
2630     + monthtab[i].name = name = (char *) xmalloc (s_len * MB_LEN_MAX + 1); /* An upper-cased character may need more bytes than its source. */
2631     + monthtab[i].val = i + 1;
2632     +
2633     + memset (&state_mb, '\0', sizeof (mbstate_t));
2634     + memset (&state_wc, '\0', sizeof (mbstate_t));
2635     + /* Skip leading blanks of the abbreviation. */
2636     + for (j = 0; j < s_len;)
2637     + {
2638     + if (!ismbblank (s + j, s_len - j, &mblength))
2639     + break;
2640     + j += mblength;
2641     + }
2642     + /* Copy the rest into NAME, converting each character to upper case. */
2643     + for (k = 0; j < s_len;)
2644     + {
2645     + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2646     + assert (mblength != (size_t)-1 && mblength != (size_t)-2);
2647     + if (mblength == 0)
2648     + break;
2649     +
2650     + pwc = towupper (wc);
2651     + if (pwc == wc)
2652     + {
2653     + memcpy (mbc, s + j, mblength); /* Already upper case: keep the original bytes. */
2654     + j += mblength;
2655     + }
2656     + else
2657     + {
2658     + j += mblength;
2659     + mblength = wcrtomb (mbc, pwc, &state_wc);
2660     + assert (mblength != (size_t)0 && mblength != (size_t)-1);
2661     + }
2662     +
2663     + for (l = 0; l < mblength; l++)
2664     + name[k++] = mbc[l];
2665     + }
2666     + name[k] = '\0';
2667     + }
2668     + qsort ((void *) monthtab, MONTHS_PER_YEAR,
2669     + sizeof (struct month), struct_month_cmp);
2670     +}
2671     +#endif
2672     +
2673     /* Specify the amount of main memory to use when sorting. */
2674     static void
2675     specify_sort_size (char const *s)
2676     @@ -1241,7 +1367,7 @@
2677     by KEY in LINE. */
2678    
2679     static char *
2680     -begfield (const struct line *line, const struct keyfield *key)
2681     +begfield_uni (const struct line *line, const struct keyfield *key)
2682     {
2683     char *ptr = line->text, *lim = ptr + line->length - 1;
2684     size_t sword = key->sword;
2685     @@ -1251,10 +1377,10 @@
2686     /* The leading field separator itself is included in a field when -t
2687     is absent. */
2688    
2689     - if (tab != TAB_DEFAULT)
2690     + if (tab_length)
2691     while (ptr < lim && sword--)
2692     {
2693     - while (ptr < lim && *ptr != tab)
2694     + while (ptr < lim && *ptr != tab[0])
2695     ++ptr;
2696     if (ptr < lim)
2697     ++ptr;
2698     @@ -1282,11 +1408,70 @@
2699     return ptr;
2700     }
2701    
2702     +#if HAVE_MBRTOWC
2703     +static char * /* Multibyte counterpart of begfield_uni; returns a pointer into LINE's text. */
2704     +begfield_mb (const struct line *line, const struct keyfield *key)
2705     +{
2706     + int i;
2707     + char *ptr = line->text, *lim = ptr + line->length - 1;
2708     + size_t sword = key->sword;
2709     + size_t schar = key->schar;
2710     + size_t mblength;
2711     + mbstate_t state;
2712     + /* Start every scan from the initial conversion state. */
2713     + memset (&state, '\0', sizeof(mbstate_t));
2714     + /* Skip SWORD fields: separated by TAB when tab_length is nonzero, by blanks otherwise. */
2715     + if (tab_length)
2716     + while (ptr < lim && sword--)
2717     + {
2718     + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2719     + {
2720     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2721     + ptr += mblength;
2722     + }
2723     + if (ptr < lim) /* Step over the separator character itself. */
2724     + {
2725     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2726     + ptr += mblength;
2727     + }
2728     + }
2729     + else
2730     + while (ptr < lim && sword--)
2731     + {
2732     + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2733     + ptr += mblength;
2734     + if (ptr < lim)
2735     + {
2736     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2737     + ptr += mblength;
2738     + }
2739     + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2740     + ptr += mblength;
2741     + }
2742     + /* Optionally skip blanks that precede the key. */
2743     + if (key->skipsblanks)
2744     + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2745     + ptr += mblength;
2746     + /* Advance by SCHAR characters, stopping before any that would pass LIM. */
2747     + for (i = 0; i < schar; i++)
2748     + {
2749     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2750     +
2751     + if (ptr + mblength > lim)
2752     + break;
2753     + else
2754     + ptr += mblength;
2755     + }
2756     +
2757     + return ptr;
2758     +}
2759     +#endif
2760     +
2761     /* Return the limit of (a pointer to the first character after) the field
2762     in LINE specified by KEY. */
2763    
2764     static char *
2765     -limfield (const struct line *line, const struct keyfield *key)
2766     +limfield_uni (const struct line *line, const struct keyfield *key)
2767     {
2768     char *ptr = line->text, *lim = ptr + line->length - 1;
2769     size_t eword = key->eword, echar = key->echar;
2770     @@ -1299,10 +1484,10 @@
2771     `beginning' is the first character following the delimiting TAB.
2772     Otherwise, leave PTR pointing at the first `blank' character after
2773     the preceding field. */
2774     - if (tab != TAB_DEFAULT)
2775     + if (tab_length)
2776     while (ptr < lim && eword--)
2777     {
2778     - while (ptr < lim && *ptr != tab)
2779     + while (ptr < lim && *ptr != tab[0])
2780     ++ptr;
2781     if (ptr < lim && (eword | echar))
2782     ++ptr;
2783     @@ -1348,10 +1533,10 @@
2784     */
2785    
2786     /* Make LIM point to the end of (one byte past) the current field. */
2787     - if (tab != TAB_DEFAULT)
2788     + if (tab_length)
2789     {
2790     char *newlim;
2791     - newlim = memchr (ptr, tab, lim - ptr);
2792     + newlim = memchr (ptr, tab[0], lim - ptr);
2793     if (newlim)
2794     lim = newlim;
2795     }
2796     @@ -1384,6 +1569,107 @@
2797     return ptr;
2798     }
2799    
2800     +#if HAVE_MBRTOWC
2801     +static char * /* Multibyte counterpart of limfield_uni; returns a pointer into LINE's text. */
2802     +limfield_mb (const struct line *line, const struct keyfield *key)
2803     +{
2804     + char *ptr = line->text, *lim = ptr + line->length - 1;
2805     + size_t eword = key->eword, echar = key->echar;
2806     + int i;
2807     + size_t mblength;
2808     + mbstate_t state;
2809     +
2810     + memset (&state, '\0', sizeof(mbstate_t));
2811     + /* Skip EWORD fields: separated by TAB when tab_length is nonzero, by blanks otherwise. */
2812     + if (tab_length)
2813     + while (ptr < lim && eword--)
2814     + {
2815     + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2816     + {
2817     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2818     + ptr += mblength;
2819     + }
2820     + if (ptr < lim && (eword | echar)) /* Step over the separator unless this is the last one and ECHAR is 0. */
2821     + {
2822     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2823     + ptr += mblength;
2824     + }
2825     + }
2826     + else
2827     + while (ptr < lim && eword--)
2828     + {
2829     + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2830     + ptr += mblength;
2831     + if (ptr < lim)
2832     + {
2833     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2834     + ptr += mblength;
2835     + }
2836     + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2837     + ptr += mblength;
2838     + }
2839     +
2840     + /* The next section is compiled only when POSIX_UNSPECIFIED is defined. */
2841     +# ifdef POSIX_UNSPECIFIED
2842     + /* Make LIM point to the end of (one byte past) the current field. */
2843     + if (tab_length)
2844     + {
2845     + char *p;
2846     +
2847     + /* Narrow LIM to the first separator at or after PTR; leave it alone when there is none. */
2848     + for (p = ptr; p < lim;)
2849     + {
2850     + if (memcmp (p, tab, tab_length) == 0)
2851     + {
2852     + lim = p;
2853     + break;
2854     + }
2855     +
2856     + GET_BYTELEN_OF_CHAR (lim, p, mblength, state);
2857     + p += mblength;
2858     + }
2859     + }
2860     + else
2861     + {
2862     + char *newlim;
2863     + newlim = ptr;
2864     + /* Only NEWLIM moves here; PTR must keep pointing at the start of the field. */
2865     + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
2866     + newlim += mblength;
2867     + if (newlim < lim)
2868     + {
2869     + GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
2870     + newlim += mblength;
2871     + }
2872     + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
2873     + newlim += mblength;
2874     + lim = newlim;
2875     + }
2876     +# endif
2877     +
2878     + /* If we're skipping leading blanks, don't start counting characters
2879     + * until after skipping past any leading blanks. */
2880     + if (key->skipsblanks)
2881     + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2882     + ptr += mblength;
2883     +
2884     + memset (&state, '\0', sizeof(mbstate_t));
2885     +
2886     + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2887     + for (i = 0; i < echar; i++)
2888     + {
2889     + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2890     +
2891     + if (ptr + mblength > lim)
2892     + break;
2893     + else
2894     + ptr += mblength;
2895     + }
2896     +
2897     + return ptr;
2898     +}
2899     +#endif
2900     +
2901     /* Fill BUF reading from FP, moving buf->left bytes from the end
2902     of buf->buf to the beginning first. If EOF is reached and the
2903     file wasn't terminated by a newline, supply one. Set up BUF's line
2904     @@ -1500,7 +1786,7 @@
2905     hideously fast. */
2906    
2907     static int
2908     -numcompare (const char *a, const char *b)
2909     +numcompare_uni (const char *a, const char *b)
2910     {
2911     while (blanks[to_uchar (*a)])
2912     a++;
2913     @@ -1510,6 +1796,25 @@
2914     return strnumcmp (a, b, decimal_point, thousands_sep);
2915     }
2916    
2917     +#if HAVE_MBRTOWC
2918     +static int /* Multibyte counterpart of numcompare_uni. */
2919     +numcompare_mb (const char *a, const char *b)
2920     +{
2921     + size_t mblength, len;
2922     + /* Skip leading blanks of A and B; LEN counts the bytes left in the string being scanned. */
2923     + for (len = strlen (a); /* okay for UTF-8 */
2924     + *a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
2925     + len -= mblength)
2926     + a += mblength;
2927     + for (len = strlen (b); /* okay for UTF-8 */
2928     + *b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength);
2929     + len -= mblength)
2930     + b += mblength;
2931     +
2932     + return strnumcmp (a, b, decimal_point, thousands_sep);
2933     +}
2934     +#endif /* HAVE_MBRTOWC */
2935     +
2936     static int
2937     general_numcompare (const char *sa, const char *sb)
2938     {
2939     @@ -1543,7 +1848,7 @@
2940     Return 0 if the name in S is not recognized. */
2941    
2942     static int
2943     -getmonth (char const *month, size_t len)
2944     +getmonth_uni (char const *month, size_t len)
2945     {
2946     size_t lo = 0;
2947     size_t hi = MONTHS_PER_YEAR;
2948     @@ -1698,11 +2003,79 @@
2949     return diff;
2950     }
2951    
2952     +#if HAVE_MBRTOWC
2953     +static int /* Returns the matching monthtab[].val, or 0 when no month is recognized. */
2954     +getmonth_mb (const char *s, size_t len) /* S holds LEN bytes and need not be NUL-terminated. */
2955     +{
2956     + char *month;
2957     + register size_t i;
2958     + register int lo = 0, hi = MONTHS_PER_YEAR;
2959     + char *tmp;
2960     + size_t wclength, mblength;
2961     + const char **pp;
2962     + const wchar_t **wpp;
2963     + wchar_t *month_wcs;
2964     + mbstate_t state;
2965     +
2966     + while (len > 0 && ismbblank (s, len, &mblength))
2967     + {
2968     + s += mblength;
2969     + len -= mblength;
2970     + }
2971     +
2972     + if (len == 0)
2973     + return 0;
2974     +
2975     + month = (char *) alloca (len + 1);
2976     + /* TMP is a NUL-terminated copy of S for mbsrtowcs. */
2977     + tmp = (char *) alloca (len + 1);
2978     + memcpy (tmp, s, len);
2979     + tmp[len] = '\0';
2980     + pp = (const char **)&tmp;
2981     + month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
2982     + memset (&state, '\0', sizeof(mbstate_t));
2983     +
2984     + wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
2985     + if (wclength == (size_t)-1 || *pp != NULL) /* Invalid input is data, not a program error. */
2986     + return 0;
2987     +
2988     + for (i = 0; i < wclength; i++)
2989     + {
2990     + month_wcs[i] = towupper(month_wcs[i]);
2991     + if (iswblank (month_wcs[i]))
2992     + {
2993     + month_wcs[i] = L'\0';
2994     + break;
2995     + }
2996     + }
2997     +
2998     + wpp = (const wchar_t **)&month_wcs;
2999     +
3000     + mblength = wcsrtombs (month, wpp, len + 1, &state);
3001     + if (mblength == (size_t)-1 || *wpp != NULL) /* Unconvertible, or too long for MONTH. */
3002     + return 0;
3003     + /* Binary search of monthtab, comparing only the length of each table name. */
3004     + do
3005     + {
3006     + int ix = (lo + hi) / 2;
3007     +
3008     + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3009     + hi = ix;
3010     + else
3011     + lo = ix;
3012     + }
3013     + while (hi - lo > 1);
3014     +
3015     + return (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3016     + ? monthtab[lo].val : 0);
3017     +}
3018     +#endif
3019     +
3020     /* Compare two lines A and B trying every key in sequence until there
3021     are no more keys or a difference is found. */
3022    
3023     static int
3024     -keycompare (const struct line *a, const struct line *b)
3025     +keycompare_uni (const struct line *a, const struct line *b)
3026     {
3027     struct keyfield const *key = keylist;
3028    
3029     @@ -1875,6 +2248,177 @@
3030     return key->reverse ? -diff : diff;
3031     }
3032    
3033     +#if HAVE_MBRTOWC
3034     +static int
3035     +keycompare_mb (const struct line *a, const struct line *b)
3036     +{
3037     + struct keyfield *key = keylist;
3038     +
3039     + /* For the first iteration only, the key positions have been
3040     + precomputed for us. */
3041     + char *texta = a->keybeg;
3042     + char *textb = b->keybeg;
3043     + char *lima = a->keylim;
3044     + char *limb = b->keylim;
3045     +
3046     + size_t mblength_a, mblength_b;
3047     + wchar_t wc_a, wc_b;
3048     + mbstate_t state_a, state_b;
3049     +
3050     + int diff;
3051     +
3052     + memset (&state_a, '\0', sizeof(mbstate_t));
3053     + memset (&state_b, '\0', sizeof(mbstate_t));
3054     +
3055     + for (;;)
3056     + {
3057     + unsigned char *translate = (unsigned char *) key->translate;
3058     + bool const *ignore = key->ignore;
3059     +
3060     + /* Find the lengths. */
3061     + size_t lena = lima <= texta ? 0 : lima - texta;
3062     + size_t lenb = limb <= textb ? 0 : limb - textb;
3063     +
3064     + /* Actually compare the fields. */
3065     + if (key->numeric | key->general_numeric)
3066     + {
3067     + char savea = *lima, saveb = *limb;
3068     +
3069     + *lima = *limb = '\0';
3070     + if (force_general_numcompare)
3071     + diff = general_numcompare (texta, textb);
3072     + else
3073     + diff = ((key->numeric ? numcompare : general_numcompare)
3074     + (texta, textb));
3075     + *lima = savea, *limb = saveb;
3076     + }
3077     + else if (key->month)
3078     + diff = getmonth (texta, lena) - getmonth (textb, lenb);
3079     + else
3080     + {
3081     + if (ignore || translate)
3082     + {
3083     + char *copy_a = (char *) alloca (lena + 1 + lenb + 1);
3084     + char *copy_b = copy_a + lena + 1;
3085     + size_t new_len_a, new_len_b;
3086     + size_t i, j;
3087     +
3088     + /* Build COPY from TEXT: drop characters selected by IGNORE, upper-case when TRANSLATE is set. */
3089     +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3090     + do \
3091     + { \
3092     + wchar_t uwc; \
3093     + char mbc[MB_LEN_MAX]; \
3094     + mbstate_t state_wc; \
3095     + \
3096     + for (NEW_LEN = i = 0; i < LEN;) \
3097     + { \
3098     + mbstate_t state_bak; \
3099     + \
3100     + state_bak = STATE; \
3101     + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3102     + \
3103     + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3104     + || MBLENGTH == 0) \
3105     + { \
3106     + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3107     + STATE = state_bak; \
3108     + if (!ignore) \
3109     + COPY[NEW_LEN++] = TEXT[i]; \
3110     + i++; /* Always consume the byte so the loop terminates. */ \
3111     + continue; \
3112     + } \
3113     + \
3114     + if (ignore) \
3115     + { \
3116     + if ((ignore == nonprinting && !iswprint (WC)) \
3117     + || (ignore == nondictionary \
3118     + && !iswalnum (WC) && !iswblank (WC))) \
3119     + { \
3120     + i += MBLENGTH; \
3121     + continue; \
3122     + } \
3123     + } \
3124     + \
3125     + if (translate) \
3126     + { \
3127     + uwc = towupper(WC); \
3128     + if (WC == uwc) \
3129     + { \
3130     + memcpy (mbc, TEXT + i, MBLENGTH); \
3131     + i += MBLENGTH; \
3132     + } \
3133     + else \
3134     + { \
3135     + i += MBLENGTH; \
3136     + WC = uwc; \
3137     + memset (&state_wc, '\0', sizeof (mbstate_t)); \
3138     + \
3139     + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3140     + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3141     + } \
3142     + \
3143     + for (j = 0; j < MBLENGTH; j++) \
3144     + COPY[NEW_LEN++] = mbc[j]; \
3145     + } \
3146     + else \
3147     + for (j = 0; j < MBLENGTH; j++) \
3148     + COPY[NEW_LEN++] = TEXT[i++]; \
3149     + } \
3150     + COPY[NEW_LEN] = '\0'; \
3151     + } \
3152     + while (0)
3153     + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3154     + wc_a, mblength_a, state_a);
3155     + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3156     + wc_b, mblength_b, state_b);
3157     + diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
3158     + }
3159     + else if (lena == 0)
3160     + diff = - NONZERO (lenb);
3161     + else if (lenb == 0)
3162     + goto greater;
3163     + else
3164     + diff = xmemcoll (texta, lena, textb, lenb);
3165     + }
3166     +
3167     + if (diff)
3168     + goto not_equal;
3169     +
3170     + key = key->next;
3171     + if (! key)
3172     + break;
3173     +
3174     + /* Find the beginning and limit of the next field. */
3175     + if (key->eword != -1)
3176     + lima = limfield (a, key), limb = limfield (b, key);
3177     + else
3178     + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3179     +
3180     + if (key->sword != -1)
3181     + texta = begfield (a, key), textb = begfield (b, key);
3182     + else
3183     + {
3184     + texta = a->text, textb = b->text;
3185     + if (key->skipsblanks)
3186     + {
3187     + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3188     + texta += mblength_a;
3189     + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3190     + textb += mblength_b;
3191     + }
3192     + }
3193     + }
3194     +
3195     + return 0;
3196     +
3197     +greater:
3198     + diff = 1;
3199     +not_equal:
3200     + return key->reverse ? -diff : diff;
3201     +}
3202     +#endif
3203     +
3204     /* Compare two lines A and B, returning negative, zero, or positive
3205     depending on whether A compares less than, equal to, or greater than B. */
3206    
3207     @@ -2744,7 +3288,7 @@
3208     initialize_exit_failure (SORT_FAILURE);
3209    
3210     hard_LC_COLLATE = hard_locale (LC_COLLATE);
3211     -#if HAVE_NL_LANGINFO
3212     +#if HAVE_LANGINFO_CODESET
3213     hard_LC_TIME = hard_locale (LC_TIME);
3214     #endif
3215    
3216     @@ -2765,6 +3309,27 @@
3217     thousands_sep = -1;
3218     }
3219    
3220     +#if HAVE_MBRTOWC
3221     + if (MB_CUR_MAX > 1)
3222     + {
3223     + inittables = inittables_mb;
3224     + begfield = begfield_mb;
3225     + limfield = limfield_mb;
3226     + getmonth = getmonth_mb;
3227     + keycompare = keycompare_mb;
3228     + numcompare = numcompare_mb;
3229     + }
3230     + else
3231     +#endif
3232     + {
3233     + inittables = inittables_uni;
3234     + begfield = begfield_uni;
3235     + limfield = limfield_uni;
3236     + getmonth = getmonth_uni;
3237     + keycompare = keycompare_uni;
3238     + numcompare = numcompare_uni;
3239     + }
3240     +
3241     have_read_stdin = false;
3242     inittables ();
3243    
3244     @@ -3015,13 +3580,35 @@
3245    
3246     case 't':
3247     {
3248     - char newtab = optarg[0];
3249     - if (! newtab)
3250     + char newtab[MB_LEN_MAX + 1];
3251     + size_t newtab_length = 1;
3252     + strncpy (newtab, optarg, MB_LEN_MAX);
3253     + if (! newtab[0])
3254     error (SORT_FAILURE, 0, _("empty tab"));
3255     - if (optarg[1])
3256     +#if HAVE_MBRTOWC
3257     + if (MB_CUR_MAX > 1)
3258     + {
3259     + wchar_t wc;
3260     + mbstate_t state;
3261     + size_t i;
3262     +
3263     + memset (&state, '\0', sizeof (mbstate_t));
3264     + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3265     + MB_LEN_MAX),
3266     + &state);
3267     + switch (newtab_length)
3268     + {
3269     + case (size_t) -1:
3270     + case (size_t) -2:
3271     + case 0:
3272     + newtab_length = 1;
3273     + }
3274     + }
3275     +#endif
3276     + if (newtab_length == 1 && optarg[1])
3277     {
3278     if (STREQ (optarg, "\\0"))
3279     - newtab = '\0';
3280     + newtab[0] = '\0';
3281     else
3282     {
3283     /* Provoke with `sort -txx'. Complain about
3284     @@ -3032,9 +3619,12 @@
3285     quote (optarg));
3286     }
3287     }
3288     - if (tab != TAB_DEFAULT && tab != newtab)
3289     + if (tab_length
3290     + && (tab_length != newtab_length
3291     + || memcmp (tab, newtab, tab_length) != 0))
3292     error (SORT_FAILURE, 0, _("incompatible tabs"));
3293     - tab = newtab;
3294     + memcpy (tab, newtab, newtab_length);
3295     + tab_length = newtab_length;
3296     }
3297     break;
3298    
3299     diff -Naur coreutils-6.9.orig/src/unexpand.c coreutils-6.9/src/unexpand.c
3300     --- coreutils-6.9.orig/src/unexpand.c 2007-03-18 21:36:43.000000000 +0000
3301     +++ coreutils-6.9/src/unexpand.c 2007-04-07 16:59:55.000000000 +0000
3302     @@ -39,11 +39,28 @@
3303     #include <stdio.h>
3304     #include <getopt.h>
3305     #include <sys/types.h>
3306     +
3307     +/* Get mbstate_t, mbrtowc(), wcwidth(). */
3308     +#if HAVE_WCHAR_H
3309     +# include <wchar.h>
3310     +#endif
3311     +
3312     #include "system.h"
3313     #include "error.h"
3314     #include "quote.h"
3315     #include "xstrndup.h"
3316    
3317     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3318     + installation; work around this configuration error. */
3319     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3320     +# define MB_LEN_MAX 16
3321     +#endif
3322     +
3323     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3324     +#if HAVE_MBRTOWC && defined mbstate_t
3325     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3326     +#endif
3327     +
3328     /* The official name of this program (e.g., no `g' prefix). */
3329     #define PROGRAM_NAME "unexpand"
3330    
3331     @@ -110,6 +127,208 @@
3332     {NULL, 0, NULL, 0}
3333     };
3334    
3335     +static FILE *next_file (FILE *fp);
3336     +/* Multibyte-aware unexpand: turn runs of blanks in each input file into tabs, measuring columns with wcwidth. */
3337     +#if HAVE_MBRTOWC
3338     +static void
3339     +unexpand_multibyte (void)
3340     +{
3341     + FILE *fp; /* Input stream. */
3342     + mbstate_t i_state; /* Current shift state of the input stream. */
3343     + mbstate_t i_state_bak; /* Back up the I_STATE. */
3344     + mbstate_t o_state; /* Current shift state of the output stream. */
3345     + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3346     + char *bufpos = buf; /* Next read position of BUF. */
3347     + size_t buflen = 0; /* The length of the byte sequence in buf. */
3348     + wint_t wc; /* The wide character just decoded, or WEOF. */
3349     + size_t mblength; /* The number of bytes in BUF that were
3350     + consumed to produce WC. */
3351     +
3352     + /* Index in `tab_list' of next tabstop: */
3353     + int tab_index = 0; /* For calculating width of pending tabs. */
3354     + int print_tab_index = 0; /* For printing as many tabs as possible. */
3355     + unsigned int column = 0; /* Column on screen of next char. */
3356     + int next_tab_column; /* Column the next tab stop is on. */
3357     + int convert = 1; /* If nonzero, perform translations. */
3358     + unsigned int pending = 0; /* Pending columns of blanks. */
3359     +
3360     + fp = next_file ((FILE *) NULL);
3361     + if (fp == NULL)
3362     + return;
3363     +
3364     + memset (&o_state, '\0', sizeof(mbstate_t));
3365     + memset (&i_state, '\0', sizeof(mbstate_t));
3366     +
3367     + for (;;)
3368     + {
3369     + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
3370     + {
3371     + memmove (buf, bufpos, buflen);
3372     + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
3373     + bufpos = buf;
3374     + }
3375     +
3376     + /* Get a wide character. */
3377     + if (buflen < 1)
3378     + {
3379     + mblength = 1;
3380     + wc = WEOF;
3381     + }
3382     + else
3383     + {
3384     + i_state_bak = i_state;
3385     + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
3386     + }
3387     +
3388     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3389     + {
3390     + i_state = i_state_bak;
3391     + wc = L'\0';
3392     + }
3393     +
3394     + if (wc == L' ' && convert && column < INT_MAX)
3395     + {
3396     + ++pending;
3397     + ++column;
3398     + }
3399     + else if (wc == L'\t' && convert)
3400     + {
3401     + if (tab_size == 0)
3402     + {
3403     + /* Do not let tab_index == first_free_tab;
3404     + stop when it is 1 less. */
3405     + while (tab_index < first_free_tab - 1
3406     + && column >= tab_list[tab_index])
3407     + tab_index++;
3408     + next_tab_column = tab_list[tab_index];
3409     + if (tab_index < first_free_tab - 1)
3410     + tab_index++;
3411     + if (column >= next_tab_column)
3412     + {
3413     + convert = 0; /* Ran out of tab stops. */
3414     + goto flush_pend_mb;
3415     + }
3416     + }
3417     + else
3418     + {
3419     + next_tab_column = column + tab_size - column % tab_size;
3420     + }
3421     + pending += next_tab_column - column;
3422     + column = next_tab_column;
3423     + }
3424     + else
3425     + {
3426     +flush_pend_mb:
3427     + /* Flush pending spaces. Print as many tabs as possible,
3428     + then print the rest as spaces. */
3429     + if (pending == 1)
3430     + {
3431     + putchar (' ');
3432     + pending = 0;
3433     + }
3434     + column -= pending;
3435     + while (pending > 0)
3436     + {
3437     + if (tab_size == 0)
3438     + {
3439     + /* Do not let print_tab_index == first_free_tab;
3440     + stop when it is 1 less. */
3441     + while (print_tab_index < first_free_tab - 1
3442     + && column >= tab_list[print_tab_index])
3443     + print_tab_index++;
3444     + next_tab_column = tab_list[print_tab_index];
3445     + if (print_tab_index < first_free_tab - 1)
3446     + print_tab_index++;
3447     + }
3448     + else
3449     + {
3450     + next_tab_column =
3451     + column + tab_size - column % tab_size;
3452     + }
3453     + if (next_tab_column - column <= pending)
3454     + {
3455     + putchar ('\t');
3456     + pending -= next_tab_column - column;
3457     + column = next_tab_column;
3458     + }
3459     + else
3460     + {
3461     + --print_tab_index;
3462     + column += pending;
3463     + while (pending != 0)
3464     + {
3465     + putchar (' ');
3466     + pending--;
3467     + }
3468     + }
3469     + }
3470     +
3471     + if (wc == WEOF)
3472     + {
3473     + fp = next_file (fp);
3474     + if (fp == NULL)
3475     + break; /* No more files. */
3476     + else
3477     + {
3478     + memset (&i_state, '\0', sizeof(mbstate_t));
3479     + continue;
3480     + }
3481     + }
3482     +
3483     + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3484     + {
3485     + if (convert)
3486     + {
3487     + ++column;
3488     + if (convert_entire_line == 0)
3489     + convert = 0;
3490     + }
3491     + mblength = 1;
3492     + putchar (*bufpos); /* Echo the offending byte itself, not the start of BUF. */
3493     + }
3494     + else if (mblength == 0)
3495     + {
3496     + if (convert && convert_entire_line == 0)
3497     + convert = 0;
3498     + mblength = 1;
3499     + putchar ('\0');
3500     + }
3501     + else
3502     + {
3503     + if (convert)
3504     + {
3505     + if (wc == L'\b')
3506     + {
3507     + if (column > 0)
3508     + --column;
3509     + }
3510     + else
3511     + {
3512     + int width; /* The width of WC. */
3513     +
3514     + width = wcwidth (wc);
3515     + column += (width > 0) ? width : 0;
3516     + if (convert_entire_line == 0)
3517     + convert = 0;
3518     + }
3519     + }
3520     +
3521     + if (wc == L'\n')
3522     + {
3523     + tab_index = print_tab_index = 0;
3524     + column = pending = 0;
3525     + convert = 1;
3526     + }
3527     + fwrite (bufpos, sizeof(char), mblength, stdout);
3528     + }
3529     + }
3530     + buflen -= mblength;
3531     + bufpos += mblength;
3532     + }
3533     +}
3534     +#endif
3535     +
3536     +
3537     void
3538     usage (int status)
3539     {
3540     @@ -531,7 +750,12 @@
3541    
3542     file_list = (optind < argc ? &argv[optind] : stdin_argv);
3543    
3544     - unexpand ();
3545     +#if HAVE_MBRTOWC
3546     + if (MB_CUR_MAX > 1)
3547     + unexpand_multibyte ();
3548     + else
3549     +#endif
3550     + unexpand ();
3551    
3552     if (have_read_stdin && fclose (stdin) != 0)
3553     error (EXIT_FAILURE, errno, "-");
3554     diff -Naur coreutils-6.9.orig/src/uniq.c coreutils-6.9/src/uniq.c
3555     --- coreutils-6.9.orig/src/uniq.c 2007-03-18 21:36:43.000000000 +0000
3556     +++ coreutils-6.9/src/uniq.c 2007-04-07 16:59:55.000000000 +0000
3557     @@ -23,6 +23,16 @@
3558     #include <getopt.h>
3559     #include <sys/types.h>
3560    
3561     +/* Get mbstate_t, mbrtowc(). */
3562     +#if HAVE_WCHAR_H
3563     +# include <wchar.h>
3564     +#endif
3565     +
3566     +/* Get isw* functions. */
3567     +#if HAVE_WCTYPE_H
3568     +# include <wctype.h>
3569     +#endif
3570     +
3571     #include "system.h"
3572     #include "argmatch.h"
3573     #include "linebuffer.h"
3574     @@ -32,7 +42,19 @@
3575     #include "quote.h"
3576     #include "xmemcoll.h"
3577     #include "xstrtol.h"
3578     -#include "memcasecmp.h"
3579     +#include "xmemcoll.h"
3580     +
3581     +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3582     + installation; work around this configuration error. */
3583     +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3584     +# define MB_LEN_MAX 16
3585     +#endif
3586     +
3587     +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3588     +#if HAVE_MBRTOWC && defined mbstate_t
3589     +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3590     +#endif
3591     +
3592    
3593     /* The official name of this program (e.g., no `g' prefix). */
3594     #define PROGRAM_NAME "uniq"
3595     @@ -109,6 +131,10 @@
3596     /* Select whether/how to delimit groups of duplicate lines. */
3597     static enum delimit_method delimit_groups;
3598    
3599     +/* Function pointers. */
3600     +static char *
3601     +(*find_field) (struct linebuffer *line);
3602     +
3603     static struct option const longopts[] =
3604     {
3605     {"count", no_argument, NULL, 'c'},
3606     @@ -198,7 +224,7 @@
3607     return a pointer to the beginning of the line's field to be compared. */
3608    
3609     static char *
3610     -find_field (const struct linebuffer *line)
3611     +find_field_uni (struct linebuffer *line)
3612     {
3613     size_t count;
3614     char *lp = line->buffer;
3615     @@ -219,6 +245,83 @@
3616     return lp + i;
3617     }
3618    
3619     +#if HAVE_MBRTOWC
3620     +/* Decode the character at LP + POS into WC; on a bad sequence restore *STATEP, set CONVFAIL, and use length 1. */
3621     +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3622     + do \
3623     + { \
3624     + mbstate_t state_bak; \
3625     + \
3626     + CONVFAIL = 0; \
3627     + state_bak = *STATEP; \
3628     + \
3629     + MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
3630     + \
3631     + switch (MBLENGTH) \
3632     + { \
3633     + case (size_t)-2: \
3634     + case (size_t)-1: \
3635     + *STATEP = state_bak; \
3636     + CONVFAIL++; \
3637     + /* Fall through */ \
3638     + case 0: \
3639     + MBLENGTH = 1; \
3640     + } \
3641     + } \
3642     + while (0)
3643     +/* Multibyte find_field: return a pointer into LINE's buffer past SKIP_FIELDS fields and SKIP_CHARS characters. */
3644     +static char *
3645     +find_field_multi (struct linebuffer *line)
3646     +{
3647     + size_t count;
3648     + char *lp = line->buffer;
3649     + size_t size = line->length - 1;
3650     + size_t pos;
3651     + size_t mblength;
3652     + wchar_t wc;
3653     + mbstate_t *statep;
3654     + int convfail;
3655     +
3656     + pos = 0;
3657     + statep = &(line->state);
3658     +
3659     + /* Skip SKIP_FIELDS fields: each is a run of blanks followed by a run of non-blanks. */
3660     + for (count = 0; count < skip_fields && pos < size; count++)
3661     + {
3662     + while (pos < size)
3663     + {
3664     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3665     +
3666     + if (convfail || !iswblank (wc))
3667     + {
3668     + pos += mblength;
3669     + break;
3670     + }
3671     + pos += mblength;
3672     + }
3673     +
3674     + while (pos < size)
3675     + {
3676     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3677     +
3678     + if (!convfail && iswblank (wc))
3679     + break;
3680     +
3681     + pos += mblength;
3682     + }
3683     + }
3684     +
3685     + /* Skip SKIP_CHARS characters (an undecodable byte counts as one character). */
3686     + for (count = 0; count < skip_chars && pos < size; count++)
3687     + {
3688     + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3689     + pos += mblength;
3690     + }
3691     +
3692     + return lp + pos;
3693     +}
3694     +#endif
3695     +
3696     /* Return false if two strings OLD and NEW match, true if not.
3697     OLD and NEW point not to the beginnings of the lines
3698     but rather to the beginnings of the fields to compare.
3699     @@ -227,6 +330,8 @@
3700     static bool
3701     different (char *old, char *new, size_t oldlen, size_t newlen)
3702     {
3703     + char *copy_old, *copy_new;
3704     +
3705     if (check_chars < oldlen)
3706     oldlen = check_chars;
3707     if (check_chars < newlen)
3708     @@ -234,14 +339,92 @@
3709    
3710     if (ignore_case)
3711     {
3712     - /* FIXME: This should invoke strcoll somehow. */
3713     - return oldlen != newlen || memcasecmp (old, new, oldlen);
3714     + size_t i;
3715     +
3716     + copy_old = alloca (oldlen + 1);
3717     + copy_new = alloca (newlen + 1);
3718     +
3719     + /* Fold each field over its own length; OLDLEN and NEWLEN may differ. */
3720     + for (i = 0; i < oldlen; i++)
3721     + copy_old[i] = toupper ((unsigned char) old[i]);
3722     + for (i = 0; i < newlen; i++)
3723     + copy_new[i] = toupper ((unsigned char) new[i]);
3724     }
3725     - else if (hard_LC_COLLATE)
3726     - return xmemcoll (old, oldlen, new, newlen) != 0;
3727     else
3728     - return oldlen != newlen || memcmp (old, new, oldlen);
3729     + {
3730     + copy_old = (char *)old;
3731     + copy_new = (char *)new;
3732     + }
3733     +
3734     + return xmemcoll (copy_old, oldlen, copy_new, newlen);
3735     +}
3736     +/* Multibyte different: compare up to CHECK_CHARS characters of OLD and NEW with xmemcoll, upper-casing first if IGNORE_CASE. */
3737     +#if HAVE_MBRTOWC
3738     +static int
3739     +different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3740     +{
3741     + size_t i, j, chars;
3742     + const char *str[2];
3743     + char *copy[2];
3744     + size_t len[2];
3745     + mbstate_t state[2];
3746     + size_t mblength;
3747     + wchar_t wc, uwc;
3748     + mbstate_t state_bak;
3749     +
3750     + str[0] = old;
3751     + str[1] = new;
3752     + len[0] = oldlen;
3753     + len[1] = newlen;
3754     + state[0] = oldstate;
3755     + state[1] = newstate;
3756     +
3757     + for (i = 0; i < 2; i++)
3758     + {
3759     + copy[i] = alloca (len[i] + 1);
3760     +
3761     + for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3762     + {
3763     + state_bak = state[i];
3764     + mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3765     +
3766     + switch (mblength)
3767     + {
3768     + case (size_t)-1:
3769     + case (size_t)-2:
3770     + state[i] = state_bak;
3771     + /* Fall through */
3772     + case 0:
3773     + mblength = 1;
3774     + copy[i][j] = str[i][j]; /* Keep the raw byte; COPY must not stay uninitialized. */
3775     + break;
3776     + default:
3777     + if (ignore_case)
3778     + {
3779     + uwc = towupper (wc);
3780     +
3781     + if (uwc != wc)
3782     + {
3783     + mbstate_t state_wc;
3784     +
3785     + memset (&state_wc, '\0', sizeof(mbstate_t));
3786     + wcrtomb (copy[i] + j, uwc, &state_wc); /* NOTE(review): assumes the same byte length as the original. */
3787     + }
3788     + else
3789     + memcpy (copy[i] + j, str[i] + j, mblength);
3790     + }
3791     + else
3792     + memcpy (copy[i] + j, str[i] + j, mblength);
3793     + }
3794     + j += mblength;
3795     + }
3796     + copy[i][j] = '\0';
3797     + len[i] = j;
3798     + }
3799     +
3800     + return xmemcoll (copy[0], len[0], copy[1], len[1]);
3801     }
3802     +#endif
3803    
3804     /* Output the line in linebuffer LINE to standard output
3805     provided that the switches say it should be output.
3806     @@ -295,15 +478,43 @@
3807     {
3808     char *prevfield IF_LINT (= NULL);
3809     size_t prevlen IF_LINT (= 0);
3810     +#if HAVE_MBRTOWC
3811     + mbstate_t prevstate;
3812     +
3813     + memset (&prevstate, '\0', sizeof (mbstate_t));
3814     +#endif
3815    
3816     while (!feof (stdin))
3817     {
3818     char *thisfield;
3819     size_t thislen;
3820     +#if HAVE_MBRTOWC
3821     + mbstate_t thisstate;
3822     +#endif
3823     +
3824     if (readlinebuffer (thisline, stdin) == 0)
3825     break;
3826     thisfield = find_field (thisline);
3827     thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3828     +#if HAVE_MBRTOWC
3829     + if (MB_CUR_MAX > 1)
3830     + {
3831     + thisstate = thisline->state;
3832     +
3833     + if (prevline->length == 0 || different_multi
3834     + (thisfield, prevfield, thislen, prevlen, thisstate, prevstate))
3835     + {
3836     + fwrite (thisline->buffer, sizeof (char),
3837     + thisline->length, stdout);
3838     +
3839     + SWAP_LINES (prevline, thisline);
3840     + prevfield = thisfield;
3841     + prevlen = thislen;
3842     + prevstate = thisstate;
3843     + }
3844     + }
3845     + else
3846     +#endif
3847     if (prevline->length == 0
3848     || different (thisfield, prevfield, thislen, prevlen))
3849     {
3850     @@ -322,17 +533,26 @@
3851     size_t prevlen;
3852     uintmax_t match_count = 0;
3853     bool first_delimiter = true;
3854     +#if HAVE_MBRTOWC
3855     + mbstate_t prevstate;
3856     +#endif
3857    
3858     if (readlinebuffer (prevline, stdin) == 0)
3859     goto closefiles;
3860     prevfield = find_field (prevline);
3861     prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
3862     +#if HAVE_MBRTOWC
3863     + prevstate = prevline->state;
3864     +#endif
3865    
3866     while (!feof (stdin))
3867     {
3868     bool match;
3869     char *thisfield;
3870     size_t thislen;
3871     +#if HAVE_MBRTOWC
3872     + mbstate_t thisstate;
3873     +#endif
3874     if (readlinebuffer (thisline, stdin) == 0)
3875     {
3876     if (ferror (stdin))
3877     @@ -341,6 +561,15 @@
3878     }
3879     thisfield = find_field (thisline);
3880     thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3881     +#if HAVE_MBRTOWC
3882     + if (MB_CUR_MAX > 1)
3883     + {
3884     + thisstate = thisline->state;
3885     + match = !different_multi (thisfield, prevfield,
3886     + thislen, prevlen, thisstate, prevstate);
3887     + }
3888     + else
3889     +#endif
3890     match = !different (thisfield, prevfield, thislen, prevlen);
3891     match_count += match;
3892    
3893     @@ -373,6 +602,9 @@
3894     SWAP_LINES (prevline, thisline);
3895     prevfield = thisfield;
3896     prevlen = thislen;
3897     +#if HAVE_MBRTOWC
3898     + prevstate = thisstate;
3899     +#endif
3900     if (!match)
3901     match_count = 0;
3902     }
3903     @@ -417,6 +649,19 @@
3904    
3905     atexit (close_stdout);
3906    
3907     +#if HAVE_MBRTOWC
3908     + if (MB_CUR_MAX > 1)
3909     + {
3910     + find_field = find_field_multi;
3911     + }
3912     + else
3913     +#endif
3914     + {
3915     + find_field = find_field_uni;
3916     + }
3917     +
3918     +
3919     +
3920     skip_chars = 0;
3921     skip_fields = 0;
3922     check_chars = SIZE_MAX;
3923     diff -Naur coreutils-6.9.orig/tests/sort/Makefile.am coreutils-6.9/tests/sort/Makefile.am
3924     --- coreutils-6.9.orig/tests/sort/Makefile.am 2007-03-18 21:36:44.000000000 +0000
3925     +++ coreutils-6.9/tests/sort/Makefile.am 2007-04-07 17:03:36.000000000 +0000
3926     @@ -66,15 +66,17 @@
3927     bigfield.O bigfield.E
3928     ##test-files-end
3929    
3930     -EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen)
3931     -noinst_SCRIPTS = $x-tests
3932     +run_gen += mb1.O mb2.O
3933     +
3934     +EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen) mb1.I mb1.X mb2.I mb2.X
3935     +noinst_SCRIPTS = $x-tests # $x-mb-tests
3936     TESTS_ENVIRONMENT = \
3937     CU_TEST_NAME=`basename $(abs_srcdir)`,$$tst \
3938     PATH="$(VG_PATH_PREFIX)`pwd`/../../src$(PATH_SEPARATOR)$$PATH"
3939    
3940     editpl = sed -e 's,@''PERL''@,$(PERL),g' -e 's,@''srcdir''@,$(srcdir),g'
3941    
3942     -TESTS = $x-tests
3943     +TESTS = $x-tests $x-mb-tests
3944    
3945     mk_script = $(srcdir)/../mk-script
3946     $(srcdir)/$x-tests: $(mk_script) Test.pm Makefile.am
3947     diff -Naur coreutils-6.9.orig/tests/sort/Makefile.in coreutils-6.9/tests/sort/Makefile.in
3948     --- coreutils-6.9.orig/tests/sort/Makefile.in 2007-03-22 21:20:25.000000000 +0000
3949     +++ coreutils-6.9/tests/sort/Makefile.in 2007-04-07 17:01:55.000000000 +0000
3950     @@ -540,14 +540,16 @@
3951     incompat5.O incompat5.E incompat6.O incompat6.E nul-tab.O nul-tab.E \
3952     bigfield.O bigfield.E
3953    
3954     -EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen)
3955     -noinst_SCRIPTS = $x-tests
3956     +run_gen += mb1.O mb2.O
3957     +
3958     +EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen) mb1.I mb1.X mb2.I mb2.X
3959     +noinst_SCRIPTS = $x-tests # $x-mb-tests
3960     TESTS_ENVIRONMENT = \
3961     CU_TEST_NAME=`basename $(abs_srcdir)`,$$tst \
3962     PATH="$(VG_PATH_PREFIX)`pwd`/../../src$(PATH_SEPARATOR)$$PATH"
3963    
3964     editpl = sed -e 's,@''PERL''@,$(PERL),g' -e 's,@''srcdir''@,$(srcdir),g'
3965     -TESTS = $x-tests
3966     +TESTS = $x-tests $x-mb-tests
3967     mk_script = $(srcdir)/../mk-script
3968     MAINTAINERCLEANFILES = $x-tests $(maint_gen)
3969     CLEANFILES = $(run_gen)
3970     diff -Naur coreutils-6.9.orig/tests/sort/mb1.I coreutils-6.9/tests/sort/mb1.I
3971     --- coreutils-6.9.orig/tests/sort/mb1.I 1970-01-01 00:00:00.000000000 +0000
3972     +++ coreutils-6.9/tests/sort/mb1.I 2007-04-07 16:59:55.000000000 +0000
3973     @@ -0,0 +1,4 @@
3974     +Apple@10
3975     +Banana@5
3976     +Citrus@20
3977     +Cherry@30
3978     diff -Naur coreutils-6.9.orig/tests/sort/mb1.X coreutils-6.9/tests/sort/mb1.X
3979     --- coreutils-6.9.orig/tests/sort/mb1.X 1970-01-01 00:00:00.000000000 +0000
3980     +++ coreutils-6.9/tests/sort/mb1.X 2007-04-07 16:59:55.000000000 +0000
3981     @@ -0,0 +1,4 @@
3982     +Banana@5
3983     +Apple@10
3984     +Citrus@20
3985     +Cherry@30
3986     diff -Naur coreutils-6.9.orig/tests/sort/mb2.I coreutils-6.9/tests/sort/mb2.I
3987     --- coreutils-6.9.orig/tests/sort/mb2.I 1970-01-01 00:00:00.000000000 +0000
3988     +++ coreutils-6.9/tests/sort/mb2.I 2007-04-07 16:59:55.000000000 +0000
3989     @@ -0,0 +1,4 @@
3990     +Apple@AA10@@20
3991     +Banana@AA5@@30
3992     +Citrus@AA20@@5
3993     +Cherry@AA30@@10
3994     diff -Naur coreutils-6.9.orig/tests/sort/mb2.X coreutils-6.9/tests/sort/mb2.X
3995     --- coreutils-6.9.orig/tests/sort/mb2.X 1970-01-01 00:00:00.000000000 +0000
3996     +++ coreutils-6.9/tests/sort/mb2.X 2007-04-07 16:59:55.000000000 +0000
3997     @@ -0,0 +1,4 @@
3998     +Citrus@AA20@@5
3999     +Cherry@AA30@@10
4000     +Apple@AA10@@20
4001     +Banana@AA5@@30
4002     diff -Naur coreutils-6.9.orig/tests/sort/sort-mb-tests coreutils-6.9/tests/sort/sort-mb-tests
4003     --- coreutils-6.9.orig/tests/sort/sort-mb-tests 1970-01-01 00:00:00.000000000 +0000
4004     +++ coreutils-6.9/tests/sort/sort-mb-tests 2007-04-07 16:59:55.000000000 +0000
4005     @@ -0,0 +1,58 @@
4006     +#! /bin/sh
4007     +case $# in
4008     + 0) xx='../../src/sort';;
4009     + *) xx="$1";;
4010     +esac
4011     +test "$VERBOSE" && echo=echo || echo=:
4012     +$echo testing program: $xx
4013     +errors=0
4014     +test "$srcdir" || srcdir=.
4015     +test "$VERBOSE" && $xx --version 2> /dev/null
4016     +# These tests need a UTF-8 locale; exit with status 77 when none is available.
4017     +export LC_ALL=en_US.UTF-8
4018     +locale -k LC_CTYPE 2>&1 | grep -q charmap.*UTF-8 || exit 77
4019     +errors=0
4020     +# mb1: numeric sort on the second '@'-separated field.
4021     +$xx -t @ -k2 -n $srcdir/mb1.I > mb1.O
4022     +code=$?
4023     +if test $code != 0; then
4024     + $echo "Test mb1 failed: $xx return code $code differs from expected value 0" 1>&2
4025     + errors=`expr $errors + 1`
4026     +else
4027     + cmp mb1.O $srcdir/mb1.X > /dev/null 2>&1
4028     + case $? in
4029     + 0) if test "$VERBOSE"; then $echo "passed mb1"; fi;;
4030     + 1) $echo "Test mb1 failed: files mb1.O and $srcdir/mb1.X differ" 1>&2
4031     + (diff -c mb1.O $srcdir/mb1.X) 2> /dev/null
4032     + errors=`expr $errors + 1`;;
4033     + 2) $echo "Test mb1 may have failed." 1>&2
4034     + $echo The command "cmp mb1.O $srcdir/mb1.X" failed. 1>&2
4035     + errors=`expr $errors + 1`;;
4036     + esac
4037     +fi
4038     +# mb2: numeric sort on the fourth '@'-separated field.
4039     +$xx -t @ -k4 -n $srcdir/mb2.I > mb2.O
4040     +code=$?
4041     +if test $code != 0; then
4042     + $echo "Test mb2 failed: $xx return code $code differs from expected value 0" 1>&2
4043     + errors=`expr $errors + 1`
4044     +else
4045     + cmp mb2.O $srcdir/mb2.X > /dev/null 2>&1
4046     + case $? in
4047     + 0) if test "$VERBOSE"; then $echo "passed mb2"; fi;;
4048     + 1) $echo "Test mb2 failed: files mb2.O and $srcdir/mb2.X differ" 1>&2
4049     + (diff -c mb2.O $srcdir/mb2.X) 2> /dev/null
4050     + errors=`expr $errors + 1`;;
4051     + 2) $echo "Test mb2 may have failed." 1>&2
4052     + $echo The command "cmp mb2.O $srcdir/mb2.X" failed. 1>&2
4053     + errors=`expr $errors + 1`;;
4054     + esac
4055     +fi
4056     +
4057     +if test $errors = 0; then
4058     + $echo Passed all 2 tests. 1>&2
4059     +else
4060     + $echo Failed $errors tests. 1>&2
4061     +fi
4062     +test $errors = 0 || errors=1
4063     +exit $errors