Magellan Linux

Contents of /trunk/coreutils/patches-6.10/coreutils-6.9-i18n.patch

Parent Directory Parent Directory | Revision Log Revision Log


Revision 536 - (show annotations) (download)
Tue Mar 25 13:54:58 2008 UTC (16 years, 1 month ago) by niro
File size: 103622 byte(s)
-coreutils-6.10

1 Submitted by: Matt Burgess (matthew at linuxfromscratch.org)
2 Date: 2007-04-07
3 Initial Package Version: 6.9
4 Upstream Status: Rejected
5 Origin: Based on coreutils-5.93-i18n-2.patch by Alexander Patrakov
6 Description: This patch fixes various problems with multibyte character support.
7 LSB >= 2.0 tests for features added by this patch, but only Coreutils-5.2.1 plus
8 http://www.linuxfromscratch.org/~alexander/patches/coreutils-5.2.1-i18n_fixes-1.patch
9 actually pass the Li18nux2000-level1 testsuite.
10
11 diff -Naur coreutils-6.9.orig/lib/linebuffer.h coreutils-6.9/lib/linebuffer.h
12 --- coreutils-6.9.orig/lib/linebuffer.h 2005-05-14 06:03:58.000000000 +0000
13 +++ coreutils-6.9/lib/linebuffer.h 2007-04-07 16:59:55.000000000 +0000
14 @@ -22,6 +22,11 @@
15
16 # include <stdio.h>
17
18 +/* Get mbstate_t. */
19 +# if HAVE_WCHAR_H
20 +# include <wchar.h>
21 +# endif
22 +
23 /* A `struct linebuffer' holds a line of text. */
24
25 struct linebuffer
26 @@ -29,6 +34,9 @@
27 size_t size; /* Allocated. */
28 size_t length; /* Used. */
29 char *buffer;
30 +# if HAVE_WCHAR_H
31 + mbstate_t state;
32 +# endif
33 };
34
35 /* Initialize linebuffer LINEBUFFER for use. */
36 diff -Naur coreutils-6.9.orig/src/cut.c coreutils-6.9/src/cut.c
37 --- coreutils-6.9.orig/src/cut.c 2007-03-18 21:36:43.000000000 +0000
38 +++ coreutils-6.9/src/cut.c 2007-04-07 16:59:55.000000000 +0000
39 @@ -29,6 +29,11 @@
40 #include <assert.h>
41 #include <getopt.h>
42 #include <sys/types.h>
43 +
44 +/* Get mbstate_t, mbrtowc(). */
45 +#if HAVE_WCHAR_H
46 +# include <wchar.h>
47 +#endif
48 #include "system.h"
49
50 #include "error.h"
51 @@ -37,6 +42,18 @@
52 #include "quote.h"
53 #include "xstrndup.h"
54
55 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
56 + installation; work around this configuration error. */
57 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
58 +# undef MB_LEN_MAX
59 +# define MB_LEN_MAX 16
60 +#endif
61 +
62 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
63 +#if HAVE_MBRTOWC && defined mbstate_t
64 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
65 +#endif
66 +
67 /* The official name of this program (e.g., no `g' prefix). */
68 #define PROGRAM_NAME "cut"
69
70 @@ -67,6 +84,52 @@
71 } \
72 while (0)
73
74 +/* Refill BUF from STREAM: when fewer than MB_LEN_MAX bytes remain unread and STREAM is neither at EOF nor in error, move them to the front of BUF and append up to BUFSIZ newly read bytes. */
75 +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \
76 + do \
77 + { \
78 + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \
79 + { \
80 + memmove (BUF, BUFPOS, BUFLEN); /* Keep the unread tail. */ \
81 + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \
82 + BUFPOS = BUF; /* The unread data now starts at BUF. */ \
83 + } \
84 + } \
85 + while (0)
86 +
87 +/* Get the wide character at BUFPOS into WC (WEOF if BUFLEN is 0) and its byte length into MBLENGTH; BUFPOS is not advanced.
88 + CONVFAIL is 1 if the byte sequence is invalid or incomplete (MBLENGTH is then 1), otherwise 0. */
89 +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \
90 + do \
91 + { \
92 + mbstate_t state_bak; \
93 + \
94 + if (BUFLEN < 1) \
95 + { \
96 + WC = WEOF; /* CONVFAIL and MBLENGTH are left untouched on this path. */ \
97 + break; \
98 + } \
99 + \
100 + /* Get a wide character. */ \
101 + CONVFAIL = 0; \
102 + state_bak = STATE; \
103 + MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \
104 + \
105 + switch (MBLENGTH) \
106 + { \
107 + case (size_t)-1: \
108 + case (size_t)-2: \
109 + CONVFAIL++; /* NOTE(review): WC may still hold its previous value here; callers should test CONVFAIL first. */ \
110 + STATE = state_bak; /* Undo any change mbrtowc made to the shift state. */ \
111 + /* Fall through. */ \
112 + \
113 + case 0: \
114 + MBLENGTH = 1; /* Consume exactly one byte for a failed conversion or a null character. */ \
115 + break; \
116 + } \
117 + } \
118 + while (0)
119 +
120 struct range_pair
121 {
122 size_t lo;
123 @@ -85,7 +148,7 @@
124 /* The number of bytes allocated for FIELD_1_BUFFER. */
125 static size_t field_1_bufsize;
126
127 -/* The largest field or byte index used as an endpoint of a closed
128 +/* The largest byte, character or field index used as an endpoint of a closed
129 or degenerate range specification; this doesn't include the starting
130 index of right-open-ended ranges. For example, with either range spec
131 `2-5,9-', `2-3,5,9-' this variable would be set to 5. */
132 @@ -97,10 +160,11 @@
133
134 /* This is a bit vector.
135 In byte mode, which bytes to output.
136 + In character mode, which characters to output.
137 In field mode, which DELIM-separated fields to output.
138 - Both bytes and fields are numbered starting with 1,
139 + Bytes, characters and fields are numbered starting with 1,
140 so the zeroth bit of this array is unused.
141 - A field or byte K has been selected if
142 + A byte, character or field K has been selected if
143 (K <= MAX_RANGE_ENDPOINT and is_printable_field(K))
144 || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */
145 static unsigned char *printable_field;
146 @@ -109,9 +173,12 @@
147 {
148 undefined_mode,
149
150 - /* Output characters that are in the given bytes. */
151 + /* Output bytes that are at the given positions. */
152 byte_mode,
153
154 + /* Output characters that are at the given positions. */
155 + character_mode,
156 +
157 /* Output the given delimeter-separated fields. */
158 field_mode
159 };
160 @@ -121,6 +188,13 @@
161
162 static enum operating_mode operating_mode;
163
164 +/* If nonzero, when in byte mode, don't split multibyte characters. */
165 +static int byte_mode_character_aware;
166 +
167 +/* If nonzero, use the single-byte code paths even when
168 + this program runs in a multibyte locale. */
169 +static int force_singlebyte_mode;
170 +
171 /* If true do not output lines containing no delimeter characters.
172 Otherwise, all such lines are printed. This option is valid only
173 with field mode. */
174 @@ -132,6 +206,9 @@
175
176 /* The delimeter character for field mode. */
177 static unsigned char delim;
178 +#if HAVE_WCHAR_H
179 +static wchar_t wcdelim;
180 +#endif
181
182 /* True if the --output-delimiter=STRING option was specified. */
183 static bool output_delimiter_specified;
184 @@ -205,7 +282,7 @@
185 -f, --fields=LIST select only these fields; also print any line\n\
186 that contains no delimiter character, unless\n\
187 the -s option is specified\n\
188 - -n (ignored)\n\
189 + -n with -b: don't split multibyte characters\n\
190 "), stdout);
191 fputs (_("\
192 --complement complement the set of selected bytes, characters\n\
193 @@ -362,7 +439,7 @@
194 in_digits = false;
195 /* Starting a range. */
196 if (dash_found)
197 - FATAL_ERROR (_("invalid byte or field list"));
198 + FATAL_ERROR (_("invalid byte, character or field list"));
199 dash_found = true;
200 fieldstr++;
201
202 @@ -387,14 +464,16 @@
203 if (value == 0)
204 {
205 /* `n-'. From `initial' to end of line. */
206 - eol_range_start = initial;
207 + if (eol_range_start == 0 ||
208 + (eol_range_start != 0 && eol_range_start > initial))
209 + eol_range_start = initial;
210 field_found = true;
211 }
212 else
213 {
214 /* `m-n' or `-n' (1-n). */
215 if (value < initial)
216 - FATAL_ERROR (_("invalid byte or field list"));
217 + FATAL_ERROR (_("invalid byte, character or field list"));
218
219 /* Is there already a range going to end of line? */
220 if (eol_range_start != 0)
221 @@ -467,6 +546,9 @@
222 if (operating_mode == byte_mode)
223 error (0, 0,
224 _("byte offset %s is too large"), quote (bad_num));
225 + else if (operating_mode == character_mode)
226 + error (0, 0,
227 + _("character offset %s is too large"), quote (bad_num));
228 else
229 error (0, 0,
230 _("field number %s is too large"), quote (bad_num));
231 @@ -477,7 +559,7 @@
232 fieldstr++;
233 }
234 else
235 - FATAL_ERROR (_("invalid byte or field list"));
236 + FATAL_ERROR (_("invalid byte, character or field list"));
237 }
238
239 max_range_endpoint = 0;
240 @@ -570,6 +652,63 @@
241 }
242 }
243
244 +#if HAVE_MBRTOWC
245 +/* This function is used in the following two cases.
246 +
247 + 1. Read from the stream STREAM, printing to standard output any selected
248 + characters.
249 +
250 + 2. Read from stream STREAM, printing to standard output any selected bytes,
251 + without splitting multibyte characters. */
252 +
253 +static void
254 +cut_characters_or_cut_bytes_no_split (FILE *stream)
255 +{
256 + int idx; /* Number of bytes or characters in the line so far. */
257 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
258 + char *bufpos; /* Next read position of BUF. */
259 + size_t buflen; /* The length of the byte sequence in buf. */
260 + wint_t wc = 0; /* The wide character just read; preset so it is never read uninitialized. */
261 + size_t mblength; /* The byte length of the multibyte character
262 + that was converted to WC. */
263 + mbstate_t state; /* State of the stream. */
264 + int convfail; /* 1 when the conversion failed, otherwise 0. */
265 +
266 + idx = 0;
267 + buflen = 0;
268 + bufpos = buf;
269 + memset (&state, '\0', sizeof(mbstate_t));
270 +
271 + while (1)
272 + {
273 + REFILL_BUFFER (buf, bufpos, buflen, stream);
274 +
275 + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail);
276 +
277 + if (wc == WEOF)
278 + {
279 + if (idx > 0)
280 + putchar ('\n');
281 + break;
282 + }
283 + else if (!convfail && wc == L'\n') /* After a failed conversion WC may be stale; do not treat it as a newline. */
284 + {
285 + putchar ('\n');
286 + idx = 0;
287 + }
288 + else
289 + {
290 + idx += (operating_mode == byte_mode) ? mblength : 1; /* Count bytes in byte mode, characters otherwise. */
291 + if (print_kth (idx, NULL))
292 + fwrite (bufpos, mblength, sizeof(char), stdout);
293 + }
294 +
295 + buflen -= mblength;
296 + bufpos += mblength;
297 + }
298 +}
299 +#endif
300 +
301 /* Read from stream STREAM, printing to standard output any selected fields. */
302
303 static void
304 @@ -692,13 +831,192 @@
305 }
306 }
307
308 +#if HAVE_MBRTOWC /* Multibyte-aware field cutting: read STREAM and print the selected WCDELIM-separated fields. */
309 +static void
310 +cut_fields_mb (FILE *stream)
311 +{
312 + int c;
313 + unsigned int field_idx;
314 + int found_any_selected_field;
315 + int buffer_first_field;
316 + int empty_input;
317 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
318 + char *bufpos; /* Next read position of BUF. */
319 + size_t buflen; /* The length of the byte sequence in buf. */
320 + wint_t wc = 0; /* The wide character just read. */
321 + size_t mblength; /* The byte length of the multibyte character
322 + that was converted to WC. */
323 + mbstate_t state; /* State of the stream. */
324 + int convfail = 0; /* 1 when the conversion failed, otherwise 0; preset because it is tested even when no character was read. */
325 +
326 + found_any_selected_field = 0;
327 + field_idx = 1;
328 + bufpos = buf;
329 + buflen = 0;
330 + memset (&state, '\0', sizeof(mbstate_t));
331 +
332 + c = getc (stream);
333 + empty_input = (c == EOF);
334 + if (c != EOF)
335 + ungetc (c, stream);
336 + else
337 + wc = WEOF;
338 +
339 + /* To support the semantics of the -s flag, we may have to buffer
340 + all of the first field to determine whether it is `delimited.'
341 + But that is unnecessary if all non-delimited lines must be printed
342 + and the first field has been selected, or if non-delimited lines
343 + must be suppressed and the first field has *not* been selected.
344 + That is because a non-delimited line has exactly one field. */
345 + buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL));
346 +
347 + while (1)
348 + {
349 + if (field_idx == 1 && buffer_first_field)
350 + {
351 + int len = 0;
352 +
353 + while (1)
354 + {
355 + REFILL_BUFFER (buf, bufpos, buflen, stream);
356 +
357 + GET_NEXT_WC_FROM_BUFFER
358 + (wc, bufpos, buflen, mblength, state, convfail);
359 +
360 + if (wc == WEOF)
361 + break;
362 +
363 + field_1_buffer = xrealloc (field_1_buffer, len + mblength);
364 + memcpy (field_1_buffer + len, bufpos, mblength);
365 + len += mblength;
366 + buflen -= mblength;
367 + bufpos += mblength;
368 +
369 + if (!convfail && (wc == L'\n' || wc == wcdelim))
370 + break;
371 + }
372 +
373 + if (wc == WEOF)
374 + break;
375 +
376 + /* If the first field extends to the end of line (it is not
377 + delimited) and we are printing all non-delimited lines,
378 + print this one. */
379 + if (convfail || (!convfail && wc != wcdelim))
380 + {
381 + if (suppress_non_delimited)
382 + {
383 + /* Empty. */
384 + }
385 + else
386 + {
387 + fwrite (field_1_buffer, sizeof (char), len, stdout);
388 + /* Make sure the output line is newline terminated. */
389 + if (convfail || (!convfail && wc != L'\n'))
390 + putchar ('\n');
391 + }
392 + continue;
393 + }
394 +
395 + if (print_kth (1, NULL))
396 + {
397 + /* Print the field, but not the trailing delimiter, which is MBLENGTH bytes long. */
398 + fwrite (field_1_buffer, sizeof (char), len - mblength, stdout);
399 + found_any_selected_field = 1;
400 + }
401 + ++field_idx;
402 + }
403 +
404 + if (wc != WEOF)
405 + {
406 + if (print_kth (field_idx, NULL))
407 + {
408 + if (found_any_selected_field)
409 + {
410 + fwrite (output_delimiter_string, sizeof (char),
411 + output_delimiter_length, stdout);
412 + }
413 + found_any_selected_field = 1;
414 + }
415 +
416 + while (1)
417 + {
418 + REFILL_BUFFER (buf, bufpos, buflen, stream);
419 +
420 + GET_NEXT_WC_FROM_BUFFER
421 + (wc, bufpos, buflen, mblength, state, convfail);
422 +
423 + if (wc == WEOF)
424 + break;
425 + else if (!convfail && (wc == wcdelim || wc == L'\n'))
426 + {
427 + buflen -= mblength;
428 + bufpos += mblength;
429 + break;
430 + }
431 +
432 + if (print_kth (field_idx, NULL))
433 + fwrite (bufpos, mblength, sizeof(char), stdout);
434 +
435 + buflen -= mblength;
436 + bufpos += mblength;
437 + }
438 + }
439 +
440 + if ((!convfail || wc == L'\n') && buflen < 1)
441 + wc = WEOF;
442 +
443 + if (!convfail && wc == wcdelim)
444 + ++field_idx;
445 + else if (wc == WEOF || (!convfail && wc == L'\n'))
446 + {
447 + if (found_any_selected_field
448 + || (!empty_input && !(suppress_non_delimited && field_idx == 1)))
449 + putchar ('\n');
450 + if (wc == WEOF)
451 + break;
452 + field_idx = 1;
453 + found_any_selected_field = 0;
454 + }
455 + }
456 +}
457 +#endif
458 +
459 static void
460 cut_stream (FILE *stream)
461 {
462 - if (operating_mode == byte_mode)
463 - cut_bytes (stream);
464 +#if HAVE_MBRTOWC
465 + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
466 + {
467 + switch (operating_mode)
468 + {
469 + case byte_mode:
470 + if (byte_mode_character_aware)
471 + cut_characters_or_cut_bytes_no_split (stream);
472 + else
473 + cut_bytes (stream);
474 + break;
475 +
476 + case character_mode:
477 + cut_characters_or_cut_bytes_no_split (stream);
478 + break;
479 +
480 + case field_mode:
481 + cut_fields_mb (stream);
482 + break;
483 +
484 + default:
485 + abort ();
486 + }
487 + }
488 else
489 - cut_fields (stream);
490 +#endif
491 + {
492 + if (operating_mode == field_mode)
493 + cut_fields (stream);
494 + else
495 + cut_bytes (stream);
496 + }
497 }
498
499 /* Process file FILE to standard output.
500 @@ -748,6 +1066,8 @@
501 bool ok;
502 bool delim_specified = false;
503 char *spec_list_string IF_LINT(= NULL);
504 + char mbdelim[MB_LEN_MAX + 1];
505 + size_t delimlen = 0;
506
507 initialize_main (&argc, &argv);
508 program_name = argv[0];
509 @@ -770,7 +1090,6 @@
510 switch (optc)
511 {
512 case 'b':
513 - case 'c':
514 /* Build the byte list. */
515 if (operating_mode != undefined_mode)
516 FATAL_ERROR (_("only one type of list may be specified"));
517 @@ -778,6 +1097,14 @@
518 spec_list_string = optarg;
519 break;
520
521 + case 'c':
522 + /* Build the character list. */
523 + if (operating_mode != undefined_mode)
524 + FATAL_ERROR (_("only one type of list may be specified"));
525 + operating_mode = character_mode;
526 + spec_list_string = optarg;
527 + break;
528 +
529 case 'f':
530 /* Build the field list. */
531 if (operating_mode != undefined_mode)
532 @@ -789,10 +1116,35 @@
533 case 'd':
534 /* New delimiter. */
535 /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */
536 - if (optarg[0] != '\0' && optarg[1] != '\0')
537 - FATAL_ERROR (_("the delimiter must be a single character"));
538 - delim = optarg[0];
539 - delim_specified = true;
540 +#if HAVE_MBRTOWC
541 + {
542 + if(MB_CUR_MAX > 1)
543 + {
544 + mbstate_t state;
545 +
546 + memset (&state, '\0', sizeof(mbstate_t));
547 + delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state);
548 +
549 + if (delimlen == (size_t)-1 || delimlen == (size_t)-2)
550 + ++force_singlebyte_mode;
551 + else
552 + {
553 + delimlen = (delimlen < 1) ? 1 : delimlen;
554 + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0')
555 + FATAL_ERROR (_("the delimiter must be a single character"));
556 + memcpy (mbdelim, optarg, delimlen);
557 + }
558 + }
559 +
560 + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
561 +#endif
562 + {
563 + if (optarg[0] != '\0' && optarg[1] != '\0')
564 + FATAL_ERROR (_("the delimiter must be a single character"));
565 + delim = (unsigned char) optarg[0];
566 + }
567 + delim_specified = true;
568 + }
569 break;
570
571 case OUTPUT_DELIMITER_OPTION:
572 @@ -805,6 +1157,7 @@
573 break;
574
575 case 'n':
576 + byte_mode_character_aware = 1;
577 break;
578
579 case 's':
580 @@ -827,7 +1180,7 @@
581 if (operating_mode == undefined_mode)
582 FATAL_ERROR (_("you must specify a list of bytes, characters, or fields"));
583
584 - if (delim != '\0' && operating_mode != field_mode)
585 + if (delim_specified && operating_mode != field_mode)
586 FATAL_ERROR (_("an input delimiter may be specified only\
587 when operating on fields"));
588
589 @@ -854,15 +1207,34 @@
590 }
591
592 if (!delim_specified)
593 - delim = '\t';
594 + {
595 + delim = '\t';
596 +#ifdef HAVE_MBRTOWC
597 + wcdelim = L'\t';
598 + mbdelim[0] = '\t';
599 + mbdelim[1] = '\0';
600 + delimlen = 1;
601 +#endif
602 + }
603
604 if (output_delimiter_string == NULL)
605 {
606 - static char dummy[2];
607 - dummy[0] = delim;
608 - dummy[1] = '\0';
609 - output_delimiter_string = dummy;
610 - output_delimiter_length = 1;
611 +#ifdef HAVE_MBRTOWC
612 + if (MB_CUR_MAX > 1 && !force_singlebyte_mode)
613 + {
614 + output_delimiter_string = xstrdup(mbdelim);
615 + output_delimiter_length = delimlen;
616 + }
617 +
618 + if (MB_CUR_MAX <= 1 || force_singlebyte_mode)
619 +#endif
620 + {
621 + static char dummy[2];
622 + dummy[0] = delim;
623 + dummy[1] = '\0';
624 + output_delimiter_string = dummy;
625 + output_delimiter_length = 1;
626 + }
627 }
628
629 if (optind == argc)
630 diff -Naur coreutils-6.9.orig/src/expand.c coreutils-6.9/src/expand.c
631 --- coreutils-6.9.orig/src/expand.c 2007-03-18 21:36:43.000000000 +0000
632 +++ coreutils-6.9/src/expand.c 2007-04-07 16:59:55.000000000 +0000
633 @@ -38,11 +38,28 @@
634 #include <stdio.h>
635 #include <getopt.h>
636 #include <sys/types.h>
637 +
638 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
639 +#if HAVE_WCHAR_H
640 +# include <wchar.h>
641 +#endif
642 +
643 #include "system.h"
644 #include "error.h"
645 #include "quote.h"
646 #include "xstrndup.h"
647
648 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
649 + installation; work around this configuration error. */
650 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
651 +# define MB_LEN_MAX 16
652 +#endif
653 +
654 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
655 +#if HAVE_MBRTOWC && defined mbstate_t
656 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
657 +#endif
658 +
659 /* The official name of this program (e.g., no `g' prefix). */
660 #define PROGRAM_NAME "expand"
661
662 @@ -183,6 +200,7 @@
663 stops = num_start + len - 1;
664 }
665 }
666 +
667 else
668 {
669 error (0, 0, _("tab size contains invalid character(s): %s"),
670 @@ -365,6 +383,142 @@
671 }
672 }
673
674 +#if HAVE_MBRTOWC /* Multibyte-aware expand: copy every input file to standard output, replacing tabs with spaces. */
675 +static void
676 +expand_multibyte (void)
677 +{
678 + FILE *fp; /* Input stream. */
679 + mbstate_t i_state; /* Current shift state of the input stream. */
680 + mbstate_t i_state_bak; /* Back up the I_STATE. */
681 + mbstate_t o_state; /* Current shift state of the output stream. */
682 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
683 + char *bufpos = buf; /* Next read position of BUF; must be valid before the first memmove. */
684 + size_t buflen = 0; /* The length of the byte sequence in buf. */
685 + wchar_t wc; /* The wide character just read. */
686 + size_t mblength; /* The byte length of the multibyte character
687 + that was converted to WC. */
688 + int tab_index = 0; /* Index in `tab_list' of next tabstop. */
689 + int column = 0; /* Column on screen of the next char. */
690 + int next_tab_column; /* Column the next tab stop is on. */
691 + int convert = 1; /* If nonzero, perform translations. */
692 +
693 + fp = next_file ((FILE *) NULL);
694 + if (fp == NULL)
695 + return;
696 +
697 + memset (&o_state, '\0', sizeof(mbstate_t));
698 + memset (&i_state, '\0', sizeof(mbstate_t));
699 +
700 + for (;;)
701 + {
702 + /* Refill the buffer BUF. */
703 + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp))
704 + {
705 + memmove (buf, bufpos, buflen);
706 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
707 + bufpos = buf;
708 + }
709 +
710 + /* No character is left in BUF. */
711 + if (buflen < 1)
712 + {
713 + fp = next_file (fp);
714 +
715 + if (fp == NULL)
716 + break; /* No more files. */
717 + else
718 + {
719 + memset (&i_state, '\0', sizeof(mbstate_t));
720 + continue;
721 + }
722 + }
723 +
724 + /* Get a wide character. */
725 + i_state_bak = i_state;
726 + mblength = mbrtowc (&wc, bufpos, buflen, &i_state);
727 +
728 + switch (mblength)
729 + {
730 + case (size_t)-1: /* illegal byte sequence. */
731 + case (size_t)-2:
732 + mblength = 1;
733 + i_state = i_state_bak;
734 + if (convert)
735 + {
736 + ++column;
737 + if (convert_entire_line == 0)
738 + convert = 0;
739 + }
740 + putchar (*bufpos);
741 + break;
742 +
743 + case 0: /* null. */
744 + mblength = 1;
745 + if (convert && convert_entire_line == 0)
746 + convert = 0;
747 + putchar ('\0');
748 + break;
749 +
750 + default:
751 + if (wc == L'\n') /* LF. */
752 + {
753 + tab_index = 0;
754 + column = 0;
755 + convert = 1;
756 + putchar ('\n');
757 + }
758 + else if (wc == L'\t' && convert) /* Tab. */
759 + {
760 + if (tab_size == 0)
761 + {
762 + /* Do not let tab_index == first_free_tab;
763 + stop when it is 1 less. */
764 + while (tab_index < first_free_tab - 1
765 + && column >= tab_list[tab_index])
766 + tab_index++;
767 + next_tab_column = tab_list[tab_index];
768 + if (tab_index < first_free_tab - 1)
769 + tab_index++;
770 + if (column >= next_tab_column)
771 + next_tab_column = column + 1;
772 + }
773 + else
774 + next_tab_column = column + tab_size - column % tab_size;
775 +
776 + while (column < next_tab_column)
777 + {
778 + putchar (' ');
779 + ++column;
780 + }
781 + }
782 + else /* Others. */
783 + {
784 + if (convert)
785 + {
786 + if (wc == L'\b')
787 + {
788 + if (column > 0)
789 + --column;
790 + }
791 + else
792 + {
793 + int width; /* The width of WC. */
794 +
795 + width = wcwidth (wc);
796 + column += (width > 0) ? width : 0;
797 + if (convert_entire_line == 0)
798 + convert = 0;
799 + }
800 + }
801 + fwrite (bufpos, sizeof(char), mblength, stdout);
802 + }
803 + }
804 + buflen -= mblength;
805 + bufpos += mblength;
806 + }
807 +}
808 +#endif
809 +
810 int
811 main (int argc, char **argv)
812 {
813 @@ -429,7 +583,12 @@
814
815 file_list = (optind < argc ? &argv[optind] : stdin_argv);
816
817 - expand ();
818 +#if HAVE_MBRTOWC
819 + if (MB_CUR_MAX > 1)
820 + expand_multibyte ();
821 + else
822 +#endif
823 + expand ();
824
825 if (have_read_stdin && fclose (stdin) != 0)
826 error (EXIT_FAILURE, errno, "-");
827 diff -Naur coreutils-6.9.orig/src/fold.c coreutils-6.9/src/fold.c
828 --- coreutils-6.9.orig/src/fold.c 2007-03-18 21:36:43.000000000 +0000
829 +++ coreutils-6.9/src/fold.c 2007-04-07 16:59:55.000000000 +0000
830 @@ -23,11 +23,33 @@
831 #include <getopt.h>
832 #include <sys/types.h>
833
834 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
835 +#if HAVE_WCHAR_H
836 +# include <wchar.h>
837 +#endif
838 +
839 +/* Get iswprint(), iswblank(), wcwidth(). */
840 +#if HAVE_WCTYPE_H
841 +# include <wctype.h>
842 +#endif
843 +
844 #include "system.h"
845 #include "error.h"
846 #include "quote.h"
847 #include "xstrtol.h"
848
849 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
850 + installation; work around this configuration error. */
851 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
852 +# undef MB_LEN_MAX
853 +# define MB_LEN_MAX 16
854 +#endif
855 +
856 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
857 +#if HAVE_MBRTOWC && defined mbstate_t
858 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
859 +#endif
860 +
861 #define TAB_WIDTH 8
862
863 /* The official name of this program (e.g., no `g' prefix). */
864 @@ -35,23 +57,44 @@
865
866 #define AUTHORS "David MacKenzie"
867
868 +#define FATAL_ERROR(Message) \
869 + do \
870 + { \
871 + error (0, 0, (Message)); \
872 + usage (2); \
873 + } \
874 + while (0)
875 +
876 +enum operating_mode
877 +{
878 + /* Fold texts by columns that are at the given positions. */
879 + column_mode,
880 +
881 + /* Fold texts by bytes that are at the given positions. */
882 + byte_mode,
883 +
884 + /* Fold texts by characters that are at the given positions. */
885 + character_mode,
886 +};
887 +
888 /* The name this program was run with. */
889 char *program_name;
890
891 +/* The current operating mode. (Default: column_mode) */
892 +static enum operating_mode operating_mode;
893 +
894 /* If nonzero, try to break on whitespace. */
895 static bool break_spaces;
896
897 -/* If nonzero, count bytes, not column positions. */
898 -static bool count_bytes;
899 -
900 /* If nonzero, at least one of the files we read was standard input. */
901 static bool have_read_stdin;
902
903 -static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
904 +static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
905
906 static struct option const longopts[] =
907 {
908 {"bytes", no_argument, NULL, 'b'},
909 + {"characters", no_argument, NULL, 'c'},
910 {"spaces", no_argument, NULL, 's'},
911 {"width", required_argument, NULL, 'w'},
912 {GETOPT_HELP_OPTION_DECL},
913 @@ -81,6 +124,7 @@
914 "), stdout);
915 fputs (_("\
916 -b, --bytes count bytes rather than columns\n\
917 + -c, --characters count characters rather than columns\n\
918 -s, --spaces break at spaces\n\
919 -w, --width=WIDTH use WIDTH columns instead of 80\n\
920 "), stdout);
921 @@ -98,7 +142,7 @@
922 static size_t
923 adjust_column (size_t column, char c)
924 {
925 - if (!count_bytes)
926 + if (operating_mode != byte_mode)
927 {
928 if (c == '\b')
929 {
930 @@ -117,34 +161,14 @@
931 return column;
932 }
933
934 -/* Fold file FILENAME, or standard input if FILENAME is "-",
935 - to stdout, with maximum line length WIDTH.
936 - Return true if successful. */
937 -
938 -static bool
939 -fold_file (char const *filename, size_t width)
940 +static void
941 +fold_text (FILE *istream, size_t width, int *saved_errno)
942 {
943 - FILE *istream;
944 int c;
945 size_t column = 0; /* Screen column where next char will go. */
946 size_t offset_out = 0; /* Index in `line_out' for next char. */
947 static char *line_out = NULL;
948 static size_t allocated_out = 0;
949 - int saved_errno;
950 -
951 - if (STREQ (filename, "-"))
952 - {
953 - istream = stdin;
954 - have_read_stdin = true;
955 - }
956 - else
957 - istream = fopen (filename, "r");
958 -
959 - if (istream == NULL)
960 - {
961 - error (0, errno, "%s", filename);
962 - return false;
963 - }
964
965 while ((c = getc (istream)) != EOF)
966 {
967 @@ -172,6 +196,15 @@
968 bool found_blank = false;
969 size_t logical_end = offset_out;
970
971 + /* If LINE_OUT has no wide character,
972 + put a new wide character in LINE_OUT
973 + if column is bigger than width. */
974 + if (offset_out == 0)
975 + {
976 + line_out[offset_out++] = c;
977 + continue;
978 + }
979 +
980 /* Look for the last blank. */
981 while (logical_end)
982 {
983 @@ -218,11 +251,225 @@
984 line_out[offset_out++] = c;
985 }
986
987 - saved_errno = errno;
988 + *saved_errno = errno;
989
990 if (offset_out)
991 fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
992
993 + free(line_out);
994 +}
995 +
996 +#if HAVE_MBRTOWC /* Multibyte-aware folding: copy ISTREAM to standard output, breaking lines longer than WIDTH. */
997 +static void
998 +fold_multibyte_text (FILE *istream, int width, int *saved_errno)
999 +{
1000 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
1001 + size_t buflen = 0; /* The length of the byte sequence in buf. */
1002 + char *bufpos = buf; /* Next read position of BUF; must be valid before the first memmove. */
1003 + wint_t wc; /* The wide character just read. */
1004 + size_t mblength; /* The byte length of the multibyte character
1005 + that was converted to WC. */
1006 + mbstate_t state, state_bak; /* State of the stream. */
1007 + int convfail; /* 1 when the conversion failed, otherwise 0. */
1008 +
1009 + char *line_out = NULL;
1010 + size_t offset_out = 0; /* Index in `line_out' for next char. */
1011 + size_t allocated_out = 0;
1012 +
1013 + int increment;
1014 + size_t column = 0;
1015 +
1016 + size_t last_blank_pos;
1017 + size_t last_blank_column;
1018 + int is_blank_seen;
1019 + int last_blank_increment = 0; /* Not reset by CLEAR_FLAGS, so give it a defined start value. */
1020 + int is_bs_following_last_blank;
1021 + size_t bs_following_last_blank_num;
1022 + int is_cr_after_last_blank;
1023 +
1024 +#define CLEAR_FLAGS \
1025 + do \
1026 + { \
1027 + last_blank_pos = 0; \
1028 + last_blank_column = 0; \
1029 + is_blank_seen = 0; \
1030 + is_bs_following_last_blank = 0; \
1031 + bs_following_last_blank_num = 0; \
1032 + is_cr_after_last_blank = 0; \
1033 + } \
1034 + while (0)
1035 +
1036 +#define START_NEW_LINE \
1037 + do \
1038 + { \
1039 + putchar ('\n'); \
1040 + column = 0; \
1041 + offset_out = 0; \
1042 + CLEAR_FLAGS; \
1043 + } \
1044 + while (0)
1045 +
1046 + CLEAR_FLAGS;
1047 + memset (&state, '\0', sizeof(mbstate_t));
1048 +
1049 + for (;; bufpos += mblength, buflen -= mblength)
1050 + {
1051 + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream))
1052 + {
1053 + memmove (buf, bufpos, buflen);
1054 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream);
1055 + bufpos = buf;
1056 + }
1057 +
1058 + if (buflen < 1)
1059 + break;
1060 +
1061 + /* Get a wide character. */
1062 + convfail = 0;
1063 + state_bak = state;
1064 + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &state);
1065 +
1066 + switch (mblength)
1067 + {
1068 + case (size_t)-1:
1069 + case (size_t)-2:
1070 + convfail++;
1071 + state = state_bak;
1072 + /* Fall through. */
1073 +
1074 + case 0:
1075 + mblength = 1;
1076 + break;
1077 + }
1078 +
1079 +rescan:
1080 + if (operating_mode == byte_mode) /* byte mode */
1081 + increment = mblength;
1082 + else if (operating_mode == character_mode) /* character mode */
1083 + increment = 1;
1084 + else /* column mode */
1085 + {
1086 + if (convfail)
1087 + increment = 1;
1088 + else
1089 + {
1090 + switch (wc)
1091 + {
1092 + case L'\n':
1093 + fwrite (line_out, sizeof(char), offset_out, stdout);
1094 + START_NEW_LINE;
1095 + continue;
1096 +
1097 + case L'\b':
1098 + increment = (column > 0) ? -1 : 0;
1099 + break;
1100 +
1101 + case L'\r':
1102 + increment = -1 * column;
1103 + break;
1104 +
1105 + case L'\t':
1106 + increment = TAB_WIDTH - column % TAB_WIDTH;
1107 + break;
1108 +
1109 + default:
1110 + increment = wcwidth (wc);
1111 + increment = (increment < 0) ? 0 : increment;
1112 + }
1113 + }
1114 + }
1115 +
1116 + if (column + increment > width && break_spaces && last_blank_pos)
1117 + {
1118 + fwrite (line_out, sizeof(char), last_blank_pos, stdout);
1119 + putchar ('\n');
1120 +
1121 + offset_out = offset_out - last_blank_pos;
1122 + column = column - last_blank_column + ((is_cr_after_last_blank)
1123 + ? last_blank_increment : bs_following_last_blank_num);
1124 + memmove (line_out, line_out + last_blank_pos, offset_out);
1125 + CLEAR_FLAGS;
1126 + goto rescan;
1127 + }
1128 +
1129 + if (column + increment > width && column != 0)
1130 + {
1131 + fwrite (line_out, sizeof(char), offset_out, stdout);
1132 + START_NEW_LINE;
1133 + goto rescan;
1134 + }
1135 +
1136 + if (allocated_out < offset_out + mblength)
1137 + {
1138 + allocated_out += 1024;
1139 + line_out = xrealloc (line_out, allocated_out);
1140 + }
1141 +
1142 + memcpy (line_out + offset_out, bufpos, mblength);
1143 + offset_out += mblength;
1144 + column += increment;
1145 +
1146 + if (is_blank_seen && !convfail && wc == L'\r')
1147 + is_cr_after_last_blank = 1;
1148 +
1149 + if (is_bs_following_last_blank && !convfail && wc == L'\b')
1150 + ++bs_following_last_blank_num;
1151 + else
1152 + is_bs_following_last_blank = 0;
1153 +
1154 + if (break_spaces && !convfail && iswblank (wc))
1155 + {
1156 + last_blank_pos = offset_out;
1157 + last_blank_column = column;
1158 + is_blank_seen = 1;
1159 + last_blank_increment = increment;
1160 + is_bs_following_last_blank = 1;
1161 + bs_following_last_blank_num = 0;
1162 + is_cr_after_last_blank = 0;
1163 + }
1164 + }
1165 +
1166 + *saved_errno = errno;
1167 +
1168 + if (offset_out)
1169 + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout);
1170 +
1171 + free(line_out);
1172 +}
1173 +#endif
1174 +
1175 +/* Fold file FILENAME, or standard input if FILENAME is "-",
1176 + to stdout, with maximum line length WIDTH.
1177 + Return 0 if successful, 1 if an error occurs. */
1178 +
1179 +static int
1180 +fold_file (char *filename, int width)
1181 +{
1182 + FILE *istream;
1183 + int saved_errno;
1184 +
1185 + if (STREQ (filename, "-"))
1186 + {
1187 + istream = stdin;
1188 + have_read_stdin = 1;
1189 + }
1190 + else
1191 + istream = fopen (filename, "r");
1192 +
1193 + if (istream == NULL)
1194 + {
1195 + error (0, errno, "%s", filename);
1196 + return 1;
1197 + }
1198 +
1199 + /* Define how ISTREAM is being folded. */
1200 +#if HAVE_MBRTOWC
1201 + if (MB_CUR_MAX > 1)
1202 + fold_multibyte_text (istream, width, &saved_errno);
1203 + else
1204 +#endif
1205 + fold_text (istream, width, &saved_errno);
1206 +
1207 if (ferror (istream))
1208 {
1209 error (0, saved_errno, "%s", filename);
1210 @@ -255,7 +502,8 @@
1211
1212 atexit (close_stdout);
1213
1214 - break_spaces = count_bytes = have_read_stdin = false;
1215 + operating_mode = column_mode;
1216 + break_spaces = have_read_stdin = false;
1217
1218 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1)
1219 {
1220 @@ -264,7 +512,15 @@
1221 switch (optc)
1222 {
1223 case 'b': /* Count bytes rather than columns. */
1224 - count_bytes = true;
1225 + if (operating_mode != column_mode)
1226 + FATAL_ERROR (_("only one way of folding may be specified"));
1227 + operating_mode = byte_mode;
1228 + break;
1229 +
1230 + case 'c':
1231 + if (operating_mode != column_mode)
1232 + FATAL_ERROR (_("only one way of folding may be specified"));
1233 + operating_mode = character_mode;
1234 break;
1235
1236 case 's': /* Break at word boundaries. */
1237 diff -Naur coreutils-6.9.orig/src/join.c coreutils-6.9/src/join.c
1238 --- coreutils-6.9.orig/src/join.c 2007-03-18 21:36:43.000000000 +0000
1239 +++ coreutils-6.9/src/join.c 2007-04-07 16:59:55.000000000 +0000
1240 @@ -23,16 +23,30 @@
1241 #include <sys/types.h>
1242 #include <getopt.h>
1243
1244 +/* Get mbstate_t, mbrtowc(), wcrtomb(), wcwidth(). */
1245 +#if HAVE_WCHAR_H
1246 +# include <wchar.h>
1247 +#endif
1248 +
1249 +/* Get iswblank(), towupper(). */
1250 +#if HAVE_WCTYPE_H
1251 +# include <wctype.h>
1252 +#endif
1253 +
1254 #include "system.h"
1255 #include "error.h"
1256 #include "hard-locale.h"
1257 #include "linebuffer.h"
1258 -#include "memcasecmp.h"
1259 #include "quote.h"
1260 #include "stdio--.h"
1261 #include "xmemcoll.h"
1262 #include "xstrtol.h"
1263
1264 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1265 +#if HAVE_MBRTOWC && defined mbstate_t
1266 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1267 +#endif
1268 +
1269 /* The official name of this program (e.g., no `g' prefix). */
1270 #define PROGRAM_NAME "join"
1271
1272 @@ -104,10 +118,12 @@
1273 /* Last element in `outlist', where a new element can be added. */
1274 static struct outlist *outlist_end = &outlist_head;
1275
1276 -/* Tab character separating fields. If negative, fields are separated
1277 - by any nonempty string of blanks, otherwise by exactly one
1278 - tab character whose value (when cast to unsigned char) equals TAB. */
1279 -static int tab = -1;
1280 +/* Tab character separating fields. If NULL, fields are separated
1281 + by any nonempty string of blanks. */
1282 +static char *tab = NULL;
1283 +
1284 +/* The number of bytes used for tab. */
1285 +static size_t tablen = 0;
1286
1287 static struct option const longopts[] =
1288 {
1289 @@ -190,6 +206,8 @@
1290
1291 /* Fill in the `fields' structure in LINE. */
1292
1293 +/* Fill in the `fields' structure in LINE. */
1294 +
1295 static void
1296 xfields (struct line *line)
1297 {
1298 @@ -199,10 +217,11 @@
1299 if (ptr == lim)
1300 return;
1301
1302 - if (0 <= tab)
1303 + if (tab != NULL)
1304 {
1305 + unsigned char t = tab[0];
1306 char *sep;
1307 - for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
1308 + for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
1309 extract_field (line, ptr, sep - ptr);
1310 }
1311 else
1312 @@ -229,6 +248,148 @@
1313 extract_field (line, ptr, lim - ptr);
1314 }
1315
1316 +#if HAVE_MBRTOWC
1317 +static void
1318 +xfields_multibyte (struct line *line)
1319 +{
1320 + char *ptr = line->buf.buffer;
1321 + char const *lim = ptr + line->buf.length - 1; /* Last byte of the buffer. */
1322 + wchar_t wc = 0;
1323 + size_t mblength = 1;
1324 + mbstate_t state, state_bak;
1325 + /* Split LINE into fields like xfields, but decode multibyte characters. */
1326 + memset (&state, 0, sizeof (mbstate_t));
1327 +
1328 + if (ptr == lim)
1329 + return;
1330 +
1331 + if (tab != NULL)
1332 + {
1333 + /* Fields are separated by the TABLEN-byte character stored in TAB. */
1334 + char *sep = ptr;
1335 + for (; ptr < lim; ptr = sep + mblength)
1336 + {
1337 + sep = ptr;
1338 + while (sep < lim)
1339 + {
1340 + state_bak = state;
1341 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1342 +
1343 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1344 + {
1345 + mblength = 1; /* Invalid or truncated: step over one byte. */
1346 + state = state_bak;
1347 + }
1348 + mblength = (mblength < 1) ? 1 : mblength;
1349 +
1350 + if (mblength == tablen && !memcmp (sep, tab, mblength))
1351 + break;
1352 + else
1353 + {
1354 + sep += mblength;
1355 + continue;
1356 + }
1357 + }
1358 +
1359 + if (sep == lim)
1360 + break;
1361 +
1362 + extract_field (line, ptr, sep - ptr);
1363 + }
1364 + }
1365 + else
1366 + {
1367 + /* Skip leading blanks before the first field. */
1368 + while(ptr < lim)
1369 + {
1370 + state_bak = state;
1371 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1372 +
1373 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1374 + {
1375 + mblength = 1;
1376 + state = state_bak;
1377 + wc = L'\0'; /* An undecodable byte is a non-blank. */
1378 + }
1379 + mblength = (mblength < 1) ? 1 : mblength;
1380 +
1381 + if (!iswblank(wc))
1382 + break;
1383 + ptr += mblength;
1384 + }
1385 +
1386 + do
1387 + {
1388 + char *sep;
1389 + state_bak = state;
1390 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1391 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1392 + {
1393 + mblength = 1;
1394 + state = state_bak;
1395 + wc = L'\0'; /* An undecodable byte still starts a field. */
1396 + }
1397 + mblength = (mblength < 1) ? 1 : mblength;
1398 +
1399 + sep = ptr + mblength;
1400 + while (sep != lim)
1401 + {
1402 + state_bak = state;
1403 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1404 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1405 + {
1406 + mblength = 1;
1407 + state = state_bak;
1408 + wc = L'\0'; /* Keep the byte inside the current field. */
1409 + }
1410 + mblength = (mblength < 1) ? 1 : mblength;
1411 +
1412 + if (iswblank (wc))
1413 + break;
1414 +
1415 + sep += mblength;
1416 + }
1417 +
1418 + extract_field (line, ptr, sep - ptr);
1419 + if (sep == lim)
1420 + return;
1421 +
1422 + state_bak = state;
1423 + mblength = mbrtowc (&wc, sep, lim - sep + 1, &state);
1424 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1425 + {
1426 + mblength = 1;
1427 + state = state_bak;
1428 + wc = L'\0'; /* Step over one byte instead of aborting. */
1429 + }
1430 + mblength = (mblength < 1) ? 1 : mblength;
1431 +
1432 + ptr = sep + mblength;
1433 + while (ptr != lim)
1434 + {
1435 + state_bak = state;
1436 + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state);
1437 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1438 + {
1439 + mblength = 1;
1440 + state = state_bak;
1441 + wc = L'\0'; /* Non-blank: the next field starts here. */
1442 + }
1443 + mblength = (mblength < 1) ? 1 : mblength;
1444 +
1445 + if (!iswblank (wc))
1446 + break;
1447 +
1448 + ptr += mblength;
1449 + }
1450 + }
1451 + while (ptr != lim);
1452 + }
1453 +
1454 + extract_field (line, ptr, lim - ptr);
1455 +}
1456 +#endif
1457 +
1458 /* Read a line from FP into LINE and split it into fields.
1459 Return true if successful. */
1460
1461 @@ -249,6 +410,11 @@
1462 line->nfields_allocated = 0;
1463 line->nfields = 0;
1464 line->fields = NULL;
1465 +#if HAVE_MBRTOWC
1466 + if (MB_CUR_MAX > 1)
1467 + xfields_multibyte (line);
1468 + else
1469 +#endif
1470 xfields (line);
1471 return true;
1472 }
1473 @@ -303,56 +469,114 @@
1474 keycmp (struct line const *line1, struct line const *line2)
1475 {
1476 /* Start of field to compare in each file. */
1477 - char *beg1;
1478 - char *beg2;
1479 -
1480 - size_t len1;
1481 - size_t len2; /* Length of fields to compare. */
1482 + char *beg[2];
1483 + char *copy[2]; /* What is compared: BEG itself, or upper-cased copies. */
1484 + size_t len[2]; /* Length of fields to compare. */
1485 int diff;
1486 + size_t i, j;
1487
1488 if (join_field_1 < line1->nfields)
1489 {
1490 - beg1 = line1->fields[join_field_1].beg;
1491 - len1 = line1->fields[join_field_1].len;
1492 + beg[0] = line1->fields[join_field_1].beg;
1493 + len[0] = line1->fields[join_field_1].len;
1494 }
1495 else
1496 {
1497 - beg1 = NULL;
1498 - len1 = 0;
1499 + beg[0] = NULL;
1500 + len[0] = 0;
1501 }
1502
1503 if (join_field_2 < line2->nfields)
1504 {
1505 - beg2 = line2->fields[join_field_2].beg;
1506 - len2 = line2->fields[join_field_2].len;
1507 + beg[1] = line2->fields[join_field_2].beg;
1508 + len[1] = line2->fields[join_field_2].len;
1509 }
1510 else
1511 {
1512 - beg2 = NULL;
1513 - len2 = 0;
1514 + beg[1] = NULL;
1515 + len[1] = 0;
1516 }
1517
1518 - if (len1 == 0)
1519 - return len2 == 0 ? 0 : -1;
1520 - if (len2 == 0)
1521 + if (len[0] == 0)
1522 + return len[1] == 0 ? 0 : -1;
1523 + if (len[1] == 0)
1524 return 1;
1525
1526 if (ignore_case)
1527 {
1528 - /* FIXME: ignore_case does not work with NLS (in particular,
1529 - with multibyte chars). */
1530 - diff = memcasecmp (beg1, beg2, MIN (len1, len2));
1531 +#ifdef HAVE_MBRTOWC
1532 + if (MB_CUR_MAX > 1)
1533 + {
1534 + size_t mblength;
1535 + wchar_t wc, uwc;
1536 + mbstate_t state, state_bak;
1537 +
1538 + memset (&state, '\0', sizeof (mbstate_t));
1539 +
1540 + for (i = 0; i < 2; i++)
1541 + {
1542 + copy[i] = alloca (len[i] + 1);
1543 +
1544 + for (j = 0; j < len[i];) /* Whole field: xmemcoll reads len[i] bytes. */
1545 + {
1546 + state_bak = state;
1547 + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state);
1548 +
1549 + switch (mblength)
1550 + {
1551 + case (size_t) -1:
1552 + case (size_t) -2:
1553 + state = state_bak;
1554 + /* Fall through */
1555 + case 0:
1556 + mblength = 1;
1557 + copy[i][j] = beg[i][j]; /* Keep the raw byte unchanged. */
1558 + break;
1559 +
1560 + default: /* Fold case only when the byte length is unchanged. */
1561 + {
1562 + char ubuf[MB_LEN_MAX];
1563 + mbstate_t state_wc;
1564 + memset (&state_wc, '\0', sizeof (mbstate_t));
1565 + uwc = towupper (wc);
1566 + if (uwc != wc && wcrtomb (ubuf, uwc, &state_wc) == mblength)
1567 + memcpy (copy[i] + j, ubuf, mblength);
1568 + else
1569 + memcpy (copy[i] + j, beg[i] + j, mblength);
1570 + }
1571 + }
1572 + j += mblength;
1573 + }
1574 + copy[i][j] = '\0';
1575 + }
1576 + }
1577 + else
1578 +#endif
1579 + {
1580 + for (i = 0; i < 2; i++)
1581 + {
1582 + copy[i] = alloca (len[i] + 1);
1583 +
1584 + for (j = 0; j < len[i]; j++)
1585 + copy[i][j] = toupper ((unsigned char) beg[i][j]);
1586 +
1587 + copy[i][j] = '\0';
1588 + }
1589 + }
1590 }
1591 else
1592 {
1593 - if (hard_LC_COLLATE)
1594 - return xmemcoll (beg1, len1, beg2, len2);
1595 - diff = memcmp (beg1, beg2, MIN (len1, len2));
1596 + copy[0] = beg[0];
1597 + copy[1] = beg[1];
1598 }
1599
1600 + if (hard_LC_COLLATE)
1601 + return xmemcoll ((char *) copy[0], len[0], (char *) copy[1], len[1]);
1602 + diff = memcmp (copy[0], copy[1], MIN (len[0], len[1]));
1603 +
1604 if (diff)
1605 return diff;
1606 - return len1 < len2 ? -1 : len1 != len2;
1607 + return len[0] < len[1] ? -1 : len[0] != len[1];
1608 }
1609
1610 /* Print field N of LINE if it exists and is nonempty, otherwise
1611 @@ -377,11 +601,18 @@
1612
1613 /* Print the join of LINE1 and LINE2. */
1614
1615 +#define PUT_TAB_CHAR /* Write TAB (TABLEN bytes), or a space if TAB is NULL. */ \
1616 + do \
1617 + { \
1618 + if (tab == NULL) putchar (' '); \
1619 + else fwrite (tab, sizeof (char), tablen, stdout); \
1620 + } \
1621 + while (0)
1622 +
1623 static void
1624 prjoin (struct line const *line1, struct line const *line2)
1625 {
1626 const struct outlist *outlist;
1627 - char output_separator = tab < 0 ? ' ' : tab;
1628
1629 outlist = outlist_head.next;
1630 if (outlist)
1631 @@ -397,12 +628,12 @@
1632 if (o->file == 0)
1633 {
1634 if (line1 == &uni_blank)
1635 - {
1636 + {
1637 line = line2;
1638 field = join_field_2;
1639 }
1640 else
1641 - {
1642 + {
1643 line = line1;
1644 field = join_field_1;
1645 }
1646 @@ -416,7 +647,7 @@
1647 o = o->next;
1648 if (o == NULL)
1649 break;
1650 - putchar (output_separator);
1651 + PUT_TAB_CHAR;
1652 }
1653 putchar ('\n');
1654 }
1655 @@ -434,23 +665,23 @@
1656 prfield (join_field_1, line1);
1657 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
1658 {
1659 - putchar (output_separator);
1660 + PUT_TAB_CHAR;
1661 prfield (i, line1);
1662 }
1663 for (i = join_field_1 + 1; i < line1->nfields; ++i)
1664 {
1665 - putchar (output_separator);
1666 + PUT_TAB_CHAR;
1667 prfield (i, line1);
1668 }
1669
1670 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
1671 {
1672 - putchar (output_separator);
1673 + PUT_TAB_CHAR;
1674 prfield (i, line2);
1675 }
1676 for (i = join_field_2 + 1; i < line2->nfields; ++i)
1677 {
1678 - putchar (output_separator);
1679 + PUT_TAB_CHAR;
1680 prfield (i, line2);
1681 }
1682 putchar ('\n');
1683 @@ -859,20 +1090,41 @@
1684
1685 case 't':
1686 {
1687 - unsigned char newtab = optarg[0];
1688 - if (! newtab)
1689 + char *newtab;
1690 + size_t newtablen;
1691 + if (! optarg[0])
1692 error (EXIT_FAILURE, 0, _("empty tab"));
1693 - if (optarg[1])
1694 + newtab = xstrdup (optarg);
1695 +#if HAVE_MBRTOWC
1696 + if (MB_CUR_MAX > 1)
1697 + {
1698 + mbstate_t state;
1699 +
1700 + memset (&state, 0, sizeof (mbstate_t));
1701 + newtablen = mbrtowc (NULL, newtab,
1702 + strnlen (newtab, MB_LEN_MAX),
1703 + &state);
1704 + if (newtablen == (size_t) 0
1705 + || newtablen == (size_t) -1
1706 + || newtablen == (size_t) -2)
1707 + newtablen = 1;
1708 + }
1709 + else
1710 +#endif
1711 + newtablen = 1;
1712 +
1713 + if (newtablen == 1 && newtab[1])
1714 + {
1715 + if (STREQ (newtab, "\\0"))
1716 + newtab[0] = '\0';
1717 + }
1718 + if (tab != NULL && strcmp (tab, newtab))
1719 {
1720 - if (STREQ (optarg, "\\0"))
1721 - newtab = '\0';
1722 - else
1723 - error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1724 - quote (optarg));
1725 + free (newtab);
1726 + error (EXIT_FAILURE, 0, _("incompatible tabs"));
1727 }
1728 - if (0 <= tab && tab != newtab)
1729 - error (EXIT_FAILURE, 0, _("incompatible tabs"));
1730 tab = newtab;
1731 + tablen = newtablen;
1732 }
1733 break;
1734
1735 diff -Naur coreutils-6.9.orig/src/pr.c coreutils-6.9/src/pr.c
1736 --- coreutils-6.9.orig/src/pr.c 2007-03-18 21:36:43.000000000 +0000
1737 +++ coreutils-6.9/src/pr.c 2007-04-07 16:59:55.000000000 +0000
1738 @@ -313,6 +313,32 @@
1739
1740 #include <getopt.h>
1741 #include <sys/types.h>
1742 +
1743 +/* Get MB_LEN_MAX. */
1744 +#include <limits.h>
1745 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
1746 + installation; work around this configuration error. */
1747 +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
1748 +# define MB_LEN_MAX 16
1749 +#endif
1750 +
1751 +/* Get MB_CUR_MAX. */
1752 +#include <stdlib.h>
1753 +
1754 +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
1755 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
1756 +#if HAVE_WCHAR_H
1757 +# include <wchar.h>
1758 +#endif
1759 +
1760 +/* Get iswprint(). -- for wcwidth(). */
1761 +#if HAVE_WCTYPE_H
1762 +# include <wctype.h>
1763 +#endif
1764 +#if !defined iswprint && !HAVE_ISWPRINT
1765 +# define iswprint(wc) 1
1766 +#endif
1767 +
1768 #include "system.h"
1769 #include "error.h"
1770 #include "hard-locale.h"
1771 @@ -324,6 +350,18 @@
1772 #include "strftime.h"
1773 #include "xstrtol.h"
1774
1775 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
1776 +#if HAVE_MBRTOWC && defined mbstate_t
1777 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
1778 +#endif
1779 +
1780 +#ifndef HAVE_DECL_WCWIDTH
1781 +"this configure-time declaration test was not run"
1782 +#endif
1783 +#if !HAVE_DECL_WCWIDTH
1784 +extern int wcwidth ();
1785 +#endif
1786 +
1787 /* The official name of this program (e.g., no `g' prefix). */
1788 #define PROGRAM_NAME "pr"
1789
1790 @@ -416,7 +454,20 @@
1791
1792 #define NULLCOL (COLUMN *)0
1793
1794 -static int char_to_clump (char c);
1795 +/* Function pointers to switch functions for single byte locale or for
1796 + multibyte locale. If multibyte functions do not exist in your system,
1797 + these pointers always point to the functions for single byte locale. */
1798 +static void (*print_char) (char c);
1799 +static int (*char_to_clump) (char c);
1800 +
1801 +/* Functions for single byte locale. */
1802 +static void print_char_single (char c);
1803 +static int char_to_clump_single (char c);
1804 +
1805 +/* Functions for multibyte locale. */
1806 +static void print_char_multi (char c);
1807 +static int char_to_clump_multi (char c);
1808 +
1809 static bool read_line (COLUMN *p);
1810 static bool print_page (void);
1811 static bool print_stored (COLUMN *p);
1812 @@ -426,6 +477,7 @@
1813 static void pad_across_to (int position);
1814 static void add_line_number (COLUMN *p);
1815 static void getoptarg (char *arg, char switch_char, char *character,
1816 + int *character_length, int *character_width,
1817 int *number);
1818 void usage (int status);
1819 static void print_files (int number_of_files, char **av);
1820 @@ -440,7 +492,6 @@
1821 static void pad_down (int lines);
1822 static void read_rest_of_line (COLUMN *p);
1823 static void skip_read (COLUMN *p, int column_number);
1824 -static void print_char (char c);
1825 static void cleanup (void);
1826 static void print_sep_string (void);
1827 static void separator_string (const char *optarg_S);
1828 @@ -455,7 +506,7 @@
1829 we store the leftmost columns contiguously in buff.
1830 To print a line from buff, get the index of the first character
1831 from line_vector[i], and print up to line_vector[i + 1]. */
1832 -static char *buff;
1833 +static unsigned char *buff;
1834
1835 /* Index of the position in buff where the next character
1836 will be stored. */
1837 @@ -559,7 +610,7 @@
1838 static bool untabify_input = false;
1839
1840 /* (-e) The input tab character. */
1841 -static char input_tab_char = '\t';
1842 +static char input_tab_char[MB_LEN_MAX] = "\t";
1843
1844 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ...
1845 where the leftmost column is 1. */
1846 @@ -569,7 +620,10 @@
1847 static bool tabify_output = false;
1848
1849 /* (-i) The output tab character. */
1850 -static char output_tab_char = '\t';
1851 +static char output_tab_char[MB_LEN_MAX] = "\t";
1852 +
1853 +/* (-i) The byte length of output tab character. */
1854 +static int output_tab_char_length = 1;
1855
1856 /* (-i) The width of the output tab. */
1857 static int chars_per_output_tab = 8;
1858 @@ -643,7 +697,13 @@
1859 static bool numbered_lines = false;
1860
1861 /* (-n) Character which follows each line number. */
1862 -static char number_separator = '\t';
1863 +static char number_separator[MB_LEN_MAX] = "\t";
1864 +
1865 +/* (-n) The byte length of the character which follows each line number. */
1866 +static int number_separator_length = 1;
1867 +
1868 +/* (-n) The character width of the character which follows each line number. */
1869 +static int number_separator_width = 0;
1870
1871 /* (-n) line counting starts with 1st line of input file (not with 1st
1872 line of 1st page printed). */
1873 @@ -696,6 +756,7 @@
1874 -a|COLUMN|-m is a `space' and with the -J option a `tab'. */
1875 static char *col_sep_string = "";
1876 static int col_sep_length = 0;
1877 +static int col_sep_width = 0;
1878 static char *column_separator = " ";
1879 static char *line_separator = "\t";
1880
1881 @@ -852,6 +913,13 @@
1882 col_sep_length = (int) strlen (optarg_S);
1883 col_sep_string = xmalloc (col_sep_length + 1);
1884 strcpy (col_sep_string, optarg_S);
1885 +
1886 +#if HAVE_MBRTOWC
1887 + if (MB_CUR_MAX > 1)
1888 + col_sep_width = mbswidth (col_sep_string, 0);
1889 + else
1890 +#endif
1891 + col_sep_width = col_sep_length;
1892 }
1893
1894 int
1895 @@ -877,6 +945,21 @@
1896
1897 atexit (close_stdout);
1898
1899 +/* Define which functions are used, the ones for single byte locale or the ones
1900 + for multibyte locale. */
1901 +#if HAVE_MBRTOWC
1902 + if (MB_CUR_MAX > 1)
1903 + {
1904 + print_char = print_char_multi;
1905 + char_to_clump = char_to_clump_multi;
1906 + }
1907 + else
1908 +#endif
1909 + {
1910 + print_char = print_char_single;
1911 + char_to_clump = char_to_clump_single;
1912 + }
1913 +
1914 n_files = 0;
1915 file_names = (argc > 1
1916 ? xmalloc ((argc - 1) * sizeof (char *))
1917 @@ -949,8 +1032,12 @@
1918 break;
1919 case 'e':
1920 if (optarg)
1921 - getoptarg (optarg, 'e', &input_tab_char,
1922 - &chars_per_input_tab);
1923 + {
1924 + int dummy_length, dummy_width;
1925 +
1926 + getoptarg (optarg, 'e', input_tab_char, &dummy_length,
1927 + &dummy_width, &chars_per_input_tab);
1928 + }
1929 /* Could check tab width > 0. */
1930 untabify_input = true;
1931 break;
1932 @@ -963,8 +1050,12 @@
1933 break;
1934 case 'i':
1935 if (optarg)
1936 - getoptarg (optarg, 'i', &output_tab_char,
1937 - &chars_per_output_tab);
1938 + {
1939 + int dummy_width;
1940 +
1941 + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length,
1942 + &dummy_width, &chars_per_output_tab);
1943 + }
1944 /* Could check tab width > 0. */
1945 tabify_output = true;
1946 break;
1947 @@ -991,8 +1082,8 @@
1948 case 'n':
1949 numbered_lines = true;
1950 if (optarg)
1951 - getoptarg (optarg, 'n', &number_separator,
1952 - &chars_per_number);
1953 + getoptarg (optarg, 'n', number_separator, &number_separator_length,
1954 + &number_separator_width, &chars_per_number);
1955 break;
1956 case 'N':
1957 skip_count = false;
1958 @@ -1031,7 +1122,7 @@
1959 old_s = false;
1960 /* Reset an additional input of -s, -S dominates -s */
1961 col_sep_string = "";
1962 - col_sep_length = 0;
1963 + col_sep_length = col_sep_width = 0;
1964 use_col_separator = true;
1965 if (optarg)
1966 separator_string (optarg);
1967 @@ -1188,10 +1279,45 @@
1968 a number. */
1969
1970 static void
1971 -getoptarg (char *arg, char switch_char, char *character, int *number)
1972 +getoptarg (char *arg, char switch_char, char *character, int *character_length,
1973 + int *character_width, int *number)
1974 {
1975 if (!ISDIGIT (*arg))
1976 - *character = *arg++;
1977 + {
1978 +#ifdef HAVE_MBRTOWC
1979 + if (MB_CUR_MAX > 1) /* for multibyte locale. */
1980 + {
1981 + wchar_t wc;
1982 + size_t mblength;
1983 + int width;
1984 + mbstate_t state = {'\0'};
1985 +
1986 + mblength = mbrtowc (&wc, arg, strnlen(arg, MB_LEN_MAX), &state);
1987 +
1988 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
1989 + {
1990 + *character_length = 1;
1991 + *character_width = 1;
1992 + }
1993 + else
1994 + {
1995 + *character_length = (mblength < 1) ? 1 : mblength;
1996 + width = wcwidth (wc);
1997 + *character_width = (width < 0) ? 0 : width;
1998 + }
1999 +
2000 + strncpy (character, arg, *character_length);
2001 + arg += *character_length;
2002 + }
2003 + else /* for single byte locale. */
2004 +#endif
2005 + {
2006 + *character = *arg++;
2007 + *character_length = 1;
2008 + *character_width = 1;
2009 + }
2010 + }
2011 +
2012 if (*arg)
2013 {
2014 long int tmp_long;
2015 @@ -1256,7 +1382,7 @@
2016 else
2017 col_sep_string = column_separator;
2018
2019 - col_sep_length = 1;
2020 + col_sep_length = col_sep_width = 1;
2021 use_col_separator = true;
2022 }
2023 /* It's rather pointless to define a TAB separator with column
2024 @@ -1287,11 +1413,11 @@
2025 TAB_WIDTH (chars_per_input_tab, chars_per_number); */
2026
2027 /* Estimate chars_per_text without any margin and keep it constant. */
2028 - if (number_separator == '\t')
2029 + if (number_separator[0] == '\t')
2030 number_width = chars_per_number +
2031 TAB_WIDTH (chars_per_default_tab, chars_per_number);
2032 else
2033 - number_width = chars_per_number + 1;
2034 + number_width = chars_per_number + number_separator_width;
2035
2036 /* The number is part of the column width unless we are
2037 printing files in parallel. */
2038 @@ -1306,7 +1432,7 @@
2039 }
2040
2041 chars_per_column = (chars_per_line - chars_used_by_number -
2042 - (columns - 1) * col_sep_length) / columns;
2043 + (columns - 1) * col_sep_width) / columns;
2044
2045 if (chars_per_column < 1)
2046 error (EXIT_FAILURE, 0, _("page width too narrow"));
2047 @@ -1431,7 +1557,7 @@
2048
2049 /* Enlarge p->start_position of first column to use the same form of
2050 padding_not_printed with all columns. */
2051 - h = h + col_sep_length;
2052 + h = h + col_sep_width;
2053
2054 /* This loop takes care of all but the rightmost column. */
2055
2056 @@ -1465,7 +1591,7 @@
2057 }
2058 else
2059 {
2060 - h = h_next + col_sep_length;
2061 + h = h_next + col_sep_width;
2062 h_next = h + chars_per_column;
2063 }
2064 }
2065 @@ -1755,9 +1881,9 @@
2066 align_column (COLUMN *p)
2067 {
2068 padding_not_printed = p->start_position;
2069 - if (padding_not_printed - col_sep_length > 0)
2070 + if (padding_not_printed - col_sep_width > 0)
2071 {
2072 - pad_across_to (padding_not_printed - col_sep_length);
2073 + pad_across_to (padding_not_printed - col_sep_width);
2074 padding_not_printed = ANYWHERE;
2075 }
2076
2077 @@ -2028,13 +2154,13 @@
2078 /* May be too generous. */
2079 buff = X2REALLOC (buff, &buff_allocated);
2080 }
2081 - buff[buff_current++] = c;
2082 + buff[buff_current++] = (unsigned char) c;
2083 }
2084
2085 static void
2086 add_line_number (COLUMN *p)
2087 {
2088 - int i;
2089 + int i, j;
2090 char *s;
2091 int left_cut;
2092
2093 @@ -2057,22 +2183,24 @@
2094 /* Tabification is assumed for multiple columns, also for n-separators,
2095 but `default n-separator = TAB' hasn't been given priority over
2096 equal column_width also specified by POSIX. */
2097 - if (number_separator == '\t')
2098 + if (number_separator[0] == '\t')
2099 {
2100 i = number_width - chars_per_number;
2101 while (i-- > 0)
2102 (p->char_func) (' ');
2103 }
2104 else
2105 - (p->char_func) (number_separator);
2106 + for (j = 0; j < number_separator_length; j++)
2107 + (p->char_func) (number_separator[j]);
2108 }
2109 else
2110 /* To comply with POSIX, we avoid any expansion of default TAB
2111 separator with a single column output. No column_width requirement
2112 has to be considered. */
2113 {
2114 - (p->char_func) (number_separator);
2115 - if (number_separator == '\t')
2116 + for (j = 0; j < number_separator_length; j++)
2117 + (p->char_func) (number_separator[j]);
2118 + if (number_separator[0] == '\t')
2119 output_position = POS_AFTER_TAB (chars_per_output_tab,
2120 output_position);
2121 }
2122 @@ -2233,7 +2361,7 @@
2123 while (goal - h_old > 1
2124 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal)
2125 {
2126 - putchar (output_tab_char);
2127 + fwrite (output_tab_char, sizeof(char), output_tab_char_length, stdout);
2128 h_old = h_new;
2129 }
2130 while (++h_old <= goal)
2131 @@ -2253,6 +2381,7 @@
2132 {
2133 char *s;
2134 int l = col_sep_length;
2135 + int not_space_flag;
2136
2137 s = col_sep_string;
2138
2139 @@ -2266,6 +2395,7 @@
2140 {
2141 for (; separators_not_printed > 0; --separators_not_printed)
2142 {
2143 + not_space_flag = 0;
2144 while (l-- > 0)
2145 {
2146 /* 3 types of sep_strings: spaces only, spaces and chars,
2147 @@ -2279,12 +2409,15 @@
2148 }
2149 else
2150 {
2151 + not_space_flag = 1;
2152 if (spaces_not_printed > 0)
2153 print_white_space ();
2154 putchar (*s++);
2155 - ++output_position;
2156 }
2157 }
2158 + if (not_space_flag)
2159 + output_position += col_sep_width;
2160 +
2161 /* sep_string ends with some spaces */
2162 if (spaces_not_printed > 0)
2163 print_white_space ();
2164 @@ -2312,7 +2445,7 @@
2165 required number of tabs and spaces. */
2166
2167 static void
2168 -print_char (char c)
2169 +print_char_single (char c)
2170 {
2171 if (tabify_output)
2172 {
2173 @@ -2336,6 +2469,74 @@
2174 putchar (c);
2175 }
2176
2177 +#ifdef HAVE_MBRTOWC
2178 +static void
2179 +print_char_multi (char c)
2180 +{
2181 + static size_t mbc_pos = 0;
2182 + static unsigned char mbc[MB_LEN_MAX] = {'\0'};
2183 + static mbstate_t state = {'\0'};
2184 + mbstate_t state_bak;
2185 + wchar_t wc;
2186 + size_t mblength;
2187 + int width;
2188 + /* Buffer C in MBC until mbrtowc can decode a whole character. */
2189 + if (tabify_output)
2190 + {
2191 + mbc[mbc_pos++] = (unsigned char)c;
2192 +
2193 + while (mbc_pos > 0)
2194 + {
2195 + state_bak = state;
2196 + mblength = mbrtowc (&wc, (char *) mbc, mbc_pos, &state);
2197 + switch (mblength)
2198 + {
2199 + case (size_t)-2: /* Incomplete character: wait for more bytes. */
2200 + state = state_bak;
2201 + return;
2202 + case (size_t)-1: /* Invalid byte: emit it, then re-decode the rest. */
2203 + state = state_bak;
2204 + if (spaces_not_printed > 0)
2205 + print_white_space ();
2206 + ++output_position;
2207 + putchar (mbc[0]);
2208 + memmove (mbc, mbc + 1, MB_CUR_MAX - 1);
2209 + --mbc_pos;
2210 + break;
2211 + case 0:
2212 + mblength = 1;
2213 + /* Fall through. */
2214 + default:
2215 + if (wc == L' ')
2216 + {
2217 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2218 + mbc_pos -= mblength;
2219 + ++spaces_not_printed;
2220 + return;
2221 + }
2222 + else if (spaces_not_printed > 0)
2223 + print_white_space ();
2224 +
2225 + /* Nonprintables are assumed to have width 0, except L'\b'. */
2226 + if ((width = wcwidth (wc)) < 1)
2227 + {
2228 + if (wc == L'\b')
2229 + --output_position;
2230 + }
2231 + else
2232 + output_position += width;
2233 +
2234 + fwrite (mbc, sizeof(char), mblength, stdout);
2235 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength);
2236 + mbc_pos -= mblength;
2237 + }
2238 + }
2239 + return;
2240 + }
2241 + putchar (c);
2242 +}
2243 +#endif
2244 +
2245 /* Skip to page PAGE before printing.
2246 PAGE may be larger than total number of pages. */
2247
2248 @@ -2516,9 +2717,9 @@
2249 align_empty_cols = false;
2250 }
2251
2252 - if (padding_not_printed - col_sep_length > 0)
2253 + if (padding_not_printed - col_sep_width > 0)
2254 {
2255 - pad_across_to (padding_not_printed - col_sep_length);
2256 + pad_across_to (padding_not_printed - col_sep_width);
2257 padding_not_printed = ANYWHERE;
2258 }
2259
2260 @@ -2619,9 +2820,9 @@
2261 }
2262 }
2263
2264 - if (padding_not_printed - col_sep_length > 0)
2265 + if (padding_not_printed - col_sep_width > 0)
2266 {
2267 - pad_across_to (padding_not_printed - col_sep_length);
2268 + pad_across_to (padding_not_printed - col_sep_width);
2269 padding_not_printed = ANYWHERE;
2270 }
2271
2272 @@ -2634,8 +2835,8 @@
2273 if (spaces_not_printed == 0)
2274 {
2275 output_position = p->start_position + end_vector[line];
2276 - if (p->start_position - col_sep_length == chars_per_margin)
2277 - output_position -= col_sep_length;
2278 + if (p->start_position - col_sep_width == chars_per_margin)
2279 + output_position -= col_sep_width;
2280 }
2281
2282 return true;
2283 @@ -2654,7 +2855,7 @@
2284 number of characters is 1.) */
2285
2286 static int
2287 -char_to_clump (char c)
2288 +char_to_clump_single (char c)
2289 {
2290 unsigned char uc = c;
2291 char *s = clump_buff;
2292 @@ -2664,10 +2865,10 @@
2293 int chars;
2294 int chars_per_c = 8;
2295
2296 - if (c == input_tab_char)
2297 + if (c == input_tab_char[0])
2298 chars_per_c = chars_per_input_tab;
2299
2300 - if (c == input_tab_char || c == '\t')
2301 + if (c == input_tab_char[0] || c == '\t')
2302 {
2303 width = TAB_WIDTH (chars_per_c, input_position);
2304
2305 @@ -2738,6 +2939,154 @@
2306 return chars;
2307 }
2308
2309 +#ifdef HAVE_MBRTOWC /* Multibyte counterpart of char_to_clump_single. */
2310 +static int
2311 +char_to_clump_multi (char c)
2312 +{
2313 + static size_t mbc_pos = 0; /* Bytes currently held in MBC. */
2314 + static char mbc[MB_LEN_MAX] = {'\0'}; /* Bytes of a not yet complete character. */
2315 + static mbstate_t state = {'\0'};
2316 + mbstate_t state_bak;
2317 + wchar_t wc;
2318 + size_t mblength;
2319 + int wc_width;
2320 + register char *s = clump_buff; /* Same element type as in char_to_clump_single. */
2321 + register int i, j;
2322 + char esc_buff[4]; /* Three octal digits plus the terminating NUL. */
2323 + int width;
2324 + int chars;
2325 + int chars_per_c = 8;
2326 + /* Append C to MBC and try to decode; return 0 while the character is incomplete. */
2327 + state_bak = state;
2328 + mbc[mbc_pos++] = c;
2329 + mblength = mbrtowc (&wc, mbc, mbc_pos, &state);
2330 +
2331 + width = 0;
2332 + chars = 0;
2333 + while (mbc_pos > 0)
2334 + {
2335 + switch (mblength)
2336 + {
2337 + case (size_t)-2: /* Incomplete: wait for more bytes. */
2338 + state = state_bak;
2339 + return 0;
2340 +
2341 + case (size_t)-1: /* Invalid: emit the first byte on its own. */
2342 + state = state_bak;
2343 + mblength = 1;
2344 +
2345 + if (use_esc_sequence || use_cntrl_prefix)
2346 + {
2347 + width += 4;
2348 + chars += 4;
2349 + *s++ = '\\';
2350 + sprintf (esc_buff, "%03o", (unsigned char) mbc[0]); /* Cast keeps it to 3 digits. */
2351 + for (i = 0; i <= 2; ++i)
2352 + *s++ = (int) esc_buff[i];
2353 + }
2354 + else
2355 + {
2356 + width += 1;
2357 + chars += 1;
2358 + *s++ = mbc[0];
2359 + }
2360 + break;
2361 +
2362 + case 0: /* NUL character: treat as one byte. */
2363 + mblength = 1;
2364 + /* Fall through */
2365 +
2366 + default:
2367 + if (memcmp (mbc, input_tab_char, mblength) == 0)
2368 + chars_per_c = chars_per_input_tab;
2369 +
2370 + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t')
2371 + {
2372 + int width_inc;
2373 +
2374 + width_inc = TAB_WIDTH (chars_per_c, input_position);
2375 + width += width_inc;
2376 +
2377 + if (untabify_input)
2378 + {
2379 + for (i = width_inc; i; --i)
2380 + *s++ = ' ';
2381 + chars += width_inc;
2382 + }
2383 + else
2384 + {
2385 + for (i = 0; i < mblength; i++)
2386 + *s++ = mbc[i];
2387 + chars += mblength;
2388 + }
2389 + }
2390 + else if ((wc_width = wcwidth (wc)) < 1) /* Zero-width or non-printable. */
2391 + {
2392 + if (use_esc_sequence)
2393 + {
2394 + for (i = 0; i < mblength; i++)
2395 + {
2396 + width += 4;
2397 + chars += 4;
2398 + *s++ = '\\';
2399 + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); /* Escape each byte in turn. */
2400 + for (j = 0; j <= 2; ++j)
2401 + *s++ = (int) esc_buff[j];
2402 + }
2403 + }
2404 + else if (use_cntrl_prefix)
2405 + {
2406 + if (wc < 0200)
2407 + {
2408 + width += 2;
2409 + chars += 2;
2410 + *s++ = '^';
2411 + *s++ = wc ^ 0100;
2412 + }
2413 + else
2414 + {
2415 + for (i = 0; i < mblength; i++)
2416 + {
2417 + width += 4;
2418 + chars += 4;
2419 + *s++ = '\\';
2420 + sprintf (esc_buff, "%03o", (unsigned char) mbc[i]); /* Escape each byte in turn. */
2421 + for (j = 0; j <= 2; ++j)
2422 + *s++ = (int) esc_buff[j];
2423 + }
2424 + }
2425 + }
2426 + else if (wc == L'\b')
2427 + {
2428 + width += -1;
2429 + chars += 1;
2430 + *s++ = c;
2431 + }
2432 + else
2433 + {
2434 + width += 0;
2435 + chars += mblength;
2436 + for (i = 0; i < mblength; i++)
2437 + *s++ = mbc[i];
2438 + }
2439 + }
2440 + else
2441 + {
2442 + width += wc_width;
2443 + chars += mblength;
2444 + for (i = 0; i < mblength; i++)
2445 + *s++ = mbc[i];
2446 + }
2447 + }
2448 + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); /* NOTE(review): leftover bytes are reprocessed without re-running mbrtowc -- confirm. */
2449 + mbc_pos -= mblength;
2450 + }
2451 + /* Advance the input column by the display width; return the bytes stored via S. */
2452 + input_position += width;
2453 + return chars;
2454 +}
2455 +#endif
2456 +
2457 /* We've just printed some files and need to clean up things before
2458 looking for more options and printing the next batch of files.
2459
2460 diff -Naur coreutils-6.9.orig/src/sort.c coreutils-6.9/src/sort.c
2461 --- coreutils-6.9.orig/src/sort.c 2007-03-18 21:36:43.000000000 +0000
2462 +++ coreutils-6.9/src/sort.c 2007-04-07 17:11:06.000000000 +0000
2463 @@ -23,10 +23,18 @@
2464
2465 #include <config.h>
2466
2467 +#include <assert.h>
2468 #include <getopt.h>
2469 #include <sys/types.h>
2470 #include <sys/wait.h>
2471 #include <signal.h>
2472 +#if HAVE_WCHAR_H
2473 +# include <wchar.h>
2474 +#endif
2475 +/* Get isw* functions. */
2476 +#if HAVE_WCTYPE_H
2477 +# include <wctype.h>
2478 +#endif
2479 #include "system.h"
2480 #include "argmatch.h"
2481 #include "error.h"
2482 @@ -116,14 +124,38 @@
2483 /* Thousands separator; if -1, then there isn't one. */
2484 static int thousands_sep;
2485
2486 +static int force_general_numcompare = 0;
2487 +
2488 /* Nonzero if the corresponding locales are hard. */
2489 static bool hard_LC_COLLATE;
2490 -#if HAVE_NL_LANGINFO
2491 +#if HAVE_LANGINFO_CODESET
2492 static bool hard_LC_TIME;
2493 #endif
2494
2495 #define NONZERO(x) ((x) != 0)
2496
2497 +/* Set MBLENGTH to the byte length of the character at PTR (scanning no further than LIM); 1 if invalid, incomplete or NUL. */
2498 +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
2499 + do \
2500 + { \
2501 + wchar_t wc; \
2502 + mbstate_t state_bak; \
2503 + \
2504 + state_bak = (STATE); \
2505 + (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \
2506 + \
2507 + switch (MBLENGTH) \
2508 + { \
2509 + case (size_t)-1: \
2510 + case (size_t)-2: \
2511 + (STATE) = state_bak; /* Undo any state change made by the failed decode. */ \
2512 + /* Fall through. */ \
2513 + case 0: \
2514 + (MBLENGTH) = 1; \
2515 + } \
2516 + } \
2517 + while (0)
2518 +
2519 /* The kind of blanks for '-b' to skip in various options. */
2520 enum blanktype { bl_start, bl_end, bl_both };
2521
2522 @@ -261,13 +293,11 @@
2523 they were read if all keys compare equal. */
2524 static bool stable;
2525
2526 -/* If TAB has this value, blanks separate fields. */
2527 -enum { TAB_DEFAULT = CHAR_MAX + 1 };
2528 -
2529 -/* Tab character separating fields. If TAB_DEFAULT, then fields are
2530 +/* Tab character separating fields. If tab_length is 0, then fields are
2531 separated by the empty string between a non-blank character and a blank
2532 character. */
2533 -static int tab = TAB_DEFAULT;
2534 +static char tab[MB_LEN_MAX + 1];
2535 +static size_t tab_length = 0;
2536
2537 /* Flag to remove consecutive duplicate lines from the output.
2538 Only the last of a sequence of equal lines will be output. */
2539 @@ -639,6 +669,44 @@
2540 update_proc (pid);
2541 }
2542
2543 +/* Dispatch pointers: each is bound to its _mb variant when MB_CUR_MAX > 1 (and mbrtowc exists), else to its _uni variant. */
2544 +static void
2545 +(*inittables) (void);
2546 +static char *
2547 +(*begfield) (const struct line*, const struct keyfield *);
2548 +static char *
2549 +(*limfield) (const struct line*, const struct keyfield *);
2550 +static int
2551 +(*getmonth) (char const *, size_t);
2552 +static int
2553 +(*keycompare) (const struct line *, const struct line *);
2554 +static int
2555 +(*numcompare) (const char *, const char *);
2556 +
2557 +/* Return nonzero if the multibyte character at STR (at most LEN bytes are examined) is a blank.
2558 + Set *LENGTH to the byte length of that character (1 if it is invalid, incomplete or NUL). */
2559 +#if HAVE_MBRTOWC
2560 +static int
2561 +ismbblank (const char *str, size_t len, size_t *length)
2562 +{
2563 + size_t mblength;
2564 + wchar_t wc;
2565 + mbstate_t state;
2566 + /* Every call decodes from the initial shift state. */
2567 + memset (&state, '\0', sizeof(mbstate_t));
2568 + mblength = mbrtowc (&wc, str, len, &state);
2569 +
2570 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
2571 + {
2572 + *length = 1; /* Invalid or incomplete: count one byte, not a blank. */
2573 + return 0;
2574 + }
2575 +
2576 + *length = (mblength < 1) ? 1 : mblength; /* mbrtowc returns 0 for NUL; still consume one byte. */
2577 + return iswblank (wc);
2578 +}
2579 +#endif
2580 +
2581 /* Clean up any remaining temporary files. */
2582
2583 static void
2584 @@ -978,7 +1046,7 @@
2585 free (node);
2586 }
2587
2588 -#if HAVE_NL_LANGINFO
2589 +#if HAVE_LANGINFO_CODESET
2590
2591 static int
2592 struct_month_cmp (const void *m1, const void *m2)
2593 @@ -993,7 +1061,7 @@
2594 /* Initialize the character class tables. */
2595
2596 static void
2597 -inittables (void)
2598 +inittables_uni (void)
2599 {
2600 size_t i;
2601
2602 @@ -1005,7 +1073,7 @@
2603 fold_toupper[i] = toupper (i);
2604 }
2605
2606 -#if HAVE_NL_LANGINFO
2607 +#if HAVE_LANGINFO_CODESET
2608 /* If we're not in the "C" locale, read different names for months. */
2609 if (hard_LC_TIME)
2610 {
2611 @@ -1031,6 +1099,64 @@
2612 #endif
2613 }
2614
2615 +#if HAVE_MBRTOWC /* Multibyte counterpart of inittables_uni: builds the month table only. */
2616 +static void
2617 +inittables_mb (void)
2618 +{
2619 + int i, j, k, l;
2620 + char *name, *s;
2621 + size_t s_len, mblength;
2622 + char mbc[MB_LEN_MAX];
2623 + wchar_t wc, pwc;
2624 + mbstate_t state_mb, state_wc;
2625 + /* For each locale month abbreviation, store an upper-cased copy without leading blanks. */
2626 + for (i = 0; i < MONTHS_PER_YEAR; i++)
2627 + {
2628 + s = (char *) nl_langinfo (ABMON_1 + i);
2629 + s_len = strlen (s);
2630 + monthtab[i].name = name = (char *) xmalloc (s_len * MB_CUR_MAX + 1); /* Upper-casing may lengthen the encoding. */
2631 + monthtab[i].val = i + 1;
2632 +
2633 + memset (&state_mb, '\0', sizeof (mbstate_t));
2634 + memset (&state_wc, '\0', sizeof (mbstate_t));
2635 +
2636 + for (j = 0; j < s_len;) /* Skip leading blanks. */
2637 + {
2638 + if (!ismbblank (s + j, s_len - j, &mblength))
2639 + break;
2640 + j += mblength;
2641 + }
2642 +
2643 + for (k = 0; j < s_len;) /* Copy the rest, one upper-cased character at a time. */
2644 + {
2645 + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb);
2646 + assert (mblength != (size_t)-1 && mblength != (size_t)-2);
2647 + if (mblength == 0)
2648 + break;
2649 +
2650 + pwc = towupper (wc);
2651 + if (pwc == wc)
2652 + {
2653 + memcpy (mbc, s + j, mblength); /* No case change: keep the original bytes. */
2654 + j += mblength;
2655 + }
2656 + else
2657 + {
2658 + j += mblength;
2659 + mblength = wcrtomb (mbc, pwc, &state_wc); /* Re-encode the upper-case form. */
2660 + assert (mblength != (size_t)0 && mblength != (size_t)-1);
2661 + }
2662 +
2663 + for (l = 0; l < mblength; l++)
2664 + name[k++] = mbc[l];
2665 + }
2666 + name[k] = '\0';
2667 + }
2668 + qsort ((void *) monthtab, MONTHS_PER_YEAR,
2669 + sizeof (struct month), struct_month_cmp);
2670 +}
2671 +#endif
2672 +
2673 /* Specify the amount of main memory to use when sorting. */
2674 static void
2675 specify_sort_size (char const *s)
2676 @@ -1241,7 +1367,7 @@
2677 by KEY in LINE. */
2678
2679 static char *
2680 -begfield (const struct line *line, const struct keyfield *key)
2681 +begfield_uni (const struct line *line, const struct keyfield *key)
2682 {
2683 char *ptr = line->text, *lim = ptr + line->length - 1;
2684 size_t sword = key->sword;
2685 @@ -1251,10 +1377,10 @@
2686 /* The leading field separator itself is included in a field when -t
2687 is absent. */
2688
2689 - if (tab != TAB_DEFAULT)
2690 + if (tab_length)
2691 while (ptr < lim && sword--)
2692 {
2693 - while (ptr < lim && *ptr != tab)
2694 + while (ptr < lim && *ptr != tab[0])
2695 ++ptr;
2696 if (ptr < lim)
2697 ++ptr;
2698 @@ -1282,11 +1408,70 @@
2699 return ptr;
2700 }
2701
2702 +#if HAVE_MBRTOWC /* Multibyte counterpart of begfield_uni. */
2703 +static char *
2704 +begfield_mb (const struct line *line, const struct keyfield *key)
2705 +{
2706 + int i;
2707 + char *ptr = line->text, *lim = ptr + line->length - 1;
2708 + size_t sword = key->sword;
2709 + size_t schar = key->schar;
2710 + size_t mblength;
2711 + mbstate_t state;
2712 + /* Return a pointer to the start of the key KEY within LINE, stepping by whole characters. */
2713 + memset (&state, '\0', sizeof(mbstate_t));
2714 +
2715 + if (tab_length) /* An explicit separator is set: skip SWORD separator-terminated fields. */
2716 + while (ptr < lim && sword--)
2717 + {
2718 + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) /* NOTE(review): memcmp may read past LIM near the line end -- confirm the buffer allows it. */
2719 + {
2720 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2721 + ptr += mblength;
2722 + }
2723 + if (ptr < lim)
2724 + {
2725 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2726 + ptr += mblength;
2727 + }
2728 + }
2729 + else /* No separator: fields are runs of blanks followed by non-blanks. */
2730 + while (ptr < lim && sword--)
2731 + {
2732 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2733 + ptr += mblength;
2734 + if (ptr < lim)
2735 + {
2736 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2737 + ptr += mblength;
2738 + }
2739 + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2740 + ptr += mblength;
2741 + }
2742 +
2743 + if (key->skipsblanks)
2744 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2745 + ptr += mblength;
2746 + /* Advance by SCHAR characters, but never beyond LIM. */
2747 + for (i = 0; i < schar; i++)
2748 + {
2749 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2750 +
2751 + if (ptr + mblength > lim)
2752 + break;
2753 + else
2754 + ptr += mblength;
2755 + }
2756 +
2757 + return ptr;
2758 +}
2759 +#endif
2760 +
2761 /* Return the limit of (a pointer to the first character after) the field
2762 in LINE specified by KEY. */
2763
2764 static char *
2765 -limfield (const struct line *line, const struct keyfield *key)
2766 +limfield_uni (const struct line *line, const struct keyfield *key)
2767 {
2768 char *ptr = line->text, *lim = ptr + line->length - 1;
2769 size_t eword = key->eword, echar = key->echar;
2770 @@ -1299,10 +1484,10 @@
2771 `beginning' is the first character following the delimiting TAB.
2772 Otherwise, leave PTR pointing at the first `blank' character after
2773 the preceding field. */
2774 - if (tab != TAB_DEFAULT)
2775 + if (tab_length)
2776 while (ptr < lim && eword--)
2777 {
2778 - while (ptr < lim && *ptr != tab)
2779 + while (ptr < lim && *ptr != tab[0])
2780 ++ptr;
2781 if (ptr < lim && (eword | echar))
2782 ++ptr;
2783 @@ -1348,10 +1533,10 @@
2784 */
2785
2786 /* Make LIM point to the end of (one byte past) the current field. */
2787 - if (tab != TAB_DEFAULT)
2788 + if (tab_length)
2789 {
2790 char *newlim;
2791 - newlim = memchr (ptr, tab, lim - ptr);
2792 + newlim = memchr (ptr, tab[0], lim - ptr);
2793 if (newlim)
2794 lim = newlim;
2795 }
2796 @@ -1384,6 +1569,107 @@
2797 return ptr;
2798 }
2799
2800 +#if HAVE_MBRTOWC /* Multibyte counterpart of limfield_uni. */
2801 +static char *
2802 +limfield_mb (const struct line *line, const struct keyfield *key)
2803 +{
2804 + char *ptr = line->text, *lim = ptr + line->length - 1;
2805 + size_t eword = key->eword, echar = key->echar;
2806 + int i;
2807 + size_t mblength;
2808 + mbstate_t state;
2809 + /* Return the limit (first byte after) of the key KEY within LINE, stepping by whole characters. */
2810 + memset (&state, '\0', sizeof(mbstate_t));
2811 +
2812 + if (tab_length) /* An explicit separator is set: skip EWORD fields. */
2813 + while (ptr < lim && eword--)
2814 + {
2815 + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0)
2816 + {
2817 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2818 + ptr += mblength;
2819 + }
2820 + if (ptr < lim && (eword | echar)) /* Step over the separator unless this is the last field and ECHAR is 0. */
2821 + {
2822 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2823 + ptr += mblength;
2824 + }
2825 + }
2826 + else /* No separator: fields are runs of blanks followed by non-blanks. */
2827 + while (ptr < lim && eword--)
2828 + {
2829 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2830 + ptr += mblength;
2831 + if (ptr < lim)
2832 + {
2833 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2834 + ptr += mblength;
2835 + }
2836 + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
2837 + ptr += mblength;
2838 + }
2839 +
2840 +
2841 +# ifdef POSIX_UNSPECIFIED
2842 + /* Make LIM point to the end of (one byte past) the current field. */
2843 + if (tab_length)
2844 + {
2845 + char *newlim, *p;
2846 +
2847 + newlim = NULL;
2848 + for (p = ptr; p < lim;)
2849 + {
2850 + if (memcmp (p, tab, tab_length) == 0)
2851 + {
2852 + lim = newlim = p; /* Clamp LIM to the separator, as limfield_uni does. */
2853 + break;
2854 + }
2855 +
2856 + GET_BYTELEN_OF_CHAR (lim, p, mblength, state); /* Measure at the scan pointer P, not PTR. */
2857 + p += mblength;
2858 + }
2859 + }
2860 + else
2861 + {
2862 + char *newlim;
2863 + newlim = ptr;
2864 +
2865 + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
2866 + newlim += mblength;
2867 + if (newlim < lim) /* Only NEWLIM scans here; PTR must stay at the field start. */
2868 + {
2869 + GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
2870 + newlim += mblength;
2871 + }
2872 + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
2873 + newlim += mblength;
2874 + lim = newlim;
2875 + }
2876 +# endif
2877 +
2878 + /* If we're skipping leading blanks, don't start counting characters
2879 + * until after skipping past any leading blanks. */
2880 + if (key->skipsblanks)
2881 + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
2882 + ptr += mblength;
2883 +
2884 + memset (&state, '\0', sizeof(mbstate_t));
2885 +
2886 + /* Advance PTR by ECHAR (if possible), but no further than LIM. */
2887 + for (i = 0; i < echar; i++)
2888 + {
2889 + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
2890 +
2891 + if (ptr + mblength > lim)
2892 + break;
2893 + else
2894 + ptr += mblength;
2895 + }
2896 +
2897 + return ptr;
2898 +}
2899 +#endif
2900 +
2901 /* Fill BUF reading from FP, moving buf->left bytes from the end
2902 of buf->buf to the beginning first. If EOF is reached and the
2903 file wasn't terminated by a newline, supply one. Set up BUF's line
2904 @@ -1500,7 +1786,7 @@
2905 hideously fast. */
2906
2907 static int
2908 -numcompare (const char *a, const char *b)
2909 +numcompare_uni (const char *a, const char *b)
2910 {
2911 while (blanks[to_uchar (*a)])
2912 a++;
2913 @@ -1510,6 +1796,25 @@
2914 return strnumcmp (a, b, decimal_point, thousands_sep);
2915 }
2916
2917 +#if HAVE_MBRTOWC /* Multibyte counterpart of numcompare_uni. */
2918 +static int
2919 +numcompare_mb (const char *a, const char *b) /* Skip leading blanks in A and B, then compare as numbers. */
2920 +{
2921 + size_t mblength, len;
2922 + len = strlen (a); /* okay for UTF-8 */
2923 + while (*a && ismbblank (a, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2924 + {
2925 + a += mblength;
2926 + len -= mblength;
2927 + }
2928 + len = strlen (b); /* okay for UTF-8 */
2929 + while (*b && ismbblank (b, len > MB_CUR_MAX ? MB_CUR_MAX : len, &mblength))
2930 + b += mblength, len -= mblength; /* Keep LEN in step with B, as is done for A. */
2931 +
2932 + return strnumcmp (a, b, decimal_point, thousands_sep);
2933 +}
2934 +#endif /* HAVE_MBRTOWC */
2935 +
2936 static int
2937 general_numcompare (const char *sa, const char *sb)
2938 {
2939 @@ -1543,7 +1848,7 @@
2940 Return 0 if the name in S is not recognized. */
2941
2942 static int
2943 -getmonth (char const *month, size_t len)
2944 +getmonth_uni (char const *month, size_t len)
2945 {
2946 size_t lo = 0;
2947 size_t hi = MONTHS_PER_YEAR;
2948 @@ -1698,11 +2003,79 @@
2949 return diff;
2950 }
2951
2952 +#if HAVE_MBRTOWC /* Multibyte counterpart of getmonth_uni. */
2953 +static int
2954 +getmonth_mb (const char *s, size_t len)
2955 +{
2956 + char *month;
2957 + register size_t i;
2958 + register int lo = 0, hi = MONTHS_PER_YEAR, result;
2959 + char *tmp;
2960 + size_t wclength, mblength;
2961 + const char **pp;
2962 + const wchar_t **wpp;
2963 + wchar_t *month_wcs;
2964 + mbstate_t state;
2965 + /* Return the monthtab value whose name prefixes the LEN bytes at S (case-insensitive), or 0. */
2966 + while (len > 0 && ismbblank (s, len, &mblength))
2967 + {
2968 + s += mblength;
2969 + len -= mblength;
2970 + }
2971 +
2972 + if (len == 0)
2973 + return 0;
2974 +
2975 + month = (char *) alloca (len * MB_CUR_MAX + 1); /* Upper-casing may lengthen the encoding. */
2976 +
2977 + tmp = (char *) alloca (len + 1);
2978 + memcpy (tmp, s, len);
2979 + tmp[len] = '\0';
2980 + pp = (const char **)&tmp;
2981 + month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
2982 + memset (&state, '\0', sizeof(mbstate_t));
2983 +
2984 + wclength = mbsrtowcs (month_wcs, pp, len + 1, &state);
2985 + if (wclength == (size_t)-1 || *pp != NULL) return 0; /* Invalid input is simply not a month. */
2986 +
2987 + for (i = 0; i < wclength; i++) /* Upper-case, and cut at the first blank. */
2988 + {
2989 + month_wcs[i] = towupper(month_wcs[i]);
2990 + if (iswblank (month_wcs[i]))
2991 + {
2992 + month_wcs[i] = L'\0';
2993 + break;
2994 + }
2995 + }
2996 +
2997 + wpp = (const wchar_t **)&month_wcs;
2998 +
2999 + mblength = wcsrtombs (month, wpp, len * MB_CUR_MAX + 1, &state);
3000 + if (mblength == (size_t)-1 || *wpp != NULL) return 0; /* Could not re-encode: not a month. */
3001 + /* Binary search of monthtab (sorted by inittables_mb), comparing by table-name prefix. */
3002 + do
3003 + {
3004 + int ix = (lo + hi) / 2;
3005 +
3006 + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
3007 + hi = ix;
3008 + else
3009 + lo = ix;
3010 + }
3011 + while (hi - lo > 1);
3012 +
3013 + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
3014 + ? monthtab[lo].val : 0);
3015 +
3016 + return result;
3017 +}
3018 +#endif
3019 +
3020 /* Compare two lines A and B trying every key in sequence until there
3021 are no more keys or a difference is found. */
3022
3023 static int
3024 -keycompare (const struct line *a, const struct line *b)
3025 +keycompare_uni (const struct line *a, const struct line *b)
3026 {
3027 struct keyfield const *key = keylist;
3028
3029 @@ -1875,6 +2248,177 @@
3030 return key->reverse ? -diff : diff;
3031 }
3032
3033 +#if HAVE_MBRTOWC /* Multibyte counterpart of keycompare_uni. */
3034 +static int
3035 +keycompare_mb (const struct line *a, const struct line *b)
3036 +{
3037 + struct keyfield *key = keylist;
3038 +
3039 + /* For the first iteration only, the key positions have been
3040 + precomputed for us. */
3041 + char *texta = a->keybeg;
3042 + char *textb = b->keybeg;
3043 + char *lima = a->keylim;
3044 + char *limb = b->keylim;
3045 +
3046 + size_t mblength_a, mblength_b;
3047 + wchar_t wc_a, wc_b;
3048 + mbstate_t state_a, state_b;
3049 +
3050 + int diff;
3051 +
3052 + memset (&state_a, '\0', sizeof(mbstate_t));
3053 + memset (&state_b, '\0', sizeof(mbstate_t));
3054 + /* Compare A and B key by key; return at the first key that differs. */
3055 + for (;;)
3056 + {
3057 + unsigned char *translate = (unsigned char *) key->translate;
3058 + bool const *ignore = key->ignore;
3059 +
3060 + /* Find the lengths. */
3061 + size_t lena = lima <= texta ? 0 : lima - texta;
3062 + size_t lenb = limb <= textb ? 0 : limb - textb;
3063 +
3064 + /* Actually compare the fields. */
3065 + if (key->numeric | key->general_numeric)
3066 + {
3067 + char savea = *lima, saveb = *limb;
3068 +
3069 + *lima = *limb = '\0'; /* Temporarily NUL-terminate both fields. */
3070 + if (force_general_numcompare)
3071 + diff = general_numcompare (texta, textb);
3072 + else
3073 + diff = ((key->numeric ? numcompare : general_numcompare)
3074 + (texta, textb));
3075 + *lima = savea, *limb = saveb;
3076 + }
3077 + else if (key->month)
3078 + diff = getmonth (texta, lena) - getmonth (textb, lenb);
3079 + else
3080 + {
3081 + if (ignore || translate)
3082 + {
3083 + char *copy_a = (char *) alloca (lena + 1 + lenb + 1); /* NOTE(review): upper-casing could lengthen a field beyond this size -- confirm. */
3084 + char *copy_b = copy_a + lena + 1;
3085 + size_t new_len_a, new_len_b;
3086 + size_t i, j;
3087 +
3088 + /* Ignore and/or translate chars before comparing. */
3089 +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \
3090 + do \
3091 + { \
3092 + wchar_t uwc; \
3093 + char mbc[MB_LEN_MAX]; \
3094 + mbstate_t state_wc; \
3095 + \
3096 + for (NEW_LEN = i = 0; i < LEN;) \
3097 + { \
3098 + mbstate_t state_bak; \
3099 + \
3100 + state_bak = STATE; \
3101 + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \
3102 + \
3103 + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \
3104 + || MBLENGTH == 0) \
3105 + { \
3106 + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \
3107 + STATE = state_bak; \
3108 + if (!ignore) \
3109 + COPY[NEW_LEN++] = TEXT[i]; \
3110 + i++; continue; /* Always advance, or an ignore table would loop forever here. */ \
3111 + } \
3112 + \
3113 + if (ignore) \
3114 + { \
3115 + if ((ignore == nonprinting && !iswprint (WC)) \
3116 + || (ignore == nondictionary \
3117 + && !iswalnum (WC) && !iswblank (WC))) \
3118 + { \
3119 + i += MBLENGTH; \
3120 + continue; \
3121 + } \
3122 + } \
3123 + \
3124 + if (translate) \
3125 + { \
3126 + \
3127 + uwc = towupper(WC); \
3128 + if (WC == uwc) \
3129 + { \
3130 + memcpy (mbc, TEXT + i, MBLENGTH); \
3131 + i += MBLENGTH; \
3132 + } \
3133 + else \
3134 + { \
3135 + i += MBLENGTH; \
3136 + WC = uwc; \
3137 + memset (&state_wc, '\0', sizeof (mbstate_t)); \
3138 + \
3139 + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \
3140 + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \
3141 + } \
3142 + \
3143 + for (j = 0; j < MBLENGTH; j++) \
3144 + COPY[NEW_LEN++] = mbc[j]; \
3145 + } \
3146 + else \
3147 + for (j = 0; j < MBLENGTH; j++) \
3148 + COPY[NEW_LEN++] = TEXT[i++]; \
3149 + } \
3150 + COPY[NEW_LEN] = '\0'; \
3151 + } \
3152 + while (0)
3153 + IGNORE_CHARS (new_len_a, lena, texta, copy_a,
3154 + wc_a, mblength_a, state_a);
3155 + IGNORE_CHARS (new_len_b, lenb, textb, copy_b,
3156 + wc_b, mblength_b, state_b);
3157 + diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
3158 + }
3159 + else if (lena == 0)
3160 + diff = - NONZERO (lenb);
3161 + else if (lenb == 0)
3162 + goto greater;
3163 + else
3164 + diff = xmemcoll (texta, lena, textb, lenb);
3165 + }
3166 +
3167 + if (diff)
3168 + goto not_equal;
3169 +
3170 + key = key->next;
3171 + if (! key)
3172 + break;
3173 +
3174 + /* Find the beginning and limit of the next field. */
3175 + if (key->eword != -1)
3176 + lima = limfield (a, key), limb = limfield (b, key);
3177 + else
3178 + lima = a->text + a->length - 1, limb = b->text + b->length - 1;
3179 +
3180 + if (key->sword != -1)
3181 + texta = begfield (a, key), textb = begfield (b, key);
3182 + else
3183 + {
3184 + texta = a->text, textb = b->text;
3185 + if (key->skipsblanks)
3186 + {
3187 + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
3188 + texta += mblength_a;
3189 + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
3190 + textb += mblength_b;
3191 + }
3192 + }
3193 + }
3194 +
3195 + return 0;
3196 +
3197 +greater:
3198 + diff = 1;
3199 +not_equal:
3200 + return key->reverse ? -diff : diff;
3201 +}
3202 +#endif
3203 +
3204 /* Compare two lines A and B, returning negative, zero, or positive
3205 depending on whether A compares less than, equal to, or greater than B. */
3206
3207 @@ -2744,7 +3288,7 @@
3208 initialize_exit_failure (SORT_FAILURE);
3209
3210 hard_LC_COLLATE = hard_locale (LC_COLLATE);
3211 -#if HAVE_NL_LANGINFO
3212 +#if HAVE_LANGINFO_CODESET
3213 hard_LC_TIME = hard_locale (LC_TIME);
3214 #endif
3215
3216 @@ -2765,6 +3309,27 @@
3217 thousands_sep = -1;
3218 }
3219
3220 +#if HAVE_MBRTOWC
3221 + if (MB_CUR_MAX > 1)
3222 + {
3223 + inittables = inittables_mb;
3224 + begfield = begfield_mb;
3225 + limfield = limfield_mb;
3226 + getmonth = getmonth_mb;
3227 + keycompare = keycompare_mb;
3228 + numcompare = numcompare_mb;
3229 + }
3230 + else
3231 +#endif
3232 + {
3233 + inittables = inittables_uni;
3234 + begfield = begfield_uni;
3235 + limfield = limfield_uni;
3236 + getmonth = getmonth_uni;
3237 + keycompare = keycompare_uni;
3238 + numcompare = numcompare_uni;
3239 + }
3240 +
3241 have_read_stdin = false;
3242 inittables ();
3243
3244 @@ -3015,13 +3580,35 @@
3245
3246 case 't':
3247 {
3248 - char newtab = optarg[0];
3249 - if (! newtab)
3250 + char newtab[MB_LEN_MAX + 1];
3251 + size_t newtab_length = 1;
3252 + strncpy (newtab, optarg, MB_LEN_MAX);
3253 + if (! newtab[0])
3254 error (SORT_FAILURE, 0, _("empty tab"));
3255 - if (optarg[1])
3256 +#if HAVE_MBRTOWC
3257 + if (MB_CUR_MAX > 1)
3258 + {
3259 + wchar_t wc;
3260 + mbstate_t state;
3261 + size_t i;
3262 +
3263 + memset (&state, '\0', sizeof (mbstate_t));
3264 + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab,
3265 + MB_LEN_MAX),
3266 + &state);
3267 + switch (newtab_length)
3268 + {
3269 + case (size_t) -1:
3270 + case (size_t) -2:
3271 + case 0:
3272 + newtab_length = 1;
3273 + }
3274 + }
3275 +#endif
3276 + if (newtab_length == 1 && optarg[1])
3277 {
3278 if (STREQ (optarg, "\\0"))
3279 - newtab = '\0';
3280 + newtab[0] = '\0';
3281 else
3282 {
3283 /* Provoke with `sort -txx'. Complain about
3284 @@ -3032,9 +3619,12 @@
3285 quote (optarg));
3286 }
3287 }
3288 - if (tab != TAB_DEFAULT && tab != newtab)
3289 + if (tab_length
3290 + && (tab_length != newtab_length
3291 + || memcmp (tab, newtab, tab_length) != 0))
3292 error (SORT_FAILURE, 0, _("incompatible tabs"));
3293 - tab = newtab;
3294 + memcpy (tab, newtab, newtab_length);
3295 + tab_length = newtab_length;
3296 }
3297 break;
3298
3299 diff -Naur coreutils-6.9.orig/src/unexpand.c coreutils-6.9/src/unexpand.c
3300 --- coreutils-6.9.orig/src/unexpand.c 2007-03-18 21:36:43.000000000 +0000
3301 +++ coreutils-6.9/src/unexpand.c 2007-04-07 16:59:55.000000000 +0000
3302 @@ -39,11 +39,28 @@
3303 #include <stdio.h>
3304 #include <getopt.h>
3305 #include <sys/types.h>
3306 +
3307 +/* Get mbstate_t, mbrtowc(), wcwidth(). */
3308 +#if HAVE_WCHAR_H
3309 +# include <wchar.h>
3310 +#endif
3311 +
3312 #include "system.h"
3313 #include "error.h"
3314 #include "quote.h"
3315 #include "xstrndup.h"
3316
3317 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3318 + installation; work around this configuration error. */
3319 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3320 +# define MB_LEN_MAX 16
3321 +#endif
3322 +
3323 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3324 +#if HAVE_MBRTOWC && defined mbstate_t
3325 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3326 +#endif
3327 +
3328 /* The official name of this program (e.g., no `g' prefix). */
3329 #define PROGRAM_NAME "unexpand"
3330
3331 @@ -110,6 +127,208 @@
3332 {NULL, 0, NULL, 0}
3333 };
3334
3335 +static FILE *next_file (FILE *fp);
3336 +
3337 +#if HAVE_MBRTOWC /* Multibyte counterpart of unexpand (). */
3338 +static void
3339 +unexpand_multibyte (void)
3340 +{
3341 + FILE *fp; /* Input stream. */
3342 + mbstate_t i_state; /* Current shift state of the input stream. */
3343 + mbstate_t i_state_bak; /* Back up the I_STATE. */
3344 + mbstate_t o_state; /* Current shift state of the output stream. */
3345 + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */
3346 + char *bufpos = buf; /* Next read position of BUF. */
3347 + size_t buflen = 0; /* The length of the byte sequence in buf. */
3348 + wint_t wc; /* A gotten wide character. */
3349 + size_t mblength; /* The byte size of a multibyte character
3350 + which shows as same character as WC. */
3351 +
3352 + /* Index in `tab_list' of next tabstop: */
3353 + int tab_index = 0; /* For calculating width of pending tabs. */
3354 + int print_tab_index = 0; /* For printing as many tabs as possible. */
3355 + unsigned int column = 0; /* Column on screen of next char. */
3356 + int next_tab_column; /* Column the next tab stop is on. */
3357 + int convert = 1; /* If nonzero, perform translations. */
3358 + unsigned int pending = 0; /* Pending columns of blanks. */
3359 +
3360 + fp = next_file ((FILE *) NULL);
3361 + if (fp == NULL)
3362 + return;
3363 +
3364 + memset (&o_state, '\0', sizeof(mbstate_t));
3365 + memset (&i_state, '\0', sizeof(mbstate_t));
3366 + /* Read the input files one character at a time, turning runs of blanks into tabs where possible. */
3367 + for (;;)
3368 + {
3369 + if (buflen < MB_LEN_MAX && !feof(fp) && !ferror(fp)) /* Refill so a whole character is available. */
3370 + {
3371 + memmove (buf, bufpos, buflen);
3372 + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, fp);
3373 + bufpos = buf;
3374 + }
3375 +
3376 + /* Get a wide character. */
3377 + if (buflen < 1)
3378 + {
3379 + mblength = 1;
3380 + wc = WEOF;
3381 + }
3382 + else
3383 + {
3384 + i_state_bak = i_state;
3385 + mblength = mbrtowc ((wchar_t *)&wc, bufpos, buflen, &i_state);
3386 + }
3387 +
3388 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3389 + {
3390 + i_state = i_state_bak;
3391 + wc = L'\0';
3392 + }
3393 +
3394 + if (wc == L' ' && convert && column < INT_MAX)
3395 + {
3396 + ++pending;
3397 + ++column;
3398 + }
3399 + else if (wc == L'\t' && convert)
3400 + {
3401 + if (tab_size == 0)
3402 + {
3403 + /* Do not let tab_index == first_free_tab;
3404 + stop when it is 1 less. */
3405 + while (tab_index < first_free_tab - 1
3406 + && column >= tab_list[tab_index])
3407 + tab_index++;
3408 + next_tab_column = tab_list[tab_index];
3409 + if (tab_index < first_free_tab - 1)
3410 + tab_index++;
3411 + if (column >= next_tab_column)
3412 + {
3413 + convert = 0; /* Ran out of tab stops. */
3414 + goto flush_pend_mb;
3415 + }
3416 + }
3417 + else
3418 + {
3419 + next_tab_column = column + tab_size - column % tab_size;
3420 + }
3421 + pending += next_tab_column - column;
3422 + column = next_tab_column;
3423 + }
3424 + else
3425 + {
3426 +flush_pend_mb:
3427 + /* Flush pending spaces. Print as many tabs as possible,
3428 + then print the rest as spaces. */
3429 + if (pending == 1)
3430 + {
3431 + putchar (' ');
3432 + pending = 0;
3433 + }
3434 + column -= pending;
3435 + while (pending > 0)
3436 + {
3437 + if (tab_size == 0)
3438 + {
3439 + /* Do not let print_tab_index == first_free_tab;
3440 + stop when it is 1 less. */
3441 + while (print_tab_index < first_free_tab - 1
3442 + && column >= tab_list[print_tab_index])
3443 + print_tab_index++;
3444 + next_tab_column = tab_list[print_tab_index];
3445 + if (print_tab_index < first_free_tab - 1)
3446 + print_tab_index++;
3447 + }
3448 + else
3449 + {
3450 + next_tab_column =
3451 + column + tab_size - column % tab_size;
3452 + }
3453 + if (next_tab_column - column <= pending)
3454 + {
3455 + putchar ('\t');
3456 + pending -= next_tab_column - column;
3457 + column = next_tab_column;
3458 + }
3459 + else
3460 + {
3461 + --print_tab_index;
3462 + column += pending;
3463 + while (pending != 0)
3464 + {
3465 + putchar (' ');
3466 + pending--;
3467 + }
3468 + }
3469 + }
3470 +
3471 + if (wc == WEOF)
3472 + {
3473 + fp = next_file (fp);
3474 + if (fp == NULL)
3475 + break; /* No more files. */
3476 + else
3477 + {
3478 + memset (&i_state, '\0', sizeof(mbstate_t));
3479 + continue;
3480 + }
3481 + }
3482 +
3483 + if (mblength == (size_t)-1 || mblength == (size_t)-2)
3484 + {
3485 + if (convert)
3486 + {
3487 + ++column;
3488 + if (convert_entire_line == 0)
3489 + convert = 0;
3490 + }
3491 + mblength = 1;
3492 + putchar (*bufpos); /* Echo the offending byte itself, not the start of BUF. */
3493 + }
3494 + else if (mblength == 0)
3495 + {
3496 + if (convert && convert_entire_line == 0)
3497 + convert = 0;
3498 + mblength = 1;
3499 + putchar ('\0');
3500 + }
3501 + else
3502 + {
3503 + if (convert)
3504 + {
3505 + if (wc == L'\b')
3506 + {
3507 + if (column > 0)
3508 + --column;
3509 + }
3510 + else
3511 + {
3512 + int width; /* The width of WC. */
3513 +
3514 + width = wcwidth (wc);
3515 + column += (width > 0) ? width : 0;
3516 + if (convert_entire_line == 0)
3517 + convert = 0;
3518 + }
3519 + }
3520 +
3521 + if (wc == L'\n')
3522 + {
3523 + tab_index = print_tab_index = 0;
3524 + column = pending = 0;
3525 + convert = 1;
3526 + }
3527 + fwrite (bufpos, sizeof(char), mblength, stdout);
3528 + }
3529 + }
3530 + buflen -= mblength;
3531 + bufpos += mblength;
3532 + }
3533 +}
3534 +#endif
3535 +
3536 +
3537 void
3538 usage (int status)
3539 {
3540 @@ -531,7 +750,12 @@
3541
3542 file_list = (optind < argc ? &argv[optind] : stdin_argv);
3543
3544 - unexpand ();
3545 +#if HAVE_MBRTOWC
3546 + if (MB_CUR_MAX > 1)
3547 + unexpand_multibyte ();
3548 + else
3549 +#endif
3550 + unexpand ();
3551
3552 if (have_read_stdin && fclose (stdin) != 0)
3553 error (EXIT_FAILURE, errno, "-");
3554 diff -Naur coreutils-6.9.orig/src/uniq.c coreutils-6.9/src/uniq.c
3555 --- coreutils-6.9.orig/src/uniq.c 2007-03-18 21:36:43.000000000 +0000
3556 +++ coreutils-6.9/src/uniq.c 2007-04-07 16:59:55.000000000 +0000
3557 @@ -23,6 +23,16 @@
3558 #include <getopt.h>
3559 #include <sys/types.h>
3560
3561 +/* Get mbstate_t, mbrtowc(). */
3562 +#if HAVE_WCHAR_H
3563 +# include <wchar.h>
3564 +#endif
3565 +
3566 +/* Get isw* functions. */
3567 +#if HAVE_WCTYPE_H
3568 +# include <wctype.h>
3569 +#endif
3570 +
3571 #include "system.h"
3572 #include "argmatch.h"
3573 #include "linebuffer.h"
3574 @@ -32,7 +42,19 @@
3575 #include "quote.h"
3576 #include "xmemcoll.h"
3577 #include "xstrtol.h"
3578 -#include "memcasecmp.h"
3579 +#include "xmemcoll.h"
3580 +
3581 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
3582 + installation; work around this configuration error. */
3583 +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2
3584 +# define MB_LEN_MAX 16
3585 +#endif
3586 +
3587 +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
3588 +#if HAVE_MBRTOWC && defined mbstate_t
3589 +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
3590 +#endif
3591 +
3592
3593 /* The official name of this program (e.g., no `g' prefix). */
3594 #define PROGRAM_NAME "uniq"
3595 @@ -109,6 +131,10 @@
3596 /* Select whether/how to delimit groups of duplicate lines. */
3597 static enum delimit_method delimit_groups;
3598
3599 +/* Function pointers. */
3600 +static char *
3601 +(*find_field) (struct linebuffer *line);
3602 +
3603 static struct option const longopts[] =
3604 {
3605 {"count", no_argument, NULL, 'c'},
3606 @@ -198,7 +224,7 @@
3607 return a pointer to the beginning of the line's field to be compared. */
3608
3609 static char *
3610 -find_field (const struct linebuffer *line)
3611 +find_field_uni (struct linebuffer *line)
3612 {
3613 size_t count;
3614 char *lp = line->buffer;
3615 @@ -219,6 +245,83 @@
3616 return lp + i;
3617 }
3618
3619 +#if HAVE_MBRTOWC
3620 +/* Decode one character at LP + POS (at most SIZE - POS bytes) into WC and set MBLENGTH to the bytes consumed; on an invalid or incomplete sequence restore *STATEP, set CONVFAIL nonzero and count one byte; a NUL also counts as one byte. */
3621 +# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL) \
3622 + do \
3623 + { \
3624 + mbstate_t state_bak; \
3625 + \
3626 + CONVFAIL = 0; \
3627 + state_bak = *STATEP; \
3628 + \
3629 + MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP); \
3630 + \
3631 + switch (MBLENGTH) \
3632 + { \
3633 + case (size_t)-2: \
3634 + case (size_t)-1: \
3635 + *STATEP = state_bak; \
3636 + CONVFAIL++; \
3637 + /* Fall through */ \
3638 + case 0: \
3639 + MBLENGTH = 1; \
3640 + } \
3641 + } \
3642 + while (0)
3643 +
3644 +static char *
3645 +find_field_multi (struct linebuffer *line)
3646 +{
3647 + size_t count;
3648 + char *lp = line->buffer;
3649 + size_t size = line->length - 1;
3650 + size_t pos;
3651 + size_t mblength;
3652 + wchar_t wc;
3653 + mbstate_t *statep;
3654 + int convfail;
3655 +
3656 + pos = 0;
3657 + statep = &(line->state);
3658 +
3659 + /* Skip SKIP_FIELDS fields: each is a run of blanks followed by a run of non-blank characters (undecodable bytes count as non-blank). */
3660 + for (count = 0; count < skip_fields && pos < size; count++)
3661 + {
3662 + while (pos < size)
3663 + {
3664 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3665 +
3666 + if (convfail || !iswblank (wc))
3667 + {
3668 + pos += mblength;
3669 + break;
3670 + }
3671 + pos += mblength;
3672 + }
3673 +
3674 + while (pos < size)
3675 + {
3676 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3677 +
3678 + if (!convfail && iswblank (wc))
3679 + break;
3680 +
3681 + pos += mblength;
3682 + }
3683 + }
3684 +
3685 + /* Skip SKIP_CHARS characters; each one may span several bytes. */
3686 + for (count = 0; count < skip_chars && pos < size; count++)
3687 + {
3688 + MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
3689 + pos += mblength;
3690 + }
3691 +
3692 + return lp + pos;
3693 +}
3694 +#endif
3695 +
3696 /* Return false if two strings OLD and NEW match, true if not.
3697 OLD and NEW point not to the beginnings of the lines
3698 but rather to the beginnings of the fields to compare.
3699 @@ -227,6 +330,8 @@
3700 static bool
3701 different (char *old, char *new, size_t oldlen, size_t newlen)
3702 {
3703 + char *copy_old, *copy_new;
3704 +
3705 if (check_chars < oldlen)
3706 oldlen = check_chars;
3707 if (check_chars < newlen)
3708 @@ -234,14 +339,92 @@
3709
3710 if (ignore_case)
3711 {
3712 - /* FIXME: This should invoke strcoll somehow. */
3713 - return oldlen != newlen || memcasecmp (old, new, oldlen);
3714 + size_t i;
3715 +
3716 + /* Size and fill each copy from its own length: the fields may differ in length, and xmemcoll reads NEWLEN bytes of copy_new. */
3717 + copy_old = alloca (oldlen + 1);
3718 + copy_new = alloca (newlen + 1);
3719 +
3720 + for (i = 0; i < oldlen; i++)
3721 + copy_old[i] = toupper ((unsigned char) old[i]);
3722 + for (i = 0; i < newlen; i++)
3723 + copy_new[i] = toupper ((unsigned char) new[i]);
3724 }
3725 - else if (hard_LC_COLLATE)
3726 - return xmemcoll (old, oldlen, new, newlen) != 0;
3727 else
3728 - return oldlen != newlen || memcmp (old, new, oldlen);
3729 + {
3730 + copy_old = (char *)old;
3731 + copy_new = (char *)new;
3732 + }
3733 +
3734 + return xmemcoll (copy_old, oldlen, copy_new, newlen);
3735 +}
3736 +
3737 +#if HAVE_MBRTOWC
3738 +static int
3739 +different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
3740 +{
3741 + size_t i, j, chars;
3742 + const char *str[2];
3743 + char *copy[2];
3744 + size_t len[2];
3745 + mbstate_t state[2];
3746 + size_t mblength;
3747 + wchar_t wc, uwc;
3748 + mbstate_t state_bak;
3749 + /* Copy at most CHECK_CHARS characters of each field (upper-cased when IGNORE_CASE is set), then return the xmemcoll result for the two copies. */
3750 + str[0] = old;
3751 + str[1] = new;
3752 + len[0] = oldlen;
3753 + len[1] = newlen;
3754 + state[0] = oldstate;
3755 + state[1] = newstate;
3756 +
3757 + for (i = 0; i < 2; i++)
3758 + {
3759 + copy[i] = alloca (len[i] + 1);
3760 +
3761 + for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
3762 + {
3763 + state_bak = state[i];
3764 + mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i]));
3765 +
3766 + switch (mblength)
3767 + {
3768 + case (size_t)-1:
3769 + case (size_t)-2:
3770 + state[i] = state_bak;
3771 + /* Fall through */
3772 + case 0:
3773 + mblength = 1;
3774 + copy[i][j] = str[i][j]; /* Keep the raw byte so the copy has no uninitialised gap. */
3775 + break;
3776 + default:
3777 + if (ignore_case)
3778 + {
3779 + uwc = towupper (wc);
3780 +
3781 + if (uwc != wc)
3782 + {
3783 + mbstate_t state_wc;
3784 + char upper[MB_LEN_MAX]; /* Use the upper-case form only when its byte length equals MBLENGTH, since J advances by MBLENGTH. */
3785 + memset (&state_wc, '\0', sizeof(mbstate_t));
3786 + memcpy (copy[i] + j, wcrtomb (upper, uwc, &state_wc) == mblength ? upper : str[i] + j, mblength);
3787 + }
3788 + else
3789 + memcpy (copy[i] + j, str[i] + j, mblength);
3790 + }
3791 + else
3792 + memcpy (copy[i] + j, str[i] + j, mblength);
3793 + }
3794 + j += mblength;
3795 + }
3796 + copy[i][j] = '\0';
3797 + len[i] = j;
3798 + }
3799 +
3800 + return xmemcoll (copy[0], len[0], copy[1], len[1]);
3801 }
3802 +#endif
3803
3804 /* Output the line in linebuffer LINE to standard output
3805 provided that the switches say it should be output.
3806 @@ -295,15 +478,43 @@
3807 {
3808 char *prevfield IF_LINT (= NULL);
3809 size_t prevlen IF_LINT (= 0);
3810 +#if HAVE_MBRTOWC
3811 + mbstate_t prevstate;
3812 +
3813 + memset (&prevstate, '\0', sizeof (mbstate_t));
3814 +#endif
3815
3816 while (!feof (stdin))
3817 {
3818 char *thisfield;
3819 size_t thislen;
3820 +#if HAVE_MBRTOWC
3821 + mbstate_t thisstate;
3822 +#endif
3823 +
3824 if (readlinebuffer (thisline, stdin) == 0)
3825 break;
3826 thisfield = find_field (thisline);
3827 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3828 +#if HAVE_MBRTOWC
3829 + if (MB_CUR_MAX > 1)
3830 + {
3831 + thisstate = thisline->state;
3832 +
3833 + if (prevline->length == 0 || different_multi
3834 + (thisfield, prevfield, thislen, prevlen, thisstate, prevstate))
3835 + {
3836 + fwrite (thisline->buffer, sizeof (char),
3837 + thisline->length, stdout);
3838 +
3839 + SWAP_LINES (prevline, thisline);
3840 + prevfield = thisfield;
3841 + prevlen = thislen;
3842 + prevstate = thisstate;
3843 + }
3844 + }
3845 + else
3846 +#endif
3847 if (prevline->length == 0
3848 || different (thisfield, prevfield, thislen, prevlen))
3849 {
3850 @@ -322,17 +533,26 @@
3851 size_t prevlen;
3852 uintmax_t match_count = 0;
3853 bool first_delimiter = true;
3854 +#if HAVE_MBRTOWC
3855 + mbstate_t prevstate;
3856 +#endif
3857
3858 if (readlinebuffer (prevline, stdin) == 0)
3859 goto closefiles;
3860 prevfield = find_field (prevline);
3861 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
3862 +#if HAVE_MBRTOWC
3863 + prevstate = prevline->state;
3864 +#endif
3865
3866 while (!feof (stdin))
3867 {
3868 bool match;
3869 char *thisfield;
3870 size_t thislen;
3871 +#if HAVE_MBRTOWC
3872 + mbstate_t thisstate;
3873 +#endif
3874 if (readlinebuffer (thisline, stdin) == 0)
3875 {
3876 if (ferror (stdin))
3877 @@ -341,6 +561,15 @@
3878 }
3879 thisfield = find_field (thisline);
3880 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
3881 +#if HAVE_MBRTOWC
3882 + if (MB_CUR_MAX > 1)
3883 + {
3884 + thisstate = thisline->state;
3885 + match = !different_multi (thisfield, prevfield,
3886 + thislen, prevlen, thisstate, prevstate);
3887 + }
3888 + else
3889 +#endif
3890 match = !different (thisfield, prevfield, thislen, prevlen);
3891 match_count += match;
3892
3893 @@ -373,6 +602,9 @@
3894 SWAP_LINES (prevline, thisline);
3895 prevfield = thisfield;
3896 prevlen = thislen;
3897 +#if HAVE_MBRTOWC
3898 + prevstate = thisstate;
3899 +#endif
3900 if (!match)
3901 match_count = 0;
3902 }
3903 @@ -417,6 +649,19 @@
3904
3905 atexit (close_stdout);
3906
3907 +#if HAVE_MBRTOWC
3908 + if (MB_CUR_MAX > 1)
3909 + {
3910 + find_field = find_field_multi;
3911 + }
3912 + else
3913 +#endif
3914 + {
3915 + find_field = find_field_uni;
3916 + }
3917 +
3918 +
3919 +
3920 skip_chars = 0;
3921 skip_fields = 0;
3922 check_chars = SIZE_MAX;
3923 diff -Naur coreutils-6.9.orig/tests/sort/Makefile.am coreutils-6.9/tests/sort/Makefile.am
3924 --- coreutils-6.9.orig/tests/sort/Makefile.am 2007-03-18 21:36:44.000000000 +0000
3925 +++ coreutils-6.9/tests/sort/Makefile.am 2007-04-07 17:03:36.000000000 +0000
3926 @@ -66,15 +66,17 @@
3927 bigfield.O bigfield.E
3928 ##test-files-end
3929
3930 -EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen)
3931 -noinst_SCRIPTS = $x-tests
3932 +run_gen += mb1.O mb2.O
3933 +
3934 +EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen) mb1.I mb1.X mb2.I mb2.X
3935 +noinst_SCRIPTS = $x-tests # $x-mb-tests
3936 TESTS_ENVIRONMENT = \
3937 CU_TEST_NAME=`basename $(abs_srcdir)`,$$tst \
3938 PATH="$(VG_PATH_PREFIX)`pwd`/../../src$(PATH_SEPARATOR)$$PATH"
3939
3940 editpl = sed -e 's,@''PERL''@,$(PERL),g' -e 's,@''srcdir''@,$(srcdir),g'
3941
3942 -TESTS = $x-tests
3943 +TESTS = $x-tests $x-mb-tests
3944
3945 mk_script = $(srcdir)/../mk-script
3946 $(srcdir)/$x-tests: $(mk_script) Test.pm Makefile.am
3947 diff -Naur coreutils-6.9.orig/tests/sort/Makefile.in coreutils-6.9/tests/sort/Makefile.in
3948 --- coreutils-6.9.orig/tests/sort/Makefile.in 2007-03-22 21:20:25.000000000 +0000
3949 +++ coreutils-6.9/tests/sort/Makefile.in 2007-04-07 17:01:55.000000000 +0000
3950 @@ -540,14 +540,16 @@
3951 incompat5.O incompat5.E incompat6.O incompat6.E nul-tab.O nul-tab.E \
3952 bigfield.O bigfield.E
3953
3954 -EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen)
3955 -noinst_SCRIPTS = $x-tests
3956 +run_gen += mb1.O mb2.O
3957 +
3958 +EXTRA_DIST = Test.pm $x-tests $(explicit) $(maint_gen) mb1.I mb1.X mb2.I mb2.X
3959 +noinst_SCRIPTS = $x-tests # $x-mb-tests
3960 TESTS_ENVIRONMENT = \
3961 CU_TEST_NAME=`basename $(abs_srcdir)`,$$tst \
3962 PATH="$(VG_PATH_PREFIX)`pwd`/../../src$(PATH_SEPARATOR)$$PATH"
3963
3964 editpl = sed -e 's,@''PERL''@,$(PERL),g' -e 's,@''srcdir''@,$(srcdir),g'
3965 -TESTS = $x-tests
3966 +TESTS = $x-tests $x-mb-tests
3967 mk_script = $(srcdir)/../mk-script
3968 MAINTAINERCLEANFILES = $x-tests $(maint_gen)
3969 CLEANFILES = $(run_gen)
3970 diff -Naur coreutils-6.9.orig/tests/sort/mb1.I coreutils-6.9/tests/sort/mb1.I
3971 --- coreutils-6.9.orig/tests/sort/mb1.I 1970-01-01 00:00:00.000000000 +0000
3972 +++ coreutils-6.9/tests/sort/mb1.I 2007-04-07 16:59:55.000000000 +0000
3973 @@ -0,0 +1,4 @@
3974 +Apple@10
3975 +Banana@5
3976 +Citrus@20
3977 +Cherry@30
3978 diff -Naur coreutils-6.9.orig/tests/sort/mb1.X coreutils-6.9/tests/sort/mb1.X
3979 --- coreutils-6.9.orig/tests/sort/mb1.X 1970-01-01 00:00:00.000000000 +0000
3980 +++ coreutils-6.9/tests/sort/mb1.X 2007-04-07 16:59:55.000000000 +0000
3981 @@ -0,0 +1,4 @@
3982 +Banana@5
3983 +Apple@10
3984 +Citrus@20
3985 +Cherry@30
3986 diff -Naur coreutils-6.9.orig/tests/sort/mb2.I coreutils-6.9/tests/sort/mb2.I
3987 --- coreutils-6.9.orig/tests/sort/mb2.I 1970-01-01 00:00:00.000000000 +0000
3988 +++ coreutils-6.9/tests/sort/mb2.I 2007-04-07 16:59:55.000000000 +0000
3989 @@ -0,0 +1,4 @@
3990 +Apple@AA10@@20
3991 +Banana@AA5@@30
3992 +Citrus@AA20@@5
3993 +Cherry@AA30@@10
3994 diff -Naur coreutils-6.9.orig/tests/sort/mb2.X coreutils-6.9/tests/sort/mb2.X
3995 --- coreutils-6.9.orig/tests/sort/mb2.X 1970-01-01 00:00:00.000000000 +0000
3996 +++ coreutils-6.9/tests/sort/mb2.X 2007-04-07 16:59:55.000000000 +0000
3997 @@ -0,0 +1,4 @@
3998 +Citrus@AA20@@5
3999 +Cherry@AA30@@10
4000 +Apple@AA10@@20
4001 +Banana@AA5@@30
4002 diff -Naur coreutils-6.9.orig/tests/sort/sort-mb-tests coreutils-6.9/tests/sort/sort-mb-tests
4003 --- coreutils-6.9.orig/tests/sort/sort-mb-tests 1970-01-01 00:00:00.000000000 +0000
4004 +++ coreutils-6.9/tests/sort/sort-mb-tests 2007-04-07 16:59:55.000000000 +0000
4005 @@ -0,0 +1,58 @@
4006 +#! /bin/sh
4007 +case $# in
4008 + 0) xx='../../src/sort';;
4009 + *) xx="$1";;
4010 +esac
4011 +test "$VERBOSE" && echo=echo || echo=:
4012 +$echo testing program: $xx
4013 +errors=0
4014 +test "$srcdir" || srcdir=.
4015 +test "$VERBOSE" && $xx --version 2> /dev/null
4016 +# Run under en_US.UTF-8; exit with status 77 when no UTF-8 charmap is reported.
4017 +export LC_ALL=en_US.UTF-8
4018 +locale -k LC_CTYPE 2>&1 | grep -q charmap.*UTF-8 || exit 77
4019 +errors=0
4020 +# mb1: numeric sort on field 2 using the -t separator; input and expected output both come from $srcdir.
4021 +$xx -t @ -k2 -n $srcdir/mb1.I > mb1.O
4022 +code=$?
4023 +if test $code != 0; then
4024 + $echo "Test mb1 failed: $xx return code $code differs from expected value 0" 1>&2
4025 + errors=`expr $errors + 1`
4026 +else
4027 + cmp mb1.O $srcdir/mb1.X > /dev/null 2>&1
4028 + case $? in
4029 + 0) if test "$VERBOSE"; then $echo "passed mb1"; fi;;
4030 + 1) $echo "Test mb1 failed: files mb1.O and $srcdir/mb1.X differ" 1>&2
4031 + (diff -c mb1.O $srcdir/mb1.X) 2> /dev/null
4032 + errors=`expr $errors + 1`;;
4033 + 2) $echo "Test mb1 may have failed." 1>&2
4034 + $echo The command "cmp mb1.O $srcdir/mb1.X" failed. 1>&2
4035 + errors=`expr $errors + 1`;;
4036 + esac
4037 +fi
4038 +# mb2: numeric sort on field 4 using the -t separator; input and expected output both come from $srcdir.
4039 +$xx -t @ -k4 -n $srcdir/mb2.I > mb2.O
4040 +code=$?
4041 +if test $code != 0; then
4042 + $echo "Test mb2 failed: $xx return code $code differs from expected value 0" 1>&2
4043 + errors=`expr $errors + 1`
4044 +else
4045 + cmp mb2.O $srcdir/mb2.X > /dev/null 2>&1
4046 + case $? in
4047 + 0) if test "$VERBOSE"; then $echo "passed mb2"; fi;;
4048 + 1) $echo "Test mb2 failed: files mb2.O and $srcdir/mb2.X differ" 1>&2
4049 + (diff -c mb2.O $srcdir/mb2.X) 2> /dev/null
4050 + errors=`expr $errors + 1`;;
4051 + 2) $echo "Test mb2 may have failed." 1>&2
4052 + $echo The command "cmp mb2.O $srcdir/mb2.X" failed. 1>&2
4053 + errors=`expr $errors + 1`;;
4054 + esac
4055 +fi
4056 +# Summarise the results; the exit status is 0 only when every test passed.
4057 +if test $errors = 0; then
4058 + $echo Passed all 2 tests. 1>&2
4059 +else
4060 + $echo Failed $errors tests. 1>&2
4061 +fi
4062 +test $errors = 0 || errors=1
4063 +exit $errors