http://www.openi18n.org/download/utildev/coreutils-5.3.0-i18n-0.1.patch --- coreutils/lib/linebuffer.h +++ coreutils/lib/linebuffer.h @@ -22,6 +22,11 @@ # include +/* Get mbstate_t. */ +# if HAVE_WCHAR_H +# include <wchar.h> +# endif + /* A `struct linebuffer' holds a line of text. */ struct linebuffer @@ -29,6 +34,9 @@ struct linebuffer size_t size; /* Allocated. */ size_t length; /* Used. */ char *buffer; +# if HAVE_WCHAR_H + mbstate_t state; +# endif }; /* Initialize linebuffer LINEBUFFER for use. */ --- coreutils/src/cut.c +++ coreutils/src/cut.c @@ -29,6 +29,12 @@ #include #include #include + +/* Get mbstate_t, mbrtowc(). */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + #include "system.h" #include "error.h" @@ -37,6 +43,13 @@ #include "quote.h" #include "xstrndup.h" +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 +# undef MB_LEN_MAX +# define MB_LEN_MAX 16 +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "cut" @@ -73,6 +86,54 @@ struct range_pair size_t hi; }; +/* Refill the buffer BUF. */ +#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \ + do \ + { \ + if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \ + { \ + memmove (BUF, BUFPOS, BUFLEN); \ + BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \ + BUFPOS = BUF; \ + } \ + } \ + while (0) + +/* Get wide character which starts at BUFPOS. If the byte sequence is + not valid as a character, CONVFAIL is 1. Otherwise 0. */ +#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \ + do \ + { \ + wchar_t tmp; \ + mbstate_t state_bak; \ + \ + if (BUFLEN < 1) \ + { \ + WC = WEOF; \ + break; \ + } \ + \ + /* Get a wide character.
*/ \ + CONVFAIL = 0; \ + state_bak = STATE; \ + MBLENGTH = mbrtowc (&tmp, BUFPOS, BUFLEN, &STATE); \ + WC = tmp; \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-1: \ + case (size_t)-2: \ + ++CONVFAIL; \ + STATE = state_bak; \ + /* Fall througn. */ \ + \ + case 0: \ + MBLENGTH = 1; \ + break; \ + } \ + } \ + while (0) + /* This buffer is used to support the semantics of the -s option (or lack of same) when the specified field list includes (does not include) the first field. In both of those cases, the entire @@ -85,7 +146,7 @@ static char *field_1_buffer; /* The number of bytes allocated for FIELD_1_BUFFER. */ static size_t field_1_bufsize; -/* The largest field or byte index used as an endpoint of a closed +/* The largest field, character or byte index used as an endpoint of a closed or degenerate range specification; this doesn't include the starting index of right-open-ended ranges. For example, with either range spec `2-5,9-', `2-3,5,9-' this variable would be set to 5. */ @@ -97,10 +158,11 @@ static size_t eol_range_start; /* This is a bit vector. In byte mode, which bytes to output. + In character mode, which characters to output. In field mode, which DELIM-separated fields to output. - Both bytes and fields are numbered starting with 1, + Bytes, characters and fields are numbered starting with 1, so the zeroth bit of this array is unused. - A field or byte K has been selected if + A byte, character or field K has been selected if (K <= MAX_RANGE_ENDPOINT and is_printable_field(K)) || (EOL_RANGE_START > 0 && K >= EOL_RANGE_START). */ static unsigned char *printable_field; @@ -109,9 +171,12 @@ enum operating_mode { undefined_mode, - /* Output characters that are in the given bytes. */ + /* Output bytes that are in the given bytes. */ byte_mode, + /* Output characters that are at the given positions. */ + character_mode, + /* Output the given delimeter-separated fields. 
*/ field_mode }; @@ -121,6 +186,13 @@ char *program_name; static enum operating_mode operating_mode; +/* If true, when in byte mode, don't split multibyte characters. */ +static bool byte_mode_character_aware; + +/* If true, the function for single byte locale is work + if this program runs on multibyte locale. */ +static bool force_singlebyte_mode; + /* If true do not output lines containing no delimeter characters. Otherwise, all such lines are printed. This option is valid only with field mode. */ @@ -132,6 +204,9 @@ static bool complement; /* The delimeter character for field mode. */ static unsigned char delim; +#if HAVE_WCHAR_H +static wchar_t wcdelim; +#endif /* True if the --output-delimiter=STRING option was specified. */ static bool output_delimiter_specified; @@ -205,7 +280,7 @@ Mandatory arguments to long options are -f, --fields=LIST select only these fields; also print any line\n\ that contains no delimiter character, unless\n\ the -s option is specified\n\ - -n (ignored)\n\ + -n with -b: don't split multibyte characters\n\ "), stdout); fputs (_("\ --complement complement the set of selected bytes, characters\n\ @@ -360,7 +435,7 @@ set_fields (const char *fieldstr) in_digits = false; /* Starting a range. */ if (dash_found) - FATAL_ERROR (_("invalid byte or field list")); + FATAL_ERROR (_("invalid byte, character or field list")); dash_found = true; fieldstr++; @@ -385,14 +460,16 @@ set_fields (const char *fieldstr) if (value == 0) { /* `n-'. From `initial' to end of line. */ - eol_range_start = initial; + if(eol_range_start == 0 || + (eol_range_start != 0 && eol_range_start > initial)) + eol_range_start = initial; field_found = true; } else { /* `m-n' or `-n' (1-n). */ if (value < initial) - FATAL_ERROR (_("invalid byte or field list")); + FATAL_ERROR (_("invalid byte, character or field list")); /* Is there already a range going to end of line? 
*/ if (eol_range_start != 0) @@ -478,7 +555,7 @@ set_fields (const char *fieldstr) fieldstr++; } else - FATAL_ERROR (_("invalid byte or field list")); + FATAL_ERROR (_("invalid byte, character or field list")); } max_range_endpoint = 0; @@ -571,6 +648,81 @@ cut_bytes (FILE *stream) } } +#if HAVE_MBRTOWC +/* This function is in use for the following case. + + 1. Read from the stream STREAM, printing to standard output any selected + characters. + + 2. Read from stream STREAM, printing to standard output any selected bytes, + without splitting multibyte characters. */ + +static void +cut_characters_or_cut_bytes_no_split (FILE *stream) +{ + size_t idx; /* Number of bytes or characters in the line so far. */ + /* Whether to begin printing delimiters between ranges for the current line. + Set after we've begun printing data corresponding to the first range. */ + bool print_delimiter; + + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ + char *bufpos; /* Next read position of BUF. */ + size_t buflen; /* The length of the byte sequence in buf. */ + wint_t wc; /* A gotten wide character. */ + size_t mblength; /* The byte size of a multibyte character which shows + as same character as WC. */ + mbstate_t state; /* State of the stream. */ + int convfail; /* 1, when conversion is failed. Otherwise 0. */ + + + idx = 0; + print_delimiter = false; + buflen = 0; + bufpos = buf; + memset (&state, '\0', sizeof(mbstate_t)); + + while (1) + { + REFILL_BUFFER (buf, bufpos, buflen, stream); + + GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail); + + if (wc == WEOF) + { + if (idx > 0) + putchar ('\n'); + break; + } + else if (wc == L'\n') + { + putchar ('\n'); + idx = 0; + print_delimiter = false; + } + else + { + bool range_start; + bool *rs = output_delimiter_specified ? &range_start : NULL; + + idx += (operating_mode == byte_mode) ? 
mblength : 1; + if (print_kth (idx, rs)) + { + if (rs && *rs && print_delimiter) + { + fwrite (output_delimiter_string, sizeof (char), + output_delimiter_length, stdout); + } + print_delimiter = true; + fwrite (bufpos, mblength, sizeof(char), stdout); + } + } + + buflen -= mblength; + bufpos += mblength; + } +} +#endif + /* Read from stream STREAM, printing to standard output any selected fields. */ static void @@ -692,13 +844,190 @@ cut_fields (FILE *stream) } } +#if HAVE_MBRTOWC +static void +cut_fields_mb (FILE *stream) +{ + int c; + size_t field_idx = 1; + bool found_any_selected_field = false; + bool buffer_first_field; + int empty_input; + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ + char *bufpos; /* Next read position of BUF. */ + size_t buflen; /* The length of the byte sequence in buf. */ + wint_t wc = 0; /* A gotten wide character. */ + size_t mblength; /* The byte size of a multibyte character which shows + as same character as WC. */ + mbstate_t state; /* State of the stream. */ + int convfail; /* 1, when conversion is failed. Otherwise 0. */ + + bufpos = buf; + buflen = 0; + memset (&state, '\0', sizeof(mbstate_t)); + + c = getc (stream); + empty_input = (c == EOF); + if (c != EOF) + ungetc (c, stream); + else + wc = WEOF; + + /* To support the semantics of the -s flag, we may have to buffer + all of the first field to determine whether it is `delimited.' + But that is unnecessary if all non-delimited lines must be printed + and the first field has been selected, or if non-delimited lines + must be suppressed and the first field has *not* been selected. + That is because a non-delimited line has exactly one field. 
*/ + buffer_first_field = (suppress_non_delimited ^ !print_kth (1, NULL)); + + while (1) + { + if (field_idx == 1 && buffer_first_field) + { + size_t n_bytes = 0; + + while (1) + { + REFILL_BUFFER (buf, bufpos, buflen, stream); + + GET_NEXT_WC_FROM_BUFFER + (wc, bufpos, buflen, mblength, state, convfail); + + if (wc == WEOF) + break; + + field_1_buffer = xrealloc (field_1_buffer, n_bytes + mblength); + memcpy (field_1_buffer + n_bytes, bufpos, mblength); + n_bytes += mblength; + buflen -= mblength; + bufpos += mblength; + + if (!convfail && (wc == L'\n' || wc == wcdelim)) + break; + } + + if (wc == WEOF) + break; + + /* If the first field extends to the end of line (it is not + delimited) and we are printing all non-delimited lines, + print this one. */ + if (convfail || (!convfail && wc != wcdelim)) + { + if (suppress_non_delimited) + { + /* Empty. */ + } + else + { + fwrite (field_1_buffer, sizeof (char), n_bytes, stdout); + /* Make sure the output line is newline terminated. */ + if (convfail || (!convfail && wc != L'\n')) + putchar ('\n'); + } + continue; + } + + if (print_kth (1, NULL)) + { + /* Print the field, but not the trailing delimiter. 
*/ + fwrite (field_1_buffer, sizeof (char), n_bytes - 1, stdout); + found_any_selected_field = true; + } + ++field_idx; + } + + if (wc != WEOF) + { + if (print_kth (field_idx, NULL)) + { + if (found_any_selected_field) + { + fwrite (output_delimiter_string, sizeof (char), + output_delimiter_length, stdout); + } + found_any_selected_field = true; + } + + while (1) + { + REFILL_BUFFER (buf, bufpos, buflen, stream); + + GET_NEXT_WC_FROM_BUFFER + (wc, bufpos, buflen, mblength, state, convfail); + + if (wc == WEOF) + break; + else if (!convfail && (wc == wcdelim || wc == L'\n')) + { + buflen -= mblength; + bufpos += mblength; + break; + } + + if (print_kth (field_idx, NULL)) + fwrite (bufpos, mblength, sizeof(char), stdout); + + buflen -= mblength; + bufpos += mblength; + } + } + + if ((!convfail || wc == L'\n') && buflen < 1) + wc = WEOF; + + if (!convfail && wc == wcdelim) + ++field_idx; + else if (wc == WEOF || (!convfail && wc == L'\n')) + { + if (found_any_selected_field + || (!empty_input && !(suppress_non_delimited && field_idx == 1))) + putchar ('\n'); + if (wc == WEOF) + break; + field_idx = 1; + found_any_selected_field = false; + } + } +} +#endif + static void cut_stream (FILE *stream) { - if (operating_mode == byte_mode) - cut_bytes (stream); +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1 && !force_singlebyte_mode) + { + switch (operating_mode) + { + case byte_mode: + if (byte_mode_character_aware) + cut_characters_or_cut_bytes_no_split (stream); + else + cut_bytes (stream); + break; + + case character_mode: + cut_characters_or_cut_bytes_no_split (stream); + break; + + case field_mode: + cut_fields_mb (stream); + break; + + default: + abort (); + } + } else - cut_fields (stream); +#endif + { + if (operating_mode == field_mode) + cut_fields (stream); + else + cut_bytes (stream); + } } /* Process file FILE to standard output. 
@@ -748,6 +1077,8 @@ main (int argc, char **argv) bool ok; bool delim_specified = false; char *spec_list_string IF_LINT(= NULL); + char mbdelim[MB_LEN_MAX + 1]; + size_t delimlen = 0; initialize_main (&argc, &argv); program_name = argv[0]; @@ -770,7 +1101,6 @@ main (int argc, char **argv) switch (optc) { case 'b': - case 'c': /* Build the byte list. */ if (operating_mode != undefined_mode) FATAL_ERROR (_("only one type of list may be specified")); @@ -778,6 +1108,14 @@ main (int argc, char **argv) spec_list_string = optarg; break; + case 'c': + /* Build the character list. */ + if (operating_mode != undefined_mode) + FATAL_ERROR (_("only one type of list may be specified")); + operating_mode = character_mode; + spec_list_string = optarg; + break; + case 'f': /* Build the field list. */ if (operating_mode != undefined_mode) @@ -789,9 +1127,33 @@ main (int argc, char **argv) case 'd': /* New delimiter. */ /* Interpret -d '' to mean `use the NUL byte as the delimiter.' */ - if (optarg[0] != '\0' && optarg[1] != '\0') - FATAL_ERROR (_("the delimiter must be a single character")); - delim = optarg[0]; +#if HAVE_MBRTOWC + if(MB_CUR_MAX > 1) + { + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + delimlen = mbrtowc (&wcdelim, optarg, MB_LEN_MAX, &state); + + if (delimlen == (size_t)-1 || delimlen == (size_t)-2) + force_singlebyte_mode = true; + else + { + delimlen = (delimlen < 1) ?
1 : delimlen; + if (wcdelim != L'\0' && *(optarg + delimlen) != '\0') + FATAL_ERROR (_("the delimiter must be a single character")); + memcpy (mbdelim, optarg, delimlen); + mbdelim[delimlen] = '\0'; /* xstrdup (mbdelim) below needs a terminator. */ + } + } + + if (MB_CUR_MAX <= 1 || force_singlebyte_mode) +#endif + { + if (optarg[0] != '\0' && optarg[1] != '\0') + FATAL_ERROR (_("the delimiter must be a single character")); + delim = (unsigned char) optarg[0]; + } delim_specified = true; break; @@ -805,6 +1167,7 @@ main (int argc, char **argv) break; case 'n': + byte_mode_character_aware = true; break; case 's': @@ -827,7 +1190,7 @@ main (int argc, char **argv) if (operating_mode == undefined_mode) FATAL_ERROR (_("you must specify a list of bytes, characters, or fields")); - if (delim != '\0' && operating_mode != field_mode) + if (delim_specified && operating_mode != field_mode) FATAL_ERROR (_("an input delimiter may be specified only\ when operating on fields")); @@ -854,15 +1217,34 @@ main (int argc, char **argv) } if (!delim_specified) - delim = '\t'; + { + delim = '\t'; +#ifdef HAVE_MBRTOWC + wcdelim = L'\t'; + mbdelim[0] = '\t'; + mbdelim[1] = '\0'; + delimlen = 1; +#endif + } if (output_delimiter_string == NULL) { - static char dummy[2]; - dummy[0] = delim; - dummy[1] = '\0'; - output_delimiter_string = dummy; - output_delimiter_length = 1; +#ifdef HAVE_MBRTOWC + if (MB_CUR_MAX > 1 && !force_singlebyte_mode) + { + output_delimiter_string = xstrdup(mbdelim); + output_delimiter_length = delimlen; + } + + if (MB_CUR_MAX <= 1 || force_singlebyte_mode) +#endif + { + static char dummy[2]; + dummy[0] = delim; + dummy[1] = '\0'; + output_delimiter_string = dummy; + output_delimiter_length = 1; + } } if (optind == argc) --- coreutils/src/expand.c +++ coreutils/src/expand.c @@ -38,12 +38,32 @@ #include #include #include + +/* Get mbstate_t, mbrtowc, wcwidth.
*/ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif +#if HAVE_WCTYPE_H +# include <wctype.h> +#endif + #include "system.h" #include "error.h" #include "posixver.h" #include "quote.h" #include "xstrndup.h" +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 +# define MB_LEN_MAX 16 +#endif + +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ +#if HAVE_MBRTOWC && defined mbstate_t +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "expand" @@ -352,8 +372,9 @@ expand (void) } else { - column++; + if (!ISCNTRL (c)) + column++; - if (!column) + if (column >= UINTMAX_MAX) error (EXIT_FAILURE, 0, _("input line is too long")); } @@ -370,6 +391,163 @@ expand (void) } } +#if HAVE_MBRTOWC && HAVE_WCTYPE_H +static void +expand_multibyte (void) +{ + /* Input stream. */ + FILE *fp = next_file (NULL); + + mbstate_t i_state = { 0 }; /* Current shift state of the input stream; starts in the initial state. */ + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ + char *bufpos; /* Next read position of BUF. */ + size_t buflen = 0; /* The length of the byte sequence in buf. */ + + if (!fp) + return; + + /* Binary I/O will preserve the original EOL style (DOS/Unix) of files. */ + SET_BINARY2 (fileno (fp), STDOUT_FILENO); + + for (;;) + { + /* Input character, or EOF. Initialized because the loop test below can run before any character is decoded. */ + wint_t wc = 0; + + /* If true, perform translations. */ + bool convert = true; + + + /* The following variables have valid values only when CONVERT + is true: */ + + /* Column of next input character. */ + uintmax_t column = 0; + + /* Index in TAB_LIST of next tab stop to examine. */ + size_t tab_index = 0; + + + /* Convert a line of text. */ + + do + { + wchar_t w; + size_t mblength; /* The byte size of a multibyte character + which shows as same character as WC. */ + mbstate_t i_state_bak; /* Back up the I_STATE.
*/ + + /* Fill buffer */ + if (buflen < MB_LEN_MAX) + { + if (!feof(fp) && !ferror(fp)) { + if (buflen > 0) memmove(buf, bufpos, buflen); + buflen += fread(buf + buflen, sizeof(char), BUFSIZ, fp); + bufpos = buf; + } + } + + if (buflen < 1) { + /* Move to the next file */ + if (feof(fp) || ferror(fp)) { + fp = next_file(fp); + } + if (!fp) + return; + memset (&i_state, '\0', sizeof (mbstate_t)); + SET_BINARY2 (fileno (fp), STDOUT_FILENO); + continue; + } + + i_state_bak = i_state; + mblength = mbrtowc (&w, bufpos, buflen, &i_state); + wc = w; + + if (mblength == (size_t) -1 || mblength == (size_t) -2) { + i_state = i_state_bak; + wc = L'\0'; + column += convert; + mblength = 1; + } + + if (convert) + { + if (wc == L'\t') + { + /* Column the next input tab stop is on. */ + uintmax_t next_tab_column; + + if (tab_size) + next_tab_column = column + (tab_size - column % tab_size); + else + for (;;) + if (tab_index == first_free_tab) + { + next_tab_column = column + 1; + break; + } + else + { + uintmax_t tab = tab_list[tab_index++]; + if (column < tab) + { + next_tab_column = tab; + break; + } + } + + if (next_tab_column < column) + error (EXIT_FAILURE, 0, _("input line is too long")); + + while (++column < next_tab_column) + if (putchar (' ') < 0) + error (EXIT_FAILURE, errno, _("write error")); + + *bufpos = ' '; + } + else if (wc == L'\b') + { + /* Go back one column, and force recalculation of the + next tab stop. 
*/ + column -= !!column; + tab_index -= !!tab_index; + } + else + { + if (!iswcntrl (wc)) + { + int width = wcwidth (wc); + if (width > 0) { + if (column > (column + width)) + error (EXIT_FAILURE, 0, _("input line is too long")); + column += width; + } + } + } + + convert &= convert_entire_line | !! iswblank (wc); + } + + if (mblength) + { + if (fwrite (bufpos, sizeof(char), mblength, stdout) < mblength) + error (EXIT_FAILURE, errno, _("write error")); + } + else + { + if (putchar ('\0') < 0) + error (EXIT_FAILURE, errno, _("write error")); + mblength = 1; + } + + buflen -= mblength; + bufpos += mblength; + } + while (wc != L'\n'); + } +} +#endif + int main (int argc, char **argv) { @@ -446,7 +624,12 @@ main (int argc, char **argv) file_list = (optind < argc ? &argv[optind] : stdin_argv); - expand (); +#if HAVE_MBRTOWC && HAVE_WCTYPE_H + if (MB_CUR_MAX > 1) + expand_multibyte (); + else +#endif + expand (); if (have_read_stdin && fclose (stdin) != 0) error (EXIT_FAILURE, errno, "-"); --- coreutils/src/fold.c +++ coreutils/src/fold.c @@ -23,6 +23,19 @@ #include #include +/* Get MB_CUR_MAX. */ +#include <stdlib.h> + +/* Get mbrtowc, mbstate_t, wcwidth(). */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + +/* Get iswprint(), iswctype(), wctype(). */ +#if HAVE_WCTYPE_H +# include <wctype.h> +#endif + #include "system.h" #include "error.h" #include "posixver.h" @@ -30,14 +43,57 @@ #define TAB_WIDTH 8 +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 +# undef MB_LEN_MAX +# define MB_LEN_MAX 16 +#endif + +#ifndef HAVE_DECL_WCWIDTH +"this configure-time declaration test was not run" +#endif +#if !HAVE_DECL_WCWIDTH +extern int wcwidth (); +#endif + +/* If wcwidth() doesn't exist, assume all printable characters have + width 1. */ +#if !defined wcwidth && !HAVE_WCWIDTH +# define wcwidth(wc) ((wc) == 0 ? 0 : iswprint (wc) ? 1 : -1) +#endif + /* The official name of this program (e.g., no `g' prefix).
*/ #define PROGRAM_NAME "fold" #define AUTHORS "David MacKenzie" +#define FATAL_ERROR(Message) \ +do \ +{ \ + error (0, 0, (Message)); \ + usage (2); \ +} \ +while (0) + +enum operating_mode +{ + /* Fold texts by columns that are at the given positions. */ + column_mode, + + /* Fold texts by bytes that are at the given positions. */ + byte_mode, + + /* Fold texts by characters that are at the given positions. */ + character_mode, +}; + /* The name this program was run with. */ char *program_name; +/* The argument shows current mode. (Default: column_mode) */ +static enum operating_mode operating_mode; + /* If nonzero, try to break on whitespace. */ static bool break_spaces; @@ -47,9 +103,15 @@ static bool count_bytes; /* If nonzero, at least one of the files we read was standard input. */ static bool have_read_stdin; +/* wide character class `blank' */ +#if HAVE_MBRTOWC +wctype_t blank_type; +#endif + static struct option const longopts[] = { {"bytes", no_argument, NULL, 'b'}, + {"characters", no_argument, NULL, 'c'}, {"spaces", no_argument, NULL, 's'}, {"width", required_argument, NULL, 'w'}, {GETOPT_HELP_OPTION_DECL}, @@ -79,6 +141,7 @@ Mandatory arguments to long options are "), stdout); fputs (_("\ -b, --bytes count bytes rather than columns\n\ + -c, --characters count characters rather than columns\n\ -s, --spaces break at spaces\n\ -w, --width=WIDTH use WIDTH columns instead of 80\n\ "), stdout); @@ -96,7 +159,7 @@ Mandatory arguments to long options are static size_t adjust_column (size_t column, char c) { - if (!count_bytes) + if (operating_mode != byte_mode) { if (c == '\b') { @@ -115,14 +178,9 @@ adjust_column (size_t column, char c) return column; } -/* Fold file FILENAME, or standard input if FILENAME is "-", - to stdout, with maximum line length WIDTH. - Return true if successful. 
*/ - -static bool -fold_file (char *filename, size_t width) +static int +fold_text (FILE *istream, int width) { - FILE *istream; register int c; size_t column = 0; /* Screen column where next char will go. */ size_t offset_out = 0; /* Index in `line_out' for next char. */ @@ -130,20 +188,6 @@ fold_file (char *filename, size_t width) static size_t allocated_out = 0; int saved_errno; - if (STREQ (filename, "-")) - { - istream = stdin; - have_read_stdin = true; - } - else - istream = fopen (filename, "r"); - - if (istream == NULL) - { - error (0, errno, "%s", filename); - return false; - } - while ((c = getc (istream)) != EOF) { if (offset_out + 1 >= allocated_out) @@ -221,6 +265,233 @@ fold_file (char *filename, size_t width) if (offset_out) fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); + return saved_errno; +} + +#if HAVE_MBRTOWC +static void +fold_multibyte_text (FILE *istream, int width) +{ + int i; + char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ + size_t buflen; /* The length of the byte sequence in buf. */ + char *bufpos; /* Next read position of BUF. */ + wint_t wc; /* A gotten wide character. */ + wchar_t tmp; + size_t mblength; /* The byte size of a multibyte character which shows + as same character as WC. */ + mbstate_t state, state_bak; /* State of the stream. */ + int convfail; /* 1, when conversion is failed. Otherwise 0. */ + + char *line_out = NULL; + size_t offset_out = 0; /* Index in `line_out' for next char. 
*/ + size_t allocated_out = 1024; + + int increment; + size_t column = 0; + + size_t last_blank_pos; + size_t last_blank_column; + int is_blank_seen; + int last_blank_increment; + int is_bs_following_last_blank; + size_t bs_following_last_blank_num; + int is_cr_after_last_blank; + + +#define CLEAR_FLAGS \ + do \ + { \ + last_blank_pos = 0; \ + last_blank_column = 0; \ + is_blank_seen = 0; \ + is_bs_following_last_blank = 0; \ + bs_following_last_blank_num = 0; \ + is_cr_after_last_blank = 0; \ + } \ + while (0) + +#define START_NEW_LINE \ + do \ + { \ + putchar ('\n'); \ + column = 0; \ + offset_out = 0; \ + CLEAR_FLAGS; \ + } \ + while (0) + + CLEAR_FLAGS; + + memset (&state, '\0', sizeof(mbstate_t)); + line_out = xmalloc (allocated_out); + + buflen = fread (buf, sizeof(char), BUFSIZ, istream); + bufpos = buf; + + for (;; bufpos += mblength, buflen -= mblength) + { + if (buflen < MB_LEN_MAX && !feof (istream) && !ferror (istream)) + { + memmove (buf, bufpos, buflen); + buflen += fread (buf + buflen, sizeof(char), BUFSIZ, istream); + bufpos = buf; + } + + if (buflen < 1) + break; + + /* Get a wide character. */ + convfail = 0; + state_bak = state; + mblength = mbrtowc (&tmp, bufpos, buflen, &state); + wc = tmp; + + switch (mblength) + { + case (size_t)-1: + case (size_t)-2: + convfail++; + state = state_bak; + /* Fall through. */ + + case 0: + mblength = 1; + break; + } + + if (!convfail && wc == L'\n') + { + if (offset_out > 0) + { + fwrite (line_out, sizeof(char), offset_out, stdout); + START_NEW_LINE; + } + continue; + } + +rescan: + if (operating_mode == byte_mode) /* byte mode */ + increment = mblength; + else if (operating_mode == character_mode) /* character mode */ + increment = 1; + else /* column mode */ + { + if (convfail) + increment = 1; + else + { + switch (wc) + { + case L'\b': + increment = (column > 0) ? 
-1 : 0; + break; + + case L'\r': + increment = -1 * column; + break; + + case L'\t': + increment = 8 - column % 8; + break; + + default: + increment = wcwidth (wc); + increment = (increment < 0) ? 0 : increment; + } + } + } + + if (column + increment > width && break_spaces && last_blank_pos) + { + fwrite (line_out, sizeof(char), last_blank_pos, stdout); + putchar ('\n'); + + offset_out = offset_out - last_blank_pos; + column = column - last_blank_column + ((is_cr_after_last_blank) + ? last_blank_increment : bs_following_last_blank_num); + memmove (line_out, line_out + last_blank_pos, offset_out); + CLEAR_FLAGS; + goto rescan; + } + + if (column + increment > width && column != 0) + { + fwrite (line_out, sizeof(char), offset_out, stdout); + START_NEW_LINE; + goto rescan; + } + + if (allocated_out < offset_out + mblength) + line_out = x2nrealloc (line_out, &allocated_out, sizeof *line_out); + + for (i = 0; i < mblength; i++) + { + *(line_out + offset_out) = *(bufpos + i); + ++offset_out; + } + + column += increment; + + if (is_blank_seen && !convfail && wc == L'\r') + is_cr_after_last_blank = 1; + + if (is_bs_following_last_blank && !convfail && wc == L'\b') + ++bs_following_last_blank_num; + else + is_bs_following_last_blank = 0; + + if (break_spaces && !convfail && iswctype (wc, blank_type)) + { + last_blank_pos = offset_out; + last_blank_column = column; + is_blank_seen = 1; + last_blank_increment = increment; + is_bs_following_last_blank = 1; + bs_following_last_blank_num = 0; + is_cr_after_last_blank = 0; + } + } + + if (offset_out) + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); + + free(line_out); +} +#endif + +/* Fold file FILENAME, or standard input if FILENAME is "-", + to stdout, with maximum line length WIDTH. + Return true if successful. 
*/ + +static bool +fold_file (char *filename, int width) +{ + FILE *istream; + int saved_errno = 0; /* Stays 0 on the multibyte path, which reports no errno. */ + + if (STREQ (filename, "-")) + { + istream = stdin; + have_read_stdin = true; + } + else + istream = fopen (filename, "r"); + + if (istream == NULL) + { + error (0, errno, "%s", filename); + return false; + } + + /* Define how ISTREAM is being folded. */ +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + fold_multibyte_text (istream, width); + else +#endif + saved_errno = fold_text (istream, width); + if (ferror (istream)) { error (0, saved_errno, "%s", filename); @@ -253,6 +524,10 @@ main (int argc, char **argv) atexit (close_stdout); +#if HAVE_MBRTOWC + blank_type = wctype ("blank"); +#endif + operating_mode = column_mode; break_spaces = count_bytes = have_read_stdin = false; /* Turn any numeric options into -w options. */ @@ -280,12 +555,23 @@ main (int argc, char **argv) } } - while ((optc = getopt_long (argc, argv, "bsw:", longopts, NULL)) != -1) + while ((optc = getopt_long (argc, argv, "bcsw:", longopts, NULL)) != -1) { switch (optc) { + case 0: + break; + case 'b': /* Count bytes rather than columns. */ - count_bytes = true; + if (operating_mode != column_mode) + FATAL_ERROR (_("only one way of folding may be specified")); + operating_mode = byte_mode; + break; + + case 'c': /* Count characters rather than columns. */ + if (operating_mode != column_mode) + FATAL_ERROR (_("only one way of folding may be specified")); + operating_mode = character_mode; break; case 's': /* Break at word boundaries. */ --- coreutils/src/join.c +++ coreutils/src/join.c @@ -24,6 +24,16 @@ #include #include +/* Get mbstate_t, mbrtowc, wcwidth. */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + +/* Get iswblank, towupper. */ +#if HAVE_WCTYPE_H +# include <wctype.h> +#endif + #include "system.h" #include "error.h" #include "hard-locale.h" @@ -34,6 +44,11 @@ #include "xmemcoll.h" #include "xstrtol.h" +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.
*/ +#if HAVE_MBRTOWC && defined mbstate_t +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "join" @@ -110,7 +125,10 @@ static struct outlist *outlist_end = &ou /* Tab character separating fields; if this is NUL fields are separated by any nonempty string of white space, otherwise by exactly one tab character. */ -static char tab; +static char *tab = NULL; + +/* The number of bytes used for tab. */ +static size_t tablen = 0; /* When using getopt_long_only, no long option can start with a character that is a short option. */ @@ -233,7 +251,7 @@ xfields (struct line *line) if (tab) { - unsigned char t = tab; + unsigned char t = tab[0]; char *sep; for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1) extract_field (line, ptr, sep - ptr); @@ -262,6 +280,133 @@ xfields (struct line *line) extract_field (line, ptr, lim - ptr); } +#if HAVE_MBRTOWC +static void +xfields_multibyte (struct line *line) +{ + int i; + char *ptr0 = line->buf.buffer; + char *ptr; + char *lim; + wchar_t wc = 0; + size_t mblength; + mbstate_t state, state_bak; + + memset (&state, 0, sizeof (mbstate_t)); + + ptr = ptr0; + lim = ptr0 + line->buf.length - 1; + + if (tab == NULL) + { + /* Skip leading blanks before the first field. */ + while (ptr < lim) + { + state_bak = state; + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state); + + if (mblength == (size_t) -1 || mblength == (size_t) -2) + { + mblength = 1; + state = state_bak; + break; + } + mblength = (mblength < 1) ? 1 : mblength; + + if (!iswblank (wc)) + break; + ptr += mblength; + } + } + + for (i = 0; ptr < lim; ++i) + { + if (tab != NULL) + { + char *beg = ptr; + while (ptr < lim) + { + state_bak = state; + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state); + + if (mblength == (size_t) -1 || mblength == (size_t) -2) + { + mblength = 1; + state = state_bak; + } + mblength = (mblength < 1) ? 
1 : mblength; + + if (mblength == tablen && !memcmp (ptr, tab, mblength)) + break; + else + { + ptr += mblength; + continue; + } + } + + extract_field (line, beg, ptr - beg); + if (ptr < lim) + ptr += mblength; + } + else + { + char *beg = ptr; + while (ptr < lim) + { + state_bak = state; + mblength = mbrtowc (&wc, ptr, lim - ptr + 1, &state); + + if (mblength == (size_t) -1 || mblength == (size_t) -2) + { + mblength = 1; + state = state_bak; + } + mblength = (mblength < 1) ? 1 : mblength; + + if (iswblank (wc)) + break; + else + { + ptr += mblength; + continue; + } + } + + extract_field (line, beg, ptr - beg); + if (ptr < lim) + ptr += mblength; + } + } + + if (ptr != ptr0) + { + mblength = mbrtowc (&wc, ptr - mblength, mblength, &state); + wc = (mbsinit (&state) && *(ptr - mblength) == '\0') ? L'\0' : wc; + if (tab != NULL) + { + if (mblength == (size_t) -1 || mblength == (size_t) -2) + mblength = 1; + + if (mblength == tablen && !memcmp (ptr - mblength, tab, mblength)) + /* Add one more (empty) field because the last character of + the line was a delimiter. */ + extract_field (line, NULL, 0); + } + else + { + if (mblength != (size_t) -1 && mblength != (size_t) -2) + { + if (iswblank (wc)) + /* Add one more (empty) field because the last character of + the line was a delimiter. */ + extract_field (line, NULL, 0); + } + } + } +} +#endif + /* Read a line from FP into LINE and split it into fields. Return true if successful. */ @@ -282,7 +427,13 @@ get_line (FILE *fp, struct line *line) line->nfields_allocated = 0; line->nfields = 0; line->fields = NULL; - xfields (line); + +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + xfields_multibyte (line); + else +#endif + xfields (line); return true; } @@ -336,56 +487,115 @@ static int keycmp (struct line const *line1, struct line const *line2) { /* Start of field to compare in each file. */ - char *beg1; - char *beg2; - - size_t len1; - size_t len2; /* Length of fields to compare. 
*/ + char *beg[2]; + char *copy[2]; + size_t len[2]; /* Length of fields to compare. */ int diff; + int i, j; if (join_field_1 < line1->nfields) { - beg1 = line1->fields[join_field_1].beg; - len1 = line1->fields[join_field_1].len; + beg[0] = line1->fields[join_field_1].beg; + len[0] = line1->fields[join_field_1].len; } else { - beg1 = NULL; - len1 = 0; + beg[0] = NULL; + len[0] = 0; } if (join_field_2 < line2->nfields) { - beg2 = line2->fields[join_field_2].beg; - len2 = line2->fields[join_field_2].len; + beg[1] = line2->fields[join_field_2].beg; + len[1] = line2->fields[join_field_2].len; } else { - beg2 = NULL; - len2 = 0; + beg[1] = NULL; + len[1] = 0; } - if (len1 == 0) - return len2 == 0 ? 0 : -1; - if (len2 == 0) + if (len[0] == 0) + return len[1] == 0 ? 0 : -1; + if (len[1] == 0) return 1; if (ignore_case) { - /* FIXME: ignore_case does not work with NLS (in particular, - with multibyte chars). */ - diff = memcasecmp (beg1, beg2, MIN (len1, len2)); +#ifdef HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + size_t mblength; + wchar_t wc, uwc; + mbstate_t state, state_bak; + + memset (&state, '\0', sizeof (mbstate_t)); + + for (i = 0; i < 2; i++) + { + copy[i] = alloca (len[i] + 1); + for (j = 0; j < len[i];) + { + state_bak = state; + mblength = mbrtowc (&wc, beg[i] + j, len[i] - j, &state); + + switch (mblength) + { + case (size_t) -1: + case (size_t) -2: + state = state_bak; + /* Fall through: keep the undecodable byte as is. */ + case 0: + copy[i][j] = beg[i][j]; + mblength = 1; + break; + + default: + uwc = towupper (wc); + + if (uwc != wc) + { + mbstate_t state_wc; + + memset (&state_wc, '\0', sizeof (mbstate_t)); + wcrtomb (copy[i] + j, uwc, &state_wc); + } + else + memcpy (copy[i] + j, beg[i] + j, mblength); + } + j += mblength; + } + copy[i][j] = '\0'; + } + return xmemcoll (copy[0], len[0], copy[1], len[1]); + } +#endif + if (hard_LC_COLLATE) + { + for (i = 0; i < 2; i++) + { + copy[i] = alloca (len[i] + 1); + + for (j = 0; j < len[i]; j++) + copy[i][j] = toupper ((unsigned char) beg[i][j]); +
copy[i][j] = '\0'; + } + return xmemcoll (copy[0], len[0], copy[1], len[1]); + } + else + diff = memcasecmp (beg[0], beg[1], MIN (len[0], len[1])); } else { if (hard_LC_COLLATE) - return xmemcoll (beg1, len1, beg2, len2); - diff = memcmp (beg1, beg2, MIN (len1, len2)); + return xmemcoll (beg[0], len[0], beg[1], len[1]); + diff = memcmp (beg[0], beg[1], MIN (len[0], len[1])); } if (diff) return diff; - return len1 < len2 ? -1 : len1 != len2; + return len[0] < len[1] ? -1 : len[0] != len[1]; } /* Print field N of LINE if it exists and is nonempty, otherwise @@ -414,7 +624,8 @@ static void prjoin (struct line const *line1, struct line const *line2) { const struct outlist *outlist; - char output_separator = tab ? tab : ' '; + char *output_separator = tab ? tab : " "; + size_t output_separator_len = tab ? tablen : 1; outlist = outlist_head.next; if (outlist) @@ -449,7 +660,7 @@ prjoin (struct line const *line1, struct o = o->next; if (o == NULL) break; - putchar (output_separator); + fwrite (output_separator, 1, output_separator_len, stdout); } putchar ('\n'); } @@ -467,23 +678,23 @@ prjoin (struct line const *line1, struct prfield (join_field_1, line1); for (i = 0; i < join_field_1 && i < line1->nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_separator_len, stdout); prfield (i, line1); } for (i = join_field_1 + 1; i < line1->nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_separator_len, stdout); prfield (i, line1); } for (i = 0; i < join_field_2 && i < line2->nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_separator_len, stdout); prfield (i, line2); } for (i = join_field_2 + 1; i < line2->nfields; ++i) { - putchar (output_separator); + fwrite (output_separator, 1, output_separator_len, stdout); prfield (i, line2); } putchar ('\n'); @@ -814,7 +1025,21 @@ main (int argc, char **argv) break; case 't': - tab = *optarg; + tab = xstrdup (optarg); +#if HAVE_MBRTOWC + if 
(MB_CUR_MAX > 1) + { + mbstate_t state; + + memset (&state, 0, sizeof (mbstate_t)); + tablen = mbrtowc (NULL, optarg, strlen (optarg), &state); + if (tablen == (size_t) 0 + || tablen == (size_t) -1 || tablen == (size_t) -2) + tablen = 1; + } + else +#endif + tablen = 1; break; case 1: /* Non-option argument. */ --- coreutils/src/pr.c +++ coreutils/src/pr.c @@ -314,6 +314,32 @@ #include #include #include + +/* Get MB_LEN_MAX. */ +#include <limits.h> +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1 +# define MB_LEN_MAX 16 +#endif + +/* Get MB_CUR_MAX. */ +#include <stdlib.h> + +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +/* Get mbstate_t, mbrtowc(), wcwidth(). */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + +/* Get iswprint(). -- for wcwidth(). */ +#if HAVE_WCTYPE_H +# include <wctype.h> +#endif +#if !defined iswprint && !HAVE_ISWPRINT +# define iswprint(wc) 1 +#endif + #include "system.h" #include "error.h" #include "hard-locale.h" @@ -321,6 +347,18 @@ #include "posixver.h" #include "xstrtol.h" +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ +#if HAVE_MBRTOWC && defined mbstate_t +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) +#endif + +#ifndef HAVE_DECL_WCWIDTH +"this configure-time declaration test was not run" +#endif +#if !HAVE_DECL_WCWIDTH +extern int wcwidth (); +#endif + #if ! (HAVE_DECL_STRTOUMAX || defined strtoumax) uintmax_t strtoumax (); #endif @@ -416,8 +454,21 @@ struct COLUMN typedef struct COLUMN COLUMN; #define NULLCOL (COLUMN *)0 + +/* Function pointers to switch functions for single byte locale or for + multibyte locale. If multibyte functions do not exist in your system, + these pointers always point to the function for single byte locale. */ +static void (*print_char) (char c); +static int (*char_to_clump) (char c); + +/* Functions for single byte locale.
*/ +static void print_char_single (char c); +static int char_to_clump_single (char c); + +/* Functions for multibyte locale. */ +static void print_char_multi (char c); +static int char_to_clump_multi (char c); -static int char_to_clump (char c); static bool read_line (COLUMN *p); static bool print_page (void); static bool print_stored (COLUMN *p); @@ -427,6 +478,7 @@ static void print_header (void); static void pad_across_to (int position); static void add_line_number (COLUMN *p); static void getoptarg (char *arg, char switch_char, char *character, + int *character_length, int *character_width, int *number); void usage (int status); static void print_files (int number_of_files, char **av); @@ -441,7 +493,6 @@ static void store_char (char c); static void pad_down (int lines); static void read_rest_of_line (COLUMN *p); static void skip_read (COLUMN *p, int column_number); -static void print_char (char c); static void cleanup (void); static void print_sep_string (void); static void separator_string (const char *optarg_S); @@ -456,7 +507,7 @@ static COLUMN *column_vector; we store the leftmost columns contiguously in buff. To print a line from buff, get the index of the first character from line_vector[i], and print up to line_vector[i + 1]. */ -static char *buff; +static unsigned char *buff; /* Index of the position in buff where the next character will be stored. */ @@ -560,7 +611,7 @@ static int chars_per_column; static bool untabify_input = false; /* (-e) The input tab character. */ -static char input_tab_char = '\t'; +static char input_tab_char[MB_LEN_MAX] = "\t"; /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ... where the leftmost column is 1. */ @@ -570,7 +621,10 @@ static int chars_per_input_tab = 8; static bool tabify_output = false; /* (-i) The output tab character. */ -static char output_tab_char = '\t'; +static char output_tab_char[MB_LEN_MAX] = "\t"; + +/* (-i) The byte length of output tab character. 
*/ +static int output_tab_char_length = 1; /* (-i) The width of the output tab. */ static int chars_per_output_tab = 8; @@ -644,7 +698,13 @@ static int power_10; static bool numbered_lines = false; /* (-n) Character which follows each line number. */ -static char number_separator = '\t'; +static char number_separator[MB_LEN_MAX] = "\t"; + +/* (-n) The byte length of the character which follows each line number. */ +static int number_separator_length = 1; + +/* (-n) The character width of the character which follows each line number. */ +static int number_separator_width = 0; /* (-n) line counting starts with 1st line of input file (not with 1st line of 1st page printed). */ @@ -697,6 +757,7 @@ static bool use_col_separator = false; -a|COLUMN|-m is a `space' and with the -J option a `tab'. */ static char *col_sep_string = ""; static int col_sep_length = 0; +static int col_sep_width = 0; static char *column_separator = " "; static char *line_separator = "\t"; @@ -840,6 +901,13 @@ separator_string (const char *optarg_S) col_sep_length = (int) strlen (optarg_S); col_sep_string = xmalloc (col_sep_length + 1); strcpy (col_sep_string, optarg_S); + +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + col_sep_width = mbswidth (col_sep_string, 0); + else +#endif + col_sep_width = col_sep_length; } int @@ -864,6 +932,21 @@ main (int argc, char **argv) atexit (close_stdout); +/* Define which functions are used, the ones for single byte locale or the ones + for multibyte locale. */ +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + print_char = print_char_multi; + char_to_clump = char_to_clump_multi; + } + else +#endif + { + print_char = print_char_single; + char_to_clump = char_to_clump_single; + } + n_files = 0; file_names = (argc > 1 ? 
xmalloc ((argc - 1) * sizeof (char *)) @@ -938,8 +1021,12 @@ main (int argc, char **argv) break; case 'e': if (optarg) - getoptarg (optarg, 'e', &input_tab_char, - &chars_per_input_tab); + { + int dummy_length, dummy_width; + + getoptarg (optarg, 'e', input_tab_char, &dummy_length, + &dummy_width, &chars_per_input_tab); + } /* Could check tab width > 0. */ untabify_input = true; break; @@ -952,8 +1039,12 @@ main (int argc, char **argv) break; case 'i': if (optarg) - getoptarg (optarg, 'i', &output_tab_char, - &chars_per_output_tab); + { + int dummy_width; + + getoptarg (optarg, 'i', output_tab_char, &output_tab_char_length, + &dummy_width, &chars_per_output_tab); + } /* Could check tab width > 0. */ tabify_output = true; break; @@ -980,8 +1071,8 @@ main (int argc, char **argv) case 'n': numbered_lines = true; if (optarg) - getoptarg (optarg, 'n', &number_separator, - &chars_per_number); + getoptarg (optarg, 'n', number_separator, &number_separator_length, + &number_separator_width, &chars_per_number); break; case 'N': skip_count = false; @@ -1020,7 +1111,7 @@ main (int argc, char **argv) old_s = false; /* Reset an additional input of -s, -S dominates -s */ col_sep_string = ""; - col_sep_length = 0; + col_sep_length = col_sep_width = 0; use_col_separator = true; if (optarg) separator_string (optarg); @@ -1169,10 +1260,45 @@ main (int argc, char **argv) a number. */ static void -getoptarg (char *arg, char switch_char, char *character, int *number) +getoptarg (char *arg, char switch_char, char *character, int *character_length, + int *character_width, int *number) { if (!ISDIGIT (*arg)) - *character = *arg++; + { +#ifdef HAVE_MBRTOWC + if (MB_CUR_MAX > 1) /* for multibyte locale. 
*/ + { + wchar_t wc; + size_t mblength; + int width; + mbstate_t state = {'\0'}; + + mblength = mbrtowc (&wc, arg, strlen (arg), &state); + + if (mblength == (size_t) -1 || mblength == (size_t) -2) + { + *character_length = 1; + *character_width = 1; + } + else + { + *character_length = (mblength < 1) ? 1 : mblength; + width = wcwidth (wc); + *character_width = (width < 0) ? 0 : width; + } + + strncpy (character, arg, *character_length); + arg += *character_length; + } + else /* for single byte locale. */ +#endif + { + *character = *arg++; + *character_length = 1; + *character_width = 1; + } + } + if (*arg) { long int tmp_long; @@ -1237,7 +1363,7 @@ init_parameters (int number_of_files) else col_sep_string = column_separator; - col_sep_length = 1; + col_sep_length = col_sep_width = 1; use_col_separator = true; } /* It's rather pointless to define a TAB separator with column @@ -1269,11 +1395,11 @@ init_parameters (int number_of_files) TAB_WIDTH (chars_per_input_tab, chars_per_number); */ /* Estimate chars_per_text without any margin and keep it constant. */ - if (number_separator == '\t') + if (number_separator[0] == '\t') number_width = chars_per_number + TAB_WIDTH (chars_per_default_tab, chars_per_number); else - number_width = chars_per_number + 1; + number_width = chars_per_number + number_separator_width; /* The number is part of the column width unless we are printing files in parallel. */ @@ -1288,7 +1414,7 @@ init_parameters (int number_of_files) } chars_per_column = (chars_per_line - chars_used_by_number - - (columns - 1) * col_sep_length) / columns; + (columns - 1) * col_sep_width) / columns; if (chars_per_column < 1) error (EXIT_FAILURE, 0, _("page width too narrow")); @@ -1416,7 +1542,7 @@ init_funcs (void) /* Enlarge p->start_position of first column to use the same form of padding_not_printed with all columns. */ - h = h + col_sep_length; + h = h + col_sep_width; /* This loop takes care of all but the rightmost column. 
*/ @@ -1450,7 +1576,7 @@ init_funcs (void) } else { - h = h_next + col_sep_length; + h = h_next + col_sep_width; h_next = h + chars_per_column; } } @@ -1734,9 +1860,9 @@ static void align_column (COLUMN *p) { padding_not_printed = p->start_position; - if (padding_not_printed - col_sep_length > 0) + if (padding_not_printed - col_sep_width > 0) { - pad_across_to (padding_not_printed - col_sep_length); + pad_across_to (padding_not_printed - col_sep_width); padding_not_printed = ANYWHERE; } @@ -2010,13 +2136,13 @@ store_char (char c) /* May be too generous. */ buff = x2nrealloc (buff, &buff_allocated, sizeof *buff); } - buff[buff_current++] = c; + buff[buff_current++] = (unsigned char) c; } static void add_line_number (COLUMN *p) { - int i; + int i, j; char *s; int left_cut; @@ -2039,22 +2165,24 @@ add_line_number (COLUMN *p) /* Tabification is assumed for multiple columns, also for n-separators, but `default n-separator = TAB' hasn't been given priority over equal column_width also specified by POSIX. */ - if (number_separator == '\t') + if (number_separator[0] == '\t') { i = number_width - chars_per_number; while (i-- > 0) (p->char_func) (' '); } else - (p->char_func) (number_separator); + for (j = 0; j < number_separator_length; j++) + (p->char_func) (number_separator[j]); } else /* To comply with POSIX, we avoid any expansion of default TAB separator with a single column output. No column_width requirement has to be considered. 
*/ { - (p->char_func) (number_separator); - if (number_separator == '\t') + for (j = 0; j < number_separator_length; j++) + (p->char_func) (number_separator[j]); + if (number_separator[0] == '\t') output_position = POS_AFTER_TAB (chars_per_output_tab, output_position); } @@ -2215,7 +2343,7 @@ print_white_space (void) while (goal - h_old > 1 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal) { - putchar (output_tab_char); + fwrite (output_tab_char, 1, output_tab_char_length, stdout); h_old = h_new; } while (++h_old <= goal) @@ -2235,6 +2363,7 @@ print_sep_string () { char *s; int l = col_sep_length; + int not_space_flag; s = col_sep_string; @@ -2248,6 +2377,7 @@ print_sep_string () { for (; separators_not_printed > 0; --separators_not_printed) { + not_space_flag = 0; while (l-- > 0) { /* 3 types of sep_strings: spaces only, spaces and chars, @@ -2261,12 +2391,15 @@ print_sep_string () } else { + not_space_flag = 1; if (spaces_not_printed > 0) print_white_space (); putchar (*s++); - ++output_position; } } + if (not_space_flag) + output_position += col_sep_width; + /* sep_string ends with some spaces */ if (spaces_not_printed > 0) print_white_space (); @@ -2293,8 +2426,9 @@ print_clump (COLUMN *p, int n, char *clu a nonspace is encountered, call print_white_space() to print the required number of tabs and spaces. 
*/ + static void -print_char (char c) +print_char_single (char c) { if (tabify_output) { @@ -2318,6 +2452,75 @@ print_char (char c) putchar (c); } +#ifdef HAVE_MBRTOWC +static void +print_char_multi (char c) +{ + static size_t mbc_pos = 0; + static unsigned char mbc[MB_LEN_MAX] = {'\0'}; + static mbstate_t state = {'\0'}; + mbstate_t state_bak; + wchar_t wc; + unsigned char uc = (unsigned char) c; + size_t mblength; + int width; + + if (tabify_output) + { + state_bak = state; + mbc[mbc_pos++] = uc; + mblength = mbrtowc (&wc, mbc, mbc_pos, &state); + + while (mbc_pos > 0) + { + switch (mblength) + { + case (size_t) -2: + state = state_bak; + return; + + case (size_t) -1: + state = state_bak; + ++output_position; + putchar (mbc[0]); + memmove (mbc, mbc + 1, MB_CUR_MAX - 1); + --mbc_pos; + break; + + case 0: + mblength = 1; + + default: + if (wc == L' ') + { + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); + --mbc_pos; + ++spaces_not_printed; + return; + } + else if (spaces_not_printed > 0) + print_white_space (); + + /* Nonprintables are assumed to have width 0, except L'\b'. */ + if ((width = wcwidth (wc)) < 1) + { + if (wc == L'\b') + --output_position; + } + else + output_position += width; + + fwrite (mbc, 1, mblength, stdout); + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); + mbc_pos -= mblength; + } + } + return; + } + putchar (uc); +} +#endif + /* Skip to page PAGE before printing. PAGE may be larger than total number of pages. 
*/ @@ -2498,9 +2701,9 @@ read_line (COLUMN *p) align_empty_cols = false; } - if (padding_not_printed - col_sep_length > 0) + if (padding_not_printed - col_sep_width > 0) { - pad_across_to (padding_not_printed - col_sep_length); + pad_across_to (padding_not_printed - col_sep_width); padding_not_printed = ANYWHERE; } @@ -2601,9 +2804,9 @@ print_stored (COLUMN *p) } } - if (padding_not_printed - col_sep_length > 0) + if (padding_not_printed - col_sep_width > 0) { - pad_across_to (padding_not_printed - col_sep_length); + pad_across_to (padding_not_printed - col_sep_width); padding_not_printed = ANYWHERE; } @@ -2616,8 +2819,8 @@ print_stored (COLUMN *p) if (spaces_not_printed == 0) { output_position = p->start_position + end_vector[line]; - if (p->start_position - col_sep_length == chars_per_margin) - output_position -= col_sep_length; + if (p->start_position - col_sep_width == chars_per_margin) + output_position -= col_sep_width; } return true; @@ -2635,8 +2838,9 @@ print_stored (COLUMN *p) characters in clump_buff. (e.g, the width of '\b' is -1, while the number of characters is 1.) 
*/ + static int -char_to_clump (char c) +char_to_clump_single (char c) { unsigned char uc = c; register char *s = clump_buff; @@ -2646,10 +2850,10 @@ char_to_clump (char c) int chars; int chars_per_c = 8; - if (c == input_tab_char) + if (c == input_tab_char[0]) chars_per_c = chars_per_input_tab; - if (c == input_tab_char || c == '\t') + if (c == input_tab_char[0] || c == '\t') { width = TAB_WIDTH (chars_per_c, input_position); @@ -2720,6 +2924,155 @@ char_to_clump (char c) return chars; } +#ifdef HAVE_MBRTOWC +static int +char_to_clump_multi (char c) +{ + static size_t mbc_pos = 0; + static unsigned char mbc[MB_LEN_MAX] = {'\0'}; + static mbstate_t state = {'\0'}; + mbstate_t state_bak; + wchar_t wc; + unsigned char uc = (unsigned char) c; + size_t mblength; + int wc_width; + register char *s = clump_buff; + register int i, j; + char esc_buff[4]; + int width; + int chars; + int chars_per_c = 8; + + state_bak = state; + mbc[mbc_pos++] = uc; + mblength = mbrtowc (&wc, mbc, mbc_pos, &state); + + width = 0; + chars = 0; + while (mbc_pos > 0) + { + switch (mblength) + { + case (size_t) -2: + state = state_bak; + return 0; + + case (size_t) -1: + state = state_bak; + mblength = 1; + + if (use_esc_sequence || use_cntrl_prefix) + { + width = +4; + chars = +4; + *s++ = '\\'; + sprintf (esc_buff, "%03o", mbc[0]); + for (i = 0; i <= 2; ++i) + *s++ = (int) esc_buff[i]; + } + else + { + width += 1; + chars += 1; + *s++ = mbc[0]; + } + break; + + case 0: + mblength = 1; + /* Fall through */ + + default: + if (memcmp (mbc, input_tab_char, mblength) == 0) + chars_per_c = chars_per_input_tab; + + if (memcmp (mbc, input_tab_char, mblength) == 0 || c == '\t') + { + int width_inc; + + width_inc = TAB_WIDTH (chars_per_c, input_position); + width += width_inc; + + if (untabify_input) + { + for (i = width_inc; i; --i) + *s++ = ' '; + chars += width_inc; + } + else + { + for (i = 0; i < mblength; i++) + *s++ = mbc[i]; + chars += mblength; + } + } + else if ((wc_width = wcwidth (wc)) < 1) 
+ { + if (use_esc_sequence) + { + for (i = 0; i < mblength; i++) + { + width += 4; + chars += 4; + *s++ = '\\'; + sprintf (esc_buff, "%03o", mbc[i]); + for (j = 0; j <= 2; ++j) + *s++ = (int) esc_buff[j]; + } + } + else if (use_cntrl_prefix) + { + if (wc < 0200) + { + width += 2; + chars += 2; + *s++ = '^'; + *s++ = wc ^ 0100; + } + else + { + for (i = 0; i < mblength; i++) + { + width += 4; + chars += 4; + *s++ = '\\'; + sprintf (esc_buff, "%03o", mbc[i]); + for (j = 0; j <= 2; ++j) + *s++ = (int) esc_buff[j]; + } + } + } + else if (wc == L'\b') + { + width += -1; + chars += 1; + *s++ = c; + } + else + { + width += 0; + chars += mblength; + for (i = 0; i < mblength; i++) + *s++ = mbc[i]; + } + } + else + { + width += wc_width; + chars += mblength; + for (i = 0; i < mblength; i++) + *s++ = mbc[i]; + } + } + memmove (mbc, mbc + mblength, MB_CUR_MAX - mblength); + mbc_pos -= mblength; + } + + input_position += width; + return chars; +} +#endif + /* We've just printed some files and need to clean up things before looking for more options and printing the next batch of files. --- coreutils/src/sort.c +++ coreutils/src/sort.c @@ -27,6 +27,19 @@ #include #include #include +#include + +/* Get mbstate_t, mbrtowc(), wcrtomb(). */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + +/* Get iswprint(), iswctype() towupper(). */ +#if HAVE_WCTYPE_H +# include <wctype.h> +wctype_t blank_type; /* = wctype ("blank"); */ +#endif + #include "system.h" #include "error.h" #include "hard-locale.h" @@ -46,6 +59,17 @@ struct rlimit { size_t rlim_cur; }; # define getrlimit(Resource, Rlp) (-1) #endif +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1 +# define MB_LEN_MAX 16 +#endif + +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t.
*/ +#if HAVE_MBRTOWC && defined mbstate_t +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "sort" @@ -91,14 +115,38 @@ static char decimal_point; /* Thousands separator; if CHAR_MAX + 1, then there isn't one. */ static int thousands_sep; +static int force_general_numcompare = 0; + /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; -#if HAVE_NL_LANGINFO +#if HAVE_LANGINFO_CODESET static bool hard_LC_TIME; #endif #define NONZERO(x) ((x) != 0) +/* Set MBLENGTH to the byte length of the multibyte character at PTR, reading no further than LIM; an invalid or incomplete sequence counts as one byte and leaves STATE unchanged, and a NUL character counts as one byte. */ +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \ + do \ + { \ + wchar_t wc; \ + mbstate_t state_bak; \ + \ + state_bak = STATE; \ + MBLENGTH = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-1: \ + case (size_t)-2: \ + STATE = state_bak; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ + } \ + while (0) + /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; @@ -235,13 +283,22 @@ static bool reverse; they were read if all keys compare equal. */ static bool stable; -/* If TAB has this value, blanks separate fields. */ +/* If TAB has this value, blanks separate fields. enum { TAB_DEFAULT = CHAR_MAX + 1 }; +*/ /* Tab character separating fields. If TAB_DEFAULT, then fields are separated by the empty string between a non-blank character and a blank - character. */ + character. static int tab = TAB_DEFAULT; +*/ + +/* Tab character separating fields. If TAB_DEFAULT is true, then fields are separated + by the empty string between a non-whitespace character and a whitespace + character. */ +static bool tab_default = true; +static unsigned char tab[MB_LEN_MAX + 1]; +static size_t tab_length = 1; /* Flag to remove consecutive duplicate lines from the output. Only the last of a sequence of equal lines will be output.
*/ @@ -386,6 +443,43 @@ struct tempnode static struct tempnode *volatile temphead; static struct tempnode *volatile *temptail = &temphead; +/* Function pointers. */ +static char * +(* begfield) (const struct line *line, const struct keyfield *key); + +static char * +(* limfield) (const struct line *line, const struct keyfield *key); + +static int +(*getmonth) (const char *s, size_t len); + +static int +(* keycompare) (const struct line *a, const struct line *b); + +/* Test for white space multibyte character. + Set LENGTH to the byte length of the examined multibyte character. */ +#if HAVE_MBRTOWC +static int +ismbblank (const char *str, size_t *length) +{ + size_t mblength; + wchar_t wc; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + mblength = mbrtowc (&wc, str, MB_LEN_MAX, &state); + + if (mblength == (size_t)-1 || mblength == (size_t)-2) + { + *length = 1; + return 0; + } + + *length = (mblength < 1) ? 1 : mblength; + return (iswctype (wc, blank_type)); +} +#endif + /* Clean up any remaining temporary files. */ static void @@ -535,7 +629,7 @@ zaptemp (const char *name) free (node); } -#if HAVE_NL_LANGINFO +#if HAVE_LANGINFO_CODESET static int struct_month_cmp (const void *m1, const void *m2) @@ -562,7 +656,7 @@ inittables (void) fold_toupper[i] = (ISLOWER (i) ? toupper (i) : i); } -#if HAVE_NL_LANGINFO +#if HAVE_LANGINFO_CODESET /* If we're not in the "C" locale, read different names for months.
*/ if (hard_LC_TIME) { @@ -588,6 +682,71 @@ inittables (void) #endif } +#if HAVE_MBRTOWC +static void +inittables_mb (void) +{ + int i, j, k, l; + char *name, *s; + size_t s_len, mblength; + char mbc[MB_LEN_MAX]; + wchar_t wc, pwc; + mbstate_t state_mb, state_wc; + + for (i = 0; i < MONTHS_PER_YEAR; i++) + { + s = (char *) nl_langinfo (ABMON_1 + i); + s_len = strlen (s); + monthtab[i].name = name = (char *) xmalloc (s_len * MB_LEN_MAX + 1); + monthtab[i].val = i + 1; + + memset (&state_mb, '\0', sizeof (mbstate_t)); + memset (&state_wc, '\0', sizeof (mbstate_t)); + + for (j = 0; j < s_len;) + { + if (!ismbblank (s + j, &mblength)) + break; + j += mblength; + } + + for (k = 0; j < s_len;) + { + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb); + /* If conversion fails, fall back to the single byte tables. */ + if (mblength == (size_t)-1 || mblength == (size_t)-2) + { + for (l = 0; l <= i; l++) + free ((void *)monthtab[l].name); + inittables(); + return; + } + else if (mblength == 0) + break; + + pwc = towupper (wc); + if (pwc == wc) + { + memcpy (mbc, s + j, mblength); + j += mblength; + } + else + { + j += mblength; + mblength = wcrtomb (mbc, pwc, &state_wc); + assert (mblength != (size_t)0 && mblength != (size_t)-1); + } + + for (l = 0; l < mblength; l++) + name[k++] = mbc[l]; + } + name[k] = '\0'; + } + qsort ((void *) monthtab, MONTHS_PER_YEAR, + sizeof *monthtab, struct_month_cmp); +} +#endif + /* Specify the amount of main memory to use when sorting. */ static void specify_sort_size (char const *s) @@ -798,7 +957,7 @@ buffer_linelim (struct buffer const *buf by KEY in LINE. */ static char * -begfield (const struct line *line, const struct keyfield *key) +begfield_uni (const struct line *line, const struct keyfield *key) { register char *ptr = line->text, *lim = ptr + line->length - 1; register size_t sword = key->sword; @@ -808,10 +967,10 @@ begfield (const struct line *line, const /* The leading field separator itself is included in a field when -t is absent.
*/ - if (tab != TAB_DEFAULT) + if (!tab_default) while (ptr < lim && sword--) { - while (ptr < lim && *ptr != tab) + while (ptr < lim && *ptr != tab[0]) ++ptr; if (ptr < lim) ++ptr; @@ -839,11 +998,70 @@ begfield (const struct line *line, const return ptr; } +#if HAVE_MBRTOWC +static char * +begfield_mb (const struct line *line, const struct keyfield *key) +{ + int i; + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t sword = key->sword; + size_t schar = key->schar; + size_t mblength; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + + if (!tab_default) + while (ptr < lim && sword--) + { + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && ismbblank (ptr, &mblength)) + ptr += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (ptr < lim && !ismbblank (ptr, &mblength)) + ptr += mblength; + } + + if (key->skipsblanks) + while (ptr < lim && ismbblank (ptr, &mblength)) + ptr += mblength; + + for (i = 0; i < schar; i++) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + + if (ptr + mblength > lim) + break; + else + ptr += mblength; + } + + return ptr; +} +#endif + /* Return the limit of (a pointer to the first character after) the field in LINE specified by KEY. */ static char * -limfield (const struct line *line, const struct keyfield *key) +limfield_uni (const struct line *line, const struct keyfield *key) { register char *ptr = line->text, *lim = ptr + line->length - 1; register size_t eword = key->eword, echar = key->echar; @@ -856,10 +1074,10 @@ limfield (const struct line *line, const `beginning' is the first character following the delimiting TAB. 
Otherwise, leave PTR pointing at the first `blank' character after the preceding field. */ - if (tab != TAB_DEFAULT) + if (!tab_default) while (ptr < lim && eword--) { - while (ptr < lim && *ptr != tab) + while (ptr < lim && *ptr != tab[0]) ++ptr; if (ptr < lim && (eword | echar)) ++ptr; @@ -905,7 +1123,7 @@ limfield (const struct line *line, const */ /* Make LIM point to the end of (one byte past) the current field. */ - if (tab != TAB_DEFAULT) + if (!tab_default) { char *newlim; newlim = memchr (ptr, tab, lim - ptr); @@ -941,6 +1159,107 @@ limfield (const struct line *line, const return ptr; } +#if HAVE_MBRTOWC +static char * +limfield_mb (const struct line *line, const struct keyfield *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t eword = key->eword, echar = key->echar; + int i; + size_t mblength; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + + if (!tab_default) + while (ptr < lim && eword--) + { + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + if (ptr < lim && (eword | echar)) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && ismbblank (ptr, &mblength)) + ptr += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (ptr < lim && !ismbblank (ptr, &mblength)) + ptr += mblength; + } + +# ifdef POSIX_UNSPECIFIED + + /* Make LIM point to the end of (one byte past) the current field.
*/ + if (!tab_default) + { + char *newlim, *p; + + newlim = NULL; + for (p = ptr; p < lim;) + { + if (memcmp (p, tab, tab_length) == 0) + { + newlim = p; + break; + } + + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + p += mblength; + } + } + else + { + char *newlim; + newlim = ptr; + + while (newlim < lim && ismbblank (newlim, &mblength)) + newlim += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (newlim < lim && !ismbblank (newlim, &mblength)) + newlim += mblength; + lim = newlim; + } +# endif + + /* If we're skipping leading blanks, don't start counting characters + until after skipping past any leading blanks. */ + if (key->skipeblanks) + while (ptr < lim && ismbblank (ptr, &mblength)) + ptr += mblength; + + memset (&state, '\0', sizeof(mbstate_t)); + + /* Advance PTR by ECHAR (if possible), but no further than LIM. */ + for (i = 0; i < echar; i++) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + + if (ptr + mblength > lim) + break; + else + ptr += mblength; + } + + return ptr; +} +#endif + /* Fill BUF reading from FP, moving buf->left bytes from the end of buf->buf to the beginning first. If EOF is reached and the file wasn't terminated by a newline, supply one. 
Set up BUF's line @@ -1023,8 +1342,22 @@ fillbuf (struct buffer *buf, register FI else { if (key->skipsblanks) - while (blanks[to_uchar (*line_start)]) - line_start++; + { +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + size_t mblength; + + while (ismbblank (line_start, &mblength)) + line_start += mblength; + } + else +#endif + { + while (blanks[to_uchar (*line_start)]) + line_start++; + } + } line->keybeg = line_start; } } @@ -1130,10 +1463,27 @@ numcompare (register const char *a, regi size_t log_a; size_t log_b; - while (blanks[to_uchar (tmpa = *a)]) - a++; - while (blanks[to_uchar (tmpb = *b)]) - b++; +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + size_t mblength; + + while (ismbblank (a, &mblength)) + a += mblength; + while (ismbblank (b, &mblength)) + b += mblength; + + tmpa = *a; + tmpb = *b; + } + else +#endif + { + while (blanks[to_uchar (tmpa = *a)]) + a++; + while (blanks[to_uchar (tmpb = *b)]) + b++; + } if (tmpa == NEGATION_SIGN) { @@ -1263,15 +1613,59 @@ general_numcompare (const char *sa, cons /* FIXME: maybe add option to try expensive FP conversion only if A and B can't be compared more cheaply/accurately. 
*/ - char *ea; - char *eb; - double a = strtod (sa, &ea); - double b = strtod (sb, &eb); + char *bufa, *ea; + char *bufb, *eb; + double a; + double b; + + char *p; + struct lconv *lconvp = localeconv (); + size_t thousands_sep_len = strlen (lconvp->thousands_sep); + + bufa = (char *) xmalloc (strlen (sa) + 1); + bufb = (char *) xmalloc (strlen (sb) + 1); + strcpy (bufa, sa); + strcpy (bufb, sb); + + if (force_general_numcompare) + { + while (1) + { + a = strtod (bufa, &ea); + if (memcmp (ea, lconvp->thousands_sep, thousands_sep_len) == 0) + { + for (p = ea; *(p + thousands_sep_len) != '\0'; p++) + *p = *(p + thousands_sep_len); + *p = '\0'; + continue; + } + break; + } + + while (1) + { + b = strtod (bufb, &eb); + if (memcmp (eb, lconvp->thousands_sep, thousands_sep_len) == 0) + { + for (p = eb; *(p + thousands_sep_len) != '\0'; p++) + *p = *(p + thousands_sep_len); + *p = '\0'; + continue; + } + break; + } + } + else + { + a = strtod (bufa, &ea); + b = strtod (bufb, &eb); + } + /* Put conversion errors at the start of the collating sequence. */ - if (sa == ea) - return sb == eb ? 0 : -1; - if (sb == eb) + if (bufa == ea) + return bufb == eb ? 0 : -1; + if (bufb == eb) return 1; /* Sort numbers in the usual way, where -0 == +0. Put NaNs after @@ -1289,7 +1683,7 @@ general_numcompare (const char *sa, cons Return 0 if the name in S is not recognized. 
#if HAVE_MBRTOWC
/* Multibyte counterpart of getmonth_uni: return the month number
   (monthtab[].val) named by the LEN bytes at S, or 0 if the name is not
   recognized.

   Leading multibyte blanks are skipped, the text is upper-cased one
   wide character at a time and cut at the first blank, then converted
   back to a multibyte string and looked up in MONTHTAB by binary
   search.  Text that cannot be converted in the current locale counts
   as "not recognized" instead of aborting.

   NOTE(review): buffers come from alloca as in the original, so a very
   long key still uses a proportional amount of stack.  */
static int
getmonth_mb (char const *s, size_t len)
{
  int lo = 0, hi = MONTHS_PER_YEAR;
  size_t i, wclength, mblength, month_size;
  char *tmp, *month;
  char const *src;
  wchar_t *month_wcs;
  wchar_t const *wsrc;
  mbstate_t state;

  while (len > 0 && ismbblank (s, &mblength))
    {
      /* A blank that extends past the key would make LEN wrap.  */
      if (len < mblength)
        return 0;
      s += mblength;
      len -= mblength;
    }

  if (len == 0)
    return 0;

  /* mbsrtowcs needs a NUL-terminated source, so work on a copy.  */
  tmp = (char *) alloca (len + 1);
  memcpy (tmp, s, len);
  tmp[len] = '\0';
  src = tmp;

  month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
  memset (&state, '\0', sizeof (mbstate_t));

  wclength = mbsrtowcs (month_wcs, &src, len + 1, &state);
  if (wclength == (size_t) -1 || src != NULL)
    return 0;

  for (i = 0; i < wclength; i++)
    {
      month_wcs[i] = towupper (month_wcs[i]);
      if (iswctype (month_wcs[i], blank_type))
        {
          month_wcs[i] = L'\0';
          break;
        }
    }

  /* An upper-cased character may need more bytes than the original, so
     size the result for the worst case rather than for LEN.  */
  month_size = wclength * MB_CUR_MAX + 1;
  month = (char *) alloca (month_size);

  wsrc = month_wcs;
  memset (&state, '\0', sizeof (mbstate_t));
  mblength = wcsrtombs (month, &wsrc, month_size, &state);
  if (mblength == (size_t) -1 || wsrc != NULL)
    return 0;

  do
    {
      int ix = (lo + hi) / 2;

      if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
        hi = ix;
      else
        lo = ix;
    }
  while (hi - lo > 1);

  return (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
          ? monthtab[lo].val : 0);
}
#endif
#if HAVE_MBRTOWC
/* Copy the LEN bytes at TEXT into COPY one multibyte character at a
   time and NUL-terminate the result.  Characters selected by IGNORE
   (the NONPRINTING or NONDICTIONARY table, or NULL for none) are
   dropped; when TRANSLATE is non-null the remaining characters are
   upper-cased.  *STATE is the conversion state for TEXT and is updated.
   Return the number of bytes stored, not counting the terminator.

   COPY must have room for LEN * MB_CUR_MAX + 1 bytes when TRANSLATE is
   non-null (case folding can lengthen a character) and LEN + 1 bytes
   otherwise.  */
static size_t
keycompare_mb_filter (char *copy, char const *text, size_t len,
                      bool const *ignore, char const *translate,
                      mbstate_t *state)
{
  size_t new_len = 0;
  size_t i = 0;

  while (i < len)
    {
      wchar_t wc;
      mbstate_t state_bak = *state;
      size_t mblength = mbrtowc (&wc, text + i, len - i, state);

      if (mblength == (size_t) -2 || mblength == (size_t) -1
          || mblength == 0)
        {
          /* Invalid or incomplete sequence, or a NUL: treat it as one
             byte.  Always advance, or the loop would never end when
             IGNORE is set.  */
          if (mblength != 0)
            *state = state_bak;
          if (!ignore)
            copy[new_len++] = text[i];
          i++;
          continue;
        }

      if (ignore
          && ((ignore == nonprinting && !iswprint (wc))
              || (ignore == nondictionary
                  && !iswalnum (wc) && !iswctype (wc, blank_type))))
        {
          i += mblength;
          continue;
        }

      if (translate)
        {
          wchar_t uwc = towupper (wc);

          if (uwc != wc)
            {
              char mbc[MB_LEN_MAX];
              mbstate_t state_wc;
              size_t n;

              memset (&state_wc, '\0', sizeof (mbstate_t));
              n = wcrtomb (mbc, uwc, &state_wc);
              if (n != (size_t) -1 && n != 0)
                {
                  memcpy (copy + new_len, mbc, n);
                  new_len += n;
                  i += mblength;
                  continue;
                }
              /* The upper-case form is not representable; fall through
                 and keep the original bytes.  */
            }
        }

      memcpy (copy + new_len, text + i, mblength);
      new_len += mblength;
      i += mblength;
    }

  copy[new_len] = '\0';
  return new_len;
}

/* Multibyte counterpart of keycompare_uni: compare lines A and B on
   every key in KEYLIST in turn until a key differs or the keys run out.
   Return a negative, zero or positive value; the sign is inverted for a
   key whose `reverse' flag is set.  */
static int
keycompare_mb (const struct line *a, const struct line *b)
{
  struct keyfield *key = keylist;

  /* For the first iteration only, the key positions have been
     precomputed for us.  */
  char *texta = a->keybeg;
  char *textb = b->keybeg;
  char *lima = a->keylim;
  char *limb = b->keylim;

  size_t mblength_a, mblength_b;
  mbstate_t state_a, state_b;

  int diff;

  memset (&state_a, '\0', sizeof (mbstate_t));
  memset (&state_b, '\0', sizeof (mbstate_t));

  for (;;)
    {
      char const *translate = key->translate;
      bool const *ignore = key->ignore;

      /* Find the lengths.  */
      size_t lena = lima <= texta ? 0 : lima - texta;
      size_t lenb = limb <= textb ? 0 : limb - textb;

      /* Actually compare the fields.  */
      if (key->numeric | key->general_numeric)
        {
          /* The numeric comparers want NUL-terminated text; terminate
             both keys temporarily and restore the bytes afterwards.  */
          char savea = *lima, saveb = *limb;

          *lima = *limb = '\0';
          if (force_general_numcompare)
            diff = general_numcompare (texta, textb);
          else
            diff = ((key->numeric ? numcompare : general_numcompare)
                    (texta, textb));
          *lima = savea, *limb = saveb;
        }
      else if (key->month)
        diff = getmonth (texta, lena) - getmonth (textb, lenb);
      else
        {
          if (ignore || translate)
            {
              char buf[4000];
              /* Case folding may lengthen a character, so leave room
                 for MB_CUR_MAX output bytes per input byte then.  */
              size_t grow = translate ? MB_CUR_MAX : 1;
              size_t cap_a = lena * grow + 1;
              size_t cap_b = lenb * grow + 1;
              size_t size = cap_a + cap_b;
              char *copy_a = (size <= sizeof buf ? buf : xmalloc (size));
              char *copy_b = copy_a + cap_a;
              size_t new_len_a, new_len_b;

              /* Ignore and/or translate chars before comparing.  */
              new_len_a = keycompare_mb_filter (copy_a, texta, lena,
                                                ignore, translate, &state_a);
              new_len_b = keycompare_mb_filter (copy_b, textb, lenb,
                                                ignore, translate, &state_b);
              diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);

              if (sizeof buf < size)
                free (copy_a);
            }
          else if (lena == 0)
            diff = - NONZERO (lenb);
          else if (lenb == 0)
            goto greater;
          else
            diff = xmemcoll (texta, lena, textb, lenb);
        }

      if (diff)
        goto not_equal;

      key = key->next;
      if (! key)
        break;

      /* Find the beginning and limit of the next field.  */
      if (key->eword != SIZE_MAX)
        lima = limfield (a, key), limb = limfield (b, key);
      else
        lima = a->text + a->length - 1, limb = b->text + b->length - 1;

      if (key->sword != SIZE_MAX)
        texta = begfield (a, key), textb = begfield (b, key);
      else
        {
          texta = a->text, textb = b->text;
          if (key->skipsblanks)
            {
              while (texta < lima && ismbblank (texta, &mblength_a))
                texta += mblength_a;
              while (textb < limb && ismbblank (textb, &mblength_b))
                textb += mblength_b;
            }
        }
    }

  return 0;

greater:
  diff = 1;
not_equal:
  return key->reverse ? -diff : diff;
}
#endif
thousands_sep || locale->thousands_sep[1]) - thousands_sep = CHAR_MAX + 1; + { + thousands_sep = CHAR_MAX + 1; + if (locale->thousands_sep[0] && locale->thousands_sep[1]) + force_general_numcompare = 1; + } } +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + blank_type = wctype ("blank"); + begfield = begfield_mb; + limfield = limfield_mb; + getmonth = getmonth_mb; + keycompare = keycompare_mb; + } + else +#endif + { + begfield = begfield_uni; + limfield = limfield_uni; + keycompare = keycompare_uni; + getmonth = getmonth_uni; + } + have_read_stdin = false; inittables (); @@ -2514,28 +3184,48 @@ main (int argc, char **argv) break; case 't': - { - char newtab = optarg[0]; - if (! newtab) + { + if (! optarg[0]) error (SORT_FAILURE, 0, _("empty tab")); + + strncpy (tab, optarg, MB_LEN_MAX); + if (optarg[1]) { - if (STREQ (optarg, "\\0")) - newtab = '\0'; + if (strcmp (optarg, "\\0") == 0) + tab[0] = '\0'; else { - /* Provoke with `sort -txx'. Complain about - "multi-character tab" instead of "multibyte tab", so - that the diagnostic's wording does not need to be - changed once multibyte characters are supported. */ - error (SORT_FAILURE, 0, _("multi-character tab `%s'"), - optarg); +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + wchar_t wc; + mbstate_t state; + + memset (&state, '\0', sizeof (mbstate_t)); + tab_length = mbrtowc (&wc, tab, MB_LEN_MAX, &state); + tab_length = (tab_length == (size_t)-1 + || tab_length == (size_t)-2 + || tab_length == 0) ? 
#if HAVE_MBRTOWC && HAVE_WCTYPE_H
/* Multibyte counterpart of unexpand: copy every input file (obtained
   through next_file) to standard output, replacing runs of blanks by
   tabs according to TAB_SIZE / TAB_LIST.  Input is decoded one
   multibyte character at a time; a byte sequence that cannot be decoded
   is copied through unchanged and counted as one column.

   NOTE(review): as in the original, each pending blank is stored as a
   single ASCII space and counted as one column, whatever the actual
   blank character or its display width -- confirm this is intended.  */
static void
unexpand_multibyte (void)
{
  /* Input stream.  */
  FILE *fp = next_file (NULL);

  mbstate_t i_state;            /* Current shift state of the input stream.  */
  char buf[MB_LEN_MAX + BUFSIZ];  /* For spooling a read byte sequence.  */
  char *bufpos = buf;           /* Next read position of BUF.  */
  size_t buflen = 0;            /* The length of the byte sequence in BUF.  */

  /* The array of pending blanks.  */
  char *pending_blank;

  if (!fp)
    return;

  /* Binary I/O will preserve the original EOL style (DOS/Unix) of files.  */
  SET_BINARY2 (fileno (fp), STDOUT_FILENO);

  /* The worst case is a non-blank character, then one blank, then a
     tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
     allocate MAX_COLUMN_WIDTH bytes to store the blanks.  */
  pending_blank = xmalloc (max_column_width);

  memset (&i_state, '\0', sizeof (mbstate_t));

  for (;;)
    {
      /* The most recently decoded wide character.  It starts out as a
         non-newline so that the loop condition below is well defined
         even when an iteration bails out before decoding anything.  */
      wint_t wc = L'\0';

      /* If true, perform translations.  */
      bool convert = true;

      /* The following variables have valid values only when CONVERT
         is true:  */

      /* Column of next input character.  */
      uintmax_t column = 0;

      /* Column the next input tab stop is on.  */
      uintmax_t next_tab_column = 0;

      /* Index in TAB_LIST of next tab stop to examine.  */
      size_t tab_index = 0;

      /* If true, the first pending blank came just before a tab stop.  */
      bool one_blank_before_tab_stop = false;

      /* If true, the previous input character was a blank.  This is
         initially true, since initial strings of blanks are treated
         as if the line was preceded by a blank.  */
      bool prev_blank = true;

      /* Number of pending columns of blanks.  */
      size_t pending = 0;

      /* Convert a line of text.  */
      do
        {
          wchar_t w;
          size_t mblength;      /* Bytes consumed by the current character.  */
          mbstate_t i_state_bak;  /* Back up the I_STATE.  */

          /* If true, output a single tab instead of the input bytes.  */
          bool emit_tab = false;

          /* Refill the buffer so that a whole character is available.  */
          if (buflen < MB_LEN_MAX && !feof (fp) && !ferror (fp))
            {
              if (buflen > 0)
                memmove (buf, bufpos, buflen);
              buflen += fread (buf + buflen, sizeof (char), BUFSIZ, fp);
              bufpos = buf;
            }

          if (buflen < 1)
            {
              /* Move to the next file.  */
              if (feof (fp) || ferror (fp))
                fp = next_file (fp);
              if (!fp)
                {
                  if (pending
                      && fwrite (pending_blank, 1, pending, stdout) != pending)
                    error (EXIT_FAILURE, errno, _("write error"));
                  free (pending_blank);
                  return;
                }
              SET_BINARY2 (fileno (fp), STDOUT_FILENO);
              continue;
            }

          i_state_bak = i_state;
          mblength = mbrtowc (&w, bufpos, buflen, &i_state);

          if (mblength == (size_t) -1 || mblength == (size_t) -2)
            {
              /* Undecodable input: pass one byte through unchanged.  */
              i_state = i_state_bak;
              wc = L'\0';
              column += convert;
              mblength = 1;
            }
          else
            wc = w;

          if (convert)
            {
              bool blank = iswblank (wc) != 0;

              if (blank)
                {
                  if (next_tab_column <= column)
                    {
                      if (tab_size)
                        next_tab_column =
                          column + (tab_size - column % tab_size);
                      else
                        for (;;)
                          if (tab_index == first_free_tab)
                            {
                              convert = false;
                              break;
                            }
                          else
                            {
                              uintmax_t tab = tab_list[tab_index++];
                              if (column < tab)
                                {
                                  next_tab_column = tab;
                                  break;
                                }
                            }
                    }

                  if (convert)
                    {
                      if (next_tab_column < column)
                        error (EXIT_FAILURE, 0, _("input line is too long"));

                      if (wc == L'\t')
                        {
                          column = next_tab_column;

                          /* Discard pending blanks, unless it was a single
                             blank just before the previous tab stop.  */
                          if (! (pending == 1 && one_blank_before_tab_stop))
                            {
                              pending = 0;
                              one_blank_before_tab_stop = false;
                            }
                        }
                      else
                        {
                          column++;

                          if (! (prev_blank && column == next_tab_column))
                            {
                              /* It is not yet known whether the pending
                                 blanks will be replaced by tabs.  */
                              if (column == next_tab_column)
                                one_blank_before_tab_stop = true;
                              pending_blank[pending++] = ' ';
                              prev_blank = true;
                              buflen -= mblength;
                              bufpos += mblength;
                              continue;
                            }

                          /* Replace the pending blanks by a tab or two.
                             The current blank becomes a tab as well; it
                             is written as one byte below, so no trail
                             bytes of a multibyte blank leak out.  */
                          pending_blank[0] = '\t';
                          pending = one_blank_before_tab_stop;
                          emit_tab = true;
                        }
                    }
                }
              else if (wc == L'\b')
                {
                  /* Go back one column, and force recalculation of the
                     next tab stop.  */
                  column -= !!column;
                  next_tab_column = column;
                  tab_index -= !!tab_index;
                }
              else if (!iswcntrl (wc))
                {
                  int width = wcwidth (wc);

                  if (width > 0)
                    {
                      if (column > (column + width))
                        error (EXIT_FAILURE, 0, _("input line is too long"));
                      column += width;
                    }
                }

              if (pending)
                {
                  if (fwrite (pending_blank, 1, pending, stdout) != pending)
                    error (EXIT_FAILURE, errno, _("write error"));
                  pending = 0;
                  one_blank_before_tab_stop = false;
                }

              prev_blank = blank;
              convert &= convert_entire_line | blank;
            }

          if (emit_tab)
            {
              if (putchar ('\t') == EOF)
                error (EXIT_FAILURE, errno, _("write error"));
            }
          else if (mblength)
            {
              if (fwrite (bufpos, sizeof (char), mblength, stdout) < mblength)
                error (EXIT_FAILURE, errno, _("write error"));
            }
          else
            {
              /* mbrtowc returns 0 for a NUL, which occupies one byte.  */
              if (putchar ('\0') == EOF)
                error (EXIT_FAILURE, errno, _("write error"));
              mblength = 1;
            }

          buflen -= mblength;
          bufpos += mblength;
        }
      while (wc != L'\n');
    }
}
#endif
#if HAVE_MBRTOWC

/* Decode the multibyte character at LP + POS (at most SIZE - POS bytes)
   into WC, updating *STATEP, and store its byte length in MBLENGTH.
   An invalid or incomplete sequence restores *STATEP, sets CONVFAIL to
   1 and counts as one byte; a NUL also counts as one byte.  */
# define MBCHAR_TO_WCHAR(WC, MBLENGTH, LP, POS, SIZE, STATEP, CONVFAIL)  \
  do                                                                    \
    {                                                                   \
      mbstate_t state_bak;                                              \
                                                                        \
      CONVFAIL = 0;                                                     \
      state_bak = *STATEP;                                              \
                                                                        \
      MBLENGTH = mbrtowc (&WC, LP + POS, SIZE - POS, STATEP);           \
                                                                        \
      switch (MBLENGTH)                                                 \
        {                                                               \
        case (size_t)-2:                                                \
        case (size_t)-1:                                                \
          *STATEP = state_bak;                                          \
          CONVFAIL++;                                                   \
          /* Fall through */                                            \
        case 0:                                                         \
          MBLENGTH = 1;                                                 \
        }                                                               \
    }                                                                   \
  while (0)

/* Multibyte counterpart of find_field_uni: given the linebuffer LINE,
   return a pointer to the beginning of the line's field to be compared,
   i.e. the position after SKIP_FIELDS blank-delimited fields and then
   SKIP_CHARS characters.  LINE->state is advanced along the way, so on
   return it is the conversion state at the returned position.  */
static char *
find_field_multi (struct linebuffer *line)
{
  size_t count;
  char *lp = line->buffer;
  size_t size = line->length - 1;
  size_t pos;
  size_t mblength;
  wchar_t wc;
  mbstate_t *statep;
  int convfail;

  pos = 0;
  statep = &(line->state);

  /* skip fields. */
  for (count = 0; count < skip_fields && pos < size; count++)
    {
      /* Skip the blanks in front of the field, plus the field's first
         character.  */
      while (pos < size)
        {
          MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);

          if (convfail || !iswctype (wc, blank_type))
            {
              pos += mblength;
              break;
            }
          pos += mblength;
        }

      /* Skip the rest of the field, up to the next blank.  */
      while (pos < size)
        {
          MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);

          if (!convfail && iswctype (wc, blank_type))
            break;

          pos += mblength;
        }
    }

  /* skip chars. */
  for (count = 0; count < skip_chars && pos < size; count++)
    {
      MBCHAR_TO_WCHAR (wc, mblength, lp, pos, size, statep, convfail);
      pos += mblength;
    }

  return lp + pos;
}

/* Multibyte counterpart of `different': return zero if the fields OLD
   (OLDLEN bytes, initial conversion state OLDSTATE) and NEW (NEWLEN
   bytes, initial state NEWSTATE) compare equal under xmemcoll, nonzero
   otherwise.  At most CHECK_CHARS characters of each field take part;
   when IGNORE_CASE is set, characters are upper-cased first.

   Each field is first copied into a scratch buffer.  Bytes that cannot
   be decoded, and NULs, are copied verbatim and count as one character.

   NOTE(review): the scratch buffers come from alloca as in the
   original, so very long lines use a proportional amount of stack.  */
static int
different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate)
{
  size_t i, j, chars;
  const char *str[2];
  char *copy[2];
  size_t len[2];
  mbstate_t state[2];
  /* Case folding may lengthen a character, so leave room for
     MB_CUR_MAX output bytes per input byte when it is in effect.  */
  size_t grow = ignore_case ? MB_CUR_MAX : 1;

  str[0] = old;
  str[1] = new;
  len[0] = oldlen;
  len[1] = newlen;
  state[0] = oldstate;
  state[1] = newstate;

  for (i = 0; i < 2; i++)
    {
      size_t out = 0;           /* Bytes stored in COPY[I] so far.  */

      copy[i] = alloca (len[i] * grow + 1);

      for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++)
        {
          wchar_t wc;
          mbstate_t state_bak = state[i];
          size_t mblength = mbrtowc (&wc, str[i] + j, len[i] - j,
                                     &(state[i]));

          if (mblength == (size_t) -1 || mblength == (size_t) -2
              || mblength == 0)
            {
              /* Undecodable byte or NUL: keep it as is, so that the
                 comparison never reads uninitialized scratch bytes.  */
              if (mblength != 0)
                state[i] = state_bak;
              copy[i][out++] = str[i][j];
              j++;
              continue;
            }

          if (ignore_case)
            {
              wchar_t uwc = towupper (wc);

              if (uwc != wc)
                {
                  char mbc[MB_LEN_MAX];
                  mbstate_t state_wc;
                  size_t n;

                  memset (&state_wc, '\0', sizeof (mbstate_t));
                  n = wcrtomb (mbc, uwc, &state_wc);
                  if (n != (size_t) -1)
                    {
                      memcpy (copy[i] + out, mbc, n);
                      out += n;
                      j += mblength;
                      continue;
                    }
                  /* Not representable: keep the original bytes.  */
                }
            }

          memcpy (copy[i] + out, str[i] + j, mblength);
          out += mblength;
          j += mblength;
        }

      copy[i][out] = '\0';
      len[i] = out;
    }

  return xmemcoll (copy[0], len[0], copy[1], len[1]);
}
#endif
find_field (thisline); thislen = thisline->length - 1 - (thisfield - thisline->buffer); +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + thisstate = thisline->state; + match = !different_multi (thisfield, prevfield, + thislen, prevlen, thisstate, prevstate); + } + else +#endif match = !different (thisfield, prevfield, thislen, prevlen); match_count += match; @@ -375,6 +587,9 @@ check_file (const char *infile, const ch SWAP_LINES (prevline, thisline); prevfield = thisfield; prevlen = thislen; +#if HAVE_MBRTOWC + prevstate = thisstate; +#endif if (!match) match_count = 0; } @@ -420,6 +635,18 @@ main (int argc, char **argv) atexit (close_stdout); +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + find_field = find_field_multi; + blank_type = wctype ("blank"); + } + else +#endif + { + find_field = find_field_uni; + } + skip_chars = 0; skip_fields = 0; check_chars = SIZE_MAX;