16 |
* Licensed under GPLv2 or later, see file LICENSE in this tarball for details. |
* Licensed under GPLv2 or later, see file LICENSE in this tarball for details. |
17 |
*/ |
*/ |
18 |
/* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html |
/* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html |
19 |
* TODO: xdigit, graph, print |
* TODO: graph, print |
20 |
*/ |
*/ |
21 |
#include "libbb.h" |
#include "libbb.h" |
22 |
|
|
23 |
#define ASCII 0377 |
enum { |
24 |
|
ASCII = 256, |
25 |
|
/* string buffer needs to be at least as big as the whole "alphabet". |
26 |
|
* BUFSIZ == ASCII is ok, but we will realloc in expand |
27 |
|
* even for smallest patterns, let's avoid that by using *2: |
28 |
|
*/ |
29 |
|
TR_BUFSIZ = (BUFSIZ > ASCII*2) ? BUFSIZ : ASCII*2, |
30 |
|
}; |
31 |
|
|
32 |
static void map(char *pvector, |
static void map(char *pvector, |
33 |
unsigned char *string1, unsigned int string1_len, |
char *string1, unsigned string1_len, |
34 |
unsigned char *string2, unsigned int string2_len) |
char *string2, unsigned string2_len) |
35 |
{ |
{ |
36 |
char last = '0'; |
char last = '0'; |
37 |
unsigned int i, j; |
unsigned i, j; |
38 |
|
|
39 |
for (j = 0, i = 0; i < string1_len; i++) { |
for (j = 0, i = 0; i < string1_len; i++) { |
40 |
if (string2_len <= j) |
if (string2_len <= j) |
41 |
pvector[string1[i]] = last; |
pvector[(unsigned char)(string1[i])] = last; |
42 |
else |
else |
43 |
pvector[string1[i]] = last = string2[j++]; |
pvector[(unsigned char)(string1[i])] = last = string2[j++]; |
44 |
} |
} |
45 |
} |
} |
46 |
|
|
47 |
/* supported constructs: |
/* supported constructs: |
48 |
* Ranges, e.g., 0-9 ==> 0123456789 |
* Ranges, e.g., 0-9 ==> 0123456789 |
|
* Ranges, e.g., [0-9] ==> 0123456789 |
|
49 |
* Escapes, e.g., \a ==> Control-G |
* Escapes, e.g., \a ==> Control-G |
50 |
* Character classes, e.g. [:upper:] ==> A...Z |
* Character classes, e.g. [:upper:] ==> A...Z |
51 |
* Equiv classess, e.g. [=A=] ==> A (hmmmmmmm?) |
* Equiv classess, e.g. [=A=] ==> A (hmmmmmmm?) |
52 |
|
* not supported: |
53 |
|
* \ooo-\ooo - octal ranges |
54 |
|
* [x*N] - repeat char x N times |
55 |
|
* [x*] - repeat char x until it fills STRING2: |
56 |
|
* # echo qwe123 | /usr/bin/tr 123456789 '[d]' |
57 |
|
* qwe[d] |
58 |
|
* # echo qwe123 | /usr/bin/tr 123456789 '[d*]' |
59 |
|
* qweddd |
60 |
*/ |
*/ |
61 |
static unsigned int expand(const char *arg, char *buffer) |
static unsigned expand(const char *arg, char **buffer_p) |
62 |
{ |
{ |
63 |
char *buffer_start = buffer; |
char *buffer = *buffer_p; |
64 |
|
unsigned pos = 0; |
65 |
|
unsigned size = TR_BUFSIZ; |
66 |
unsigned i; /* can't be unsigned char: must be able to hold 256 */ |
unsigned i; /* can't be unsigned char: must be able to hold 256 */ |
67 |
unsigned char ac; |
unsigned char ac; |
68 |
|
|
69 |
while (*arg) { |
while (*arg) { |
70 |
|
if (pos + ASCII > size) { |
71 |
|
size += ASCII; |
72 |
|
*buffer_p = buffer = xrealloc(buffer, size); |
73 |
|
} |
74 |
if (*arg == '\\') { |
if (*arg == '\\') { |
75 |
arg++; |
arg++; |
76 |
*buffer++ = bb_process_escape_sequence(&arg); |
buffer[pos++] = bb_process_escape_sequence(&arg); |
77 |
continue; |
continue; |
78 |
} |
} |
79 |
if (arg[1] == '-') { /* "0-9..." */ |
if (arg[1] == '-') { /* "0-9..." */ |
80 |
ac = arg[2]; |
ac = arg[2]; |
81 |
if (ac == '\0') { /* "0-": copy verbatim */ |
if (ac == '\0') { /* "0-": copy verbatim */ |
82 |
*buffer++ = *arg++; /* copy '0' */ |
buffer[pos++] = *arg++; /* copy '0' */ |
83 |
continue; /* next iter will copy '-' and stop */ |
continue; /* next iter will copy '-' and stop */ |
84 |
} |
} |
85 |
i = *arg; |
i = (unsigned char) *arg; |
86 |
while (i <= ac) /* ok: i is unsigned _int_ */ |
while (i <= ac) /* ok: i is unsigned _int_ */ |
87 |
*buffer++ = i++; |
buffer[pos++] = i++; |
88 |
arg += 3; /* skip 0-9 */ |
arg += 3; /* skip 0-9 */ |
89 |
continue; |
continue; |
90 |
} |
} |
91 |
if (*arg == '[') { /* "[xyz..." */ |
if ((ENABLE_FEATURE_TR_CLASSES || ENABLE_FEATURE_TR_EQUIV) |
92 |
|
&& *arg == '[' |
93 |
|
) { |
94 |
arg++; |
arg++; |
95 |
i = *arg++; |
i = (unsigned char) *arg++; |
96 |
/* "[xyz...", i=x, arg points to y */ |
/* "[xyz...". i=x, arg points to y */ |
97 |
if (ENABLE_FEATURE_TR_CLASSES && i == ':') { |
if (ENABLE_FEATURE_TR_CLASSES && i == ':') { /* [:class:] */ |
98 |
#define CLO ":]\0" |
#define CLO ":]\0" |
99 |
static const char classes[] ALIGN1 = |
static const char classes[] ALIGN1 = |
100 |
"alpha"CLO "alnum"CLO "digit"CLO |
"alpha"CLO "alnum"CLO "digit"CLO |
101 |
"lower"CLO "upper"CLO "space"CLO |
"lower"CLO "upper"CLO "space"CLO |
102 |
"blank"CLO "punct"CLO "cntrl"CLO; |
"blank"CLO "punct"CLO "cntrl"CLO |
103 |
#define CLASS_invalid 0 /* we increment the retval */ |
"xdigit"CLO; |
104 |
#define CLASS_alpha 1 |
enum { |
105 |
#define CLASS_alnum 2 |
CLASS_invalid = 0, /* we increment the retval */ |
106 |
#define CLASS_digit 3 |
CLASS_alpha = 1, |
107 |
#define CLASS_lower 4 |
CLASS_alnum = 2, |
108 |
#define CLASS_upper 5 |
CLASS_digit = 3, |
109 |
#define CLASS_space 6 |
CLASS_lower = 4, |
110 |
#define CLASS_blank 7 |
CLASS_upper = 5, |
111 |
#define CLASS_punct 8 |
CLASS_space = 6, |
112 |
#define CLASS_cntrl 9 |
CLASS_blank = 7, |
113 |
//#define CLASS_xdigit 10 |
CLASS_punct = 8, |
114 |
//#define CLASS_graph 11 |
CLASS_cntrl = 9, |
115 |
//#define CLASS_print 12 |
CLASS_xdigit = 10, |
116 |
|
//CLASS_graph = 11, |
117 |
|
//CLASS_print = 12, |
118 |
|
}; |
119 |
smalluint j; |
smalluint j; |
120 |
{ /* not really pretty.. */ |
char *tmp; |
121 |
char *tmp = xstrndup(arg, 7); // warning: xdigit would need 8, not 7 |
|
122 |
j = index_in_strings(classes, tmp) + 1; |
/* xdigit needs 8, not 7 */ |
123 |
free(tmp); |
i = 7 + (arg[0] == 'x'); |
124 |
} |
tmp = xstrndup(arg, i); |
125 |
if (j == CLASS_alnum || j == CLASS_digit) { |
j = index_in_strings(classes, tmp) + 1; |
126 |
|
free(tmp); |
127 |
|
|
128 |
|
if (j == CLASS_invalid) |
129 |
|
goto skip_bracket; |
130 |
|
|
131 |
|
arg += i; |
132 |
|
if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) { |
133 |
for (i = '0'; i <= '9'; i++) |
for (i = '0'; i <= '9'; i++) |
134 |
*buffer++ = i; |
buffer[pos++] = i; |
135 |
} |
} |
136 |
if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) { |
if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) { |
137 |
for (i = 'A'; i <= 'Z'; i++) |
for (i = 'A'; i <= 'Z'; i++) |
138 |
*buffer++ = i; |
buffer[pos++] = i; |
139 |
} |
} |
140 |
if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) { |
if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) { |
141 |
for (i = 'a'; i <= 'z'; i++) |
for (i = 'a'; i <= 'z'; i++) |
142 |
*buffer++ = i; |
buffer[pos++] = i; |
143 |
} |
} |
144 |
if (j == CLASS_space || j == CLASS_blank) { |
if (j == CLASS_space || j == CLASS_blank) { |
145 |
*buffer++ = '\t'; |
buffer[pos++] = '\t'; |
146 |
if (j == CLASS_space) { |
if (j == CLASS_space) { |
147 |
*buffer++ = '\n'; |
buffer[pos++] = '\n'; |
148 |
*buffer++ = '\v'; |
buffer[pos++] = '\v'; |
149 |
*buffer++ = '\f'; |
buffer[pos++] = '\f'; |
150 |
*buffer++ = '\r'; |
buffer[pos++] = '\r'; |
151 |
} |
} |
152 |
*buffer++ = ' '; |
buffer[pos++] = ' '; |
153 |
} |
} |
154 |
if (j == CLASS_punct || j == CLASS_cntrl) { |
if (j == CLASS_punct || j == CLASS_cntrl) { |
155 |
for (i = '\0'; i <= ASCII; i++) |
for (i = '\0'; i < ASCII; i++) { |
156 |
if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i)) |
if ((j == CLASS_punct && isprint_asciionly(i) && !isalnum(i) && !isspace(i)) |
157 |
|| (j == CLASS_cntrl && iscntrl(i))) |
|| (j == CLASS_cntrl && iscntrl(i)) |
158 |
*buffer++ = i; |
) { |
159 |
} |
buffer[pos++] = i; |
160 |
if (j == CLASS_invalid) { |
} |
161 |
*buffer++ = '['; |
} |
|
*buffer++ = ':'; |
|
|
continue; |
|
162 |
} |
} |
163 |
break; |
if (j == CLASS_xdigit) { |
164 |
|
for (i = 'A'; i <= 'F'; i++) { |
165 |
|
buffer[pos + 6] = i | 0x20; |
166 |
|
buffer[pos++] = i; |
167 |
|
} |
168 |
|
pos += 6; |
169 |
|
} |
170 |
|
continue; |
171 |
} |
} |
172 |
/* "[xyz...", i=x, arg points to y */ |
/* "[xyz...", i=x, arg points to y */ |
173 |
if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */ |
if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */ |
174 |
*buffer++ = *arg; /* copy CHAR */ |
buffer[pos++] = *arg; /* copy CHAR */ |
175 |
if (!*arg || arg[1] != '=' || arg[2] != ']') |
if (!arg[0] || arg[1] != '=' || arg[2] != ']') |
176 |
bb_show_usage(); |
bb_show_usage(); |
177 |
arg += 3; /* skip CHAR=] */ |
arg += 3; /* skip CHAR=] */ |
178 |
continue; |
continue; |
179 |
} |
} |
180 |
if (i == '\0' || *arg != '-') { /* not [x-...] - copy verbatim */ |
/* The rest of "[xyz..." cases is treated as normal |
181 |
*buffer++ = '['; |
* string, "[" has no special meaning here: |
182 |
arg--; /* points to x */ |
* tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z", |
183 |
continue; /* copy all, including eventual ']' */ |
* also try tr "[a-z]" "_A-Z+" and you'll see that |
184 |
} |
* [] is not special here. |
185 |
/* [x-z] */ |
*/ |
186 |
arg++; /* skip - */ |
skip_bracket: |
187 |
if (arg[0] == '\0' || arg[1] != ']') |
arg -= 2; /* points to "[" in "[xyz..." */ |
|
bb_show_usage(); |
|
|
ac = *arg++; |
|
|
while (i <= ac) |
|
|
*buffer++ = i++; |
|
|
arg++; /* skip ] */ |
|
|
continue; |
|
188 |
} |
} |
189 |
*buffer++ = *arg++; |
buffer[pos++] = *arg++; |
190 |
} |
} |
191 |
return (buffer - buffer_start); |
return pos; |
192 |
} |
} |
193 |
|
|
194 |
|
/* NB: buffer is guaranteed to be at least TR_BUFSIZE |
195 |
|
* (which is >= ASCII) big. |
196 |
|
*/ |
197 |
static int complement(char *buffer, int buffer_len) |
static int complement(char *buffer, int buffer_len) |
198 |
{ |
{ |
199 |
int i, j, ix; |
int len; |
200 |
char conv[ASCII + 2]; |
char conv[ASCII]; |
201 |
|
unsigned char ch; |
202 |
ix = 0; |
|
203 |
for (i = '\0'; i <= ASCII; i++) { |
len = 0; |
204 |
for (j = 0; j < buffer_len; j++) |
ch = '\0'; |
205 |
if (buffer[j] == i) |
while (1) { |
206 |
break; |
if (memchr(buffer, ch, buffer_len) == NULL) |
207 |
if (j == buffer_len) |
conv[len++] = ch; |
208 |
conv[ix++] = i & ASCII; |
if (++ch == '\0') |
209 |
|
break; |
210 |
} |
} |
211 |
memcpy(buffer, conv, ix); |
memcpy(buffer, conv, len); |
212 |
return ix; |
return len; |
213 |
} |
} |
214 |
|
|
215 |
int tr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
int tr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
216 |
int tr_main(int argc UNUSED_PARAM, char **argv) |
int tr_main(int argc UNUSED_PARAM, char **argv) |
217 |
{ |
{ |
|
int output_length = 0, input_length; |
|
218 |
int i; |
int i; |
219 |
smalluint flags; |
smalluint opts; |
220 |
ssize_t read_chars = 0; |
ssize_t read_chars; |
221 |
size_t in_index = 0, out_index = 0; |
size_t in_index, out_index; |
222 |
unsigned last = UCHAR_MAX + 1; /* not equal to any char */ |
unsigned last = UCHAR_MAX + 1; /* not equal to any char */ |
223 |
unsigned char coded, c; |
unsigned char coded, c; |
224 |
unsigned char *output = xmalloc(BUFSIZ); |
char *str1 = xmalloc(TR_BUFSIZ); |
225 |
char *vector = xzalloc((ASCII+1) * 3); |
char *str2 = xmalloc(TR_BUFSIZ); |
226 |
char *invec = vector + (ASCII+1); |
int str2_length; |
227 |
char *outvec = vector + (ASCII+1) * 2; |
int str1_length; |
228 |
|
char *vector = xzalloc(ASCII * 3); |
229 |
#define TR_OPT_complement (1 << 0) |
char *invec = vector + ASCII; |
230 |
#define TR_OPT_delete (1 << 1) |
char *outvec = vector + ASCII * 2; |
231 |
#define TR_OPT_squeeze_reps (1 << 2) |
|
232 |
|
#define TR_OPT_complement (3 << 0) |
233 |
flags = getopt32(argv, "+cds"); /* '+': stop at first non-option */ |
#define TR_OPT_delete (1 << 2) |
234 |
argv += optind; |
#define TR_OPT_squeeze_reps (1 << 3) |
235 |
|
|
236 |
for (i = 0; i <= ASCII; i++) { |
for (i = 0; i < ASCII; i++) { |
237 |
vector[i] = i; |
vector[i] = i; |
238 |
/*invec[i] = outvec[i] = FALSE; - done by xzalloc */ |
/*invec[i] = outvec[i] = FALSE; - done by xzalloc */ |
239 |
} |
} |
240 |
|
|
241 |
#define tr_buf bb_common_bufsiz1 |
/* -C/-c difference is that -C complements "characters", |
242 |
if (*argv != NULL) { |
* and -c complements "values" (binary bytes I guess). |
243 |
input_length = expand(*argv++, tr_buf); |
* In POSIX locale, these are the same. |
244 |
if (flags & TR_OPT_complement) |
*/ |
245 |
input_length = complement(tr_buf, input_length); |
|
246 |
if (*argv) { |
opt_complementary = "-1"; |
247 |
if (argv[0][0] == '\0') |
opts = getopt32(argv, "+Ccds"); /* '+': stop at first non-option */ |
248 |
bb_error_msg_and_die("STRING2 cannot be empty"); |
argv += optind; |
249 |
output_length = expand(*argv, (char *)output); |
|
250 |
map(vector, (unsigned char *)tr_buf, input_length, output, output_length); |
str1_length = expand(*argv++, &str1); |
251 |
} |
str2_length = 0; |
252 |
for (i = 0; i < input_length; i++) |
if (opts & TR_OPT_complement) |
253 |
invec[(unsigned char)tr_buf[i]] = TRUE; |
str1_length = complement(str1, str1_length); |
254 |
for (i = 0; i < output_length; i++) |
if (*argv) { |
255 |
outvec[output[i]] = TRUE; |
if (argv[0][0] == '\0') |
256 |
|
bb_error_msg_and_die("STRING2 cannot be empty"); |
257 |
|
str2_length = expand(*argv, &str2); |
258 |
|
map(vector, str1, str1_length, |
259 |
|
str2, str2_length); |
260 |
} |
} |
261 |
|
for (i = 0; i < str1_length; i++) |
262 |
|
invec[(unsigned char)(str1[i])] = TRUE; |
263 |
|
for (i = 0; i < str2_length; i++) |
264 |
|
outvec[(unsigned char)(str2[i])] = TRUE; |
265 |
|
|
266 |
|
goto start_from; |
267 |
|
|
268 |
|
/* In this loop, str1 space is reused as input buffer, |
269 |
|
* str2 - as output one. */ |
270 |
for (;;) { |
for (;;) { |
271 |
/* If we're out of input, flush output and read more input. */ |
/* If we're out of input, flush output and read more input. */ |
272 |
if ((ssize_t)in_index == read_chars) { |
if ((ssize_t)in_index == read_chars) { |
273 |
if (out_index) { |
if (out_index) { |
274 |
xwrite(STDOUT_FILENO, (char *)output, out_index); |
xwrite(STDOUT_FILENO, str2, out_index); |
275 |
|
start_from: |
276 |
out_index = 0; |
out_index = 0; |
277 |
} |
} |
278 |
read_chars = safe_read(STDIN_FILENO, tr_buf, BUFSIZ); |
read_chars = safe_read(STDIN_FILENO, str1, TR_BUFSIZ); |
279 |
if (read_chars <= 0) { |
if (read_chars <= 0) { |
280 |
if (read_chars < 0) |
if (read_chars < 0) |
281 |
bb_perror_msg_and_die(bb_msg_read_error); |
bb_perror_msg_and_die(bb_msg_read_error); |
282 |
exit(EXIT_SUCCESS); |
break; |
283 |
} |
} |
284 |
in_index = 0; |
in_index = 0; |
285 |
} |
} |
286 |
c = tr_buf[in_index++]; |
c = str1[in_index++]; |
287 |
coded = vector[c]; |
if ((opts & TR_OPT_delete) && invec[c]) |
|
if ((flags & TR_OPT_delete) && invec[c]) |
|
288 |
continue; |
continue; |
289 |
if ((flags & TR_OPT_squeeze_reps) && last == coded |
coded = vector[c]; |
290 |
&& (invec[c] || outvec[coded])) |
if ((opts & TR_OPT_squeeze_reps) && last == coded |
291 |
|
&& (invec[c] || outvec[coded]) |
292 |
|
) { |
293 |
continue; |
continue; |
294 |
output[out_index++] = last = coded; |
} |
295 |
|
str2[out_index++] = last = coded; |
296 |
} |
} |
297 |
/* NOTREACHED */ |
|
298 |
return EXIT_SUCCESS; |
return EXIT_SUCCESS; |
299 |
} |
} |