Contents of /trunk/mkinitrd-magellan/busybox/libbb/unicode.c
Parent Directory | Revision Log
Revision 1000 -
(show annotations)
(download)
Sun May 30 12:27:29 2010 UTC (14 years, 3 months ago) by niro
File MIME type: text/plain
File size: 6725 byte(s)
Sun May 30 12:27:29 2010 UTC (14 years, 3 months ago) by niro
File MIME type: text/plain
File size: 6725 byte(s)
-added missing files
1 | /* vi: set sw=4 ts=4: */ |
2 | /* |
3 | * Unicode support routines. |
4 | * |
5 | * Copyright (C) 2009 Denys Vlasenko |
6 | * |
7 | * Licensed under GPL version 2, see file LICENSE in this tarball for details. |
8 | */ |
9 | #include "libbb.h" |
10 | #include "unicode.h" |
11 | |
/*
 * Global Unicode mode flag. The routines in this file compare it against
 * UNICODE_UNKNOWN, UNICODE_ON and UNICODE_OFF, and init_unicode() stores
 * UNICODE_ON or UNICODE_OFF into it.
 * The variable is defined here only if unicode.h has not already
 * #defined unicode_status as a constant.
 */
#ifndef unicode_status
uint8_t unicode_status;
#endif
16 | |
17 | /* This file is compiled only if FEATURE_ASSUME_UNICODE is on. |
18 | * We check other options and decide whether to use libc support |
19 | * via locale, or use our own logic: |
20 | */ |
21 | |
22 | #if ENABLE_LOCALE_SUPPORT |
23 | |
24 | /* Unicode support using libc locale support. */ |
25 | |
26 | void FAST_FUNC init_unicode(void) |
27 | { |
28 | /* In unicode, this is a one character string */ |
29 | static const char unicode_0x394[] = { 0xce, 0x94, 0 }; |
30 | |
31 | if (unicode_status != UNICODE_UNKNOWN) |
32 | return; |
33 | |
34 | unicode_status = unicode_strlen(unicode_0x394) == 1 ? UNICODE_ON : UNICODE_OFF; |
35 | } |
36 | |
37 | #else |
38 | |
39 | /* Homegrown Unicode support. It knows only C and Unicode locales. */ |
40 | |
41 | # if ENABLE_FEATURE_CHECK_UNICODE_IN_ENV |
42 | void FAST_FUNC init_unicode(void) |
43 | { |
44 | char *lang; |
45 | |
46 | if (unicode_status != UNICODE_UNKNOWN) |
47 | return; |
48 | |
49 | unicode_status = UNICODE_OFF; |
50 | lang = getenv("LANG"); |
51 | if (!lang || !(strstr(lang, ".utf") || strstr(lang, ".UTF"))) |
52 | return; |
53 | unicode_status = UNICODE_ON; |
54 | } |
55 | # endif |
56 | |
/*
 * Encode one wide character as a UTF-8 style byte sequence.
 *
 * s:  output buffer; receives 1 to 6 bytes, no NUL terminator is added.
 * wc: the character to encode.
 * Returns the number of bytes stored in s.
 */
static size_t wcrtomb_internal(char *s, wchar_t wc)
{
	uint32_t probe = wc;
	int len, pos;

	/* Plain ASCII is stored as is */
	if (probe < 0x80) {
		s[0] = probe;
		return 1;
	}

	/* RFC 3629 says that Unicode ends at 10FFFF,
	 * but the full 32-bit range is encoded here:
	 *
	 * 80-7FF           -> 110yyyxx 10xxxxxx
	 * 800-FFFF         -> 1110yyyy 10yyyyxx 10xxxxxx
	 * 10000-1FFFFF     -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
	 * 200000-3FFFFFF   -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
	 * 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
	 */

	/* Sequence length: each extra byte buys 5 more value bits.
	 * Capped at 6 (0x80000000+ would otherwise ask for 7). */
	for (len = 2; probe >= 0x800 && len < 6; len++)
		probe >>= 5;

	/* Continuation bytes, last one first: 6 payload bits each */
	for (pos = len - 1; pos > 0; pos--) {
		s[pos] = (wc & 0x3f) | 0x80;
		wc >>= 6;
	}

	/* Lead byte: remaining high bits plus the length marker */
	s[0] = wc | (uint8_t)(0x3f00 >> len);
	return len;
}
93 | size_t FAST_FUNC wcrtomb(char *s, wchar_t wc, mbstate_t *ps UNUSED_PARAM) |
94 | { |
95 | if (unicode_status != UNICODE_ON) { |
96 | *s = wc; |
97 | return 1; |
98 | } |
99 | |
100 | return wcrtomb_internal(s, wc); |
101 | } |
102 | size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n) |
103 | { |
104 | size_t org_n = n; |
105 | |
106 | if (unicode_status != UNICODE_ON) { |
107 | while (n) { |
108 | wchar_t c = *src++; |
109 | *dest++ = c; |
110 | if (c == 0) |
111 | break; |
112 | n--; |
113 | } |
114 | return org_n - n; |
115 | } |
116 | |
117 | while (n >= MB_CUR_MAX) { |
118 | wchar_t wc = *src++; |
119 | size_t len = wcrtomb_internal(dest, wc); |
120 | |
121 | if (wc == L'\0') |
122 | return org_n - n; |
123 | dest += len; |
124 | n -= len; |
125 | } |
126 | while (n) { |
127 | char tbuf[MB_CUR_MAX]; |
128 | wchar_t wc = *src++; |
129 | size_t len = wcrtomb_internal(tbuf, wc); |
130 | |
131 | if (len > n) |
132 | len = n; |
133 | memcpy(dest, tbuf, len); |
134 | if (wc == L'\0') |
135 | return org_n - n; |
136 | dest += len; |
137 | n -= len; |
138 | } |
139 | return org_n - n; |
140 | } |
141 | |
/*
 * Decode one UTF-8 style multibyte character.
 *
 * res: receives the decoded character on success.
 * src: points at the first byte of the sequence.
 * Returns a pointer just past the consumed sequence, or NULL if the
 * sequence is invalid: a stray continuation byte in lead position,
 * a missing or malformed continuation byte, or an overlong encoding
 * (a value stored in more bytes than it needs).
 * On failure *res is left untouched.
 */
static const char *mbstowc_internal(wchar_t *res, const char *src)
{
	/* Smallest value that genuinely requires a sequence of N bytes */
	static const uint32_t min_value[7] = {
		0, 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	int bytes, seq_len;
	uint32_t c = (unsigned char) *src++;

	if (c <= 0x7f) {
		*res = c;
		return src;
	}

	/* 80-7FF           -> 110yyyxx 10xxxxxx */
	/* 800-FFFF         -> 1110yyyy 10yyyyxx 10xxxxxx */
	/* 10000-1FFFFF     -> 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx */
	/* 200000-3FFFFFF   -> 111110tt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */
	/* 4000000-FFFFFFFF -> 111111tt 10tttttt 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx */

	/* Count the leading 1 bits of the lead byte (at most 6):
	 * that is the total length of the sequence */
	bytes = 0;
	do {
		c <<= 1;
		bytes++;
	} while ((c & 0x80) && bytes < 6);
	if (bytes == 1) /* 10xxxxxx: continuation byte cannot start a char */
		return NULL;
	seq_len = bytes;
	/* Drop the length marker, keep the payload bits of the lead byte */
	c = (uint8_t)(c) >> bytes;

	while (--bytes) {
		unsigned ch = (unsigned char) *src++;
		/* Also catches a NUL, so the string end is never overrun */
		if ((ch & 0xc0) != 0x80) {
			return NULL;
		}
		c = (c << 6) + (ch & 0x3f);
	}

	/* Reject overlong encodings, examples: */
	/* 11000000 10000000 would convert to NUL */
	/* 11110000 10000000 10000100 10000000 would convert to 0x100 */
	/* (correct encoding of 0x100: 11000100 10000000) */
	if (c < min_value[seq_len]) {
		return NULL;
		//or maybe 0xfffd; /* replacement character */
	}

	*res = c;
	return src;
}
187 | size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n) |
188 | { |
189 | size_t org_n = n; |
190 | |
191 | if (unicode_status != UNICODE_ON) { |
192 | while (n) { |
193 | unsigned char c = *src++; |
194 | |
195 | if (dest) |
196 | *dest++ = c; |
197 | if (c == 0) |
198 | break; |
199 | n--; |
200 | } |
201 | return org_n - n; |
202 | } |
203 | |
204 | while (n) { |
205 | wchar_t wc; |
206 | src = mbstowc_internal(&wc, src); |
207 | if (src == NULL) /* error */ |
208 | return (size_t) -1L; |
209 | if (dest) |
210 | *dest++ = wc; |
211 | if (wc == 0) /* end-of-string */ |
212 | break; |
213 | n--; |
214 | } |
215 | |
216 | return org_n - n; |
217 | } |
218 | |
219 | #include "unicode_wcwidth.c" |
220 | |
221 | int FAST_FUNC iswspace(wint_t wc) |
222 | { |
223 | return (unsigned)wc <= 0x7f && isspace(wc); |
224 | } |
225 | |
226 | int FAST_FUNC iswalnum(wint_t wc) |
227 | { |
228 | return (unsigned)wc <= 0x7f && isalnum(wc); |
229 | } |
230 | |
231 | int FAST_FUNC iswpunct(wint_t wc) |
232 | { |
233 | return (unsigned)wc <= 0x7f && ispunct(wc); |
234 | } |
235 | |
236 | #endif /* Homegrown Unicode support */ |
237 | |
238 | |
239 | /* The rest is mostly same for libc and for "homegrown" support */ |
240 | |
241 | size_t FAST_FUNC unicode_strlen(const char *string) |
242 | { |
243 | size_t width = mbstowcs(NULL, string, INT_MAX); |
244 | if (width == (size_t)-1L) |
245 | return strlen(string); |
246 | return width; |
247 | } |
248 | |
249 | char* FAST_FUNC unicode_cut_nchars(unsigned width, const char *src) |
250 | { |
251 | char *dst; |
252 | unsigned dst_len; |
253 | |
254 | if (unicode_status != UNICODE_ON) |
255 | return xasprintf("%-*.*s", width, width, src); |
256 | |
257 | dst = NULL; |
258 | dst_len = 0; |
259 | while (1) { |
260 | int w; |
261 | wchar_t wc; |
262 | |
263 | dst = xrealloc(dst, dst_len + 2 * MB_CUR_MAX); |
264 | #if ENABLE_LOCALE_SUPPORT |
265 | { |
266 | mbstate_t mbst = { 0 }; |
267 | ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); |
268 | if (rc <= 0) /* error, or end-of-string */ |
269 | break; |
270 | } |
271 | #else |
272 | src = mbstowc_internal(&wc, src); |
273 | if (!src || wc == 0) /* error, or end-of-string */ |
274 | break; |
275 | #endif |
276 | w = wcwidth(wc); |
277 | if (w < 0) /* non-printable wchar */ |
278 | break; |
279 | width -= w; |
280 | if ((int)width < 0) { /* string is longer than width */ |
281 | width += w; |
282 | while (width) { |
283 | dst[dst_len++] = ' '; |
284 | width--; |
285 | } |
286 | break; |
287 | } |
288 | #if ENABLE_LOCALE_SUPPORT |
289 | { |
290 | mbstate_t mbst = { 0 }; |
291 | dst_len += wcrtomb(&dst[dst_len], wc, &mbst); |
292 | } |
293 | #else |
294 | dst_len += wcrtomb_internal(&dst[dst_len], wc); |
295 | #endif |
296 | } |
297 | dst[dst_len] = '\0'; |
298 | return dst; |
299 | } |
300 | |
301 | unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src) |
302 | { |
303 | if (unicode_status != UNICODE_ON) { |
304 | return width - strnlen(src, width); |
305 | } |
306 | |
307 | while (1) { |
308 | int w; |
309 | wchar_t wc; |
310 | |
311 | #if ENABLE_LOCALE_SUPPORT |
312 | { |
313 | mbstate_t mbst = { 0 }; |
314 | ssize_t rc = mbsrtowcs(&wc, &src, 1, &mbst); |
315 | if (rc <= 0) /* error, or end-of-string */ |
316 | return width; |
317 | } |
318 | #else |
319 | src = mbstowc_internal(&wc, src); |
320 | if (!src || wc == 0) /* error, or end-of-string */ |
321 | return width; |
322 | #endif |
323 | w = wcwidth(wc); |
324 | if (w < 0) /* non-printable wchar */ |
325 | return width; |
326 | width -= w; |
327 | if ((int)width <= 0) /* string is longer than width */ |
328 | return 0; |
329 | } |
330 | } |