|
1 /* |
|
2 * Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of the License "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 /***************************************************************** |
|
20 ** File: character.c |
|
21 ** Description: |
|
22 * |
|
23 * Note: all of the char functions assume that the input buffer points |
|
24 * to an array of characters which contains a null somewhere and is |
|
25 * correctly encoded for the particular encoding. Bad things can happen |
|
26 * if this is not the case. In order to avoid having to check these |
|
27 * conditions on every operation, a set of validate functions is provided |
|
28 * to pre-test a string where the caller is not sure these conditions |
|
29 * are met. It is especially important, when calling character operations |
|
30 * on bytecode, to make sure to validate all strings. |
|
31 * |
|
32 *****************************************************************/ |
|
33 #include "cxml_internal.h" |
|
34 #include <xml/cxml/nw_string_char.h> |
|
35 |
|
36 /* |
|
37 * TODO: Note that there is some duplication between the Validate* |
|
38 * calls and the string length function. The Validate*S could return |
|
39 * the length too. |
|
40 */ |
|
41 |
|
42 /* |
|
43 * Check that storage points to a valid UTF8 string no longer |
|
44 * than length bytes. |
|
45 */ |
|
46 static NW_Int32 |
|
47 StringValidUTF8 (NW_Byte * storage, NW_Uint32 length) |
|
48 { |
|
49 NW_Uint32 i; |
|
50 NW_Byte bits; |
|
51 |
|
52 NW_ASSERT(storage != NULL); |
|
53 NW_ASSERT(length != 0); |
|
54 |
|
55 for (i = 0; i < length;) |
|
56 { |
|
57 if (storage[i] == 0) |
|
58 { |
|
59 return 1; |
|
60 } |
|
61 bits = (NW_Byte) (storage[i] >> 4); |
|
62 if (bits < 8) |
|
63 { |
|
64 i++; |
|
65 } |
|
66 else if ((bits == 12) || (bits == 13)) |
|
67 { |
|
68 i+=2; |
|
69 } |
|
70 else if (bits == 14) |
|
71 { |
|
72 i += 3; |
|
73 } |
|
74 else if (bits == 15) |
|
75 { |
|
76 i += 4; |
|
77 } |
|
78 else |
|
79 { |
|
80 return 0; |
|
81 } |
|
82 } |
|
83 return 0; |
|
84 } |
|
85 |
|
86 |
|
87 /* |
|
88 * Check validity of UCS2 string storage |
|
89 */ |
|
90 static NW_Int32 |
|
91 StringValidUCS2 (NW_Byte * storage, NW_Uint32 length) |
|
92 { |
|
93 NW_Uint32 i; |
|
94 |
|
95 NW_ASSERT(storage != NULL); |
|
96 NW_ASSERT(length != 0); |
|
97 |
|
98 for (i = 0; i < (length - 1); i += 2) |
|
99 { |
|
100 if (((storage[i] << 8) | storage[i + 1]) == 0) |
|
101 { |
|
102 return 1; |
|
103 } |
|
104 } |
|
105 return 0; |
|
106 } |
|
107 |
|
108 |
|
109 /* |
|
110 * Check validity of ISO8859 string storage |
|
111 */ |
|
112 static NW_Int32 |
|
113 StringValidISO88591 (NW_Byte * storage, NW_Uint32 length) |
|
114 { |
|
115 |
|
116 NW_Uint32 i; |
|
117 |
|
118 NW_ASSERT(storage != NULL); |
|
119 NW_ASSERT(length != 0); |
|
120 |
|
121 for (i = 0; i < length; i++) |
|
122 { |
|
123 if (storage[i] == 0) |
|
124 { |
|
125 return 1; |
|
126 } |
|
127 } |
|
128 return 0; |
|
129 } |
|
130 |
|
131 |
|
132 /* |
|
133 * Check validity of ASCII string storage |
|
134 */ |
|
135 static NW_Int32 |
|
136 StringValidUSASCII (NW_Byte * storage, NW_Uint32 length) |
|
137 { |
|
138 NW_Uint32 i; |
|
139 |
|
140 NW_ASSERT(storage != NULL); |
|
141 NW_ASSERT(length != 0); |
|
142 |
|
143 for (i = 0; i < length; i++) |
|
144 { |
|
145 if (storage[i] == 0) |
|
146 { |
|
147 return 1; |
|
148 } |
|
149 } |
|
150 return 0; |
|
151 } |
|
152 |
|
153 |
|
154 /* |
|
155 * Check the given charset encoding (MIBENUM) and if it |
|
156 * is supported. |
|
157 */ |
|
158 |
|
159 NW_Status_t |
|
160 NW_String_charsetValid (NW_Uint32 encoding) |
|
161 { |
|
162 switch (encoding) |
|
163 { |
|
164 case HTTP_iso_10646_ucs_2: |
|
165 case HTTP_iso_8859_1: |
|
166 case HTTP_utf_8: |
|
167 case HTTP_us_ascii: |
|
168 return NW_STAT_SUCCESS; |
|
169 default: |
|
170 return NW_STAT_WBXML_ERROR_CHARSET_UNSUPPORTED; |
|
171 } |
|
172 } |
|
173 |
|
174 |
|
175 /* |
|
176 * RETURN -1 if the encoding is not supported |
|
177 */ |
|
178 NW_Int32 |
|
179 NW_String_valid(NW_Byte * storage, NW_Uint32 length, NW_Uint32 encoding) |
|
180 { |
|
181 if (encoding == HTTP_iso_10646_ucs_2) |
|
182 { |
|
183 return StringValidUCS2 (storage, length); |
|
184 } |
|
185 else if (encoding == HTTP_utf_8) |
|
186 { |
|
187 return StringValidUTF8 (storage, length); |
|
188 } |
|
189 else if (encoding == HTTP_iso_8859_1) |
|
190 { |
|
191 return StringValidISO88591 (storage, length); |
|
192 } |
|
193 else if (encoding == HTTP_us_ascii) |
|
194 { |
|
195 return StringValidUSASCII (storage, length); |
|
196 } |
|
197 |
|
198 return -1; |
|
199 } |
|
200 |
|
201 |
|
202 /* |
|
203 * TODO: The following routines are taken from Rainbow. |
|
204 * They should be revisited for better efficiency, etc. |
|
205 */ |
|
206 |
|
207 /* |
|
208 * Read one UTF8 character from a buffer and store it as a NW_Ucs2. |
|
209 * Returns number of input bytes read. |
|
210 */ |
|
211 static NW_Int32 |
|
212 ReadUTF8Char (NW_Byte * buff, NW_Ucs2 * c) |
|
213 { |
|
214 switch ((buff[0] >> 4) & 0xf) |
|
215 { |
|
216 case 0: |
|
217 case 1: |
|
218 case 2: |
|
219 case 3: |
|
220 case 4: |
|
221 case 5: |
|
222 case 6: |
|
223 case 7: |
|
224 /* 1 NW_Byte */ |
|
225 *c = (NW_Ucs2) buff[0]; |
|
226 return 1; |
|
227 |
|
228 case 12: |
|
229 case 13: |
|
230 /* 2 bytes */ |
|
231 if ((buff[1] & 0xC0) != 0x80) |
|
232 { |
|
233 return -1; |
|
234 } |
|
235 *c = (NW_Ucs2) (((buff[0] & 0x1F) << 6) | (buff[1] & 0x3F)); |
|
236 return 2; |
|
237 |
|
238 case 14: |
|
239 /* 3 bytes */ |
|
240 if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80)) |
|
241 { |
|
242 return -1; |
|
243 } |
|
244 *c = (NW_Ucs2) (((buff[0] & 0x0F) << 12) | |
|
245 ((buff[1] & 0x3F) << 6) | ((buff[2] & 0x3F) << 0)); |
|
246 return 3; |
|
247 |
|
248 //we used not to handle 4-bytes UTF-8 case (only 16 bits handled), the case 15 is newly added, it may cause |
|
249 //problem if in an application the a 4-byte character would convert to ucs2 encoding. |
|
250 case 15: |
|
251 /* 4 bytes */ |
|
252 |
|
253 if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80) && ((buff[3] & 0xC0) != 0x80)) |
|
254 { |
|
255 return -1; |
|
256 } |
|
257 *c = (((buff[0] & 0x07) << 18) | |
|
258 ((buff[1] & 0x3F) << 12) | |
|
259 ((buff[2] & 0x3F) << 6) | |
|
260 (buff[3] & 0x3F)); |
|
261 return 4; |
|
262 |
|
263 |
|
264 |
|
265 default: |
|
266 return -1; /* Bad format */ |
|
267 } |
|
268 } |
|
269 |
|
270 |
|
271 /* |
|
272 * Write a NW_Ucs2 into a buffer as UTF8. Returns number of bytes written |
|
273 */ |
|
274 NW_Uint32 |
|
275 NW_String_writeUTF8Char (NW_Ucs2 c, NW_Byte * buff) |
|
276 { |
|
277 if (c <= 0x007F) |
|
278 { |
|
279 /* 0x0000 - 0x007F: 1 NW_Byte UTF-8 encoding. */ |
|
280 buff[0] = (NW_Byte) c; |
|
281 return 1; |
|
282 } |
|
283 else if (c > 0x07FF) |
|
284 { |
|
285 /* 0x0800 - 0xFFFF: 3 NW_Byte UTF-8 encoding. */ |
|
286 buff[0] = (NW_Byte) (0xE0 | ((c >> 12) & 0x0F)); |
|
287 buff[1] = (NW_Byte) (0x80 | ((c >> 6) & 0x3F)); |
|
288 buff[2] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); |
|
289 return 3; |
|
290 } |
|
291 else |
|
292 { |
|
293 /* 0x0080 - 0x07ff: 2 NW_Byte UTF-8 encoding. */ |
|
294 buff[0] = (NW_Byte) (0xC0 | ((c >> 6) & 0x1F)); |
|
295 buff[1] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); |
|
296 return 2; |
|
297 } |
|
298 } |
|
299 |
|
300 |
|
301 static NW_Int32 |
|
302 ReadInt16Char (NW_Byte * buff, NW_Ucs2 * c) |
|
303 { |
|
304 /* read unaligned native-endian to aligned native-endian */ |
|
305 (void) NW_Mem_memcpy(c, buff, sizeof(NW_Ucs2)); |
|
306 return sizeof(NW_Ucs2); |
|
307 } |
|
308 |
|
309 static NW_Int32 |
|
310 ReadISO88591Char (NW_Byte * buff, NW_Ucs2 * c) |
|
311 { |
|
312 *c = buff[0]; |
|
313 return 1; |
|
314 } |
|
315 |
|
316 static NW_Int32 |
|
317 ReadUSASCIIChar (NW_Byte * buff, NW_Ucs2 * c) |
|
318 { |
|
319 *c = buff[0]; |
|
320 return 1; |
|
321 } |
|
322 |
|
323 /* |
|
324 * Read one character of some encoding, returning the NW_Ucs2 |
|
325 * equivalent and the count of raw characters read |
|
326 * |
|
327 * RETURN -1 if encoding is not supported |
|
328 */ |
|
329 EXPORT_C NW_Int32 |
|
330 NW_String_readChar (NW_Byte * buff, NW_Ucs2 * c, NW_Uint32 encoding) |
|
331 { |
|
332 NW_Int32 nbytes = 0; |
|
333 |
|
334 if (encoding == HTTP_iso_10646_ucs_2) |
|
335 return ReadInt16Char (&buff[nbytes], c); |
|
336 else if (encoding == HTTP_utf_8) |
|
337 return ReadUTF8Char (&buff[nbytes], c); |
|
338 else if (encoding == HTTP_iso_8859_1) |
|
339 return ReadISO88591Char (&buff[nbytes], c); |
|
340 else if (encoding == HTTP_us_ascii) |
|
341 return ReadUSASCIIChar (&buff[nbytes], c); |
|
342 |
|
343 return -1; |
|
344 } |
|
345 |
|
346 |
|
347 /* |
|
348 * Get the length of a character string in some encoding. Returns the number |
|
349 * of characters (less the terminating char). The out param byte_count returns |
|
350 * the number of bytes of storage scanned (including the terminating char). |
|
351 * Note that there is NO validity check here. This should be done first if |
|
352 * needed. TODO: Also note that the validity check could return the length |
|
353 * directly, thus eliminating the need for call to this function when |
|
354 * doint32 validity checkint32. |
|
355 */ |
|
356 EXPORT_C NW_Int32 |
|
357 NW_String_charBuffGetLength (void *buffer, NW_Uint32 encoding, NW_Uint32 * byte_count) |
|
358 { |
|
359 NW_Int32 chars = 0; |
|
360 NW_Ucs2 c = 1; |
|
361 NW_Int32 retval = 0; |
|
362 |
|
363 *byte_count = 0; |
|
364 while (c) |
|
365 { |
|
366 c = 0; /* partial protection against an infinite loop */ |
|
367 retval = NW_String_readChar ((NW_Byte *) buffer + *byte_count, &c, encoding); |
|
368 if(retval < 0){ |
|
369 return -1; |
|
370 } |
|
371 (*byte_count) += (NW_Uint32) retval; |
|
372 chars++; |
|
373 } |
|
374 |
|
375 return chars - 1; |
|
376 } |
|
377 |
|
378 |
|
379 /* |
|
380 * Conversions among character strings of various types and ucs2. |
|
381 * These functions assume that the length in characters of the |
|
382 * input buffer has been pre-calculated, so that this operation |
|
383 * doesn't have to be performed for every conversion. This works well |
|
384 * for String_t which store the character count. |
|
385 * |
|
386 * RETURN NULL if malloc fails |
|
387 */ |
|
388 NW_String_UCS2Buff_t * |
|
389 NW_String_charToUCS2Buff (NW_Byte * s, NW_Uint32 encoding) |
|
390 { |
|
391 NW_String_UCS2Buff_t *storage; |
|
392 NW_Ucs2 c; |
|
393 NW_Int32 i; |
|
394 NW_Int32 count = 0; |
|
395 NW_Int32 length = 0; |
|
396 NW_Uint32 byteCount = 0; |
|
397 NW_Int32 retval = 0; |
|
398 |
|
399 if (!NW_String_charsetValid(encoding)) |
|
400 { |
|
401 return NULL; |
|
402 } |
|
403 |
|
404 length = NW_String_charBuffGetLength(s, encoding, &byteCount); |
|
405 if(length < 0){ |
|
406 return NULL; |
|
407 } |
|
408 storage = |
|
409 (NW_String_UCS2Buff_t*) |
|
410 NW_Mem_Malloc(((NW_Uint32)length + 1) * sizeof (NW_String_UCS2Buff_t)); |
|
411 if (storage == NULL) |
|
412 { |
|
413 return NULL; |
|
414 } |
|
415 |
|
416 for (i = 0; i < length; i++) |
|
417 { |
|
418 retval = NW_String_readChar (s + count, &c, encoding); |
|
419 if(retval < 0){ |
|
420 NW_Mem_Free(storage); |
|
421 return NULL; |
|
422 } |
|
423 count += retval; |
|
424 storage[i].bytes[0] = (NW_Byte) ((c & 0xff00) >> 8); |
|
425 storage[i].bytes[1] = (NW_Byte) (c & 0xff); |
|
426 } |
|
427 storage[length].bytes[0] = 0; |
|
428 storage[length].bytes[1] = 0; |
|
429 |
|
430 return storage; |
|
431 } |
|
432 |
|
433 |
|
434 /* |
|
435 * TODO: is this a public or private function ??? |
|
436 */ |
|
437 NW_String_UCS2Buff_t * |
|
438 NW_String_UTF8ToUCS2Buff (NW_Byte * s) |
|
439 { |
|
440 return NW_String_charToUCS2Buff (s, HTTP_utf_8); |
|
441 } |
|
442 |
|
443 |
|
444 /* |
|
445 * TODO: is this a public or private function ??? |
|
446 */ |
|
447 NW_String_UCS2Buff_t * |
|
448 NW_String_ISO88591ToUCS2Buff (NW_Byte * s) |
|
449 { |
|
450 return NW_String_charToUCS2Buff (s, HTTP_iso_8859_1); |
|
451 } |
|
452 |
|
453 |
|
454 /* |
|
455 * RETURN NULL if malloc fails |
|
456 */ |
|
457 NW_Byte * |
|
458 NW_String_UCS2ToUTF8 (NW_String_UCS2Buff_t * s, NW_Uint32 length) |
|
459 { |
|
460 NW_Byte *tstore; |
|
461 NW_Byte *storage; |
|
462 NW_Ucs2 c; |
|
463 NW_Uint32 i; |
|
464 NW_Int32 count = 0; |
|
465 NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, |
|
466 because s is a structure and the size of a structure is not fixed |
|
467 in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte |
|
468 (address alignment issue) */ |
|
469 |
|
470 tstore = (NW_Byte *) NW_Mem_Malloc ((length + 1) * 3); |
|
471 if (tstore == NULL) |
|
472 { |
|
473 return NULL; |
|
474 } |
|
475 |
|
476 for (i = 0; i < length; i++) |
|
477 { |
|
478 ReadInt16Char ((NW_Byte *) (src + i), &c); |
|
479 count += NW_String_writeUTF8Char (c, tstore + count); |
|
480 } |
|
481 *(tstore + count) = 0; |
|
482 storage = (NW_Byte *) NW_Mem_Malloc (count + 1); |
|
483 if (storage) |
|
484 { |
|
485 NW_Mem_memcpy (storage, tstore, count + 1); |
|
486 } |
|
487 NW_Mem_Free (tstore); |
|
488 |
|
489 return storage; |
|
490 } |
|
491 |
|
492 |
|
493 /* |
|
494 * RETURN NULL if malloc fails |
|
495 * byteCount is total allocation size of s as far as conversion is concerned |
|
496 */ |
|
497 NW_Byte * |
|
498 NW_String_UCS2ToISO88591 (NW_String_UCS2Buff_t * s, NW_Uint32 byteCount) |
|
499 { |
|
500 NW_Byte *storage = NULL; |
|
501 NW_Ucs2 c; |
|
502 NW_Uint32 i; |
|
503 NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, |
|
504 because s is a structure and the size of a structure is not fixed |
|
505 in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte |
|
506 (address alignment issue) */ |
|
507 |
|
508 storage = (NW_Byte *) NW_Mem_Malloc (byteCount + 1); |
|
509 if (storage == NULL) |
|
510 { |
|
511 return NULL; |
|
512 } |
|
513 |
|
514 for (i = 0; i < byteCount; i++) |
|
515 { |
|
516 ReadInt16Char ((NW_Byte *) (src + i), &c); |
|
517 storage[i] = (NW_Byte) (c & 0xff); |
|
518 } |
|
519 storage[byteCount] = 0; |
|
520 |
|
521 return storage; |
|
522 } |
|
523 |
|
524 /* Ordered comparison of ucs2 strings */ |
|
525 NW_Int32 |
|
526 NW_String_UCS2BuffCmp (NW_String_UCS2Buff_t * s1, |
|
527 NW_String_UCS2Buff_t * s2, |
|
528 NW_Bool matchCase) |
|
529 { |
|
530 NW_Ucs2 c1, c2; |
|
531 NW_Ucs2 *src1 = (NW_Ucs2 *)s1; /*WMS we should use UCS2 pointer, */ |
|
532 NW_Ucs2 *src2 = (NW_Ucs2 *)s2; /*because s is a structure and the size of a structure is not fixed |
|
533 in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte |
|
534 (address alignment issue) */ |
|
535 |
|
536 while ( ( *src1 ) || ( *src2 ) ) |
|
537 { |
|
538 ReadInt16Char ((NW_Byte *) src1++, &c1); |
|
539 ReadInt16Char ((NW_Byte *) src2++, &c2); |
|
540 |
|
541 if (matchCase == NW_FALSE) { |
|
542 c1 = CXML_Str_ToLower (c1); |
|
543 c2 = CXML_Str_ToLower (c2); |
|
544 } |
|
545 if (c1 == c2) |
|
546 { |
|
547 continue; |
|
548 } |
|
549 return (c1 < c2) ? -1 : 1; |
|
550 } |
|
551 |
|
552 return 0; |
|
553 |
|
554 } |
|
555 |
|
556 |
|
557 /* Assumes s2 is null terminated, native byte order |
|
558 and aligned for 16-bit access */ |
|
559 NW_Status_t |
|
560 NW_String_CmpToNativeAlignedUCS2 (NW_Uint32 encoding, NW_Uint32 charCount, |
|
561 NW_Uint8 * s1, NW_Uint16 * s2, |
|
562 NW_Int32 * r) |
|
563 { |
|
564 NW_Uint32 i; |
|
565 NW_Int32 byteCount = 0; |
|
566 NW_Ucs2 c1; |
|
567 |
|
568 for (i = 0; i < charCount; i++, s1 += byteCount, s2++) { |
|
569 byteCount = NW_String_readChar (s1, &c1, encoding); |
|
570 if (byteCount < 0) { |
|
571 return NW_STAT_FAILURE; |
|
572 } |
|
573 *r = c1 - *s2; |
|
574 if (*r || (*s2 == 0)) { |
|
575 break; |
|
576 } |
|
577 } |
|
578 /* You can exit the above loop three ways: i == charCount or |
|
579 when i != charCount because one of mismatch or null termination |
|
580 of s2 is encountered. The only one that needs a fixup is if |
|
581 i == charCount but s2 isn't at null termination. */ |
|
582 |
|
583 /*lint -e{794} Conceivable use of null pointer */ |
|
584 if ((i == charCount) && (*s2 != 0)) { |
|
585 *r = -*s2; |
|
586 } |
|
587 return NW_STAT_SUCCESS; |
|
588 } |