diff -r 6bcc0aa4be39 -r 889504eac4fb xml/cxmllibrary/src/string/src/char.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xml/cxmllibrary/src/string/src/char.c Tue Aug 31 17:02:56 2010 +0300 @@ -0,0 +1,588 @@ +/* +* Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of the License "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + + +/***************************************************************** +** File: character.c +** Description: +* +* Note: all of the char functions assume that the input buffer points +* to an array of characters which contains a null somewhere and is +* correctly encoded for the particular encoding. Bad things can happen +* if this is not the case. In order to avoid having to check these +* conditions on every operation, a set of validate functions is provided +* to pre-test a string where the caller is not sure these conditions +* are met. It is especially important, when calling character operations +* on bytecode, to make sure to validate all strings. +* +*****************************************************************/ +#include "cxml_internal.h" +#include + +/* +* TODO: Note that there is some duplication between the Validate* +* calls and the string length function. The Validate*S could return +* the length too. +*/ + +/* +* Check that storage points to a valid UTF8 string no longer +* than length bytes. +*/ +static NW_Int32 +StringValidUTF8 (NW_Byte * storage, NW_Uint32 length) +{ + NW_Uint32 i; + NW_Byte bits; + + NW_ASSERT(storage != NULL); + NW_ASSERT(length != 0); + + for (i = 0; i < length;) + { + if (storage[i] == 0) + { + return 1; + } + bits = (NW_Byte) (storage[i] >> 4); + if (bits < 8) + { + i++; + } + else if ((bits == 12) || (bits == 13)) + { + i+=2; + } + else if (bits == 14) + { + i += 3; + } + else if (bits == 15) + { + i += 4; + } + else + { + return 0; + } + } + return 0; +} + + +/* +* Check validity of UCS2 string storage +*/ +static NW_Int32 +StringValidUCS2 (NW_Byte * storage, NW_Uint32 length) +{ + NW_Uint32 i; + + NW_ASSERT(storage != NULL); + NW_ASSERT(length != 0); + + for (i = 0; i < (length - 1); i += 2) + { + if (((storage[i] << 8) | storage[i + 1]) == 0) + { + return 1; + } + } + return 0; +} + + +/* +* Check validity of ISO8859 string storage +*/ +static NW_Int32 +StringValidISO88591 (NW_Byte * storage, NW_Uint32 length) +{ + + NW_Uint32 i; + + NW_ASSERT(storage != NULL); + NW_ASSERT(length != 0); + + for (i = 0; i < length; i++) + { + if (storage[i] == 0) + { + return 1; + } + } + return 0; +} + + +/* +* Check validity of ASCII string storage +*/ +static NW_Int32 +StringValidUSASCII (NW_Byte * storage, NW_Uint32 length) +{ + NW_Uint32 i; + + NW_ASSERT(storage != NULL); + NW_ASSERT(length != 0); + + for (i = 0; i < length; i++) + { + if (storage[i] == 0) + { + return 1; + } + } + return 0; +} + + +/* +* Check the given charset encoding (MIBENUM) and if it +* is supported. +*/ + +NW_Status_t +NW_String_charsetValid (NW_Uint32 encoding) +{ + switch (encoding) + { + case HTTP_iso_10646_ucs_2: + case HTTP_iso_8859_1: + case HTTP_utf_8: + case HTTP_us_ascii: + return NW_STAT_SUCCESS; + default: + return NW_STAT_WBXML_ERROR_CHARSET_UNSUPPORTED; + } +} + + +/* +* RETURN -1 if the encoding is not supported +*/ +NW_Int32 +NW_String_valid(NW_Byte * storage, NW_Uint32 length, NW_Uint32 encoding) +{ + if (encoding == HTTP_iso_10646_ucs_2) + { + return StringValidUCS2 (storage, length); + } + else if (encoding == HTTP_utf_8) + { + return StringValidUTF8 (storage, length); + } + else if (encoding == HTTP_iso_8859_1) + { + return StringValidISO88591 (storage, length); + } + else if (encoding == HTTP_us_ascii) + { + return StringValidUSASCII (storage, length); + } + + return -1; +} + + +/* +* TODO: The following routines are taken from Rainbow. +* They should be revisited for better efficiency, etc. +*/ + +/* +* Read one UTF8 character from a buffer and store it as a NW_Ucs2. +* Returns number of input bytes read. +*/ +static NW_Int32 +ReadUTF8Char (NW_Byte * buff, NW_Ucs2 * c) +{ + switch ((buff[0] >> 4) & 0xf) + { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + /* 1 NW_Byte */ + *c = (NW_Ucs2) buff[0]; + return 1; + + case 12: + case 13: + /* 2 bytes */ + if ((buff[1] & 0xC0) != 0x80) + { + return -1; + } + *c = (NW_Ucs2) (((buff[0] & 0x1F) << 6) | (buff[1] & 0x3F)); + return 2; + + case 14: + /* 3 bytes */ + if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80)) + { + return -1; + } + *c = (NW_Ucs2) (((buff[0] & 0x0F) << 12) | + ((buff[1] & 0x3F) << 6) | ((buff[2] & 0x3F) << 0)); + return 3; + + //we used not to handle 4-bytes UTF-8 case (only 16 bits handled), the case 15 is newly added, it may cause + //problem if in an application the a 4-byte character would convert to ucs2 encoding. + case 15: + /* 4 bytes */ + + if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80) && ((buff[3] & 0xC0) != 0x80)) + { + return -1; + } + *c = (((buff[0] & 0x07) << 18) | + ((buff[1] & 0x3F) << 12) | + ((buff[2] & 0x3F) << 6) | + (buff[3] & 0x3F)); + return 4; + + + + default: + return -1; /* Bad format */ + } +} + + +/* +* Write a NW_Ucs2 into a buffer as UTF8. Returns number of bytes written +*/ +NW_Uint32 +NW_String_writeUTF8Char (NW_Ucs2 c, NW_Byte * buff) +{ + if (c <= 0x007F) + { + /* 0x0000 - 0x007F: 1 NW_Byte UTF-8 encoding. */ + buff[0] = (NW_Byte) c; + return 1; + } + else if (c > 0x07FF) + { + /* 0x0800 - 0xFFFF: 3 NW_Byte UTF-8 encoding. */ + buff[0] = (NW_Byte) (0xE0 | ((c >> 12) & 0x0F)); + buff[1] = (NW_Byte) (0x80 | ((c >> 6) & 0x3F)); + buff[2] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); + return 3; + } + else + { + /* 0x0080 - 0x07ff: 2 NW_Byte UTF-8 encoding. */ + buff[0] = (NW_Byte) (0xC0 | ((c >> 6) & 0x1F)); + buff[1] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); + return 2; + } +} + + +static NW_Int32 +ReadInt16Char (NW_Byte * buff, NW_Ucs2 * c) +{ + /* read unaligned native-endian to aligned native-endian */ + (void) NW_Mem_memcpy(c, buff, sizeof(NW_Ucs2)); + return sizeof(NW_Ucs2); +} + +static NW_Int32 +ReadISO88591Char (NW_Byte * buff, NW_Ucs2 * c) +{ + *c = buff[0]; + return 1; +} + +static NW_Int32 +ReadUSASCIIChar (NW_Byte * buff, NW_Ucs2 * c) +{ + *c = buff[0]; + return 1; +} + +/* +* Read one character of some encoding, returning the NW_Ucs2 +* equivalent and the count of raw characters read +* +* RETURN -1 if encoding is not supported +*/ +EXPORT_C NW_Int32 +NW_String_readChar (NW_Byte * buff, NW_Ucs2 * c, NW_Uint32 encoding) +{ + NW_Int32 nbytes = 0; + + if (encoding == HTTP_iso_10646_ucs_2) + return ReadInt16Char (&buff[nbytes], c); + else if (encoding == HTTP_utf_8) + return ReadUTF8Char (&buff[nbytes], c); + else if (encoding == HTTP_iso_8859_1) + return ReadISO88591Char (&buff[nbytes], c); + else if (encoding == HTTP_us_ascii) + return ReadUSASCIIChar (&buff[nbytes], c); + + return -1; +} + + +/* +* Get the length of a character string in some encoding. Returns the number +* of characters (less the terminating char). The out param byte_count returns +* the number of bytes of storage scanned (including the terminating char). +* Note that there is NO validity check here. This should be done first if +* needed. TODO: Also note that the validity check could return the length +* directly, thus eliminating the need for call to this function when +* doint32 validity checkint32. +*/ +EXPORT_C NW_Int32 +NW_String_charBuffGetLength (void *buffer, NW_Uint32 encoding, NW_Uint32 * byte_count) +{ + NW_Int32 chars = 0; + NW_Ucs2 c = 1; + NW_Int32 retval = 0; + + *byte_count = 0; + while (c) + { + c = 0; /* partial protection against an infinite loop */ + retval = NW_String_readChar ((NW_Byte *) buffer + *byte_count, &c, encoding); + if(retval < 0){ + return -1; + } + (*byte_count) += (NW_Uint32) retval; + chars++; + } + + return chars - 1; +} + + +/* +* Conversions among character strings of various types and ucs2. +* These functions assume that the length in characters of the +* input buffer has been pre-calculated, so that this operation +* doesn't have to be performed for every conversion. This works well +* for String_t which store the character count. +* +* RETURN NULL if malloc fails +*/ +NW_String_UCS2Buff_t * +NW_String_charToUCS2Buff (NW_Byte * s, NW_Uint32 encoding) +{ + NW_String_UCS2Buff_t *storage; + NW_Ucs2 c; + NW_Int32 i; + NW_Int32 count = 0; + NW_Int32 length = 0; + NW_Uint32 byteCount = 0; + NW_Int32 retval = 0; + + if (!NW_String_charsetValid(encoding)) + { + return NULL; + } + + length = NW_String_charBuffGetLength(s, encoding, &byteCount); + if(length < 0){ + return NULL; + } + storage = + (NW_String_UCS2Buff_t*) + NW_Mem_Malloc(((NW_Uint32)length + 1) * sizeof (NW_String_UCS2Buff_t)); + if (storage == NULL) + { + return NULL; + } + + for (i = 0; i < length; i++) + { + retval = NW_String_readChar (s + count, &c, encoding); + if(retval < 0){ + NW_Mem_Free(storage); + return NULL; + } + count += retval; + storage[i].bytes[0] = (NW_Byte) ((c & 0xff00) >> 8); + storage[i].bytes[1] = (NW_Byte) (c & 0xff); + } + storage[length].bytes[0] = 0; + storage[length].bytes[1] = 0; + + return storage; +} + + +/* +* TODO: is this a public or private function ??? +*/ +NW_String_UCS2Buff_t * +NW_String_UTF8ToUCS2Buff (NW_Byte * s) +{ + return NW_String_charToUCS2Buff (s, HTTP_utf_8); +} + + +/* +* TODO: is this a public or private function ??? +*/ +NW_String_UCS2Buff_t * +NW_String_ISO88591ToUCS2Buff (NW_Byte * s) +{ + return NW_String_charToUCS2Buff (s, HTTP_iso_8859_1); +} + + +/* +* RETURN NULL if malloc fails +*/ +NW_Byte * +NW_String_UCS2ToUTF8 (NW_String_UCS2Buff_t * s, NW_Uint32 length) +{ + NW_Byte *tstore; + NW_Byte *storage; + NW_Ucs2 c; + NW_Uint32 i; + NW_Int32 count = 0; + NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, + because s is a structure and the size of a structure is not fixed + in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte + (address alignment issue) */ + + tstore = (NW_Byte *) NW_Mem_Malloc ((length + 1) * 3); + if (tstore == NULL) + { + return NULL; + } + + for (i = 0; i < length; i++) + { + ReadInt16Char ((NW_Byte *) (src + i), &c); + count += NW_String_writeUTF8Char (c, tstore + count); + } + *(tstore + count) = 0; + storage = (NW_Byte *) NW_Mem_Malloc (count + 1); + if (storage) + { + NW_Mem_memcpy (storage, tstore, count + 1); + } + NW_Mem_Free (tstore); + + return storage; +} + + +/* +* RETURN NULL if malloc fails +* byteCount is total allocation size of s as far as conversion is concerned +*/ +NW_Byte * +NW_String_UCS2ToISO88591 (NW_String_UCS2Buff_t * s, NW_Uint32 byteCount) +{ + NW_Byte *storage = NULL; + NW_Ucs2 c; + NW_Uint32 i; + NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, + because s is a structure and the size of a structure is not fixed + in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte + (address alignment issue) */ + + storage = (NW_Byte *) NW_Mem_Malloc (byteCount + 1); + if (storage == NULL) + { + return NULL; + } + + for (i = 0; i < byteCount; i++) + { + ReadInt16Char ((NW_Byte *) (src + i), &c); + storage[i] = (NW_Byte) (c & 0xff); + } + storage[byteCount] = 0; + + return storage; +} + +/* Ordered comparison of ucs2 strings */ +NW_Int32 +NW_String_UCS2BuffCmp (NW_String_UCS2Buff_t * s1, + NW_String_UCS2Buff_t * s2, + NW_Bool matchCase) +{ + NW_Ucs2 c1, c2; + NW_Ucs2 *src1 = (NW_Ucs2 *)s1; /*WMS we should use UCS2 pointer, */ + NW_Ucs2 *src2 = (NW_Ucs2 *)s2; /*because s is a structure and the size of a structure is not fixed + in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte + (address alignment issue) */ + + while ( ( *src1 ) || ( *src2 ) ) + { + ReadInt16Char ((NW_Byte *) src1++, &c1); + ReadInt16Char ((NW_Byte *) src2++, &c2); + + if (matchCase == NW_FALSE) { + c1 = CXML_Str_ToLower (c1); + c2 = CXML_Str_ToLower (c2); + } + if (c1 == c2) + { + continue; + } + return (c1 < c2) ? -1 : 1; + } + + return 0; + +} + + +/* Assumes s2 is null terminated, native byte order +and aligned for 16-bit access */ +NW_Status_t +NW_String_CmpToNativeAlignedUCS2 (NW_Uint32 encoding, NW_Uint32 charCount, + NW_Uint8 * s1, NW_Uint16 * s2, + NW_Int32 * r) +{ + NW_Uint32 i; + NW_Int32 byteCount = 0; + NW_Ucs2 c1; + + for (i = 0; i < charCount; i++, s1 += byteCount, s2++) { + byteCount = NW_String_readChar (s1, &c1, encoding); + if (byteCount < 0) { + return NW_STAT_FAILURE; + } + *r = c1 - *s2; + if (*r || (*s2 == 0)) { + break; + } + } + /* You can exit the above loop three ways: i == charCount or + when i != charCount because one of mismatch or null termination + of s2 is encountered. The only one that needs a fixup is if + i == charCount but s2 isn't at null termination. */ + + /*lint -e{794} Conceivable use of null pointer */ + if ((i == charCount) && (*s2 != 0)) { + *r = -*s2; + } + return NW_STAT_SUCCESS; +}