diff -r 889504eac4fb -r 604ca70b6235 xml/cxmllibrary/src/string/src/char.c --- a/xml/cxmllibrary/src/string/src/char.c Tue Aug 31 17:02:56 2010 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,588 +0,0 @@ -/* -* Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies). -* All rights reserved. -* This component and the accompanying materials are made available -* under the terms of the License "Eclipse Public License v1.0" -* which accompanies this distribution, and is available -* at the URL "http://www.eclipse.org/legal/epl-v10.html". -* -* Initial Contributors: -* Nokia Corporation - initial contribution. -* -* Contributors: -* -* Description: -* -*/ - - -/***************************************************************** -** File: character.c -** Description: -* -* Note: all of the char functions assume that the input buffer points -* to an array of characters which contains a null somewhere and is -* correctly encoded for the particular encoding. Bad things can happen -* if this is not the case. In order to avoid having to check these -* conditions on every operation, a set of validate functions is provided -* to pre-test a string where the caller is not sure these conditions -* are met. It is especially important, when calling character operations -* on bytecode, to make sure to validate all strings. -* -*****************************************************************/ -#include "cxml_internal.h" -#include - -/* -* TODO: Note that there is some duplication between the Validate* -* calls and the string length function. The Validate*S could return -* the length too. -*/ - -/* -* Check that storage points to a valid UTF8 string no longer -* than length bytes. -*/ -static NW_Int32 -StringValidUTF8 (NW_Byte * storage, NW_Uint32 length) -{ - NW_Uint32 i; - NW_Byte bits; - - NW_ASSERT(storage != NULL); - NW_ASSERT(length != 0); - - for (i = 0; i < length;) - { - if (storage[i] == 0) - { - return 1; - } - bits = (NW_Byte) (storage[i] >> 4); - if (bits < 8) - { - i++; - } - else if ((bits == 12) || (bits == 13)) - { - i+=2; - } - else if (bits == 14) - { - i += 3; - } - else if (bits == 15) - { - i += 4; - } - else - { - return 0; - } - } - return 0; -} - - -/* -* Check validity of UCS2 string storage -*/ -static NW_Int32 -StringValidUCS2 (NW_Byte * storage, NW_Uint32 length) -{ - NW_Uint32 i; - - NW_ASSERT(storage != NULL); - NW_ASSERT(length != 0); - - for (i = 0; i < (length - 1); i += 2) - { - if (((storage[i] << 8) | storage[i + 1]) == 0) - { - return 1; - } - } - return 0; -} - - -/* -* Check validity of ISO8859 string storage -*/ -static NW_Int32 -StringValidISO88591 (NW_Byte * storage, NW_Uint32 length) -{ - - NW_Uint32 i; - - NW_ASSERT(storage != NULL); - NW_ASSERT(length != 0); - - for (i = 0; i < length; i++) - { - if (storage[i] == 0) - { - return 1; - } - } - return 0; -} - - -/* -* Check validity of ASCII string storage -*/ -static NW_Int32 -StringValidUSASCII (NW_Byte * storage, NW_Uint32 length) -{ - NW_Uint32 i; - - NW_ASSERT(storage != NULL); - NW_ASSERT(length != 0); - - for (i = 0; i < length; i++) - { - if (storage[i] == 0) - { - return 1; - } - } - return 0; -} - - -/* -* Check the given charset encoding (MIBENUM) and if it -* is supported. -*/ - -NW_Status_t -NW_String_charsetValid (NW_Uint32 encoding) -{ - switch (encoding) - { - case HTTP_iso_10646_ucs_2: - case HTTP_iso_8859_1: - case HTTP_utf_8: - case HTTP_us_ascii: - return NW_STAT_SUCCESS; - default: - return NW_STAT_WBXML_ERROR_CHARSET_UNSUPPORTED; - } -} - - -/* -* RETURN -1 if the encoding is not supported -*/ -NW_Int32 -NW_String_valid(NW_Byte * storage, NW_Uint32 length, NW_Uint32 encoding) -{ - if (encoding == HTTP_iso_10646_ucs_2) - { - return StringValidUCS2 (storage, length); - } - else if (encoding == HTTP_utf_8) - { - return StringValidUTF8 (storage, length); - } - else if (encoding == HTTP_iso_8859_1) - { - return StringValidISO88591 (storage, length); - } - else if (encoding == HTTP_us_ascii) - { - return StringValidUSASCII (storage, length); - } - - return -1; -} - - -/* -* TODO: The following routines are taken from Rainbow. -* They should be revisited for better efficiency, etc. -*/ - -/* -* Read one UTF8 character from a buffer and store it as a NW_Ucs2. -* Returns number of input bytes read. -*/ -static NW_Int32 -ReadUTF8Char (NW_Byte * buff, NW_Ucs2 * c) -{ - switch ((buff[0] >> 4) & 0xf) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - /* 1 NW_Byte */ - *c = (NW_Ucs2) buff[0]; - return 1; - - case 12: - case 13: - /* 2 bytes */ - if ((buff[1] & 0xC0) != 0x80) - { - return -1; - } - *c = (NW_Ucs2) (((buff[0] & 0x1F) << 6) | (buff[1] & 0x3F)); - return 2; - - case 14: - /* 3 bytes */ - if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80)) - { - return -1; - } - *c = (NW_Ucs2) (((buff[0] & 0x0F) << 12) | - ((buff[1] & 0x3F) << 6) | ((buff[2] & 0x3F) << 0)); - return 3; - - //we used not to handle 4-bytes UTF-8 case (only 16 bits handled), the case 15 is newly added, it may cause - //problem if in an application the a 4-byte character would convert to ucs2 encoding. - case 15: - /* 4 bytes */ - - if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80) && ((buff[3] & 0xC0) != 0x80)) - { - return -1; - } - *c = (((buff[0] & 0x07) << 18) | - ((buff[1] & 0x3F) << 12) | - ((buff[2] & 0x3F) << 6) | - (buff[3] & 0x3F)); - return 4; - - - - default: - return -1; /* Bad format */ - } -} - - -/* -* Write a NW_Ucs2 into a buffer as UTF8. Returns number of bytes written -*/ -NW_Uint32 -NW_String_writeUTF8Char (NW_Ucs2 c, NW_Byte * buff) -{ - if (c <= 0x007F) - { - /* 0x0000 - 0x007F: 1 NW_Byte UTF-8 encoding. */ - buff[0] = (NW_Byte) c; - return 1; - } - else if (c > 0x07FF) - { - /* 0x0800 - 0xFFFF: 3 NW_Byte UTF-8 encoding. */ - buff[0] = (NW_Byte) (0xE0 | ((c >> 12) & 0x0F)); - buff[1] = (NW_Byte) (0x80 | ((c >> 6) & 0x3F)); - buff[2] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); - return 3; - } - else - { - /* 0x0080 - 0x07ff: 2 NW_Byte UTF-8 encoding. */ - buff[0] = (NW_Byte) (0xC0 | ((c >> 6) & 0x1F)); - buff[1] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F)); - return 2; - } -} - - -static NW_Int32 -ReadInt16Char (NW_Byte * buff, NW_Ucs2 * c) -{ - /* read unaligned native-endian to aligned native-endian */ - (void) NW_Mem_memcpy(c, buff, sizeof(NW_Ucs2)); - return sizeof(NW_Ucs2); -} - -static NW_Int32 -ReadISO88591Char (NW_Byte * buff, NW_Ucs2 * c) -{ - *c = buff[0]; - return 1; -} - -static NW_Int32 -ReadUSASCIIChar (NW_Byte * buff, NW_Ucs2 * c) -{ - *c = buff[0]; - return 1; -} - -/* -* Read one character of some encoding, returning the NW_Ucs2 -* equivalent and the count of raw characters read -* -* RETURN -1 if encoding is not supported -*/ -EXPORT_C NW_Int32 -NW_String_readChar (NW_Byte * buff, NW_Ucs2 * c, NW_Uint32 encoding) -{ - NW_Int32 nbytes = 0; - - if (encoding == HTTP_iso_10646_ucs_2) - return ReadInt16Char (&buff[nbytes], c); - else if (encoding == HTTP_utf_8) - return ReadUTF8Char (&buff[nbytes], c); - else if (encoding == HTTP_iso_8859_1) - return ReadISO88591Char (&buff[nbytes], c); - else if (encoding == HTTP_us_ascii) - return ReadUSASCIIChar (&buff[nbytes], c); - - return -1; -} - - -/* -* Get the length of a character string in some encoding. Returns the number -* of characters (less the terminating char). The out param byte_count returns -* the number of bytes of storage scanned (including the terminating char). -* Note that there is NO validity check here. This should be done first if -* needed. TODO: Also note that the validity check could return the length -* directly, thus eliminating the need for call to this function when -* doint32 validity checkint32. -*/ -EXPORT_C NW_Int32 -NW_String_charBuffGetLength (void *buffer, NW_Uint32 encoding, NW_Uint32 * byte_count) -{ - NW_Int32 chars = 0; - NW_Ucs2 c = 1; - NW_Int32 retval = 0; - - *byte_count = 0; - while (c) - { - c = 0; /* partial protection against an infinite loop */ - retval = NW_String_readChar ((NW_Byte *) buffer + *byte_count, &c, encoding); - if(retval < 0){ - return -1; - } - (*byte_count) += (NW_Uint32) retval; - chars++; - } - - return chars - 1; -} - - -/* -* Conversions among character strings of various types and ucs2. -* These functions assume that the length in characters of the -* input buffer has been pre-calculated, so that this operation -* doesn't have to be performed for every conversion. This works well -* for String_t which store the character count. -* -* RETURN NULL if malloc fails -*/ -NW_String_UCS2Buff_t * -NW_String_charToUCS2Buff (NW_Byte * s, NW_Uint32 encoding) -{ - NW_String_UCS2Buff_t *storage; - NW_Ucs2 c; - NW_Int32 i; - NW_Int32 count = 0; - NW_Int32 length = 0; - NW_Uint32 byteCount = 0; - NW_Int32 retval = 0; - - if (!NW_String_charsetValid(encoding)) - { - return NULL; - } - - length = NW_String_charBuffGetLength(s, encoding, &byteCount); - if(length < 0){ - return NULL; - } - storage = - (NW_String_UCS2Buff_t*) - NW_Mem_Malloc(((NW_Uint32)length + 1) * sizeof (NW_String_UCS2Buff_t)); - if (storage == NULL) - { - return NULL; - } - - for (i = 0; i < length; i++) - { - retval = NW_String_readChar (s + count, &c, encoding); - if(retval < 0){ - NW_Mem_Free(storage); - return NULL; - } - count += retval; - storage[i].bytes[0] = (NW_Byte) ((c & 0xff00) >> 8); - storage[i].bytes[1] = (NW_Byte) (c & 0xff); - } - storage[length].bytes[0] = 0; - storage[length].bytes[1] = 0; - - return storage; -} - - -/* -* TODO: is this a public or private function ??? -*/ -NW_String_UCS2Buff_t * -NW_String_UTF8ToUCS2Buff (NW_Byte * s) -{ - return NW_String_charToUCS2Buff (s, HTTP_utf_8); -} - - -/* -* TODO: is this a public or private function ??? -*/ -NW_String_UCS2Buff_t * -NW_String_ISO88591ToUCS2Buff (NW_Byte * s) -{ - return NW_String_charToUCS2Buff (s, HTTP_iso_8859_1); -} - - -/* -* RETURN NULL if malloc fails -*/ -NW_Byte * -NW_String_UCS2ToUTF8 (NW_String_UCS2Buff_t * s, NW_Uint32 length) -{ - NW_Byte *tstore; - NW_Byte *storage; - NW_Ucs2 c; - NW_Uint32 i; - NW_Int32 count = 0; - NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, - because s is a structure and the size of a structure is not fixed - in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte - (address alignment issue) */ - - tstore = (NW_Byte *) NW_Mem_Malloc ((length + 1) * 3); - if (tstore == NULL) - { - return NULL; - } - - for (i = 0; i < length; i++) - { - ReadInt16Char ((NW_Byte *) (src + i), &c); - count += NW_String_writeUTF8Char (c, tstore + count); - } - *(tstore + count) = 0; - storage = (NW_Byte *) NW_Mem_Malloc (count + 1); - if (storage) - { - NW_Mem_memcpy (storage, tstore, count + 1); - } - NW_Mem_Free (tstore); - - return storage; -} - - -/* -* RETURN NULL if malloc fails -* byteCount is total allocation size of s as far as conversion is concerned -*/ -NW_Byte * -NW_String_UCS2ToISO88591 (NW_String_UCS2Buff_t * s, NW_Uint32 byteCount) -{ - NW_Byte *storage = NULL; - NW_Ucs2 c; - NW_Uint32 i; - NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, - because s is a structure and the size of a structure is not fixed - in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte - (address alignment issue) */ - - storage = (NW_Byte *) NW_Mem_Malloc (byteCount + 1); - if (storage == NULL) - { - return NULL; - } - - for (i = 0; i < byteCount; i++) - { - ReadInt16Char ((NW_Byte *) (src + i), &c); - storage[i] = (NW_Byte) (c & 0xff); - } - storage[byteCount] = 0; - - return storage; -} - -/* Ordered comparison of ucs2 strings */ -NW_Int32 -NW_String_UCS2BuffCmp (NW_String_UCS2Buff_t * s1, - NW_String_UCS2Buff_t * s2, - NW_Bool matchCase) -{ - NW_Ucs2 c1, c2; - NW_Ucs2 *src1 = (NW_Ucs2 *)s1; /*WMS we should use UCS2 pointer, */ - NW_Ucs2 *src2 = (NW_Ucs2 *)s2; /*because s is a structure and the size of a structure is not fixed - in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte - (address alignment issue) */ - - while ( ( *src1 ) || ( *src2 ) ) - { - ReadInt16Char ((NW_Byte *) src1++, &c1); - ReadInt16Char ((NW_Byte *) src2++, &c2); - - if (matchCase == NW_FALSE) { - c1 = CXML_Str_ToLower (c1); - c2 = CXML_Str_ToLower (c2); - } - if (c1 == c2) - { - continue; - } - return (c1 < c2) ? -1 : 1; - } - - return 0; - -} - - -/* Assumes s2 is null terminated, native byte order -and aligned for 16-bit access */ -NW_Status_t -NW_String_CmpToNativeAlignedUCS2 (NW_Uint32 encoding, NW_Uint32 charCount, - NW_Uint8 * s1, NW_Uint16 * s2, - NW_Int32 * r) -{ - NW_Uint32 i; - NW_Int32 byteCount = 0; - NW_Ucs2 c1; - - for (i = 0; i < charCount; i++, s1 += byteCount, s2++) { - byteCount = NW_String_readChar (s1, &c1, encoding); - if (byteCount < 0) { - return NW_STAT_FAILURE; - } - *r = c1 - *s2; - if (*r || (*s2 == 0)) { - break; - } - } - /* You can exit the above loop three ways: i == charCount or - when i != charCount because one of mismatch or null termination - of s2 is encountered. The only one that needs a fixup is if - i == charCount but s2 isn't at null termination. */ - - /*lint -e{794} Conceivable use of null pointer */ - if ((i == charCount) && (*s2 != 0)) { - *r = -*s2; - } - return NW_STAT_SUCCESS; -}