xml/cxmllibrary/src/string/src/char.c
branchRCL_3
changeset 20 889504eac4fb
equal deleted inserted replaced
19:6bcc0aa4be39 20:889504eac4fb
       
     1 /*
       
     2 * Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of the License "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 /*****************************************************************
       
     20 **  File: char.c
       
    21 **  Description:
       
    22 * 
       
    23 * Note: all of the char functions assume that the input buffer points
       
    24 * to an array of characters which contains a null somewhere and is
       
    25 * correctly encoded for the particular encoding. Bad things can happen
       
    26 * if this is not the case. In order to avoid having to check these 
       
    27 * conditions on every operation, a set of validate functions is provided
       
    28 * to pre-test a string where the caller is not sure these conditions
       
    29 * are met.  It is especially important, when calling character operations
       
    30 * on bytecode, to make sure to validate all strings.
       
    31 * 
       
    32 *****************************************************************/
       
    33 #include "cxml_internal.h"
       
    34 #include <xml/cxml/nw_string_char.h>
       
    35 
       
    36 /*  
       
    37 * TODO:  Note that there is some duplication between the Validate*
       
    38 * calls and the string length function.  The Validate*S could return
       
    39 * the length too.
       
    40 */
       
    41 
       
    42 /*
       
    43 * Check that storage points to a valid UTF8 string no longer
       
    44 * than length bytes.
       
    45 */
       
    46 static NW_Int32
       
    47 StringValidUTF8 (NW_Byte * storage, NW_Uint32 length)
       
    48 {
       
    49   NW_Uint32 i;
       
    50   NW_Byte bits;
       
    51 
       
    52   NW_ASSERT(storage != NULL);
       
    53   NW_ASSERT(length != 0);
       
    54 
       
    55   for (i = 0; i < length;)
       
    56   {
       
    57     if (storage[i] == 0)
       
    58       {
       
    59         return 1;
       
    60       }
       
    61     bits = (NW_Byte) (storage[i] >> 4);
       
    62     if (bits < 8)
       
    63     {
       
    64           i++;
       
    65     }
       
    66     else if ((bits == 12) || (bits == 13))
       
    67     { 
       
    68   	  i+=2;
       
    69     }
       
    70     else if (bits == 14)
       
    71     {
       
    72 	    i += 3;
       
    73     }
       
    74     else if (bits == 15)
       
    75     {
       
    76 	    i += 4;
       
    77     }
       
    78     else
       
    79     {
       
    80           return 0;
       
    81     }
       
    82   }    
       
    83   return 0;
       
    84 }
       
    85 
       
    86 
       
    87 /*  
       
    88 * Check validity of UCS2 string storage 
       
    89 */
       
    90 static NW_Int32
       
    91 StringValidUCS2 (NW_Byte * storage, NW_Uint32 length)
       
    92 {
       
    93   NW_Uint32 i;
       
    94     
       
    95   NW_ASSERT(storage != NULL);
       
    96   NW_ASSERT(length != 0);
       
    97 
       
    98   for (i = 0; i < (length - 1); i += 2)
       
    99   {
       
   100     if (((storage[i] << 8) | storage[i + 1]) == 0)
       
   101     {
       
   102       return 1;
       
   103     }
       
   104   }
       
   105   return 0;
       
   106 }
       
   107 
       
   108 
       
   109 /* 
       
   110 * Check validity of ISO8859 string storage
       
   111 */
       
   112 static NW_Int32
       
   113 StringValidISO88591 (NW_Byte * storage, NW_Uint32 length)
       
   114 {
       
   115 
       
   116   NW_Uint32 i;
       
   117 
       
   118   NW_ASSERT(storage != NULL);
       
   119   NW_ASSERT(length != 0);
       
   120 
       
   121   for (i = 0; i < length; i++)
       
   122   {
       
   123     if (storage[i] == 0)
       
   124     {
       
   125 	return 1;
       
   126     }
       
   127   }
       
   128   return 0;
       
   129 }
       
   130 
       
   131 
       
   132 /* 
       
   133 * Check validity of ASCII string storage
       
   134 */
       
   135 static NW_Int32
       
   136 StringValidUSASCII (NW_Byte * storage, NW_Uint32 length)
       
   137 {
       
   138   NW_Uint32 i;
       
   139 
       
   140   NW_ASSERT(storage != NULL);
       
   141   NW_ASSERT(length != 0);
       
   142 
       
   143   for (i = 0; i < length; i++)
       
   144   {
       
   145     if (storage[i] == 0)
       
   146     {
       
   147 	return 1;
       
   148     }
       
   149   }
       
   150   return 0;
       
   151 }
       
   152 
       
   153 
       
   154 /*
       
   155 * Check the given charset encoding (MIBENUM) and if it
       
   156 *              is supported.
       
   157 */
       
   158 
       
   159 NW_Status_t
       
   160 NW_String_charsetValid (NW_Uint32 encoding)
       
   161 {
       
   162   switch (encoding)
       
   163   {
       
   164   case HTTP_iso_10646_ucs_2:
       
   165   case HTTP_iso_8859_1:
       
   166   case HTTP_utf_8:
       
   167   case HTTP_us_ascii:
       
   168     return NW_STAT_SUCCESS;
       
   169   default:
       
   170     return NW_STAT_WBXML_ERROR_CHARSET_UNSUPPORTED;
       
   171   }
       
   172 }
       
   173 
       
   174 
       
   175 /*
       
   176 * RETURN -1 if the encoding is not supported
       
   177 */
       
   178 NW_Int32
       
   179 NW_String_valid(NW_Byte * storage, NW_Uint32 length, NW_Uint32 encoding)
       
   180 {
       
   181   if (encoding == HTTP_iso_10646_ucs_2)
       
   182   {
       
   183 	  return StringValidUCS2 (storage, length);
       
   184   }
       
   185   else if (encoding == HTTP_utf_8)
       
   186   {
       
   187 	  return StringValidUTF8 (storage, length);
       
   188   }
       
   189   else if (encoding == HTTP_iso_8859_1)
       
   190   {
       
   191 	  return StringValidISO88591 (storage, length);
       
   192   }
       
   193   else if (encoding == HTTP_us_ascii)
       
   194   {
       
   195 	  return StringValidUSASCII (storage, length);
       
   196   }
       
   197   
       
   198   return -1;
       
   199 }
       
   200 
       
   201 
       
   202 /* 
       
   203 * TODO:  The following routines are taken from Rainbow.
       
   204 * They should be revisited for better efficiency, etc.
       
   205 */
       
   206 
       
   207 /*
       
   208 * Read one UTF8 character from a buffer and store it as a NW_Ucs2.
       
   209 * Returns number of input bytes read.
       
   210 */
       
   211 static NW_Int32
       
   212 ReadUTF8Char (NW_Byte * buff, NW_Ucs2 * c)
       
   213 {
       
   214   switch ((buff[0] >> 4) & 0xf)
       
   215   {
       
   216   case 0:
       
   217   case 1:
       
   218   case 2:
       
   219   case 3:
       
   220   case 4:
       
   221   case 5:
       
   222   case 6:
       
   223   case 7:
       
   224     /* 1 NW_Byte */
       
   225     *c = (NW_Ucs2) buff[0];
       
   226     return 1;
       
   227     
       
   228   case 12:  
       
   229   case 13:
       
   230     /* 2 bytes */
       
   231     if ((buff[1] & 0xC0) != 0x80)
       
   232     {
       
   233 	     return -1;
       
   234     }
       
   235     *c = (NW_Ucs2) (((buff[0] & 0x1F) << 6) | (buff[1] & 0x3F));
       
   236     return 2;
       
   237     
       
   238   case 14:
       
   239     /* 3 bytes */
       
   240     if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80))
       
   241     {
       
   242 	     return -1;
       
   243     }
       
   244     *c = (NW_Ucs2) (((buff[0] & 0x0F) << 12) |
       
   245       ((buff[1] & 0x3F) << 6) | ((buff[2] & 0x3F) << 0));
       
   246     return 3;
       
   247 
       
   248   //we used not to handle 4-bytes UTF-8 case (only 16 bits handled), the case 15 is newly added, it may cause
       
   249   //problem if in an application the a 4-byte character would convert to ucs2 encoding. 
       
   250   case 15:  
       
   251     /* 4 bytes */
       
   252 
       
   253     if (((buff[1] & 0xC0) != 0x80) && ((buff[2] & 0xC0) != 0x80) && ((buff[3] & 0xC0) != 0x80))
       
   254     {
       
   255 	     return -1;
       
   256     }
       
   257     *c = (((buff[0] & 0x07) << 18) |
       
   258           ((buff[1] & 0x3F) << 12) |
       
   259           ((buff[2] & 0x3F) << 6) |
       
   260            (buff[3] & 0x3F));
       
   261     return 4;
       
   262 
       
   263 
       
   264     
       
   265   default:
       
   266     return -1;    /* Bad format */
       
   267   }
       
   268 }
       
   269 
       
   270 
       
   271 /* 
       
   272 * Write a NW_Ucs2 into a buffer as UTF8. Returns number of bytes written 
       
   273 */
       
   274 NW_Uint32
       
   275 NW_String_writeUTF8Char (NW_Ucs2 c, NW_Byte * buff)
       
   276 {
       
   277   if (c <= 0x007F)
       
   278   {
       
   279     /* 0x0000 - 0x007F: 1 NW_Byte UTF-8 encoding. */
       
   280     buff[0] = (NW_Byte) c;
       
   281     return 1;
       
   282   }
       
   283   else if (c > 0x07FF)
       
   284   {
       
   285     /* 0x0800 - 0xFFFF: 3 NW_Byte UTF-8 encoding. */
       
   286     buff[0] = (NW_Byte) (0xE0 | ((c >> 12) & 0x0F));
       
   287     buff[1] = (NW_Byte) (0x80 | ((c >> 6) & 0x3F));
       
   288     buff[2] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F));
       
   289     return 3;
       
   290   }
       
   291   else
       
   292   {
       
   293     /* 0x0080 - 0x07ff: 2 NW_Byte UTF-8 encoding. */
       
   294     buff[0] = (NW_Byte) (0xC0 | ((c >> 6) & 0x1F));
       
   295     buff[1] = (NW_Byte) (0x80 | ((c >> 0) & 0x3F));
       
   296     return 2;
       
   297   }
       
   298 }
       
   299 
       
   300 
       
   301 static NW_Int32
       
   302 ReadInt16Char (NW_Byte * buff, NW_Ucs2 * c)
       
   303 {
       
   304   /* read unaligned native-endian to aligned native-endian */
       
   305   (void) NW_Mem_memcpy(c, buff, sizeof(NW_Ucs2));
       
   306   return sizeof(NW_Ucs2);
       
   307 }
       
   308 
       
   309 static NW_Int32
       
   310 ReadISO88591Char (NW_Byte * buff, NW_Ucs2 * c)
       
   311 {
       
   312   *c = buff[0];
       
   313   return 1;
       
   314 }
       
   315 
       
   316 static NW_Int32
       
   317 ReadUSASCIIChar (NW_Byte * buff, NW_Ucs2 * c)
       
   318 {
       
   319   *c = buff[0];
       
   320   return 1;
       
   321 }
       
   322 
       
   323 /*
       
   324 * Read one character of some encoding, returning the NW_Ucs2 
       
   325 * equivalent and the count of raw characters read
       
   326 *
       
   327 * RETURN -1 if encoding is not supported
       
   328 */
       
   329 EXPORT_C NW_Int32
       
   330 NW_String_readChar (NW_Byte * buff, NW_Ucs2 * c, NW_Uint32 encoding)
       
   331 {
       
   332   NW_Int32 nbytes = 0;
       
   333   
       
   334   if (encoding == HTTP_iso_10646_ucs_2)
       
   335     return ReadInt16Char (&buff[nbytes], c);
       
   336   else if (encoding == HTTP_utf_8)
       
   337     return ReadUTF8Char (&buff[nbytes], c);
       
   338   else if (encoding == HTTP_iso_8859_1)
       
   339     return ReadISO88591Char (&buff[nbytes], c);
       
   340   else if (encoding == HTTP_us_ascii)
       
   341     return ReadUSASCIIChar (&buff[nbytes], c);
       
   342 
       
   343   return -1;
       
   344 }
       
   345 
       
   346 
       
   347 /* 
       
   348 * Get the length of a character string in some encoding. Returns the number
       
   349 * of characters (less the terminating char). The out param byte_count returns
       
   350 * the number of bytes of storage scanned (including the terminating char).
       
   351 * Note that there is NO validity check here. This should be done first if
       
   352 * needed.  TODO:  Also note that the validity check could return the length 
       
   353 * directly, thus eliminating the need for call to this function when 
       
   354 * doint32 validity checkint32.
       
   355 */
       
   356 EXPORT_C NW_Int32
       
   357 NW_String_charBuffGetLength (void *buffer, NW_Uint32 encoding, NW_Uint32 * byte_count)
       
   358 {
       
   359   NW_Int32 chars = 0;
       
   360   NW_Ucs2 c = 1;
       
   361   NW_Int32 retval = 0;
       
   362   
       
   363   *byte_count = 0;
       
   364   while (c)
       
   365   {
       
   366     c = 0; /* partial protection against an infinite loop */
       
   367     retval = NW_String_readChar ((NW_Byte *) buffer + *byte_count, &c, encoding);
       
   368     if(retval < 0){
       
   369       return -1;
       
   370     }
       
   371     (*byte_count) += (NW_Uint32) retval;
       
   372     chars++;
       
   373   }
       
   374   
       
   375   return chars - 1;
       
   376 }
       
   377 
       
   378 
       
   379 /* 
       
   380 * Conversions among character strings of various types and ucs2.
       
   381 * These functions assume that the length in characters of the 
       
   382 * input buffer has been pre-calculated, so that this operation
       
   383 * doesn't have to be performed for every conversion. This works well
       
   384 * for String_t which store the character count.
       
   385 *
       
   386 * RETURN NULL if malloc fails
       
   387 */
       
   388 NW_String_UCS2Buff_t *
       
   389 NW_String_charToUCS2Buff (NW_Byte * s, NW_Uint32 encoding)
       
   390 {
       
   391   NW_String_UCS2Buff_t *storage;
       
   392   NW_Ucs2 c;
       
   393   NW_Int32 i;
       
   394   NW_Int32 count = 0;
       
   395   NW_Int32 length = 0;
       
   396   NW_Uint32 byteCount = 0;
       
   397   NW_Int32 retval = 0;
       
   398   
       
   399   if (!NW_String_charsetValid(encoding))
       
   400   {
       
   401     return NULL;
       
   402   }
       
   403 
       
   404   length = NW_String_charBuffGetLength(s, encoding, &byteCount);
       
   405   if(length < 0){
       
   406     return NULL;
       
   407   }
       
   408   storage =
       
   409     (NW_String_UCS2Buff_t*) 
       
   410     NW_Mem_Malloc(((NW_Uint32)length + 1) * sizeof (NW_String_UCS2Buff_t));
       
   411   if (storage == NULL)
       
   412   {
       
   413     return NULL;
       
   414   }
       
   415   
       
   416   for (i = 0; i < length; i++)
       
   417   {
       
   418     retval = NW_String_readChar (s + count, &c, encoding);
       
   419     if(retval < 0){
       
   420       NW_Mem_Free(storage);
       
   421       return NULL;
       
   422     }
       
   423     count += retval;
       
   424     storage[i].bytes[0] = (NW_Byte) ((c & 0xff00) >> 8);
       
   425     storage[i].bytes[1] = (NW_Byte) (c & 0xff);
       
   426   }
       
   427   storage[length].bytes[0] = 0;
       
   428   storage[length].bytes[1] = 0;
       
   429   
       
   430   return storage;
       
   431 }
       
   432 
       
   433 
       
   434 /*
       
   435 * TODO: is this a public or private function ???
       
   436 */
       
   437 NW_String_UCS2Buff_t *
       
   438 NW_String_UTF8ToUCS2Buff (NW_Byte * s)
       
   439 {
       
   440   return NW_String_charToUCS2Buff (s, HTTP_utf_8);
       
   441 }
       
   442 
       
   443 
       
   444 /*
       
   445 * TODO: is this a public or private function ???
       
   446 */
       
   447 NW_String_UCS2Buff_t *
       
   448 NW_String_ISO88591ToUCS2Buff (NW_Byte * s)
       
   449 {
       
   450   return NW_String_charToUCS2Buff (s, HTTP_iso_8859_1);
       
   451 }
       
   452 
       
   453 
       
   454 /*
       
   455 * RETURN NULL if malloc fails
       
   456 */
       
   457 NW_Byte *
       
   458 NW_String_UCS2ToUTF8 (NW_String_UCS2Buff_t * s, NW_Uint32 length)
       
   459 {
       
   460   NW_Byte *tstore;
       
   461   NW_Byte *storage;
       
   462   NW_Ucs2 c;
       
   463   NW_Uint32 i;
       
   464   NW_Int32 count = 0;
       
   465   NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, 
       
   466                                  because s is a structure and the size of a structure is not fixed
       
   467 					             in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte 
       
   468                                  (address alignment issue) */
       
   469     
       
   470   tstore = (NW_Byte *) NW_Mem_Malloc ((length + 1) * 3);
       
   471   if (tstore == NULL)
       
   472   {
       
   473     return NULL;
       
   474   }
       
   475   
       
   476   for (i = 0; i < length; i++)
       
   477     {
       
   478       ReadInt16Char ((NW_Byte *) (src + i), &c);
       
   479       count += NW_String_writeUTF8Char (c, tstore + count);
       
   480     }
       
   481     *(tstore + count) = 0;
       
   482     storage = (NW_Byte *) NW_Mem_Malloc (count + 1);
       
   483     if (storage)
       
   484     {
       
   485       NW_Mem_memcpy (storage, tstore, count + 1);
       
   486     }
       
   487     NW_Mem_Free (tstore);
       
   488     
       
   489     return storage;
       
   490 }
       
   491 
       
   492 
       
   493 /*
       
   494 * RETURN NULL if malloc fails
       
   495 * byteCount is total allocation size of s as far as conversion is concerned
       
   496 */
       
   497 NW_Byte *
       
   498 NW_String_UCS2ToISO88591 (NW_String_UCS2Buff_t * s, NW_Uint32 byteCount)
       
   499 {
       
   500   NW_Byte *storage = NULL;
       
   501   NW_Ucs2 c;
       
   502   NW_Uint32 i;
       
   503   NW_Ucs2 *src = (NW_Ucs2 *)s; /*WMS we should use UCS2 pointer, 
       
   504                                  because s is a structure and the size of a structure is not fixed
       
   505 					             in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte 
       
   506                                  (address alignment issue) */
       
   507  
       
   508   storage = (NW_Byte *) NW_Mem_Malloc (byteCount + 1);
       
   509   if (storage == NULL)
       
   510   {
       
   511     return NULL;
       
   512   }
       
   513   
       
   514   for (i = 0; i < byteCount; i++)
       
   515   {
       
   516     ReadInt16Char ((NW_Byte *) (src + i), &c);
       
   517     storage[i] = (NW_Byte) (c & 0xff);
       
   518   }
       
   519   storage[byteCount] = 0;
       
   520   
       
   521   return storage;
       
   522 }
       
   523 
       
   524 /* Ordered comparison of ucs2 strings */
       
   525 NW_Int32
       
   526 NW_String_UCS2BuffCmp (NW_String_UCS2Buff_t * s1, 
       
   527                        NW_String_UCS2Buff_t * s2,
       
   528                        NW_Bool matchCase)
       
   529 {
       
   530   NW_Ucs2 c1, c2;
       
   531   NW_Ucs2 *src1 = (NW_Ucs2 *)s1; /*WMS we should use UCS2 pointer, */
       
   532   NW_Ucs2 *src2 = (NW_Ucs2 *)s2; /*because s is a structure and the size of a structure is not fixed
       
   533 					             in ARM processor, the size of NW_String_UCS2Buff_t is 4 byte 
       
   534                                  (address alignment issue) */
       
   535 
       
   536   while ( ( *src1 ) || ( *src2 ) )
       
   537   {
       
   538     ReadInt16Char ((NW_Byte *) src1++, &c1);
       
   539     ReadInt16Char ((NW_Byte *) src2++, &c2);
       
   540 
       
   541     if (matchCase == NW_FALSE) {
       
   542       c1 = CXML_Str_ToLower (c1);
       
   543       c2 = CXML_Str_ToLower (c2);
       
   544     }
       
   545     if (c1 == c2)
       
   546     {
       
   547       continue;
       
   548     }
       
   549     return (c1 < c2) ? -1 : 1;
       
   550   }
       
   551   
       
   552   return 0;
       
   553   
       
   554 }
       
   555 
       
   556 
       
   557 /* Assumes s2 is null terminated, native byte order
       
   558 and aligned for 16-bit access */
       
   559 NW_Status_t
       
   560 NW_String_CmpToNativeAlignedUCS2 (NW_Uint32 encoding, NW_Uint32 charCount,
       
   561                                   NW_Uint8 * s1, NW_Uint16 * s2,
       
   562                                   NW_Int32 * r)
       
   563 {
       
   564     NW_Uint32 i;
       
   565     NW_Int32 byteCount = 0;
       
   566     NW_Ucs2 c1;
       
   567 
       
   568     for (i = 0; i < charCount; i++, s1 += byteCount, s2++) {
       
   569         byteCount = NW_String_readChar (s1, &c1, encoding);
       
   570         if (byteCount < 0) {
       
   571             return NW_STAT_FAILURE;
       
   572         }
       
   573         *r = c1 - *s2;
       
   574         if (*r || (*s2 == 0)) {
       
   575             break;
       
   576         }
       
   577     }
       
   578     /* You can exit the above loop three ways: i == charCount or
       
   579     when i != charCount because one of mismatch or null termination
       
   580     of s2 is encountered.  The only one that needs a fixup is if
       
   581     i == charCount but s2 isn't at null termination. */
       
   582 
       
   583     /*lint -e{794} Conceivable use of null pointer */
       
   584     if ((i == charCount) && (*s2 != 0)) {
       
   585         *r = -*s2;
       
   586     }
       
   587     return NW_STAT_SUCCESS;
       
   588 }