javacommons/utils/src/convertutf.cpp
changeset 21 2a9601315dfc
equal deleted inserted replaced
18:e8e63152f320 21:2a9601315dfc
       
     1 /*
       
     2  * Copyright 2001-2004 Unicode, Inc.
       
     3  *
       
     4  * Disclaimer
       
     5  *
       
     6  * This source code is provided as is by Unicode, Inc. No claims are
       
     7  * made as to fitness for any particular purpose. No warranties of any
       
     8  * kind are expressed or implied. The recipient agrees to determine
       
     9  * applicability of information provided. If this file has been
       
    10  * purchased on magnetic or optical media from Unicode, Inc., the
       
    11  * sole remedy for any claim will be exchange of defective media
       
    12  * within 90 days of receipt.
       
    13  *
       
    14  * Limitations on Rights to Redistribute This Code
       
    15  *
       
    16  * Unicode, Inc. hereby grants the right to freely use the information
       
    17  * supplied in this file in the creation of products supporting the
       
    18  * Unicode Standard, and to make copies of this file in any form
       
    19  * for internal or external distribution as long as this notice
       
    20  * remains attached.
       
    21  */
       
    22 
       
    23 /* ---------------------------------------------------------------------
       
    24 
       
    25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
       
    26     Author: Mark E. Davis, 1994.
       
    27     Rev History: Rick McGowan, fixes & updates May 2001.
       
    28     Sept 2001: fixed const & error conditions per
       
    29     mods suggested by S. Parent & A. Lillich.
       
    30     June 2002: Tim Dodd added detection and handling of incomplete
       
    31     source sequences, enhanced error detection, added casts
       
    32     to eliminate compiler warnings.
       
    33     July 2003: slight mods to back out aggressive FFFE detection.
       
    34     Jan 2004: updated switches in from-UTF8 conversions.
       
    35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
       
    36 
       
    37     See the header file "ConvertUTF.h" for complete documentation.
       
    38 
       
    39 ------------------------------------------------------------------------ */
       
    40 
       
    41 
       
    42 #include "convertutf.h"
       
    43 #ifdef CVTUTF_DEBUG
       
    44 #include <stdio.h>
       
    45 #endif
       
    46 
       
    47 static const int halfShift  = 10; /* used for shifting by 10 bits */
       
    48 
       
    49 static const UTF32 halfBase = 0x0010000UL;
       
    50 static const UTF32 halfMask = 0x3FFUL;
       
    51 
       
    52 #define UNI_SUR_HIGH_START  (UTF32)0xD800
       
    53 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
       
    54 #define UNI_SUR_LOW_START   (UTF32)0xDC00
       
    55 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
       
    56 #define false      0
       
    57 #define true        1
       
    58 
       
    59 /* --------------------------------------------------------------------- */
       
    60 
       
    61 ConversionResult ConvertUTF32toUTF16(
       
    62     const UTF32** sourceStart, const UTF32* sourceEnd,
       
    63     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
       
    64 {
       
    65     ConversionResult result = conversionOK;
       
    66     const UTF32* source = *sourceStart;
       
    67     UTF16* target = *targetStart;
       
    68     while (source < sourceEnd)
       
    69     {
       
    70         UTF32 ch;
       
    71         if (target >= targetEnd)
       
    72         {
       
    73             result = targetExhausted;
       
    74             break;
       
    75         }
       
    76         ch = *source++;
       
    77         if (ch <= UNI_MAX_BMP)   /* Target is a character <= 0xFFFF */
       
    78         {
       
    79             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
       
    80             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
       
    81             {
       
    82                 if (flags == strictConversion)
       
    83                 {
       
    84                     --source; /* return to the illegal value itself */
       
    85                     result = sourceIllegal;
       
    86                     break;
       
    87                 }
       
    88                 else
       
    89                 {
       
    90                     *target++ = UNI_REPLACEMENT_CHAR;
       
    91                 }
       
    92             }
       
    93             else
       
    94             {
       
    95                 *target++ = (UTF16)ch; /* normal case */
       
    96             }
       
    97         }
       
    98         else if (ch > UNI_MAX_LEGAL_UTF32)
       
    99         {
       
   100             if (flags == strictConversion)
       
   101             {
       
   102                 result = sourceIllegal;
       
   103             }
       
   104             else
       
   105             {
       
   106                 *target++ = UNI_REPLACEMENT_CHAR;
       
   107             }
       
   108         }
       
   109         else
       
   110         {
       
   111             /* target is a character in range 0xFFFF - 0x10FFFF. */
       
   112             if (target + 1 >= targetEnd)
       
   113             {
       
   114                 --source; /* Back up source pointer! */
       
   115                 result = targetExhausted;
       
   116                 break;
       
   117             }
       
   118             ch -= halfBase;
       
   119             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
       
   120             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
       
   121         }
       
   122     }
       
   123     *sourceStart = source;
       
   124     *targetStart = target;
       
   125     return result;
       
   126 }
       
   127 
       
   128 /* --------------------------------------------------------------------- */
       
   129 
       
   130 ConversionResult ConvertUTF16toUTF32(
       
   131     const UTF16** sourceStart, const UTF16* sourceEnd,
       
   132     UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
       
   133 {
       
   134     ConversionResult result = conversionOK;
       
   135     const UTF16* source = *sourceStart;
       
   136     UTF32* target = *targetStart;
       
   137     UTF32 ch, ch2;
       
   138     while (source < sourceEnd)
       
   139     {
       
   140         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
       
   141         ch = *source++;
       
   142         /* If we have a surrogate pair, convert to UTF32 first. */
       
   143         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
       
   144         {
       
   145             /* If the 16 bits following the high surrogate are in the source buffer... */
       
   146             if (source < sourceEnd)
       
   147             {
       
   148                 ch2 = *source;
       
   149                 /* If it's a low surrogate, convert to UTF32. */
       
   150                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
       
   151                 {
       
   152                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
       
   153                          + (ch2 - UNI_SUR_LOW_START) + halfBase;
       
   154                     ++source;
       
   155                 }
       
   156                 else if (flags == strictConversion)   /* it's an unpaired high surrogate */
       
   157                 {
       
   158                     --source; /* return to the illegal value itself */
       
   159                     result = sourceIllegal;
       
   160                     break;
       
   161                 }
       
   162             }
       
   163             else   /* We don't have the 16 bits following the high surrogate. */
       
   164             {
       
   165                 --source; /* return to the high surrogate */
       
   166                 result = sourceExhausted;
       
   167                 break;
       
   168             }
       
   169         }
       
   170         else if (flags == strictConversion)
       
   171         {
       
   172             /* UTF-16 surrogate values are illegal in UTF-32 */
       
   173             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
       
   174             {
       
   175                 --source; /* return to the illegal value itself */
       
   176                 result = sourceIllegal;
       
   177                 break;
       
   178             }
       
   179         }
       
   180         if (target >= targetEnd)
       
   181         {
       
   182             source = oldSource; /* Back up source pointer! */
       
   183             result = targetExhausted;
       
   184             break;
       
   185         }
       
   186         *target++ = ch;
       
   187     }
       
   188     *sourceStart = source;
       
   189     *targetStart = target;
       
   190 #ifdef CVTUTF_DEBUG
       
   191     if (result == sourceIllegal)
       
   192     {
       
   193         fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
       
   194         fflush(stderr);
       
   195     }
       
   196 #endif
       
   197     return result;
       
   198 }
       
   199 
       
   200 /* --------------------------------------------------------------------- */
       
   201 
       
   202 /*
       
   203  * Index into the table below with the first byte of a UTF-8 sequence to
       
   204  * get the number of trailing bytes that are supposed to follow it.
       
   205  * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
       
   206  * left as-is for anyone who may want to do such conversion, which was
       
   207  * allowed in earlier algorithms.
       
   208  */
       
   209 static const char trailingBytesForUTF8[256] =
       
   210 {
       
   211     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   212     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   213     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   214     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   215     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   216     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   217     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
   218     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
       
   219 };
       
   220 
       
   221 /*
       
   222  * Magic values subtracted from a buffer value during UTF8 conversion.
       
   223  * This table contains as many values as there might be trailing bytes
       
   224  * in a UTF-8 sequence.
       
   225  */
       
   226 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
       
   227                                         0x03C82080UL, 0xFA082080UL, 0x82082080UL
       
   228                                         };
       
   229 
       
   230 /*
       
   231  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
       
   232  * into the first byte, depending on how many bytes follow.  There are
       
   233  * as many entries in this table as there are UTF-8 sequence types.
       
   234  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
       
   235  * for *legal* UTF-8 will be 4 or fewer bytes total.
       
   236  */
       
   237 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
       
   238 
       
   239 /* --------------------------------------------------------------------- */
       
   240 
       
   241 /* The interface converts a whole buffer to avoid function-call overhead.
       
   242  * Constants have been gathered. Loops & conditionals have been removed as
       
   243  * much as possible for efficiency, in favor of drop-through switches.
       
   244  * (See "Note A" at the bottom of the file for equivalent code.)
       
   245  * If your compiler supports it, the "isLegalUTF8" call can be turned
       
   246  * into an inline function.
       
   247  */
       
   248 
       
   249 /* --------------------------------------------------------------------- */
       
   250 
       
   251 ConversionResult ConvertUTF16toUTF8(
       
   252     const UTF16** sourceStart, const UTF16* sourceEnd,
       
   253     UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
       
   254 {
       
   255     ConversionResult result = conversionOK;
       
   256     const UTF16* source = *sourceStart;
       
   257     UTF8* target = *targetStart;
       
   258     while (source < sourceEnd)
       
   259     {
       
   260         UTF32 ch;
       
   261         unsigned short bytesToWrite = 0;
       
   262         const UTF32 byteMask = 0xBF;
       
   263         const UTF32 byteMark = 0x80;
       
   264         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
       
   265         ch = *source++;
       
   266         /* If we have a surrogate pair, convert to UTF32 first. */
       
   267         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
       
   268         {
       
   269             /* If the 16 bits following the high surrogate are in the source buffer... */
       
   270             if (source < sourceEnd)
       
   271             {
       
   272                 UTF32 ch2 = *source;
       
   273                 /* If it's a low surrogate, convert to UTF32. */
       
   274                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
       
   275                 {
       
   276                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
       
   277                          + (ch2 - UNI_SUR_LOW_START) + halfBase;
       
   278                     ++source;
       
   279                 }
       
   280                 else if (flags == strictConversion)   /* it's an unpaired high surrogate */
       
   281                 {
       
   282                     --source; /* return to the illegal value itself */
       
   283                     result = sourceIllegal;
       
   284                     break;
       
   285                 }
       
   286             }
       
   287             else   /* We don't have the 16 bits following the high surrogate. */
       
   288             {
       
   289                 --source; /* return to the high surrogate */
       
   290                 result = sourceExhausted;
       
   291                 break;
       
   292             }
       
   293         }
       
   294         else if (flags == strictConversion)
       
   295         {
       
   296             /* UTF-16 surrogate values are illegal in UTF-32 */
       
   297             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
       
   298             {
       
   299                 --source; /* return to the illegal value itself */
       
   300                 result = sourceIllegal;
       
   301                 break;
       
   302             }
       
   303         }
       
   304         /* Figure out how many bytes the result will require */
       
   305         if (ch < (UTF32)0x80)
       
   306         {
       
   307             bytesToWrite = 1;
       
   308         }
       
   309         else if (ch < (UTF32)0x800)
       
   310         {
       
   311             bytesToWrite = 2;
       
   312         }
       
   313         else if (ch < (UTF32)0x10000)
       
   314         {
       
   315             bytesToWrite = 3;
       
   316         }
       
   317         else if (ch < (UTF32)0x110000)
       
   318         {
       
   319             bytesToWrite = 4;
       
   320         }
       
   321         else
       
   322         {
       
   323             bytesToWrite = 3;
       
   324             ch = UNI_REPLACEMENT_CHAR;
       
   325         }
       
   326 
       
   327         target += bytesToWrite;
       
   328         if (target > targetEnd)
       
   329         {
       
   330             source = oldSource; /* Back up source pointer! */
       
   331             target -= bytesToWrite;
       
   332             result = targetExhausted;
       
   333             break;
       
   334         }
       
   335         switch (bytesToWrite)   /* note: everything falls through. */
       
   336         {
       
   337         case 4:
       
   338             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   339             ch >>= 6;
       
   340         case 3:
       
   341             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   342             ch >>= 6;
       
   343         case 2:
       
   344             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   345             ch >>= 6;
       
   346         case 1:
       
   347             *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
       
   348         }
       
   349         target += bytesToWrite;
       
   350     }
       
   351     *sourceStart = source;
       
   352     *targetStart = target;
       
   353     return result;
       
   354 }
       
   355 
       
   356 /* --------------------------------------------------------------------- */
       
   357 
       
   358 /*
       
   359  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
       
   360  * This must be called with the length pre-determined by the first byte.
       
   361  * If not calling this from ConvertUTF8to*, then the length can be set by:
       
   362  *  length = trailingBytesForUTF8[*source]+1;
       
   363  * and the sequence is illegal right away if there aren't that many bytes
       
   364  * available.
       
   365  * If presented with a length > 4, this returns false.  The Unicode
       
   366  * definition of UTF-8 goes up to 4-byte sequences.
       
   367  */
       
   368 
       
   369 static Boolean isLegalUTF8(const UTF8 *source, int length)
       
   370 {
       
   371     UTF8 a;
       
   372     const UTF8 *srcptr = source+length;
       
   373     switch (length)
       
   374     {
       
   375     default:
       
   376         return false;
       
   377         /* Everything else falls through when "true"... */
       
   378     case 4:
       
   379         if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   380     case 3:
       
   381         if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   382     case 2:
       
   383         if ((a = (*--srcptr)) > 0xBF) return false;
       
   384 
       
   385         switch (*source)
       
   386         {
       
   387             /* no fall-through in this inner switch */
       
   388         case 0xE0:
       
   389             if (a < 0xA0) return false;
       
   390             break;
       
   391         case 0xED:
       
   392             if (a > 0x9F) return false;
       
   393             break;
       
   394         case 0xF0:
       
   395             if (a < 0x90) return false;
       
   396             break;
       
   397         case 0xF4:
       
   398             if (a > 0x8F) return false;
       
   399             break;
       
   400         default:
       
   401             if (a < 0x80) return false;
       
   402         }
       
   403 
       
   404     case 1:
       
   405         if (*source >= 0x80 && *source < 0xC2) return false;
       
   406     }
       
   407     if (*source > 0xF4) return false;
       
   408     return true;
       
   409 }
       
   410 
       
   411 /* --------------------------------------------------------------------- */
       
   412 
       
   413 /*
       
   414  * Exported function to return whether a UTF-8 sequence is legal or not.
       
   415  * This is not used here; it's just exported.
       
   416  */
       
   417 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
       
   418 {
       
   419     int length = trailingBytesForUTF8[*source]+1;
       
   420     if (source+length > sourceEnd)
       
   421     {
       
   422         return false;
       
   423     }
       
   424     return isLegalUTF8(source, length);
       
   425 }
       
   426 
       
   427 /* --------------------------------------------------------------------- */
       
   428 
       
   429 ConversionResult ConvertUTF8toUTF16(
       
   430     const UTF8** sourceStart, const UTF8* sourceEnd,
       
   431     UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags)
       
   432 {
       
   433     ConversionResult result = conversionOK;
       
   434     const UTF8* source = *sourceStart;
       
   435     UTF16* target = *targetStart;
       
   436     while (source < sourceEnd)
       
   437     {
       
   438         UTF32 ch = 0;
       
   439         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
       
   440         if (source + extraBytesToRead >= sourceEnd)
       
   441         {
       
   442             result = sourceExhausted;
       
   443             break;
       
   444         }
       
   445         /* Do this check whether lenient or strict */
       
   446         if (! isLegalUTF8(source, extraBytesToRead+1))
       
   447         {
       
   448             result = sourceIllegal;
       
   449             break;
       
   450         }
       
   451         /*
       
   452          * The cases all fall through. See "Note A" below.
       
   453          */
       
   454         switch (extraBytesToRead)
       
   455         {
       
   456         case 5:
       
   457             ch += *source++;
       
   458             ch <<= 6; /* remember, illegal UTF-8 */
       
   459         case 4:
       
   460             ch += *source++;
       
   461             ch <<= 6; /* remember, illegal UTF-8 */
       
   462         case 3:
       
   463             ch += *source++;
       
   464             ch <<= 6;
       
   465         case 2:
       
   466             ch += *source++;
       
   467             ch <<= 6;
       
   468         case 1:
       
   469             ch += *source++;
       
   470             ch <<= 6;
       
   471         case 0:
       
   472             ch += *source++;
       
   473         }
       
   474         ch -= offsetsFromUTF8[extraBytesToRead];
       
   475 
       
   476         if (target >= targetEnd)
       
   477         {
       
   478             source -= (extraBytesToRead+1); /* Back up source pointer! */
       
   479             result = targetExhausted;
       
   480             break;
       
   481         }
       
   482         if (ch <= UNI_MAX_BMP)   /* Target is a character <= 0xFFFF */
       
   483         {
       
   484             /* UTF-16 surrogate values are illegal in UTF-32 */
       
   485             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
       
   486             {
       
   487                 if (flags == strictConversion)
       
   488                 {
       
   489                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
       
   490                     result = sourceIllegal;
       
   491                     break;
       
   492                 }
       
   493                 else
       
   494                 {
       
   495                     *target++ = UNI_REPLACEMENT_CHAR;
       
   496                 }
       
   497             }
       
   498             else
       
   499             {
       
   500                 *target++ = (UTF16)ch; /* normal case */
       
   501             }
       
   502         }
       
   503         else if (ch > UNI_MAX_UTF16)
       
   504         {
       
   505             if (flags == strictConversion)
       
   506             {
       
   507                 result = sourceIllegal;
       
   508                 source -= (extraBytesToRead+1); /* return to the start */
       
   509                 break; /* Bail out; shouldn't continue */
       
   510             }
       
   511             else
       
   512             {
       
   513                 *target++ = UNI_REPLACEMENT_CHAR;
       
   514             }
       
   515         }
       
   516         else
       
   517         {
       
   518             /* target is a character in range 0xFFFF - 0x10FFFF. */
       
   519             if (target + 1 >= targetEnd)
       
   520             {
       
   521                 source -= (extraBytesToRead+1); /* Back up source pointer! */
       
   522                 result = targetExhausted;
       
   523                 break;
       
   524             }
       
   525             ch -= halfBase;
       
   526             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
       
   527             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
       
   528         }
       
   529     }
       
   530     *sourceStart = source;
       
   531     *targetStart = target;
       
   532     return result;
       
   533 }
       
   534 
       
   535 /* --------------------------------------------------------------------- */
       
   536 
       
   537 ConversionResult ConvertUTF32toUTF8(
       
   538     const UTF32** sourceStart, const UTF32* sourceEnd,
       
   539     UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags)
       
   540 {
       
   541     ConversionResult result = conversionOK;
       
   542     const UTF32* source = *sourceStart;
       
   543     UTF8* target = *targetStart;
       
   544     while (source < sourceEnd)
       
   545     {
       
   546         UTF32 ch;
       
   547         unsigned short bytesToWrite = 0;
       
   548         const UTF32 byteMask = 0xBF;
       
   549         const UTF32 byteMark = 0x80;
       
   550         ch = *source++;
       
   551         if (flags == strictConversion)
       
   552         {
       
   553             /* UTF-16 surrogate values are illegal in UTF-32 */
       
   554             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
       
   555             {
       
   556                 --source; /* return to the illegal value itself */
       
   557                 result = sourceIllegal;
       
   558                 break;
       
   559             }
       
   560         }
       
   561         /*
       
   562          * Figure out how many bytes the result will require. Turn any
       
   563          * illegally large UTF32 things (> Plane 17) into replacement chars.
       
   564          */
       
   565         if (ch < (UTF32)0x80)
       
   566         {
       
   567             bytesToWrite = 1;
       
   568         }
       
   569         else if (ch < (UTF32)0x800)
       
   570         {
       
   571             bytesToWrite = 2;
       
   572         }
       
   573         else if (ch < (UTF32)0x10000)
       
   574         {
       
   575             bytesToWrite = 3;
       
   576         }
       
   577         else if (ch <= UNI_MAX_LEGAL_UTF32)
       
   578         {
       
   579             bytesToWrite = 4;
       
   580         }
       
   581         else
       
   582         {
       
   583             bytesToWrite = 3;
       
   584             ch = UNI_REPLACEMENT_CHAR;
       
   585             result = sourceIllegal;
       
   586         }
       
   587 
       
   588         target += bytesToWrite;
       
   589         if (target > targetEnd)
       
   590         {
       
   591             --source; /* Back up source pointer! */
       
   592             target -= bytesToWrite;
       
   593             result = targetExhausted;
       
   594             break;
       
   595         }
       
   596         switch (bytesToWrite)   /* note: everything falls through. */
       
   597         {
       
   598         case 4:
       
   599             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   600             ch >>= 6;
       
   601         case 3:
       
   602             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   603             ch >>= 6;
       
   604         case 2:
       
   605             *--target = (UTF8)((ch | byteMark) & byteMask);
       
   606             ch >>= 6;
       
   607         case 1:
       
   608             *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
       
   609         }
       
   610         target += bytesToWrite;
       
   611     }
       
   612     *sourceStart = source;
       
   613     *targetStart = target;
       
   614     return result;
       
   615 }
       
   616 
       
   617 /* --------------------------------------------------------------------- */
       
   618 
       
   619 ConversionResult ConvertUTF8toUTF32(
       
   620     const UTF8** sourceStart, const UTF8* sourceEnd,
       
   621     UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags)
       
   622 {
       
   623     ConversionResult result = conversionOK;
       
   624     const UTF8* source = *sourceStart;
       
   625     UTF32* target = *targetStart;
       
   626     while (source < sourceEnd)
       
   627     {
       
   628         UTF32 ch = 0;
       
   629         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
       
   630         if (source + extraBytesToRead >= sourceEnd)
       
   631         {
       
   632             result = sourceExhausted;
       
   633             break;
       
   634         }
       
   635         /* Do this check whether lenient or strict */
       
   636         if (! isLegalUTF8(source, extraBytesToRead+1))
       
   637         {
       
   638             result = sourceIllegal;
       
   639             break;
       
   640         }
       
   641         /*
       
   642          * The cases all fall through. See "Note A" below.
       
   643          */
       
   644         switch (extraBytesToRead)
       
   645         {
       
   646         case 5:
       
   647             ch += *source++;
       
   648             ch <<= 6;
       
   649         case 4:
       
   650             ch += *source++;
       
   651             ch <<= 6;
       
   652         case 3:
       
   653             ch += *source++;
       
   654             ch <<= 6;
       
   655         case 2:
       
   656             ch += *source++;
       
   657             ch <<= 6;
       
   658         case 1:
       
   659             ch += *source++;
       
   660             ch <<= 6;
       
   661         case 0:
       
   662             ch += *source++;
       
   663         }
       
   664         ch -= offsetsFromUTF8[extraBytesToRead];
       
   665 
       
   666         if (target >= targetEnd)
       
   667         {
       
   668             source -= (extraBytesToRead+1); /* Back up the source pointer! */
       
   669             result = targetExhausted;
       
   670             break;
       
   671         }
       
   672         if (ch <= UNI_MAX_LEGAL_UTF32)
       
   673         {
       
   674             /*
       
   675              * UTF-16 surrogate values are illegal in UTF-32, and anything
       
   676              * over Plane 17 (> 0x10FFFF) is illegal.
       
   677              */
       
   678             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
       
   679             {
       
   680                 if (flags == strictConversion)
       
   681                 {
       
   682                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
       
   683                     result = sourceIllegal;
       
   684                     break;
       
   685                 }
       
   686                 else
       
   687                 {
       
   688                     *target++ = UNI_REPLACEMENT_CHAR;
       
   689                 }
       
   690             }
       
   691             else
       
   692             {
       
   693                 *target++ = ch;
       
   694             }
       
   695         }
       
   696         else   /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
       
   697         {
       
   698             result = sourceIllegal;
       
   699             *target++ = UNI_REPLACEMENT_CHAR;
       
   700         }
       
   701     }
       
   702     *sourceStart = source;
       
   703     *targetStart = target;
       
   704     return result;
       
   705 }
       
   706 
       
   707 /* ---------------------------------------------------------------------
       
   708 
       
   709     Note A.
       
   710     The fall-through switches in UTF-8 reading code save a
       
   711     temp variable, some decrements & conditionals.  The switches
       
   712     are equivalent to the following loop:
       
   713     {
       
   714         int tmpBytesToRead = extraBytesToRead+1;
       
   715         do {
       
   716         ch += *source++;
       
   717         --tmpBytesToRead;
       
   718         if (tmpBytesToRead) ch <<= 6;
       
   719         } while (tmpBytesToRead > 0);
       
   720     }
       
   721     In UTF-8 writing code, the switches on "bytesToWrite" are
       
   722     similarly unrolled loops.
       
   723 
       
   724    --------------------------------------------------------------------- */