secureswitools/swianalysistoolkit/source/common/utf8/utf8.cpp
changeset 0 ba25891c3a9e
equal deleted inserted replaced
-1:000000000000 0:ba25891c3a9e
       
     1 /*
       
     2  * Copyright 2001 Unicode, Inc.
       
     3  * 
       
     4  * Disclaimer
       
     5  * 
       
     6  * This source code is provided as is by Unicode, Inc. No claims are
       
     7  * made as to fitness for any particular purpose. No warranties of any
       
     8  * kind are expressed or implied. The recipient agrees to determine
       
     9  * applicability of information provided. If this file has been
       
    10  * purchased on magnetic or optical media from Unicode, Inc., the
       
    11  * sole remedy for any claim will be exchange of defective media
       
    12  * within 90 days of receipt.
       
    13  * 
       
    14  * Limitations on Rights to Redistribute This Code
       
    15  * 
       
    16  * Unicode, Inc. hereby grants the right to freely use the information
       
    17  * supplied in this file in the creation of products supporting the
       
    18  * Unicode Standard, and to make copies of this file in any form
       
    19  * for internal or external distribution as long as this notice
       
    20  * remains attached.
       
    21  */
       
    22 
       
    23 /* ---------------------------------------------------------------------
       
    24 
       
    25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
       
    26 	Author: Mark E. Davis, 1994.
       
    27 	Rev History: Rick McGowan, fixes & updates May 2001.
       
    28 	Sept 2001: fixed const & error conditions per
       
    29 		mods suggested by S. Parent & A. Lillich.
       
    30 
       
    31     See the header file "ConvertUTF.h" for complete documentation.
       
    32 
       
    33 ------------------------------------------------------------------------ */
       
    34 
       
    35 
       
    36 #include "utf8.h"
       
    37 #ifdef CVTUTF_DEBUG
       
    38 #include <stdio.h>
       
    39 #endif
       
    40 
       
    41 static const int halfShift	= 10; /* used for shifting by 10 bits */
       
    42 
       
    43 static const UTF32 halfBase	= 0x0010000UL;
       
    44 static const UTF32 halfMask	= 0x3FFUL;
       
    45 
       
    46 #define UNI_SUR_HIGH_START	(UTF32)0xD800
       
    47 #define UNI_SUR_HIGH_END	(UTF32)0xDBFF
       
    48 #define UNI_SUR_LOW_START	(UTF32)0xDC00
       
    49 #define UNI_SUR_LOW_END		(UTF32)0xDFFF
       
    50 #define false			0
       
    51 #define true			1
       
    52 
       
    53 /* --------------------------------------------------------------------- */
       
    54 /*Added for : PDEF140857*/
       
    55 /*Wrapper function to call ConversionResult ConvertUTF32toUTF16 ( )*/
       
    56 
       
    57 ConversionResult ConvertUCS4toUTF16 (
       
    58 		UCS4** sourceStart, const UCS4* sourceEnd, 
       
    59 		UTF16** targetStart, UTF16* targetEnd) {
       
    60 		
       
    61 			return ConvertUTF32toUTF16(sourceStart, sourceEnd, targetStart, targetEnd, lenientConversion);
       
    62 		
       
    63 		}
       
    64 
       
    65 /*Modified for PDEF140857*/	
       
    66 ConversionResult ConvertUTF32toUTF16 (
       
    67 		UTF32** sourceStart, const UTF32* sourceEnd, 
       
    68 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
       
    69 	ConversionResult result = conversionOK;
       
    70 	//const UTF32* source = *sourceStart;
       
    71 	UTF32* source = *sourceStart;
       
    72 	UTF16* target = *targetStart;
       
    73 	while (source < sourceEnd) {
       
    74 		UTF32 ch;
       
    75 		if (target >= targetEnd) {
       
    76 			result = targetExhausted; break;
       
    77 		}
       
    78 		ch = *source++;
       
    79 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
       
    80 			if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
       
    81 				--source; /* return to the illegal value itself */
       
    82 				result = sourceIllegal;
       
    83 				break;
       
    84 			} else {
       
    85 			    *target++ = (UTF16)ch;	/* normal case */ /*Added cast : PDEF140857*/
       
    86 			}
       
    87 		} else if (ch > UNI_MAX_UTF16) {
       
    88 			if (flags == strictConversion) {
       
    89 				result = sourceIllegal;
       
    90 			} else {
       
    91 				*target++ = UNI_REPLACEMENT_CHAR;
       
    92 			}
       
    93 		} else {
       
    94 			/* target is a character in range 0xFFFF - 0x10FFFF. */
       
    95 			if (target + 1 >= targetEnd) {
       
    96 				--source; /* Back up source pointer! */
       
    97 				result = targetExhausted; break;
       
    98 			}
       
    99 			ch -= halfBase;
       
   100 			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); /*Added cast : PDEF140857*/
       
   101 			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);    /*Added cast : PDEF140857*/
       
   102 		}
       
   103 	}
       
   104 	*sourceStart = source;
       
   105 	*targetStart = target;
       
   106 	return result;
       
   107 }
       
   108 
       
   109 /* --------------------------------------------------------------------- */
       
   110 
       
   111 /*Added for : PDEF140857*/
       
   112 /*Wrapper function to call ConversionResult ConvertUTF16toUTF32 ( )*/
       
   113 ConversionResult ConvertUTF16toUCS4 (
       
   114 		UTF16** sourceStart, UTF16* sourceEnd, 
       
   115 		UCS4** targetStart, UCS4* targetEnd) {
       
   116 		
       
   117 			return ConvertUTF16toUTF32(sourceStart, sourceEnd, targetStart, targetEnd, lenientConversion);
       
   118 		
       
   119 		}
       
   120 
       
   121 /*Modified for : PDEF140857*/	
       
   122 ConversionResult ConvertUTF16toUTF32 (
       
   123 		UTF16** sourceStart, const UTF16* sourceEnd, 
       
   124 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
       
   125 	ConversionResult result = conversionOK;
       
   126 	//const UTF16* source = *sourceStart;
       
   127 	UTF16* source = *sourceStart;
       
   128 	UTF32* target = *targetStart;
       
   129 	UTF32 ch, ch2;
       
   130 	while (source < sourceEnd) {
       
   131 		//const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
       
   132 		UTF16* oldSource = source; 
       
   133 		ch = *source++;
       
   134 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
       
   135 			ch2 = *source;
       
   136 			if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
       
   137 				ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
       
   138 					+ (ch2 - UNI_SUR_LOW_START) + halfBase;
       
   139 				++source;
       
   140 			} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
       
   141 				--source; /* return to the illegal value itself */
       
   142 				result = sourceIllegal;
       
   143 				break;
       
   144 			}
       
   145 		} else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
       
   146 			/* an unpaired low surrogate */
       
   147 			--source; /* return to the illegal value itself */
       
   148 			result = sourceIllegal;
       
   149 			break;
       
   150 		}
       
   151 		if (target >= targetEnd) {
       
   152 			source = oldSource; /* Back up source pointer! */
       
   153 			result = targetExhausted; break;
       
   154 		}
       
   155 		*target++ = ch;
       
   156 	}
       
   157 	*sourceStart = source;
       
   158 	*targetStart = target;
       
   159 #ifdef CVTUTF_DEBUG
       
   160 if (result == sourceIllegal) {
       
   161     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
       
   162     fflush(stderr);
       
   163 }
       
   164 #endif
       
   165 	return result;
       
   166 }
       
   167 
       
   168 /* --------------------------------------------------------------------- */
       
   169 
       
   170 /*
       
   171  * Index into the table below with the first byte of a UTF-8 sequence to
       
   172  * get the number of trailing bytes that are supposed to follow it.
       
   173  */
       
   174 static const char trailingBytesForUTF8[256] = {
       
   175 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   176 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   177 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   178 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   179 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   180 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   181 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
   182 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
       
   183 };
       
   184 
       
   185 /*
       
   186  * Magic values subtracted from a buffer value during UTF8 conversion.
       
   187  * This table contains as many values as there might be trailing bytes
       
   188  * in a UTF-8 sequence.
       
   189  */
       
   190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
       
   191 					 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
       
   192 
       
   193 /*
       
   194  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
       
   195  * into the first byte, depending on how many bytes follow.  There are
       
   196  * as many entries in this table as there are UTF-8 sequence types.
       
   197  * (I.e., one byte sequence, two byte... six byte sequence.)
       
   198  */
       
   199 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
       
   200 
       
   201 /* --------------------------------------------------------------------- */
       
   202 
       
   203 /* The interface converts a whole buffer to avoid function-call overhead.
       
   204  * Constants have been gathered. Loops & conditionals have been removed as
       
   205  * much as possible for efficiency, in favor of drop-through switches.
       
   206  * (See "Note A" at the bottom of the file for equivalent code.)
       
   207  * If your compiler supports it, the "isLegalUTF8" call can be turned
       
   208  * into an inline function.
       
   209  */
       
   210 
       
   211 /* --------------------------------------------------------------------- */
       
   212 /*Added for : PDEF140857*/
       
   213 /*Wrapper function to call ConversionResult ConvertUTF16toUTF8_PDEF140857 ( )*/
       
   214 ConversionResult	ConvertUTF16toUTF8 (
       
   215 		UTF16** sourceStart, const UTF16* sourceEnd, 
       
   216 		UTF8** targetStart, UTF8* targetEnd) {
       
   217 		
       
   218 		return ConvertUTF16toUTF8_original(sourceStart, sourceEnd, targetStart, targetEnd, lenientConversion);
       
   219 		
       
   220 		}
       
   221 
       
   222 
       
   223 /*Modified for : PDEF140857*/
       
   224 ConversionResult ConvertUTF16toUTF8_original (
       
   225 		UTF16** sourceStart, const UTF16* sourceEnd, 
       
   226 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
       
   227 	ConversionResult result = conversionOK;
       
   228 	//const UTF16* source = *sourceStart;
       
   229 	UTF16* source = *sourceStart;
       
   230 	UTF8* target = *targetStart;
       
   231 	while (source < sourceEnd) {
       
   232 		UTF32 ch;
       
   233 		unsigned short bytesToWrite = 0;
       
   234 		const UTF32 byteMask = 0xBF;
       
   235 		const UTF32 byteMark = 0x80; 
       
   236 	//	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
       
   237 		UTF16* oldSource = source; 
       
   238 		ch = *source++;
       
   239 		/* If we have a surrogate pair, convert to UTF32 first. */
       
   240 		if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END && source < sourceEnd) {
       
   241 			UTF32 ch2 = *source;
       
   242 			if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
       
   243 				ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
       
   244 					+ (ch2 - UNI_SUR_LOW_START) + halfBase;
       
   245 				++source;
       
   246 			} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
       
   247 				--source; /* return to the illegal value itself */
       
   248 				result = sourceIllegal;
       
   249 				break;
       
   250 			}
       
   251 		} else if ((flags == strictConversion) && (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)) {
       
   252 			--source; /* return to the illegal value itself */
       
   253 			result = sourceIllegal;
       
   254 			break;
       
   255 		}
       
   256 		/* Figure out how many bytes the result will require */
       
   257 		if (ch < (UTF32)0x80) {			bytesToWrite = 1;
       
   258 		} else if (ch < (UTF32)0x800) {		bytesToWrite = 2;
       
   259 		} else if (ch < (UTF32)0x10000) {	bytesToWrite = 3;
       
   260 		} else if (ch < (UTF32)0x200000) {	bytesToWrite = 4;
       
   261 		} else {				bytesToWrite = 2;
       
   262 							ch = UNI_REPLACEMENT_CHAR;
       
   263 		}
       
   264 
       
   265 		target += bytesToWrite;
       
   266 		if (target > targetEnd) {
       
   267 			source = oldSource; /* Back up source pointer! */
       
   268 			target -= bytesToWrite; result = targetExhausted; break;
       
   269 		}
       
   270 		switch (bytesToWrite) {	/* note: everything falls through. */
       
   271 			case 4:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   272 			case 3:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   273 			case 2:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   274 			case 1:	*--target = (UTF8)( ch | firstByteMark[bytesToWrite]);    /*Added cast : PDEF140857*/
       
   275 		}
       
   276 		target += bytesToWrite;
       
   277 	}
       
   278 	*sourceStart = source;
       
   279 	*targetStart = target;
       
   280 	return result;
       
   281 }
       
   282 
       
   283 /* --------------------------------------------------------------------- */
       
   284 
       
   285 /*
       
   286  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
       
   287  * This must be called with the length pre-determined by the first byte.
       
   288  * If not calling this from ConvertUTF8to*, then the length can be set by:
       
   289  *	length = trailingBytesForUTF8[*source]+1;
       
   290  * and the sequence is illegal right away if there aren't that many bytes
       
   291  * available.
       
   292  * If presented with a length > 4, this returns false.  The Unicode
       
   293  * definition of UTF-8 goes up to 4-byte sequences.
       
   294  */
       
   295 
       
   296 static Boolean isLegalUTF8(const UTF8 *source, int length) {
       
   297 	UTF8 a;
       
   298 	const UTF8 *srcptr = source+length;
       
   299 	switch (length) {
       
   300 	default: return false;
       
   301 		/* Everything else falls through when "true"... */
       
   302 	case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   303 	case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
       
   304 	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
       
   305 		switch (*source) {
       
   306 		    /* no fall-through in this inner switch */
       
   307 		    case 0xE0: if (a < 0xA0) return false; break;
       
   308 		    case 0xF0: if (a < 0x90) return false; break;
       
   309 		    case 0xF4: if (a > 0x8F) return false; break;
       
   310 		    default:  if (a < 0x80) return false;
       
   311 		}
       
   312     	case 1: if (*source >= 0x80 && *source < 0xC2) return false;
       
   313 		if (*source > 0xF4) return false;
       
   314 	}
       
   315 	return true;
       
   316 }
       
   317 
       
   318 /* --------------------------------------------------------------------- */
       
   319 
       
   320 /*
       
   321  * Exported function to return whether a UTF-8 sequence is legal or not.
       
   322  * This is not used here; it's just exported.
       
   323  */
       
   324 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
       
   325 	int length = trailingBytesForUTF8[*source]+1;
       
   326 	if (source+length > sourceEnd) {
       
   327 	    return false;
       
   328 	}
       
   329 	return isLegalUTF8(source, length);
       
   330 }
       
   331 
       
   332 /* --------------------------------------------------------------------- */
       
   333 
       
   334 ConversionResult ConvertUTF8toUTF16 (
       
   335 		const UTF8** sourceStart, const UTF8* sourceEnd, 
       
   336 		UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
       
   337 	ConversionResult result = conversionOK;
       
   338 	const UTF8* source = *sourceStart;
       
   339 	UTF16* target = *targetStart;
       
   340 	while (source < sourceEnd) {
       
   341 		UTF32 ch = 0;
       
   342 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
       
   343 		if (source + extraBytesToRead >= sourceEnd) {
       
   344 			result = sourceExhausted; break;
       
   345 		}
       
   346 		/* Do this check whether lenient or strict */
       
   347 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
       
   348 			result = sourceIllegal;
       
   349 			break;
       
   350 		}
       
   351 		/*
       
   352 		 * The cases all fall through. See "Note A" below.
       
   353 		 */
       
   354 		switch (extraBytesToRead) {
       
   355 			case 3:	ch += *source++; ch <<= 6;
       
   356 			case 2:	ch += *source++; ch <<= 6;
       
   357 			case 1:	ch += *source++; ch <<= 6;
       
   358 			case 0:	ch += *source++;
       
   359 		}
       
   360 		ch -= offsetsFromUTF8[extraBytesToRead];
       
   361 
       
   362 		if (target >= targetEnd) {
       
   363 			source -= (extraBytesToRead+1);	/* Back up source pointer! */
       
   364 			result = targetExhausted; break;
       
   365 		}
       
   366 		if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
       
   367 			if ((flags == strictConversion) && (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)) {
       
   368 				source -= (extraBytesToRead+1); /* return to the illegal value itself */
       
   369 				result = sourceIllegal;
       
   370 				break;
       
   371 			} else {
       
   372 			    *target++ = (UTF16)ch;	/* normal case */ /*Added cast : PDEF140857*/
       
   373 			}
       
   374 		} else if (ch > UNI_MAX_UTF16) {
       
   375 			if (flags == strictConversion) {
       
   376 				result = sourceIllegal;
       
   377 				source -= (extraBytesToRead+1); /* return to the start */
       
   378 				break; /* Bail out; shouldn't continue */
       
   379 			} else {
       
   380 				*target++ = UNI_REPLACEMENT_CHAR;
       
   381 			}
       
   382 		} else {
       
   383 			/* target is a character in range 0xFFFF - 0x10FFFF. */
       
   384 			if (target + 1 >= targetEnd) {
       
   385 				source -= (extraBytesToRead+1);	/* Back up source pointer! */
       
   386 				result = targetExhausted; break;
       
   387 			}
       
   388 			ch -= halfBase;
       
   389 			*target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); /*Added cast : PDEF140857*/
       
   390 			*target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);    /*Added cast : PDEF140857*/
       
   391 		}
       
   392 	}
       
   393 	*sourceStart = source;
       
   394 	*targetStart = target;
       
   395 	return result;
       
   396 }
       
   397 
       
   398 /* --------------------------------------------------------------------- */
       
   399 
       
   400 ConversionResult ConvertUTF32toUTF8 (
       
   401 		const UTF32** sourceStart, const UTF32* sourceEnd, 
       
   402 		UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
       
   403 	ConversionResult result = conversionOK;
       
   404 	const UTF32* source = *sourceStart;
       
   405 	UTF8* target = *targetStart;
       
   406 	while (source < sourceEnd) {
       
   407 		UTF32 ch;
       
   408 		unsigned short bytesToWrite = 0;
       
   409 		const UTF32 byteMask = 0xBF;
       
   410 		const UTF32 byteMark = 0x80; 
       
   411 		ch = *source++;
       
   412 		/* surrogates of any stripe are not legal UTF32 characters */
       
   413 		if (flags == strictConversion ) {
       
   414 			if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_LOW_END)) {
       
   415 				--source; /* return to the illegal value itself */
       
   416 				result = sourceIllegal;
       
   417 				break;
       
   418 			}
       
   419 		}
       
   420 		/* Figure out how many bytes the result will require */
       
   421 		if (ch < (UTF32)0x80) {			bytesToWrite = 1;
       
   422 		} else if (ch < (UTF32)0x800) {		bytesToWrite = 2;
       
   423 		} else if (ch < (UTF32)0x10000) {	bytesToWrite = 3;
       
   424 		} else if (ch < (UTF32)0x200000) {	bytesToWrite = 4;
       
   425 		} else {				bytesToWrite = 2;
       
   426 							ch = UNI_REPLACEMENT_CHAR;
       
   427 		}
       
   428 		
       
   429 		target += bytesToWrite;
       
   430 		if (target > targetEnd) {
       
   431 			--source; /* Back up source pointer! */
       
   432 			target -= bytesToWrite; result = targetExhausted; break;
       
   433 		}
       
   434 		switch (bytesToWrite) {	/* note: everything falls through. */
       
   435 			case 4:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   436 			case 3:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   437 			case 2:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; /*Added cast : PDEF140857*/
       
   438 			case 1:	*--target = (UTF8)( ch | firstByteMark[bytesToWrite]);    /*Added cast : PDEF140857*/
       
   439 		}
       
   440 		target += bytesToWrite;
       
   441 	}
       
   442 	*sourceStart = source;
       
   443 	*targetStart = target;
       
   444 	return result;
       
   445 }
       
   446 
       
   447 /* --------------------------------------------------------------------- */
       
   448 
       
   449 ConversionResult ConvertUTF8toUTF32 (
       
   450 		const UTF8** sourceStart, const UTF8* sourceEnd, 
       
   451 		UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
       
   452 	ConversionResult result = conversionOK;
       
   453 	const UTF8* source = *sourceStart;
       
   454 	UTF32* target = *targetStart;
       
   455 	while (source < sourceEnd) {
       
   456 		UTF32 ch = 0;
       
   457 		unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
       
   458 		if (source + extraBytesToRead >= sourceEnd) {
       
   459 			result = sourceExhausted; break;
       
   460 		}
       
   461 		/* Do this check whether lenient or strict */
       
   462 		if (! isLegalUTF8(source, extraBytesToRead+1)) {
       
   463 			result = sourceIllegal;
       
   464 			break;
       
   465 		}
       
   466 		/*
       
   467 		 * The cases all fall through. See "Note A" below.
       
   468 		 */
       
   469 		switch (extraBytesToRead) {
       
   470 			case 3:	ch += *source++; ch <<= 6;
       
   471 			case 2:	ch += *source++; ch <<= 6;
       
   472 			case 1:	ch += *source++; ch <<= 6;
       
   473 			case 0:	ch += *source++;
       
   474 		}
       
   475 		ch -= offsetsFromUTF8[extraBytesToRead];
       
   476 
       
   477 		if (target >= targetEnd) {
       
   478 			source -= (extraBytesToRead+1);	/* Back up the source pointer! */
       
   479 			result = targetExhausted; break;
       
   480 		}
       
   481 		if (ch <= UNI_MAX_UTF32) {
       
   482 			*target++ = ch;
       
   483 		} else { /* i.e., ch > UNI_MAX_UTF32 */
       
   484 			*target++ = UNI_REPLACEMENT_CHAR;
       
   485 		}
       
   486 	}
       
   487 	*sourceStart = source;
       
   488 	*targetStart = target;
       
   489 	return result;
       
   490 }
       
   491 
       
   492 /* ---------------------------------------------------------------------
       
   493 
       
   494 	Note A.
       
   495 	The fall-through switches in UTF-8 reading code save a
       
   496 	temp variable, some decrements & conditionals.  The switches
       
   497 	are equivalent to the following loop:
       
   498 		{
       
   499 			int tmpBytesToRead = extraBytesToRead+1;
       
   500 			do {
       
   501 				ch += *source++;
       
   502 				--tmpBytesToRead;
       
   503 				if (tmpBytesToRead) ch <<= 6;
       
   504 			} while (tmpBytesToRead > 0);
       
   505 		}
       
   506 	In UTF-8 writing code, the switches on "bytesToWrite" are
       
   507 	similarly unrolled loops.
       
   508 
       
   509    --------------------------------------------------------------------- */
       
   510 
       
   511