charconvfw/Charconv/ongoing/test/source/otherutf/UTF8.CPP
changeset 16 56cd22a7a1cb
parent 0 1fb32624e06b
child 18 67f6b0d39020
child 21 f2f7b3284356
equal deleted inserted replaced
0:1fb32624e06b 16:56cd22a7a1cb
     1 /*
       
     2 * Copyright (c) 2000-2005 Nokia Corporation and/or its subsidiary(-ies). 
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description:      
       
    15 *
       
    16 */
       
    17 /* ================================================================ */
       
    18 /*
       
    19 File:	ConvertUTF.C
       
    20 Author: Mark E. Davis
       
    21 Copyright (C) 1994 Taligent, Inc. All rights reserved.
       
    22 
       
    23 This code is copyrighted. Under the copyright laws, this code may not
       
    24 be copied, in whole or part, without prior written consent of Taligent. 
       
    25 
       
    26 Taligent grants the right to use or reprint this code as long as this
       
    27 ENTIRE copyright notice is reproduced in the code or reproduction.
       
    28 The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
       
    29 EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
       
    30 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
       
    31 NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
       
    32 WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
       
    33 INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
       
    34 LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
       
    35 IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
       
    36 BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
       
    37 LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
       
    38 LIMITATION MAY NOT APPLY TO YOU.
       
    39 
       
    40 RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
       
    41 government is subject to restrictions as set forth in subparagraph
       
    42 (c)(l)(ii) of the Rights in Technical Data and Computer Software
       
    43 clause at DFARS 252.227-7013 and FAR 52.227-19.
       
    44 
       
    45 This code may be protected by one or more U.S. and International
       
    46 Patents.
       
    47 
       
    48 TRADEMARKS: Taligent and the Taligent Design Mark are registered
       
    49 trademarks of Taligent, Inc.
       
    50 */
       
    51 /* ================================================================ */
       
    52 
       
    53 // #include "CVTUTF.H" // commented out by DPB
       
    54 #include "UTF8.H" // added by DPB
       
    55 
       
    56 /* ================================================================ */
       
    57 
       
    58 const int halfShift				= 10;
       
    59 const UCS4 halfBase				= 0x0010000UL;
       
    60 const UCS4 halfMask				= 0x3FFUL;
       
    61 const UCS4 kSurrogateHighStart	= 0xD800UL;
       
    62 const UCS4 kSurrogateHighEnd	= 0xDBFFUL;
       
    63 const UCS4 kSurrogateLowStart	= 0xDC00UL;
       
    64 const UCS4 kSurrogateLowEnd		= 0xDFFFUL;
       
    65 
       
    66 /* ================================================================ */
       
    67 
       
    68 EXPORT_C // added by DPB
       
    69 ConversionResult	ConvertUCS4toUTF16 (
       
    70 		UCS4** sourceStart, const UCS4* sourceEnd, 
       
    71 		UTF16** targetStart, const UTF16* targetEnd) {
       
    72 	ConversionResult result = ok;
       
    73 	register UCS4* source = *sourceStart;
       
    74 	register UTF16* target = *targetStart;
       
    75 	while (source < sourceEnd) {
       
    76 		register UCS4 ch;
       
    77 		if (target >= targetEnd) {
       
    78 			result = targetExhausted; break;
       
    79 		};
       
    80 		ch = *source++;
       
    81 		if (ch <= kMaximumUCS2) {
       
    82 			*target++ = (UTF16)ch; // cast added by DPB
       
    83 		} else if (ch > kMaximumUTF16) {
       
    84 			*target++ = kReplacementCharacter;
       
    85 		} else {
       
    86 			if (target + 1 >= targetEnd) {
       
    87 				result = targetExhausted; break;
       
    88 			};
       
    89 			ch -= halfBase;
       
    90 			*target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart); // cast added by DPB
       
    91 			*target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart); // cast added by DPB
       
    92 		};
       
    93 	};
       
    94 	*sourceStart = source;
       
    95 	*targetStart = target;
       
    96 	return result;
       
    97 };
       
    98 
       
    99 /* ================================================================ */
       
   100 
       
   101 EXPORT_C // added by DPB
       
   102 ConversionResult	ConvertUTF16toUCS4 (
       
   103 		UTF16** sourceStart, UTF16* sourceEnd, 
       
   104 		UCS4** targetStart, const UCS4* targetEnd) {
       
   105 	ConversionResult result = ok;
       
   106 	register UTF16* source = *sourceStart;
       
   107 	register UCS4* target = *targetStart;
       
   108 	while (source < sourceEnd) {
       
   109 		register UCS4 ch;
       
   110 		ch = *source++;
       
   111 		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) {
       
   112 			register UCS4 ch2 = *source;
       
   113 			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
       
   114 				ch = ((ch - kSurrogateHighStart) << halfShift)
       
   115 					+ (ch2 - kSurrogateLowStart) + halfBase;
       
   116 				++source;
       
   117 			};
       
   118 		};
       
   119 		if (target >= targetEnd) {
       
   120 			result = targetExhausted; break;
       
   121 		};
       
   122 		*target++ = ch;
       
   123 	};
       
   124 	*sourceStart = source;
       
   125 	*targetStart = target;
       
   126 	return result;
       
   127 };
       
   128 
       
   129 /* ================================================================ */
       
   130 
       
   131 const UCS4 offsetsFromUTF8[6] =	{0x00000000UL, 0x00003080UL, 0x000E2080UL, // "const" added by DPB
       
   132 					 	 	 0x03C82080UL, 0xFA082080UL, 0x82082080UL};
       
   133 const char bytesFromUTF8[256] = { // "const" added by DPB
       
   134 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   135 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   136 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   137 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   138 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   139 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       
   140 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
   141 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5};
       
   142 
       
   143 const UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; // "const" added by DPB
       
   144 
       
   145 /* ================================================================ */
       
   146 /*	This code is similar in effect to making successive calls on the
       
   147 mbtowc and wctomb routines in FSS-UTF. However, it is considerably
       
   148 different in code:
       
   149 * it is adapted to be consistent with UTF16,
       
   150 * the interface converts a whole buffer to avoid function-call overhead
       
   151 * constants have been gathered.
       
   152 * loops & conditionals have been removed as much as possible for
       
   153 efficiency, in favor of drop-through switch statements.
       
   154 */
       
   155 
       
   156 /* ================================================================ */
       
   157 EXPORT_C // added by DPB
       
   158 ConversionResult	ConvertUTF16toUTF8 (
       
   159 		UTF16** sourceStart, const UTF16* sourceEnd, 
       
   160 		UTF8** targetStart, const UTF8* targetEnd)
       
   161 {
       
   162 	ConversionResult result = ok;
       
   163 	register UTF16* source = *sourceStart;
       
   164 	register UTF8* target = *targetStart;
       
   165 	while (source < sourceEnd) {
       
   166 		register UCS4 ch;
       
   167 		register unsigned short bytesToWrite = 0;
       
   168 		register const UCS4 byteMask = 0xBF;
       
   169 		register const UCS4 byteMark = 0x80; 
       
   170 		ch = *source++;
       
   171 		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
       
   172 				&& source < sourceEnd) {
       
   173 			register UCS4 ch2 = *source;
       
   174 			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
       
   175 				ch = ((ch - kSurrogateHighStart) << halfShift)
       
   176 					+ (ch2 - kSurrogateLowStart) + halfBase;
       
   177 				++source;
       
   178 			};
       
   179 		};
       
   180 		if (ch < 0x80) {				bytesToWrite = 1;
       
   181 		} else if (ch < 0x800) {		bytesToWrite = 2;
       
   182 		} else if (ch < 0x10000) {		bytesToWrite = 3;
       
   183 		} else if (ch < 0x200000) {		bytesToWrite = 4;
       
   184 		} else if (ch < 0x4000000) {	bytesToWrite = 5;
       
   185 		} else if (ch <= kMaximumUCS4){	bytesToWrite = 6;
       
   186 		} else {						bytesToWrite = 2;
       
   187 										ch = kReplacementCharacter;
       
   188 		}; /* I wish there were a smart way to avoid this conditional */
       
   189 		
       
   190 		target += bytesToWrite;
       
   191 		if (target > targetEnd) {
       
   192 			target -= bytesToWrite; result = targetExhausted; break;
       
   193 		};
       
   194 		switch (bytesToWrite) {	/* note: code falls through cases! */
       
   195 			case 6:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   196 			case 5:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   197 			case 4:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   198 			case 3:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   199 			case 2:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   200 			case 1:	*--target = (UTF8)(ch | firstByteMark[bytesToWrite]); // cast added by DPB
       
   201 		};
       
   202 		target += bytesToWrite;
       
   203 	};
       
   204 	*sourceStart = source;
       
   205 	*targetStart = target;
       
   206 	return result;
       
   207 };
       
   208 
       
   209 /* ================================================================ */
       
   210 
       
   211 EXPORT_C // added by DPB
       
   212 ConversionResult	ConvertUTF8toUTF16 (
       
   213 		UTF8** sourceStart, UTF8* sourceEnd, 
       
   214 		UTF16** targetStart, const UTF16* targetEnd)
       
   215 {
       
   216 	ConversionResult result = ok;
       
   217 	register UTF8* source = *sourceStart;
       
   218 	register UTF16* target = *targetStart;
       
   219 	while (source < sourceEnd) {
       
   220 		register UCS4 ch = 0;
       
   221 		register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
       
   222 		if (source + extraBytesToWrite > sourceEnd) {
       
   223 			result = sourceExhausted; break;
       
   224 		};
       
   225 		switch(extraBytesToWrite) {	/* note: code falls through cases! */
       
   226 			case 5:	ch += *source++; ch <<= 6;
       
   227 			case 4:	ch += *source++; ch <<= 6;
       
   228 			case 3:	ch += *source++; ch <<= 6;
       
   229 			case 2:	ch += *source++; ch <<= 6;
       
   230 			case 1:	ch += *source++; ch <<= 6;
       
   231 			case 0:	ch += *source++;
       
   232 		};
       
   233 		ch -= offsetsFromUTF8[extraBytesToWrite];
       
   234 
       
   235 		if (target >= targetEnd) {
       
   236 			result = targetExhausted; break;
       
   237 		};
       
   238 		if (ch <= kMaximumUCS2) {
       
   239 			*target++ = (UTF16)ch;
       
   240 		} else if (ch > kMaximumUTF16) {
       
   241 			*target++ = kReplacementCharacter;
       
   242 		} else {
       
   243 			if (target + 1 >= targetEnd) {
       
   244 				result = targetExhausted; break;
       
   245 			};
       
   246 			ch -= halfBase;
       
   247 			*target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart);
       
   248 			*target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart);
       
   249 		};
       
   250 	};
       
   251 	*sourceStart = source;
       
   252 	*targetStart = target;
       
   253 	return result;
       
   254 };
       
   255 
       
   256 /* ================================================================ */
       
   257 EXPORT_C // added by DPB
       
   258 ConversionResult	ConvertUCS4toUTF8 (
       
   259 		UCS4** sourceStart, const UCS4* sourceEnd, 
       
   260 		UTF8** targetStart, const UTF8* targetEnd)
       
   261 {
       
   262 	ConversionResult result = ok;
       
   263 	register UCS4* source = *sourceStart;
       
   264 	register UTF8* target = *targetStart;
       
   265 	while (source < sourceEnd) {
       
   266 		register UCS4 ch;
       
   267 		register unsigned short bytesToWrite = 0;
       
   268 		register const UCS4 byteMask = 0xBF;
       
   269 		register const UCS4 byteMark = 0x80; 
       
   270 		ch = *source++;
       
   271 		if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
       
   272 				&& source < sourceEnd) {
       
   273 			register UCS4 ch2 = *source;
       
   274 			if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
       
   275 				ch = ((ch - kSurrogateHighStart) << halfShift)
       
   276 					+ (ch2 - kSurrogateLowStart) + halfBase;
       
   277 				++source;
       
   278 			};
       
   279 		};
       
   280 		if (ch < 0x80) {				bytesToWrite = 1;
       
   281 		} else if (ch < 0x800) {		bytesToWrite = 2;
       
   282 		} else if (ch < 0x10000) {		bytesToWrite = 3;
       
   283 		} else if (ch < 0x200000) {		bytesToWrite = 4;
       
   284 		} else if (ch < 0x4000000) {	bytesToWrite = 5;
       
   285 		} else if (ch <= kMaximumUCS4){	bytesToWrite = 6;
       
   286 		} else {						bytesToWrite = 2;
       
   287 										ch = kReplacementCharacter;
       
   288 		}; /* I wish there were a smart way to avoid this conditional */
       
   289 		
       
   290 		target += bytesToWrite;
       
   291 		if (target > targetEnd) {
       
   292 			target -= bytesToWrite; result = targetExhausted; break;
       
   293 		};
       
   294 		switch (bytesToWrite) {	/* note: code falls through cases! */
       
   295 			case 6:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   296 			case 5:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   297 			case 4:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   298 			case 3:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   299 			case 2:	*--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB
       
   300 			case 1:	*--target = (UTF8)(ch | firstByteMark[bytesToWrite]); // cast added by DPB
       
   301 		};
       
   302 		target += bytesToWrite;
       
   303 	};
       
   304 	*sourceStart = source;
       
   305 	*targetStart = target;
       
   306 	return result;
       
   307 };
       
   308 
       
   309 /* ================================================================ */
       
   310 
       
   311 EXPORT_C // added by DPB
       
   312 ConversionResult	ConvertUTF8toUCS4 (
       
   313 		UTF8** sourceStart, UTF8* sourceEnd, 
       
   314 		UCS4** targetStart, const UCS4* targetEnd)
       
   315 {
       
   316 	ConversionResult result = ok;
       
   317 	register UTF8* source = *sourceStart;
       
   318 	register UCS4* target = *targetStart;
       
   319 	while (source < sourceEnd) {
       
   320 		register UCS4 ch = 0;
       
   321 		register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
       
   322 		if (source + extraBytesToWrite > sourceEnd) {
       
   323 			result = sourceExhausted; break;
       
   324 		};
       
   325 		switch(extraBytesToWrite) {	/* note: code falls through cases! */
       
   326 			case 5:	ch += *source++; ch <<= 6;
       
   327 			case 4:	ch += *source++; ch <<= 6;
       
   328 			case 3:	ch += *source++; ch <<= 6;
       
   329 			case 2:	ch += *source++; ch <<= 6;
       
   330 			case 1:	ch += *source++; ch <<= 6;
       
   331 			case 0:	ch += *source++;
       
   332 		};
       
   333 		ch -= offsetsFromUTF8[extraBytesToWrite];
       
   334 
       
   335 		if (target >= targetEnd) {
       
   336 			result = targetExhausted; break;
       
   337 		};
       
   338 		if (ch <= kMaximumUCS2) {
       
   339 			*target++ = ch;
       
   340 		} else if (ch > kMaximumUCS4) {
       
   341 			*target++ = kReplacementCharacter;
       
   342 		} else {
       
   343 			if (target + 1 >= targetEnd) {
       
   344 				result = targetExhausted; break;
       
   345 			};
       
   346 			ch -= halfBase;
       
   347 			*target++ = (ch >> halfShift) + kSurrogateHighStart;
       
   348 			*target++ = (ch & halfMask) + kSurrogateLowStart;
       
   349 		};
       
   350 	};
       
   351 	*sourceStart = source;
       
   352 	*targetStart = target;
       
   353 	return result;
       
   354 };