charconvfw/charconv_fw/src/charconv/utf.cpp
changeset 0 1fb32624e06b
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 /*
       
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 #include <e32std.h>
       
    20 #include <e32base.h>
       
    21 #include <utf.h>
       
    22 
       
// Sentinel returned by Base64Decoding() for a byte that is not a member of the base-64 alphabet.
const TUint KNotInBase64Alphabet=KMaxTUint;
       
    24 
       
// Panic codes raised by this file through Panic() (category "CHARCONV-UTF").
// Numbering starts at 1 and every later value is implied by its position, so
// inserting or reordering entries changes the numeric value of all codes after it.
enum TPanic
	{
	EPanicBad6BitNumber=1,
	EPanicBadUtf7Pointers1,
	EPanicBadUtf7Pointers2,
	EPanicBadUtf7Pointers3,
	EPanicBadUtf7Pointers4,
	EPanicBadUtf7Pointers5,
	EPanicBadUtf7Pointers6,
	EPanicBadUtf7Pointers7,
	EPanicBadUtf7Pointers8,
	EPanicBadUtf7Pointers9,
	EPanicBadUtf7Pointers10,
	EPanicBadUtf7Pointers11,
	EPanicNotInBase64Block,
	EPanicBadUnicodePointers1,
	EPanicBadUnicodePointers2,
	EPanicBadUnicodePointers3,
	EPanicBadUnicodePointers4,
	EPanicBadUnicodePointers5,
	EPanicBadUnicodePointers6,
	EPanicBadUnicodePointers7,
	EPanicBadUnicodePointers8,
	EPanicBadUnicodePointers9,
	EPanicBadUnicodePointers10,
	EPanicBadBitBufferState1,
	EPanicBadBitBufferState2,
	EPanicBadBitBufferState3,
	EPanicBadBitBufferState4,
	EPanicBadBitBufferState5,
	EPanicBadBitBufferState6,
	EPanicBadBitBufferState7,
	EPanicBadBitBufferState8,
	EPanicBadBitBufferState9,
	EPanicBadBitBufferState10,
	EPanicBadBitBufferState11,
	EPanicBadBitBufferState12,
	EPanicBadBitBufferState13,
	EPanicBadBitBufferState14,
	EPanicBadBitBufferState15,
	EPanicBadBitBufferState16,
	EPanicBadBitBufferState17,
	EPanicUnexpectedNumberOfLoopIterations,
	EPanicInitialEscapeCharacterButNoBase64,
	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
	EPanicBadUtf8Pointers1,
	EPanicBadUtf8Pointers2,
	EPanicBadUtf8Pointers3,
	EPanicBadUtf8Pointers4,
	EPanicBadUtf8Pointers5,
	EPanicBadUtf8Pointers6,
	EPanicBadUtf8Pointers7,
	EPanicOutOfSyncUtf7Byte1,
	EPanicOutOfSyncUtf7Byte2,
	EPanicOutOfSyncBase64Decoding
	};
       
    81 
       
// Panic category text passed to User::Panic() by Panic() below.
_LIT(KLitPanicText, "CHARCONV-UTF");

// Raises a panic in the "CHARCONV-UTF" category with aPanic as the panic number.
// Used as the failure action of the __ASSERT_DEBUG checks in this file.
LOCAL_C void Panic(TPanic aPanic)
	{
	User::Panic(KLitPanicText, aPanic);
	}
       
    88 
       
    89 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
       
    90 
       
    91 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
       
    92 	{
       
    93 	if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z'))
       
    94 		{
       
    95 		return aMemberOfBase64Alphabet-'A';
       
    96 		}
       
    97 	if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z'))
       
    98 		{
       
    99 		return aMemberOfBase64Alphabet-('a'-26);
       
   100 		}
       
   101 	if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9'))
       
   102 		{
       
   103 		return aMemberOfBase64Alphabet+((26*2)-'0');
       
   104 		}
       
   105 	if (aMemberOfBase64Alphabet=='+')
       
   106 		{
       
   107 		return 62;
       
   108 		}
       
   109 	if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/'))
       
   110 		{
       
   111 		return 63;
       
   112 		}
       
   113 	return KNotInBase64Alphabet;
       
   114 	}
       
   115 
       
   116 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
       
   117 	{
       
   118 	__ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
       
   119 	if ((a6BitNumber==63) && aIsImapUtf7)
       
   120 		{
       
   121 		return ',';
       
   122 		}
       
   123 	static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};
       
   124 	return base64Alphabet[a6BitNumber];
       
   125 	}
       
   126 
       
   127 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
       
   128 	{
       
   129 	__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
       
   130 	TUint8* pointerToCandidateEscapeCharacter=NULL;
       
   131 	FOREVER
       
   132 		{
       
   133 		const TUint utf7Byte=*aPointerToUtf7Byte;
       
   134 		if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7))
       
   135 			{
       
   136 			pointerToCandidateEscapeCharacter=aPointerToUtf7Byte;
       
   137 			}
       
   138 		else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet)
       
   139 			{
       
   140 			break;
       
   141 			}
       
   142 		__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
       
   143 		if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte)
       
   144 			{
       
   145 			break;
       
   146 			}
       
   147 		--aPointerToUtf7Byte;
       
   148 		}
       
   149 	__ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block));
       
   150 	return pointerToCandidateEscapeCharacter;
       
   151 	}
       
   152 
       
   153 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
       
   154 	{
       
   155 	if (aIsImapUtf7)
       
   156 		{
       
   157 		return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e);
       
   158 		}
       
   159 	if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d))
       
   160 		{
       
   161 		if (aEncodeOptionalDirectCharactersInBase64)
       
   162 			{
       
   163 			return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) ||
       
   164 					((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) ||
       
   165 					((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) ||
       
   166 					((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) ||
       
   167 					(aUnicodeCharacter==0x003f));
       
   168 			}
       
   169 		return aUnicodeCharacter!=0x005c;
       
   170 		}
       
   171 	return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a);
       
   172 	}
       
   173 
       
   174 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
       
   175 	{
       
   176 	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
       
   177 	}
       
   178 
       
   179 
       
   180 
       
   181 /**  Converts Unicode text into UTF-7 encoding. The fucntion leaves with 
       
   182 KErrCorrupt if the input string is corrupt.
       
   183 
       
   184 @param aUnicode A UCS-2 encoded input string.
       
   185 @param aEncodeOptionalDirectCharactersInBase64  If ETrue then 
       
   186 characters from UTF-7 set O (optional direct characters) are encoded in 
       
   187 Modified Base64. If EFalse the characters are encoded directly, 
       
   188 as their ASCII equivalents.
       
   189 @return A descriptor containing the UTF-7 encoded output string. */
       
   190 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L(
       
   191 										const TDesC16& aUnicode, 
       
   192 										TBool aEncodeOptionalDirectCharactersInBase64)
       
   193 	{
       
   194 	// If aUnicode is  Null string, return an empty HBufC
       
   195 	if (aUnicode.Length() == 0)
       
   196 		{
       
   197 		HBufC8* hBuf8 = HBufC8::NewL(1);
       
   198 		return hBuf8;
       
   199 		}
       
   200 
       
   201 	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
       
   202 	TInt length = aUnicode.Length();
       
   203 	const TInt bufsize = 100;
       
   204 	
       
   205 	TPtrC16 unicode (aUnicode);
       
   206 	TBuf8<bufsize> buf;
       
   207 	HBufC8* hBuf8 = HBufC8::NewLC(length);
       
   208 	TPtr8 utf7 = hBuf8->Des();
       
   209 
       
   210 	FOREVER
       
   211 		{
       
   212 		TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64);
       
   213 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   214 			User::Leave(KErrCorrupt);
       
   215 
       
   216 		if (utf7.Length() + buf.Length() > utf7.MaxLength())
       
   217 			{
       
   218 			// Reallocate the hBuf8
       
   219 			hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length());
       
   220 			CleanupStack::Pop();
       
   221 			CleanupStack::PushL(hBuf8);
       
   222 			utf7.Set(hBuf8->Des());
       
   223 			}
       
   224 		utf7.Append(buf);
       
   225 		if (unconverted ==0) 
       
   226 			break;
       
   227 		unicode.Set(unicode.Right(unconverted));
       
   228 		}
       
   229 	CleanupStack::Pop();
       
   230 	return hBuf8;
       
   231 
       
   232 	}
       
   233 
       
/** Converts Unicode text into UTF-7 encoding.

This overload forwards to the four-argument overload with aIsImapUtf7 set to
EFalse, i.e. '+' is used as the escape character.

@param aUtf7 On return, contains the UTF-7 encoded output string.
@param aUnicode A UCS-2 encoded input string.
@param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from 
UTF-7 set O (optional direct characters) are encoded in Modified Base64. If 
EFalse the characters are encoded directly, as their ASCII equivalents.
@return The number of unconverted characters left at the end of the input 
descriptor, or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
										TDes8& aUtf7, 
										const TDesC16& aUnicode, 
										TBool aEncodeOptionalDirectCharactersInBase64)
	{
	return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
	}
       
   250 
       
/** Converts Unicode text into UTF-7, or into the IMAP variant of it.

Characters for which EncodeInUtf7Directly() returns true are copied as single
bytes; all others are packed 16 bits at a time into a base-64 block that is
opened with the escape character and closed with '-'. A literal escape
character in the input is written as the escape character followed by '-'.

Conversion stops as soon as the next character (together with any bytes needed
to close an open base-64 block) would not fit into aUtf7. The output always
ends outside a base-64 block; if necessary the last base-64 block is shortened
and the corresponding input characters are reported as unconverted.

@param aUtf7 Output descriptor; its length is set to the number of bytes
written. It is written up to MaxLength().
@param aUnicode A UCS-2 encoded input string.
@param aIsImapUtf7 If true, '&' is the escape character and ',' replaces '/'
in the base-64 alphabet; otherwise '+' and '/' are used.
@param aEncodeOptionalDirectCharactersInBase64 Passed to
EncodeInUtf7Directly() to select which characters are written directly.
@return The number of unconverted characters left at the end of the input
descriptor. */
TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7, 
											   const TDesC16& aUnicode, 
											   TBool aIsImapUtf7, 
											   TBool aEncodeOptionalDirectCharactersInBase64)
	{
	if (aUnicode.Length()==0)
		{
		aUtf7.SetLength(0);
		return 0;
		}
	if (aUtf7.MaxLength()==0)
		{
		// no room for anything: report the whole input as unconverted
		return aUnicode.Length();
		}
	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
	// Both cursors are "previous element" pointers: they address the last
	// byte/character already processed, so they start one element before the
	// beginning and equal the "last" pointers when the buffer is exhausted.
	TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
	const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
	const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
	const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
	// The top bit of bitBuffer doubles as the "currently inside a base-64
	// block" flag; the low numberOfBitsInBuffer bits hold input bits that
	// have not yet been emitted as a base-64 character.
	const TUint KIsInBase64Block=0x80000000u;
	TUint bitBuffer=0;
	TInt numberOfBitsInBuffer=0;
	FOREVER
		{
		__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
		// at end of input a dummy 0 is used; the end-of-input test below takes this branch regardless of its value
		TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
		if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
			{
			// End of input or a directly-encodable character: first close any open base-64 block.
			__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
			if (bitBuffer&KIsInBase64Block)
				{
				if (numberOfBitsInBuffer!=0)
					{
					if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
						{
						break;
						}
					// flush the leftover bits, left-aligned in a final base-64 character (low bits zero)
					++pointerToPreviousUtf7Byte;
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
					}
				else
					{
					if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
						{
						break;
						}
					}
				// terminate the base-64 block and leave base-64 mode
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				bitBuffer=0;
				numberOfBitsInBuffer=0;
				}
			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
			if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
				{
				break; // all input consumed
				}
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
			// a literal escape character needs two bytes (itself plus '-'), anything else one
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
				{
				break;
				}
			++pointerToPreviousUtf7Byte;
			*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
			++pointerToPreviousUnicodeCharacter;
			if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
				{
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				}
			}
		else
			{
			// The character must go into a base-64 block: check the space needed before writing anything.
			{
			TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
			if (~bitBuffer&KIsInBase64Block)
				{
				++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
				}
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
				{
				break;
				}
			}
			if (~bitBuffer&KIsInBase64Block)
				{
				// not yet in a base-64 block: open one
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
				}
			// append the 16 bits of this character below the pending bits
			// (the shift also pushes the KIsInBase64Block flag out; it is set again below)
			bitBuffer<<=16;
			bitBuffer|=currentUnicodeCharacter;
			numberOfBitsInBuffer+=16;
			++pointerToPreviousUnicodeCharacter;
			__ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
			// emit every complete group of 6 bits, most significant group first
			while (numberOfBitsInBuffer>=6)
				{
				numberOfBitsInBuffer-=6;
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
				}
			bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
			bitBuffer|=KIsInBase64Block;
			}
		}
	// The main loop was left either at end of input or because the output is
	// (nearly) full. If a base-64 block is still open it must be closed here.
	__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
	__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
	if (bitBuffer&KIsInBase64Block)
		{
#if defined(_DEBUG)
		TInt numberOfLoopIterations=1;
#endif
		FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
			{
			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
			__ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
#if defined(_DEBUG)
			++numberOfLoopIterations;
#endif
			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
				{
				if (numberOfBitsInBuffer!=0)
					{
					__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
					++pointerToPreviousUtf7Byte;
					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
					}
				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
				++pointerToPreviousUtf7Byte;
				*pointerToPreviousUtf7Byte='-';
				break;
				}
			// it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
			TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
			const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
			__ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
			__ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
			pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
			pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
			__ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
			if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
				{
				--pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
				break;
				}
			// number of whole 16-bit characters that fit into one base-64 character fewer than before: (n-1)*6/16 == (n-1)*3/8
			const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
			pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
			// base-64 characters needed for that many characters, rounded up: ceil(16*k/6) == (8*k+2)/3
			pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
			const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
			if (numberOfBitsToBeZeroedInLastBase64Character!=0)
				{
				// the last kept base-64 character still carries bits of the dropped Unicode character in its low bits: clear them
				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
				}
			// still in a base-64 block with nothing pending; the next iteration appends the closing '-'
			bitBuffer=KIsInBase64Block;
			numberOfBitsInBuffer=0;
			}
		}
	// +1 because the cursor addresses the last byte written, not one past it
	aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
	return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
	}
       
   415 
       
   416  
       
   417 
       
/** Converts Unicode text into UTF-8 encoding.

This overload forwards to the three-argument overload with
aGenerateJavaConformantUtf8 set to EFalse.

@param aUtf8 On return, contains the UTF-8 encoded output string.
@param aUnicode The Unicode-encoded input string.
@return The number of unconverted characters left at the end of the input 
descriptor, or one of the error values defined in TError. */
EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
	{
	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
	}
       
   428 
       
   429 
       
   430 /**  Converts Unicode text into UTF-8 encoding.
       
   431 
       
   432 The variant of UTF-8 used internally by Java differs slightly from
       
   433 standard UTF-8. The TBool argument controls the UTF-8
       
   434 variant generated by this function. This function leaves with a 
       
   435 KErrCorrupt if the input string is corrupt. 
       
   436 
       
   437 @param aUnicode A UCS-2 encoded input string.
       
   438 @return A pointer to an HBufC8 containing the converted UTF8. */	
       
   439 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode)
       
   440  	{
       
   441 	// If aUnicode is  Null string, return an empty HBufC
       
   442 	if (aUnicode.Length() == 0)
       
   443 		{
       
   444 		HBufC8* hBuf8 = HBufC8::NewL(1);
       
   445 		return hBuf8;
       
   446 		}
       
   447 
       
   448 	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
       
   449 	const TInt length = aUnicode.Length();
       
   450 	const TInt bufsize = 100;
       
   451 	
       
   452 	TPtrC16 unicode (aUnicode);
       
   453 	TBuf8<bufsize> buf;
       
   454 	HBufC8* hBuf8 = HBufC8::NewLC(length);
       
   455 	TPtr8 utf8 = hBuf8->Des();
       
   456 
       
   457 	FOREVER
       
   458 		{
       
   459 		TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode);
       
   460 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   461 			User::Leave(KErrCorrupt);
       
   462 
       
   463 		if (utf8.Length() + buf.Length() > utf8.MaxLength())
       
   464 			{
       
   465 			// Reallocate the hBuf8
       
   466 			hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length());
       
   467 			CleanupStack::Pop();
       
   468 			CleanupStack::PushL(hBuf8);
       
   469 			utf8.Set(hBuf8->Des());
       
   470 			}
       
   471 		utf8.Append(buf);
       
   472 		if (unconverted ==0) 
       
   473 			break;
       
   474 		unicode.Set(unicode.Right(unconverted));
       
   475 		}
       
   476 	CleanupStack::Pop();
       
   477 	return hBuf8;
       
   478 	}
       
   479 
       
/** Converts Unicode text into UTF-8 encoding. 

Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.

The variant of UTF-8 used internally by Java differs slightly from standard 
UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
When it is set, U+0000 is written as the two bytes 0xC0 0x80 and surrogate
values are not combined: each one is written as its own 3-byte sequence.

Conversion stops when the next character does not fit completely into the
remaining space of aUtf8; that character and everything after it is reported
as unconverted.

@param aUtf8 On return, contains the UTF-8 encoded output string.
@param aUnicode A UCS-2 encoded input string.
@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
UTF-8. The default is EFalse.
@return The number of unconverted characters left at the end of the input descriptor, 
or one of the error values defined in TError. EErrorIllFormedInput is returned
when a high surrogate is followed by something other than a low surrogate, or
when the input is cut off after a high surrogate that is its very first
character. */
TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
											   const TDesC16& aUnicode, 
											   TBool aGenerateJavaConformantUtf8)
	{
	if (aUnicode.Length() == 0)
		{
		aUtf8.SetLength(0);
		return 0;
		}
	if (aUtf8.MaxLength() == 0)
		{
		// no room for anything: report the whole input as unconverted
		return aUnicode.Length();
		}
	
	// pUtf8 addresses the byte being written and, after each branch below, the
	// LAST byte written for the current character (not one past it).
	// pUnicode addresses the character being converted. Both "last" pointers
	// address the final valid element, not one past the end.
	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
	TBool inputIsTruncated = EFalse;
	const TUint16* pUnicode = aUnicode.Ptr();
	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
	
	// Every early "break" below first steps both pointers back by one so that
	// the common exit code (length = pUtf8 - start + 1, unconverted = last -
	// pUnicode) counts the current character as not converted.
	FOREVER
		{
		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
	
		if (pUnicode[0] < 0x80)
			{
			// ascii - 1 byte
			
			// internally java is different since the \x0000 character is 
			// translated into \xC0 \x80.
			
			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
				{
				if (pUtf8 == pointerToLastUtf8Byte)
					{
					// only one byte left, two are needed
					pUtf8--;
					pUnicode--;
					break;			
					}
				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
				}
			else
				{
				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
				}
			}
		else if (pUnicode[0] < 0x800)
			{
			// U+0080..U+07FF - 2 bytes
			
			if (pUtf8 == pointerToLastUtf8Byte)
				{
				// only one byte left, two are needed
				pUtf8--;
				pUnicode--;
				break;
				}
			
			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
			
			}

		// check to see if we have a surrogate in the stream, surrogates encode code points outside
		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.

		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
			{
			// surrogate pair - 4 bytes in utf-8
			// U+10000..U+10FFFF
			
			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
			// is there enough space to hold the character
			if ((pointerToLastUtf8Byte - pUtf8) < 3)
				{
				pUtf8--;
				pUnicode--;
				break;  // no go to the exit condition
				}
			
			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
			if (pUnicode >= pointerToLastUnicodeCharacter)
				{
				// high surrogate is the last input character: its partner is missing
				pUtf8--;
				pUnicode--;
				inputIsTruncated = ETrue;
				break; // middle of a surrogate pair. go to end condition
				}
			
			// the second half must be a low surrogate (0xDC00..0xDFFF)
			if ((pUnicode[1] & 0xfc00) != 0xdc00)
				{
				return EErrorIllFormedInput;
				}
			
			// convert utf-16 surrogate to utf-32
			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
			
			// convert utf-32 to utf-8
            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
			
            // we consumed 2 utf-16 values, move this pointer
			pUnicode++;
			}		
		else
			{
			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
			// (also reached for low surrogates and, in Java mode, for all surrogates)
			
			if (pointerToLastUtf8Byte - pUtf8 < 2)
				{
				pUtf8--;
				pUnicode--;
				break;
				}
			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
			}
		
		// stop when the input is exhausted or the output buffer is exactly full
		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
			{
			break;
			}
		
		pUtf8++;
		pUnicode++;
		
		}
	
	// A truncated surrogate pair is an error only when nothing at all was
	// converted (pUnicode was stepped back to before the start of the input);
	// otherwise the lone high surrogate is simply reported as unconverted.
	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
		{
		return EErrorIllFormedInput;
		}
	
	// +1 because pUtf8 addresses the last byte written, not one past it
	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
	return pointerToLastUnicodeCharacter-pUnicode;
	}
       
   633 
       
   634 
       
   635 
       
   636 /**  Converts text encoded using the Unicode transformation format UTF-7
       
   637 into the Unicode UCS-2 character set.
       
   638 
       
   639 @param aUtf7 The UTF-7 encoded input string.
       
   640 @return A pointer to an HBufC16 containing the converted Unicode string */	
       
   641 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7)
       
   642 	{
       
   643 		// If aUtf8 is an empty string return 
       
   644 	if (aUtf7.Length()==0)
       
   645 		{
       
   646 		HBufC16* hBuf = HBufC16::NewL(1);
       
   647 		return hBuf;
       
   648 		}
       
   649 
       
   650 	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
       
   651 	// it when needed.
       
   652 	TInt length = aUtf7.Length();
       
   653 	const TInt bufsize = 100;
       
   654 	TInt state = KStateDefault;
       
   655 
       
   656 	TPtrC8 utf7 (aUtf7);
       
   657 	TBuf<bufsize> buf;
       
   658 	HBufC16* hBuf = HBufC16::NewLC(length);
       
   659 	TPtr unicode = hBuf->Des();
       
   660 
       
   661 	FOREVER
       
   662 		{
       
   663 		TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state);
       
   664 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   665 			User::Leave(KErrCorrupt);
       
   666 
       
   667 		if (unicode.Length() + buf.Length() > unicode.MaxLength())
       
   668 			{
       
   669 			// Reallocate hBuf
       
   670 			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
       
   671 			CleanupStack::Pop();
       
   672 			CleanupStack::PushL(hBuf);
       
   673 			unicode.Set(hBuf->Des());
       
   674 			}
       
   675 		unicode.Append(buf);
       
   676 		if (unconverted ==0) 
       
   677 			break;
       
   678 		utf7.Set(utf7.Right(unconverted));
       
   679 		}
       
   680 	CleanupStack::Pop();
       
   681 	return hBuf;
       
   682 	}
       
   683 
       
   684  
       
   685 
       
   686 /** Converts text encoded using the Unicode transformation format UTF-7 into the 
       
   687 Unicode UCS-2 character set.
       
   688 
       
   689 If the conversion is achieved using a series of calls to this function, where 
       
   690 each call starts off where the previous call reached in the input descriptor, 
       
   691 the state of the conversion is stored. The initial value of the state variable 
       
   692 should be set as KStateDefault when the conversion is started, and afterwards 
       
   693 simply passed unchanged into each function call.
       
   694 
       
   695 @param aUnicode On return, contains the Unicode encoded output string.
       
   696 @param aUtf7 The UTF-7 encoded input string.
       
   697 @param aState For the first call of the function set to KStateDefault. For 
       
   698 subsequent calls, pass in the variable unchanged.
       
   699 @return The number of unconverted bytes left at the end of the input descriptor, 
       
   700 or one of the error values defined in TError. */
       
   701 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, 
       
   702 														const TDesC8& aUtf7, 
       
   703 														TInt& aState)
       
   704 	{
       
   705 	return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
       
   706 	}
       
   707 
       
   708 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode, 
       
   709 											   const TDesC8& aUtf7, 
       
   710 											   TBool aIsImapUtf7, 
       
   711 											   TInt& aState)
       
   712 	{
       	// Decodes the UTF-7 bytes in aUtf7 into 16-bit code units in aUnicode.
       	// aIsImapUtf7 is forwarded to EscapeCharacterForStartingBase64Block() and
       	// Base64Decoding(), which select the escape character and alphabet.
       	// aState carries an unfinished base-64 block between calls. As unpacked
       	// below: bit 31 is the "inside a base-64 block" flag, bits 4-7 hold the
       	// number of pending bits and the low 4 bits may hold pending data bits.
       	// Returns pointerToLastUtf7Byte-pointerToCurrentUtf7Byte, i.e. the count of
       	// input bytes after the last one consumed, or EErrorIllFormedInput.
   713 	if (aUtf7.Length()==0)
       
   714 		{
       
   715 		aUnicode.SetLength(0);
       
   716 		return 0;
       
   717 		}
       
   718 	if (aUnicode.MaxLength()==0)
       
   719 		{
       		// no room for any output: report the whole input as unconverted
   720 		return aUtf7.Length();
       
   721 		}
       
   722 	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
       	// starts one element before the output buffer and is pre-incremented before every write, so it always addresses the last code unit written
   723 	TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1);
       	// address of the last element the output descriptor can hold
   724 	const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength();
       
   725 	const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr();
       
   726 	const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1);
       
   727 	TUint currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   728 	const TUint KIsInBase64Block=0x80000000u;
       	// unpack the state saved by a previous call
   729 	TUint bitBuffer=STATIC_CAST(TUint, aState);
       
   730 	TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4);
       
   731 	bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer
       	// sanity-check the unpacked state; a bad aState panics even in release builds
   732 	if (bitBuffer&KIsInBase64Block)
       
   733 		{
       
   734 		__ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7));
       
   735 		__ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8));
       
   736 		}
       
   737 	else
       
   738 		{
       
   739 		__ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9));
       
   740 		__ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10));
       
   741 		}
       	// aState is reset here and only repacked at "end" if the call finishes inside a base-64 block
   742 	aState=KStateDefault;
       
   743 	if (bitBuffer&KIsInBase64Block)
       
   744 		{
       		// resuming inside a block: the first byte is base-64 data, so decode it up front
   745 		currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
       
   746 		}
       
   747 	TBool inputIsTruncated=EFalse;
       
   748 	FOREVER
       
   749 		{
       
   750 		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5));
       
   751 		__ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11));
       
   752 		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1));
       
   753 		__ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2));
       
   754 		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11));
       		// outside a block, the escape character either opens a base-64 block or (when followed by '-') stands for itself
   755 		if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block))
       
   756 			{
       
   757 			if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   758 				{
       				// the escape is the final input byte: step back so that it is counted as unconverted
   759 				--pointerToCurrentUtf7Byte;
       
   760 				inputIsTruncated=ETrue;
       
   761 				goto end;
       
   762 				}
       
   763 			++pointerToCurrentUtf7Byte;
       
   764 			currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   765 			if (currentUtf7Byte=='-')
       
   766 				{
       				// escape followed by '-': emit the escape character literally (done by the non-base-64 branch below)
   767 				currentUtf7Byte=escapeCharacterForStartingBase64Block;
       
   768 				}
       
   769 			else
       
   770 				{
       
   771 				currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
       				// the byte after the escape must be base-64 data (Base64Decoding presumably returns KNotInBase64Alphabet otherwise)
   772 				if (currentUtf7Byte==KNotInBase64Alphabet)
       
   773 					{
       
   774 					return EErrorIllFormedInput;
       
   775 					}
       
   776 				bitBuffer=KIsInBase64Block;
       
   777 				}
       
   778 			}
       
   779 		if (bitBuffer&KIsInBase64Block)
       
   780 			{
       			// inner loop: accumulate 6 bits per input byte and emit a code unit whenever 16 bits are available
   781 			FOREVER
       
   782 				{
       
   783 				__ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding));
       
   784 				__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12));
       				// a byte outside the base-64 alphabet ends the block
   785 				if (currentUtf7Byte==KNotInBase64Alphabet)
       
   786 					{
       					// non-zero pending bits at this point are treated as ill-formed input
   787 					if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
       
   788 						{
       
   789 						return EErrorIllFormedInput;
       
   790 						}
       					// leave base-64 mode (this also clears the KIsInBase64Block flag)
   791 					bitBuffer=0;
       
   792 					numberOfBitsInBuffer=0;
       
   793 					currentUtf7Byte=*pointerToCurrentUtf7Byte;
       					// a '-' terminator is swallowed; any other byte is left for the outer loop to handle
   794 					if (currentUtf7Byte=='-')
       
   795 						{
       
   796 						if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   797 							{
       
   798 							goto end;
       
   799 							}
       
   800 						++pointerToCurrentUtf7Byte;
       
   801 						currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   802 						}
       
   803 					break;
       
   804 					}
       
   805 				bitBuffer<<=6;
       
   806 				bitBuffer|=currentUtf7Byte;
       
   807 				bitBuffer|=KIsInBase64Block;
       
   808 				numberOfBitsInBuffer+=6;
       
   809 				// only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor)
       
   810 				if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16)))
       
   811 					{
       
   812 					numberOfBitsInBuffer-=16;
       
   813 					__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6));
       
   814 					++pointerToPreviousUnicodeCharacter;
       
   815 					*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer);
       
   816 					bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated
       
   817 					bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off
       					// output descriptor is full
   818 					if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)
       
   819 						{
       
   820 						goto end;
       
   821 						}
       
   822 					}
       				// ran out of input while still inside the base-64 block
   823 				if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   824 					{
       
   825 					inputIsTruncated=ETrue;
       
   826 					goto end;
       
   827 					}
       
   828 				++pointerToCurrentUtf7Byte;
       
   829 				currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7);
       
   830 				}
       
   831 			}
       
   832 		else
       
   833 			{
       			// outside a base-64 block each input byte is copied through as one code unit
   834 			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7));
       
   835 			++pointerToPreviousUnicodeCharacter;
       
   836 			*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte);
       			// stop when either the output is full or the input is exhausted
   837 			if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte))
       
   838 				{
       
   839 				goto end;
       
   840 				}
       
   841 			++pointerToCurrentUtf7Byte;
       
   842 			currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   843 			}
       
   844 		}
       
   845 end:
       	// finishing inside a base-64 block: pack the flag and any pending bits back into aState for the next call
   846 	if (bitBuffer&KIsInBase64Block)
       
   847 		{
       
   848 		__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13));
       
   849 		if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
       
   850 			{
       
   851 			// rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence
       
   852 			__ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14));
       			// each whole group of 6 pending bits corresponds to one input byte being handed back
   853 			pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6;
       
   854 			const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6;
       
   855 			bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift
       
   856 			bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer);
       
   857 			bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState
       
   858 			numberOfBitsInBuffer=newNumberOfBitsInBuffer;
       
   859 			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15));
       
   860 			}
       
   861 		__ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16));
       
   862 		aState=STATIC_CAST(TInt, bitBuffer);
       
   863 		aState|=(numberOfBitsInBuffer<<4);
       
   864 		__ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17));
       
   865 		bitBuffer=0;
       
   866 		numberOfBitsInBuffer=0;
       
   867 		}
       	// truncated input from which not even the first byte could be consumed is reported as an error
   868 	if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated)
       
   869 		{
       
   870 		return EErrorIllFormedInput;
       
   871 		}
       
   872 	aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr());
       
   873 	return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte;
       
   874 	}
       
   875 
       
   876 
       
   877 
       
   878 /** Converts text encoded using the Unicode transformation format UTF-8
       
   879 into the Unicode UCS-2 character set. This function leaves with an 
       
   880 error code of the input string is corrupted. 
       
   881 
       
   882 @param aUtf8 The UTF-8 encoded input string
       
   883 @return A pointer to an HBufC16 with the converted Unicode string. */	
       
   884 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8)
       
   885  	{
       
   886 	// If aUtf8 is an empty string return 
       
   887 	if (aUtf8.Length()==0)
       
   888 		{
       
   889 		HBufC16* hBuf = HBufC16::NewL(1);
       
   890 		return hBuf;
       
   891 		}
       
   892 
       
   893 	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
       
   894 	// it when needed.
       
   895 	TInt length = aUtf8.Length();
       
   896 	const TInt bufsize = 100;
       
   897 
       
   898 	TPtrC8 utf8 (aUtf8);
       
   899 	TBuf<bufsize> buf;
       
   900 	HBufC16* hBuf = HBufC16::NewLC(length);
       
   901 	TPtr unicode = hBuf->Des();
       
   902 
       
   903 	FOREVER
       
   904 		{
       
   905 		TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8);
       
   906 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   907 			User::Leave(KErrCorrupt);
       
   908 
       
   909 		if (unicode.Length() + buf.Length() > unicode.MaxLength())
       
   910 			{
       
   911 			// Reallocate hBuf
       
   912 			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
       
   913 			CleanupStack::Pop();
       
   914 			CleanupStack::PushL(hBuf);
       
   915 			unicode.Set(hBuf->Des());
       
   916 			}
       
   917 		unicode.Append(buf);
       
   918 		if (unconverted ==0) 
       
   919 			break;
       
   920 		utf8.Set(utf8.Right(unconverted));
       
   921 		}
       
   922 	CleanupStack::Pop();
       
   923 	return hBuf;
       
   924 	}
       
   925 
       
   926 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
       
   927 Unicode UCS-2 character set.
       
   928 
       
   929 @param aUnicode On return, contains the Unicode encoded output string.
       
   930 @param aUtf8 The UTF-8 encoded input string
       
   931 @return The number of unconverted bytes left at the end of the input descriptor, 
       
   932 or one of the error values defined in TError. */
       
   933 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
       
   934 	{
       
   935 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
       
   936 	}
       
   937 
       
   938 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
       
   939 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
       
   940 	{
       
   941 	if (aNumberOfUnconvertibleCharacters<=0)
       
   942 		{
       
   943 		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
       
   944 		}
       
   945 	++aNumberOfUnconvertibleCharacters;
       
   946 	}
       
   947 
       
   948 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
       
   949 Unicode UCS-2 character set.
       
   950 
       
   951 @param aUnicode On return, contains the Unicode encoded output string.
       
   952 @param aUtf8 The UTF-8 encoded input string
       
   953 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
       
   954 @return The number of unconverted bytes left at the end of the input descriptor, 
       
   955 or one of the error values defined in TError. */
       
   956 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
       
   957 	{
       
   958 	TInt dummyUnconverted, dummyUnconvertedIndex;
       
   959 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
       
   960 	}
       
   961 
       
   962 /** Converts text encoded using the Unicode transformation format UTF-8 into the 
       
   963 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
       
   964 
       
   965 The variant of UTF-8 used internally by Java differs slightly from standard 
       
   966 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
       
   967 
       
   968 @param aUnicode On return, contains the Unicode encoded output string.
       
   969 @param aUtf8 The UTF-8 encoded input string
       
   970 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
       
   971 UTF-8. The default is EFalse.
       
   972 @param aNumberOfUnconvertibleCharacters Incremented once for every ill-formed 
       
   973 sequence that is replaced by U+FFFD. The caller must initialise it before the call.
       
   974 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index 
       
   975 of the first byte of the first unconvertible character. For instance if the 
       
   976 first character in the input descriptor (aForeign) could not be converted, 
       
   977 then this parameter is set to the first byte of that character, i.e. zero. 
       
   978 A negative value is returned if all the characters were converted.
       
   979 @return The number of unconverted bytes left at the end of the input descriptor, 
       
   980 or one of the error values defined in TError. */
       
   981 
       
   982 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
       
   983  * Well formed UTF-8 Byte Sequences, full table.
       
   984  * +----------------------------------------------------------------+
       
   985  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
   986  * +--------------------+----------+----------+----------+----------+
       
   987  * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
       
   988  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
       
   989  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
       
   990  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
       
   991  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
       
   992  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
       
   993  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
       
   994  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
       
   995  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
       
   996  * +--------------------+----------+----------+----------+----------+
       
   997  * 
       
   998  * As a consequence of the well-formedness conditions specified in table 3-7,
       
   999  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
       
  1000  */
       
  1001 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
       
  1002 		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
       
  1003 	{	
       
  1004 	aUnicode.SetLength(0);
       
  1005 	
       
  1006 	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
       
  1007 		{
       
  1008 		return aUtf8.Length();
       
  1009 		}
       
  1010 
       
  1011 	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
       
  1012 	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
       
  1013 	const TUint8*         pUtf8 = aUtf8.Ptr();   
       
  1014 	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
       
  1015 	const TUint16 replacementcharacter = 0xFFFD;
       
  1016 	TUint currentUnicodeCharacter;
       
  1017 	TInt sequenceLength;
       
  1018 
       
  1019 	
       
  1020 	FOREVER
       
  1021 		{
       
  1022 		TBool illFormed=EFalse;
       
  1023 		
       
  1024 		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
       
  1025 		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
       
  1026 		
       
  1027 		sequenceLength = 1;
       
  1028 		
       
  1029 		// ascii - optimisation (i.e. it isn't a sequence)
       
  1030 		if (pUtf8[0] < 0x80)
       
  1031 			{
       
  1032 			currentUnicodeCharacter = pUtf8[0];
       
  1033 			}
       
  1034 		else
       
  1035 			{
       
  1036 			// see if well formed utf-8, use table above for reference	
       
  1037 			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
       
  1038 				{
       
  1039 				// 0xc1-0xc2 are not valid bytes
       
  1040 				sequenceLength = 2;
       
  1041 				}
       
  1042 			else if ((pUtf8[0] & 0xf0) == 0xe0)
       
  1043 				{
       
  1044 				sequenceLength = 3;
       
  1045 				}
       
  1046 			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
       
  1047 				{
       
  1048 				// 0xf5-0xff, are not valid bytes
       
  1049 				sequenceLength = 4;
       
  1050 				}
       
  1051 			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
       
  1052 				{
       
  1053 				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
       
  1054 					{
       
  1055 					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
       
  1056 					// the last character in the string) or we've
       
  1057 					// discovered a valid 0xc0 0x80 sequence.  
       
  1058 					sequenceLength = 2;
       
  1059 					}
       
  1060 				}
       
  1061 			
       
  1062 			/* checking to see if we got a valid sequence */
       
  1063 			if (sequenceLength == 1)
       
  1064 				{
       
  1065 				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
       
  1066 				currentUnicodeCharacter = replacementcharacter;
       
  1067 				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1068 						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1069 				}
       
  1070 			else
       
  1071 				{
       
  1072 				// this is a check to see if the sequence goes beyond the input 
       
  1073 				// stream.  if its not the first and only character in the input
       
  1074 				// stream this isn't an error, otherwise it is.
       
  1075 				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
       
  1076 					{
       
  1077 					// check to see if this sequence was the first character
       
  1078 					if ((pUnicode - aUnicode.Ptr()) == 0)
       
  1079 						{
       
  1080 						return EErrorIllFormedInput;
       
  1081 						}
       
  1082 					break;
       
  1083 					}			
       
  1084 				
       
  1085 				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
       
  1086 			
       
  1087 				/* check the trailing bytes, they should begin with 10 */
       
  1088 				TUint i = 1;
       
  1089 
       
  1090 				do
       
  1091 					{
       
  1092 					if ((pUtf8[i] & 0xc0) == 0x80)
       
  1093 						{
       
  1094 						// add the trailing 6 bits to the current unicode char
       
  1095 						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
       
  1096 						}
       
  1097 					else
       
  1098 						{
       
  1099 						// ill formed character (doesn't have a lead 10)
       
  1100 						currentUnicodeCharacter = replacementcharacter;
       
  1101 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1102 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1103 						illFormed=ETrue;
       
  1104 						break; 
       
  1105 						}
       
  1106 					i++;
       
  1107 					}
       
  1108 				while (i < sequenceLength);
       
  1109 				}
       
  1110 				
       
  1111 			/* conformance check.  bits of above table for reference.
       
  1112 			 * +----------------------------------------------------------------+
       
  1113 			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
  1114 			 * +--------------------+----------+----------+----------+----------+
       
  1115 			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
       
  1116 			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
       
  1117 			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
       
  1118 			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
       
  1119 			 * +--------------------+----------+----------+----------+----------+
       
  1120 			 */
       
  1121 			
       
  1122 			if (currentUnicodeCharacter != replacementcharacter)
       
  1123 				{
       
  1124 				if (sequenceLength == 3)
       
  1125 					{
       
  1126 					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
       
  1127 						{
       
  1128 						currentUnicodeCharacter = replacementcharacter;
       
  1129 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1130 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1131 						illFormed=ETrue;
       
  1132 						}
       
  1133 					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
       
  1134 						{
       
  1135 						currentUnicodeCharacter = replacementcharacter;
       
  1136 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1137 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1138 						illFormed=ETrue;
       
  1139 						}
       
  1140 					}
       
  1141 				else if (sequenceLength == 4)
       
  1142 					{
       
  1143 					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
       
  1144 						{
       
  1145 						currentUnicodeCharacter = replacementcharacter;
       
  1146 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1147 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1148 						illFormed=ETrue;
       
  1149 						}
       
  1150 					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
       
  1151 						{
       
  1152 						currentUnicodeCharacter = replacementcharacter;
       
  1153 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1154 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1155 						illFormed=ETrue;
       
  1156 						}
       
  1157 					}
       
  1158 				
       
  1159 				
       
  1160 				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
       
  1161 				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
       
  1162 				 * points D800..DFFF is ill formed */
       
  1163 				
       
  1164 				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
       
  1165 					{
       
  1166 					currentUnicodeCharacter = replacementcharacter;
       
  1167 					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1168 							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1169 					illFormed=ETrue;
       
  1170 					}	
       
  1171 				}
       
  1172 				// end conformance check
       
  1173 			}
       
  1174 
       
  1175 		// would this character generate a surrogate pair in UTF-16?
       
  1176 		if (currentUnicodeCharacter > 0xFFFF)
       
  1177 			{
       
  1178 			// is there enough space to hold a surrogate pair in the output?
       
  1179 			if (pUnicode >= pLastUnicode)
       
  1180 				{
       
  1181 				break; // no, end processing.
       
  1182 				}
       
  1183 			
       
  1184 			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
       
  1185 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
       
  1186 					
       
  1187 			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
       
  1188 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
       
  1189 			}
       
  1190 		else
       
  1191 			{
       
  1192 			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
       
  1193 			}
       
  1194 		
       
  1195 		// move the input pointer
       
  1196 		if (currentUnicodeCharacter != replacementcharacter)
       
  1197 			{
       
  1198 			pUtf8 += sequenceLength;
       
  1199 			}
       
  1200 		else if(illFormed == EFalse)
       
  1201 			{
       
  1202 			pUtf8 += (sequenceLength);
       
  1203 			}
       
  1204 		else
       
  1205 			{
       
  1206 			// we had a character we didn't recognize (i.e. it was invalid)
       
  1207 			// so move to the next character in the input
       
  1208 			pUtf8++;
       
  1209 			}
       
  1210 		
       
  1211 		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
       
  1212 			{ 
       
  1213 			break;  // we've either reached the end of the input or the end of output
       
  1214 			}
       
  1215 		}
       
  1216 
       
  1217 	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
       
  1218 	return (pLastUtf8 - pUtf8 + 1);
       
  1219 	}
       
  1220 
       
  1221 /** Given a sample text this function attempts to determine whether or not
       
  1222  *  the same text is encoded using the UTF-8 standard encoding scheme.
       
  1223 
       
  1224 @param TInt a confidence level, given at certain value.  if the given sample
       
  1225 			is UTF-8 this value will not be changed (unless > 100) then its
       
  1226 			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.
       
  1227 @param TDesC8 sample text.
       
  1228 UTF-8. The default is EFalse.
       
  1229 @return void
       
  1230  */
       
  1231 
       
  1232 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
       
  1233  * Well formed UTF-8 Byte Sequences, full table.
       
  1234  * +----------------------------------------------------------------+
       
  1235  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
  1236  * +--------------------+----------+----------+----------+----------+
       
  1237  * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
       
  1238  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
       
  1239  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
       
  1240  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
       
  1241  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
       
  1242  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
       
  1243  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
       
  1244  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
       
  1245  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
       
  1246  * +--------------------+----------+----------+----------+----------+
       
  1247  * 
       
  1248  * As a consequence of the well-formedness conditions specified in table 3-7,
       
  1249  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
       
  1250  * 
       
  1251  * Code Rules:
       
  1252  *   R1: If the string contains any non-UTF-8 characters the returned confidence
       
  1253  *       is 0.  Valid UTF-8 combinations are listed in the above table.
       
  1254  *   R2: Otherwise if the sample is shorter than 95 bytes and starts with the
       
  1255  *       UTF-8 BOM (byte order mark, EF BB BF), the returned confidence is 95.
       
  1256  *   R3: Otherwise the confidence returned is based upon the sample string 
       
  1257  *       length.
       
  1258  *   R4: If the sample string is under 75 characters, the confidence is set to 
       
  1259  *       75.
       
  1260  */
       
  1261 GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
       
  1262 	{
       
  1263 
       
  1264 	TInt sampleLength = aSample.Length();
       
  1265 	
       
  1266 	if (sampleLength == 0)
       
  1267 		{
       
  1268 		aConfidenceLevel = 89;
       
  1269 		return;
       
  1270 		}
       
  1271 	TInt bytesRemaining  = 0;
       
  1272 	TInt sequenceLength  = 0;
       
  1273 	
       
  1274 	aConfidenceLevel = sampleLength;
       
  1275 
       
  1276 	const TUint8* buffer = &aSample[0];
       
  1277 
       
  1278 	if (sampleLength < 95)
       
  1279 		{
       
  1280 		// check for the BOM
       
  1281 		if ((sampleLength >= 3) && 
       
  1282 			((buffer[0] == 0xEF) &&
       
  1283 			 (buffer[1] == 0xBB) &&
       
  1284 			 (buffer[2] == 0xBF)) 
       
  1285 			) 
       
  1286 			{
       
  1287 			aConfidenceLevel = 95;
       
  1288 			}
       
  1289 		else if (sampleLength < 75)
       
  1290 			{
       
  1291 			aConfidenceLevel = 75;
       
  1292 			}
       
  1293 		}
       
  1294 	
       
  1295 	for (TInt index = 0;index != sampleLength;index++)
       
  1296 		{
       
  1297 		
       
  1298 		if (bytesRemaining > 0)
       
  1299 			{
       
  1300 			// bytesRemaining > 0, means that a byte representing the start of a 
       
  1301 			// multibyte sequence was encountered and the bytesRemaining is the 
       
  1302 			// number of bytes to follow. 
       
  1303 			
       
  1304 			if ((buffer[index] & 0xc0) == 0x80) 
       
  1305 				{
       
  1306 				// need to check for ill-formed sequences -- all are in the 2nd byte
       
  1307 				
       
  1308 				if ((sequenceLength == 3) && (bytesRemaining == 2))
       
  1309 					{
       
  1310 					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
       
  1311 						{
       
  1312 						aConfidenceLevel = 0;
       
  1313 						break;
       
  1314 						}
       
  1315 					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
       
  1316 						{
       
  1317 						aConfidenceLevel = 0;
       
  1318 						break;
       
  1319 						}
       
  1320 					}
       
  1321 				else if ((sequenceLength == 4) && (bytesRemaining == 3))
       
  1322 					{
       
  1323 					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
       
  1324 						{
       
  1325 						aConfidenceLevel = 0;
       
  1326 						break;
       
  1327 						}
       
  1328 					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
       
  1329 						{
       
  1330 						aConfidenceLevel = 0;
       
  1331 						break;
       
  1332 						}
       
  1333 					}
       
  1334 				
       
  1335 				--bytesRemaining;
       
  1336 				continue;
       
  1337 				}
       
  1338 			else
       
  1339 				{
       
  1340 				aConfidenceLevel = 0;
       
  1341 				break;
       
  1342 				}
       
  1343 			}
       
  1344 		
       
  1345 		if (bytesRemaining == 0)
       
  1346 			{
       
  1347 			if (buffer[index] < 0x80)
       
  1348 				{
       
  1349 				// The value of aSample[index] is in the range 0x00-0x7f
       
  1350 				//UTF8 maintains ASCII transparency. So it's a valid
       
  1351 				//UTF8. Do nothing, check next value.
       
  1352 				continue;
       
  1353 				}
       
  1354 			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
       
  1355 				{
       
  1356 				// valid start of a 2 byte sequence (see conformance note)
       
  1357 				sequenceLength = 2;
       
  1358 				bytesRemaining = 1;
       
  1359 				}
       
  1360 			else if ((buffer[index] & 0xf0) == 0xe0)
       
  1361 				{
       
  1362 				// valid start of a 3 byte sequence
       
  1363 				sequenceLength = 3;
       
  1364 				bytesRemaining = 2;
       
  1365 				}
       
  1366 			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
       
  1367 				{
       
  1368 				// valid start of a 4 byte sequence (see conformance note)
       
  1369 				sequenceLength = 4;
       
  1370 				bytesRemaining = 3;
       
  1371 				}	
       
  1372 			else
       
  1373 				{
       
  1374 				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
       
  1375 				aConfidenceLevel = 0;
       
  1376 				break;
       
  1377 				}
       
  1378 			}
       
  1379 		} // for 
       
  1380 	
       
  1381 	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
       
  1382 	}
       
  1383 
       
  1384 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample)
       
  1385 	{
       
  1386 	TInt sampleLength = aSample.Length();
       
  1387 	aConfidenceLevel = 70;
       
  1388 	for (TInt i=0; i<sampleLength; ++i)
       
  1389 		{
       
  1390 		// UTF-7 value ranges only 7 bits 
       
  1391 		if((aSample[i]&0x80)!=0x00)
       
  1392 			{
       
  1393 			aConfidenceLevel= 0;
       
  1394 			break;
       
  1395 			}
       
  1396 	
       
  1397 		// there is no "~" in UTF-7 encoding. So if find either, it's not UTF-7
       
  1398 		else if (char(aSample[i])=='~')
       
  1399 			{
       
  1400 			aConfidenceLevel = 0; 
       
  1401 			break;
       
  1402 			}
       
  1403 
       
  1404 		// The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format
       
  1405 		else if ( (aSample[i]==0x1b) && (i <sampleLength-1) )
       
  1406 			{
       
  1407 			static const TInt smsExtensionTable[11] = 
       
  1408 				{0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65};
       
  1409 			TInt increment1 = i+1;
       
  1410 			if (increment1>= sampleLength)
       
  1411 				break;
       
  1412 			for (TInt j=0; j < 11; ++j)
       
  1413 				{
       
  1414 				if (aSample[increment1] == smsExtensionTable[j])
       
  1415 					{
       
  1416 					aConfidenceLevel-=10;
       
  1417 					}
       
  1418 				}
       
  1419 			}
       
  1420 		// The UTF-7 escape char is 0x2b. The values that follow the escape sequence
       
  1421 		// the values following the escape char value must belong to the modified base64
       
  1422 		// or '-' else it is an ill-formed sequence, so probably not UTF-7
       
  1423 		else if ( (aSample[i]==0x2b)  && (i <sampleLength-1) )
       
  1424 			{
       
  1425 			TInt increment1 = i+1;
       
  1426 			if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) ||
       
  1427 				((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) ||
       
  1428 				((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a))) 
       
  1429 				{
       
  1430 				aConfidenceLevel+=5;
       
  1431 				}
       
  1432 			else
       
  1433 				{
       
  1434 				aConfidenceLevel-=15;
       
  1435 				}
       
  1436 			i++; // should this be here or up in the if loop ??
       
  1437 			}
       
  1438 		} //for
       
  1439 	aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
       
  1440 	}