symport/charconv/framework/src/charconv/utf.cpp
changeset 1 0a7b44b10206
child 2 806186ab5e14
equal deleted inserted replaced
0:c55016431358 1:0a7b44b10206
       
     1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Symbian Foundation License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 //
       
    15 
       
    16 #include <e32std.h>
       
    17 #include <e32base.h>
       
    18 #include <utf.h>
       
    19 
       
    20 const TUint KNotInBase64Alphabet=KMaxTUint;
       
    21 
       
    22 enum TPanic
       
    23 	{
       
    24 	EPanicBad6BitNumber=1,
       
    25 	EPanicBadUtf7Pointers1,
       
    26 	EPanicBadUtf7Pointers2,
       
    27 	EPanicBadUtf7Pointers3,
       
    28 	EPanicBadUtf7Pointers4,
       
    29 	EPanicBadUtf7Pointers5,
       
    30 	EPanicBadUtf7Pointers6,
       
    31 	EPanicBadUtf7Pointers7,
       
    32 	EPanicBadUtf7Pointers8,
       
    33 	EPanicBadUtf7Pointers9,
       
    34 	EPanicBadUtf7Pointers10,
       
    35 	EPanicBadUtf7Pointers11,
       
    36 	EPanicNotInBase64Block,
       
    37 	EPanicBadUnicodePointers1,
       
    38 	EPanicBadUnicodePointers2,
       
    39 	EPanicBadUnicodePointers3,
       
    40 	EPanicBadUnicodePointers4,
       
    41 	EPanicBadUnicodePointers5,
       
    42 	EPanicBadUnicodePointers6,
       
    43 	EPanicBadUnicodePointers7,
       
    44 	EPanicBadUnicodePointers8,
       
    45 	EPanicBadUnicodePointers9,
       
    46 	EPanicBadUnicodePointers10,
       
    47 	EPanicBadBitBufferState1,
       
    48 	EPanicBadBitBufferState2,
       
    49 	EPanicBadBitBufferState3,
       
    50 	EPanicBadBitBufferState4,
       
    51 	EPanicBadBitBufferState5,
       
    52 	EPanicBadBitBufferState6,
       
    53 	EPanicBadBitBufferState7,
       
    54 	EPanicBadBitBufferState8,
       
    55 	EPanicBadBitBufferState9,
       
    56 	EPanicBadBitBufferState10,
       
    57 	EPanicBadBitBufferState11,
       
    58 	EPanicBadBitBufferState12,
       
    59 	EPanicBadBitBufferState13,
       
    60 	EPanicBadBitBufferState14,
       
    61 	EPanicBadBitBufferState15,
       
    62 	EPanicBadBitBufferState16,
       
    63 	EPanicBadBitBufferState17,
       
    64 	EPanicUnexpectedNumberOfLoopIterations,
       
    65 	EPanicInitialEscapeCharacterButNoBase64,
       
    66 	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
       
    67 	EPanicBadUtf8Pointers1,
       
    68 	EPanicBadUtf8Pointers2,
       
    69 	EPanicBadUtf8Pointers3,
       
    70 	EPanicBadUtf8Pointers4,
       
    71 	EPanicBadUtf8Pointers5,
       
    72 	EPanicBadUtf8Pointers6,
       
    73 	EPanicBadUtf8Pointers7,
       
    74 	EPanicOutOfSyncUtf7Byte1,
       
    75 	EPanicOutOfSyncUtf7Byte2,
       
    76 	EPanicOutOfSyncBase64Decoding
       
    77 	};
       
    78 
       
    79 _LIT(KLitPanicText, "CHARCONV-UTF");
       
    80 
       
    81 LOCAL_C void Panic(TPanic aPanic)
       
    82 	{
       
    83 	User::Panic(KLitPanicText, aPanic);
       
    84 	}
       
    85 
       
    86 inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
       
    87 
       
    88 LOCAL_C TUint Base64Decoding(TUint aMemberOfBase64Alphabet, TBool aIsImapUtf7)
       
    89 	{
       
    90 	if ((aMemberOfBase64Alphabet>='A') && (aMemberOfBase64Alphabet<='Z'))
       
    91 		{
       
    92 		return aMemberOfBase64Alphabet-'A';
       
    93 		}
       
    94 	if ((aMemberOfBase64Alphabet>='a') && (aMemberOfBase64Alphabet<='z'))
       
    95 		{
       
    96 		return aMemberOfBase64Alphabet-('a'-26);
       
    97 		}
       
    98 	if ((aMemberOfBase64Alphabet>='0') && (aMemberOfBase64Alphabet<='9'))
       
    99 		{
       
   100 		return aMemberOfBase64Alphabet+((26*2)-'0');
       
   101 		}
       
   102 	if (aMemberOfBase64Alphabet=='+')
       
   103 		{
       
   104 		return 62;
       
   105 		}
       
   106 	if (aMemberOfBase64Alphabet==STATIC_CAST(TUint, aIsImapUtf7? ',': '/'))
       
   107 		{
       
   108 		return 63;
       
   109 		}
       
   110 	return KNotInBase64Alphabet;
       
   111 	}
       
   112 
       
   113 LOCAL_C TUint Base64Encoding(TUint a6BitNumber, TBool aIsImapUtf7)
       
   114 	{
       
   115 	__ASSERT_DEBUG(a6BitNumber<64, Panic(EPanicBad6BitNumber));
       
   116 	if ((a6BitNumber==63) && aIsImapUtf7)
       
   117 		{
       
   118 		return ',';
       
   119 		}
       
   120 	static const TUint8 base64Alphabet[64]={'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};
       
   121 	return base64Alphabet[a6BitNumber];
       
   122 	}
       
   123 
       
   124 LOCAL_C TUint8* PointerToEscapeCharacterStartingBase64Block(TUint8* aPointerToUtf7Byte, const TUint8* aPointerToFirstUtf7Byte, TBool aIsImapUtf7)
       
   125 	{
       
   126 	__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers1));
       
   127 	TUint8* pointerToCandidateEscapeCharacter=NULL;
       
   128 	FOREVER
       
   129 		{
       
   130 		const TUint utf7Byte=*aPointerToUtf7Byte;
       
   131 		if (utf7Byte==EscapeCharacterForStartingBase64Block(aIsImapUtf7))
       
   132 			{
       
   133 			pointerToCandidateEscapeCharacter=aPointerToUtf7Byte;
       
   134 			}
       
   135 		else if (Base64Decoding(utf7Byte, aIsImapUtf7)==KNotInBase64Alphabet)
       
   136 			{
       
   137 			break;
       
   138 			}
       
   139 		__ASSERT_DEBUG(aPointerToUtf7Byte>=aPointerToFirstUtf7Byte, Panic(EPanicBadUtf7Pointers2));
       
   140 		if (aPointerToUtf7Byte<=aPointerToFirstUtf7Byte)
       
   141 			{
       
   142 			break;
       
   143 			}
       
   144 		--aPointerToUtf7Byte;
       
   145 		}
       
   146 	__ASSERT_DEBUG(pointerToCandidateEscapeCharacter!=NULL, Panic(EPanicNotInBase64Block));
       
   147 	return pointerToCandidateEscapeCharacter;
       
   148 	}
       
   149 
       
   150 LOCAL_C TBool EncodeInUtf7Directly(TUint aUnicodeCharacter, TBool aIsImapUtf7, TBool aEncodeOptionalDirectCharactersInBase64)
       
   151 	{
       
   152 	if (aIsImapUtf7)
       
   153 		{
       
   154 		return (aUnicodeCharacter>=0x0020) && (aUnicodeCharacter<=0x007e);
       
   155 		}
       
   156 	if ((aUnicodeCharacter>=0x0021) && (aUnicodeCharacter<=0x007d))
       
   157 		{
       
   158 		if (aEncodeOptionalDirectCharactersInBase64)
       
   159 			{
       
   160 			return (((aUnicodeCharacter>=0x0041) && (aUnicodeCharacter<=0x005a)) ||
       
   161 					((aUnicodeCharacter>=0x0061) && (aUnicodeCharacter<=0x007a)) ||
       
   162 					((aUnicodeCharacter>=0x0027) && (aUnicodeCharacter<=0x0029)) ||
       
   163 					((aUnicodeCharacter>=0x002b) && (aUnicodeCharacter<=0x003a)) ||
       
   164 					(aUnicodeCharacter==0x003f));
       
   165 			}
       
   166 		return aUnicodeCharacter!=0x005c;
       
   167 		}
       
   168 	return (aUnicodeCharacter==0x0020) || (aUnicodeCharacter==0x0009) || (aUnicodeCharacter==0x000d) || (aUnicodeCharacter==0x000a);
       
   169 	}
       
   170 
       
   171 inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
       
   172 	{
       
   173 	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
       
   174 	}
       
   175 
       
   176 
       
   177 
       
   178 /**  Converts Unicode text into UTF-7 encoding. The fucntion leaves with
       
   179 KErrCorrupt if the input string is corrupt.
       
   180 
       
   181 @param aUnicode A UCS-2 encoded input string.
       
   182 @param aEncodeOptionalDirectCharactersInBase64  If ETrue then
       
   183 characters from UTF-7 set O (optional direct characters) are encoded in
       
   184 Modified Base64. If EFalse the characters are encoded directly,
       
   185 as their ASCII equivalents.
       
   186 @return A descriptor containing the UTF-7 encoded output string. */
       
   187 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf7L(
       
   188 										const TDesC16& aUnicode,
       
   189 										TBool aEncodeOptionalDirectCharactersInBase64)
       
   190 	{
       
   191 	// If aUnicode is  Null string, return an empty HBufC
       
   192 	if (aUnicode.Length() == 0)
       
   193 		{
       
   194 		HBufC8* hBuf8 = HBufC8::NewL(1);
       
   195 		return hBuf8;
       
   196 		}
       
   197 
       
   198 	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
       
   199 	TInt length = aUnicode.Length();
       
   200 	const TInt bufsize = 100;
       
   201 
       
   202 	TPtrC16 unicode (aUnicode);
       
   203 	TBuf8<bufsize> buf;
       
   204 	HBufC8* hBuf8 = HBufC8::NewLC(length);
       
   205 	TPtr8 utf7 = hBuf8->Des();
       
   206 
       
   207 	FOREVER
       
   208 		{
       
   209 		TInt unconverted = ConvertFromUnicodeToUtf7(buf, unicode, aEncodeOptionalDirectCharactersInBase64);
       
   210 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   211 			User::Leave(KErrCorrupt);
       
   212 
       
   213 		if (utf7.Length() + buf.Length() > utf7.MaxLength())
       
   214 			{
       
   215 			// Reallocate the hBuf8
       
   216 			hBuf8 = hBuf8->ReAllocL(utf7.Length() + buf.Length());
       
   217 			CleanupStack::Pop();
       
   218 			CleanupStack::PushL(hBuf8);
       
   219 			utf7.Set(hBuf8->Des());
       
   220 			}
       
   221 		utf7.Append(buf);
       
   222 		if (unconverted ==0)
       
   223 			break;
       
   224 		unicode.Set(unicode.Right(unconverted));
       
   225 		}
       
   226 	CleanupStack::Pop();
       
   227 	return hBuf8;
       
   228 
       
   229 	}
       
   230 
       
   231 /** Converts Unicode text into UTF-7 encoding.
       
   232 
       
   233 @param aUtf7 On return, contains the UTF-7 encoded output string.
       
   234 @param aUnicode A UCS-2 encoded input string.
       
   235 @param aEncodeOptionalDirectCharactersInBase64 If ETrue then characters from
       
   236 UTF-7 set O (optional direct characters) are encoded in Modified Base64. If
       
   237 EFalse the characters are encoded directly, as their ASCII equivalents.
       
   238 @return The number of unconverted characters left at the end of the input
       
   239 descriptor, or one of the error values defined in TError. */
       
   240 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(
       
   241 										TDes8& aUtf7,
       
   242 										const TDesC16& aUnicode,
       
   243 										TBool aEncodeOptionalDirectCharactersInBase64)
       
   244 	{
       
   245 	return ConvertFromUnicodeToUtf7(aUtf7, aUnicode, EFalse, aEncodeOptionalDirectCharactersInBase64);
       
   246 	}
       
   247 
       
   248 TInt CnvUtfConverter::ConvertFromUnicodeToUtf7(TDes8& aUtf7,
       
   249 											   const TDesC16& aUnicode,
       
   250 											   TBool aIsImapUtf7,
       
   251 											   TBool aEncodeOptionalDirectCharactersInBase64)
       
   252 	{
       
   253 	if (aUnicode.Length()==0)
       
   254 		{
       
   255 		aUtf7.SetLength(0);
       
   256 		return 0;
       
   257 		}
       
   258 	if (aUtf7.MaxLength()==0)
       
   259 		{
       
   260 		return aUnicode.Length();
       
   261 		}
       
   262 	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
       
   263 	TUint8* pointerToPreviousUtf7Byte=CONST_CAST(TUint8*, aUtf7.Ptr()-1);
       
   264 	const TUint8* const pointerToLastUtf7Byte=pointerToPreviousUtf7Byte+aUtf7.MaxLength();
       
   265 	const TUint16* pointerToPreviousUnicodeCharacter=aUnicode.Ptr()-1;
       
   266 	const TUint16* const pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.Length();
       
   267 	const TUint KIsInBase64Block=0x80000000u;
       
   268 	TUint bitBuffer=0;
       
   269 	TInt numberOfBitsInBuffer=0;
       
   270 	FOREVER
       
   271 		{
       
   272 		__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers3));
       
   273 		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers1));
       
   274 		TUint currentUnicodeCharacter=(pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)? 0: *(pointerToPreviousUnicodeCharacter+1);
       
   275 		if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || EncodeInUtf7Directly(currentUnicodeCharacter, aIsImapUtf7, aEncodeOptionalDirectCharactersInBase64))
       
   276 			{
       
   277 			__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState1));
       
   278 			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState2));
       
   279 			if (bitBuffer&KIsInBase64Block)
       
   280 				{
       
   281 				if (numberOfBitsInBuffer!=0)
       
   282 					{
       
   283 					if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<2) // make sure there is enough space for the trailing '-' as well as the remains of the bitBuffer as the KIsInBase64Block flag is about to turned off, thus the trailing '-' may never get written
       
   284 						{
       
   285 						break;
       
   286 						}
       
   287 					++pointerToPreviousUtf7Byte;
       
   288 					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
       
   289 					}
       
   290 				else
       
   291 					{
       
   292 					if (pointerToPreviousUtf7Byte==pointerToLastUtf7Byte)
       
   293 						{
       
   294 						break;
       
   295 						}
       
   296 					}
       
   297 				++pointerToPreviousUtf7Byte;
       
   298 				*pointerToPreviousUtf7Byte='-';
       
   299 				bitBuffer=0;
       
   300 				numberOfBitsInBuffer=0;
       
   301 				}
       
   302 			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<=pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers2));
       
   303 			if (pointerToPreviousUnicodeCharacter>=pointerToLastUnicodeCharacter)
       
   304 				{
       
   305 				break;
       
   306 				}
       
   307 			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers4));
       
   308 			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<((currentUnicodeCharacter==escapeCharacterForStartingBase64Block)? 2: 1))
       
   309 				{
       
   310 				break;
       
   311 				}
       
   312 			++pointerToPreviousUtf7Byte;
       
   313 			*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, currentUnicodeCharacter);
       
   314 			++pointerToPreviousUnicodeCharacter;
       
   315 			if (currentUnicodeCharacter==escapeCharacterForStartingBase64Block)
       
   316 				{
       
   317 				++pointerToPreviousUtf7Byte;
       
   318 				*pointerToPreviousUtf7Byte='-';
       
   319 				}
       
   320 			}
       
   321 		else
       
   322 			{
       
   323 			{
       
   324 			TInt numberOfUtf7BytesRequired=(numberOfBitsInBuffer+16)/6; // "(numberOfBitsInBuffer+16)/6" is the number of iterations that will happen in the while loop below
       
   325 			if (~bitBuffer&KIsInBase64Block)
       
   326 				{
       
   327 				++numberOfUtf7BytesRequired; // for the initial escapeCharacterForStartingBase64Block
       
   328 				}
       
   329 			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte<numberOfUtf7BytesRequired)
       
   330 				{
       
   331 				break;
       
   332 				}
       
   333 			}
       
   334 			if (~bitBuffer&KIsInBase64Block)
       
   335 				{
       
   336 				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers5));
       
   337 				++pointerToPreviousUtf7Byte;
       
   338 				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, escapeCharacterForStartingBase64Block);
       
   339 				}
       
   340 			bitBuffer<<=16;
       
   341 			bitBuffer|=currentUnicodeCharacter;
       
   342 			numberOfBitsInBuffer+=16;
       
   343 			++pointerToPreviousUnicodeCharacter;
       
   344 			__ASSERT_DEBUG(numberOfBitsInBuffer<=20, Panic(EPanicBadBitBufferState3));
       
   345 			while (numberOfBitsInBuffer>=6)
       
   346 				{
       
   347 				numberOfBitsInBuffer-=6;
       
   348 				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers6));
       
   349 				++pointerToPreviousUtf7Byte;
       
   350 				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer>>numberOfBitsInBuffer)&0x3f, aIsImapUtf7));
       
   351 				}
       
   352 			bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - not strictly necessary but it leaves the buffer in a cleaner state
       
   353 			bitBuffer|=KIsInBase64Block;
       
   354 			}
       
   355 		}
       
   356 	__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (numberOfBitsInBuffer==0), Panic(EPanicBadBitBufferState4));
       
   357 	__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState5));
       
   358 	if (bitBuffer&KIsInBase64Block)
       
   359 		{
       
   360 #if defined(_DEBUG)
       
   361 		TInt numberOfLoopIterations=1;
       
   362 #endif
       
   363 		FOREVER // there should never be more than 2 iterations of this loop - the first "if" should always succeed the second time if it doesn't succeed the first time
       
   364 			{
       
   365 			__ASSERT_DEBUG(pointerToPreviousUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers7));
       
   366 			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState6));
       
   367 			__ASSERT_DEBUG(numberOfLoopIterations<=2, Panic(EPanicUnexpectedNumberOfLoopIterations));
       
   368 #if defined(_DEBUG)
       
   369 			++numberOfLoopIterations;
       
   370 #endif
       
   371 			if (pointerToLastUtf7Byte-pointerToPreviousUtf7Byte>=((numberOfBitsInBuffer==0)? 1: 2)) // if there's room to finish off the base-64 sequence by (i) flushing the bit-buffer and (ii) appending the trailing '-'
       
   372 				{
       
   373 				if (numberOfBitsInBuffer!=0)
       
   374 					{
       
   375 					__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers8));
       
   376 					++pointerToPreviousUtf7Byte;
       
   377 					*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding((bitBuffer<<(6-numberOfBitsInBuffer))&0x3f, aIsImapUtf7));
       
   378 					}
       
   379 				__ASSERT_DEBUG(pointerToPreviousUtf7Byte<pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers9));
       
   380 				++pointerToPreviousUtf7Byte;
       
   381 				*pointerToPreviousUtf7Byte='-';
       
   382 				break;
       
   383 				}
       
   384 			// it is now necessary to move back pointerToPreviousUtf7Byte so that the base-64 sequence can be terminated - note it must be terminated on a Unicode character boundary hence the reason why pointerToPreviousUnicodeCharacter may be moved back too
       
   385 			TUint8* pointerToEscapeCharacterStartingBase64Block=PointerToEscapeCharacterStartingBase64Block(pointerToPreviousUtf7Byte, aUtf7.Ptr(), aIsImapUtf7);
       
   386 			const TInt oldNumberOfBase64Characters=pointerToPreviousUtf7Byte-pointerToEscapeCharacterStartingBase64Block;
       
   387 			__ASSERT_DEBUG(oldNumberOfBase64Characters>0, Panic(EPanicInitialEscapeCharacterButNoBase64));
       
   388 			__ASSERT_DEBUG(((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)%16==0, Panic(EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary));
       
   389 			pointerToPreviousUnicodeCharacter-=((oldNumberOfBase64Characters*6)+numberOfBitsInBuffer)/16; // move back pointerToPreviousUnicodeCharacter to before the equivalent of the base-64 sequence
       
   390 			pointerToPreviousUtf7Byte=pointerToEscapeCharacterStartingBase64Block;
       
   391 			__ASSERT_DEBUG(*pointerToPreviousUtf7Byte==escapeCharacterForStartingBase64Block, Panic(EPanicBadUtf7Pointers10));
       
   392 			if (oldNumberOfBase64Characters<4) // if the new base-64 sequence will be so short that it won't even be able to contain the UTF-7 encoding of a single Unicode character
       
   393 				{
       
   394 				--pointerToPreviousUtf7Byte; // move back pointerToPreviousUtf7Byte to before the escapeCharacterForStartingBase64Block
       
   395 				break;
       
   396 				}
       
   397 			const TInt newNumberOfUnicodeCharacters=((oldNumberOfBase64Characters-1)*3)/8;
       
   398 			pointerToPreviousUnicodeCharacter+=newNumberOfUnicodeCharacters;
       
   399 			pointerToPreviousUtf7Byte+=((newNumberOfUnicodeCharacters*8)+2)/3;
       
   400 			const TInt numberOfBitsToBeZeroedInLastBase64Character=(newNumberOfUnicodeCharacters%3)*2;
       
   401 			if (numberOfBitsToBeZeroedInLastBase64Character!=0)
       
   402 				{
       
   403 				*pointerToPreviousUtf7Byte=STATIC_CAST(TUint8, Base64Encoding(Base64Decoding(*pointerToPreviousUtf7Byte, aIsImapUtf7)&0x3f&~((1<<numberOfBitsToBeZeroedInLastBase64Character)-1), aIsImapUtf7));
       
   404 				}
       
   405 			bitBuffer=KIsInBase64Block;
       
   406 			numberOfBitsInBuffer=0;
       
   407 			}
       
   408 		}
       
   409 	aUtf7.SetLength((pointerToPreviousUtf7Byte-aUtf7.Ptr())+1);
       
   410 	return pointerToLastUnicodeCharacter-pointerToPreviousUnicodeCharacter;
       
   411 	}
       
   412 
       
   413 
       
   414 
       
   415 /** Converts Unicode text into UTF-8 encoding.
       
   416 
       
   417 @param aUtf8 On return, contains the UTF-8 encoded output string.
       
   418 @param aUnicode The Unicode-encoded input string.
       
   419 @return The number of unconverted characters left at the end of the input
       
   420 descriptor, or one of the error values defined in TError. */
       
   421 EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
       
   422 	{
       
   423 	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
       
   424 	}
       
   425 
       
   426 
       
   427 /**  Converts Unicode text into UTF-8 encoding.
       
   428 
       
   429 The variant of UTF-8 used internally by Java differs slightly from
       
   430 standard UTF-8. The TBool argument controls the UTF-8
       
   431 variant generated by this function. This function leaves with a
       
   432 KErrCorrupt if the input string is corrupt.
       
   433 
       
   434 @param aUnicode A UCS-2 encoded input string.
       
   435 @return A pointer to an HBufC8 containing the converted UTF8. */
       
   436 EXPORT_C HBufC8* CnvUtfConverter::ConvertFromUnicodeToUtf8L(const TDesC16& aUnicode)
       
   437  	{
       
   438 	// If aUnicode is  Null string, return an empty HBufC
       
   439 	if (aUnicode.Length() == 0)
       
   440 		{
       
   441 		HBufC8* hBuf8 = HBufC8::NewL(1);
       
   442 		return hBuf8;
       
   443 		}
       
   444 
       
   445 	// Otherwise, convert and store result in a buffer, reallocating that buffer if needed.
       
   446 	const TInt length = aUnicode.Length();
       
   447 	const TInt bufsize = 100;
       
   448 
       
   449 	TPtrC16 unicode (aUnicode);
       
   450 	TBuf8<bufsize> buf;
       
   451 	HBufC8* hBuf8 = HBufC8::NewLC(length);
       
   452 	TPtr8 utf8 = hBuf8->Des();
       
   453 
       
   454 	FOREVER
       
   455 		{
       
   456 		TInt unconverted = ConvertFromUnicodeToUtf8(buf, unicode);
       
   457 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   458 			User::Leave(KErrCorrupt);
       
   459 
       
   460 		if (utf8.Length() + buf.Length() > utf8.MaxLength())
       
   461 			{
       
   462 			// Reallocate the hBuf8
       
   463 			hBuf8 = hBuf8->ReAllocL(utf8.Length() + buf.Length());
       
   464 			CleanupStack::Pop();
       
   465 			CleanupStack::PushL(hBuf8);
       
   466 			utf8.Set(hBuf8->Des());
       
   467 			}
       
   468 		utf8.Append(buf);
       
   469 		if (unconverted ==0)
       
   470 			break;
       
   471 		unicode.Set(unicode.Right(unconverted));
       
   472 		}
       
   473 	CleanupStack::Pop();
       
   474 	return hBuf8;
       
   475 	}
       
   476 
       
   477 /** Converts Unicode text into UTF-8 encoding.
       
   478 
       
   479 Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
       
   480 
       
   481 The variant of UTF-8 used internally by Java differs slightly from standard
       
   482 UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
       
   483 
       
   484 @param aUtf8 On return, contains the UTF-8 encoded output string.
       
   485 @param aUnicode A UCS-2 encoded input string.
       
   486 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
       
   487 UTF-8. The default is EFalse.
       
   488 @return The number of unconverted characters left at the end of the input descriptor,
       
   489 or one of the error values defined in TError. */
       
   490 TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8,
       
   491 											   const TDesC16& aUnicode,
       
   492 											   TBool aGenerateJavaConformantUtf8)
       
   493 	{
       
   494 	if (aUnicode.Length() == 0)
       
   495 		{
       
   496 		aUtf8.SetLength(0);
       
   497 		return 0;
       
   498 		}
       
   499 	if (aUtf8.MaxLength() == 0)
       
   500 		{
       
   501 		return aUnicode.Length();
       
   502 		}
       
   503 
       
   504 	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
       
   505 	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
       
   506 	TBool inputIsTruncated = EFalse;
       
   507 	const TUint16* pUnicode = aUnicode.Ptr();
       
   508 	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
       
   509 
       
   510 	FOREVER
       
   511 		{
       
   512 		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
       
   513 		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
       
   514 
       
   515 		if (pUnicode[0] < 0x80)
       
   516 			{
       
   517 			// ascii - 1 byte
       
   518 
       
   519 			// internally java is different since the \x0000 character is
       
   520 			// translated into \xC0 \x80.
       
   521 
       
   522 			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
       
   523 				{
       
   524 				if (pUtf8 == pointerToLastUtf8Byte)
       
   525 					{
       
   526 					pUtf8--;
       
   527 					pUnicode--;
       
   528 					break;
       
   529 					}
       
   530 				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
       
   531 				*pUtf8   = STATIC_CAST(TUint8, 0x80);
       
   532 				}
       
   533 			else
       
   534 				{
       
   535 				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
       
   536 				}
       
   537 			}
       
   538 		else if (pUnicode[0] < 0x800)
       
   539 			{
       
   540 			// U+0080..U+07FF - 2 bytes
       
   541 
       
   542 			if (pUtf8 == pointerToLastUtf8Byte)
       
   543 				{
       
   544 				pUtf8--;
       
   545 				pUnicode--;
       
   546 				break;
       
   547 				}
       
   548 
       
   549 			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
       
   550 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
       
   551 
       
   552 			}
       
   553 
       
   554 		// check to see if we have a surrogate in the stream, surrogates encode code points outside
       
   555 		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
       
   556 
       
   557 		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
       
   558 			{
       
   559 			// surrogate pair - 4 bytes in utf-8
       
   560 			// U+10000..U+10FFFF
       
   561 
       
   562 			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
       
   563 			// is there enough space to hold the character
       
   564 			if ((pointerToLastUtf8Byte - pUtf8) < 3)
       
   565 				{
       
   566 				pUtf8--;
       
   567 				pUnicode--;
       
   568 				break;  // no go to the exit condition
       
   569 				}
       
   570 
       
   571 			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
       
   572 			if (pUnicode >= pointerToLastUnicodeCharacter)
       
   573 				{
       
   574 				pUtf8--;
       
   575 				pUnicode--;
       
   576 				inputIsTruncated = ETrue;
       
   577 				break; // middle of a surrogate pair. go to end condition
       
   578 				}
       
   579 
       
   580 			if ((pUnicode[1] & 0xfc00) != 0xdc00)
       
   581 				{
       
   582 				return EErrorIllFormedInput;
       
   583 				}
       
   584 
       
   585 			// convert utf-16 surrogate to utf-32
       
   586 			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
       
   587 
       
   588 			// convert utf-32 to utf-8
       
   589             *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));
       
   590             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
       
   591             *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
       
   592             *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
       
   593 
       
   594             // we consumed 2 utf-16 values, move this pointer
       
   595 			pUnicode++;
       
   596 			}
       
   597 		else
       
   598 			{
       
   599 			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
       
   600 
       
   601 			if (pointerToLastUtf8Byte - pUtf8 < 2)
       
   602 				{
       
   603 				pUtf8--;
       
   604 				pUnicode--;
       
   605 				break;
       
   606 				}
       
   607 			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
       
   608 			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
       
   609 			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
       
   610 			}
       
   611 
       
   612 		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
       
   613 			{
       
   614 			break;
       
   615 			}
       
   616 
       
   617 		pUtf8++;
       
   618 		pUnicode++;
       
   619 
       
   620 		}
       
   621 
       
   622 	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
       
   623 		{
       
   624 		return EErrorIllFormedInput;
       
   625 		}
       
   626 
       
   627 	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
       
   628 	return pointerToLastUnicodeCharacter-pUnicode;
       
   629 	}
       
   630 
       
   631 
       
   632 
       
   633 /**  Converts text encoded using the Unicode transformation format UTF-7
       
   634 into the Unicode UCS-2 character set.
       
   635 
       
   636 @param aUtf7 The UTF-7 encoded input string.
       
   637 @return A pointer to an HBufC16 containing the converted Unicode string */
       
   638 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf7L(const TDesC8& aUtf7)
       
   639 	{
       
   640 		// If aUtf8 is an empty string return
       
   641 	if (aUtf7.Length()==0)
       
   642 		{
       
   643 		HBufC16* hBuf = HBufC16::NewL(1);
       
   644 		return hBuf;
       
   645 		}
       
   646 
       
   647 	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
       
   648 	// it when needed.
       
   649 	TInt length = aUtf7.Length();
       
   650 	const TInt bufsize = 100;
       
   651 	TInt state = KStateDefault;
       
   652 
       
   653 	TPtrC8 utf7 (aUtf7);
       
   654 	TBuf<bufsize> buf;
       
   655 	HBufC16* hBuf = HBufC16::NewLC(length);
       
   656 	TPtr unicode = hBuf->Des();
       
   657 
       
   658 	FOREVER
       
   659 		{
       
   660 		TInt unconverted = ConvertToUnicodeFromUtf7(buf, utf7, state);
       
   661 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   662 			User::Leave(KErrCorrupt);
       
   663 
       
   664 		if (unicode.Length() + buf.Length() > unicode.MaxLength())
       
   665 			{
       
   666 			// Reallocate hBuf
       
   667 			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
       
   668 			CleanupStack::Pop();
       
   669 			CleanupStack::PushL(hBuf);
       
   670 			unicode.Set(hBuf->Des());
       
   671 			}
       
   672 		unicode.Append(buf);
       
   673 		if (unconverted ==0)
       
   674 			break;
       
   675 		utf7.Set(utf7.Right(unconverted));
       
   676 		}
       
   677 	CleanupStack::Pop();
       
   678 	return hBuf;
       
   679 	}
       
   680 
       
   681 
       
   682 
       
   683 /** Converts text encoded using the Unicode transformation format UTF-7 into the
       
   684 Unicode UCS-2 character set.
       
   685 
       
   686 If the conversion is achieved using a series of calls to this function, where
       
   687 each call starts off where the previous call reached in the input descriptor,
       
   688 the state of the conversion is stored. The initial value of the state variable
       
   689 should be set as KStateDefault when the conversion is started, and afterwards
       
   690 simply passed unchanged into each function call.
       
   691 
       
   692 @param aUnicode On return, contains the Unicode encoded output string.
       
   693 @param aUtf7 The UTF-7 encoded input string.
       
   694 @param aState For the first call of the function set to KStateDefault. For
       
   695 subsequent calls, pass in the variable unchanged.
       
   696 @return The number of unconverted bytes left at the end of the input descriptor,
       
   697 or one of the error values defined in TError. */
       
   698 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
       
   699 														const TDesC8& aUtf7,
       
   700 														TInt& aState)
       
   701 	{
       
   702 	return ConvertToUnicodeFromUtf7(aUnicode, aUtf7, EFalse, aState);
       
   703 	}
       
   704 
       
   705 TInt CnvUtfConverter::ConvertToUnicodeFromUtf7(TDes16& aUnicode,
       
   706 											   const TDesC8& aUtf7,
       
   707 											   TBool aIsImapUtf7,
       
   708 											   TInt& aState)
       
   709 	{
       	// Decodes UTF-7 bytes from aUtf7 into aUnicode. The conversion may be split
       	// over several calls: aState is read on entry and rewritten on exit.
       	//
       	// aIsImapUtf7 is only forwarded to EscapeCharacterForStartingBase64Block()
       	// and Base64Decoding(); presumably it selects the IMAP variant of UTF-7 -
       	// TODO confirm against those helpers.
       	//
       	// Layout of aState (see the unpacking just below and the packing after "end:"):
       	//   bit 31    - set while inside a base-64 block (KIsInBase64Block)
       	//   bits 4..7 - number of bits carried over from the previous call
       	//   bits 0..3 - the carried-over bits themselves
       	//
       	// Returns the number of input bytes left unconverted, or EErrorIllFormedInput.
       
   710 	if (aUtf7.Length()==0)
       
   711 		{
       
   712 		aUnicode.SetLength(0);
       
   713 		return 0;
       
   714 		}
       
   715 	if (aUnicode.MaxLength()==0)
       
   716 		{
       		// no room for any output: report the whole input as unconverted
       
   717 		return aUtf7.Length();
       
   718 		}
       
   719 	const TUint escapeCharacterForStartingBase64Block=EscapeCharacterForStartingBase64Block(aIsImapUtf7);
       
       	// output is tracked by a pointer to the most recently written character, so it starts one element before the buffer
       
   720 	TUint16* pointerToPreviousUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr()-1);
       
   721 	const TUint16* pointerToLastUnicodeCharacter=pointerToPreviousUnicodeCharacter+aUnicode.MaxLength();
       
   722 	const TUint8* pointerToCurrentUtf7Byte=aUtf7.Ptr();
       
   723 	const TUint8* pointerToLastUtf7Byte=pointerToCurrentUtf7Byte+(aUtf7.Length()-1);
       
   724 	TUint currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   725 	const TUint KIsInBase64Block=0x80000000u;
       
       	// unpack aState into the bit-buffer and the count of bits it holds
       
   726 	TUint bitBuffer=STATIC_CAST(TUint, aState);
       
   727 	TInt numberOfBitsInBuffer=((bitBuffer&0xf0)>>4);
       
   728 	bitBuffer&=~0xf0; // turn off the bits that stored numberOfBitsInBuffer
       
   729 	if (bitBuffer&KIsInBase64Block)
       
   730 		{
       
   731 		__ASSERT_ALWAYS((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4) || ((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer)), Panic(EPanicBadBitBufferState7));
       
   732 		__ASSERT_ALWAYS((bitBuffer&~(KIsInBase64Block|0x0000000f))==0, Panic(EPanicBadBitBufferState8));
       
   733 		}
       
   734 	else
       
   735 		{
       
   736 		__ASSERT_ALWAYS(bitBuffer==0, Panic(EPanicBadBitBufferState9));
       
   737 		__ASSERT_ALWAYS(numberOfBitsInBuffer==0, Panic(EPanicBadBitBufferState10));
       
   738 		}
       
       	// aState is reset here; it is only rewritten at "end:" if we finish inside a base-64 block
       
   739 	aState=KStateDefault;
       
   740 	if (bitBuffer&KIsInBase64Block)
       
   741 		{
       		// resuming inside a base-64 block: the loop below expects the decoded 6-bit value
       
   742 		currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
       
   743 		}
       
   744 	TBool inputIsTruncated=EFalse;
       
   745 	FOREVER
       
   746 		{
       
   747 		__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers5));
       
   748 		__ASSERT_DEBUG(pointerToCurrentUtf7Byte<=pointerToLastUtf7Byte, Panic(EPanicBadUtf7Pointers11));
       
   749 		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || (currentUtf7Byte==*pointerToCurrentUtf7Byte), Panic(EPanicOutOfSyncUtf7Byte1));
       
   750 		__ASSERT_DEBUG((~bitBuffer&KIsInBase64Block) || (currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7)), Panic(EPanicOutOfSyncUtf7Byte2));
       
   751 		__ASSERT_DEBUG((bitBuffer&KIsInBase64Block) || ((bitBuffer==0) && (numberOfBitsInBuffer==0)), Panic(EPanicBadBitBufferState11));
       
       		// outside a base-64 block and looking at the escape character: decide whether a block starts here
       
   752 		if ((~bitBuffer&KIsInBase64Block) && (currentUtf7Byte==escapeCharacterForStartingBase64Block))
       
   753 			{
       
   754 			if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   755 				{
       				// the escape character is the last input byte: step back so that it is counted as unconverted
       
   756 				--pointerToCurrentUtf7Byte;
       
   757 				inputIsTruncated=ETrue;
       
   758 				goto end;
       
   759 				}
       
   760 			++pointerToCurrentUtf7Byte;
       
   761 			currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   762 			if (currentUtf7Byte=='-')
       
   763 				{
       				// escape character immediately followed by '-' stands for the escape character itself
       
   764 				currentUtf7Byte=escapeCharacterForStartingBase64Block;
       
   765 				}
       
   766 			else
       
   767 				{
       
   768 				currentUtf7Byte=Base64Decoding(currentUtf7Byte, aIsImapUtf7);
       
   769 				if (currentUtf7Byte==KNotInBase64Alphabet)
       
   770 					{
       
   771 					return EErrorIllFormedInput;
       
   772 					}
       
   773 				bitBuffer=KIsInBase64Block;
       
   774 				}
       
   775 			}
       
   776 		if (bitBuffer&KIsInBase64Block)
       
   777 			{
       
   778 			FOREVER
       
   779 				{
       
   780 				__ASSERT_DEBUG(currentUtf7Byte==Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7), Panic(EPanicOutOfSyncBase64Decoding));
       
   781 				__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState12));
       
   782 				if (currentUtf7Byte==KNotInBase64Alphabet)
       
   783 					{
       					// a byte outside the base-64 alphabet ends the block; any bits still buffered must all be zero
       
   784 					if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
       
   785 						{
       
   786 						return EErrorIllFormedInput;
       
   787 						}
       
   788 					bitBuffer=0;
       
   789 					numberOfBitsInBuffer=0;
       
   790 					currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   791 					if (currentUtf7Byte=='-')
       
   792 						{
       						// a '-' terminating the block is consumed and produces no output
       
   793 						if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   794 							{
       
   795 							goto end;
       
   796 							}
       
   797 						++pointerToCurrentUtf7Byte;
       
   798 						currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   799 						}
       
   800 					break;
       
   801 					}
       
       				// shift the 6 decoded bits into the buffer (the flag bit is re-set because the shift moves it out)
       
   802 				bitBuffer<<=6;
       
   803 				bitBuffer|=currentUtf7Byte;
       
   804 				bitBuffer|=KIsInBase64Block;
       
   805 				numberOfBitsInBuffer+=6;
       
   806 				// only flush the buffer if it contains a whole Unicode character and the remainder is either all zero-bits (hence would be a legal point to end the base-64 sequence) or at least 6 bits long (therefore would leave at least one UTF-7 byte unconverted at the end of the input descriptor)
       
   807 				if ((numberOfBitsInBuffer>=16+6) || ((numberOfBitsInBuffer>=16) && !BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16)))
       
   808 					{
       
   809 					numberOfBitsInBuffer-=16;
       
   810 					__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers6));
       
   811 					++pointerToPreviousUnicodeCharacter;
       
   812 					*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, bitBuffer>>numberOfBitsInBuffer);
       
   813 					bitBuffer&=((1<<numberOfBitsInBuffer)-1); // zero all the consumed bits - must be done as bitBuffer is stored along with numberOfBitsInBuffer in aState if the output descriptor runs out of space or if the input descriptor was truncated
       
   814 					bitBuffer|=KIsInBase64Block; // turn it back on as the line above turned it off
       
   815 					if (pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter)
       
   816 						{
       
   817 						goto end;
       
   818 						}
       
   819 					}
       
   820 				if (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte)
       
   821 					{
       					// input ended while still inside the base-64 block
       
   822 					inputIsTruncated=ETrue;
       
   823 					goto end;
       
   824 					}
       
   825 				++pointerToCurrentUtf7Byte;
       
   826 				currentUtf7Byte=Base64Decoding(*pointerToCurrentUtf7Byte, aIsImapUtf7);
       
   827 				}
       
   828 			}
       
   829 		else
       
   830 			{
       			// directly encoded character: the byte value is copied straight to the output
       
   831 			__ASSERT_DEBUG(pointerToPreviousUnicodeCharacter<pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers7));
       
   832 			++pointerToPreviousUnicodeCharacter;
       
   833 			*pointerToPreviousUnicodeCharacter=STATIC_CAST(TUint16, currentUtf7Byte);
       
   834 			if ((pointerToPreviousUnicodeCharacter==pointerToLastUnicodeCharacter) || (pointerToCurrentUtf7Byte==pointerToLastUtf7Byte))
       
   835 				{
       
   836 				goto end;
       
   837 				}
       
   838 			++pointerToCurrentUtf7Byte;
       
   839 			currentUtf7Byte=*pointerToCurrentUtf7Byte;
       
   840 			}
       
   841 		}
       
   842 end:
       
       	// common exit: if still inside a base-64 block, pack the pending bits and their count back into aState
       
   843 	if (bitBuffer&KIsInBase64Block)
       
   844 		{
       
   845 		__ASSERT_DEBUG((numberOfBitsInBuffer<16) || (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer-16) && (numberOfBitsInBuffer<16+6)), Panic(EPanicBadBitBufferState13));
       
   846 		if (BitBufferContainsNonZeroBits(bitBuffer, numberOfBitsInBuffer))
       
   847 			{
       
   848 			// rewind how far we've got in the UTF-7 descriptor to indicate to the user (by returning a value greater than zero) that not all of the input could be converted as it ended with a truncated base-64 sequence
       
   849 			__ASSERT_DEBUG(numberOfBitsInBuffer>=6, Panic(EPanicBadBitBufferState14));
       
   850 			pointerToCurrentUtf7Byte-=numberOfBitsInBuffer/6;
       
   851 			const TInt newNumberOfBitsInBuffer=numberOfBitsInBuffer%6;
       
   852 			bitBuffer&=~KIsInBase64Block; // temporarily turn off the KIsInBase64Block for the right-shift
       
   853 			bitBuffer>>=(numberOfBitsInBuffer-newNumberOfBitsInBuffer);
       
   854 			bitBuffer|=KIsInBase64Block; // must be turned back on again as the bit-buffer is packed into aState
       
   855 			numberOfBitsInBuffer=newNumberOfBitsInBuffer;
       
   856 			__ASSERT_DEBUG((numberOfBitsInBuffer==0) || (numberOfBitsInBuffer==2) || (numberOfBitsInBuffer==4), Panic(EPanicBadBitBufferState15));
       
   857 			}
       
   858 		__ASSERT_DEBUG((numberOfBitsInBuffer<16) && (numberOfBitsInBuffer%2==0), Panic(EPanicBadBitBufferState16));
       
   859 		aState=STATIC_CAST(TInt, bitBuffer);
       
   860 		aState|=(numberOfBitsInBuffer<<4);
       
   861 		__ASSERT_DEBUG(aState&KIsInBase64Block, Panic(EPanicBadBitBufferState17));
       
   862 		bitBuffer=0;
       
   863 		numberOfBitsInBuffer=0;
       
   864 		}
       
       	// the input pointer was stepped back to before the start of the descriptor: the truncated input cannot be resumed, so it is an error
       
   865 	if ((pointerToCurrentUtf7Byte<aUtf7.Ptr()) && inputIsTruncated)
       
   866 		{
       
   867 		return EErrorIllFormedInput;
       
   868 		}
       
   869 	aUnicode.SetLength((pointerToPreviousUnicodeCharacter+1)-aUnicode.Ptr());
       
       	// pointerToCurrentUtf7Byte addresses the last byte consumed, so this difference is the count of bytes not yet consumed
       
   870 	return pointerToLastUtf7Byte-pointerToCurrentUtf7Byte;
       
   871 	}
       
   872 
       
   873 
       
   874 
       
   875 /** Converts text encoded using the Unicode transformation format UTF-8
       
   876 into the Unicode UCS-2 character set. This function leaves with an
       
   877 error code of the input string is corrupted.
       
   878 
       
   879 @param aUtf8 The UTF-8 encoded input string
       
   880 @return A pointer to an HBufC16 with the converted Unicode string. */
       
   881 EXPORT_C HBufC16* CnvUtfConverter::ConvertToUnicodeFromUtf8L(const TDesC8& aUtf8)
       
   882  	{
       
   883 	// If aUtf8 is an empty string return
       
   884 	if (aUtf8.Length()==0)
       
   885 		{
       
   886 		HBufC16* hBuf = HBufC16::NewL(1);
       
   887 		return hBuf;
       
   888 		}
       
   889 
       
   890 	// else convert aUtf8 to Unicode storing the result in a buffer, reallocating
       
   891 	// it when needed.
       
   892 	TInt length = aUtf8.Length();
       
   893 	const TInt bufsize = 100;
       
   894 
       
   895 	TPtrC8 utf8 (aUtf8);
       
   896 	TBuf<bufsize> buf;
       
   897 	HBufC16* hBuf = HBufC16::NewLC(length);
       
   898 	TPtr unicode = hBuf->Des();
       
   899 
       
   900 	FOREVER
       
   901 		{
       
   902 		TInt unconverted = ConvertToUnicodeFromUtf8(buf, utf8);
       
   903 		if( unconverted == EErrorIllFormedInput || unconverted < 0)
       
   904 			User::Leave(KErrCorrupt);
       
   905 
       
   906 		if (unicode.Length() + buf.Length() > unicode.MaxLength())
       
   907 			{
       
   908 			// Reallocate hBuf
       
   909 			hBuf = hBuf->ReAllocL(unicode.Length() + buf.Length());
       
   910 			CleanupStack::Pop();
       
   911 			CleanupStack::PushL(hBuf);
       
   912 			unicode.Set(hBuf->Des());
       
   913 			}
       
   914 		unicode.Append(buf);
       
   915 		if (unconverted ==0)
       
   916 			break;
       
   917 		utf8.Set(utf8.Right(unconverted));
       
   918 		}
       
   919 	CleanupStack::Pop();
       
   920 	return hBuf;
       
   921 	}
       
   922 
       
   923 /** Converts text encoded using the Unicode transformation format UTF-8 into the
       
   924 Unicode UCS-2 character set.
       
   925 
       
   926 @param aUnicode On return, contains the Unicode encoded output string.
       
   927 @param aUtf8 The UTF-8 encoded input string
       
   928 @return The number of unconverted bytes left at the end of the input descriptor,
       
   929 or one of the error values defined in TError. */
       
   930 EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
       
   931 	{
       
   932 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
       
   933 	}
       
   934 
       
   935 static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
       
   936 		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
       
   937 	{
       
   938 	if (aNumberOfUnconvertibleCharacters<=0)
       
   939 		{
       
   940 		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
       
   941 		}
       
   942 	++aNumberOfUnconvertibleCharacters;
       
   943 	}
       
   944 
       
   945 /** Converts text encoded using the Unicode transformation format UTF-8 into the
       
   946 Unicode UCS-2 character set.
       
   947 
       
   948 @param aUnicode On return, contains the Unicode encoded output string.
       
   949 @param aUtf8 The UTF-8 encoded input string
       
   950 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
       
   951 @return The number of unconverted bytes left at the end of the input descriptor,
       
   952 or one of the error values defined in TError. */
       
   953 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
       
   954 	{
       
   955 	TInt dummyUnconverted, dummyUnconvertedIndex;
       
   956 	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
       
   957 	}
       
   958 
       
   959 /** Converts text encoded using the Unicode transformation format UTF-8 into the
       
   960 Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
       
   961 
       
   962 The variant of UTF-8 used internally by Java differs slightly from standard
       
   963 UTF-8. The TBool argument controls the UTF-8 variant accepted by this function.
       
   964 
       
   965 @param aUnicode On return, contains the Unicode encoded output string.
       
   966 @param aUtf8 The UTF-8 encoded input string
       
   967 @param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java
       
   968 UTF-8, in which case the two byte sequence 0xC0 0x80 is also accepted.
       
   969 @param aNumberOfUnconvertibleCharacters Incremented once for every input
       
   970 sequence that is replaced by U+FFFD. It is never reset by this function, so
       the caller must initialise it before the call.
       
   971 @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index
       
   972 of the first byte of the first unconvertible character. For instance if the
       
   973 first character in the input descriptor (aUtf8) could not be converted,
       
   974 then this parameter is set to the first byte of that character, i.e. zero.
       
   975 It is only written while aNumberOfUnconvertibleCharacters is zero or negative;
       this function never stores a negative value itself, so a caller that relies
       on "negative means everything was converted" must preset it.
       
   976 @return The number of unconverted bytes left at the end of the input descriptor,
       
   977 or one of the error values defined in TError. */
       
   978 
       
   979 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
       
   980  * Well formed UTF-8 Byte Sequences, full table.
       
   981  * +----------------------------------------------------------------+
       
   982  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
   983  * +--------------------+----------+----------+----------+----------+
       
   984  * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
       
   985  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2
       
   986  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
       
   987  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
       
   988  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
       
   989  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
       
   990  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
       
   991  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
       
   992  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
       
   993  * +--------------------+----------+----------+----------+----------+
       
   994  *
       
   995  * As a consequence of the well-formedness conditions specified in table 3-7,
       
   996  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
       
   997  */
       
   998 TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
       
   999 		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
       
  1000 	{
       
  1001 	aUnicode.SetLength(0);
       
  1002 
       
       	// nothing to read, or nowhere to write: the whole input is reported as unconverted
       
  1003 	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
       
  1004 		{
       
  1005 		return aUtf8.Length();
       
  1006 		}
       
  1007 
       
       	// pUnicode/pUtf8 address the next element to write/read; the pLast* pointers address the final valid element
       
  1008 	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
       
  1009 	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
       
  1010 	const TUint8*         pUtf8 = aUtf8.Ptr();
       
  1011 	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
       
  1012 	const TUint16 replacementcharacter = 0xFFFD;
       
  1013 	TUint currentUnicodeCharacter;
       
  1014 	TInt sequenceLength;
       
  1015 
       
  1016 
       
  1017 	FOREVER
       
  1018 		{
       
       		// illFormed is set only when a multi-byte sequence is rejected after its lead byte was accepted
       
  1019 		TBool illFormed=EFalse;
       
  1020 
       
  1021 		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
       
  1022 		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
       
  1023 
       
       		// sequenceLength stays 1 unless the lead byte announces a valid multi-byte sequence
       
  1024 		sequenceLength = 1;
       
  1025 
       
  1026 		// ascii - optimisation (i.e. it isn't a sequence)
       
  1027 		if (pUtf8[0] < 0x80)
       
  1028 			{
       
  1029 			currentUnicodeCharacter = pUtf8[0];
       
  1030 			}
       
  1031 		else
       
  1032 			{
       
  1033 			// see if well formed utf-8, use table above for reference
       
  1034 			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
       
  1035 				{
       
  1036 				// 0xc0-0xc1 are not valid lead bytes
       
  1037 				sequenceLength = 2;
       
  1038 				}
       
  1039 			else if ((pUtf8[0] & 0xf0) == 0xe0)
       
  1040 				{
       
  1041 				sequenceLength = 3;
       
  1042 				}
       
  1043 			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
       
  1044 				{
       
  1045 				// 0xf5-0xff, are not valid bytes
       
  1046 				sequenceLength = 4;
       
  1047 				}
       
  1048 			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
       
  1049 				{
       
  1050 				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
       
  1051 					{
       
  1052 					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
       
  1053 					// the last character in the string) or we've
       
  1054 					// discovered a valid 0xc0 0x80 sequence.
       
  1055 					sequenceLength = 2;
       
  1056 					}
       
  1057 				}
       
  1058 
       
  1059 			/* checking to see if we got a valid sequence */
       
  1060 			if (sequenceLength == 1)
       
  1061 				{
       
  1062 				// bad value in the leading byte, e.g. 0x80-0xbf, 0xc0-0xc1 or 0xf5-0xff
       
  1063 				currentUnicodeCharacter = replacementcharacter;
       
  1064 				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1065 						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1066 				}
       
  1067 			else
       
  1068 				{
       
  1069 				// this is a check to see if the sequence goes beyond the input
       
  1070 				// stream.  if its not the first and only character in the input
       
  1071 				// stream this isn't an error, otherwise it is.
       
  1072 				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
       
  1073 					{
       
  1074 					// check to see if this sequence was the first character
       
  1075 					if ((pUnicode - aUnicode.Ptr()) == 0)
       
  1076 						{
       
  1077 						return EErrorIllFormedInput;
       
  1078 						}
       
  1079 					break;
       
  1080 					}
       
  1081 
       
       				// keep only the payload bits of the lead byte: the mask is 0x1F, 0x0F or 0x07 for 2, 3 or 4 byte sequences
       
  1082 				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
       
  1083 
       
  1084 				/* check the trailing bytes, they should begin with 10 */
       
  1085 				TUint i = 1;
       
  1086 
       
  1087 				do
       
  1088 					{
       
  1089 					if ((pUtf8[i] & 0xc0) == 0x80)
       
  1090 						{
       
  1091 						// add the trailing 6 bits to the current unicode char
       
  1092 						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
       
  1093 						}
       
  1094 					else
       
  1095 						{
       
  1096 						// ill formed character (doesn't have a lead 10)
       
  1097 						currentUnicodeCharacter = replacementcharacter;
       
  1098 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1099 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1100 						illFormed=ETrue;
       
  1101 						break;
       
  1102 						}
       
  1103 					i++;
       
  1104 					}
       
  1105 				while (i < (unsigned)sequenceLength);
       
  1106 				}
       
  1107 
       
  1108 			/* conformance check.  bits of above table for reference.
       
  1109 			 * +----------------------------------------------------------------+
       
  1110 			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
  1111 			 * +--------------------+----------+----------+----------+----------+
       
  1112 			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
       
  1113 			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
       
  1114 			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
       
  1115 			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
       
  1116 			 * +--------------------+----------+----------+----------+----------+
       
  1117 			 */
       
  1118 
       
       			// skipped when the character is already U+FFFD (either rejected above or genuinely decoded as U+FFFD)
       
  1119 			if (currentUnicodeCharacter != replacementcharacter)
       
  1120 				{
       
  1121 				if (sequenceLength == 3)
       
  1122 					{
       
  1123 					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
       
  1124 						{
       
  1125 						currentUnicodeCharacter = replacementcharacter;
       
  1126 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1127 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1128 						illFormed=ETrue;
       
  1129 						}
       
  1130 					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
       
  1131 						{
       
  1132 						currentUnicodeCharacter = replacementcharacter;
       
  1133 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1134 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1135 						illFormed=ETrue;
       
  1136 						}
       
  1137 					}
       
  1138 				else if (sequenceLength == 4)
       
  1139 					{
       
  1140 					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
       
  1141 						{
       
  1142 						currentUnicodeCharacter = replacementcharacter;
       
  1143 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1144 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1145 						illFormed=ETrue;
       
  1146 						}
       
  1147 					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
       
  1148 						{
       
  1149 						currentUnicodeCharacter = replacementcharacter;
       
  1150 						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1151 								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1152 						illFormed=ETrue;
       
  1153 						}
       
  1154 					}
       
  1155 
       
  1156 
       
  1157 				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
       
  1158 				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code
       
  1159 				 * points D800..DFFF is ill formed */
       
  1160 
       
  1161 				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
       
  1162 					{
       
  1163 					currentUnicodeCharacter = replacementcharacter;
       
  1164 					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
       
  1165 							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
       
  1166 					illFormed=ETrue;
       
  1167 					}
       
  1168 				}
       
  1169 				// end conformance check
       
  1170 			}
       
  1171 
       
  1172 		// would this character generate a surrogate pair in UTF-16?
       
  1173 		if (currentUnicodeCharacter > 0xFFFF)
       
  1174 			{
       
  1175 			// is there enough space to hold a surrogate pair in the output?
       
  1176 			if (pUnicode >= pLastUnicode)
       
  1177 				{
       
  1178 				break; // no, end processing.
       
  1179 				}
       
  1180 
       
       			// high surrogate: 0xD7C0 is 0xD800 - (0x10000>>10), so the 0x10000 offset is removed by the addition
       
  1181 			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
       
  1182 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
       
  1183 
       
       			// low surrogate: the bottom 10 bits of the code point
       
  1184 			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
       
  1185 			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
       
  1186 			}
       
  1187 		else
       
  1188 			{
       
  1189 			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
       
  1190 			}
       
  1191 
       
  1192 		// move the input pointer
       
  1193 		if (currentUnicodeCharacter != replacementcharacter)
       
  1194 			{
       
  1195 			pUtf8 += sequenceLength;
       
  1196 			}
       
  1197 		else if(illFormed == EFalse)
       
  1198 			{
       			// U+FFFD without illFormed: either an invalid lead byte (sequenceLength is 1)
       			// or a well-formed encoding of U+FFFD itself; consume the whole sequence
       
  1199 			pUtf8 += (sequenceLength);
       
  1200 			}
       
  1201 		else
       
  1202 			{
       
  1203 			// we had a character we didn't recognize (i.e. it was invalid)
       
  1204 			// so move to the next character in the input
       
  1205 			pUtf8++;
       
  1206 			}
       
  1207 
       
  1208 		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
       
  1209 			{
       
  1210 			break;  // we've either reached the end of the input or the end of output
       
  1211 			}
       
  1212 		}
       
  1213 
       
  1214 	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
       
       	// pUtf8 addresses the first unconsumed byte, hence the +1
       
  1215 	return (pLastUtf8 - pUtf8 + 1);
       
  1216 	}
       
  1217 
       
  1218 /** Given a sample text this function attempts to determine whether or not
       
  1219  *  the same text is encoded using the UTF-8 standard encoding scheme.
       
  1220 
       
  1221 @param TInt a confidence level, given at a certain value.  If the given sample
       
  1222 			is UTF-8 this value will not be changed (unless it is > 100, in which
       
  1223 			case it is set to 100).  Otherwise, if the sample isn't UTF-8, it is set to 0.
       
  1224 @param TDesC8 sample text.
       
  1225 UTF-8. The default is EFalse.
       
  1226 @return void
       
  1227  */
       
  1228 
       
  1229 /* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
       
  1230  * Well formed UTF-8 Byte Sequences, full table.
       
  1231  * +----------------------------------------------------------------+
       
  1232  * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
       
  1233  * +--------------------+----------+----------+----------+----------+
       
  1234  * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
       
  1235  * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2
       
  1236  * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
       
  1237  * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
       
  1238  * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
       
  1239  * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
       
  1240  * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
       
  1241  * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
       
  1242  * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
       
  1243  * +--------------------+----------+----------+----------+----------+
       
  1244  *
       
  1245  * As a consequence of the well-formedness conditions specified in table 3-7,
       
  1246  * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
       
  1247  *
       
  1248  * Code Rules:
       
  1249  *   R1: If the string contains any non-UTF-8 characters the returned confidence
       
  1250  *       is 0.  Valid UTF-8 combinations are listed in the above table.
       
  1251  *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark,
       
  1252  *       bytes EF BB BF) the returned confidence is 95.
       
  1253  *   R3: Otherwise the confidence returned is based upon the sample string
       
  1254  *       length.
       
  1255  *   R4: If the sample string is under 75 characters, the confidence is set to
       
  1256  *       75.
       
  1257  */
       
  1258 GLREF_C void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
       
  1259 	{
       
  1260 
       
  1261 	TInt sampleLength = aSample.Length();
       
  1262 
       
  1263 	if (sampleLength == 0)
       
  1264 		{
       
  1265 		aConfidenceLevel = 89;
       
  1266 		return;
       
  1267 		}
       
  1268 	TInt bytesRemaining  = 0;
       
  1269 	TInt sequenceLength  = 0;
       
  1270 
       
  1271 	aConfidenceLevel = sampleLength;
       
  1272 
       
  1273 	const TUint8* buffer = &aSample[0];
       
  1274 
       
  1275 	if (sampleLength < 95)
       
  1276 		{
       
  1277 		// check for the BOM
       
  1278 		if ((sampleLength >= 3) &&
       
  1279 			((buffer[0] == 0xEF) &&
       
  1280 			 (buffer[1] == 0xBB) &&
       
  1281 			 (buffer[2] == 0xBF))
       
  1282 			)
       
  1283 			{
       
  1284 			aConfidenceLevel = 95;
       
  1285 			}
       
  1286 		else if (sampleLength < 75)
       
  1287 			{
       
  1288 			aConfidenceLevel = 75;
       
  1289 			}
       
  1290 		}
       
  1291 
       
  1292 	for (TInt index = 0;index != sampleLength;index++)
       
  1293 		{
       
  1294 
       
  1295 		if (bytesRemaining > 0)
       
  1296 			{
       
  1297 			// bytesRemaining > 0, means that a byte representing the start of a
       
  1298 			// multibyte sequence was encountered and the bytesRemaining is the
       
  1299 			// number of bytes to follow.
       
  1300 
       
  1301 			if ((buffer[index] & 0xc0) == 0x80)
       
  1302 				{
       
  1303 				// need to check for ill-formed sequences -- all are in the 2nd byte
       
  1304 
       
  1305 				if ((sequenceLength == 3) && (bytesRemaining == 2))
       
  1306 					{
       
  1307 					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
       
  1308 						{
       
  1309 						aConfidenceLevel = 0;
       
  1310 						break;
       
  1311 						}
       
  1312 					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
       
  1313 						{
       
  1314 						aConfidenceLevel = 0;
       
  1315 						break;
       
  1316 						}
       
  1317 					}
       
  1318 				else if ((sequenceLength == 4) && (bytesRemaining == 3))
       
  1319 					{
       
  1320 					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
       
  1321 						{
       
  1322 						aConfidenceLevel = 0;
       
  1323 						break;
       
  1324 						}
       
  1325 					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
       
  1326 						{
       
  1327 						aConfidenceLevel = 0;
       
  1328 						break;
       
  1329 						}
       
  1330 					}
       
  1331 
       
  1332 				--bytesRemaining;
       
  1333 				continue;
       
  1334 				}
       
  1335 			else
       
  1336 				{
       
  1337 				aConfidenceLevel = 0;
       
  1338 				break;
       
  1339 				}
       
  1340 			}
       
  1341 
       
  1342 		if (bytesRemaining == 0)
       
  1343 			{
       
  1344 			if (buffer[index] < 0x80)
       
  1345 				{
       
  1346 				// The value of aSample[index] is in the range 0x00-0x7f
       
  1347 				//UTF8 maintains ASCII transparency. So it's a valid
       
  1348 				//UTF8. Do nothing, check next value.
       
  1349 				continue;
       
  1350 				}
       
  1351 			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
       
  1352 				{
       
  1353 				// valid start of a 2 byte sequence (see conformance note)
       
  1354 				sequenceLength = 2;
       
  1355 				bytesRemaining = 1;
       
  1356 				}
       
  1357 			else if ((buffer[index] & 0xf0) == 0xe0)
       
  1358 				{
       
  1359 				// valid start of a 3 byte sequence
       
  1360 				sequenceLength = 3;
       
  1361 				bytesRemaining = 2;
       
  1362 				}
       
  1363 			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
       
  1364 				{
       
  1365 				// valid start of a 4 byte sequence (see conformance note)
       
  1366 				sequenceLength = 4;
       
  1367 				bytesRemaining = 3;
       
  1368 				}
       
  1369 			else
       
  1370 				{
       
  1371 				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
       
  1372 				aConfidenceLevel = 0;
       
  1373 				break;
       
  1374 				}
       
  1375 			}
       
  1376 		} // for
       
  1377 
       
  1378 	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
       
  1379 	}
       
  1380 
       
  1381 GLREF_C void IsCharacterSetUTF7(TInt& aConfidenceLevel, const TDesC8& aSample)
       
  1382 	{
       
  1383 	TInt sampleLength = aSample.Length();
       
  1384 	aConfidenceLevel = 70;
       
  1385 	for (TInt i=0; i<sampleLength; ++i)
       
  1386 		{
       
  1387 		// UTF-7 value ranges only 7 bits
       
  1388 		if((aSample[i]&0x80)!=0x00)
       
  1389 			{
       
  1390 			aConfidenceLevel= 0;
       
  1391 			break;
       
  1392 			}
       
  1393 
       
  1394 		// there is no "~" in UTF-7 encoding. So if find either, it's not UTF-7
       
  1395 		else if (char(aSample[i])=='~')
       
  1396 			{
       
  1397 			aConfidenceLevel = 0;
       
  1398 			break;
       
  1399 			}
       
  1400 
       
  1401 		// The SMS7Bit escape char value is 0x1b. Reduce confidence if it follows the following format
       
  1402 		else if ( (aSample[i]==0x1b) && (i <sampleLength-1) )
       
  1403 			{
       
  1404 			static const TInt smsExtensionTable[11] =
       
  1405 				{0x0a, 0x14, 0x1b, 0x28, 0x29, 0x2f, 0x3c, 0x3d, 0x3e, 0x40, 0x65};
       
  1406 			TInt increment1 = i+1;
       
  1407 			if (increment1>= sampleLength)
       
  1408 				break;
       
  1409 			for (TInt j=0; j < 11; ++j)
       
  1410 				{
       
  1411 				if (aSample[increment1] == smsExtensionTable[j])
       
  1412 					{
       
  1413 					aConfidenceLevel-=10;
       
  1414 					}
       
  1415 				}
       
  1416 			}
       
  1417 		// The UTF-7 escape char is 0x2b. The values that follow the escape sequence
       
  1418 		// the values following the escape char value must belong to the modified base64
       
  1419 		// or '-' else it is an ill-formed sequence, so probably not UTF-7
       
  1420 		else if ( (aSample[i]==0x2b)  && (i <sampleLength-1) )
       
  1421 			{
       
  1422 			TInt increment1 = i+1;
       
  1423 			if ((aSample[increment1] == 0x2b) || (aSample[increment1] == 0x2d) || (aSample[increment1] == 0x2f) ||
       
  1424 				((aSample[increment1] >= 0x41) && (aSample[increment1] <= 0x5a)) ||
       
  1425 				((aSample[increment1] >= 0x61) && (aSample[increment1] <= 0x7a)))
       
  1426 				{
       
  1427 				aConfidenceLevel+=5;
       
  1428 				}
       
  1429 			else
       
  1430 				{
       
  1431 				aConfidenceLevel-=15;
       
  1432 				}
       
  1433 			i++; // should this be here or up in the if loop ??
       
  1434 			}
       
  1435 		} //for
       
  1436 	aConfidenceLevel =(aConfidenceLevel >0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
       
  1437 	}