securityanddataprivacytools/securitytools/certapp/store--/utf.cpp
changeset 8 35751d3474b7
parent 0 2c201484c85f
--- a/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp	Tue Jul 21 01:04:32 2009 +0100
+++ b/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp	Thu Sep 10 14:01:51 2009 +0300
@@ -1,742 +1,742 @@
-/*
-* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
-* All rights reserved.
-* This component and the accompanying materials are made available
-* under the terms of the License "Eclipse Public License v1.0"
-* which accompanies this distribution, and is available
-* at the URL "http://www.eclipse.org/legal/epl-v10.html".
-*
-* Initial Contributors:
-* Nokia Corporation - initial contribution.
-*
-* Contributors:
-*
-* Description: 
-*
-*/
-
-
-#include <e32std.h>
-#include <e32base.h>
-#include <utf.h>
-
-#define STATIC_CAST(t,v) static_cast<t>(v)
-#define CONST_CAST(t,v) const_cast<t>(v)
-#define FOREVER for(;;)
-
-const TUint KNotInBase64Alphabet=KMaxTUint;
-
-enum TPanic
-	{
-	EPanicBad6BitNumber=1,
-	EPanicBadUtf7Pointers1,
-	EPanicBadUtf7Pointers2,
-	EPanicBadUtf7Pointers3,
-	EPanicBadUtf7Pointers4,
-	EPanicBadUtf7Pointers5,
-	EPanicBadUtf7Pointers6,
-	EPanicBadUtf7Pointers7,
-	EPanicBadUtf7Pointers8,
-	EPanicBadUtf7Pointers9,
-	EPanicBadUtf7Pointers10,
-	EPanicBadUtf7Pointers11,
-	EPanicNotInBase64Block,
-	EPanicBadUnicodePointers1,
-	EPanicBadUnicodePointers2,
-	EPanicBadUnicodePointers3,
-	EPanicBadUnicodePointers4,
-	EPanicBadUnicodePointers5,
-	EPanicBadUnicodePointers6,
-	EPanicBadUnicodePointers7,
-	EPanicBadUnicodePointers8,
-	EPanicBadUnicodePointers9,
-	EPanicBadUnicodePointers10,
-	EPanicBadBitBufferState1,
-	EPanicBadBitBufferState2,
-	EPanicBadBitBufferState3,
-	EPanicBadBitBufferState4,
-	EPanicBadBitBufferState5,
-	EPanicBadBitBufferState6,
-	EPanicBadBitBufferState7,
-	EPanicBadBitBufferState8,
-	EPanicBadBitBufferState9,
-	EPanicBadBitBufferState10,
-	EPanicBadBitBufferState11,
-	EPanicBadBitBufferState12,
-	EPanicBadBitBufferState13,
-	EPanicBadBitBufferState14,
-	EPanicBadBitBufferState15,
-	EPanicBadBitBufferState16,
-	EPanicBadBitBufferState17,
-	EPanicUnexpectedNumberOfLoopIterations,
-	EPanicInitialEscapeCharacterButNoBase64,
-	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
-	EPanicBadUtf8Pointers1,
-	EPanicBadUtf8Pointers2,
-	EPanicBadUtf8Pointers3,
-	EPanicBadUtf8Pointers4,
-	EPanicBadUtf8Pointers5,
-	EPanicBadUtf8Pointers6,
-	EPanicBadUtf8Pointers7,
-	EPanicOutOfSyncUtf7Byte1,
-	EPanicOutOfSyncUtf7Byte2,
-	EPanicOutOfSyncBase64Decoding
-	};
-
-_LIT(KLitPanicText, "CHARCONV-UTF");
-
-LOCAL_C void Panic(TPanic aPanic)
-	{
-	User::Panic(KLitPanicText, aPanic);
-	}
-
-inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';}
-
-inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
-	{
-	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0;
-	}
-
-
-
-
-
-
- 
-
-/** Converts Unicode text into UTF-8 encoding.
-
-@param aUtf8 On return, contains the UTF-8 encoded output string.
-@param aUnicode The Unicode-encoded input string.
-@return The number of unconverted characters left at the end of the input 
-descriptor, or one of the error values defined in TError. */
-EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
-	{
-	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
-	}
-
-
-
-/** Converts Unicode text into UTF-8 encoding. 
-
-Surrogate pairs can be input which will result in a valid 4 byte UTF-8 value.
-
-The variant of UTF-8 used internally by Java differs slightly from standard 
-UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
-
-@param aUtf8 On return, contains the UTF-8 encoded output string.
-@param aUnicode A UCS-2 encoded input string.
-@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
-UTF-8. The default is EFalse.
-@return The number of unconverted characters left at the end of the input descriptor, 
-or one of the error values defined in TError. */
-TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
-											   const TDesC16& aUnicode, 
-											   TBool aGenerateJavaConformantUtf8)
-	{
-	if (aUnicode.Length() == 0)
-		{
-		aUtf8.SetLength(0);
-		return 0;
-		}
-	if (aUtf8.MaxLength() == 0)
-		{
-		return aUnicode.Length();
-		}
-	
-	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
-	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
-	TBool inputIsTruncated = EFalse;
-	const TUint16* pUnicode = aUnicode.Ptr();
-	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
-	
-	FOREVER
-		{
-		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
-		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
-	
-		if (pUnicode[0] < 0x80)
-			{
-			// ascii - 1 byte
-			
-			// internally java is different since the \x0000 character is 
-			// translated into \xC0 \x80.
-			
-			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
-				{
-				if (pUtf8 == pointerToLastUtf8Byte)
-					{
-					pUtf8--;
-					pUnicode--;
-					break;			
-					}
-				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
-				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
-				}
-			else
-				{
-				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
-				}
-			}
-		else if (pUnicode[0] < 0x800)
-			{
-			// U+0080..U+07FF - 2 bytes
-			
-			if (pUtf8 == pointerToLastUtf8Byte)
-				{
-				pUtf8--;
-				pUnicode--;
-				break;
-				}
-			
-			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
-			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
-			
-			}
-
-		// check to see if we have a surrogate in the stream, surrogates encode code points outside
-		// the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars.
-
-		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
-			{
-			// surrogate pair - 4 bytes in utf-8
-			// U+10000..U+10FFFF
-			
-			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
-			// is there enough space to hold the character
-			if ((pointerToLastUtf8Byte - pUtf8) < 3)
-				{
-				pUtf8--;
-				pUnicode--;
-				break;  // no go to the exit condition
-				}
-			
-			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
-			if (pUnicode >= pointerToLastUnicodeCharacter)
-				{
-				pUtf8--;
-				pUnicode--;
-				inputIsTruncated = ETrue;
-				break; // middle of a surrogate pair. go to end condition
-				}
-			
-			if ((pUnicode[1] & 0xfc00) != 0xdc00)
-				{
-				return EErrorIllFormedInput;
-				}
-			
-			// convert utf-16 surrogate to utf-32
-			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
-			
-			// convert utf-32 to utf-8
-            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
-            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
-            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
-            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
-			
-            // we consumed 2 utf-16 values, move this pointer
-			pUnicode++;
-			}		
-		else
-			{
-			// 3 byte - utf-8, U+800..U+FFFF rest of BMP.
-			
-			if (pointerToLastUtf8Byte - pUtf8 < 2)
-				{
-				pUtf8--;
-				pUnicode--;
-				break;
-				}
-			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
-			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
-			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
-			}
-		
-		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
-			{
-			break;
-			}
-		
-		pUtf8++;
-		pUnicode++;
-		
-		}
-	
-	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
-		{
-		return EErrorIllFormedInput;
-		}
-	
-	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1);
-	return pointerToLastUnicodeCharacter-pUnicode;
-	}
-
-
-
-
-
- 
-
-
-
-
-
-/** Converts text encoded using the Unicode transformation format UTF-8 into the 
-Unicode UCS-2 character set.
-
-@param aUnicode On return, contains the Unicode encoded output string.
-@param aUtf8 The UTF-8 encoded input string
-@return The number of unconverted bytes left at the end of the input descriptor, 
-or one of the error values defined in TError. */
-EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
-	{
-	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
-	}
-
-static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
-		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex)
-	{
-	if (aNumberOfUnconvertibleCharacters<=0)
-		{
-		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
-		}
-	++aNumberOfUnconvertibleCharacters;
-	}
-
-/** Converts text encoded using the Unicode transformation format UTF-8 into the 
-Unicode UCS-2 character set.
-
-@param aUnicode On return, contains the Unicode encoded output string.
-@param aUtf8 The UTF-8 encoded input string
-@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
-@return The number of unconverted bytes left at the end of the input descriptor, 
-or one of the error values defined in TError. */
-TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
-	{
-	TInt dummyUnconverted, dummyUnconvertedIndex;
-	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
-	}
-
-/** Converts text encoded using the Unicode transformation format UTF-8 into the 
-Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input.
-
-The variant of UTF-8 used internally by Java differs slightly from standard 
-UTF-8. The TBool argument controls the UTF-8 variant generated by this function.
-
-@param aUnicode On return, contains the Unicode encoded output string.
-@param aUtf8 The UTF-8 encoded input string
-@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
-UTF-8. The default is EFalse.
-@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes 
-which were not converted.
-@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index 
-of the first byte of the first unconvertible character. For instance if the 
-first character in the input descriptor (aForeign) could not be converted, 
-then this parameter is set to the first byte of that character, i.e. zero. 
-A negative value is returned if all the characters were converted.
-@return The number of unconverted bytes left at the end of the input descriptor, 
-or one of the error values defined in TError. */
-
-/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
- * Well formed UTF-8 Byte Sequences, full table.
- * +----------------------------------------------------------------+
- * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
- * +--------------------+----------+----------+----------+----------+
- * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
- * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
- * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
- * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
- * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
- * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
- * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
- * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
- * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
- * +--------------------+----------+----------+----------+----------+
- * 
- * As a consequence of the well-formedness conditions specified in table 3-7,
- * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
- */
-TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
-		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
-	{	
-	aUnicode.SetLength(0);
-	
-	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
-		{
-		return aUtf8.Length();
-		}
-
-	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
-	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
-	const TUint8*         pUtf8 = aUtf8.Ptr();   
-	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
-	const TUint16 replacementcharacter = 0xFFFD;
-	TUint currentUnicodeCharacter;
-	TUint sequenceLength;
-
-	
-	FOREVER
-		{
-		TBool illFormed=EFalse;
-		
-		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
-		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
-		
-		sequenceLength = 1;
-		
-		// ascii - optimisation (i.e. it isn't a sequence)
-		if (pUtf8[0] < 0x80)
-			{
-			currentUnicodeCharacter = pUtf8[0];
-			}
-		else
-			{
-			// see if well formed utf-8, use table above for reference	
-			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
-				{
-				// 0xc1-0xc2 are not valid bytes
-				sequenceLength = 2;
-				}
-			else if ((pUtf8[0] & 0xf0) == 0xe0)
-				{
-				sequenceLength = 3;
-				}
-			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
-				{
-				// 0xf5-0xff, are not valid bytes
-				sequenceLength = 4;
-				}
-			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
-				{
-				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
-					{
-					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
-					// the last character in the string) or we've
-					// discovered a valid 0xc0 0x80 sequence.  
-					sequenceLength = 2;
-					}
-				}
-			
-			/* checking to see if we got a valid sequence */
-			if (sequenceLength == 1)
-				{
-				// bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example
-				currentUnicodeCharacter = replacementcharacter;
-				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-				}
-			else
-				{
-				// this is a check to see if the sequence goes beyond the input 
-				// stream.  if its not the first and only character in the input
-				// stream this isn't an error, otherwise it is.
-				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
-					{
-					// check to see if this sequence was the first character
-					if ((pUnicode - aUnicode.Ptr()) == 0)
-						{
-						return EErrorIllFormedInput;
-						}
-					break;
-					}			
-				
-				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
-			
-				/* check the trailing bytes, they should begin with 10 */
-				TUint i = 1;
-
-				do
-					{
-					if ((pUtf8[i] & 0xc0) == 0x80)
-						{
-						// add the trailing 6 bits to the current unicode char
-						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
-						}
-					else
-						{
-						// ill formed character (doesn't have a lead 10)
-						currentUnicodeCharacter = replacementcharacter;
-						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-						illFormed=ETrue;
-						break; 
-						}
-					i++;
-					}
-				while (i < sequenceLength);
-				}
-				
-			/* conformance check.  bits of above table for reference.
-			 * +----------------------------------------------------------------+
-			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
-			 * +--------------------+----------+----------+----------+----------+
-			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
-			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
-			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
-			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
-			 * +--------------------+----------+----------+----------+----------+
-			 */
-			
-			if (currentUnicodeCharacter != replacementcharacter)
-				{
-				if (sequenceLength == 3)
-					{
-					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
-						{
-						currentUnicodeCharacter = replacementcharacter;
-						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-						illFormed=ETrue;
-						}
-					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
-						{
-						currentUnicodeCharacter = replacementcharacter;
-						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-						illFormed=ETrue;
-						}
-					}
-				else if (sequenceLength == 4)
-					{
-					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
-						{
-						currentUnicodeCharacter = replacementcharacter;
-						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-						illFormed=ETrue;
-						}
-					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
-						{
-						currentUnicodeCharacter = replacementcharacter;
-						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-						illFormed=ETrue;
-						}
-					}
-				
-				
-				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
-				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
-				 * points D800..DFFF is ill formed */
-				
-				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
-					{
-					currentUnicodeCharacter = replacementcharacter;
-					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
-							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
-					illFormed=ETrue;
-					}	
-				}
-				// end conformance check
-			}
-
-		// would this character generate a surrogate pair in UTF-16?
-		if (currentUnicodeCharacter > 0xFFFF)
-			{
-			// is there enough space to hold a surrogate pair in the output?
-			if (pUnicode >= pLastUnicode)
-				{
-				break; // no, end processing.
-				}
-			
-			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
-			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
-					
-			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
-			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
-			}
-		else
-			{
-			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
-			}
-		
-		// move the input pointer
-		if (currentUnicodeCharacter != replacementcharacter)
-			{
-			pUtf8 += sequenceLength;
-			}
-		else if(illFormed == EFalse)
-			{
-			pUtf8 += (sequenceLength);
-			}
-		else
-			{
-			// we had a character we didn't recognize (i.e. it was invalid)
-			// so move to the next character in the input
-			pUtf8++;
-			}
-		
-		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
-			{ 
-			break;  // we've either reached the end of the input or the end of output
-			}
-		}
-
-	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
-	return (pLastUtf8 - pUtf8 + 1);
-	}
-
-/** Given a sample text this function attempts to determine whether or not
- *  the same text is encoded using the UTF-8 standard encoding scheme.
-
-@param TInt a confidence level, given at certain value.  if the given sample
-			is UTF-8 this value will not be changed (unless > 100) then its
-			set to 100.  Otherwise if the same isn't UTF-8, its set to 0.
-@param TDesC8 sample text.
-UTF-8. The default is EFalse.
-@return void
- */
-
-/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
- * Well formed UTF-8 Byte Sequences, full table.
- * +----------------------------------------------------------------+
- * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
- * +--------------------+----------+----------+----------+----------+
- * | U+0000..U+007F     | 00..7D   |          |          |          |  1 byte, ascii
- * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
- * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
- * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
- * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
- * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
- * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
- * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
- * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
- * +--------------------+----------+----------+----------+----------+
- * 
- * As a consequence of the well-formedness conditions specified in table 3-7,
- * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
- * 
- * Code Rules:
- *   R1: If the string contains any non-UTF-8 characters the returned confidence
- *       is 0.  Valid UTF-8 combinations are listed in the above table.
- *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in  
- *       the (see ) the returned confidence is 95.
- *   R3: Otherwise the confidence returned is based upon the sample string 
- *       length.
- *   R4: If the sample string is under 75 characters, the confidence is set to 
- *       75.
- */
-void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
-	{
-
-	TInt sampleLength = aSample.Length();
-	
-	if (sampleLength == 0)
-		{
-		aConfidenceLevel = 89;
-		return;
-		}
-	TInt bytesRemaining  = 0;
-	TUint sequenceLength  = 0;
-	
-	aConfidenceLevel = sampleLength;
-
-	const TUint8* buffer = &aSample[0];
-
-	if (sampleLength < 95)
-		{
-		// check for the BOM
-		if ((sampleLength >= 3) && 
-			((buffer[0] == 0xEF) &&
-			 (buffer[1] == 0xBB) &&
-			 (buffer[2] == 0xBF)) 
-			) 
-			{
-			aConfidenceLevel = 95;
-			}
-		else if (sampleLength < 75)
-			{
-			aConfidenceLevel = 75;
-			}
-		}
-	
-	for (TInt index = 0;index != sampleLength;index++)
-		{
-		
-		if (bytesRemaining > 0)
-			{
-			// bytesRemaining > 0, means that a byte representing the start of a 
-			// multibyte sequence was encountered and the bytesRemaining is the 
-			// number of bytes to follow. 
-			
-			if ((buffer[index] & 0xc0) == 0x80) 
-				{
-				// need to check for ill-formed sequences -- all are in the 2nd byte
-				
-				if ((sequenceLength == 3) && (bytesRemaining == 2))
-					{
-					if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0))
-						{
-						aConfidenceLevel = 0;
-						break;
-						}
-					else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f))
-						{
-						aConfidenceLevel = 0;
-						break;
-						}
-					}
-				else if ((sequenceLength == 4) && (bytesRemaining == 3))
-					{
-					if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90))
-						{
-						aConfidenceLevel = 0;
-						break;
-						}
-					else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f))
-						{
-						aConfidenceLevel = 0;
-						break;
-						}
-					}
-				
-				--bytesRemaining;
-				continue;
-				}
-			else
-				{
-				aConfidenceLevel = 0;
-				break;
-				}
-			}
-		
-		if (bytesRemaining == 0)
-			{
-			if (buffer[index] < 0x80)
-				{
-				// The value of aSample[index] is in the range 0x00-0x7f
-				//UTF8 maintains ASCII transparency. So it's a valid
-				//UTF8. Do nothing, check next value.
-				continue;
-				}
-			else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0))
-				{
-				// valid start of a 2 byte sequence (see conformance note)
-				sequenceLength = 2;
-				bytesRemaining = 1;
-				}
-			else if ((buffer[index] & 0xf0) == 0xe0)
-				{
-				// valid start of a 3 byte sequence
-				sequenceLength = 3;
-				bytesRemaining = 2;
-				}
-			else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5))
-				{
-				// valid start of a 4 byte sequence (see conformance note)
-				sequenceLength = 4;
-				bytesRemaining = 3;
-				}	
-			else
-				{
-				// wasn't anything expected so must be an illegal/irregular UTF8 coded value
-				aConfidenceLevel = 0;
-				break;
-				}
-			}
-		} // for 
-	
-	aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0;
-	}
-
-// End of file
+/*
+* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of the License "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+
+
+#include <e32std.h>
+#include <e32base.h>
+#include <utf.h>
+
+#define STATIC_CAST(t,v) static_cast<t>(v) // macro spelling of the C++ static_cast
+#define CONST_CAST(t,v) const_cast<t>(v) // macro spelling of the C++ const_cast
+#define FOREVER for(;;) // loop without an exit condition; the code using it leaves via break or return
+
+const TUint KNotInBase64Alphabet=KMaxTUint; // NOTE(review): not referenced in the visible code; presumably marks a character outside the Base64 alphabet - confirm
+
+enum TPanic // reason codes handed to Panic(); the first is 1 and each later enumerator is one greater
+	{
+	EPanicBad6BitNumber=1,
+	EPanicBadUtf7Pointers1,
+	EPanicBadUtf7Pointers2,
+	EPanicBadUtf7Pointers3,
+	EPanicBadUtf7Pointers4,
+	EPanicBadUtf7Pointers5,
+	EPanicBadUtf7Pointers6,
+	EPanicBadUtf7Pointers7,
+	EPanicBadUtf7Pointers8,
+	EPanicBadUtf7Pointers9,
+	EPanicBadUtf7Pointers10,
+	EPanicBadUtf7Pointers11,
+	EPanicNotInBase64Block,
+	EPanicBadUnicodePointers1,
+	EPanicBadUnicodePointers2,
+	EPanicBadUnicodePointers3, // raised by a debug assertion in ConvertFromUnicodeToUtf8
+	EPanicBadUnicodePointers4, // raised by a debug assertion in ConvertFromUnicodeToUtf8
+	EPanicBadUnicodePointers5,
+	EPanicBadUnicodePointers6,
+	EPanicBadUnicodePointers7,
+	EPanicBadUnicodePointers8,
+	EPanicBadUnicodePointers9,
+	EPanicBadUnicodePointers10,
+	EPanicBadBitBufferState1,
+	EPanicBadBitBufferState2,
+	EPanicBadBitBufferState3,
+	EPanicBadBitBufferState4,
+	EPanicBadBitBufferState5,
+	EPanicBadBitBufferState6,
+	EPanicBadBitBufferState7,
+	EPanicBadBitBufferState8,
+	EPanicBadBitBufferState9,
+	EPanicBadBitBufferState10,
+	EPanicBadBitBufferState11,
+	EPanicBadBitBufferState12,
+	EPanicBadBitBufferState13,
+	EPanicBadBitBufferState14,
+	EPanicBadBitBufferState15,
+	EPanicBadBitBufferState16,
+	EPanicBadBitBufferState17,
+	EPanicUnexpectedNumberOfLoopIterations,
+	EPanicInitialEscapeCharacterButNoBase64,
+	EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary,
+	EPanicBadUtf8Pointers1, // raised by a debug assertion in ConvertFromUnicodeToUtf8
+	EPanicBadUtf8Pointers2, // raised by a debug assertion in ConvertFromUnicodeToUtf8
+	EPanicBadUtf8Pointers3,
+	EPanicBadUtf8Pointers4,
+	EPanicBadUtf8Pointers5,
+	EPanicBadUtf8Pointers6,
+	EPanicBadUtf8Pointers7,
+	EPanicOutOfSyncUtf7Byte1,
+	EPanicOutOfSyncUtf7Byte2,
+	EPanicOutOfSyncBase64Decoding
+	};
+
+_LIT(KLitPanicText, "CHARCONV-UTF"); // category text that Panic() hands to User::Panic
+// Calls User::Panic with the category KLitPanicText and aPanic as the reason code.
+LOCAL_C void Panic(TPanic aPanic)
+	{
+	User::Panic(KLitPanicText, aPanic);
+	}
+// Returns the character that opens a Base64 block: '&' when aIsImapUtf7 is set, '+' otherwise.
+inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {if (aIsImapUtf7) {return '&';} return '+';}
+// Tests whether any of the lowest aNumberOfBitsInBuffer bits of aBitBuffer is set.
+inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer)
+	{
+	return (aBitBuffer&((1<<aNumberOfBitsInBuffer)-1))!=0; // mask is built by shifting the int constant 1, so the count must stay below the width of int
+	}
+
+
+
+
+
+
+ 
+
+/** Converts Unicode text into orthodox (non-Java) UTF-8 encoding.
+Forwards to the three-argument overload with aGenerateJavaConformantUtf8 set to EFalse.
+@param aUtf8 On return, contains the UTF-8 encoded output string.
+@param aUnicode The Unicode-encoded input string.
+@return The number of unconverted characters left at the end of the input 
+descriptor, or one of the error values defined in TError. */
+EXPORT_C TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, const TDesC16& aUnicode)
+	{
+	return ConvertFromUnicodeToUtf8(aUtf8, aUnicode, EFalse);
+	}
+
+
+
+/** Converts Unicode text into UTF-8 encoding. 
+
+Unless Java-conformant output is requested, a high/low surrogate pair in the
+input is combined and written as a single 4 byte UTF-8 sequence.
+
+In the Java variant U+0000 is written as the two bytes 0xC0 0x80 and surrogate
+values are not paired: each one is written as its own 3 byte sequence.
+
+@param aUtf8 On return, contains the UTF-8 encoded output string.
+@param aUnicode The input string as 16-bit units; may contain surrogate pairs.
+@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java UTF-8.
+@return The number of unconverted characters left at the end of the input descriptor, 
+or EErrorIllFormedInput for the badly paired high surrogates detected in the body. */
+TInt CnvUtfConverter::ConvertFromUnicodeToUtf8(TDes8& aUtf8, 
+											   const TDesC16& aUnicode, 
+											   TBool aGenerateJavaConformantUtf8)
+	{
+	if (aUnicode.Length() == 0)
+		{
+		aUtf8.SetLength(0);
+		return 0;
+		}
+	if (aUtf8.MaxLength() == 0)
+		{
+		return aUnicode.Length(); // no room for any output: the whole input is reported as unconverted
+		}
+	
+	TUint8* pUtf8 = CONST_CAST(TUint8*, aUtf8.Ptr());
+	const TUint8* pointerToLastUtf8Byte = pUtf8 + (aUtf8.MaxLength() - 1);
+	TBool inputIsTruncated = EFalse;
+	const TUint16* pUnicode = aUnicode.Ptr();
+	const TUint16* pointerToLastUnicodeCharacter = pUnicode + (aUnicode.Length() - 1);
+	
+	FOREVER
+		{
+		__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers1));
+		__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers3));
+		// each branch below leaves pUtf8 on the last byte it wrote; the loop tail then advances both pointers by one
+		if (pUnicode[0] < 0x80)
+			{
+			// ascii - 1 byte
+			
+			// the Java variant differs here: the \x0000 character is
+			// written as the two bytes \xC0 \x80.
+			
+			if ((aGenerateJavaConformantUtf8) && (pUnicode[0] == 0x0000))
+				{
+				if (pUtf8 == pointerToLastUtf8Byte)
+					{
+					// only one byte of room left: step both pointers back so the code after the loop counts this character as unconverted
+					pUtf8--;
+					pUnicode--;
+					break;			
+					}
+				*pUtf8++ = STATIC_CAST(TUint8, 0xc0);
+				*pUtf8   = STATIC_CAST(TUint8, 0x80);	
+				}
+			else
+				{
+				*pUtf8 = STATIC_CAST(TUint8, pUnicode[0]);
+				}
+			}
+		else if (pUnicode[0] < 0x800)
+			{
+			// U+0080..U+07FF - 2 bytes
+			
+			if (pUtf8 == pointerToLastUtf8Byte)
+				{
+				pUtf8--; // not enough room: same back-off as above
+				pUnicode--;
+				break;
+				}
+			
+			*pUtf8++ = STATIC_CAST(TUint8, 0xc0|(pUnicode[0]>>6));
+			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
+			
+			}
+
+		// check for a high surrogate (0xD800..0xDBFF): outside Java mode a surrogate pair is written
+		// as one 4 byte sequence; everything else that reaches this point takes the 3 byte branch.
+
+		else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8)
+			{
+			// surrogate pair - 4 bytes in utf-8
+			// U+10000..U+10FFFF
+			
+			__ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2));
+			// is there enough space to hold the character
+			if ((pointerToLastUtf8Byte - pUtf8) < 3)
+				{
+				pUtf8--;
+				pUnicode--;
+				break;  // no: leave the loop with this character unconverted
+				}
+			
+			__ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4));
+			if (pUnicode >= pointerToLastUnicodeCharacter)
+				{
+				pUtf8--;
+				pUnicode--;
+				inputIsTruncated = ETrue;
+				break; // the input ends on the high surrogate; handled after the loop
+				}
+			
+			if ((pUnicode[1] & 0xfc00) != 0xdc00)
+				{
+				return EErrorIllFormedInput; // the high surrogate is not followed by a low surrogate
+				}
+			
+			// convert utf-16 surrogate to utf-32
+			TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000;
+			
+			// convert utf-32 to utf-8
+            *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18));   
+            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f));
+            *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f));
+            *pUtf8   = STATIC_CAST(TUint8,0x80 | (ch & 0x3f));
+			
+            // two utf-16 values were consumed: skip the first here, the loop tail skips the second
+			pUnicode++;
+			}		
+		else
+			{
+			// 3 byte - utf-8, U+800..U+FFFF rest of BMP (this is also where surrogates end up in Java mode).
+			
+			if (pointerToLastUtf8Byte - pUtf8 < 2)
+				{
+				pUtf8--; // fewer than three bytes of room: back off and stop
+				pUnicode--;
+				break;
+				}
+			*pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12));
+			*pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f));
+			*pUtf8   = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f));
+			}
+		
+		if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte))
+			{
+			break; // the last input value was consumed or the last output byte was written
+			}
+		
+		pUtf8++;
+		pUnicode++;
+		
+		}
+	// NOTE(review): after a back-off on the very first character both pointers sit one element before their buffers; below they are only compared and subtracted
+	if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated)
+		{
+		return EErrorIllFormedInput; // the input was nothing but a single unpaired high surrogate
+		}
+	
+	aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); // pUtf8 addresses the last byte written, hence the +1
+	return pointerToLastUnicodeCharacter-pUnicode; // pUnicode addresses the last value consumed
+	}
+
+
+
+
+
+ 
+
+
+
+
+
+/** Converts text encoded using the Unicode transformation format UTF-8 into 
+16-bit Unicode, treating the input as standard (non-Java) UTF-8.
+
+This is a convenience overload: it forwards to the three-argument overload
+with aGenerateJavaConformantUtf8 set to EFalse.
+
+@param aUnicode On return, contains the Unicode encoded output string.
+@param aUtf8 The UTF-8 encoded input string
+@return The number of unconverted bytes left at the end of the input descriptor, 
+or one of the error values defined in TError. */
+EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8)
+	{
+	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse);
+	}
+
+/** Records one more unconvertible (ill-formed) input sequence.
+
+The index is stored only while the running count is still zero or negative,
+i.e. for the first unconvertible sequence seen; the count is then incremented.
+
+@param aNumberOfUnconvertibleCharacters Running count, incremented by one.
+@param aIndexOfFirstByteOfFirstUnconvertibleCharacter Receives aIndex if no
+unconvertible sequence had been counted before this call.
+@param aIndex Byte offset of the offending sequence within the input. Taken
+as a TInt so that offsets beyond 255 are not truncated. */
+static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters,
+		TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TInt aIndex)
+	{
+	if (aNumberOfUnconvertibleCharacters<=0)
+		{
+		aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex;
+		}
+	++aNumberOfUnconvertibleCharacters;
+	}
+
+/** Converts text encoded using the Unicode transformation format UTF-8 into 
+16-bit Unicode, without reporting details about unconvertible input.
+
+Forwards to the five-argument overload and discards the two counters that
+overload maintains.
+
+@param aUnicode On return, contains the Unicode encoded output string.
+@param aUtf8 The UTF-8 encoded input string
+@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
+UTF-8.
+@return The number of unconverted bytes left at the end of the input descriptor, 
+or one of the error values defined in TError. */
+TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8)
+	{
+	// The five-argument overload tests and increments these values but never
+	// assigns them first, so they must start out defined: no unconvertible
+	// characters seen yet, and a negative "not found" index.
+	TInt dummyUnconverted = 0;
+	TInt dummyUnconvertedIndex = -1;
+	return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex);
+	}
+
+/** Converts text encoded using the Unicode transformation format UTF-8 into 
+16-bit Unicode. A well formed 4 byte UTF-8 sequence is written as a surrogate
+pair. An ill-formed sequence is written as U+FFFD (the replacement character)
+and decoding carries on with the following input byte.
+
+The variant of UTF-8 used internally by Java differs slightly from standard 
+UTF-8. When aGenerateJavaConformantUtf8 is ETrue the two byte sequence
+0xC0 0x80 is accepted as well and decodes to U+0000.
+
+@param aUnicode On return, contains the Unicode encoded output string. Its
+length is reset to zero on entry and no more than MaxLength() 16-bit units
+are written.
+@param aUtf8 The UTF-8 encoded input string
+@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java 
+UTF-8. The default is EFalse.
+@param aNumberOfUnconvertibleCharacters Incremented once for every ill-formed
+sequence that was replaced by U+FFFD. This function never resets the value,
+so the caller has to supply a defined starting value (normally zero).
+@param aIndexOfFirstByteOfFirstUnconvertibleCharacter Set to the index in
+aUtf8 of the first byte of the first unconvertible sequence. For instance if
+the first character in the input descriptor could not be converted, then
+this parameter is set to zero. It is only written while
+aNumberOfUnconvertibleCharacters is still zero or negative, and it is left
+untouched when every character converts, so the caller should preset it to a
+negative value to be able to detect that case.
+@return The number of unconverted bytes left at the end of the input descriptor
+(the whole input length if either descriptor has no room/content), or
+EErrorIllFormedInput if the very first sequence of the input is truncated. */
+
+/* of note: conformance.  Unicode standard 5.0 section 3.9, table 3-7
+ * Well formed UTF-8 Byte Sequences, full table.
+ * +----------------------------------------------------------------+
+ * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
+ * +--------------------+----------+----------+----------+----------+
+ * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
+ * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2 
+ * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
+ * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
+ * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
+ * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
+ * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
+ * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
+ * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
+ * +--------------------+----------+----------+----------+----------+
+ * 
+ * As a consequence of the well-formedness conditions specified in table 3-7,
+ * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
+ */
+TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8,
+		TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+	{	
+	aUnicode.SetLength(0);
+	
+	if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0))
+		{
+		return aUtf8.Length();
+		}
+
+	TUint16*           pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr());
+	const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1);
+	const TUint8*         pUtf8 = aUtf8.Ptr();   
+	const TUint8*     pLastUtf8 = pUtf8 + (aUtf8.Length() - 1);
+	const TUint16 replacementcharacter = 0xFFFD;
+	TUint currentUnicodeCharacter;
+	TUint sequenceLength;
+
+	
+	FOREVER
+		{
+		TBool illFormed=EFalse;
+		
+		__ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8));
+		__ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3));
+		
+		sequenceLength = 1;
+		
+		// ascii - optimisation (i.e. it isn't a sequence)
+		if (pUtf8[0] < 0x80)
+			{
+			currentUnicodeCharacter = pUtf8[0];
+			}
+		else
+			{
+			// see if well formed utf-8, use table above for reference	
+			if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf))
+				{
+				// 0xc0-0xc1 are not valid bytes
+				sequenceLength = 2;
+				}
+			else if ((pUtf8[0] & 0xf0) == 0xe0)
+				{
+				sequenceLength = 3;
+				}
+			else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5))
+				{
+				// 0xf5-0xff, are not valid bytes
+				sequenceLength = 4;
+				}
+			else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8)
+				{
+				if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80))
+					{
+					// either we've split the 0xc0 0x80 (i.e. 0xc0 is
+					// the last character in the string) or we've
+					// discovered a valid 0xc0 0x80 sequence.  
+					sequenceLength = 2;
+					}
+				}
+			
+			/* checking to see if we got a valid sequence */
+			if (sequenceLength == 1)
+				{
+				// bad value in the leading byte, 0xc0-0xc1,0xf5-0xff for example
+				currentUnicodeCharacter = replacementcharacter;
+				UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+						aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+				}
+			else
+				{
+				// this is a check to see if the sequence goes beyond the input 
+				// stream.  if it is not the first character in the input stream
+				// this isn't an error (we just stop here), otherwise it is.
+				if ((pUtf8 + sequenceLength - 1) >  pLastUtf8)
+					{
+					// check to see if this sequence was the first character
+					if ((pUnicode - aUnicode.Ptr()) == 0)
+						{
+						return EErrorIllFormedInput;
+						}
+					break;
+					}			
+				
+				currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength);
+			
+				/* check the trailing bytes, they should begin with 10 */
+				TUint i = 1;
+
+				do
+					{
+					if ((pUtf8[i] & 0xc0) == 0x80)
+						{
+						// add the trailing 6 bits to the current unicode char
+						currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F);
+						}
+					else
+						{
+						// ill formed character (trailing byte doesn't start with bits 10)
+						currentUnicodeCharacter = replacementcharacter;
+						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+						illFormed=ETrue;
+						break; 
+						}
+					i++;
+					}
+				while (i < sequenceLength);
+				}
+				
+			/* conformance check.  bits of above table for reference.
+			 * +----------------------------------------------------------------+
+			 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
+			 * +--------------------+----------+----------+----------+----------+
+			 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, 2nd < 0xA0
+			 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, 2nd > 0x9F
+			 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, 2nd < 0x90
+			 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, 2nd > 0x8F
+			 * +--------------------+----------+----------+----------+----------+
+			 */
+			
+			if (currentUnicodeCharacter != replacementcharacter)
+				{
+				if (sequenceLength == 3)
+					{
+					if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0))
+						{
+						currentUnicodeCharacter = replacementcharacter;
+						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+						illFormed=ETrue;
+						}
+					else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F))
+						{
+						currentUnicodeCharacter = replacementcharacter;
+						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+						illFormed=ETrue;
+						}
+					}
+				else if (sequenceLength == 4)
+					{
+					if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90))
+						{
+						currentUnicodeCharacter = replacementcharacter;
+						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+						illFormed=ETrue;
+						}
+					else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F))
+						{
+						currentUnicodeCharacter = replacementcharacter;
+						UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+								aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+						illFormed=ETrue;
+						}
+					}
+				
+				
+				/* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points
+				 * are not Unicode scalar values, any UTF-8 byte sequence that would map to code 
+				 * points D800..DFFF is ill formed */
+				
+				if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF))
+					{
+					currentUnicodeCharacter = replacementcharacter;
+					UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters,
+							aIndexOfFirstByteOfFirstUnconvertibleCharacter,	pUtf8-aUtf8.Ptr());
+					illFormed=ETrue;
+					}	
+				}
+				// end conformance check
+			}
+
+		// would this character generate a surrogate pair in UTF-16?
+		if (currentUnicodeCharacter > 0xFFFF)
+			{
+			// is there enough space to hold a surrogate pair in the output?
+			if (pUnicode >= pLastUnicode)
+				{
+				break; // no, end processing.
+				}
+			
+			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
+			*pUnicode++ = STATIC_CAST(TUint16, surrogate);
+					
+			surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00;
+			*pUnicode++ = STATIC_CAST(TUint16, surrogate);			
+			}
+		else
+			{
+			*pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter);
+			}
+		
+		// move the input pointer; a genuine U+FFFD in the input, or a bad lead
+		// byte (sequenceLength == 1), is not flagged illFormed and is skipped whole
+		if (currentUnicodeCharacter != replacementcharacter)
+			{
+			pUtf8 += sequenceLength;
+			}
+		else if(illFormed == EFalse)
+			{
+			pUtf8 += (sequenceLength);
+			}
+		else
+			{
+			// we had a character we didn't recognize (i.e. it was invalid)
+			// so move to the next byte in the input and resynchronise there
+			pUtf8++;
+			}
+		
+		if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode))
+			{ 
+			break;  // we've either reached the end of the input or the end of output
+			}
+		}
+
+	aUnicode.SetLength(pUnicode - aUnicode.Ptr());
+	return (pLastUtf8 - pUtf8 + 1);
+	}
+
+/* Conformance reference: Unicode standard 5.0 section 3.9, table 3-7
+ * Well formed UTF-8 Byte Sequences, full table.
+ * +----------------------------------------------------------------+
+ * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
+ * +--------------------+----------+----------+----------+----------+
+ * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
+ * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2
+ * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
+ * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
+ * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
+ * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
+ * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xf0, error if 2nd < 0x90
+ * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
+ * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
+ * +--------------------+----------+----------+----------+----------+
+ *
+ * As a consequence of the well-formedness conditions specified in table 3-7,
+ * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
+ */
+
+/** Tells whether aSecond, although it is a continuation byte, is ruled out as
+the second byte of a sequence led by aLead (the four restricted rows of the
+table above). Every other lead byte places no extra restriction.
+
+@param aLead The lead byte of the sequence.
+@param aSecond The byte that directly follows the lead byte.
+@return Non-zero if the pair is ill-formed, EFalse otherwise. */
+static TBool IsDisallowedSecondByte(TUint8 aLead, TUint8 aSecond)
+	{
+	switch (aLead)
+		{
+		case 0xe0:
+			return aSecond < 0xa0;
+		case 0xed:
+			return aSecond > 0x9f;
+		case 0xf0:
+			return aSecond < 0x90;
+		case 0xf4:
+			return aSecond > 0x8f;
+		default:
+			return EFalse;
+		}
+	}
+
+/** Estimates how likely it is that a sample of text is encoded as UTF-8.
+
+Rules applied:
+  - An empty sample yields a confidence of 89.
+  - Any byte that breaks the well-formedness table above yields 0.
+  - Otherwise, a sample shorter than 95 bytes that starts with the UTF-8
+    byte order mark (EF BB BF) yields 95.
+  - Otherwise, a sample shorter than 75 bytes yields 75.
+  - Otherwise the confidence is the sample length, capped at 100.
+A multi-byte sequence cut off by the end of the sample is not treated as
+an error.
+
+@param aConfidenceLevel On return, the confidence (0..100) that aSample is
+UTF-8. The value passed in is not read.
+@param aSample The sample text to examine.
+ */
+void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample)
+	{
+	const TInt length = aSample.Length();
+
+	if (length == 0)
+		{
+		aConfidenceLevel = 89;
+		return;
+		}
+
+	const TUint8* const bytes = &aSample[0];
+
+	// Starting confidence, before any byte has been validated.
+	const TBool startsWithBom = (length >= 3) &&
+								(bytes[0] == 0xEF) &&
+								(bytes[1] == 0xBB) &&
+								(bytes[2] == 0xBF);
+	if ((length < 95) && startsWithBom)
+		{
+		aConfidenceLevel = 95;
+		}
+	else if (length < 75)
+		{
+		aConfidenceLevel = 75;
+		}
+	else
+		{
+		aConfidenceLevel = length;
+		}
+
+	TInt pending = 0;                  // continuation bytes still owed by the current sequence
+	TBool expectSecondByte = EFalse;   // ETrue only for the byte right after a lead byte
+	TInt pos = 0;
+
+	while (pos != length)
+		{
+		const TUint8 current = bytes[pos];
+
+		if (pending > 0)
+			{
+			// Inside a multi-byte sequence: must be 10xxxxxx, and the second
+			// byte must additionally respect the restricted ranges.
+			const TBool isContinuation = ((current & 0xc0) == 0x80);
+			if (!isContinuation ||
+				(expectSecondByte && IsDisallowedSecondByte(bytes[pos - 1], current)))
+				{
+				aConfidenceLevel = 0;
+				break;
+				}
+			expectSecondByte = EFalse;
+			--pending;
+			}
+		else if (current >= 0x80)
+			{
+			// Not ASCII, so it has to be a legal lead byte.
+			if ((current >= 0xc2) && (current < 0xe0))
+				{
+				pending = 1;	// 2 byte sequence
+				}
+			else if ((current & 0xf0) == 0xe0)
+				{
+				pending = 2;	// 3 byte sequence
+				}
+			else if ((current >= 0xf0) && (current < 0xf5))
+				{
+				pending = 3;	// 4 byte sequence
+				}
+			else
+				{
+				// stray continuation byte or one of C0-C1, F5-FF
+				aConfidenceLevel = 0;
+				break;
+				}
+			expectSecondByte = ETrue;
+			}
+		// plain ASCII (current < 0x80) needs no handling
+
+		++pos;
+		}
+
+	// Keep the result within 0..100.
+	if (aConfidenceLevel > 100)
+		{
+		aConfidenceLevel = 100;
+		}
+	else if (aConfidenceLevel < 0)
+		{
+		aConfidenceLevel = 0;
+		}
+	}
+
+// End of file