diff -r 675a964f4eb5 -r 35751d3474b7 securityanddataprivacytools/securitytools/certapp/store--/utf.cpp --- a/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp Tue Jul 21 01:04:32 2009 +0100 +++ b/securityanddataprivacytools/securitytools/certapp/store--/utf.cpp Thu Sep 10 14:01:51 2009 +0300 @@ -1,742 +1,742 @@ -/* -* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). -* All rights reserved. -* This component and the accompanying materials are made available -* under the terms of the License "Eclipse Public License v1.0" -* which accompanies this distribution, and is available -* at the URL "http://www.eclipse.org/legal/epl-v10.html". -* -* Initial Contributors: -* Nokia Corporation - initial contribution. -* -* Contributors: -* -* Description: -* -*/ - - -#include -#include -#include - -#define STATIC_CAST(t,v) static_cast(v) -#define CONST_CAST(t,v) const_cast(v) -#define FOREVER for(;;) - -const TUint KNotInBase64Alphabet=KMaxTUint; - -enum TPanic - { - EPanicBad6BitNumber=1, - EPanicBadUtf7Pointers1, - EPanicBadUtf7Pointers2, - EPanicBadUtf7Pointers3, - EPanicBadUtf7Pointers4, - EPanicBadUtf7Pointers5, - EPanicBadUtf7Pointers6, - EPanicBadUtf7Pointers7, - EPanicBadUtf7Pointers8, - EPanicBadUtf7Pointers9, - EPanicBadUtf7Pointers10, - EPanicBadUtf7Pointers11, - EPanicNotInBase64Block, - EPanicBadUnicodePointers1, - EPanicBadUnicodePointers2, - EPanicBadUnicodePointers3, - EPanicBadUnicodePointers4, - EPanicBadUnicodePointers5, - EPanicBadUnicodePointers6, - EPanicBadUnicodePointers7, - EPanicBadUnicodePointers8, - EPanicBadUnicodePointers9, - EPanicBadUnicodePointers10, - EPanicBadBitBufferState1, - EPanicBadBitBufferState2, - EPanicBadBitBufferState3, - EPanicBadBitBufferState4, - EPanicBadBitBufferState5, - EPanicBadBitBufferState6, - EPanicBadBitBufferState7, - EPanicBadBitBufferState8, - EPanicBadBitBufferState9, - EPanicBadBitBufferState10, - EPanicBadBitBufferState11, - EPanicBadBitBufferState12, - 
EPanicBadBitBufferState13, - EPanicBadBitBufferState14, - EPanicBadBitBufferState15, - EPanicBadBitBufferState16, - EPanicBadBitBufferState17, - EPanicUnexpectedNumberOfLoopIterations, - EPanicInitialEscapeCharacterButNoBase64, - EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary, - EPanicBadUtf8Pointers1, - EPanicBadUtf8Pointers2, - EPanicBadUtf8Pointers3, - EPanicBadUtf8Pointers4, - EPanicBadUtf8Pointers5, - EPanicBadUtf8Pointers6, - EPanicBadUtf8Pointers7, - EPanicOutOfSyncUtf7Byte1, - EPanicOutOfSyncUtf7Byte2, - EPanicOutOfSyncBase64Decoding - }; - -_LIT(KLitPanicText, "CHARCONV-UTF"); - -LOCAL_C void Panic(TPanic aPanic) - { - User::Panic(KLitPanicText, aPanic); - } - -inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return aIsImapUtf7? '&': '+';} - -inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) - { - return (aBitBuffer&((1<>6)); - *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); - - } - - // check to see if we have a surrogate in the stream, surrogates encode code points outside - // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars. - - else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8) - { - // surrogate pair - 4 bytes in utf-8 - // U+10000..U+10FFFF - - __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); - // is there enough space to hold the character - if ((pointerToLastUtf8Byte - pUtf8) < 3) - { - pUtf8--; - pUnicode--; - break; // no go to the exit condition - } - - __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); - if (pUnicode >= pointerToLastUnicodeCharacter) - { - pUtf8--; - pUnicode--; - inputIsTruncated = ETrue; - break; // middle of a surrogate pair. 
go to end condition - } - - if ((pUnicode[1] & 0xfc00) != 0xdc00) - { - return EErrorIllFormedInput; - } - - // convert utf-16 surrogate to utf-32 - TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000; - - // convert utf-32 to utf-8 - *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18)); - *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f)); - *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f)); - *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f)); - - // we consumed 2 utf-16 values, move this pointer - pUnicode++; - } - else - { - // 3 byte - utf-8, U+800..U+FFFF rest of BMP. - - if (pointerToLastUtf8Byte - pUtf8 < 2) - { - pUtf8--; - pUnicode--; - break; - } - *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12)); - *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f)); - *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); - } - - if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) - { - break; - } - - pUtf8++; - pUnicode++; - - } - - if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) - { - return EErrorIllFormedInput; - } - - aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); - return pointerToLastUnicodeCharacter-pUnicode; - } - - - - - - - - - - - -/** Converts text encoded using the Unicode transformation format UTF-8 into the -Unicode UCS-2 character set. - -@param aUnicode On return, contains the Unicode encoded output string. -@param aUtf8 The UTF-8 encoded input string -@return The number of unconverted bytes left at the end of the input descriptor, -or one of the error values defined in TError. 
*/ -EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) - { - return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); - } - -static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, - TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) - { - if (aNumberOfUnconvertibleCharacters<=0) - { - aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; - } - ++aNumberOfUnconvertibleCharacters; - } - -/** Converts text encoded using the Unicode transformation format UTF-8 into the -Unicode UCS-2 character set. - -@param aUnicode On return, contains the Unicode encoded output string. -@param aUtf8 The UTF-8 encoded input string -@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java -@return The number of unconverted bytes left at the end of the input descriptor, -or one of the error values defined in TError. */ -TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) - { - TInt dummyUnconverted, dummyUnconvertedIndex; - return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); - } - -/** Converts text encoded using the Unicode transformation format UTF-8 into the -Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. - -The variant of UTF-8 used internally by Java differs slightly from standard -UTF-8. The TBool argument controls the UTF-8 variant generated by this function. - -@param aUnicode On return, contains the Unicode encoded output string. -@param aUtf8 The UTF-8 encoded input string -@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java -UTF-8. The default is EFalse. -@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes -which were not converted. 
-@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index -of the first byte of the first unconvertible character. For instance if the -first character in the input descriptor (aForeign) could not be converted, -then this parameter is set to the first byte of that character, i.e. zero. -A negative value is returned if all the characters were converted. -@return The number of unconverted bytes left at the end of the input descriptor, -or one of the error values defined in TError. */ - -/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 - * Well formed UTF-8 Byte Sequences, full table. - * +----------------------------------------------------------------+ - * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | - * +--------------------+----------+----------+----------+----------+ - * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii - * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 - * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 - * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal - * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F - * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal - * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 - * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal - * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F - * +--------------------+----------+----------+----------+----------+ - * - * As a consequence of the well-formedness conditions specified in table 3-7, - * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. 
- */ -TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, - TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) - { - aUnicode.SetLength(0); - - if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) - { - return aUtf8.Length(); - } - - TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); - const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); - const TUint8* pUtf8 = aUtf8.Ptr(); - const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); - const TUint16 replacementcharacter = 0xFFFD; - TUint currentUnicodeCharacter; - TUint sequenceLength; - - - FOREVER - { - TBool illFormed=EFalse; - - __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); - __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); - - sequenceLength = 1; - - // ascii - optimisation (i.e. it isn't a sequence) - if (pUtf8[0] < 0x80) - { - currentUnicodeCharacter = pUtf8[0]; - } - else - { - // see if well formed utf-8, use table above for reference - if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) - { - // 0xc1-0xc2 are not valid bytes - sequenceLength = 2; - } - else if ((pUtf8[0] & 0xf0) == 0xe0) - { - sequenceLength = 3; - } - else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) - { - // 0xf5-0xff, are not valid bytes - sequenceLength = 4; - } - else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) - { - if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) - { - // either we've split the 0xc0 0x80 (i.e. 0xc0 is - // the last character in the string) or we've - // discovered a valid 0xc0 0x80 sequence. 
- sequenceLength = 2; - } - } - - /* checking to see if we got a valid sequence */ - if (sequenceLength == 1) - { - // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - } - else - { - // this is a check to see if the sequence goes beyond the input - // stream. if its not the first and only character in the input - // stream this isn't an error, otherwise it is. - if ((pUtf8 + sequenceLength - 1) > pLastUtf8) - { - // check to see if this sequence was the first character - if ((pUnicode - aUnicode.Ptr()) == 0) - { - return EErrorIllFormedInput; - } - break; - } - - currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); - - /* check the trailing bytes, they should begin with 10 */ - TUint i = 1; - - do - { - if ((pUtf8[i] & 0xc0) == 0x80) - { - // add the trailing 6 bits to the current unicode char - currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); - } - else - { - // ill formed character (doesn't have a lead 10) - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - break; - } - i++; - } - while (i < sequenceLength); - } - - /* conformance check. bits of above table for reference. 
- * +----------------------------------------------------------------+ - * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | - * +--------------------+----------+----------+----------+----------+ - * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 - * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F - * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 - * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F - * +--------------------+----------+----------+----------+----------+ - */ - - if (currentUnicodeCharacter != replacementcharacter) - { - if (sequenceLength == 3) - { - if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) - { - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - } - else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) - { - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - } - } - else if (sequenceLength == 4) - { - if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) - { - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - } - else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) - { - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - } - } - - - /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points - * are not Unicode scalar values, any UTF-8 byte sequence that would map to code - * points D800..DFFF is ill formed 
*/ - - if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) - { - currentUnicodeCharacter = replacementcharacter; - UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, - aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); - illFormed=ETrue; - } - } - // end conformance check - } - - // would this character generate a surrogate pair in UTF-16? - if (currentUnicodeCharacter > 0xFFFF) - { - // is there enough space to hold a surrogate pair in the output? - if (pUnicode >= pLastUnicode) - { - break; // no, end processing. - } - - TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; - *pUnicode++ = STATIC_CAST(TUint16, surrogate); - - surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; - *pUnicode++ = STATIC_CAST(TUint16, surrogate); - } - else - { - *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); - } - - // move the input pointer - if (currentUnicodeCharacter != replacementcharacter) - { - pUtf8 += sequenceLength; - } - else if(illFormed == EFalse) - { - pUtf8 += (sequenceLength); - } - else - { - // we had a character we didn't recognize (i.e. it was invalid) - // so move to the next character in the input - pUtf8++; - } - - if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) - { - break; // we've either reached the end of the input or the end of output - } - } - - aUnicode.SetLength(pUnicode - aUnicode.Ptr()); - return (pLastUtf8 - pUtf8 + 1); - } - -/** Given a sample text this function attempts to determine whether or not - * the same text is encoded using the UTF-8 standard encoding scheme. - -@param TInt a confidence level, given at certain value. if the given sample - is UTF-8 this value will not be changed (unless > 100) then its - set to 100. Otherwise if the same isn't UTF-8, its set to 0. -@param TDesC8 sample text. -UTF-8. The default is EFalse. -@return void - */ - -/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 - * Well formed UTF-8 Byte Sequences, full table. 
- * +----------------------------------------------------------------+ - * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | - * +--------------------+----------+----------+----------+----------+ - * | U+0000..U+007F | 00..7D | | | | 1 byte, ascii - * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 - * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 - * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal - * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F - * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal - * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 - * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal - * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F - * +--------------------+----------+----------+----------+----------+ - * - * As a consequence of the well-formedness conditions specified in table 3-7, - * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. - * - * Code Rules: - * R1: If the string contains any non-UTF-8 characters the returned confidence - * is 0. Valid UTF-8 combinations are listed in the above table. - * R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark) in - * the (see ) the returned confidence is 95. - * R3: Otherwise the confidence returned is based upon the sample string - * length. - * R4: If the sample string is under 75 characters, the confidence is set to - * 75. 
- */ -void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) - { - - TInt sampleLength = aSample.Length(); - - if (sampleLength == 0) - { - aConfidenceLevel = 89; - return; - } - TInt bytesRemaining = 0; - TUint sequenceLength = 0; - - aConfidenceLevel = sampleLength; - - const TUint8* buffer = &aSample[0]; - - if (sampleLength < 95) - { - // check for the BOM - if ((sampleLength >= 3) && - ((buffer[0] == 0xEF) && - (buffer[1] == 0xBB) && - (buffer[2] == 0xBF)) - ) - { - aConfidenceLevel = 95; - } - else if (sampleLength < 75) - { - aConfidenceLevel = 75; - } - } - - for (TInt index = 0;index != sampleLength;index++) - { - - if (bytesRemaining > 0) - { - // bytesRemaining > 0, means that a byte representing the start of a - // multibyte sequence was encountered and the bytesRemaining is the - // number of bytes to follow. - - if ((buffer[index] & 0xc0) == 0x80) - { - // need to check for ill-formed sequences -- all are in the 2nd byte - - if ((sequenceLength == 3) && (bytesRemaining == 2)) - { - if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) - { - aConfidenceLevel = 0; - break; - } - else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) - { - aConfidenceLevel = 0; - break; - } - } - else if ((sequenceLength == 4) && (bytesRemaining == 3)) - { - if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) - { - aConfidenceLevel = 0; - break; - } - else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) - { - aConfidenceLevel = 0; - break; - } - } - - --bytesRemaining; - continue; - } - else - { - aConfidenceLevel = 0; - break; - } - } - - if (bytesRemaining == 0) - { - if (buffer[index] < 0x80) - { - // The value of aSample[index] is in the range 0x00-0x7f - //UTF8 maintains ASCII transparency. So it's a valid - //UTF8. Do nothing, check next value. 
- continue; - } - else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) - { - // valid start of a 2 byte sequence (see conformance note) - sequenceLength = 2; - bytesRemaining = 1; - } - else if ((buffer[index] & 0xf0) == 0xe0) - { - // valid start of a 3 byte sequence - sequenceLength = 3; - bytesRemaining = 2; - } - else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) - { - // valid start of a 4 byte sequence (see conformance note) - sequenceLength = 4; - bytesRemaining = 3; - } - else - { - // wasn't anything expected so must be an illegal/irregular UTF8 coded value - aConfidenceLevel = 0; - break; - } - } - } // for - - aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; - } - -// End of file +/* +* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of the License "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. 
+* +* Contributors: +* +* Description: +* +*/ + + +#include +#include +#include + +#define STATIC_CAST(t,v) static_cast(v) +#define CONST_CAST(t,v) const_cast(v) +#define FOREVER for(;;) + +const TUint KNotInBase64Alphabet=KMaxTUint; + +enum TPanic + { + EPanicBad6BitNumber=1, + EPanicBadUtf7Pointers1, + EPanicBadUtf7Pointers2, + EPanicBadUtf7Pointers3, + EPanicBadUtf7Pointers4, + EPanicBadUtf7Pointers5, + EPanicBadUtf7Pointers6, + EPanicBadUtf7Pointers7, + EPanicBadUtf7Pointers8, + EPanicBadUtf7Pointers9, + EPanicBadUtf7Pointers10, + EPanicBadUtf7Pointers11, + EPanicNotInBase64Block, + EPanicBadUnicodePointers1, + EPanicBadUnicodePointers2, + EPanicBadUnicodePointers3, + EPanicBadUnicodePointers4, + EPanicBadUnicodePointers5, + EPanicBadUnicodePointers6, + EPanicBadUnicodePointers7, + EPanicBadUnicodePointers8, + EPanicBadUnicodePointers9, + EPanicBadUnicodePointers10, + EPanicBadBitBufferState1, + EPanicBadBitBufferState2, + EPanicBadBitBufferState3, + EPanicBadBitBufferState4, + EPanicBadBitBufferState5, + EPanicBadBitBufferState6, + EPanicBadBitBufferState7, + EPanicBadBitBufferState8, + EPanicBadBitBufferState9, + EPanicBadBitBufferState10, + EPanicBadBitBufferState11, + EPanicBadBitBufferState12, + EPanicBadBitBufferState13, + EPanicBadBitBufferState14, + EPanicBadBitBufferState15, + EPanicBadBitBufferState16, + EPanicBadBitBufferState17, + EPanicUnexpectedNumberOfLoopIterations, + EPanicInitialEscapeCharacterButNoBase64, + EPanicBase64SequenceDoesNotFallOnUnicodeCharacterBoundary, + EPanicBadUtf8Pointers1, + EPanicBadUtf8Pointers2, + EPanicBadUtf8Pointers3, + EPanicBadUtf8Pointers4, + EPanicBadUtf8Pointers5, + EPanicBadUtf8Pointers6, + EPanicBadUtf8Pointers7, + EPanicOutOfSyncUtf7Byte1, + EPanicOutOfSyncUtf7Byte2, + EPanicOutOfSyncBase64Decoding + }; + +_LIT(KLitPanicText, "CHARCONV-UTF"); + +LOCAL_C void Panic(TPanic aPanic) + { + User::Panic(KLitPanicText, aPanic); + } + +inline TUint EscapeCharacterForStartingBase64Block(TBool aIsImapUtf7) {return 
aIsImapUtf7? '&': '+';} + +inline TBool BitBufferContainsNonZeroBits(TUint aBitBuffer, TInt aNumberOfBitsInBuffer) + { + return (aBitBuffer&((1<>6)); + *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); + + } + + // check to see if we have a surrogate in the stream, surrogates encode code points outside + // the BMP and are 4 utf-8 chars, otherwise what we have here is 3 utf-8 chars. + + else if (((pUnicode[0] & 0xfc00) == 0xd800) && !aGenerateJavaConformantUtf8) + { + // surrogate pair - 4 bytes in utf-8 + // U+10000..U+10FFFF + + __ASSERT_DEBUG(pUtf8 <= pointerToLastUtf8Byte, Panic(EPanicBadUtf8Pointers2)); + // is there enough space to hold the character + if ((pointerToLastUtf8Byte - pUtf8) < 3) + { + pUtf8--; + pUnicode--; + break; // no go to the exit condition + } + + __ASSERT_DEBUG(pUnicode <= pointerToLastUnicodeCharacter, Panic(EPanicBadUnicodePointers4)); + if (pUnicode >= pointerToLastUnicodeCharacter) + { + pUtf8--; + pUnicode--; + inputIsTruncated = ETrue; + break; // middle of a surrogate pair. go to end condition + } + + if ((pUnicode[1] & 0xfc00) != 0xdc00) + { + return EErrorIllFormedInput; + } + + // convert utf-16 surrogate to utf-32 + TUint ch = ((pUnicode[0] - 0xD800) << 10 | (pUnicode[1] - 0xDC00)) + 0x10000; + + // convert utf-32 to utf-8 + *pUtf8++ = STATIC_CAST(TUint8,0xf0 | (ch >> 18)); + *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 12) & 0x3f)); + *pUtf8++ = STATIC_CAST(TUint8,0x80 | ((ch >> 6) & 0x3f)); + *pUtf8 = STATIC_CAST(TUint8,0x80 | (ch & 0x3f)); + + // we consumed 2 utf-16 values, move this pointer + pUnicode++; + } + else + { + // 3 byte - utf-8, U+800..U+FFFF rest of BMP. 
+ + if (pointerToLastUtf8Byte - pUtf8 < 2) + { + pUtf8--; + pUnicode--; + break; + } + *pUtf8++ = STATIC_CAST(TUint8, 0xe0|(pUnicode[0]>>12)); + *pUtf8++ = STATIC_CAST(TUint8, 0x80|((pUnicode[0]>>6)&0x3f)); + *pUtf8 = STATIC_CAST(TUint8, 0x80|(pUnicode[0]&0x3f)); + } + + if ((pUnicode == pointerToLastUnicodeCharacter) || (pUtf8 == pointerToLastUtf8Byte)) + { + break; + } + + pUtf8++; + pUnicode++; + + } + + if ((pUnicode < aUnicode.Ptr()) && inputIsTruncated) + { + return EErrorIllFormedInput; + } + + aUtf8.SetLength((pUtf8 - aUtf8.Ptr())+1); + return pointerToLastUnicodeCharacter-pUnicode; + } + + + + + + + + + + + +/** Converts text encoded using the Unicode transformation format UTF-8 into the +Unicode UCS-2 character set. + +@param aUnicode On return, contains the Unicode encoded output string. +@param aUtf8 The UTF-8 encoded input string +@return The number of unconverted bytes left at the end of the input descriptor, +or one of the error values defined in TError. */ +EXPORT_C TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8) + { + return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, EFalse); + } + +static void UpdateUnconvertibleInfo(TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, TUint8 aIndex) + { + if (aNumberOfUnconvertibleCharacters<=0) + { + aIndexOfFirstByteOfFirstUnconvertibleCharacter = aIndex; + } + ++aNumberOfUnconvertibleCharacters; + } + +/** Converts text encoded using the Unicode transformation format UTF-8 into the +Unicode UCS-2 character set. + +@param aUnicode On return, contains the Unicode encoded output string. +@param aUtf8 The UTF-8 encoded input string +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java +@return The number of unconverted bytes left at the end of the input descriptor, +or one of the error values defined in TError. 
*/ +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8) + { + TInt dummyUnconverted, dummyUnconvertedIndex; + return ConvertToUnicodeFromUtf8(aUnicode, aUtf8, aGenerateJavaConformantUtf8, dummyUnconverted, dummyUnconvertedIndex); + } + +/** Converts text encoded using the Unicode transformation format UTF-8 into the +Unicode UCS-2 character set. Surrogate pairs can be created when a valid 4 byte UTF-8 is input. + +The variant of UTF-8 used internally by Java differs slightly from standard +UTF-8. The TBool argument controls the UTF-8 variant generated by this function. + +@param aUnicode On return, contains the Unicode encoded output string. +@param aUtf8 The UTF-8 encoded input string +@param aGenerateJavaConformantUtf8 EFalse for orthodox UTF-8. ETrue for Java +UTF-8. The default is EFalse. +@param aNumberOfUnconvertibleCharacters On return, contains the number of bytes +which were not converted. +@param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, the index +of the first byte of the first unconvertible character. For instance if the +first character in the input descriptor (aForeign) could not be converted, +then this parameter is set to the first byte of that character, i.e. zero. +A negative value is returned if all the characters were converted. +@return The number of unconverted bytes left at the end of the input descriptor, +or one of the error values defined in TError. */ + +/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 + * Well formed UTF-8 Byte Sequences, full table. 
+ * +----------------------------------------------------------------+ + * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | + * +--------------------+----------+----------+----------+----------+ + * | U+0000..U+007F | 00..7F | | | | 1 byte, ascii + * | U+0080..U+07FF | C2..DF | 80..BF | | | 2 bytes, error if 1st < 0xC2 + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, error if 2nd < 0xA0 + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | normal + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, error if 2nd > 0x9F + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | normal + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, error if 2nd < 0x90 + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | normal + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, error if 2nd > 0x8F + * +--------------------+----------+----------+----------+----------+ + * + * As a consequence of the well-formedness conditions specified in table 3-7, + * the following byte values are disallowed in UTF-8: C0-C1, F5-FF. 
+ */ +TInt CnvUtfConverter::ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, TBool aGenerateJavaConformantUtf8, + TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) + { + aUnicode.SetLength(0); + + if ((aUtf8.Length() == 0) || (aUnicode.MaxLength() == 0)) + { + return aUtf8.Length(); + } + + TUint16* pUnicode = CONST_CAST(TUint16*, aUnicode.Ptr()); + const TUint16* pLastUnicode = pUnicode + (aUnicode.MaxLength() - 1); + const TUint8* pUtf8 = aUtf8.Ptr(); + const TUint8* pLastUtf8 = pUtf8 + (aUtf8.Length() - 1); + const TUint16 replacementcharacter = 0xFFFD; + TUint currentUnicodeCharacter; + TUint sequenceLength; + + + FOREVER + { + TBool illFormed=EFalse; + + __ASSERT_DEBUG(pUnicode <= pLastUnicode, Panic(EPanicBadUnicodePointers8)); + __ASSERT_DEBUG(pUtf8 <= pLastUtf8, Panic(EPanicBadUtf8Pointers3)); + + sequenceLength = 1; + + // ascii - optimisation (i.e. it isn't a sequence) + if (pUtf8[0] < 0x80) + { + currentUnicodeCharacter = pUtf8[0]; + } + else + { + // see if well formed utf-8, use table above for reference + if ((pUtf8[0] >= 0xc2) && (pUtf8[0] <= 0xdf)) + { + // 0xc1-0xc2 are not valid bytes + sequenceLength = 2; + } + else if ((pUtf8[0] & 0xf0) == 0xe0) + { + sequenceLength = 3; + } + else if ((pUtf8[0] >= 0xf0) && (pUtf8[0] < 0xf5)) + { + // 0xf5-0xff, are not valid bytes + sequenceLength = 4; + } + else if ((pUtf8[0] == 0xc0) && aGenerateJavaConformantUtf8) + { + if ((pUtf8 == pLastUtf8) || (pUtf8[1] == 0x80)) + { + // either we've split the 0xc0 0x80 (i.e. 0xc0 is + // the last character in the string) or we've + // discovered a valid 0xc0 0x80 sequence. 
+ sequenceLength = 2; + } + } + + /* checking to see if we got a valid sequence */ + if (sequenceLength == 1) + { + // bad value in the leading byte, 0xc0-0xc1,0x5f-0xff for example + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + } + else + { + // this is a check to see if the sequence goes beyond the input + // stream. if its not the first and only character in the input + // stream this isn't an error, otherwise it is. + if ((pUtf8 + sequenceLength - 1) > pLastUtf8) + { + // check to see if this sequence was the first character + if ((pUnicode - aUnicode.Ptr()) == 0) + { + return EErrorIllFormedInput; + } + break; + } + + currentUnicodeCharacter = pUtf8[0] & (0x7F>>sequenceLength); + + /* check the trailing bytes, they should begin with 10 */ + TUint i = 1; + + do + { + if ((pUtf8[i] & 0xc0) == 0x80) + { + // add the trailing 6 bits to the current unicode char + currentUnicodeCharacter = (currentUnicodeCharacter <<6 ) | (pUtf8[i] & 0x3F); + } + else + { + // ill formed character (doesn't have a lead 10) + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + break; + } + i++; + } + while (i < sequenceLength); + } + + /* conformance check. bits of above table for reference. 
+ * +----------------------------------------------------------------+ + * | Code Points | 1st byte | 2nd byte | 3rd byte | 4th byte | + * +--------------------+----------+----------+----------+----------+ + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | 3 bytes, 1st == 0xE0, 2nd < 0xA0 + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | 3 bytes, 1st == 0xED, 2nd > 0x9F + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | 4 bytes, 1st == 0xf0, 2nd < 0x90 + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | 4 bytes, 1st == 0xF4, 2nd > 0x8F + * +--------------------+----------+----------+----------+----------+ + */ + + if (currentUnicodeCharacter != replacementcharacter) + { + if (sequenceLength == 3) + { + if ((pUtf8[0] == 0xE0) && (pUtf8[1] < 0xA0)) + { + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + } + else if ((pUtf8[0] == 0xED) && (pUtf8[1] > 0x9F)) + { + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + } + } + else if (sequenceLength == 4) + { + if ((pUtf8[0] == 0xF0) && (pUtf8[1] < 0x90)) + { + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + } + else if ((pUtf8[0] == 0xF4) && (pUtf8[1] > 0x8F)) + { + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + } + } + + + /* last conformance check - Unicode 5.0 section 3.9 D92 Because surrogate code points + * are not Unicode scalar values, any UTF-8 byte sequence that would map to code + * points D800..DFFF is ill formed 
*/ + + if ((currentUnicodeCharacter >= 0xD800) && (currentUnicodeCharacter <= 0xDFFF)) + { + currentUnicodeCharacter = replacementcharacter; + UpdateUnconvertibleInfo(aNumberOfUnconvertibleCharacters, + aIndexOfFirstByteOfFirstUnconvertibleCharacter, pUtf8-aUtf8.Ptr()); + illFormed=ETrue; + } + } + // end conformance check + } + + // would this character generate a surrogate pair in UTF-16? + if (currentUnicodeCharacter > 0xFFFF) + { + // is there enough space to hold a surrogate pair in the output? + if (pUnicode >= pLastUnicode) + { + break; // no, end processing. + } + + TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0; + *pUnicode++ = STATIC_CAST(TUint16, surrogate); + + surrogate = (currentUnicodeCharacter & 0x3FF) + 0xDC00; + *pUnicode++ = STATIC_CAST(TUint16, surrogate); + } + else + { + *pUnicode++ = STATIC_CAST(TUint16, currentUnicodeCharacter); + } + + // move the input pointer + if (currentUnicodeCharacter != replacementcharacter) + { + pUtf8 += sequenceLength; + } + else if(illFormed == EFalse) + { + pUtf8 += (sequenceLength); + } + else + { + // we had a character we didn't recognize (i.e. it was invalid) + // so move to the next character in the input + pUtf8++; + } + + if ((pUtf8 > pLastUtf8) || (pUnicode > pLastUnicode)) + { + break; // we've either reached the end of the input or the end of output + } + } + + aUnicode.SetLength(pUnicode - aUnicode.Ptr()); + return (pLastUtf8 - pUtf8 + 1); + } + +/** Given a sample text this function attempts to determine whether or not + * the same text is encoded using the UTF-8 standard encoding scheme. + +@param TInt a confidence level, given at certain value. if the given sample + is UTF-8 this value will not be changed (unless > 100) then its + set to 100. Otherwise if the same isn't UTF-8, its set to 0. +@param TDesC8 sample text. +UTF-8. The default is EFalse. +@return void + */ + +/* of note: conformance. Unicode standard 5.0 section 3.9, table 3-7 + * Well formed UTF-8 Byte Sequences, full table. 
 * +----------------------------------------------------------------+
 * | Code Points        | 1st byte | 2nd byte | 3rd byte | 4th byte |
 * +--------------------+----------+----------+----------+----------+
 * | U+0000..U+007F     | 00..7F   |          |          |          |  1 byte, ascii
 * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |  2 bytes, error if 1st < 0xC2
 * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |  3 bytes, 1st == 0xE0, error if 2nd < 0xA0
 * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |  normal
 * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |  3 bytes, 1st == 0xED, error if 2nd > 0x9F
 * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |  normal
 * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF0, error if 2nd < 0x90
 * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |  normal
 * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |  4 bytes, 1st == 0xF4, error if 2nd > 0x8F
 * +--------------------+----------+----------+----------+----------+
 *
 * As a consequence of the well-formedness conditions specified in table 3-7,
 * the following byte values are disallowed in UTF-8: C0-C1, F5-FF.
 *
 * Code Rules:
 *   R1: If the string contains any non-UTF-8 characters the returned confidence
 *       is 0.  Valid UTF-8 combinations are listed in the above table.
 *   R2: Otherwise if the string starts with a UTF-8 BOM (byte order mark, the
 *       bytes EF BB BF) the returned confidence is 95.
 *   R3: Otherwise the confidence returned is based upon the sample string
 *       length.
 *   R4: If the sample string is under 75 characters, the confidence is set to
 *       75.
+ */ +void IsCharacterSetUTF8(TInt& aConfidenceLevel, const TDesC8& aSample) + { + + TInt sampleLength = aSample.Length(); + + if (sampleLength == 0) + { + aConfidenceLevel = 89; + return; + } + TInt bytesRemaining = 0; + TUint sequenceLength = 0; + + aConfidenceLevel = sampleLength; + + const TUint8* buffer = &aSample[0]; + + if (sampleLength < 95) + { + // check for the BOM + if ((sampleLength >= 3) && + ((buffer[0] == 0xEF) && + (buffer[1] == 0xBB) && + (buffer[2] == 0xBF)) + ) + { + aConfidenceLevel = 95; + } + else if (sampleLength < 75) + { + aConfidenceLevel = 75; + } + } + + for (TInt index = 0;index != sampleLength;index++) + { + + if (bytesRemaining > 0) + { + // bytesRemaining > 0, means that a byte representing the start of a + // multibyte sequence was encountered and the bytesRemaining is the + // number of bytes to follow. + + if ((buffer[index] & 0xc0) == 0x80) + { + // need to check for ill-formed sequences -- all are in the 2nd byte + + if ((sequenceLength == 3) && (bytesRemaining == 2)) + { + if ((buffer[index - 1] == 0xe0) && (buffer[index] < 0xa0)) + { + aConfidenceLevel = 0; + break; + } + else if ((buffer[index - 1] == 0xed) && (buffer[index] > 0x9f)) + { + aConfidenceLevel = 0; + break; + } + } + else if ((sequenceLength == 4) && (bytesRemaining == 3)) + { + if ((buffer[index - 1] == 0xf0) && (buffer[index] < 0x90)) + { + aConfidenceLevel = 0; + break; + } + else if ((buffer[index - 1] == 0xf4) && (buffer[index] > 0x8f)) + { + aConfidenceLevel = 0; + break; + } + } + + --bytesRemaining; + continue; + } + else + { + aConfidenceLevel = 0; + break; + } + } + + if (bytesRemaining == 0) + { + if (buffer[index] < 0x80) + { + // The value of aSample[index] is in the range 0x00-0x7f + //UTF8 maintains ASCII transparency. So it's a valid + //UTF8. Do nothing, check next value. 
+ continue; + } + else if ((buffer[index] >= 0xc2) && (buffer[index] < 0xe0)) + { + // valid start of a 2 byte sequence (see conformance note) + sequenceLength = 2; + bytesRemaining = 1; + } + else if ((buffer[index] & 0xf0) == 0xe0) + { + // valid start of a 3 byte sequence + sequenceLength = 3; + bytesRemaining = 2; + } + else if ((buffer[index] >= 0xf0) && (buffer[index] < 0xf5)) + { + // valid start of a 4 byte sequence (see conformance note) + sequenceLength = 4; + bytesRemaining = 3; + } + else + { + // wasn't anything expected so must be an illegal/irregular UTF8 coded value + aConfidenceLevel = 0; + break; + } + } + } // for + + aConfidenceLevel = (aConfidenceLevel > 0)? ((aConfidenceLevel > 100)? 100: aConfidenceLevel): 0; + } + +// End of file