diff -r 000000000000 -r 1fb32624e06b charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/charconvfw/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp Tue Feb 02 02:02:46 2010 +0200 @@ -0,0 +1,461 @@ +/* +* Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ +// There are 2 reasons why not use existing unicodeconv.cpp: +// 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially +// for huge code pages (e.g, Asia code pages). See INC127598. +// +// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle. +// +// The algorithm of this special version unicodeconv.cpp is straightforward: +// 1) foreign->unicode: +// 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in +// "cp54936_2byte_tounicode.cpp", which is generated with command +// "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt". +// +// 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then +// search into the mapping table in "cp54936_4byte_tounicode.cpp", +// which is generated with command +// "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt". +// +// 1.3) 4 byte->unicode non-bmp: calculate with formula in this file. +// +// 2) unicode->foreign: +// 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp" +// can map directly, which is generated with command +// "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt". +// +// 2.2) unicode non-bmp->4 byte: calculate with formula in this file. +// +// The function cp54936_2byte_tounicode.cpp::TConvDataStruct:: +// ConvertSingleUnicode() is not used anymore. It's reserved just because not +// changing the tool FatConversionTable.pl. +// +// About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt": +// 1) All Private Used Area (PUA) code points are reserved. +// 2) All GB18030 code points that mapping to undefined Unicode are reserved. +// +// +// About the formula for non-bmp calculation: +// 1) All code points from 0x10000 to 0x10FFFF are supported. +// 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from +// the GB18030 standard, since the standard does not define the mapping for +// code points out of 0x20000-0x2FFFF. + + +#include +#include +#include +#include "unicodeconv.h" +#include "cp54936.h" + + +enum TFccPanic + { + EBadForeignCode = 0, + E4ByteIndexOutOfRange, + EPanicBadIndices1, + EInavlidUnicodeValue + }; +void Panic(TFccPanic aPanic) + { + + User::Panic(_L("FatCharsetConv"),aPanic); + } + + +//replacement character to be used when unicode cannot be converted +const TUint8 KForeignReplacement = 0x5F; + +const TUint8 KU10000Byte1 = 0x90; +const TUint8 KU10000Byte2 = 0x30; +const TUint8 KU10000Byte3 = 0x81; +const TUint8 KU10000Byte4 = 0x30; + +inline TBool IsSupplementary(TUint aChar) +/** +@param aChar The 32-bit code point value of a Unicode character. + +@return True, if aChar is supplementary character; false, otherwise. +*/ + { + return (aChar > 0xFFFF); + } + +inline TBool IsSurrogate(TText16 aInt16) +/** +@return True, if aText16 is high surrogate or low surrogate; false, otherwise. +*/ + { + return (aInt16 & 0xF800) == 0xD800; + } + +inline TBool IsHighSurrogate(TText16 aInt16) +/** +@return True, if aText16 is high surrogate; false, otherwise. +*/ + { + return (aInt16 & 0xFC00) == 0xD800; + } + +inline TBool IsLowSurrogate(TText16 aInt16) +/** +@return True, if aText16 is low surrogate; false, otherwise. +*/ + { + return (aInt16 & 0xFC00) == 0xDC00; + } + +inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate) +/** +Combine a high surrogate and a low surrogate into a supplementary character. + +@return The 32-bit code point value of the generated Unicode supplementary + character. +*/ + { + return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate; + } + +inline TText16 GetHighSurrogate(TUint aChar) +/** +Retrieve the high surrogate of a supplementary character. + +@param aChar The 32-bit code point value of a Unicode character. + +@return High surrogate of aChar, if aChar is a supplementary character; + aChar itself, if aChar is not a supplementary character. +*/ + { + return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10)); + } + +inline TText16 GetLowSurrogate(TUint aChar) +/** +Retrieve the low surrogate of a supplementary character. + +@param aChar The 32-bit code point value of a Unicode character. + +@return Low surrogate of aChar, if aChar is a supplementary character; + zero, if aChar is not a supplementary character. +*/ + { + return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF)); + } + +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor +EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode) + { + UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue); + } + +//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor +EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow) + { + const TInt length = aUnicode.Length(); + const TUint16* unicode = aUnicode.Ptr(); + const TUint16* guard = unicode + length; + + TUint8* foreign = const_cast(aForeign.Ptr()); + TUint8* foreignguard = foreign + aForeign.MaxLength(); + + //loop going through the character of the unicode descriptor + while (unicode < guard) + { + TUint32 unicodeChar = *unicode++; + if (IsHighSurrogate(unicodeChar)) + { + if (unicode >= guard || !IsLowSurrogate(*unicode)) + { + if (foreign >= foreignguard) + { + aForeign.SetLength(foreign-aForeign.Ptr()); + if (leaveWhenOverflow) + User::Leave(KErrOverflow); + else + return KErrOverflow; + } + *foreign++ = KForeignReplacement; + continue; + } + unicodeChar = JoinSurrogate(unicodeChar, *unicode++); + } + if (IsLowSurrogate(unicodeChar)) + { + if (foreign >= foreignguard) + { + aForeign.SetLength(foreign-aForeign.Ptr()); + if (leaveWhenOverflow) + User::Leave(KErrOverflow); + else + return KErrOverflow; + } + *foreign++ = KForeignReplacement; + continue; + } + + TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code. + TInt count; // byte count of result GB18030 code; can be 1, 2 or 4. + + // unicode to cp54936 + if (IsSupplementary(unicodeChar)) + { + unicodeChar -= 0x10000; + b4 = unicodeChar % 10 + KU10000Byte4; + unicodeChar /= 10; + b3 = unicodeChar % 126 + KU10000Byte3; + unicodeChar /= 126; + b2 = unicodeChar % 10 + KU10000Byte2; + b1 = unicodeChar / 10 + KU10000Byte1; + count = 4; + } + else + { + TUint32 foreignChar; + foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar]; + b1 = ((foreignChar >> 24) & 0xFF); + b2 = ((foreignChar >> 16) & 0xFF); + b3 = ((foreignChar >> 8) & 0xFF); + b4 = (foreignChar & 0xFF); + count = 1; + if (b1) + { + count = 4; + } + else + { + __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode)); + if (b3) + { + count = 2; + } + } + } + + if (foreign + count > foreignguard) + { + aForeign.SetLength(foreign-aForeign.Ptr()); + if (leaveWhenOverflow) + User::Leave(KErrOverflow); + else + return KErrOverflow; + } + if (count == 4) + { + *foreign++ = b1; + *foreign++ = b2; + } + if (count >= 2) + *foreign++ = b3; + *foreign++ = b4; + } + aForeign.SetLength(foreign-aForeign.Ptr()); + return KErrNone; + } + + +//This function converts from foreign characters into unicode and adds them into a descriptor +EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign) + { + UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue); + } + +//This function converts from foreign characters into unicode and adds them into a descriptor +EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow) + { + const TInt foreignLength = aForeign.Length(); + const TUint8* foreign = aForeign.Ptr(); + const TUint8* guard = foreign + foreignLength; + + TUint16* unicode = const_cast(aUnicode.Ptr()); + TUint16* unicodeguard = unicode + aUnicode.MaxLength(); + + TUint8 b1, b2, b3, b4; + enum TCodeType + { + E1Byte = 0, + E2Byte, + E4ByteBmp, + E4ByteSupplementary, + EError, + }; + TCodeType codetype; + TUint32 unicodeChar; + + //loop going through the characters of the foreign descriptor + while (foreign < guard) + { + // roughly, detect which area the foreign code belongs to + b1 = *foreign++; + if (b1 <= 0x7F) + codetype = E1Byte; + else if (b1 == 0x80 || b1 > 0xFE) + codetype = EError; + else if (foreign >= guard) + codetype = EError; + else + { + b2 = *foreign++; + if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F) + codetype = E2Byte; + else if (b2 < 0x30 || b2 > 0x39) + codetype = EError; + else if (foreign+1 >= guard) + codetype = EError; + else + { + b3 = *foreign++; + if (b3 < 0x81 || b3 > 0xFE) + codetype = EError; + else + { + b4 = *foreign++; + if (b4 < 0x30 || b4 > 0x39) + codetype = EError; + else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39 + codetype = E4ByteBmp; + else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39 + codetype = E4ByteSupplementary; + else + codetype = EError; // others are reserved + } + } + } + + // cp54936 to unicode + if (codetype == E1Byte) + { + unicodeChar = b1; + } + else if (codetype == E2Byte) + { + // conventional algorithm used in FatCharsetConv + const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80); + if (structPtr->iUnicodeIfSingle) + unicodeChar = structPtr->iUnicodeIfSingle; + else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte) + unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)]; + else + unicodeChar = 0xFFFD; + } + else if (codetype == E4ByteBmp) + { + TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30); + __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange)); + unicodeChar = KMappingTable4ByteBmp2Unicode[index]; + } + else if (codetype == E4ByteSupplementary) + { + unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 + + (b2 - KU10000Byte2) * 1260 + + (b3 - KU10000Byte3) * 10 + + (b4 - KU10000Byte4); + __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue)); + } + else + { + unicodeChar = 0xFFFD; + } + + // append to output buffer + if (IsSupplementary(unicodeChar)) + { + if (unicode + 1 >= unicodeguard) + { + aUnicode.SetLength(unicode-aUnicode.Ptr()); + if (leaveWhenOverflow) + User::Leave(KErrOverflow); + else + return KErrOverflow; + } + *unicode++ = GetHighSurrogate(unicodeChar); + *unicode++ = GetLowSurrogate(unicodeChar); + } + else + { + if (unicode >= unicodeguard) + { + aUnicode.SetLength(unicode-aUnicode.Ptr()); + if (leaveWhenOverflow) + User::Leave(KErrOverflow); + else + return KErrOverflow; + } + *unicode++ = unicodeChar; + } + } + aUnicode.SetLength(unicode-aUnicode.Ptr()); + return KErrNone; + } + +EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter) + { + //1. aCharacter >= 0x0080 + if (aCharacter>=0x0080) + { + // Since all Unicode characters can be mapped to GB18030, so no need to + // test the converting. + if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter)) + return ETrue; + else + return EFalse; + } + + // For most common cases: + // Note: lower case characters are considered legal DOS char here. + if ((aCharacter>='a' && aCharacter<='z') || + (aCharacter>='A' && aCharacter<='Z') || + (aCharacter>='0' && aCharacter<='9')) + { + return ETrue; + } + // Checking for illegal chars: + // 2. aCharacter <= 0x20 + // Note: leading 0x05 byte should be guarded by callers of this function + // as the information of the position of the character is required. + if (aCharacter < 0x20) + return EFalse; + // Space (' ') is not considered as a legal DOS char here. + if (aCharacter == 0x20) + return EFalse; + + // 3. 0x20 < aCharacter < 0x80 + // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": + switch (aCharacter) + { + case 0x22: // '"' + case 0x2A: // '*' + case 0x2B: // '+' + case 0x2C: // ',' + //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it + // is a valid character in short file names. + case 0x2F: // '/' + case 0x3A: // ':' + case 0x3B: // ';' + case 0x3C: // '<' + case 0x3D: // '=' + case 0x3E: // '>' + case 0x3F: // '?' + case 0x5B: // '[' + case 0x5C: // '\' + case 0x5D: // ']' + case 0x7C: // '|' + return EFalse; + default: + return ETrue; + } + } +