diff -r 4122176ea935 -r 56f325a607ea userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp --- a/userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp Mon Dec 21 16:14:42 2009 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,461 +0,0 @@ -/* -* Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). -* All rights reserved. -* This component and the accompanying materials are made available -* under the terms of "Eclipse Public License v1.0" -* which accompanies this distribution, and is available -* at the URL "http://www.eclipse.org/legal/epl-v10.html". -* -* Initial Contributors: -* Nokia Corporation - initial contribution. -* -* Contributors: -* -* Description: -* -*/ -// There are 2 reasons why not use existing unicodeconv.cpp: -// 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially -// for huge code pages (e.g, Asia code pages). See INC127598. -// -// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle. -// -// The algorithm of this special version unicodeconv.cpp is straightforward: -// 1) foreign->unicode: -// 1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in -// "cp54936_2byte_tounicode.cpp", which is generated with command -// "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt". -// -// 1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then -// search into the mapping table in "cp54936_4byte_tounicode.cpp", -// which is generated with command -// "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt". -// -// 1.3) 4 byte->unicode non-bmp: calculate with formula in this file. -// -// 2) unicode->foreign: -// 2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp" -// can map directly, which is generated with command -// "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt". -// -// 2.2) unicode non-bmp->4 byte: calculate with formula in this file. -// -// The function cp54936_2byte_tounicode.cpp::TConvDataStruct:: -// ConvertSingleUnicode() is not used anymore. It's reserved just because not -// changing the tool FatConversionTable.pl. -// -// About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt": -// 1) All Private Used Area (PUA) code points are reserved. -// 2) All GB18030 code points that mapping to undefined Unicode are reserved. -// -// -// About the formula for non-bmp calculation: -// 1) All code points from 0x10000 to 0x10FFFF are supported. -// 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from -// the GB18030 standard, since the standard does not define the mapping for -// code points out of 0x20000-0x2FFFF. - - -#include -#include -#include -#include "unicodeconv.h" -#include "cp54936.h" - - -enum TFccPanic - { - EBadForeignCode = 0, - E4ByteIndexOutOfRange, - EPanicBadIndices1, - EInavlidUnicodeValue - }; -void Panic(TFccPanic aPanic) - { - - User::Panic(_L("FatCharsetConv"),aPanic); - } - - -//replacement character to be used when unicode cannot be converted -const TUint8 KForeignReplacement = 0x5F; - -const TUint8 KU10000Byte1 = 0x90; -const TUint8 KU10000Byte2 = 0x30; -const TUint8 KU10000Byte3 = 0x81; -const TUint8 KU10000Byte4 = 0x30; - -inline TBool IsSupplementary(TUint aChar) -/** -@param aChar The 32-bit code point value of a Unicode character. - -@return True, if aChar is supplementary character; false, otherwise. -*/ - { - return (aChar > 0xFFFF); - } - -inline TBool IsSurrogate(TText16 aInt16) -/** -@return True, if aText16 is high surrogate or low surrogate; false, otherwise. -*/ - { - return (aInt16 & 0xF800) == 0xD800; - } - -inline TBool IsHighSurrogate(TText16 aInt16) -/** -@return True, if aText16 is high surrogate; false, otherwise. -*/ - { - return (aInt16 & 0xFC00) == 0xD800; - } - -inline TBool IsLowSurrogate(TText16 aInt16) -/** -@return True, if aText16 is low surrogate; false, otherwise. -*/ - { - return (aInt16 & 0xFC00) == 0xDC00; - } - -inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate) -/** -Combine a high surrogate and a low surrogate into a supplementary character. - -@return The 32-bit code point value of the generated Unicode supplementary - character. -*/ - { - return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate; - } - -inline TText16 GetHighSurrogate(TUint aChar) -/** -Retrieve the high surrogate of a supplementary character. - -@param aChar The 32-bit code point value of a Unicode character. - -@return High surrogate of aChar, if aChar is a supplementary character; - aChar itself, if aChar is not a supplementary character. -*/ - { - return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10)); - } - -inline TText16 GetLowSurrogate(TUint aChar) -/** -Retrieve the low surrogate of a supplementary character. - -@param aChar The 32-bit code point value of a Unicode character. - -@return Low surrogate of aChar, if aChar is a supplementary character; - zero, if aChar is not a supplementary character. -*/ - { - return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF)); - } - -//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor -EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode) - { - UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue); - } - -//This function converts from Unicoded characters, to foreign characters and adds them into a descriptor -EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow) - { - const TInt length = aUnicode.Length(); - const TUint16* unicode = aUnicode.Ptr(); - const TUint16* guard = unicode + length; - - TUint8* foreign = const_cast(aForeign.Ptr()); - TUint8* foreignguard = foreign + aForeign.MaxLength(); - - //loop going through the character of the unicode descriptor - while (unicode < guard) - { - TUint32 unicodeChar = *unicode++; - if (IsHighSurrogate(unicodeChar)) - { - if (unicode >= guard || !IsLowSurrogate(*unicode)) - { - if (foreign >= foreignguard) - { - aForeign.SetLength(foreign-aForeign.Ptr()); - if (leaveWhenOverflow) - User::Leave(KErrOverflow); - else - return KErrOverflow; - } - *foreign++ = KForeignReplacement; - continue; - } - unicodeChar = JoinSurrogate(unicodeChar, *unicode++); - } - if (IsLowSurrogate(unicodeChar)) - { - if (foreign >= foreignguard) - { - aForeign.SetLength(foreign-aForeign.Ptr()); - if (leaveWhenOverflow) - User::Leave(KErrOverflow); - else - return KErrOverflow; - } - *foreign++ = KForeignReplacement; - continue; - } - - TUint8 b1, b2, b3, b4; // byte 1,2,3,4 of result GB18030 code. - TInt count; // byte count of result GB18030 code; can be 1, 2 or 4. - - // unicode to cp54936 - if (IsSupplementary(unicodeChar)) - { - unicodeChar -= 0x10000; - b4 = unicodeChar % 10 + KU10000Byte4; - unicodeChar /= 10; - b3 = unicodeChar % 126 + KU10000Byte3; - unicodeChar /= 126; - b2 = unicodeChar % 10 + KU10000Byte2; - b1 = unicodeChar / 10 + KU10000Byte1; - count = 4; - } - else - { - TUint32 foreignChar; - foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar]; - b1 = ((foreignChar >> 24) & 0xFF); - b2 = ((foreignChar >> 16) & 0xFF); - b3 = ((foreignChar >> 8) & 0xFF); - b4 = (foreignChar & 0xFF); - count = 1; - if (b1) - { - count = 4; - } - else - { - __ASSERT_DEBUG(b2==0, Panic(EBadForeignCode)); - if (b3) - { - count = 2; - } - } - } - - if (foreign + count > foreignguard) - { - aForeign.SetLength(foreign-aForeign.Ptr()); - if (leaveWhenOverflow) - User::Leave(KErrOverflow); - else - return KErrOverflow; - } - if (count == 4) - { - *foreign++ = b1; - *foreign++ = b2; - } - if (count >= 2) - *foreign++ = b3; - *foreign++ = b4; - } - aForeign.SetLength(foreign-aForeign.Ptr()); - return KErrNone; - } - - -//This function converts from foreign characters into unicode and adds them into a descriptor -EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign) - { - UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue); - } - -//This function converts from foreign characters into unicode and adds them into a descriptor -EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow) - { - const TInt foreignLength = aForeign.Length(); - const TUint8* foreign = aForeign.Ptr(); - const TUint8* guard = foreign + foreignLength; - - TUint16* unicode = const_cast(aUnicode.Ptr()); - TUint16* unicodeguard = unicode + aUnicode.MaxLength(); - - TUint8 b1, b2, b3, b4; - enum TCodeType - { - E1Byte = 0, - E2Byte, - E4ByteBmp, - E4ByteSupplementary, - EError, - }; - TCodeType codetype; - TUint32 unicodeChar; - - //loop going through the characters of the foreign descriptor - while (foreign < guard) - { - // roughly, detect which area the foreign code belongs to - b1 = *foreign++; - if (b1 <= 0x7F) - codetype = E1Byte; - else if (b1 == 0x80 || b1 > 0xFE) - codetype = EError; - else if (foreign >= guard) - codetype = EError; - else - { - b2 = *foreign++; - if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F) - codetype = E2Byte; - else if (b2 < 0x30 || b2 > 0x39) - codetype = EError; - else if (foreign+1 >= guard) - codetype = EError; - else - { - b3 = *foreign++; - if (b3 < 0x81 || b3 > 0xFE) - codetype = EError; - else - { - b4 = *foreign++; - if (b4 < 0x30 || b4 > 0x39) - codetype = EError; - else if (b1 >= 0x81 && b1 <= 0x84) // 0x81308130-0x8439FE39 - codetype = E4ByteBmp; - else if (b1 >= 0x90 && b1 <= 0xE3) // 0x90308130-0xE339FE39 - codetype = E4ByteSupplementary; - else - codetype = EError; // others are reserved - } - } - } - - // cp54936 to unicode - if (codetype == E1Byte) - { - unicodeChar = b1; - } - else if (codetype == E2Byte) - { - // conventional algorithm used in FatCharsetConv - const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80); - if (structPtr->iUnicodeIfSingle) - unicodeChar = structPtr->iUnicodeIfSingle; - else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte) - unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)]; - else - unicodeChar = 0xFFFD; - } - else if (codetype == E4ByteBmp) - { - TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30); - __ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange)); - unicodeChar = KMappingTable4ByteBmp2Unicode[index]; - } - else if (codetype == E4ByteSupplementary) - { - unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 + - (b2 - KU10000Byte2) * 1260 + - (b3 - KU10000Byte3) * 10 + - (b4 - KU10000Byte4); - __ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue)); - } - else - { - unicodeChar = 0xFFFD; - } - - // append to output buffer - if (IsSupplementary(unicodeChar)) - { - if (unicode + 1 >= unicodeguard) - { - aUnicode.SetLength(unicode-aUnicode.Ptr()); - if (leaveWhenOverflow) - User::Leave(KErrOverflow); - else - return KErrOverflow; - } - *unicode++ = GetHighSurrogate(unicodeChar); - *unicode++ = GetLowSurrogate(unicodeChar); - } - else - { - if (unicode >= unicodeguard) - { - aUnicode.SetLength(unicode-aUnicode.Ptr()); - if (leaveWhenOverflow) - User::Leave(KErrOverflow); - else - return KErrOverflow; - } - *unicode++ = unicodeChar; - } - } - aUnicode.SetLength(unicode-aUnicode.Ptr()); - return KErrNone; - } - -EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter) - { - //1. aCharacter >= 0x0080 - if (aCharacter>=0x0080) - { - // Since all Unicode characters can be mapped to GB18030, so no need to - // test the converting. - if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter)) - return ETrue; - else - return EFalse; - } - - // For most common cases: - // Note: lower case characters are considered legal DOS char here. - if ((aCharacter>='a' && aCharacter<='z') || - (aCharacter>='A' && aCharacter<='Z') || - (aCharacter>='0' && aCharacter<='9')) - { - return ETrue; - } - // Checking for illegal chars: - // 2. aCharacter <= 0x20 - // Note: leading 0x05 byte should be guarded by callers of this function - // as the information of the position of the character is required. - if (aCharacter < 0x20) - return EFalse; - // Space (' ') is not considered as a legal DOS char here. - if (aCharacter == 0x20) - return EFalse; - - // 3. 0x20 < aCharacter < 0x80 - // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": - switch (aCharacter) - { - case 0x22: // '"' - case 0x2A: // '*' - case 0x2B: // '+' - case 0x2C: // ',' - //case 0x2E: // '.' // Although '.' is not allowed in any bytes of DIR_Name, it - // is a valid character in short file names. - case 0x2F: // '/' - case 0x3A: // ':' - case 0x3B: // ';' - case 0x3C: // '<' - case 0x3D: // '=' - case 0x3E: // '>' - case 0x3F: // '?' - case 0x5B: // '[' - case 0x5C: // '\' - case 0x5D: // ']' - case 0x7C: // '|' - return EFalse; - default: - return ETrue; - } - } -