userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
changeset 0 a41df078684a
child 2 4122176ea935
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp	Mon Oct 19 15:55:17 2009 +0100
@@ -0,0 +1,459 @@
+// Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
+// All rights reserved.
+// This component and the accompanying materials are made available
+// under the terms of the License "Eclipse Public License v1.0"
+// which accompanies this distribution, and is available
+// at the URL "http://www.eclipse.org/legal/epl-v10.html".
+//
+// Initial Contributors:
+// Nokia Corporation - initial contribution.
+//
+// Contributors:
+//
+// Description:
+//
+// There are 2 reasons for not using the existing unicodeconv.cpp:
+// 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
+//    for huge code pages (e.g, Asia code pages). See INC127598.
+//
+// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
+//
+// The algorithm of this special version unicodeconv.cpp is straightforward:
+// 1) foreign->unicode:
+//    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
+//              "cp54936_2byte_tounicode.cpp", which is generated with command
+//              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
+//
+//    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
+//              search into the mapping table in "cp54936_4byte_tounicode.cpp",
+//              which is generated with command
+//              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
+//
+//    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
+//
+// 2) unicode->foreign:
+//    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
+//              can map directly, which is generated with command
+//              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
+//
+//    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
+//
+// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
+// ConvertSingleUnicode() is not used anymore. It is kept only to avoid
+// changing the tool FatConversionTable.pl.
+//
+// About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
+// 1) All Private Use Area (PUA) code points are reserved.
+// 2) All GB18030 code points that map to undefined Unicode are reserved.
+//
+//
+// About the formula for non-bmp calculation:
+// 1) All code points from 0x10000 to 0x10FFFF are supported.
+// 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are extrapolated from
+//    the GB18030 standard, since the standard does not define the mapping for
+//    code points outside 0x20000-0x2FFFF.
+
+
+#include <e32std.h>
+#include <e32def.h>
+#include <e32des8.h> 
+#include "unicodeconv.h"
+#include "cp54936.h"
+
+
+enum TFccPanic	// reason codes passed to Panic() in this file
+	{
+	EBadForeignCode = 0,	// a from-Unicode table entry has an unexpected byte layout
+	E4ByteIndexOutOfRange,	// index derived from a 4-byte code is beyond the 4-byte table
+	EPanicBadIndices1,	// not referenced in the visible part of this file
+	EInavlidUnicodeValue	// a computed code point lies outside 0x10000-0x10FFFF
+	};
+// Raises a panic in category "FatCharsetConv" with aPanic as the reason code.
+void Panic(TFccPanic aPanic)
+	{
+	User::Panic(_L("FatCharsetConv"), STATIC_CAST(TInt, aPanic));
+	}
+
+
+// Single-byte substitute ('_') written when a UTF-16 unit cannot be converted.
+const TUint8 KForeignReplacement = 0x5F;
+// Bytes 1-4 of GB18030 code 0x90308130, the code that maps to U+10000.
+const TUint8 KU10000Byte1 = 0x90;
+const TUint8 KU10000Byte2 = 0x30;
+const TUint8 KU10000Byte3 = 0x81;
+const TUint8 KU10000Byte4 = 0x30;
+
+inline TBool IsSupplementary(TUint aChar)
+/**
+Tests whether a code point lies beyond the Basic Multilingual Plane.
+@param aChar The 32-bit code point value of a Unicode character.
+@return True if aChar is 0x10000 or above; false otherwise.
+*/
+	{
+	return (0x10000 <= aChar);
+	}
+
+inline TBool IsSurrogate(TText16 aInt16)
+/** Tests for a UTF-16 surrogate code unit of either kind.
+@return True if the top five bits of aInt16 are 11011 (0xD800-0xDFFF); false otherwise. */
+	{
+	const TUint topFiveBits = aInt16 & 0xF800;
+	return topFiveBits == 0xD800;
+	}
+
+inline TBool IsHighSurrogate(TText16 aInt16)
+/** Tests for a UTF-16 high (leading) surrogate.
+@return True if the top six bits of aInt16 are 110110 (0xD800-0xDBFF); false otherwise. */
+	{
+	const TUint topSixBits = aInt16 & 0xFC00;
+	return topSixBits == 0xD800;
+	}
+
+inline TBool IsLowSurrogate(TText16 aInt16)
+/** Tests for a UTF-16 low (trailing) surrogate.
+@return True if the top six bits of aInt16 are 110111 (0xDC00-0xDFFF); false otherwise. */
+	{
+	const TUint topSixBits = aInt16 & 0xFC00;
+	return topSixBits == 0xDC00;
+	}
+
+inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
+/**
+Builds a supplementary code point from a UTF-16 surrogate pair; the arguments are
+not validated. 0xD7F7 folds the 0x10000 offset and 0xDC00 base into the shift.
+@return 0x10000 + ((aHighSurrogate - 0xD800) << 10) + (aLowSurrogate - 0xDC00).
+*/
+	{
+	const TInt shiftedHigh = (aHighSurrogate - 0xD7F7) << 10;
+	return shiftedHigh + aLowSurrogate;
+	}
+
+inline TText16 GetHighSurrogate(TUint aChar)
+/**
+Computes the high (leading) surrogate of a supplementary character.
+
+@param aChar The 32-bit code point value of a Unicode character.
+@return 0xD7C0 + (aChar >> 10), truncated to 16 bits. That is a high surrogate
+        only when aChar is in 0x10000-0x10FFFF; no range check is made.
+*/
+	{
+	const TUint leadUnit = (aChar >> 10) + 0xD7C0;
+	return STATIC_CAST(TText16, leadUnit);
+	}
+
+inline TText16 GetLowSurrogate(TUint aChar)
+/**
+Computes the low (trailing) surrogate of a supplementary character.
+
+@param aChar The 32-bit code point value of a Unicode character.
+@return 0xDC00 combined with the low ten bits of aChar. The result is always in
+        0xDC00-0xDFFF, whether or not aChar is supplementary.
+*/
+	{
+	const TUint lowTenBits = aChar & 0x3FF;
+	return STATIC_CAST(TText16, 0xDC00 | lowTenBits);
+	}
+
+//Converts aUnicode to CP54936 and stores it in aForeign; leaves with KErrOverflow if aForeign is too small.
+EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
+	{
+	ConvertFromUnicodeL(aForeign, aUnicode, ETrue);	// ETrue: an overflow leaves, so the return value is not needed
+	}
+
+//Converts UTF-16 text to CP54936 (GB18030) and stores the result in aForeign.
+EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
+/**
+Output is written from the start of aForeign (existing content is overwritten) and
+the length of aForeign is set to the number of bytes produced.
+
+A valid surrogate pair becomes a 4-byte code computed by formula. Any other BMP
+value is looked up in KMappingTableUnicodeBmp2CP54936 and becomes a 1-, 2- or
+4-byte code. An unpaired surrogate becomes the single byte KForeignReplacement.
+
+@param aForeign          Receives the converted bytes.
+@param aUnicode          UTF-16 text to convert.
+@param leaveWhenOverflow If true, running out of room in aForeign causes a leave
+                         with KErrOverflow; if false, KErrOverflow is returned.
+
+@return KErrNone on success, or KErrOverflow (see leaveWhenOverflow). On overflow
+        aForeign holds the bytes converted before the character that did not fit.
+*/
+	{
+	const TUint16* unicode = aUnicode.Ptr();
+	const TUint16* const guard = unicode + aUnicode.Length();
+
+	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
+	TUint8* const foreignguard = foreign + aForeign.MaxLength();
+
+	//loop going through the characters of the unicode descriptor
+	while (unicode < guard)
+		{
+		const TUint16 unit = *unicode++;
+		TUint8 b1 = 0, b2 = 0, b3 = 0, b4 = 0;	// byte 1,2,3,4 of result GB18030 code.
+		TInt count = 1;							// byte count of result GB18030 code; can be 1, 2 or 4.
+
+		if (IsSurrogate(unit))
+			{
+			if (IsHighSurrogate(unit) && unicode < guard && IsLowSurrogate(*unicode))
+				{
+				// Valid pair: consume the low surrogate and apply the non-BMP formula.
+				// The surrogate tests are made on the 16-bit units only, never on the
+				// joined value: truncating e.g. U+1DC00 to 16 bits gives 0xDC00, which
+				// would wrongly be taken for an unpaired low surrogate.
+				TUint32 offset = JoinSurrogate(unit, *unicode++) - 0x10000;
+				b4 = STATIC_CAST(TUint8, offset % 10 + KU10000Byte4);
+				offset /= 10;
+				b3 = STATIC_CAST(TUint8, offset % 126 + KU10000Byte3);
+				offset /= 126;
+				b2 = STATIC_CAST(TUint8, offset % 10 + KU10000Byte2);
+				b1 = STATIC_CAST(TUint8, offset / 10 + KU10000Byte1);
+				count = 4;
+				}
+			else
+				{
+				// Unpaired high or low surrogate: substitute one replacement byte.
+				b4 = KForeignReplacement;
+				}
+			}
+		else
+			{
+			// BMP character: the table entry holds the code in 32 bits, byte 1 in
+			// the most significant position.
+			const TUint32 foreignChar = KMappingTableUnicodeBmp2CP54936[unit];
+			b1 = STATIC_CAST(TUint8, (foreignChar >> 24) & 0xFF);
+			b2 = STATIC_CAST(TUint8, (foreignChar >> 16) & 0xFF);
+			b3 = STATIC_CAST(TUint8, (foreignChar >> 8) & 0xFF);
+			b4 = STATIC_CAST(TUint8, foreignChar & 0xFF);
+			// non-zero byte 1 => 4-byte code; else non-zero byte 3 => 2-byte code; else 1 byte
+			if (b1)
+				{
+				count = 4;
+				}
+			else
+				{
+				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
+				if (b3)
+					{
+					count = 2;
+					}
+				}
+			}
+
+		// Emit the whole code or nothing, so the output never ends mid-character.
+		if (foreignguard - foreign < count)
+			{
+			// record what has been converted so far before reporting the overflow
+			aForeign.SetLength(foreign-aForeign.Ptr());
+			if (leaveWhenOverflow)
+				User::Leave(KErrOverflow);
+			else
+				return KErrOverflow;
+			}
+		if (count == 4)
+			{
+			*foreign++ = b1;
+			*foreign++ = b2;
+			}
+		if (count >= 2)
+			*foreign++ = b3;
+		*foreign++ = b4;
+		}
+
+	// everything fitted: record the final length
+	aForeign.SetLength(foreign-aForeign.Ptr());
+	return KErrNone;
+	}
+
+
+//Converts aForeign from CP54936 to UTF-16 and stores it in aUnicode; leaves with KErrOverflow if aUnicode is too small.
+EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
+	{
+	ConvertToUnicodeL(aUnicode, aForeign, ETrue);	// ETrue: an overflow leaves, so the return value is not needed
+	}
+
+//Converts CP54936 (GB18030) text to UTF-16 and stores the result in aUnicode.
+EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
+/**
+Output is written from the start of aUnicode (existing content is overwritten) and
+the length of aUnicode is set to the number of 16-bit units produced.
+
+Malformed, truncated and reserved codes are converted to U+FFFD. That includes
+4-byte codes which pass the per-byte range checks but have no mapping: they are
+rejected at run time, instead of indexing past the 4-byte table or producing
+a value above U+10FFFF in builds where __ASSERT_DEBUG is compiled out.
+
+@param aUnicode          Receives the converted text.
+@param aForeign          CP54936 bytes to convert.
+@param leaveWhenOverflow If true, running out of room in aUnicode causes a leave
+                         with KErrOverflow; if false, KErrOverflow is returned.
+@return KErrNone on success, or KErrOverflow (see leaveWhenOverflow). On overflow
+        aUnicode holds the units converted before the character that did not fit.
+*/
+	{
+	// Entry count assumed for KMappingTable4ByteBmp2Unicode: the bound this code
+	// formerly asserted in debug builds. Code 0x8431A439 has index 39419.
+	const TUint K4ByteBmpCodeCount = 39420;
+	const TUint32 KMaxCodePoint = 0x10FFFF;
+	const TUint32 KReplacementChar = 0xFFFD;
+
+	const TUint8* foreign = aForeign.Ptr();
+	const TUint8* const guard = foreign + aForeign.Length();
+
+	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
+	TUint16* const unicodeguard = unicode + aUnicode.MaxLength();
+
+	enum TCodeType
+		{
+		E1Byte = 0,
+		E2Byte,
+		E4ByteBmp,
+		E4ByteSupplementary,
+		EError
+		};
+
+	//loop going through the characters of the foreign descriptor
+	while (foreign < guard)
+		{
+		// roughly, detect which area the foreign code belongs to; anything that
+		// fails a check keeps the EError classification
+		TCodeType codetype = EError;
+		TUint8 b2 = 0, b3 = 0, b4 = 0;
+		const TUint8 b1 = *foreign++;
+		if (b1 <= 0x7F)
+			codetype = E1Byte;
+		else if (b1 != 0x80 && b1 <= 0xFE && foreign < guard)
+			{
+			b2 = *foreign++;
+			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
+				codetype = E2Byte;
+			else if (b2 >= 0x30 && b2 <= 0x39 && foreign+1 < guard)
+				{
+				// 4-byte code with both remaining bytes present
+				b3 = *foreign++;
+				if (b3 >= 0x81 && b3 <= 0xFE)
+					{
+					b4 = *foreign++;
+					if (b4 < 0x30 || b4 > 0x39)
+						codetype = EError;
+					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39
+						codetype = E4ByteBmp;
+					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39
+						codetype = E4ByteSupplementary;
+					// others are reserved
+					}
+				}
+			}
+
+		// cp54936 to unicode; unicodeChar stays U+FFFD unless a mapping is found
+		TUint32 unicodeChar = KReplacementChar;
+		if (codetype == E1Byte)
+			{
+			unicodeChar = b1;
+			}
+		else if (codetype == E2Byte)
+			{
+			// conventional algorithm used in FatCharsetConv
+			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
+			if (structPtr->iUnicodeIfSingle)
+				unicodeChar = structPtr->iUnicodeIfSingle;
+			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
+				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
+			}
+		else if (codetype == E4ByteBmp)
+			{
+			// 0x8431A530-0x8439FE39 pass the byte checks but lie beyond the table
+			const TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
+			if (index < K4ByteBmpCodeCount)
+				unicodeChar = KMappingTable4ByteBmp2Unicode[index];
+			}
+		else if (codetype == E4ByteSupplementary)
+			{
+			// 0xE3329A36-0xE339FE39 pass the byte checks but lie above U+10FFFF
+			const TUint32 codePoint = 0x10000 + (b1 - KU10000Byte1) * 12600 +
+												(b2 - KU10000Byte2) * 1260 +
+												(b3 - KU10000Byte3) * 10 +
+												(b4 - KU10000Byte4);
+			if (codePoint <= KMaxCodePoint)
+				unicodeChar = codePoint;
+			}
+
+		// append to output buffer; a supplementary character needs two units
+		const TInt units = IsSupplementary(unicodeChar) ? 2 : 1;
+		if (unicodeguard - unicode < units)
+			{
+			aUnicode.SetLength(unicode-aUnicode.Ptr());
+			if (leaveWhenOverflow)
+				User::Leave(KErrOverflow);
+			else
+				return KErrOverflow;
+			}
+		if (units == 2)
+			{
+			*unicode++ = GetHighSurrogate(unicodeChar);
+			*unicode++ = GetLowSurrogate(unicodeChar);
+			}
+		else
+			*unicode++ = STATIC_CAST(TUint16, unicodeChar);
+		}
+	aUnicode.SetLength(unicode-aUnicode.Ptr());
+	return KErrNone;
+	}
+
+EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
+/**
+Tests whether a character may be used in a FAT short file name.
+@param aCharacter Unicode code point value to test.
+@return ETrue if aCharacter is acceptable; EFalse otherwise.
+*/
+	{
+	// 1. aCharacter >= 0x0080
+	if (aCharacter >= 0x0080)
+		{
+		// All Unicode characters map to GB18030, so only the range is checked.
+		// The surrogate test uses the full value: IsSurrogate() takes a 16-bit
+		// argument, so e.g. U+1D800 would be truncated to 0xD800 and rejected.
+		const TBool isSurrogate = (aCharacter >= 0xD800 && aCharacter <= 0xDFFF);
+		return (aCharacter <= 0x10FFFF && !isSurrogate) ? ETrue : EFalse;
+		}
+
+	// For most common cases:
+	// Note: lower case characters are considered legal DOS char here.
+	if ((aCharacter>='a' && aCharacter<='z') ||
+	    (aCharacter>='A' && aCharacter<='Z') ||
+	    (aCharacter>='0' && aCharacter<='9'))
+		return ETrue;
+
+	// Checking for illegal chars:
+	// 2. aCharacter <= 0x20
+	// Control characters and space (' ') are not considered legal DOS chars here.
+	// Note: leading 0x05 byte should be guarded by callers of this function
+	//  as the information of the position of the character is required.
+	if (aCharacter <= 0x20)
+		return EFalse;
+
+	// 3. 0x20 < aCharacter < 0x80
+	// According to FAT Spec, "following characters are not legal in any bytes of DIR_Name":
+	switch (aCharacter)
+		{
+		case 0x22:        // '"'
+		case 0x2A:        // '*'
+		case 0x2B:        // '+'
+		case 0x2C:        // ','
+		//case 0x2E:      // '.'   Although '.' is not allowed in any bytes of DIR_Name,
+		                  //       it is a valid character in short file names.
+		case 0x2F:        // '/'
+		case 0x3A:        // ':'
+		case 0x3B:        // ';'
+		case 0x3C:        // '<'
+		case 0x3D:        // '='
+		case 0x3E:        // '>'
+		case 0x3F:        // '?'
+		case 0x5B:        // '['
+		case 0x5C:        // '\'
+		case 0x5D:        // ']'
+		case 0x7C:        // '|'
+			return EFalse;
+		default:
+			return ETrue;
+		}
+	}
+