userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
changeset 0 a41df078684a
child 2 4122176ea935
equal deleted inserted replaced
-1:000000000000 0:a41df078684a
       
     1 // Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of the License "Eclipse Public License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 //
       
     15 // There are 2 reasons for not using the existing unicodeconv.cpp:
       
    16 // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
       
    17 //    for huge code pages (e.g, Asia code pages). See INC127598.
       
    18 //
       
    19 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
       
    20 //
       
    21 // The algorithm of this special version unicodeconv.cpp is straightforward:
       
    22 // 1) foreign->unicode:
       
    23 //    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
       
    24 //              "cp54936_2byte_tounicode.cpp", which is generated with command
       
    25 //              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
       
    26 //
       
    27 //    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
       
    28 //              search into the mapping table in "cp54936_4byte_tounicode.cpp",
       
    29 //              which is generated with command
       
    30 //              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
       
    31 //
       
    32 //    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
       
    33 //
       
    34 // 2) unicode->foreign:
       
    35 //    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
       
    36 //              can map directly, which is generated with command
       
    37 //              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
       
    38 //
       
    39 //    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
       
    40 //
       
     41 // The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
       
     42 // ConvertSingleUnicode() is no longer used. It is retained only to avoid
       
     43 // changing the tool FatConversionTable.pl.
       
    44 //
       
    45 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
       
    46 // 1) All Private Used Area (PUA) code points are reserved.
       
     47 // 2) All GB18030 code points that map to undefined Unicode are reserved.
       
    48 //
       
    49 //
       
    50 // About the formula for non-bmp calculation:
       
    51 // 1) All code points from 0x10000 to 0x10FFFF are supported.
       
    52 // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from
       
    53 //    the GB18030 standard, since the standard does not define the mapping for
       
    54 //    code points out of 0x20000-0x2FFFF.
       
    55 
       
    56 
       
    57 #include <e32std.h>
       
    58 #include <e32def.h>
       
    59 #include <e32des8.h> 
       
    60 #include "unicodeconv.h"
       
    61 #include "cp54936.h"
       
    62 
       
    63 
       
    64 enum TFccPanic
       
    65 	{
       
    66 	EBadForeignCode = 0,
       
    67 	E4ByteIndexOutOfRange,
       
    68 	EPanicBadIndices1,
       
    69 	EInavlidUnicodeValue
       
    70 	};
       
    71 void Panic(TFccPanic aPanic)
       
    72 	{
       
    73 
       
    74 	User::Panic(_L("FatCharsetConv"),aPanic);
       
    75 	}
       
    76 
       
    77 
       
    78 //replacement character to be used when unicode cannot be converted
       
    79 const TUint8 KForeignReplacement = 0x5F;
       
    80 
       
    81 const TUint8 KU10000Byte1 = 0x90;
       
    82 const TUint8 KU10000Byte2 = 0x30;
       
    83 const TUint8 KU10000Byte3 = 0x81;
       
    84 const TUint8 KU10000Byte4 = 0x30;
       
    85 
       
    86 inline TBool IsSupplementary(TUint aChar)
       
    87 /**
       
    88 @param aChar The 32-bit code point value of a Unicode character.
       
    89 
       
    90 @return True, if aChar is supplementary character; false, otherwise.
       
    91 */
       
    92 	{
       
    93 	return (aChar > 0xFFFF);
       
    94 	}
       
    95 
       
    96 inline TBool IsSurrogate(TText16 aInt16)
       
    97 /**
       
    98 @return True, if aText16 is high surrogate or low surrogate; false, otherwise.
       
    99 */
       
   100 	{
       
   101 	return (aInt16 & 0xF800) == 0xD800;
       
   102 	}
       
   103 
       
   104 inline TBool IsHighSurrogate(TText16 aInt16)
       
   105 /**
       
   106 @return True, if aText16 is high surrogate; false, otherwise.
       
   107 */
       
   108 	{
       
   109 	return (aInt16 & 0xFC00) == 0xD800;
       
   110 	}
       
   111 
       
   112 inline TBool IsLowSurrogate(TText16 aInt16)
       
   113 /**
       
   114 @return True, if aText16 is low surrogate; false, otherwise.
       
   115 */
       
   116 	{
       
   117 	return (aInt16 & 0xFC00) == 0xDC00;
       
   118 	}
       
   119 
       
   120 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
       
   121 /**
       
   122 Combine a high surrogate and a low surrogate into a supplementary character.
       
   123 
       
   124 @return The 32-bit code point value of the generated Unicode supplementary
       
   125         character.
       
   126 */
       
   127 	{
       
   128 	return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
       
   129 	}
       
   130 
       
   131 inline TText16 GetHighSurrogate(TUint aChar)
       
   132 /**
       
   133 Retrieve the high surrogate of a supplementary character.
       
   134 
       
   135 @param aChar The 32-bit code point value of a Unicode character.
       
   136 
       
   137 @return High surrogate of aChar, if aChar is a supplementary character; 
       
   138         aChar itself, if aChar is not a supplementary character.
       
   139 */
       
   140 	{
       
   141 	return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
       
   142 	}
       
   143 
       
   144 inline TText16 GetLowSurrogate(TUint aChar)
       
   145 /**
       
   146 Retrieve the low surrogate of a supplementary character.
       
   147 
       
   148 @param aChar The 32-bit code point value of a Unicode character.
       
   149 
       
   150 @return Low surrogate of aChar, if aChar is a supplementary character; 
       
   151         zero, if aChar is not a supplementary character.
       
   152 */
       
   153 	{
       
   154 	return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
       
   155 	}
       
   156 
       
   157 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
       
   158 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
       
   159 	{
       
   160     UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
       
   161     }
       
   162 
       
   163 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
       
   164 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
       
   165 	{
       
   166 	const TInt length = aUnicode.Length();
       
   167 	const TUint16* unicode = aUnicode.Ptr();
       
   168 	const TUint16* guard = unicode + length;
       
   169 	
       
   170 	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
       
   171 	TUint8* foreignguard = foreign + aForeign.MaxLength();
       
   172 	
       
   173 	//loop going through the character of the unicode descriptor
       
   174 	while (unicode < guard)
       
   175 		{
       
   176 		TUint32 unicodeChar = *unicode++;
       
   177 		if (IsHighSurrogate(unicodeChar))
       
   178 			{
       
   179 			if (unicode >= guard || !IsLowSurrogate(*unicode))
       
   180 				{
       
   181 				if (foreign >= foreignguard)
       
   182 					{
       
   183                     aForeign.SetLength(foreign-aForeign.Ptr());
       
   184 					if (leaveWhenOverflow)
       
   185 						User::Leave(KErrOverflow);
       
   186                     else
       
   187                     	return KErrOverflow;
       
   188 					}
       
   189 				*foreign++ = KForeignReplacement;
       
   190 				continue;
       
   191 				}
       
   192 			unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
       
   193 			}
       
   194 		if (IsLowSurrogate(unicodeChar))
       
   195 			{
       
   196 			if (foreign >= foreignguard)
       
   197 				{
       
   198 				aForeign.SetLength(foreign-aForeign.Ptr());
       
   199 				if (leaveWhenOverflow)
       
   200 					User::Leave(KErrOverflow);
       
   201 				else
       
   202 					return KErrOverflow;
       
   203 				}
       
   204 			*foreign++ = KForeignReplacement;
       
   205 			continue;
       
   206 			}
       
   207 		
       
   208 		TUint8 b1, b2, b3, b4;		// byte 1,2,3,4 of result GB18030 code.
       
   209 		TInt count;					// byte count of result GB18030 code; can be 1, 2 or 4.
       
   210 		
       
   211 		// unicode to cp54936
       
   212 		if (IsSupplementary(unicodeChar))
       
   213 			{
       
   214 			unicodeChar -= 0x10000;
       
   215 			b4 = unicodeChar % 10 + KU10000Byte4;
       
   216 			unicodeChar /= 10;
       
   217 			b3 = unicodeChar % 126 + KU10000Byte3;
       
   218 			unicodeChar /= 126;
       
   219 			b2 = unicodeChar % 10 + KU10000Byte2;
       
   220 			b1 = unicodeChar / 10 + KU10000Byte1;
       
   221 			count = 4;
       
   222 			}
       
   223 		else
       
   224 			{
       
   225 			TUint32 foreignChar;
       
   226 			foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
       
   227 			b1 = ((foreignChar >> 24) & 0xFF);
       
   228 			b2 = ((foreignChar >> 16) & 0xFF);
       
   229 			b3 = ((foreignChar >> 8) & 0xFF);
       
   230 			b4 = (foreignChar & 0xFF);
       
   231 			count = 1;
       
   232 			if (b1)
       
   233 				{
       
   234 				count = 4;
       
   235 				}
       
   236 			else
       
   237 				{
       
   238 				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
       
   239 				if (b3)
       
   240 					{
       
   241 					count = 2;
       
   242 					}
       
   243 				}
       
   244 			}
       
   245 		
       
   246 		if (foreign + count > foreignguard)
       
   247 			{
       
   248 			aForeign.SetLength(foreign-aForeign.Ptr());
       
   249             if (leaveWhenOverflow)
       
   250             	User::Leave(KErrOverflow);
       
   251             else
       
   252             	return KErrOverflow;
       
   253 			}
       
   254 		if (count == 4)
       
   255 			{
       
   256 			*foreign++ = b1;
       
   257 			*foreign++ = b2;
       
   258 			}
       
   259 		if (count >= 2)
       
   260 			*foreign++ = b3;
       
   261 		*foreign++ = b4;
       
   262 		}
       
   263 	aForeign.SetLength(foreign-aForeign.Ptr());
       
   264 	return KErrNone;
       
   265 	}
       
   266 
       
   267 
       
   268 //This function converts from foreign characters into unicode and adds them into a descriptor
       
   269 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
       
   270 	{
       
   271     UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
       
   272     }
       
   273 
       
   274 //This function converts from foreign characters into unicode and adds them into a descriptor
       
   275 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
       
   276 	{
       
   277 	const TInt foreignLength = aForeign.Length();
       
   278 	const TUint8* foreign = aForeign.Ptr();
       
   279 	const TUint8* guard = foreign + foreignLength;
       
   280 	
       
   281 	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
       
   282 	TUint16* unicodeguard = unicode + aUnicode.MaxLength();
       
   283 	
       
   284 	TUint8 b1, b2, b3, b4;
       
   285 	enum TCodeType
       
   286 	{
       
   287 	E1Byte = 0,
       
   288 	E2Byte,
       
   289 	E4ByteBmp,
       
   290 	E4ByteSupplementary,
       
   291 	EError,
       
   292 	};
       
   293 	TCodeType codetype;
       
   294 	TUint32 unicodeChar;
       
   295 
       
   296 	//loop going through the characters of the foreign descriptor
       
   297 	while (foreign < guard)
       
   298 		{
       
   299 		// roughly, detect which area the foreign code belongs to
       
   300 		b1 = *foreign++;
       
   301 		if (b1 <= 0x7F)
       
   302 			codetype = E1Byte;
       
   303 		else if (b1 == 0x80 || b1 > 0xFE)
       
   304 			codetype = EError;
       
   305 		else if (foreign >= guard)
       
   306 			codetype = EError;
       
   307 		else
       
   308 			{
       
   309 			b2 = *foreign++;
       
   310 			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
       
   311 				codetype = E2Byte;
       
   312 			else if (b2 < 0x30 || b2 > 0x39)
       
   313 				codetype = EError;
       
   314 			else if (foreign+1 >= guard)
       
   315 				codetype = EError;
       
   316 			else
       
   317 				{
       
   318 				b3 = *foreign++;
       
   319 				if (b3 < 0x81 || b3 > 0xFE)
       
   320 					codetype = EError;
       
   321 				else
       
   322 					{
       
   323 					b4 = *foreign++;
       
   324 					if (b4 < 0x30 || b4 > 0x39)
       
   325 						codetype = EError;
       
   326 					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39
       
   327 						codetype = E4ByteBmp;
       
   328 					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39
       
   329 						codetype = E4ByteSupplementary;
       
   330 					else
       
   331 						codetype = EError;					// others are reserved
       
   332 					}
       
   333 				}
       
   334 			}
       
   335 		
       
   336 		// cp54936 to unicode
       
   337 		if (codetype == E1Byte)
       
   338 			{
       
   339 			unicodeChar = b1;
       
   340 			}
       
   341 		else if (codetype == E2Byte)
       
   342 			{
       
   343 			// conventional algorithm used in FatCharsetConv
       
   344 			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
       
   345 			if (structPtr->iUnicodeIfSingle)
       
   346 				unicodeChar = structPtr->iUnicodeIfSingle;
       
   347 			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
       
   348 				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
       
   349 			else
       
   350 				unicodeChar = 0xFFFD;
       
   351 			}
       
   352 		else if (codetype == E4ByteBmp)
       
   353 			{
       
   354 			TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
       
   355 			__ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
       
   356 			unicodeChar = KMappingTable4ByteBmp2Unicode[index];
       
   357 			}
       
   358 		else if (codetype == E4ByteSupplementary)
       
   359 			{
       
   360 			unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
       
   361 									(b2 - KU10000Byte2) * 1260 +
       
   362 									(b3 - KU10000Byte3) * 10 +
       
   363 									(b4 - KU10000Byte4);
       
   364 			__ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
       
   365 			}
       
   366 		else
       
   367 			{
       
   368 			unicodeChar = 0xFFFD;
       
   369 			}
       
   370 		
       
   371 		// append to output buffer
       
   372 		if (IsSupplementary(unicodeChar))
       
   373 			{
       
   374 			if (unicode + 1 >= unicodeguard)
       
   375 				{
       
   376 				aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   377 				if (leaveWhenOverflow)
       
   378 					User::Leave(KErrOverflow);
       
   379 				else
       
   380 					return KErrOverflow;
       
   381 				}
       
   382 			*unicode++ = GetHighSurrogate(unicodeChar);
       
   383 			*unicode++ = GetLowSurrogate(unicodeChar);
       
   384 			}
       
   385 		else
       
   386 			{
       
   387 			if (unicode >= unicodeguard)
       
   388 				{
       
   389 				aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   390                 if (leaveWhenOverflow)
       
   391                 	User::Leave(KErrOverflow);
       
   392                 else
       
   393                 	return KErrOverflow;
       
   394 				}
       
   395 			*unicode++ = unicodeChar;
       
   396 			}
       
   397 		}
       
   398 	aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   399 	return KErrNone;
       
   400 	}
       
   401 
       
   402 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
       
   403 	{
       
   404 	//1. aCharacter >= 0x0080 
       
   405 	if (aCharacter>=0x0080)
       
   406 		{
       
   407 		// Since all Unicode characters can be mapped to GB18030, so no need to
       
   408 		// test the converting.
       
   409 		if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
       
   410 			return ETrue;
       
   411 		else
       
   412 			return EFalse;
       
   413 		}
       
   414 
       
   415     // For most common cases: 
       
   416     // Note: lower case characters are considered legal DOS char here. 
       
   417 	if ((aCharacter>='a' && aCharacter<='z') || 
       
   418 	    (aCharacter>='A' && aCharacter<='Z') || 
       
   419 	    (aCharacter>='0' && aCharacter<='9'))
       
   420 			{
       
   421 			return ETrue;
       
   422 			}
       
   423     // Checking for illegal chars: 
       
   424     // 2. aCharacter <= 0x20 
       
   425     // Note: leading 0x05 byte should be guarded by callers of this function 
       
   426     //  as the information of the position of the character is required. 
       
   427 	if (aCharacter < 0x20)
       
   428 		return EFalse;
       
   429 	// Space (' ') is not considered as a legal DOS char here.
       
   430 	if (aCharacter == 0x20)
       
   431 		return EFalse;
       
   432 	
       
   433 	// 3. 0x20 < aCharacter < 0x80 
       
   434     // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": 
       
   435     switch (aCharacter) 
       
   436             { 
       
   437             case 0x22:        // '"' 
       
   438             case 0x2A:        // '*' 
       
   439             case 0x2B:        // '+' 
       
   440             case 0x2C:        // ',' 
       
   441             //case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it 
       
   442                                          // is a valid character in short file names. 
       
   443             case 0x2F:        // '/' 
       
   444             case 0x3A:        // ':' 
       
   445             case 0x3B:        // ';' 
       
   446             case 0x3C:        // '<' 
       
   447             case 0x3D:        // '=' 
       
   448             case 0x3E:        // '>' 
       
   449             case 0x3F:        // '?' 
       
   450             case 0x5B:        // '[' 
       
   451             case 0x5C:        // '\' 
       
   452             case 0x5D:        // ']' 
       
   453             case 0x7C:        // '|' 
       
   454             	return EFalse; 
       
   455             default: 
       
   456             	return ETrue; 
       
   457             } 
       
   458 	}		
       
   459