userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
changeset 31 56f325a607ea
parent 15 4122176ea935
child 32 c9417927a896
child 33 0173bcd7697c
equal deleted inserted replaced
15:4122176ea935 31:56f325a607ea
     1 /*
       
     2 * Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
     17 // There are 2 reasons for not using the existing unicodeconv.cpp:
       
    18 // 1) "unicode->foreign" in existing unicodeconv.cpp is quite slow, especially
       
    19 //    for huge code pages (e.g, Asia code pages). See INC127598.
       
    20 //
       
    21 // 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
       
    22 //
       
    23 // The algorithm of this special version unicodeconv.cpp is straightforward:
       
    24 // 1) foreign->unicode:
       
    25 //    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
       
    26 //              "cp54936_2byte_tounicode.cpp", which is generated with command
       
    27 //              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
       
    28 //
       
    29 //    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
       
    30 //              search into the mapping table in "cp54936_4byte_tounicode.cpp",
       
    31 //              which is generated with command
       
    32 //              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
       
    33 //
       
    34 //    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
       
    35 //
       
    36 // 2) unicode->foreign:
       
    37 //    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
       
    38 //              can map directly, which is generated with command
       
    39 //              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
       
    40 //
       
    41 //    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
       
    42 //
       
    43 // The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
       
     44 // ConvertSingleUnicode() is not used anymore. It is kept only to avoid
       
     45 // changing the tool FatConversionTable.pl.
       
    46 //
       
    47 // About the mapping table "cp54936_2byte.txt" and "cp54936_4byte.txt":
       
     48 // 1) All Private Use Area (PUA) code points are reserved.
       
     49 // 2) All GB18030 code points that map to undefined Unicode code points are reserved.
       
    50 //
       
    51 //
       
    52 // About the formula for non-bmp calculation:
       
    53 // 1) All code points from 0x10000 to 0x10FFFF are supported.
       
    54 // 2) Code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are summarized from
       
    55 //    the GB18030 standard, since the standard does not define the mapping for
       
    56 //    code points out of 0x20000-0x2FFFF.
       
    57 
       
    58 
       
    59 #include <e32std.h>
       
    60 #include <e32def.h>
       
    61 #include <e32des8.h> 
       
    62 #include "unicodeconv.h"
       
    63 #include "cp54936.h"
       
    64 
       
    65 
       
    66 enum TFccPanic
       
    67 	{
       
    68 	EBadForeignCode = 0,
       
    69 	E4ByteIndexOutOfRange,
       
    70 	EPanicBadIndices1,
       
    71 	EInavlidUnicodeValue
       
    72 	};
       
    73 void Panic(TFccPanic aPanic)
       
    74 	{
       
    75 
       
    76 	User::Panic(_L("FatCharsetConv"),aPanic);
       
    77 	}
       
    78 
       
    79 
       
    80 //replacement character to be used when unicode cannot be converted
       
    81 const TUint8 KForeignReplacement = 0x5F;
       
    82 
       
    83 const TUint8 KU10000Byte1 = 0x90;
       
    84 const TUint8 KU10000Byte2 = 0x30;
       
    85 const TUint8 KU10000Byte3 = 0x81;
       
    86 const TUint8 KU10000Byte4 = 0x30;
       
    87 
       
    88 inline TBool IsSupplementary(TUint aChar)
       
    89 /**
       
    90 @param aChar The 32-bit code point value of a Unicode character.
       
    91 
       
    92 @return True, if aChar is supplementary character; false, otherwise.
       
    93 */
       
    94 	{
       
    95 	return (aChar > 0xFFFF);
       
    96 	}
       
    97 
       
    98 inline TBool IsSurrogate(TText16 aInt16)
       
    99 /**
       
   100 @return True, if aText16 is high surrogate or low surrogate; false, otherwise.
       
   101 */
       
   102 	{
       
   103 	return (aInt16 & 0xF800) == 0xD800;
       
   104 	}
       
   105 
       
   106 inline TBool IsHighSurrogate(TText16 aInt16)
       
   107 /**
       
   108 @return True, if aText16 is high surrogate; false, otherwise.
       
   109 */
       
   110 	{
       
   111 	return (aInt16 & 0xFC00) == 0xD800;
       
   112 	}
       
   113 
       
   114 inline TBool IsLowSurrogate(TText16 aInt16)
       
   115 /**
       
   116 @return True, if aText16 is low surrogate; false, otherwise.
       
   117 */
       
   118 	{
       
   119 	return (aInt16 & 0xFC00) == 0xDC00;
       
   120 	}
       
   121 
       
   122 inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
       
   123 /**
       
   124 Combine a high surrogate and a low surrogate into a supplementary character.
       
   125 
       
   126 @return The 32-bit code point value of the generated Unicode supplementary
       
   127         character.
       
   128 */
       
   129 	{
       
   130 	return ((aHighSurrogate - 0xD7F7) << 10) + aLowSurrogate;
       
   131 	}
       
   132 
       
   133 inline TText16 GetHighSurrogate(TUint aChar)
       
   134 /**
       
   135 Retrieve the high surrogate of a supplementary character.
       
   136 
       
   137 @param aChar The 32-bit code point value of a Unicode character.
       
   138 
       
   139 @return High surrogate of aChar, if aChar is a supplementary character; 
       
   140         aChar itself, if aChar is not a supplementary character.
       
   141 */
       
   142 	{
       
   143 	return STATIC_CAST(TText16, 0xD7C0 + (aChar >> 10));
       
   144 	}
       
   145 
       
   146 inline TText16 GetLowSurrogate(TUint aChar)
       
   147 /**
       
   148 Retrieve the low surrogate of a supplementary character.
       
   149 
       
   150 @param aChar The 32-bit code point value of a Unicode character.
       
   151 
       
   152 @return Low surrogate of aChar, if aChar is a supplementary character; 
       
   153         zero, if aChar is not a supplementary character.
       
   154 */
       
   155 	{
       
   156 	return STATIC_CAST(TText16, 0xDC00 | (aChar & 0x3FF));
       
   157 	}
       
   158 
       
   159 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
       
   160 EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
       
   161 	{
       
   162     UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
       
   163     }
       
   164 
       
   165 //This function converts from Unicoded characters, to foreign characters and adds them into a descriptor
       
   166 EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
       
   167 	{
       
   168 	const TInt length = aUnicode.Length();
       
   169 	const TUint16* unicode = aUnicode.Ptr();
       
   170 	const TUint16* guard = unicode + length;
       
   171 	
       
   172 	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
       
   173 	TUint8* foreignguard = foreign + aForeign.MaxLength();
       
   174 	
       
   175 	//loop going through the character of the unicode descriptor
       
   176 	while (unicode < guard)
       
   177 		{
       
   178 		TUint32 unicodeChar = *unicode++;
       
   179 		if (IsHighSurrogate(unicodeChar))
       
   180 			{
       
   181 			if (unicode >= guard || !IsLowSurrogate(*unicode))
       
   182 				{
       
   183 				if (foreign >= foreignguard)
       
   184 					{
       
   185                     aForeign.SetLength(foreign-aForeign.Ptr());
       
   186 					if (leaveWhenOverflow)
       
   187 						User::Leave(KErrOverflow);
       
   188                     else
       
   189                     	return KErrOverflow;
       
   190 					}
       
   191 				*foreign++ = KForeignReplacement;
       
   192 				continue;
       
   193 				}
       
   194 			unicodeChar = JoinSurrogate(unicodeChar, *unicode++);
       
   195 			}
       
   196 		if (IsLowSurrogate(unicodeChar))
       
   197 			{
       
   198 			if (foreign >= foreignguard)
       
   199 				{
       
   200 				aForeign.SetLength(foreign-aForeign.Ptr());
       
   201 				if (leaveWhenOverflow)
       
   202 					User::Leave(KErrOverflow);
       
   203 				else
       
   204 					return KErrOverflow;
       
   205 				}
       
   206 			*foreign++ = KForeignReplacement;
       
   207 			continue;
       
   208 			}
       
   209 		
       
   210 		TUint8 b1, b2, b3, b4;		// byte 1,2,3,4 of result GB18030 code.
       
   211 		TInt count;					// byte count of result GB18030 code; can be 1, 2 or 4.
       
   212 		
       
   213 		// unicode to cp54936
       
   214 		if (IsSupplementary(unicodeChar))
       
   215 			{
       
   216 			unicodeChar -= 0x10000;
       
   217 			b4 = unicodeChar % 10 + KU10000Byte4;
       
   218 			unicodeChar /= 10;
       
   219 			b3 = unicodeChar % 126 + KU10000Byte3;
       
   220 			unicodeChar /= 126;
       
   221 			b2 = unicodeChar % 10 + KU10000Byte2;
       
   222 			b1 = unicodeChar / 10 + KU10000Byte1;
       
   223 			count = 4;
       
   224 			}
       
   225 		else
       
   226 			{
       
   227 			TUint32 foreignChar;
       
   228 			foreignChar = KMappingTableUnicodeBmp2CP54936[unicodeChar];
       
   229 			b1 = ((foreignChar >> 24) & 0xFF);
       
   230 			b2 = ((foreignChar >> 16) & 0xFF);
       
   231 			b3 = ((foreignChar >> 8) & 0xFF);
       
   232 			b4 = (foreignChar & 0xFF);
       
   233 			count = 1;
       
   234 			if (b1)
       
   235 				{
       
   236 				count = 4;
       
   237 				}
       
   238 			else
       
   239 				{
       
   240 				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
       
   241 				if (b3)
       
   242 					{
       
   243 					count = 2;
       
   244 					}
       
   245 				}
       
   246 			}
       
   247 		
       
   248 		if (foreign + count > foreignguard)
       
   249 			{
       
   250 			aForeign.SetLength(foreign-aForeign.Ptr());
       
   251             if (leaveWhenOverflow)
       
   252             	User::Leave(KErrOverflow);
       
   253             else
       
   254             	return KErrOverflow;
       
   255 			}
       
   256 		if (count == 4)
       
   257 			{
       
   258 			*foreign++ = b1;
       
   259 			*foreign++ = b2;
       
   260 			}
       
   261 		if (count >= 2)
       
   262 			*foreign++ = b3;
       
   263 		*foreign++ = b4;
       
   264 		}
       
   265 	aForeign.SetLength(foreign-aForeign.Ptr());
       
   266 	return KErrNone;
       
   267 	}
       
   268 
       
   269 
       
   270 //This function converts from foreign characters into unicode and adds them into a descriptor
       
   271 EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
       
   272 	{
       
   273     UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
       
   274     }
       
   275 
       
   276 //This function converts from foreign characters into unicode and adds them into a descriptor
       
   277 EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
       
   278 	{
       
   279 	const TInt foreignLength = aForeign.Length();
       
   280 	const TUint8* foreign = aForeign.Ptr();
       
   281 	const TUint8* guard = foreign + foreignLength;
       
   282 	
       
   283 	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
       
   284 	TUint16* unicodeguard = unicode + aUnicode.MaxLength();
       
   285 	
       
   286 	TUint8 b1, b2, b3, b4;
       
   287 	enum TCodeType
       
   288 	{
       
   289 	E1Byte = 0,
       
   290 	E2Byte,
       
   291 	E4ByteBmp,
       
   292 	E4ByteSupplementary,
       
   293 	EError,
       
   294 	};
       
   295 	TCodeType codetype;
       
   296 	TUint32 unicodeChar;
       
   297 
       
   298 	//loop going through the characters of the foreign descriptor
       
   299 	while (foreign < guard)
       
   300 		{
       
   301 		// roughly, detect which area the foreign code belongs to
       
   302 		b1 = *foreign++;
       
   303 		if (b1 <= 0x7F)
       
   304 			codetype = E1Byte;
       
   305 		else if (b1 == 0x80 || b1 > 0xFE)
       
   306 			codetype = EError;
       
   307 		else if (foreign >= guard)
       
   308 			codetype = EError;
       
   309 		else
       
   310 			{
       
   311 			b2 = *foreign++;
       
   312 			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
       
   313 				codetype = E2Byte;
       
   314 			else if (b2 < 0x30 || b2 > 0x39)
       
   315 				codetype = EError;
       
   316 			else if (foreign+1 >= guard)
       
   317 				codetype = EError;
       
   318 			else
       
   319 				{
       
   320 				b3 = *foreign++;
       
   321 				if (b3 < 0x81 || b3 > 0xFE)
       
   322 					codetype = EError;
       
   323 				else
       
   324 					{
       
   325 					b4 = *foreign++;
       
   326 					if (b4 < 0x30 || b4 > 0x39)
       
   327 						codetype = EError;
       
   328 					else if (b1 >= 0x81 && b1 <= 0x84)		// 0x81308130-0x8439FE39
       
   329 						codetype = E4ByteBmp;
       
   330 					else if (b1 >= 0x90 && b1 <= 0xE3)		// 0x90308130-0xE339FE39
       
   331 						codetype = E4ByteSupplementary;
       
   332 					else
       
   333 						codetype = EError;					// others are reserved
       
   334 					}
       
   335 				}
       
   336 			}
       
   337 		
       
   338 		// cp54936 to unicode
       
   339 		if (codetype == E1Byte)
       
   340 			{
       
   341 			unicodeChar = b1;
       
   342 			}
       
   343 		else if (codetype == E2Byte)
       
   344 			{
       
   345 			// conventional algorithm used in FatCharsetConv
       
   346 			const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
       
   347 			if (structPtr->iUnicodeIfSingle)
       
   348 				unicodeChar = structPtr->iUnicodeIfSingle;
       
   349 			else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
       
   350 				unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
       
   351 			else
       
   352 				unicodeChar = 0xFFFD;
       
   353 			}
       
   354 		else if (codetype == E4ByteBmp)
       
   355 			{
       
   356 			TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
       
   357 			__ASSERT_DEBUG(index<39420, Panic(E4ByteIndexOutOfRange));
       
   358 			unicodeChar = KMappingTable4ByteBmp2Unicode[index];
       
   359 			}
       
   360 		else if (codetype == E4ByteSupplementary)
       
   361 			{
       
   362 			unicodeChar = 0x10000 + (b1 - KU10000Byte1) * 12600 +
       
   363 									(b2 - KU10000Byte2) * 1260 +
       
   364 									(b3 - KU10000Byte3) * 10 +
       
   365 									(b4 - KU10000Byte4);
       
   366 			__ASSERT_DEBUG(unicodeChar >= 0x10000 && unicodeChar <= 0x10FFFF, Panic(EInavlidUnicodeValue));
       
   367 			}
       
   368 		else
       
   369 			{
       
   370 			unicodeChar = 0xFFFD;
       
   371 			}
       
   372 		
       
   373 		// append to output buffer
       
   374 		if (IsSupplementary(unicodeChar))
       
   375 			{
       
   376 			if (unicode + 1 >= unicodeguard)
       
   377 				{
       
   378 				aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   379 				if (leaveWhenOverflow)
       
   380 					User::Leave(KErrOverflow);
       
   381 				else
       
   382 					return KErrOverflow;
       
   383 				}
       
   384 			*unicode++ = GetHighSurrogate(unicodeChar);
       
   385 			*unicode++ = GetLowSurrogate(unicodeChar);
       
   386 			}
       
   387 		else
       
   388 			{
       
   389 			if (unicode >= unicodeguard)
       
   390 				{
       
   391 				aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   392                 if (leaveWhenOverflow)
       
   393                 	User::Leave(KErrOverflow);
       
   394                 else
       
   395                 	return KErrOverflow;
       
   396 				}
       
   397 			*unicode++ = unicodeChar;
       
   398 			}
       
   399 		}
       
   400 	aUnicode.SetLength(unicode-aUnicode.Ptr());
       
   401 	return KErrNone;
       
   402 	}
       
   403 
       
   404 EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
       
   405 	{
       
   406 	//1. aCharacter >= 0x0080 
       
   407 	if (aCharacter>=0x0080)
       
   408 		{
       
   409 		// Since all Unicode characters can be mapped to GB18030, so no need to
       
   410 		// test the converting.
       
   411 		if (aCharacter <= 0x10FFFF && !IsSurrogate(aCharacter))
       
   412 			return ETrue;
       
   413 		else
       
   414 			return EFalse;
       
   415 		}
       
   416 
       
   417     // For most common cases: 
       
   418     // Note: lower case characters are considered legal DOS char here. 
       
   419 	if ((aCharacter>='a' && aCharacter<='z') || 
       
   420 	    (aCharacter>='A' && aCharacter<='Z') || 
       
   421 	    (aCharacter>='0' && aCharacter<='9'))
       
   422 			{
       
   423 			return ETrue;
       
   424 			}
       
   425     // Checking for illegal chars: 
       
   426     // 2. aCharacter <= 0x20 
       
   427     // Note: leading 0x05 byte should be guarded by callers of this function 
       
   428     //  as the information of the position of the character is required. 
       
   429 	if (aCharacter < 0x20)
       
   430 		return EFalse;
       
   431 	// Space (' ') is not considered as a legal DOS char here.
       
   432 	if (aCharacter == 0x20)
       
   433 		return EFalse;
       
   434 	
       
   435 	// 3. 0x20 < aCharacter < 0x80 
       
   436     // According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": 
       
   437     switch (aCharacter) 
       
   438             { 
       
   439             case 0x22:        // '"' 
       
   440             case 0x2A:        // '*' 
       
   441             case 0x2B:        // '+' 
       
   442             case 0x2C:        // ',' 
       
   443             //case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it 
       
   444                                          // is a valid character in short file names. 
       
   445             case 0x2F:        // '/' 
       
   446             case 0x3A:        // ':' 
       
   447             case 0x3B:        // ';' 
       
   448             case 0x3C:        // '<' 
       
   449             case 0x3D:        // '=' 
       
   450             case 0x3E:        // '>' 
       
   451             case 0x3F:        // '?' 
       
   452             case 0x5B:        // '[' 
       
   453             case 0x5C:        // '\' 
       
   454             case 0x5D:        // ']' 
       
   455             case 0x7C:        // '|' 
       
   456             	return EFalse; 
       
   457             default: 
       
   458             	return ETrue; 
       
   459             } 
       
   460 	}		
       
   461