userlibandfileserver/fatfilenameconversionplugins/src/cp54936_unicodeconv.cpp
author John Imhofe <john.imhofe@nokia.com>
Mon, 21 Dec 2009 16:14:42 +0000
changeset 15 4122176ea935
parent 0 a41df078684a
permissions -rw-r--r--
Revision: 200948 + Removing redundant base integration tests and fixing build errors Kit: 200948

/*
* Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/
// There are 2 reasons for not using the existing unicodeconv.cpp:
// 1) "unicode->foreign" in the existing unicodeconv.cpp is quite slow,
//    especially for huge code pages (e.g., Asian code pages). See INC127598.
//
// 2) GB18030 has 32-bit code that existing unicodeconv.cpp cannot handle.
//
// The algorithm of this special version unicodeconv.cpp is straightforward:
// 1) foreign->unicode:
//    1.1) 1 byte/2 byte->unicode bmp: use existing mechanism; mapping table in
//              "cp54936_2byte_tounicode.cpp", which is generated with command
//              "perl -w ..\group\FatConversionTable.pl cp54936_2byte.txt".
//
//    1.2) 4 byte->unicode bmp: convert the 4-byte code to a 16-bit index, then
//              search into the mapping table in "cp54936_4byte_tounicode.cpp",
//              which is generated with command
//              "perl -w ..\group\cp54936_4byte_tounicode.pl cp54936_4byte.txt".
//
//    1.3) 4 byte->unicode non-bmp: calculate with formula in this file.
//
// 2) unicode->foreign:
//    2.1) unicode bmp->1/2/4 byte: the huge table in "cp54936_allbmp_fromunicode.cpp"
//              can map directly, which is generated with command
//              "perl -w ..\group\cp54936_allbmp_fromunicode.pl cp54936_2byte.txt cp54936_4byte.txt".
//
//    2.2) unicode non-bmp->4 byte: calculate with formula in this file.
//
// The function cp54936_2byte_tounicode.cpp::TConvDataStruct::
// ConvertSingleUnicode() is not used anymore. It is kept only so that the
// tool FatConversionTable.pl does not have to be changed.
//
// About the mapping tables "cp54936_2byte.txt" and "cp54936_4byte.txt":
// 1) All Private Use Area (PUA) code points are reserved.
// 2) All GB18030 code points that map to undefined Unicode code points are reserved.
//
//
// About the formula for non-bmp calculation:
// 1) All code points from 0x10000 to 0x10FFFF are supported.
// 2) The mappings for code points in 0x10000-0x1FFFF and 0x30000-0x10FFFF are
//    inferred from the GB18030 standard, since the standard does not define
//    the mapping for code points outside 0x20000-0x2FFFF.


#include <e32std.h>
#include <e32def.h>
#include <e32des8.h> 
#include "unicodeconv.h"
#include "cp54936.h"


// Reason codes passed to Panic(). The numeric values are spelled out because
// they are what appears as the panic number.
enum TFccPanic
	{
	// A foreign (GB18030) code taken from a mapping table is malformed.
	EBadForeignCode = 0,
	// The table index computed from a 4-byte code lies outside the table.
	E4ByteIndexOutOfRange = 1,
	// Bad indices (no use of this code is visible in this source file).
	EPanicBadIndices1 = 2,
	// A computed Unicode value lies outside the valid range.
	// (The spelling of the identifier is kept as it is referenced elsewhere.)
	EInavlidUnicodeValue = 3
	};
// Raises a panic by calling User::Panic() with the category "FatCharsetConv"
// and aPanic as the reason code.
void Panic(TFccPanic aPanic)
	{

	User::Panic(_L("FatCharsetConv"),aPanic);
	}


//replacement character to be used when unicode cannot be converted
//(0x5F is the ASCII underscore '_')
const TUint8 KForeignReplacement = 0x5F;

// The four bytes of the GB18030 code 0x90308130. The conversion functions in
// this file treat that code as U+10000: supplementary characters are encoded
// and decoded arithmetically as an offset from it (byte 1 and byte 3 are the
// more significant digit of each byte pair, see the 12600/1260/10/1 weights
// used in ConvertToUnicodeL).
const TUint8 KU10000Byte1 = 0x90;
const TUint8 KU10000Byte2 = 0x30;
const TUint8 KU10000Byte3 = 0x81;
const TUint8 KU10000Byte4 = 0x30;

inline TBool IsSupplementary(TUint aChar)
/**
Tests whether a code point lies above the Basic Multilingual Plane.

@param aChar The 32-bit code point value of a Unicode character.

@return True, if aChar is 0x10000 or greater; false, otherwise.
*/
	{
	return aChar >= 0x10000;
	}

inline TBool IsSurrogate(TText16 aInt16)
/**
Tests whether a 16-bit value is a surrogate of either kind.

@param aInt16 The 16-bit value to test.

@return True, if aInt16 is in the range 0xD800-0xDFFF (high or low surrogate);
        false, otherwise.
*/
	{
	return aInt16 >= 0xD800 && aInt16 <= 0xDFFF;
	}

inline TBool IsHighSurrogate(TText16 aInt16)
/**
Tests whether a 16-bit value is a high (leading) surrogate.

@param aInt16 The 16-bit value to test.

@return True, if aInt16 is in the range 0xD800-0xDBFF; false, otherwise.
*/
	{
	return aInt16 >= 0xD800 && aInt16 <= 0xDBFF;
	}

inline TBool IsLowSurrogate(TText16 aInt16)
/**
Tests whether a 16-bit value is a low (trailing) surrogate.

@param aInt16 The 16-bit value to test.

@return True, if aInt16 is in the range 0xDC00-0xDFFF; false, otherwise.
*/
	{
	return aInt16 >= 0xDC00 && aInt16 <= 0xDFFF;
	}

inline TUint JoinSurrogate(TText16 aHighSurrogate, TText16 aLowSurrogate)
/**
Builds a supplementary character from a surrogate pair.

The arguments are not validated; the result is only meaningful when
aHighSurrogate is a high surrogate and aLowSurrogate is a low surrogate.

@param aHighSurrogate The high (leading) surrogate of the pair.
@param aLowSurrogate  The low (trailing) surrogate of the pair.

@return The 32-bit code point value of the generated Unicode supplementary
        character.
*/
	{
	// 0xD7F7 folds the three constants of the textbook formula
	// 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00) into one.
	const int highPart = aHighSurrogate - 0xD7F7;
	return (highPart << 10) + aLowSurrogate;
	}

inline TText16 GetHighSurrogate(TUint aChar)
/**
Computes the high surrogate of a supplementary character.

The argument is not validated: the value 0xD7C0 + (aChar >> 10), truncated to
16 bits, is returned for any input, and is a high surrogate only when aChar is
a supplementary character (0x10000-0x10FFFF).

@param aChar The 32-bit code point value of a Unicode character.

@return High surrogate of aChar, if aChar is a supplementary character.
*/
	{
	const TUint topBits = aChar >> 10;
	return STATIC_CAST(TText16, topBits + 0xD7C0);
	}

inline TText16 GetLowSurrogate(TUint aChar)
/**
Computes the low surrogate of a supplementary character.

The argument is not validated: the low 10 bits of aChar combined with 0xDC00
are returned for any input.

@param aChar The 32-bit code point value of a Unicode character.

@return Low surrogate of aChar, if aChar is a supplementary character.
*/
	{
	const TUint lowTenBits = aChar & 0x3FF;
	return STATIC_CAST(TText16, lowTenBits | 0xDC00);
	}

// Converts the UTF-16 text in aUnicode to GB18030 (cp54936) bytes in aForeign.
// Forwards to the three-argument overload with leaveWhenOverflow set to ETrue,
// so that overload calls User::Leave(KErrOverflow) when aForeign is too small.
// The TInt result of the overload is discarded.
EXPORT_C void UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode)
	{
    UnicodeConv::ConvertFromUnicodeL(aForeign, aUnicode, ETrue);
    }

// Converts the UTF-16 text in aUnicode to GB18030 (code page 54936) and stores
// the result in aForeign, overwriting it from its first byte.
//
// - A valid surrogate pair is joined into a supplementary code point and
//   encoded arithmetically as a 4-byte code, counting up from 0x90308130
//   (which stands for U+10000).
// - An unpaired high or low surrogate is written as KForeignReplacement.
// - Every other character is looked up in KMappingTableUnicodeBmp2CP54936,
//   whose 32-bit entries hold a 1-, 2- or 4-byte code.
//
// The length of aForeign is always set to the number of bytes written, also
// when the conversion stops early because of overflow.
//
// @param aForeign          Receives the converted bytes (at most MaxLength()).
// @param aUnicode          The text to convert.
// @param leaveWhenOverflow If true, overflow is reported with
//                          User::Leave(KErrOverflow); otherwise KErrOverflow
//                          is returned.
// @return KErrNone if all of aUnicode was converted; KErrOverflow if aForeign
//         is too small and leaveWhenOverflow is false.
EXPORT_C TInt UnicodeConv::ConvertFromUnicodeL(TDes8& aForeign, const TDesC16& aUnicode, TBool leaveWhenOverflow)
	{
	const TUint16* unicode = aUnicode.Ptr();
	const TUint16* const guard = unicode + aUnicode.Length();

	TUint8* foreign = const_cast<TUint8*>(aForeign.Ptr());
	TUint8* const foreignguard = foreign + aForeign.MaxLength();

	//loop going through the character of the unicode descriptor
	while (unicode < guard)
		{
		const TUint16 codeUnit = *unicode++;

		// byte 1,2,3,4 of result GB18030 code; only the last 'count' are used.
		TUint8 b1 = 0;
		TUint8 b2 = 0;
		TUint8 b3 = 0;
		TUint8 b4 = 0;
		// byte count of result GB18030 code; can be 1, 2 or 4.
		TInt count = 1;

		if (IsHighSurrogate(codeUnit) && unicode < guard && IsLowSurrogate(*unicode))
			{
			// Valid surrogate pair -> supplementary character -> 4-byte code.
			//
			// Only raw 16-bit code units are ever tested for being surrogates.
			// The joined code point must not be: the surrogate helpers take a
			// 16-bit argument, so e.g. U+1DC00 would be truncated to 0xDC00,
			// mistaken for an unpaired low surrogate and replaced by '_'.
			TUint32 offset = JoinSurrogate(codeUnit, *unicode++) - 0x10000;
			b4 = static_cast<TUint8>(offset % 10 + KU10000Byte4);
			offset /= 10;
			b3 = static_cast<TUint8>(offset % 126 + KU10000Byte3);
			offset /= 126;
			b2 = static_cast<TUint8>(offset % 10 + KU10000Byte2);
			b1 = static_cast<TUint8>(offset / 10 + KU10000Byte1);
			count = 4;
			}
		else if (IsSurrogate(codeUnit))
			{
			// Unpaired high surrogate or stray low surrogate.
			b4 = KForeignReplacement;
			}
		else
			{
			// BMP character: the table entry packs the code big-endian, with
			// unused leading bytes set to zero.
			const TUint32 foreignChar = KMappingTableUnicodeBmp2CP54936[codeUnit];
			b1 = static_cast<TUint8>((foreignChar >> 24) & 0xFF);
			b2 = static_cast<TUint8>((foreignChar >> 16) & 0xFF);
			b3 = static_cast<TUint8>((foreignChar >> 8) & 0xFF);
			b4 = static_cast<TUint8>(foreignChar & 0xFF);
			if (b1)
				{
				count = 4;
				}
			else
				{
				__ASSERT_DEBUG(b2==0, Panic(EBadForeignCode));
				if (b3)
					{
					count = 2;
					}
				}
			}

		// Single overflow check for all three cases above.
		if (foreignguard - foreign < count)
			{
			aForeign.SetLength(foreign-aForeign.Ptr());
			if (leaveWhenOverflow)
				User::Leave(KErrOverflow);
			return KErrOverflow;
			}

		if (count == 4)
			{
			*foreign++ = b1;
			*foreign++ = b2;
			}
		if (count >= 2)
			*foreign++ = b3;
		*foreign++ = b4;
		}
	aForeign.SetLength(foreign-aForeign.Ptr());
	return KErrNone;
	}


// Converts the GB18030 (cp54936) bytes in aForeign to UTF-16 text in aUnicode.
// Forwards to the three-argument overload with leaveWhenOverflow set to ETrue,
// so that overload calls User::Leave(KErrOverflow) when aUnicode is too small.
// The TInt result of the overload is discarded.
EXPORT_C void UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign)
	{
    UnicodeConv::ConvertToUnicodeL(aUnicode, aForeign, ETrue);
    }

// Converts the GB18030 (code page 54936) bytes in aForeign to UTF-16 and
// stores the result in aUnicode, overwriting it from its first character.
//
// Each step decodes one code and appends one character:
// - 1 byte  (0x00-0x7F): copied unchanged.
// - 2 bytes (lead 0x81-0xFE, trail 0x40-0xFE except 0x7F): looked up in the
//   TConvDataStruct tables.
// - 4 bytes with lead 0x81-0x84: converted to an index into
//   KMappingTable4ByteBmp2Unicode.
// - 4 bytes with lead 0x90-0xE3: computed arithmetically as a supplementary
//   character and written as a surrogate pair.
// - Anything else (bad or truncated sequence, reserved range, index beyond
//   the table, value above U+10FFFF) produces U+FFFD. The bytes examined up to
//   the point where the sequence was found to be bad are consumed.
//
// The input is external data, so out-of-range codes are rejected at run time
// in all builds instead of being checked with debug-only assertions.
//
// The length of aUnicode is always set to the number of 16-bit units written,
// also when the conversion stops early because of overflow.
//
// @param aUnicode          Receives the converted text (at most MaxLength()).
// @param aForeign          The bytes to convert.
// @param leaveWhenOverflow If true, overflow is reported with
//                          User::Leave(KErrOverflow); otherwise KErrOverflow
//                          is returned.
// @return KErrNone if all of aForeign was converted; KErrOverflow if aUnicode
//         is too small and leaveWhenOverflow is false.
EXPORT_C TInt UnicodeConv::ConvertToUnicodeL(TDes16& aUnicode, const TDesC8& aForeign, TBool leaveWhenOverflow)
	{
	// Number of entries in KMappingTable4ByteBmp2Unicode: the last 4-byte code
	// that maps into the BMP is 0x8431A439, whose index is 39419.
	const TUint K4ByteBmpTableSize = 39420;
	const TUint32 KUnicodeReplacement = 0xFFFD;
	const TUint32 KMaxUnicode = 0x10FFFF;

	const TUint8* foreign = aForeign.Ptr();
	const TUint8* const guard = foreign + aForeign.Length();

	TUint16* unicode = const_cast<TUint16*>(aUnicode.Ptr());
	TUint16* const unicodeguard = unicode + aUnicode.MaxLength();

	//loop going through the characters of the foreign descriptor
	while (foreign < guard)
		{
		// Stays U+FFFD unless one of the branches below decodes a character.
		TUint32 unicodeChar = KUnicodeReplacement;

		const TUint8 b1 = *foreign++;
		if (b1 <= 0x7F)
			{
			// 1 byte
			unicodeChar = b1;
			}
		else if (b1 != 0x80 && b1 <= 0xFE && foreign < guard)
			{
			const TUint8 b2 = *foreign++;
			if (b2 >= 0x40 && b2 <= 0xFE && b2 != 0x7F)
				{
				// 2 bytes: conventional algorithm used in FatCharsetConv
				const TLeadOrSingle* structPtr = TConvDataStruct::KFirstByteConversions + (b1-0x80);
				if (structPtr->iUnicodeIfSingle)
					unicodeChar = structPtr->iUnicodeIfSingle;
				else if (TConvDataStruct::KMinTrailByte <= b2 && b2 <= TConvDataStruct::KMaxTrailByte)
					unicodeChar = TConvDataStruct::KDoubleByteConversions[structPtr->iDoubleByteIndex + (b2 - TConvDataStruct::KMinTrailByte)];
				}
			else if (b2 >= 0x30 && b2 <= 0x39 && guard - foreign >= 2)
				{
				// 4 bytes: b3 and b4 are both available.
				const TUint8 b3 = *foreign++;
				if (b3 >= 0x81 && b3 <= 0xFE)
					{
					const TUint8 b4 = *foreign++;
					if (b4 >= 0x30 && b4 <= 0x39)
						{
						if (b1 >= 0x81 && b1 <= 0x84)
							{
							// 0x81308130-0x8439FE39; only the codes up to
							// 0x8431A439 have a table entry.
							const TUint index = (b1-0x81)*12600 + (b2-0x30)*1260 + (b3-0x81)*10 + (b4-0x30);
							if (index < K4ByteBmpTableSize)
								unicodeChar = KMappingTable4ByteBmp2Unicode[index];
							}
						else if (b1 >= 0x90 && b1 <= 0xE3)
							{
							// 0x90308130-0xE339FE39; only the codes up to
							// 0xE3329A35 (U+10FFFF) are valid Unicode.
							const TUint32 supplementary = 0x10000 + (b1 - KU10000Byte1) * 12600 +
																	(b2 - KU10000Byte2) * 1260 +
																	(b3 - KU10000Byte3) * 10 +
																	(b4 - KU10000Byte4);
							if (supplementary <= KMaxUnicode)
								unicodeChar = supplementary;
							}
						// other lead bytes are reserved
						}
					}
				}
			}

		// append to output buffer: a surrogate pair for a supplementary
		// character, a single unit otherwise
		const TInt needed = IsSupplementary(unicodeChar) ? 2 : 1;
		if (unicodeguard - unicode < needed)
			{
			aUnicode.SetLength(unicode-aUnicode.Ptr());
			if (leaveWhenOverflow)
				User::Leave(KErrOverflow);
			return KErrOverflow;
			}
		if (needed == 2)
			{
			*unicode++ = GetHighSurrogate(unicodeChar);
			*unicode++ = GetLowSurrogate(unicodeChar);
			}
		else
			{
			*unicode++ = static_cast<TUint16>(unicodeChar);
			}
		}
	aUnicode.SetLength(unicode-aUnicode.Ptr());
	return KErrNone;
	}

// Tells whether aCharacter may appear in a FAT short (DOS) file name.
//
// - 0x80 and above: legal if it is a Unicode code point (at most 0x10FFFF)
//   that is not a surrogate (0xD800-0xDFFF).
// - ASCII letters (both cases) and digits: legal.
// - 0x00-0x20 (control characters and space): illegal.
// - Other ASCII: legal unless listed in the switch below.
//
// @param aCharacter The 32-bit code point value to test.
// @return ETrue if the character is legal, EFalse otherwise.
EXPORT_C TBool UnicodeConv::IsLegalShortNameCharacter (TUint aCharacter)
	{
	//1. aCharacter >= 0x0080 
	if (aCharacter >= 0x0080)
		{
		// Since all Unicode characters can be mapped to GB18030, so no need to
		// test the converting.
		if (aCharacter > 0x10FFFF)
			return EFalse;
		// The surrogate range is tested on the full 32-bit value. Passing
		// aCharacter to a helper taking a 16-bit argument would truncate it
		// and wrongly reject supplementary characters such as U+1D800.
		if (aCharacter >= 0xD800 && aCharacter <= 0xDFFF)
			return EFalse;
		return ETrue;
		}

	// For most common cases: 
	// Note: lower case characters are considered legal DOS char here. 
	if ((aCharacter>='a' && aCharacter<='z') || 
	    (aCharacter>='A' && aCharacter<='Z') || 
	    (aCharacter>='0' && aCharacter<='9'))
		{
		return ETrue;
		}

	// Checking for illegal chars: 
	// 2. aCharacter <= 0x20 
	// Note: leading 0x05 byte should be guarded by callers of this function 
	//  as the information of the position of the character is required. 
	// Space (' ', 0x20) is not considered as a legal DOS char here.
	if (aCharacter <= 0x20)
		return EFalse;

	// 3. 0x20 < aCharacter < 0x80 
	// According to FAT Spec, "following characters are not legal in any bytes of DIR_Name": 
	switch (aCharacter) 
		{ 
		case 0x22:        // '"' 
		case 0x2A:        // '*' 
		case 0x2B:        // '+' 
		case 0x2C:        // ',' 
		//case 0x2E:        // '.'   // Although '.' is not allowed in any bytes of DIR_Name, it 
		                             // is a valid character in short file names. 
		case 0x2F:        // '/' 
		case 0x3A:        // ':' 
		case 0x3B:        // ';' 
		case 0x3C:        // '<' 
		case 0x3D:        // '=' 
		case 0x3E:        // '>' 
		case 0x3F:        // '?' 
		case 0x5B:        // '[' 
		case 0x5C:        // '\' 
		case 0x5D:        // ']' 
		case 0x7C:        // '|' 
			return EFalse; 
		default: 
			return ETrue; 
		} 
	}