diff -r 000000000000 -r 1fb32624e06b charconvfw/charconvplugins/src/plugins/j5.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/charconvfw/charconvplugins/src/plugins/j5.cpp Tue Feb 02 02:02:46 2010 +0200 @@ -0,0 +1,823 @@ +/* +* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* J5 charconv character converter +* +*/ + + +#include +#include +#include +#include +#include +#include +#include "shiftjis.h" +#include "jisbase.h" +#include "j5.h" + +#include "jisx0201.h" +#include "jisx0208.h" +#include "jisx0212.h" + +#include "featmgr/featmgr.h" + +/** + J5 will use up to KMaxSizeAutoDetectSample to try to deterine the format of data. + */ +const TInt KMaxSizeAutoDetectSample = 1000; + +const TUint8 KEscape = 0x1b; +const TInt KByteOrderMark = 0xfeff; + +const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters() + { + return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters(); + } + +/** + This API should not be used as it is ambiguous as to what encoding is required. + The user should instead call the specific plug-in for the appropriate conversion. + J5 ConvertFromUnicode() will convert to UTF8 as default. +@internalTechnology + */ +TInt CJ5Converter::ConvertFromUnicode( + CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */, + const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */, + TDes8& aForeign, + const TDesC16& aUnicode, + CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */) + { + return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode); + } + +/** + This will automatically determine one of the five supported encodings + to use and convert accordingly. This plugin method is available to the + user though the CCnvCharacterSetConverter::ConvertToUnicode() method. + There is no way for the caller to determine which encoding has been used. + + NOTE: For debugging the selected character set is returned in the state. + + @released 9.1 + @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters + in the foreign character set. + @param aUnicode On return, contains the text converted into Unicode. + @param aForeign The non-Unicode source text to be converted. + @param aState Used to save state information across multiple calls + to ConvertToUnicode(). + @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not + converted. + @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first bytein the + input text that could not be converted. A negative + value indicates that all the characters were + converted. + @return The number of unconverted bytes left at the end of the input descriptor + (e.g. because the output descriptor is not long enough to hold all the text), + or one of the error values defined in TError. + @internalTechnology +*/ +TInt CJ5Converter::ConvertToUnicode( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aState, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) + { + // As the aState parameter is used to pass back the detected value + // use a "hidden" internal state variable. + TInt internalState = CCnvCharacterSetConverter::KStateDefault; + + // determine the encoding type and then decode appropriatly + switch ( DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign)) + { + case EShiftjis: + aState = EShiftjis; + return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); + + case EIso2022jp1: + aState = EIso2022jp1; + return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState, + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); + + case EEucjp: + aState = EEucjp; + return ConvertEEucjpToUnicode( + aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, internalState, + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); + + case EUcs2: + aState = EUcs2; + return ConvertUcs2ToUnicode( aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, + aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter); + + case EUtf8: + aState = EUtf8; + return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign); + + default: + // fall though to the default, which is decode as UTF8 + aState = EUnknown; + break; + } + + // decode as UTF8 + return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign); + } + +/** + This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL(). + This method returns a value between 0 and 100, indicating how likely it + is that this is the correct converter, for the text supplied. As J5 is + NOT intended to be used with the existing auto-detect mechanism, it will + always return 0 + @internalTechnology + */ +TBool CJ5Converter::IsInThisCharacterSetL( + TBool& aSetToTrue, + TInt& aConfidenceLevel, + const TDesC8& /* aSample */) + { + /* + aSetToTrue - This value should be set to ETrue. It is used to indicate to + CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL + is implementing a function of this signature and is therefore not the empty + */ + aSetToTrue=ETrue; + + /* no need to look at the sample as this always returns 0 + as the autodetect feature is not supported by the J5 plug-in + */ + aConfidenceLevel=0; + return ETrue; + } + +CJ5Converter* CJ5Converter::NewL() + { + CJ5Converter* self = new(ELeave) CJ5Converter(); + CleanupStack::PushL(self); + self->ConstructL(); + CleanupStack::Pop(self); + return self; + } + +CJ5Converter::~CJ5Converter() + { + FeatureManager::UnInitializeLib(); + } + +CJ5Converter::CJ5Converter() + { + } + +void CJ5Converter::ConstructL() + { + FeatureManager::InitializeLibL(); + } + +const TImplementationProxy ImplementationTable[] = + { +#ifdef KDDIAU_TEST + // for the test build use a special test UID + IMPLEMENTATION_PROXY_ENTRY(0x01000002, CJ5Converter::NewL) +#else + IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5, CJ5Converter::NewL) +#endif + }; + +EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount) + { + aTableCount = sizeof(ImplementationTable) / sizeof(TImplementationProxy); + + return ImplementationTable; + } + +/** + DetectEncoding determine the characterset encoding. + The logic for this detection is based on the information in CJKV by Ken Lunde. + A detailed diagram of this logic is in the J5 how to document section 2.4 + @return The detected character set as a enum CJ5Converter. + @internalTechnology + */ +enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding( + CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters , + const TDesC8& aForeign) + { + + // first check for UCS2 + CCnvCharacterSetConverter::TEndianness ucs2Endianness = CCnvCharacterSetConverter::ELittleEndian; + if ( DetectUcs2(aForeign, ucs2Endianness )) + { + // if ucs2 is detected pass back the detected endianess + aDefaultEndiannessOfForeignCharacters = ucs2Endianness; + return EUcs2; + } + + // next try EUC_JP + TInt eucJpValidBytes = 0; + CJ5Converter::TDectectCharacterSet result = DetectEucJp( aForeign, eucJpValidBytes ); + if ( result == EIsCharacterSet ) + { + return EEucjp; + } + + // next try Iso 2020JP + if ( DetectIso2022( aForeign ) == EIsCharacterSet ) + { + return EIso2022jp1; + } + + // next try Utf8 + if ( DetectUtf8( aForeign ) == EIsCharacterSet ) + { + return EUtf8; + } + + // shiftjis + TInt shiftjisValidBytes = 0; + result = DetectShiftJis( aForeign, shiftjisValidBytes ); + if ( result == EIsCharacterSet ) + { + return EShiftjis; + } + + // no clear winner so go for the best + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample); + + // if more than half is shiftjis and more shiftjis than EUC_JP, + if ((shiftjisValidBytes > eucJpValidBytes ) && (shiftjisValidBytes * 2> sampleLength)) + return EShiftjis; + + // if more than half is EUC_JP and more EUC_JP than shiftjis, + if ((eucJpValidBytes > shiftjisValidBytes ) && (eucJpValidBytes * 2> sampleLength)) + return EEucjp; + + // return the default + return EUcs2; + } + + +/** + Check if UCS2. + If the first two bytes are the Unicode Endian Specifiers (0xfffe or 0xfeff) + then this must be UCS2. Otherwise try lookiing for 0x**00 or 0x00** + @param A sample of data to be checked + @param The Endianness if USC2 is detected + @return ETrue if UCS2 else EFalse + @internalTechnology + */ +TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign, + CCnvCharacterSetConverter::TEndianness& aTEndianness ) + { + // if the sample is not big enough + if (aForeign.Length() < 2) + { + return EFalse; + } + else if (aForeign[0]==0xff && aForeign[1]==0xfe ) + { + // we have found a Little Endian Byte order mark + aTEndianness = CCnvCharacterSetConverter::ELittleEndian; + return ETrue; + } + else if (aForeign[0]==0xfe && aForeign[1]==0xff ) + { + // we have found a Big Endian Byte order mark + aTEndianness = CCnvCharacterSetConverter::EBigEndian; + return ETrue; + } + + // Next check for sequences of 0x**00 or 0x00** as UCS-2 is the only charset that + // specifies 0x**00 or 0x00** (according to endianness) for the ASCII range of characters. + // NB: This will fail if there are no ASCII characters in the text. + TInt sampleLength = aForeign.Length(); + sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; + + // check the sample for sequences of 0x**00 or 0x00** + TInt bigEndianConfidence = 0; + TInt littleEndianConfidence = 0; + TInt i=0; + for(;i< (sampleLength-1); i+=2) + { + if( aForeign[i] == 0x00) + { + bigEndianConfidence +=2; + } + else if ( aForeign[i+1] == 0x00) + { + littleEndianConfidence +=2; + } + } + + // which occurs most BE or LE + TInt confidenceLevel = 0; + if (bigEndianConfidence > littleEndianConfidence) + { + aTEndianness = CCnvCharacterSetConverter::EBigEndian; + confidenceLevel = bigEndianConfidence; + } + else + { + aTEndianness = CCnvCharacterSetConverter::ELittleEndian; + confidenceLevel = littleEndianConfidence; + } + + // if more than 97% count as UCS2 + if ( confidenceLevel * 100/sampleLength > 97) + return ETrue; + + return EFalse; + } + +/** + Check if ShiftJis (reference CJKV by Ken Lunde page 175) + @param A sample of data to be checked + @param The number of input bytes that can be converted + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet + @internalTechnology + */ +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted ) + { + // Get the sample length + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; + + TInt i=0; + aNumberOfBytesConverted = 0; + + TText8 character; + TText8 characterPlus1; + TText8 characterPlus2; + + // scan the sample text looking for valid shiftjis data + while ( i < sampleLength ) + { + // get the next few characters, use 0 if there is no more sample + // as this will not match any test. + character = aForeign[i]; + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); + + // SHIFTJIS - 0x8e to 0x9f followed by 0x40 to 0xfc + if ((character >= 0x81) && (character <= 0x9f) && + (characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) ) + { + // this is SHIFTJIS unless it is EUC JP code set 2 or 3 + if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF)) + { + // this could be EUC JP code set 2 (or shiftjis) + aNumberOfBytesConverted+=2; + i++; + } + else if ((character == 0x8F) && + (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) && + (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF)) + { + // this could be EUC JP code set 3 (or shiftjis) + aNumberOfBytesConverted+=3; + i+=2; + } + else + { + // this can only be shift jis + return EIsCharacterSet; + } + } + + // SHIFTJIS - 0xE0 to 0xEF followed by ..... + else if ((character >= 0xE0) && (character <= 0xEF)) + { + // 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF + // including Mopera extension to shiftjis from 0xEF80 to 0xEFFC + + if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) ) + { + // this can only be shift jis + return EIsCharacterSet; + } + else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) ) + { + // this could be EUC JP code set 1 + aNumberOfBytesConverted+=2; + i++; + } + + // problem here is the overlap between the UTF8 and shiftjis + else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) ) + { + // this could be shiftjis or utf8 + aNumberOfBytesConverted+=2; + i++; + } + } + // half width katakana A1-DF + else if ((character >= 0xA1) && (character <= 0xDF)) + { + aNumberOfBytesConverted+=1; + } + // ASCII or JIS-Roman 20-7e + else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D)) + { + aNumberOfBytesConverted+=1; + } + else + { + // This is not decoding as shiftjis, so reject + aNumberOfBytesConverted =0; + return EIsNotCharacterSet; + } + i++; + } + + // if all the characters could be converted + if (aNumberOfBytesConverted == sampleLength) + { + return EIsCharacterSet; + } + else if (aNumberOfBytesConverted == 0) + { + return EIsNotCharacterSet; + } + else + { + return EMaybeCharacterSet; + } + } + +/** + Check if UTF8 (reference CJKV by Ken Lunde page 189) + @param A sample of data to be checked + @param The number of input bytes that can be converted + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet + @internalTechnology + */ +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign ) + { + // Get the sample length + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; + + TInt i=0; + TText8 character; + TText8 characterPlus1; + TText8 characterPlus2; + TText8 characterPlus3; + + // scan the sample text looking for valid UTF8 + while ( i < sampleLength ) + { + // get the next few characters, use 0 if there is no more sample + // as this will not match any test. + character = aForeign[i]; + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); + characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0); + + // UTF8 range 110xxxxx followed by one valid UTF8 bytes + if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) ) + { + // two bytes of valid UTF8 found + i+=2; + } + // UTF8 range 1110xxxx followed by two valid UTF8 bytes + else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80)) + { + // three bytes of valid UTF8 found + i+=3; + } + // UTF8 range 11110xxx followed by three valid UTF8 bytes + else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80) + && (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) ) + { + // four bytes of valid UTF8 found + i+=4; + } + + // ascii range 0 to 0x7F + else if((character & 0x80)==0x00) + { + // The value of character is in the range 0x00-0x7f + // UTF8 maintains ASCII transparency. So it's a valid UTF8. + i++; + } + // if the sample data is longer than KMaxSizeAutoDetectSample then except anything + // for the last two bytes as they may not appear valid without more data + else if( i >= (KMaxSizeAutoDetectSample -2) ) + { + i++; + } + else + { + // This is not decoding as UTF8 so reject + return EIsNotCharacterSet; + } + } + + // All the characters could be converted + return EIsCharacterSet; + + } + + +/** + Check if ISO2022JP by lookiing for the escape sequences. + @param A sample of data to be checked + @param The number of input bytes that can be converted + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet + @internalTechnology + */ +enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign ) + { + // Get the sample length + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; + + TInt i=0; + TText8 character; + TText8 characterPlus1; + TText8 characterPlus2; + TText8 characterPlus3; + TText8 characterPlus4; + TText8 characterPlus5; + + // scan the sample text looking for valid UTF8 + while ( i < sampleLength ) + { + // get the next few characters, use 0 if there is no more sample + // as this will not match any test. + character = aForeign[i]; + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); + characterPlus3 = ( i < (sampleLength-3) ? aForeign[i+3]:0); + + + // check for the JIS escape sequences of ISO 2022Jp + // These values have been taken from JISBASE_SHARED + if (character == KEscape) + { + // Escape Sequence For Jis C6226_1978 \x1b\x24\x40 + if ((characterPlus1 == 0x24) && (characterPlus2 == 0x40)) + { + return EIsCharacterSet; + } + + // Escape Sequence For Jis X0208_1983 \x1b\x24\x42 + else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x42)) + { + return EIsCharacterSet; + } + + // Escape Sequence For Jis Roman \x1b\x28\x4a + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x4A)) + { + return EIsCharacterSet; + } + + // Escape Sequence For Jis RomanIncorrect \x1b\x28\x48 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x48)) + { + return EIsCharacterSet; + } + + // Escape Sequence For Ascii \x1b\x28\x42 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x42)) + { + return EIsCharacterSet; + } + + // Escape Sequence For EscapeSequenceForHalfWidthKatakana \x1b\x28\x49 + else if ((characterPlus1 == 0x28) && (characterPlus2 == 0x49)) + { + return EIsCharacterSet; + } + + // Escape Sequence For Jis X0208_199x \x1b\x26\x40\x1b\x24\x42 + else if ((characterPlus1 == 0x26) && (characterPlus2 == 0x40)) + { + characterPlus4 = ( i < (sampleLength-4) ? aForeign[i+4]:0); + characterPlus5 = ( i < (sampleLength-5) ? aForeign[i+5]:0); + + if ((characterPlus3 == 0x1b) && (characterPlus4 == 0x24) && (characterPlus5 == 0x42)) + { + return EIsCharacterSet; + } + } + // Escape Sequence For Jis X0212_1990 \x1b\x24\x28\x44 + else if ((characterPlus1 == 0x24) && (characterPlus2 == 0x28)) + { + if (characterPlus3 == 0x44) + { + return EIsCharacterSet; + } + } + + // check for the JIS escape sequences of ISO 2022Jp "B@" x42 x40 + else if ((characterPlus1 == 'B') || (characterPlus1 == '@')) + { + return EIsCharacterSet; + } + + } // end of if ( character == KEscape ) + + i++; + } + + // if escape sequences have been found then this is not ISO2022 + return EIsNotCharacterSet; + + } + + +/** + Check if EUC JP (reference CJKV by Ken Lunde page 164) + @param A sample of data to be checked + @param The number of input bytes that can be converted + @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet + @internalTechnology + */ +CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted ) + { + // Get the sample length + TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);; + + TInt i=0; + aNumberOfBytesConverted = 0; + + TText8 character; + TText8 characterPlus1; + TText8 characterPlus2; + + // scan the sample text looking for valid shiftjis data + while ( i < sampleLength ) + { + // get the next few characters, use 0 if there is no more sample + // as this will not match any test. + character = aForeign[i]; + characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0); + characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0); + + // EUCJP code set 0 0x21-0x7e + if ( (character >= 0x21) && (character <= 0x7e)) + { + aNumberOfBytesConverted++; + } + else if ( (character == 0x0a) || (character == 0x0d)) + { + aNumberOfBytesConverted++; + } + // EUCJP code set 1 + else if ( (character >= 0xa1) && (character <= 0xff) + && (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xff) ) + { + aNumberOfBytesConverted+=2; + i++; + } + + // EUC JP code set 2, starts with the EUC JP SS2 character (0x8E) + // and is followed by character in range 0xA1- 0xDF + else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) ) + { + // this could be 2 bytes of EUC JP code set 2 + aNumberOfBytesConverted += 2; + i++; + } + // EUC JP code set 3, starts with the EUC JP SS3 character (0x8F) + // and is followed by two characters in range A1- DF A1 -FE + else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) + && (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF)) + { + // this could be 3 bytes of EUC JP code set 3 + aNumberOfBytesConverted += 3; + i+=2; + } + else + { + // This is not a valid decoding as EUC JP so reject + return EIsNotCharacterSet; + } + i++; + } + + + // if all the characters could be converted + if (aNumberOfBytesConverted == sampleLength) + { + return EIsCharacterSet; + } + else if (aNumberOfBytesConverted == 0) + { + return EIsNotCharacterSet; + } + else + { + return EMaybeCharacterSet; + } + } + + +/** + Convert from UCS2 (Universal Character Set containing two bytes) to unicode + Remove any byte order marks in the UCSs. + @param aUnicode Contains the converted text in the Unicode character set. + @param aForeign The non-Unicode source text to be converted + @param aNumberOfUnconvertibleCharacters Contains the number of bytes which were not converted. + @param aIndexOfFirstByteOfFirstUnconvertibleCharacter The index of the first byte of the first unconvertible character. + @return the number of bytes converted + @internalTechnology + */ + TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) + + { + TInt numberOfBytesConverted = 0; + TInt numberOfUnicodeCharacters =0; + TChar nextChar; + + // start at begining of the output buffer provided + aUnicode.Zero(); + + // while there is at least 2 bytes of data to convert and space in the output buffer + while ( (numberOfBytesConverted+1 < aForeign.Size()) && (numberOfUnicodeCharacters < aUnicode.MaxLength()) ) + { + if (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian ) + { + // ELittleEndian 0x??00 + nextChar = aForeign[numberOfBytesConverted] + ( aForeign[numberOfBytesConverted+1] << 8); + } + else + { + // EBigEndian 0x00?? + nextChar = ( aForeign[numberOfBytesConverted] <<8 ) + aForeign[numberOfBytesConverted+1]; + } + + // save the unicode character extracted unless it's a BOM + if ( nextChar != KByteOrderMark ) + { + aUnicode.Append( nextChar ); + numberOfUnicodeCharacters++; + } + + numberOfBytesConverted+=2; + } + + // there are no uncovertable characters with UCS2, but there could be + aNumberOfUnconvertibleCharacters = 0; + // a negative value indicates that all characters converted + aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1; + + // returns the number of unconverted bytes left at the end of the input descriptor + // Note there could be 1 byte left over if an odd number of bytes provided for conversion + return aForeign.Size() - numberOfBytesConverted; + } + +/** + Convert from EUC_JP (Extended Unix Code encoding for Japanese) + Using the standard Charconv method of an array of methods + @return the number of bytes converted + @internalTechnology + */ + TInt CJ5Converter::ConvertEEucjpToUnicode( + CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, + TDes16& aUnicode, + const TDesC8& aForeign, + TInt& /*aState*/, + TInt& aNumberOfUnconvertibleCharacters, + TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) + { + TFixedArray methods; + methods[0].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisRoman; + methods[0].iConvertToIntermediateBufferInPlace=DummyConvertToIntermediateBufferInPlace; + methods[0].iConversionData=&CnvJisRoman::ConversionData(); + methods[0].iNumberOfBytesPerCharacter=1; + methods[0].iNumberOfCoreBytesPerCharacter=1; + methods[1].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0208; + methods[1].iConvertToIntermediateBufferInPlace=ConvertToJisX0208FromEucJpPackedInPlace; + methods[1].iConversionData=&CnvJisX0208::ConversionData(); + methods[1].iNumberOfBytesPerCharacter=2; + methods[1].iNumberOfCoreBytesPerCharacter=2; + methods[2].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToHalfWidthKatakana8; + methods[2].iConvertToIntermediateBufferInPlace=ConvertToHalfWidthKatakana8FromEucJpPackedInPlace; + methods[2].iConversionData=&CnvHalfWidthKatakana8::ConversionData(); + methods[2].iNumberOfBytesPerCharacter=2; + methods[2].iNumberOfCoreBytesPerCharacter=1; + methods[3].iNumberOfBytesAbleToConvert=NumberOfBytesAbleToConvertToJisX0212; + methods[3].iConvertToIntermediateBufferInPlace=ConvertToJisX0212FromEucJpPackedInPlace; + methods[3].iConversionData=&CnvJisX0212::ConversionData(); + methods[3].iNumberOfBytesPerCharacter=3; + methods[3].iNumberOfCoreBytesPerCharacter=2; + return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, methods.Array()); + } +