--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/charconvfw/charconvplugins/src/plugins/j5.cpp Tue Feb 02 02:02:46 2010 +0200
@@ -0,0 +1,823 @@
+/*
+* Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+* J5 charconv character converter
+*
+*/
+
+
+#include <e32std.h>
+#include <charconv.h>
+#include <ecom/implementationproxy.h>
+#include <utf.h>
+#include <charactersetconverter.h>
+#include <convutils.h>
+#include "shiftjis.h"
+#include "jisbase.h"
+#include "j5.h"
+
+#include "jisx0201.h"
+#include "jisx0208.h"
+#include "jisx0212.h"
+
+#include "featmgr/featmgr.h"
+
+/**
+ J5 examines at most the first KMaxSizeAutoDetectSample bytes of the input when trying to determine the format of the data.
+ */
+const TInt KMaxSizeAutoDetectSample = 1000;
+
+// The byte that DetectIso2022() looks for as the start of an ISO 2022 JP escape sequence.
+const TUint8 KEscape = 0x1b;
+// The Unicode byte order mark; ConvertUcs2ToUnicode() leaves it out of its output.
+const TInt KByteOrderMark = 0xfeff;
+
+/**
+ Supplies the replacement for unconvertible Unicode characters.
+ J5 defines no replacement of its own: it hands back whatever the Shift-JIS
+ converter (CnvShiftJis) supplies.
+ @return The descriptor returned by CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters().
+ */
+const TDesC8& CJ5Converter::ReplacementForUnconvertibleUnicodeCharacters()
+ {
+ return CnvShiftJis::ReplacementForUnconvertibleUnicodeCharacters();
+ }
+
+/**
+ This API should not be used as it is ambiguous as to what encoding is required.
+ The user should instead call the specific plug-in for the appropriate conversion.
+ J5 ConvertFromUnicode() always converts to UTF8: the endianness, replacement and
+ unconvertible-index arguments are ignored (their names are commented out below).
+ @param aForeign Receives the UTF8 output.
+ @param aUnicode The Unicode text to be converted.
+ @return The value returned by CnvUtfConverter::ConvertFromUnicodeToUtf8().
+@internalTechnology
+ */
+TInt CJ5Converter::ConvertFromUnicode(
+ CCnvCharacterSetConverter::TEndianness /* aDefaultEndiannessOfForeignCharacters */,
+ const TDesC8& /* aReplacementForUnconvertibleUnicodeCharacters */,
+ TDes8& aForeign,
+ const TDesC16& aUnicode,
+ CCnvCharacterSetConverter::TArrayOfAscendingIndices& /* aIndicesOfUnconvertibleCharacters */)
+ {
+ return CnvUtfConverter::ConvertFromUnicodeToUtf8(aForeign, aUnicode);
+ }
+
+/**
+ Auto-detects which of the five supported encodings aForeign uses and converts
+ it to Unicode with the matching converter. This plugin method is available to
+ the user through the CCnvCharacterSetConverter::ConvertToUnicode() method.
+
+ Detection is repeated on every call and no conversion state is carried over
+ from one call to the next: aState is only written here, never read.
+
+ NOTE: For debugging the selected character set is returned in aState
+ (EUnknown if nothing was recognised, in which case the data is decoded as UTF8).
+
+ @released 9.1
+ @param aDefaultEndiannessOfForeignCharacters The default endian-ness to use when reading characters
+        in the foreign character set. DetectEncoding() may replace this local copy.
+ @param aUnicode On return, contains the text converted into Unicode.
+ @param aForeign The non-Unicode source text to be converted.
+ @param aState On return, the encoding that was selected.
+ @param aNumberOfUnconvertibleCharacters On return, contains the number of bytes which were not
+        converted. Not written when the data is decoded as UTF8.
+ @param aIndexOfFirstByteOfFirstUnconvertibleCharacter On return, contains the index of the first byte in the
+        input text that could not be converted. A negative value indicates that all the
+        characters were converted. Not written when the data is decoded as UTF8.
+ @return The number of unconverted bytes left at the end of the input descriptor
+        (e.g. because the output descriptor is not long enough to hold all the text),
+        or one of the error values defined in TError.
+ @internalTechnology
+*/
+TInt CJ5Converter::ConvertToUnicode(
+    CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
+    TDes16& aUnicode,
+    const TDesC8& aForeign,
+    TInt& aState,
+    TInt& aNumberOfUnconvertibleCharacters,
+    TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+    {
+    // aState is reserved for reporting the detected encoding, so the converters
+    // that want a state are given a private one that starts from the default.
+    TInt privateState = CCnvCharacterSetConverter::KStateDefault;
+
+    // Work out the encoding first, then hand over to the matching converter.
+    const TJ5Encoding detected = DetectEncoding(aDefaultEndiannessOfForeignCharacters, aForeign);
+
+    if (detected == EShiftjis)
+        {
+        aState = EShiftjis;
+        return CnvShiftJis::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
+            aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
+        }
+
+    if (detected == EIso2022jp1)
+        {
+        aState = EIso2022jp1;
+        return CnvJisBase::ConvertToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, privateState,
+            aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
+        }
+
+    if (detected == EEucjp)
+        {
+        aState = EEucjp;
+        return ConvertEEucjpToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign, privateState,
+            aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
+        }
+
+    if (detected == EUcs2)
+        {
+        aState = EUcs2;
+        return ConvertUcs2ToUnicode(aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
+            aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter);
+        }
+
+    // Everything else is decoded as UTF8; aState records whether UTF8 was
+    // positively detected or is merely the fall-back.
+    if (detected == EUtf8)
+        {
+        aState = EUtf8;
+        }
+    else
+        {
+        aState = EUnknown;
+        }
+    return CnvUtfConverter::ConvertToUnicodeFromUtf8(aUnicode, aForeign);
+    }
+
+/**
+ This API is used by CCnvCharacterSetConverter::AutoDetectCharacterSetL().
+ A plug-in reports through aConfidenceLevel a value between 0 and 100, indicating
+ how likely it is that this is the correct converter, for the text supplied. As J5 is
+ NOT intended to be used with the existing auto-detect mechanism, it will
+ always report a confidence of 0 (the function itself returns ETrue).
+ @param aSetToTrue Always set to ETrue.
+ @param aConfidenceLevel Always set to 0.
+ @return Always ETrue.
+ @internalTechnology
+ */
+TBool CJ5Converter::IsInThisCharacterSetL(
+ TBool& aSetToTrue,
+ TInt& aConfidenceLevel,
+ const TDesC8& /* aSample */)
+ {
+ /*
+ aSetToTrue - This value should be set to ETrue. It is used to indicate to
+ CCnvCharacterSetConverter::AutoDetectCharacterSetL() that the plug-in DLL
+ is implementing a function of this signature and is therefore not the empty
+ one (the original comment broke off here; presumably "the empty default
+ implementation" was meant).
+ */
+ aSetToTrue=ETrue;
+
+ /* no need to look at the sample as this always reports a confidence of 0
+ as the autodetect feature is not supported by the J5 plug-in
+ */
+ aConfidenceLevel=0;
+ return ETrue;
+ }
+
+/**
+ Factory function, registered in ImplementationTable: allocates a CJ5Converter
+ and then runs its second-phase constructor.
+ @return The constructed converter.
+ */
+CJ5Converter* CJ5Converter::NewL()
+ {
+ CJ5Converter* self = new(ELeave) CJ5Converter();
+ // keep the object on the cleanup stack while ConstructL() runs
+ CleanupStack::PushL(self);
+ self->ConstructL();
+ CleanupStack::Pop(self);
+ return self;
+ }
+
+/**
+ Destructor: releases the FeatureManager library that ConstructL() initialised.
+ NOTE(review): this also runs if ConstructL() left before the initialisation
+ succeeded - confirm that an unmatched UnInitializeLib() call is harmless.
+ */
+CJ5Converter::~CJ5Converter()
+ {
+ FeatureManager::UnInitializeLib();
+ }
+
+/**
+ First-phase constructor. Deliberately empty: the only set-up visible in this
+ file (FeatureManager initialisation) is done in ConstructL().
+ */
+CJ5Converter::CJ5Converter()
+ {
+ }
+
+/**
+ Second-phase constructor, called from NewL(). Initialises the FeatureManager
+ library; the matching UnInitializeLib() call is in the destructor.
+ */
+void CJ5Converter::ConstructL()
+ {
+ FeatureManager::InitializeLibL();
+ }
+
+// The table returned by ImplementationGroupProxy(): a single entry that pairs
+// the plug-in's UID with the CJ5Converter::NewL factory function.
+const TImplementationProxy ImplementationTable[] =
+ {
+#ifdef KDDIAU_TEST
+ // for the test build use a special test UID
+ IMPLEMENTATION_PROXY_ENTRY(0x01000002, CJ5Converter::NewL)
+#else
+ // otherwise register under the J5 character set identifier
+ IMPLEMENTATION_PROXY_ENTRY(KCharacterSetIdentifierJ5, CJ5Converter::NewL)
+#endif
+ };
+
+/**
+ Exported entry point through which the implementation table is obtained.
+ @param aTableCount On return, the number of entries in ImplementationTable.
+ @return ImplementationTable.
+ */
+EXPORT_C const TImplementationProxy* ImplementationGroupProxy(TInt& aTableCount)
+    {
+    // entry count = size of the whole array / size of one of its elements
+    aTableCount = sizeof(ImplementationTable) / sizeof(ImplementationTable[0]);
+    return ImplementationTable;
+    }
+
+/**
+ DetectEncoding determines the character set encoding of aForeign.
+ The detectors are tried in a fixed order - UCS2, EUC_JP, ISO 2022 JP, UTF8 and
+ Shift-JIS - and the first one to report EIsCharacterSet decides the result.
+ If none does, the byte counts reported by the EUC_JP and Shift-JIS detectors
+ are compared; if that is inconclusive as well, EUcs2 is returned.
+ The logic for this detection is based on the information in CJKV by Ken Lunde.
+ A detailed diagram of this logic is in the J5 how to document section 2.4
+ @param aDefaultEndiannessOfForeignCharacters Overwritten with the detected endianness
+        when DetectUcs2() recognises the data, otherwise left unchanged.
+ @param aForeign The data to be examined.
+ @return The detected character set as a enum CJ5Converter.
+ @internalTechnology
+ */
+enum CJ5Converter::TJ5Encoding CJ5Converter::DetectEncoding(
+    CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters ,
+    const TDesC8& aForeign)
+    {
+    // 1. UCS2 - the only case that also passes an endianness back to the caller
+    CCnvCharacterSetConverter::TEndianness detectedEndianness = CCnvCharacterSetConverter::ELittleEndian;
+    if (DetectUcs2(aForeign, detectedEndianness))
+        {
+        aDefaultEndiannessOfForeignCharacters = detectedEndianness;
+        return EUcs2;
+        }
+
+    // 2. EUC_JP - the byte count is kept for the comparison further down
+    TInt eucJpBytes = 0;
+    if (DetectEucJp(aForeign, eucJpBytes) == EIsCharacterSet)
+        {
+        return EEucjp;
+        }
+
+    // 3. ISO 2022 JP
+    if (DetectIso2022(aForeign) == EIsCharacterSet)
+        {
+        return EIso2022jp1;
+        }
+
+    // 4. UTF8
+    if (DetectUtf8(aForeign) == EIsCharacterSet)
+        {
+        return EUtf8;
+        }
+
+    // 5. Shift-JIS
+    TInt shiftJisBytes = 0;
+    if (DetectShiftJis(aForeign, shiftJisBytes) == EIsCharacterSet)
+        {
+        return EShiftjis;
+        }
+
+    // No detector was certain. Settle for Shift-JIS or EUC_JP if one of them
+    // beats the other AND accounts for more than half of the sample.
+    const TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
+    const TBool shiftJisLeads = (shiftJisBytes > eucJpBytes);
+    const TBool eucJpLeads = (eucJpBytes > shiftJisBytes);
+
+    if (shiftJisLeads && (shiftJisBytes * 2 > sampleLength))
+        {
+        return EShiftjis;
+        }
+    if (eucJpLeads && (eucJpBytes * 2 > sampleLength))
+        {
+        return EEucjp;
+        }
+
+    // return the default
+    return EUcs2;
+    }
+
+
+/**
+ Check if UCS2.
+ If the first two bytes are a byte order mark (ff fe or fe ff) then this must
+ be UCS2. Otherwise the sample is scanned for byte pairs of the form 0x00** or
+ 0x**00 and counted as UCS2 if such pairs make up more than 97% of it.
+ @param aForeign A sample of data to be checked
+ @param aTEndianness The endianness if UCS2 is detected. NB: once the scan has
+        run it is overwritten even when the result is EFalse.
+ @return ETrue if UCS2 else EFalse
+ @internalTechnology
+ */
+TBool CJ5Converter::DetectUcs2( const TDesC8& aForeign,
+    CCnvCharacterSetConverter::TEndianness& aTEndianness )
+    {
+    const TInt length = aForeign.Length();
+
+    // fewer than two bytes is not big enough to judge
+    if (length < 2)
+        {
+        return EFalse;
+        }
+
+    // a byte order mark settles both the encoding and the endianness
+    const TUint8 firstByte = aForeign[0];
+    const TUint8 secondByte = aForeign[1];
+    if ((firstByte == 0xff) && (secondByte == 0xfe))
+        {
+        aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
+        return ETrue;
+        }
+    if ((firstByte == 0xfe) && (secondByte == 0xff))
+        {
+        aTEndianness = CCnvCharacterSetConverter::EBigEndian;
+        return ETrue;
+        }
+
+    // No BOM. UCS-2 is the only charset that specifies 0x**00 or 0x00**
+    // (according to endianness) for the ASCII range of characters, so count
+    // the byte pairs that have a zero in one half.
+    // NB: This will fail if there are no ASCII characters in the text.
+    const TInt sampleLength = Min(length, KMaxSizeAutoDetectSample);
+    TInt bigEndianBytes = 0;
+    TInt littleEndianBytes = 0;
+    for (TInt pair = 0; (pair + 1) < sampleLength; pair += 2)
+        {
+        if (aForeign[pair] == 0x00)
+            {
+            bigEndianBytes += 2;
+            }
+        else if (aForeign[pair + 1] == 0x00)
+            {
+            littleEndianBytes += 2;
+            }
+        }
+
+    // adopt whichever byte order scored higher (little endian on a tie)
+    TInt winningBytes = littleEndianBytes;
+    aTEndianness = CCnvCharacterSetConverter::ELittleEndian;
+    if (bigEndianBytes > littleEndianBytes)
+        {
+        winningBytes = bigEndianBytes;
+        aTEndianness = CCnvCharacterSetConverter::EBigEndian;
+        }
+
+    // if more than 97% count as UCS2
+    if ((winningBytes * 100) / sampleLength > 97)
+        {
+        return ETrue;
+        }
+    return EFalse;
+    }
+
+/**
+ Check if ShiftJis (reference CJKV by Ken Lunde page 175)
+ @param aForeign A sample of data to be checked; at most KMaxSizeAutoDetectSample bytes are examined
+ @param aNumberOfBytesConverted On return, the number of bytes accepted so far as possible ShiftJis
+        (set to 0 when the sample is rejected)
+ @return The result of the check as either EIsCharacterSet, EIsNotCharacterSet or EMaybeCharacterSet
+ @internalTechnology
+ */
+enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectShiftJis( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
+ {
+ // Get the sample length
+ TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);;
+
+ TInt i=0;
+ aNumberOfBytesConverted = 0;
+
+ TText8 character;
+ TText8 characterPlus1;
+ TText8 characterPlus2;
+
+ // scan the sample text looking for valid shiftjis data
+ while ( i < sampleLength )
+ {
+ // get the next few characters, use 0 if there is no more sample
+ // as this will not match any test.
+ character = aForeign[i];
+ characterPlus1 = ( i < (sampleLength-1) ? aForeign[i+1]:0);
+ characterPlus2 = ( i < (sampleLength-2) ? aForeign[i+2]:0);
+
+ // SHIFTJIS - 0x81 to 0x9f followed by 0x40 to 0xfc
+ if ((character >= 0x81) && (character <= 0x9f) &&
+ (characterPlus1 >= 0x40) && (characterPlus1 <= 0xfc) )
+ {
+ // this is SHIFTJIS unless it is EUC JP code set 2 or 3
+ if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF))
+ {
+ // this could be EUC JP code set 2 (or shiftjis)
+ aNumberOfBytesConverted+=2;
+ i++;
+ }
+ else if ((character == 0x8F) &&
+ (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) &&
+ (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xDF))
+ {
+ // this could be EUC JP code set 3 (or shiftjis)
+ aNumberOfBytesConverted+=3;
+ i+=2;
+ }
+ else
+ {
+ // this can only be shift jis, so stop scanning here
+ // (aNumberOfBytesConverted keeps the count reached so far)
+ return EIsCharacterSet;
+ }
+ }
+
+ // SHIFTJIS - 0xE0 to 0xEF followed by .....
+ else if ((character >= 0xE0) && (character <= 0xEF))
+ {
+ // 0x40 to 0xFC which overlaps UTF8 between 0x80 and 0xBF
+ // including Mopera extension to shiftjis from 0xEF80 to 0xEFFC
+
+ if ( (characterPlus1 >= 0x40) && (characterPlus1 <= 0x7E) )
+ {
+ // this can only be shift jis
+ return EIsCharacterSet;
+ }
+ else if ( (characterPlus1 >= 0xC0) && (characterPlus1 <= 0xFC) )
+ {
+ // this could be EUC JP code set 1
+ aNumberOfBytesConverted+=2;
+ i++;
+ }
+
+ // problem here is the overlap between the UTF8 and shiftjis
+ else if ( (characterPlus1 >= 0x80) && (characterPlus1 <= 0xBF) )
+ {
+ // this could be shiftjis or utf8
+ aNumberOfBytesConverted+=2;
+ i++;
+ }
+ // any other second byte: the lead byte is stepped over without being
+ // counted, so the "all characters converted" test after the loop can
+ // no longer succeed
+ }
+ // half width katakana A1-DF
+ else if ((character >= 0xA1) && (character <= 0xDF))
+ {
+ aNumberOfBytesConverted+=1;
+ }
+ // ASCII or JIS-Roman 20-7e, plus line feed and carriage return
+ else if ( ((character >= 0x20) && (character <= 0x7E)) || (character == 0x0A) || (character == 0x0D))
+ {
+ aNumberOfBytesConverted+=1;
+ }
+ else
+ {
+ // This is not decoding as shiftjis, so reject
+ aNumberOfBytesConverted =0;
+ return EIsNotCharacterSet;
+ }
+ i++;
+ }
+
+ // if all the characters could be converted
+ if (aNumberOfBytesConverted == sampleLength)
+ {
+ return EIsCharacterSet;
+ }
+ else if (aNumberOfBytesConverted == 0)
+ {
+ return EIsNotCharacterSet;
+ }
+ else
+ {
+ return EMaybeCharacterSet;
+ }
+ }
+
+/**
+ Check if UTF8 (reference CJKV by Ken Lunde page 189)
+ Only the first KMaxSizeAutoDetectSample bytes are classified, but the continuation
+ bytes of a sequence that starts inside the sample are read from aForeign even when
+ they lie beyond it, so a character cut in two by the sample limit is still recognised.
+ @param aForeign A sample of data to be checked
+ @return EIsCharacterSet if the whole sample is acceptable as UTF8, otherwise EIsNotCharacterSet
+ @internalTechnology
+ */
+enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectUtf8( const TDesC8& aForeign )
+    {
+    // totalLength bounds the look-ahead, sampleLength bounds the scan itself
+    const TInt totalLength = aForeign.Length();
+    const TInt sampleLength = Min(totalLength, KMaxSizeAutoDetectSample);
+
+    TInt i=0;
+    TText8 character;
+    TText8 characterPlus1;
+    TText8 characterPlus2;
+    TText8 characterPlus3;
+
+    // scan the sample text looking for valid UTF8
+    while ( i < sampleLength )
+        {
+        // get the next few characters, use 0 where the data has run out
+        // as this will not match any test. The look-ahead is limited by the
+        // real length of the data, not by the sample length: clamping it to
+        // the sample made a four byte sequence starting three bytes before
+        // the sample limit look invalid.
+        character = aForeign[i];
+        characterPlus1 = ( (i+1) < totalLength ? aForeign[i+1]:0);
+        characterPlus2 = ( (i+2) < totalLength ? aForeign[i+2]:0);
+        characterPlus3 = ( (i+3) < totalLength ? aForeign[i+3]:0);
+
+        // UTF8 range 110xxxxx followed by one valid UTF8 bytes
+        if(((character & 0xe0)==0xc0) && (( characterPlus1 & 0xc0)==0x80) )
+            {
+            // two bytes of valid UTF8 found
+            i+=2;
+            }
+        // UTF8 range 1110xxxx followed by two valid UTF8 bytes
+        else if(((character & 0xf0)==0xe0) && (( characterPlus1 & 0xc0)==0x80) && (( characterPlus2 & 0xc0)==0x80))
+            {
+            // three bytes of valid UTF8 found
+            i+=3;
+            }
+        // UTF8 range 11110xxx followed by three valid UTF8 bytes
+        else if(((character & 0xf8)==0xf0) && (( characterPlus1 & 0xc0)==0x80)
+            && (( characterPlus2 & 0xc0)==0x80) && (( characterPlus3 & 0xc0)==0x80) )
+            {
+            // four bytes of valid UTF8 found
+            i+=4;
+            }
+        // ascii range 0 to 0x7F
+        else if((character & 0x80)==0x00)
+            {
+            // The value of character is in the range 0x00-0x7f
+            // UTF8 maintains ASCII transparency. So it's a valid UTF8.
+            i++;
+            }
+        // Kept from the original implementation: anything is accepted in the
+        // last two positions of a full-size sample, because the data itself
+        // may stop in the middle of a character there.
+        else if( i >= (KMaxSizeAutoDetectSample -2) )
+            {
+            i++;
+            }
+        else
+            {
+            // This is not decoding as UTF8 so reject
+            return EIsNotCharacterSet;
+            }
+        }
+
+    // All the characters could be converted
+    return EIsCharacterSet;
+    }
+
+
+/**
+ Check if ISO2022JP by looking for the escape sequences.
+ The first recognised escape sequence found in the sample decides the result;
+ nothing else about the data is checked.
+ @param aForeign A sample of data to be checked; at most KMaxSizeAutoDetectSample bytes are examined
+ @return EIsCharacterSet if a recognised escape sequence is present, otherwise EIsNotCharacterSet
+ @internalTechnology
+ */
+enum CJ5Converter::TDectectCharacterSet CJ5Converter::DetectIso2022( const TDesC8& aForeign )
+    {
+    const TInt sampleLength = Min(aForeign.Length(), KMaxSizeAutoDetectSample);
+
+    // walk the sample; only the escape bytes are of any interest
+    for (TInt pos = 0; pos < sampleLength; pos++)
+        {
+        if (aForeign[pos] != KEscape)
+            {
+            continue;
+            }
+
+        // the bytes that follow the escape, or 0 past the end of the sample
+        // as 0 does not occur in any of the sequences
+        const TText8 next1 = ( pos < (sampleLength-1) ? aForeign[pos+1]:0);
+        const TText8 next2 = ( pos < (sampleLength-2) ? aForeign[pos+2]:0);
+        const TText8 next3 = ( pos < (sampleLength-3) ? aForeign[pos+3]:0);
+
+        // Three byte sequences (values taken from JISBASE_SHARED):
+        //   \x1b\x24\x40  Jis C6226_1978
+        //   \x1b\x24\x42  Jis X0208_1983
+        //   \x1b\x28\x4a  Jis Roman
+        //   \x1b\x28\x48  Jis RomanIncorrect
+        //   \x1b\x28\x42  Ascii
+        //   \x1b\x28\x49  HalfWidthKatakana
+        const TBool dollarSequence = (next1 == 0x24) && ((next2 == 0x40) || (next2 == 0x42));
+        const TBool bracketSequence = (next1 == 0x28) &&
+            ((next2 == 0x4A) || (next2 == 0x48) || (next2 == 0x42) || (next2 == 0x49));
+        if (dollarSequence || bracketSequence)
+            {
+            return EIsCharacterSet;
+            }
+
+        // Jis X0208_199x \x1b\x26\x40\x1b\x24\x42
+        if ((next1 == 0x26) && (next2 == 0x40))
+            {
+            const TText8 next4 = ( pos < (sampleLength-4) ? aForeign[pos+4]:0);
+            const TText8 next5 = ( pos < (sampleLength-5) ? aForeign[pos+5]:0);
+            if ((next3 == 0x1b) && (next4 == 0x24) && (next5 == 0x42))
+                {
+                return EIsCharacterSet;
+                }
+            continue;
+            }
+
+        // Jis X0212_1990 \x1b\x24\x28\x44
+        if ((next1 == 0x24) && (next2 == 0x28))
+            {
+            if (next3 == 0x44)
+                {
+                return EIsCharacterSet;
+                }
+            continue;
+            }
+
+        // an escape followed directly by 'B' (x42) or '@' (x40)
+        if ((next1 == 'B') || (next1 == '@'))
+            {
+            return EIsCharacterSet;
+            }
+        }
+
+    // no escape sequence has been found, so this is not ISO2022
+    return EIsNotCharacterSet;
+    }
+
+
+/**
+ Check if EUC JP (reference CJKV by Ken Lunde page 164)
+ Only the first KMaxSizeAutoDetectSample bytes are classified. A multi-byte character
+ that starts inside the sample but ends beyond it is judged on its real trailing bytes
+ (they are read from aForeign), so cutting a long text at the sample limit does not
+ make it look invalid.
+ @param aForeign A sample of data to be checked
+ @param aNumberOfBytesConverted On return, the number of sample bytes that decoded as
+        EUC JP before the scan stopped; never more than the sample length.
+ @return EIsCharacterSet if every byte of the sample decoded, otherwise EIsNotCharacterSet.
+        (EMaybeCharacterSet is never returned: each accepted character adds exactly the
+        number of bytes it occupies, so a scan that reaches the end has decoded everything.)
+ @internalTechnology
+ */
+CJ5Converter::TDectectCharacterSet CJ5Converter::DetectEucJp( const TDesC8& aForeign,TInt &aNumberOfBytesConverted )
+    {
+    // totalLength bounds the look-ahead, sampleLength bounds the scan itself
+    const TInt totalLength = aForeign.Length();
+    const TInt sampleLength = Min(totalLength, KMaxSizeAutoDetectSample);
+
+    TInt i=0;
+    aNumberOfBytesConverted = 0;
+
+    TText8 character;
+    TText8 characterPlus1;
+    TText8 characterPlus2;
+
+    // scan the sample text looking for valid EUC JP data
+    while ( i < sampleLength )
+        {
+        // get the next few characters, use 0 where the data has run out
+        // as this will not match any test.
+        character = aForeign[i];
+        characterPlus1 = ( (i+1) < totalLength ? aForeign[i+1]:0);
+        characterPlus2 = ( (i+2) < totalLength ? aForeign[i+2]:0);
+
+        // EUCJP code set 0 0x21-0x7e
+        if ( (character >= 0x21) && (character <= 0x7e))
+            {
+            aNumberOfBytesConverted++;
+            }
+        // line feed and carriage return
+        else if ( (character == 0x0a) || (character == 0x0d))
+            {
+            aNumberOfBytesConverted++;
+            }
+        // EUCJP code set 1, two bytes each in range A1-FE (0xFF is not a valid EUC JP byte)
+        else if ( (character >= 0xa1) && (character <= 0xfe)
+            && (characterPlus1 >= 0xa1) && (characterPlus1 <= 0xfe) )
+            {
+            aNumberOfBytesConverted+=2;
+            i++;
+            }
+        // EUC JP code set 2, starts with the EUC JP SS2 character (0x8E)
+        // and is followed by character in range 0xA1- 0xDF
+        else if ((character == 0x8E) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xDF) )
+            {
+            // this could be 2 bytes of EUC JP code set 2
+            aNumberOfBytesConverted += 2;
+            i++;
+            }
+        // EUC JP code set 3, starts with the EUC JP SS3 character (0x8F)
+        // and is followed by two characters each in range A1-FE
+        else if ((character == 0x8F) && (characterPlus1 >= 0xA1) && (characterPlus1 <= 0xFE)
+            && (characterPlus2 >= 0xA1) && (characterPlus2 <= 0xFE))
+            {
+            // this could be 3 bytes of EUC JP code set 3
+            aNumberOfBytesConverted += 3;
+            i+=2;
+            }
+        else
+            {
+            // This is not a valid decoding as EUC JP so reject; the count
+            // reached so far is left in aNumberOfBytesConverted
+            return EIsNotCharacterSet;
+            }
+        i++;
+        }
+
+    // The scan reached the end of the sample without a rejection, so every
+    // byte decoded. A character straddling the sample limit may have pushed
+    // the count past the sample length, so bring it back within the sample.
+    aNumberOfBytesConverted = Min(aNumberOfBytesConverted, sampleLength);
+    return EIsCharacterSet;
+    }
+
+
+/**
+ Convert from UCS2 (Universal Character Set containing two bytes) to unicode.
+ Each pair of input bytes is assembled into one 16 bit character according to the
+ requested endianness. Byte order marks (0xfeff) are consumed but not copied to the output.
+ Conversion stops when fewer than two input bytes remain or when aUnicode is full.
+ @param aDefaultEndiannessOfForeignCharacters ELittleEndian reads the low byte first;
+        any other value is treated as big endian.
+ @param aUnicode On return, contains the converted text (previous contents are discarded).
+ @param aForeign The UCS2 source text to be converted
+ @param aNumberOfUnconvertibleCharacters Always set to 0.
+ @param aIndexOfFirstByteOfFirstUnconvertibleCharacter Always set to -1, i.e. everything converted.
+ @return the number of unconverted bytes left at the end of the input descriptor
+ @internalTechnology
+ */
+ TInt CJ5Converter::ConvertUcs2ToUnicode(CCnvCharacterSetConverter::TEndianness& aDefaultEndiannessOfForeignCharacters,
+    TDes16& aUnicode,
+    const TDesC8& aForeign,
+    TInt& aNumberOfUnconvertibleCharacters,
+    TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+    {
+    const TInt inputSize = aForeign.Size();
+    const TInt outputCapacity = aUnicode.MaxLength();
+    const TBool lowByteFirst =
+        (aDefaultEndiannessOfForeignCharacters == CCnvCharacterSetConverter::ELittleEndian);
+
+    // start at the beginning of the output buffer provided
+    aUnicode.Zero();
+
+    TInt bytesConsumed = 0;
+    TInt charactersWritten = 0;
+
+    // carry on while a whole byte pair is left and the output has room
+    while ( (bytesConsumed + 1 < inputSize) && (charactersWritten < outputCapacity) )
+        {
+        const TInt firstByte = aForeign[bytesConsumed];
+        const TInt secondByte = aForeign[bytesConsumed + 1];
+
+        TInt codeUnit;
+        if (lowByteFirst)
+            {
+            // ELittleEndian 0x??00
+            codeUnit = firstByte + (secondByte << 8);
+            }
+        else
+            {
+            // EBigEndian 0x00??
+            codeUnit = (firstByte << 8) + secondByte;
+            }
+
+        // a BOM is swallowed, everything else goes to the output
+        if (codeUnit != KByteOrderMark)
+            {
+            aUnicode.Append( TChar(codeUnit) );
+            charactersWritten++;
+            }
+
+        bytesConsumed += 2;
+        }
+
+    // this conversion never reports an unconvertible character
+    aNumberOfUnconvertibleCharacters = 0;
+    // a negative value indicates that all characters converted
+    aIndexOfFirstByteOfFirstUnconvertibleCharacter = -1;
+
+    // what is left over: the input that did not fit into the output, and/or
+    // one odd byte if an odd number of bytes was provided for conversion
+    return inputSize - bytesConsumed;
+    }
+
+/**
+ Convert from EUC_JP (Extended Unix Code encoding for Japanese)
+ Using the standard Charconv method of an array of methods: a table of four
+ conversion methods is filled in and handed to CnvUtilities.
+ The state argument is not used (its name is commented out).
+ @return the value returned by CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign()
+ @internalTechnology
+ */
+ TInt CJ5Converter::ConvertEEucjpToUnicode(
+    CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters,
+    TDes16& aUnicode,
+    const TDesC8& aForeign,
+    TInt& /*aState*/,
+    TInt& aNumberOfUnconvertibleCharacters,
+    TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+    {
+    TFixedArray<CnvUtilities::SMethod, 4> methods;
+
+    // entry 0: JIS Roman, one byte per character
+    CnvUtilities::SMethod& jisRoman = methods[0];
+    jisRoman.iNumberOfBytesAbleToConvert = NumberOfBytesAbleToConvertToJisRoman;
+    jisRoman.iConvertToIntermediateBufferInPlace = DummyConvertToIntermediateBufferInPlace;
+    jisRoman.iConversionData = &CnvJisRoman::ConversionData();
+    jisRoman.iNumberOfBytesPerCharacter = 1;
+    jisRoman.iNumberOfCoreBytesPerCharacter = 1;
+
+    // entry 1: JIS X 0208, two bytes per character
+    CnvUtilities::SMethod& jisX0208 = methods[1];
+    jisX0208.iNumberOfBytesAbleToConvert = NumberOfBytesAbleToConvertToJisX0208;
+    jisX0208.iConvertToIntermediateBufferInPlace = ConvertToJisX0208FromEucJpPackedInPlace;
+    jisX0208.iConversionData = &CnvJisX0208::ConversionData();
+    jisX0208.iNumberOfBytesPerCharacter = 2;
+    jisX0208.iNumberOfCoreBytesPerCharacter = 2;
+
+    // entry 2: half width katakana, two bytes per character of which one is a core byte
+    CnvUtilities::SMethod& halfWidthKatakana = methods[2];
+    halfWidthKatakana.iNumberOfBytesAbleToConvert = NumberOfBytesAbleToConvertToHalfWidthKatakana8;
+    halfWidthKatakana.iConvertToIntermediateBufferInPlace = ConvertToHalfWidthKatakana8FromEucJpPackedInPlace;
+    halfWidthKatakana.iConversionData = &CnvHalfWidthKatakana8::ConversionData();
+    halfWidthKatakana.iNumberOfBytesPerCharacter = 2;
+    halfWidthKatakana.iNumberOfCoreBytesPerCharacter = 1;
+
+    // entry 3: JIS X 0212, three bytes per character of which two are core bytes
+    CnvUtilities::SMethod& jisX0212 = methods[3];
+    jisX0212.iNumberOfBytesAbleToConvert = NumberOfBytesAbleToConvertToJisX0212;
+    jisX0212.iConvertToIntermediateBufferInPlace = ConvertToJisX0212FromEucJpPackedInPlace;
+    jisX0212.iConversionData = &CnvJisX0212::ConversionData();
+    jisX0212.iNumberOfBytesPerCharacter = 3;
+    jisX0212.iNumberOfCoreBytesPerCharacter = 2;
+
+    return CnvUtilities::ConvertToUnicodeFromHeterogeneousForeign(
+        aDefaultEndiannessOfForeignCharacters, aUnicode, aForeign,
+        aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter,
+        methods.Array());
+    }
+