charconvfw/charconvplugins/src/shared/JISBASE_SHARED_2.CPP
changeset 0 1fb32624e06b
child 37 6be019398652
child 40 91ef7621b7fc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/charconvfw/charconvplugins/src/shared/JISBASE_SHARED_2.CPP	Tue Feb 02 02:02:46 2010 +0200
@@ -0,0 +1,447 @@
+/*
+* Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
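+* Description:  Shared base routines for JIS-based character-set conversion: conversion to Unicode (modal, JIS7 and JIS8 runs) and JIS auto-detection.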
+*
+*/
+
+
+#include "PictographObserver.h"
+#include <e32std.h>
+#include <charconv.h>
+#include <convdata.h>
+#include <convutils.h>
+#include "jisx0201.h"
+#include "jisx0208.h"
+#include "jisx0212.h"
+#include "jisbase.h"
+#include "featmgr/featmgr.h"
+
+const TUint KControlCharacterEscape=0x1b; // ESC: in the JIS7/JIS8 states of CnvJisBase::ConvertToUnicode this byte ends the current run
+const TUint KControlCharacterShiftOut=0x0e; // SO: switches CnvJisBase::ConvertToUnicode into the non-standard JIS7 state (the byte itself is skipped)
+const TUint KControlCharacterShiftIn=0x0f; // SI: switches CnvJisBase::ConvertToUnicode back to the standard state (the byte itself is skipped)
+const TUint KBitsForNonStandardStates=0x03; // mask for the low two bits of the conversion state, which hold a TNonStandardState value (zero meaning "standard state")
+
+_LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a"); // ESC ( J
+_LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48"); // ESC ( H - mapped to the same JIS-Roman conversion data as ESC ( J in CnvJisBase::ConvertToUnicode
+_LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42"); // ESC ( B
+_LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49"); // ESC ( I
+_LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40"); // ESC $ @
+_LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42"); // ESC $ B
+_LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42"); // ESC & @ ESC $ B
+_LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44"); // ESC $ ( D
+
+typedef TInt (*FChangeState)(TInt aState); // maps the current conversion state to the next one (see the CnvJisBase::ChangeTo... functions)
+typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags); // common signature of the CnvJisBase::AppendConvertToUnicodeFrom... functions, so CnvJisBase::ConvertToUnicode can pick one per run
+
+enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
+	{
+	ENonStandardStateJis7=1, // entered on Shift-Out; runs are converted with halfWidthKatakana7ConversionData
+	ENonStandardStateJis8 // entered on a byte with the top bit set; runs are converted with CnvHalfWidthKatakana8::ConversionData()
+	};
+
+
+LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange= // the single variable-byte range referenced by halfWidthKatakana7ConversionData
+	{
+	0x00, // NOTE(review): presumably the first initial-byte value covered by the range - confirm against SRange in convdata.h
+	0xff, // NOTE(review): presumably the last initial-byte value covered by the range - confirm
+	0, // NOTE(review): presumably the number of subsequent bytes, i.e. every character is a single byte - confirm
+	0
+	};
+
+LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange= // foreign-to-Unicode range referenced by halfWidthKatakana7ConversionData
+	{
+	0x21, // first input (7-bit) code of the range
+	0x5f, // last input (7-bit) code of the range
+	SCnvConversionData::SOneDirectionData::SRange::EOffset, // NOTE(review): presumably "convert by adding the offset below" - confirm against convdata.h
+	0,
+	0,
+		{
+		STATIC_CAST(TUint, 65344), // 65344==0xff40, so 0x21..0x5f correspond to 0xff61..0xff9f (the input range of unicodeToHalfWidthKatakana7DataRange)
+		0
+		}
+	};
+
+LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange= // Unicode-to-foreign range referenced by halfWidthKatakana7ConversionData
+	{
+	0xff61, // first Unicode code of the range
+	0xff9f, // last Unicode code of the range
+	SCnvConversionData::SOneDirectionData::SRange::EOffset, // NOTE(review): presumably "convert by adding the offset below" - confirm against convdata.h
+	1, // NOTE(review): presumably the size in bytes of each output foreign character - confirm against convdata.h
+	0,
+		{
+		STATIC_CAST(TUint, -65344), // the negation of the offset in halfWidthKatakana7ToUnicodeDataRange, so 0xff61..0xff9f correspond to 0x21..0x5f
+		0
+		}
+	};
+
+LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData= // conversion data for 7-bit half-width Katakana; used for JIS7 runs and for the ESC ( I state, and exported via CnvJisBase::HalfWidthKatakana7ConversionData()
+	{
+	SCnvConversionData::EUnspecified, // NOTE(review): presumably the endianness of foreign characters - confirm against convdata.h
+		{
+		1, // one variable-byte range
+		&halfWidthKatakana7VariableByteDataRange
+		},
+		{
+		1, // one foreign-to-Unicode range
+		&halfWidthKatakana7ToUnicodeDataRange
+		},
+		{
+		1, // one Unicode-to-foreign range
+		&unicodeToHalfWidthKatakana7DataRange
+		}
+	};
+
+#if defined(_DEBUG)
+
+_LIT(KLitPanicText, "JISBASE_SHARED"); // panic category passed to User::Panic() by Panic() below
+
+enum TPanic // panic reasons raised by the debug-only assertions in this file
+	{
+	EPanicNotAppending1=1, // AppendConvertToUnicodeFromModalForeign called without EInputConversionFlagAppend
+	EPanicNotAppending2, // AppendConvertToUnicodeFromJis7 called without EInputConversionFlagAppend
+	EPanicNotAppending3, // AppendConvertToUnicodeFromJis8 called without EInputConversionFlagAppend
+	EPanicBadNonStandardState, // the low bits of the state hold a value that is not a known TNonStandardState
+	EPanicBadPointers1, // current-byte pointer is beyond the last byte at the top of the scanning loop
+	EPanicBadPointers2, // current-byte pointer is before the start of the pending run
+	EPanicBadPointers3, // current-byte pointer is before the first byte after backing up over unconverted bytes
+	EPanicBadPointers4, // current-byte pointer is beyond the last byte although the loop is not finishing
+	EPanicBadFunctionPointer // no conversion function was selected for the pending run
+	};
+
+LOCAL_C void Panic(TPanic aPanic) // raises a panic with category KLitPanicText and reason aPanic
+	{
+	User::Panic(KLitPanicText, aPanic);
+	}
+
+#endif
+
+// Returns aState with its non-standard-state bits (KBitsForNonStandardStates)
+// replaced by ENonStandardStateJis7; all other bits are left untouched.
+TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
+	{
+	const TUint bitsToKeep=aState&~KBitsForNonStandardStates;
+	return bitsToKeep|ENonStandardStateJis7;
+	}
+
+// Returns aState with its non-standard-state bits (KBitsForNonStandardStates)
+// replaced by ENonStandardStateJis8; all other bits are left untouched.
+TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
+	{
+	const TUint bitsToKeep=aState&~KBitsForNonStandardStates;
+	return bitsToKeep|ENonStandardStateJis8;
+	}
+
+// Returns the state to use after leaving a JIS7 or JIS8 run. The incoming state
+// is deliberately ignored and the default state is always returned.
+// Original author's note: the expected behaviour was to return
+// "aState&~KBitsForNonStandardStates", but Ken Lunde (asked by email) said that
+// after a run of JIS7 or JIS8 the bytes should always be interpreted as JIS-Roman.
+TInt CnvJisBase::ChangeToStandardState(TInt)
+	{
+	const TInt stateAfterNonStandardRun=CCnvCharacterSetConverter::KStateDefault;
+	return stateAfterNonStandardRun;
+	}
+
+// Converts a run of modal (escape-sequence driven) foreign text, appending the
+// result to aUnicode. This is a thin wrapper that forwards every argument to
+// CnvUtilities::ConvertToUnicodeFromModalForeign and returns its result.
+// In debug builds it panics if aInputConversionFlags lacks EInputConversionFlagAppend.
+TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
+	{
+	__ASSERT_DEBUG((aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)!=0, Panic(EPanicNotAppending1));
+	const TInt result=CnvUtilities::ConvertToUnicodeFromModalForeign(
+		aDefaultEndiannessOfForeignCharacters,
+		aUnicode,
+		aModalForeign,
+		aState,
+		aNumberOfUnconvertibleCharacters,
+		aIndexOfFirstByteOfFirstUnconvertibleCharacter,
+		aArrayOfStates,
+		aOutputConversionFlags,
+		aInputConversionFlags);
+	return result;
+	}
+
+// Converts a run of JIS7 bytes with halfWidthKatakana7ConversionData, appending
+// the result to aUnicode. The state and array-of-states parameters exist only to
+// match the FAppendConvertToUnicode signature and are not used.
+// In debug builds it panics if aInputConversionFlags lacks EInputConversionFlagAppend.
+TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
+	{
+	__ASSERT_DEBUG((aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)!=0, Panic(EPanicNotAppending2));
+	const SCnvConversionData& conversionData=halfWidthKatakana7ConversionData;
+	const TInt result=CCnvCharacterSetConverter::DoConvertToUnicode(
+		conversionData,
+		aDefaultEndiannessOfForeignCharacters,
+		aUnicode,
+		aJis7,
+		aNumberOfUnconvertibleCharacters,
+		aIndexOfFirstByteOfFirstUnconvertibleCharacter,
+		aOutputConversionFlags,
+		aInputConversionFlags);
+	return result;
+	}
+
+// Converts a run of JIS8 bytes with CnvHalfWidthKatakana8::ConversionData(),
+// appending the result to aUnicode. The state and array-of-states parameters exist
+// only to match the FAppendConvertToUnicode signature and are not used.
+// In debug builds it panics if aInputConversionFlags lacks EInputConversionFlagAppend.
+TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
+	{
+	__ASSERT_DEBUG((aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend)!=0, Panic(EPanicNotAppending3));
+	const SCnvConversionData& conversionData=CnvHalfWidthKatakana8::ConversionData();
+	const TInt result=CCnvCharacterSetConverter::DoConvertToUnicode(
+		conversionData,
+		aDefaultEndiannessOfForeignCharacters,
+		aUnicode,
+		aJis8,
+		aNumberOfUnconvertibleCharacters,
+		aIndexOfFirstByteOfFirstUnconvertibleCharacter,
+		aOutputConversionFlags,
+		aInputConversionFlags);
+	return result;
+	}
+
+// Appends one modal state (an escape sequence plus the conversion data it selects)
+// to aStates. Does nothing if aError already holds an error; otherwise aError
+// receives the result of RArray::Append().
+LOCAL_C void AppendState(TInt& aError, RArray<CnvUtilities::SState>& aStates, const TDesC8& aEscapeSequence, const SCnvConversionData& aConversionData)
+	{
+	if (aError==KErrNone)
+		{
+		CnvUtilities::SState state;
+		state.iEscapeSequence=&aEscapeSequence;
+		state.iConversionData=&aConversionData;
+		aError=aStates.Append(state);
+		}
+	}
+
+// Converts JIS-encoded text to Unicode.
+//
+// The input is split into runs. A run in the standard state is converted as modal
+// (escape-sequence driven) text; Shift-Out starts a JIS7 run, a byte with the top
+// bit set starts a JIS8 run, and Shift-In/Escape/7-bit bytes end those runs as coded
+// in the switch below. Each run is converted with the function matching the state
+// it was collected in, and the result is appended to aUnicode.
+//
+// aDefaultEndiannessOfForeignCharacters  passed through to the run converters
+// aUnicode                               output; emptied first, then appended to
+// aForeign                               the text to convert
+// aState                                 in/out conversion state; its low bits
+//                                        (KBitsForNonStandardStates) hold a TNonStandardState
+// aNumberOfUnconvertibleCharacters       incremented by the count from each run; it is not
+//                                        reset here, so the caller must initialise it
+// aIndexOfFirstByteOfFirstUnconvertibleCharacter
+//                                        set (relative to the start of aForeign) when the first
+//                                        unconvertible character is met while the count is still zero
+//
+// Returns a negative error code (from a run converter, or from building the state
+// array), otherwise the number of bytes at the end of aForeign left unconverted.
+EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
+	{
+	aUnicode.SetLength(0);
+	if (aForeign.Length()==0)
+		{
+		return 0; // nothing to convert - the scanning loop below reads the current byte unconditionally, so it must not be entered for an empty descriptor
+		}
+	const TBool pictographsSupported=FeatureManager::FeatureSupported(KFeatureIdJapanesePicto);
+	RArray<CnvUtilities::SState> states;
+	TInt error=KErrNone;
+	AppendState(error, states, KLit8EscapeSequenceForJisRoman, CnvJisRoman::ConversionData()); // Jis-Roman is the default state, so it must come first in the array
+	AppendState(error, states, KLit8EscapeSequenceForJisRomanIncorrect, CnvJisRoman::ConversionData());
+	AppendState(error, states, KLit8EscapeSequenceForAscii, CCnvCharacterSetConverter::AsciiConversionData());
+	if (pictographsSupported && (error==KErrNone))
+		{
+		SetStatesForPictograph(states); // the pictograph states sit between the ASCII and half-width Katakana states
+		}
+	AppendState(error, states, KLit8EscapeSequenceForHalfWidthKatakana, halfWidthKatakana7ConversionData);
+	AppendState(error, states, KLit8EscapeSequenceForJisC6226_1978, CnvJisX0208::ConversionData());
+	AppendState(error, states, KLit8EscapeSequenceForJisX0208_1983, CnvJisX0208::ConversionData());
+	AppendState(error, states, KLit8EscapeSequenceForJisX0208_199x, CnvJisX0208::ConversionData());
+	AppendState(error, states, KLit8EscapeSequenceForJisX0212_1990, CnvJisX0212::ConversionData());
+	if (error!=KErrNone)
+		{
+		states.Close(); // do not convert with an incomplete state table
+		return error;
+		}
+	const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
+	const TUint8* const pointerToFirstByte=aForeign.Ptr();
+	const TUint8* pointerToCurrentByte=pointerToFirstByte;
+	const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
+	const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
+	TUint outputConversionFlags=0;
+	TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
+	FOREVER
+		{
+		FChangeState changeState=NULL; // non-NULL when the current byte ends the pending run
+		FAppendConvertToUnicode appendConvertToUnicode=NULL; // converter for the pending run, chosen from the state the run was collected in
+		TBool skipThisByte=EFalse; // ETrue when the current byte is a shift character that must not itself be converted
+		const TUint currentByte=*pointerToCurrentByte;
+		switch (aState&KBitsForNonStandardStates)
+			{
+		case 0:
+			if (currentByte==KControlCharacterShiftOut)
+				{
+				changeState=ChangeToNonStandardStateJis7;
+				skipThisByte=ETrue;
+				}
+			else if (pictographsSupported && (currentByte==KControlCharacterShiftIn))
+				{
+				changeState=ChangeToStandardState;
+				skipThisByte=ETrue;
+				}
+			else if (currentByte&0x80)
+				{
+				changeState=ChangeToNonStandardStateJis8;
+				}
+			appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
+			break;
+		case ENonStandardStateJis7:
+			if (currentByte==KControlCharacterEscape)
+				{
+				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
+				}
+			else if (currentByte==KControlCharacterShiftIn)
+				{
+				changeState=ChangeToStandardState;
+				skipThisByte=ETrue;
+				}
+			else if (currentByte&0x80)
+				{
+				changeState=ChangeToNonStandardStateJis8;
+				}
+			appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
+			break;
+		case ENonStandardStateJis8:
+			if (currentByte==KControlCharacterEscape)
+				{
+				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
+				}
+			else if (currentByte==KControlCharacterShiftOut)
+				{
+				changeState=ChangeToNonStandardStateJis7;
+				skipThisByte=ETrue;
+				}
+			else if ((currentByte&0x80)==0)
+				{
+				changeState=ChangeToStandardState;
+				}
+			appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
+			break;
+#if defined(_DEBUG)
+		default:
+			Panic(EPanicBadNonStandardState);
+			break;
+#endif
+			}
+		__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
+		if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
+			{
+			// the pending run ends here, either because the input is exhausted or because the current byte changes the state
+			TBool lastIteration=EFalse;
+			__ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
+			if (changeState==NULL)
+				{
+				++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
+				lastIteration=ETrue;
+				}
+			if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
+				{
+				TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
+				TInt numberOfUnconvertibleCharacters;
+				TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
+				__ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
+				const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
+				if (returnValue<0)
+					{
+					states.Close(); // release the state array on the error path as well
+					return returnValue; // this is an error-code
+					}
+				if (numberOfUnconvertibleCharacters>0)
+					{
+					if (aNumberOfUnconvertibleCharacters==0)
+						{
+						// the run converter's index is relative to the run, so rebase it onto aForeign
+						aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
+						}
+					aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
+					}
+				if (returnValue>0)
+					{
+					// part of the run was left unconverted: back up over it, stop, and do not apply the pending state change
+					pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
+					lastIteration=ETrue;
+					changeState=NULL;
+					skipThisByte=EFalse;
+					}
+				__ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
+				if (pointerToCurrentByte>pointerToFirstByte)
+					{
+					inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
+					}
+				}
+			if (changeState!=NULL)
+				{
+				aState=(*changeState)(aState);
+				}
+			if (skipThisByte)
+				{
+				if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
+					{
+					lastIteration=ETrue;
+					}
+				++pointerToCurrentByte;
+				}
+			pointerToStartOfNextRunToConvert=pointerToCurrentByte;
+			if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
+				{
+				break;
+				}
+			__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
+			if (pointerToCurrentByte>=pointerToLastByte)
+				{
+				break;
+				}
+			}
+		++pointerToCurrentByte;
+		}
+
+	states.Close();
+	// no checking with outputConversionFlags need to be done here
+	return pointerToLastByte-(pointerToCurrentByte-1); // number of bytes from pointerToCurrentByte to the end of aForeign
+	}
+
+EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData() // exported accessor for the file-local 7-bit half-width Katakana conversion data
+	{
+	return halfWidthKatakana7ConversionData;
+	}
+
+// Estimates how likely it is that aSample is JIS encoded and writes the result
+// (0..100) to aConfidenceLevel.
+//
+// The estimate starts at a base of 55. Each successful search for one of six JIS
+// escape sequences adds 15 to a score, which is then scaled by 100/sample-length
+// and added to the base. The sum is capped at 100. A byte with the top bit set
+// stops the scan and resets the base to 0 (JIS is a 7-bit encoding); an empty
+// sample yields 90.
+//
+// NOTE(review): the scoring below is reproduced exactly as it has always
+// behaved, including the following points that look unintended and should be
+// confirmed before anybody relies on the precise numbers:
+//  - the stored search results start at 0 and a search only runs when the index
+//    is strictly greater than the stored result, so nothing is searched at index
+//    0 and a sequence starting at the very first byte is not matched there;
+//  - Find() is applied to the tail of the sample starting at the current index,
+//    so the stored result is presumably relative to that tail, yet it is
+//    compared against the absolute index;
+//  - after a failed search the stored result is KErrNotFound, so that sequence
+//    is searched for again at every later index;
+//  - when a top-bit-set byte is met only the base is zeroed - any score gathered
+//    so far is still added afterwards.
+EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample)
+	{
+	enum {ENumberOfEscapeSequences=6};
+	const TDesC8* const escapeSequences[ENumberOfEscapeSequences]=
+		{
+		&KLit8EscapeSequenceForJisC6226_1978,
+		&KLit8EscapeSequenceForJisRoman,
+		&KLit8EscapeSequenceForAscii,
+		&KLit8EscapeSequenceForJisX0208_1983,
+		&KLit8EscapeSequenceForJisX0212_1990,
+		&KLit8EscapeSequenceForHalfWidthKatakana
+		};
+	TInt lastFindResult[ENumberOfEscapeSequences]; // most recent Find() result for each escape sequence
+	TInt sequence;
+	for (sequence=0; sequence<ENumberOfEscapeSequences; ++sequence)
+		{
+		lastFindResult[sequence]=0;
+		}
+
+	TInt confidence=55;
+	TInt escapeSequenceScore=0;
+	const TInt sampleLength=aSample.Length();
+	for (TInt index=0; index<sampleLength; ++index)
+		{
+		if (aSample[index]&0x80)
+			{
+			// JIS is a 7-bit encoding
+			confidence=0;
+			break;
+			}
+		const TPtrC8 tail(aSample.Right(sampleLength-index));
+		for (sequence=0; sequence<ENumberOfEscapeSequences; ++sequence)
+			{
+			if (index<=lastFindResult[sequence])
+				{
+				continue; // not yet past the last stored result for this sequence
+				}
+			lastFindResult[sequence]=tail.Find(*escapeSequences[sequence]);
+			if (lastFindResult[sequence]!=KErrNotFound)
+				{
+				escapeSequenceScore+=15;
+				}
+			}
+		}
+
+	if (sampleLength>0)
+		{
+		confidence+=(escapeSequenceScore*100)/sampleLength;
+		}
+	else
+		{
+		confidence=90;
+		}
+	if (confidence>100)
+		{
+		confidence=100;
+		}
+	aConfidenceLevel=confidence;
+	}