charconvfw/charconvplugins/src/shared/jisbase_shared.cpp
changeset 0 1fb32624e06b
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 /*
       
     2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 #include <e32std.h>
       
    20 #include <charconv.h>
       
    21 #include <convdata.h>
       
    22 #include <convutils.h>
       
    23 #include "jisx0201.h"
       
    24 #include "jisx0208.h"
       
    25 #include "jisx0212.h"
       
    26 #include "jisbase.h"
       
    27 
       
    28 const TUint KControlCharacterEscape=0x1b;
       
    29 const TUint KControlCharacterShiftOut=0x0e;
       
    30 const TUint KControlCharacterShiftIn=0x0f;
       
    31 const TUint KBitsForNonStandardStates=0x03;
       
    32 
       
    33 _LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a");
       
    34 _LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48");
       
    35 _LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42");
       
    36 _LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49");
       
    37 _LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40");
       
    38 _LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42");
       
    39 _LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42");
       
    40 _LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44");
       
    41 
       
    42 typedef TInt (*FChangeState)(TInt aState);
       
    43 typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags);
       
    44 
       
    45 enum TNonStandardState // each of these values must fit into KBitsForNonStandardStates and each must also be non-zero
       
    46 	{
       
    47 	ENonStandardStateJis7=1,
       
    48 	ENonStandardStateJis8
       
    49 	};
       
    50 
       
    51 
       
    52 LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange=
       
    53 	{
       
    54 	0x00,
       
    55 	0xff,
       
    56 	0,
       
    57 	0
       
    58 	};
       
    59 
       
    60 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange=
       
    61 	{
       
    62 	0x21,
       
    63 	0x5f,
       
    64 	SCnvConversionData::SOneDirectionData::SRange::EOffset,
       
    65 	0,
       
    66 	0,
       
    67 		{
       
    68 		STATIC_CAST(TUint, 65344),
       
    69 		0
       
    70 		}
       
    71 	};
       
    72 
       
    73 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange=
       
    74 	{
       
    75 	0xff61,
       
    76 	0xff9f,
       
    77 	SCnvConversionData::SOneDirectionData::SRange::EOffset,
       
    78 	1,
       
    79 	0,
       
    80 		{
       
    81 		STATIC_CAST(TUint, -65344),
       
    82 		0
       
    83 		}
       
    84 	};
       
    85 
       
    86 LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData=
       
    87 	{
       
    88 	SCnvConversionData::EUnspecified,
       
    89 		{
       
    90 		1,
       
    91 		&halfWidthKatakana7VariableByteDataRange
       
    92 		},
       
    93 		{
       
    94 		1,
       
    95 		&halfWidthKatakana7ToUnicodeDataRange
       
    96 		},
       
    97 		{
       
    98 		1,
       
    99 		&unicodeToHalfWidthKatakana7DataRange
       
   100 		}
       
   101 	};
       
   102 
       
   103 #if defined(_DEBUG)
       
   104 
       
   105 _LIT(KLitPanicText, "JISBASE_SHARED");
       
   106 
       
   107 enum TPanic
       
   108 	{
       
   109 	EPanicNotAppending1=1,
       
   110 	EPanicNotAppending2,
       
   111 	EPanicNotAppending3,
       
   112 	EPanicBadNonStandardState,
       
   113 	EPanicBadPointers1,
       
   114 	EPanicBadPointers2,
       
   115 	EPanicBadPointers3,
       
   116 	EPanicBadPointers4,
       
   117 	EPanicBadFunctionPointer
       
   118 	};
       
   119 
       
   120 LOCAL_C void Panic(TPanic aPanic)
       
   121 	{
       
   122 	User::Panic(KLitPanicText, aPanic);
       
   123 	}
       
   124 
       
   125 #endif
       
   126 
       
   127 TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState)
       
   128 	{
       
   129 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7;
       
   130 	}
       
   131 
       
   132 TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState)
       
   133 	{
       
   134 	return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8;
       
   135 	}
       
   136 
       
   137 TInt CnvJisBase::ChangeToStandardState(TInt)
       
   138 	{
       
   139 	return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman
       
   140 	}
       
   141 
       
   142 TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
       
   143 	{
       
   144 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1));
       
   145 	return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags);
       
   146 	}
       
   147 
       
   148 TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
       
   149 	{
       
   150 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2));
       
   151 	return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
       
   152 	}
       
   153 
       
   154 TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags)
       
   155 	{
       
   156 	__ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3));
       
   157 	return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags);
       
   158 	}
       
   159 
       
   160 EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter)
       
   161 	{
       
   162 	TFixedArray<CnvUtilities::SState, 8> states;
       
   163 	states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array
       
   164 	states[0].iConversionData=&CnvJisRoman::ConversionData();
       
   165 	states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect;
       
   166 	states[1].iConversionData=&CnvJisRoman::ConversionData();	
       
   167 	states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii;
       
   168 	states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData();
       
   169 	states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana;
       
   170 	states[3].iConversionData=&halfWidthKatakana7ConversionData;
       
   171 	states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978;
       
   172 	states[4].iConversionData=&CnvJisX0208::ConversionData();
       
   173 	states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983;
       
   174 	states[5].iConversionData=&CnvJisX0208::ConversionData();
       
   175 	states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x;
       
   176 	states[6].iConversionData=&CnvJisX0208::ConversionData();
       
   177 	states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990;
       
   178 	states[7].iConversionData=&CnvJisX0212::ConversionData();
       
   179 	const TArray<CnvUtilities::SState> arrayOfStates(states.Array());
       
   180 	aUnicode.SetLength(0);
       
   181 	const TUint8* const pointerToFirstByte=aForeign.Ptr();
       
   182 	const TUint8* pointerToCurrentByte=pointerToFirstByte;
       
   183 	const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte;
       
   184 	const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1);
       
   185 	TUint outputConversionFlags=0;
       
   186 	TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend;
       
   187 	FOREVER
       
   188 		{
       
   189 		FChangeState changeState=NULL;
       
   190 		FAppendConvertToUnicode appendConvertToUnicode=NULL;
       
   191 		TBool skipThisByte=EFalse;
       
   192 		const TUint currentByte=*pointerToCurrentByte;
       
   193 		switch (aState&KBitsForNonStandardStates)
       
   194 			{
       
   195 		case 0:
       
   196 			if (currentByte==KControlCharacterShiftOut)
       
   197 				{
       
   198 				changeState=ChangeToNonStandardStateJis7;
       
   199 				skipThisByte=ETrue;
       
   200 				}
       
   201 			else if (currentByte&0x80)
       
   202 				{
       
   203 				changeState=ChangeToNonStandardStateJis8;
       
   204 				}
       
   205 			appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign;
       
   206 			break;
       
   207 		case ENonStandardStateJis7:
       
   208 			if (currentByte==KControlCharacterEscape)
       
   209 				{
       
   210 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
       
   211 				}
       
   212 			else if (currentByte==KControlCharacterShiftIn)
       
   213 				{
       
   214 				changeState=ChangeToStandardState;
       
   215 				skipThisByte=ETrue;
       
   216 				}
       
   217 			else if (currentByte&0x80)
       
   218 				{
       
   219 				changeState=ChangeToNonStandardStateJis8;
       
   220 				}
       
   221 			appendConvertToUnicode=AppendConvertToUnicodeFromJis7;
       
   222 			break;
       
   223 		case ENonStandardStateJis8:
       
   224 			if (currentByte==KControlCharacterEscape)
       
   225 				{
       
   226 				changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes
       
   227 				}
       
   228 			else if (currentByte==KControlCharacterShiftOut)
       
   229 				{
       
   230 				changeState=ChangeToNonStandardStateJis7;
       
   231 				skipThisByte=ETrue;
       
   232 				}
       
   233 			else if ((currentByte&0x80)==0)
       
   234 				{
       
   235 				changeState=ChangeToStandardState;
       
   236 				}
       
   237 			appendConvertToUnicode=AppendConvertToUnicodeFromJis8;
       
   238 			break;
       
   239 #if defined(_DEBUG)
       
   240 		default:
       
   241 			Panic(EPanicBadNonStandardState);
       
   242 			break;
       
   243 #endif
       
   244 			}
       
   245 		__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1));
       
   246 		if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL))
       
   247 			{
       
   248 			TBool lastIteration=EFalse;
       
   249 			__ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2));
       
   250 			if (changeState==NULL)
       
   251 				{
       
   252 				++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte
       
   253 				lastIteration=ETrue;
       
   254 				}
       
   255 			if (pointerToCurrentByte>pointerToStartOfNextRunToConvert)
       
   256 				{
       
   257 				TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert);
       
   258 				TInt numberOfUnconvertibleCharacters;
       
   259 				TInt indexOfFirstByteOfFirstUnconvertibleCharacter;
       
   260 				__ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer));
       
   261 				const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags);
       
   262 				if (returnValue<0)
       
   263 					{
       
   264 					return returnValue; // this is an error-code
       
   265 					}
       
   266 				if (numberOfUnconvertibleCharacters>0)
       
   267 					{
       
   268 					if (aNumberOfUnconvertibleCharacters==0)
       
   269 						{
       
   270 						aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter;
       
   271 						}
       
   272 					aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters;
       
   273 					}
       
   274 				if (returnValue>0)
       
   275 					{
       
   276 					pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below
       
   277 					lastIteration=ETrue;
       
   278 					changeState=NULL;
       
   279 					skipThisByte=EFalse;
       
   280 					}
       
   281 				__ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3));
       
   282 				if (pointerToCurrentByte>pointerToFirstByte)
       
   283 					{
       
   284 					inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable;
       
   285 					}
       
   286 				}
       
   287 			if (changeState!=NULL)
       
   288 				{
       
   289 				aState=(*changeState)(aState);
       
   290 				}
       
   291 			if (skipThisByte)
       
   292 				{
       
   293 				if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue
       
   294 					{
       
   295 					lastIteration=ETrue;
       
   296 					}
       
   297 				++pointerToCurrentByte;
       
   298 				}
       
   299 			pointerToStartOfNextRunToConvert=pointerToCurrentByte;
       
   300 			if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse)
       
   301 				{
       
   302 				break;
       
   303 				}
       
   304 			__ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4));
       
   305 			if (pointerToCurrentByte>=pointerToLastByte)
       
   306 				{
       
   307 				break;
       
   308 				}
       
   309 			}
       
   310 		++pointerToCurrentByte;
       
   311 		}
       
   312 	// no checking with outputConversionFlags need to be done here
       
   313 	return pointerToLastByte-(pointerToCurrentByte-1);
       
   314 	}
       
   315 
       
   316 EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData()
       
   317 	{
       
   318 	return halfWidthKatakana7ConversionData;
       
   319 	}
       
   320 
       
   321 EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) 
       
   322 	{
       
   323 	// JIS is modal... so start off with a confidence of 0 and to begin with look 
       
   324 	// for JIS escape sequences....Escape sequences defined above in the KLITs
       
   325 	// For each escape sequence, increase the confidenceLevel ..... 
       
   326 	aConfidenceLevel = 55;
       
   327 	TInt jisRomanResult = 0;
       
   328 	TInt asciiResult = 0;
       
   329 	TInt jisX0208Result = 0;
       
   330 	TInt jisC6226Result = 0;
       
   331 	TInt jixX0212Result = 0;
       
   332 	TInt hwKanaResult = 0;
       
   333 
       
   334 	TInt EscSequences = 0;
       
   335 	
       
   336 	TInt sampleLength = aSample.Length();
       
   337 	for (TInt i = 0; i < sampleLength; ++i)
       
   338 		{
       
   339 	
       
   340 		// JIS is 7 bit encoding
       
   341 		if((aSample[i]&0x80)!=0x00)
       
   342 			{
       
   343 			aConfidenceLevel=0;
       
   344 			break;
       
   345 			}
       
   346 		// JIS supports the following character sets 
       
   347 		if (i > jisC6226Result)
       
   348 			{
       
   349 			jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978);
       
   350 			if (jisC6226Result!=KErrNotFound)
       
   351 				EscSequences += 15; 
       
   352 			}
       
   353 
       
   354 		if (i > jisRomanResult)
       
   355 			{
       
   356 			jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman);
       
   357 			if (jisRomanResult!=KErrNotFound)
       
   358 				EscSequences += 15; 
       
   359 			}
       
   360 
       
   361 		if (i > asciiResult)
       
   362 			{
       
   363 			asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii);
       
   364 			if (asciiResult!=KErrNotFound)
       
   365 				EscSequences += 15; 
       
   366 			}
       
   367 
       
   368 		if (i > jisX0208Result)
       
   369 			{
       
   370 			jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983);
       
   371 			if (jisX0208Result!=KErrNotFound)
       
   372 				EscSequences += 15; 
       
   373 			}
       
   374 
       
   375 		if (i > jixX0212Result)
       
   376 			{
       
   377 			jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990);
       
   378 			if (jixX0212Result!=KErrNotFound)
       
   379 				EscSequences += 15; 
       
   380 			}
       
   381 
       
   382 		if (i > hwKanaResult)
       
   383 			{
       
   384 			hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana);
       
   385 			if (hwKanaResult!=KErrNotFound)
       
   386 				EscSequences += 15; 
       
   387 			}
       
   388 		}
       
   389 
       
   390 	aConfidenceLevel = 0 < sampleLength?
       
   391 		aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90;
       
   392 	aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel;
       
   393 	}