|
1 /* |
|
2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include <e32std.h> |
|
20 #include <charconv.h> |
|
21 #include <convdata.h> |
|
22 #include <convutils.h> |
|
23 #include "jisx0201.h" |
|
24 #include "jisx0208.h" |
|
25 #include "jisx0212.h" |
|
26 #include "jisbase.h" |
|
27 |
|
28 const TUint KControlCharacterEscape=0x1b; |
|
29 const TUint KControlCharacterShiftOut=0x0e; |
|
30 const TUint KControlCharacterShiftIn=0x0f; |
|
31 const TUint KBitsForNonStandardStates=0x03; |
|
32 |
|
33 _LIT8(KLit8EscapeSequenceForJisRoman, "\x1b\x28\x4a"); |
|
34 _LIT8(KLit8EscapeSequenceForJisRomanIncorrect, "\x1b\x28\x48"); |
|
35 _LIT8(KLit8EscapeSequenceForAscii, "\x1b\x28\x42"); |
|
36 _LIT8(KLit8EscapeSequenceForHalfWidthKatakana, "\x1b\x28\x49"); |
|
37 _LIT8(KLit8EscapeSequenceForJisC6226_1978, "\x1b\x24\x40"); |
|
38 _LIT8(KLit8EscapeSequenceForJisX0208_1983, "\x1b\x24\x42"); |
|
39 _LIT8(KLit8EscapeSequenceForJisX0208_199x, "\x1b\x26\x40\x1b\x24\x42"); |
|
40 _LIT8(KLit8EscapeSequenceForJisX0212_1990, "\x1b\x24\x28\x44"); |
|
41 |
|
42 typedef TInt (*FChangeState)(TInt aState); |
|
43 typedef TInt (*FAppendConvertToUnicode)(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags); |
|
44 |
|
// Values stored in the KBitsForNonStandardStates bits of the conversion state.
// Every enumerator must fit inside that mask and must be non-zero, because zero
// in those bits selects the standard (modal) handling.
enum TNonStandardState
	{
	ENonStandardStateJis7=1,
	ENonStandardStateJis8=2
	};
|
50 |
|
51 |
|
52 LOCAL_D const SCnvConversionData::SVariableByteData::SRange halfWidthKatakana7VariableByteDataRange= |
|
53 { |
|
54 0x00, |
|
55 0xff, |
|
56 0, |
|
57 0 |
|
58 }; |
|
59 |
|
60 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange halfWidthKatakana7ToUnicodeDataRange= |
|
61 { |
|
62 0x21, |
|
63 0x5f, |
|
64 SCnvConversionData::SOneDirectionData::SRange::EOffset, |
|
65 0, |
|
66 0, |
|
67 { |
|
68 STATIC_CAST(TUint, 65344), |
|
69 0 |
|
70 } |
|
71 }; |
|
72 |
|
73 LOCAL_D const SCnvConversionData::SOneDirectionData::SRange unicodeToHalfWidthKatakana7DataRange= |
|
74 { |
|
75 0xff61, |
|
76 0xff9f, |
|
77 SCnvConversionData::SOneDirectionData::SRange::EOffset, |
|
78 1, |
|
79 0, |
|
80 { |
|
81 STATIC_CAST(TUint, -65344), |
|
82 0 |
|
83 } |
|
84 }; |
|
85 |
|
86 LOCAL_D const SCnvConversionData halfWidthKatakana7ConversionData= |
|
87 { |
|
88 SCnvConversionData::EUnspecified, |
|
89 { |
|
90 1, |
|
91 &halfWidthKatakana7VariableByteDataRange |
|
92 }, |
|
93 { |
|
94 1, |
|
95 &halfWidthKatakana7ToUnicodeDataRange |
|
96 }, |
|
97 { |
|
98 1, |
|
99 &unicodeToHalfWidthKatakana7DataRange |
|
100 } |
|
101 }; |
|
102 |
|
#ifdef _DEBUG

// Panic category used by the debug-only assertions in this file.
_LIT(KLitPanicText, "JISBASE_SHARED");

// Panic reasons; the numeric values are what appears next to the category.
enum TPanic
	{
	EPanicNotAppending1=1,
	EPanicNotAppending2=2,
	EPanicNotAppending3=3,
	EPanicBadNonStandardState=4,
	EPanicBadPointers1=5,
	EPanicBadPointers2=6,
	EPanicBadPointers3=7,
	EPanicBadPointers4=8,
	EPanicBadFunctionPointer=9
	};

// Raises a "JISBASE_SHARED" panic with the given reason.
LOCAL_C void Panic(TPanic aPanic)
	{
	User::Panic(KLitPanicText, aPanic);
	}

#endif // _DEBUG
|
126 |
|
127 TInt CnvJisBase::ChangeToNonStandardStateJis7(TInt aState) |
|
128 { |
|
129 return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis7; |
|
130 } |
|
131 |
|
132 TInt CnvJisBase::ChangeToNonStandardStateJis8(TInt aState) |
|
133 { |
|
134 return (aState&~KBitsForNonStandardStates)|ENonStandardStateJis8; |
|
135 } |
|
136 |
|
137 TInt CnvJisBase::ChangeToStandardState(TInt) |
|
138 { |
|
139 return CCnvCharacterSetConverter::KStateDefault; // I actually thought that the correct behaviour for this would be to return "aState&~KBitsForNonStandardStates", but I asked Ken Lunde about it in an email and he said that after a run of JIS7 or JIS8, the bytes should always be interpreted as JIS-Roman |
|
140 } |
|
141 |
|
142 TInt CnvJisBase::AppendConvertToUnicodeFromModalForeign(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aModalForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>& aArrayOfStates, TUint& aOutputConversionFlags, TUint aInputConversionFlags) |
|
143 { |
|
144 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending1)); |
|
145 return CnvUtilities::ConvertToUnicodeFromModalForeign(aDefaultEndiannessOfForeignCharacters, aUnicode, aModalForeign, aState, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aArrayOfStates, aOutputConversionFlags, aInputConversionFlags); |
|
146 } |
|
147 |
|
148 TInt CnvJisBase::AppendConvertToUnicodeFromJis7(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis7, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags) |
|
149 { |
|
150 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending2)); |
|
151 return CCnvCharacterSetConverter::DoConvertToUnicode(halfWidthKatakana7ConversionData, aDefaultEndiannessOfForeignCharacters, aUnicode, aJis7, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags); |
|
152 } |
|
153 |
|
154 TInt CnvJisBase::AppendConvertToUnicodeFromJis8(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aJis8, TInt&, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter, const TArray<CnvUtilities::SState>&, TUint& aOutputConversionFlags, TUint aInputConversionFlags) |
|
155 { |
|
156 __ASSERT_DEBUG(aInputConversionFlags&CCnvCharacterSetConverter::EInputConversionFlagAppend, Panic(EPanicNotAppending3)); |
|
157 return CCnvCharacterSetConverter::DoConvertToUnicode(CnvHalfWidthKatakana8::ConversionData(), aDefaultEndiannessOfForeignCharacters, aUnicode, aJis8, aNumberOfUnconvertibleCharacters, aIndexOfFirstByteOfFirstUnconvertibleCharacter, aOutputConversionFlags, aInputConversionFlags); |
|
158 } |
|
159 |
|
160 EXPORT_C TInt CnvJisBase::ConvertToUnicode(CCnvCharacterSetConverter::TEndianness aDefaultEndiannessOfForeignCharacters, TDes16& aUnicode, const TDesC8& aForeign, TInt& aState, TInt& aNumberOfUnconvertibleCharacters, TInt& aIndexOfFirstByteOfFirstUnconvertibleCharacter) |
|
161 { |
|
162 TFixedArray<CnvUtilities::SState, 8> states; |
|
163 states[0].iEscapeSequence=&KLit8EscapeSequenceForJisRoman; // Jis-Roman is the default state, so it must come first in the array |
|
164 states[0].iConversionData=&CnvJisRoman::ConversionData(); |
|
165 states[1].iEscapeSequence=&KLit8EscapeSequenceForJisRomanIncorrect; |
|
166 states[1].iConversionData=&CnvJisRoman::ConversionData(); |
|
167 states[2].iEscapeSequence=&KLit8EscapeSequenceForAscii; |
|
168 states[2].iConversionData=&CCnvCharacterSetConverter::AsciiConversionData(); |
|
169 states[3].iEscapeSequence=&KLit8EscapeSequenceForHalfWidthKatakana; |
|
170 states[3].iConversionData=&halfWidthKatakana7ConversionData; |
|
171 states[4].iEscapeSequence=&KLit8EscapeSequenceForJisC6226_1978; |
|
172 states[4].iConversionData=&CnvJisX0208::ConversionData(); |
|
173 states[5].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_1983; |
|
174 states[5].iConversionData=&CnvJisX0208::ConversionData(); |
|
175 states[6].iEscapeSequence=&KLit8EscapeSequenceForJisX0208_199x; |
|
176 states[6].iConversionData=&CnvJisX0208::ConversionData(); |
|
177 states[7].iEscapeSequence=&KLit8EscapeSequenceForJisX0212_1990; |
|
178 states[7].iConversionData=&CnvJisX0212::ConversionData(); |
|
179 const TArray<CnvUtilities::SState> arrayOfStates(states.Array()); |
|
180 aUnicode.SetLength(0); |
|
181 const TUint8* const pointerToFirstByte=aForeign.Ptr(); |
|
182 const TUint8* pointerToCurrentByte=pointerToFirstByte; |
|
183 const TUint8* pointerToStartOfNextRunToConvert=pointerToFirstByte; |
|
184 const TUint8* const pointerToLastByte=pointerToFirstByte+(aForeign.Length()-1); |
|
185 TUint outputConversionFlags=0; |
|
186 TUint inputConversionFlags=CCnvCharacterSetConverter::EInputConversionFlagAppend; |
|
187 FOREVER |
|
188 { |
|
189 FChangeState changeState=NULL; |
|
190 FAppendConvertToUnicode appendConvertToUnicode=NULL; |
|
191 TBool skipThisByte=EFalse; |
|
192 const TUint currentByte=*pointerToCurrentByte; |
|
193 switch (aState&KBitsForNonStandardStates) |
|
194 { |
|
195 case 0: |
|
196 if (currentByte==KControlCharacterShiftOut) |
|
197 { |
|
198 changeState=ChangeToNonStandardStateJis7; |
|
199 skipThisByte=ETrue; |
|
200 } |
|
201 else if (currentByte&0x80) |
|
202 { |
|
203 changeState=ChangeToNonStandardStateJis8; |
|
204 } |
|
205 appendConvertToUnicode=AppendConvertToUnicodeFromModalForeign; |
|
206 break; |
|
207 case ENonStandardStateJis7: |
|
208 if (currentByte==KControlCharacterEscape) |
|
209 { |
|
210 changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes |
|
211 } |
|
212 else if (currentByte==KControlCharacterShiftIn) |
|
213 { |
|
214 changeState=ChangeToStandardState; |
|
215 skipThisByte=ETrue; |
|
216 } |
|
217 else if (currentByte&0x80) |
|
218 { |
|
219 changeState=ChangeToNonStandardStateJis8; |
|
220 } |
|
221 appendConvertToUnicode=AppendConvertToUnicodeFromJis7; |
|
222 break; |
|
223 case ENonStandardStateJis8: |
|
224 if (currentByte==KControlCharacterEscape) |
|
225 { |
|
226 changeState=ChangeToStandardState; // it doesn't matter what function changeState is set to (as its return value won't actually be used), as long as changeState!=NULL so that the test below (after the end of this switch) passes |
|
227 } |
|
228 else if (currentByte==KControlCharacterShiftOut) |
|
229 { |
|
230 changeState=ChangeToNonStandardStateJis7; |
|
231 skipThisByte=ETrue; |
|
232 } |
|
233 else if ((currentByte&0x80)==0) |
|
234 { |
|
235 changeState=ChangeToStandardState; |
|
236 } |
|
237 appendConvertToUnicode=AppendConvertToUnicodeFromJis8; |
|
238 break; |
|
239 #if defined(_DEBUG) |
|
240 default: |
|
241 Panic(EPanicBadNonStandardState); |
|
242 break; |
|
243 #endif |
|
244 } |
|
245 __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers1)); |
|
246 if ((pointerToCurrentByte>=pointerToLastByte) || (changeState!=NULL)) |
|
247 { |
|
248 TBool lastIteration=EFalse; |
|
249 __ASSERT_DEBUG(pointerToCurrentByte>=pointerToStartOfNextRunToConvert, Panic(EPanicBadPointers2)); |
|
250 if (changeState==NULL) |
|
251 { |
|
252 ++pointerToCurrentByte; // this may make pointerToCurrentByte greater than pointerToLastByte |
|
253 lastIteration=ETrue; |
|
254 } |
|
255 if (pointerToCurrentByte>pointerToStartOfNextRunToConvert) |
|
256 { |
|
257 TPtrC8 runToConvert(pointerToStartOfNextRunToConvert, pointerToCurrentByte-pointerToStartOfNextRunToConvert); |
|
258 TInt numberOfUnconvertibleCharacters; |
|
259 TInt indexOfFirstByteOfFirstUnconvertibleCharacter; |
|
260 __ASSERT_DEBUG(appendConvertToUnicode!=NULL, Panic(EPanicBadFunctionPointer)); |
|
261 const TInt returnValue=(*appendConvertToUnicode)(aDefaultEndiannessOfForeignCharacters, aUnicode, runToConvert, aState, numberOfUnconvertibleCharacters, indexOfFirstByteOfFirstUnconvertibleCharacter, arrayOfStates, outputConversionFlags, inputConversionFlags); |
|
262 if (returnValue<0) |
|
263 { |
|
264 return returnValue; // this is an error-code |
|
265 } |
|
266 if (numberOfUnconvertibleCharacters>0) |
|
267 { |
|
268 if (aNumberOfUnconvertibleCharacters==0) |
|
269 { |
|
270 aIndexOfFirstByteOfFirstUnconvertibleCharacter=(pointerToStartOfNextRunToConvert-pointerToFirstByte)+indexOfFirstByteOfFirstUnconvertibleCharacter; |
|
271 } |
|
272 aNumberOfUnconvertibleCharacters+=numberOfUnconvertibleCharacters; |
|
273 } |
|
274 if (returnValue>0) |
|
275 { |
|
276 pointerToCurrentByte-=returnValue; // pointerToStartOfNextRunToConvert (which also needs adjusting in the same way) gets set below |
|
277 lastIteration=ETrue; |
|
278 changeState=NULL; |
|
279 skipThisByte=EFalse; |
|
280 } |
|
281 __ASSERT_DEBUG(pointerToCurrentByte>=pointerToFirstByte, Panic(EPanicBadPointers3)); |
|
282 if (pointerToCurrentByte>pointerToFirstByte) |
|
283 { |
|
284 inputConversionFlags|=CCnvCharacterSetConverter::EInputConversionFlagAllowTruncatedInputNotEvenPartlyConsumable; |
|
285 } |
|
286 } |
|
287 if (changeState!=NULL) |
|
288 { |
|
289 aState=(*changeState)(aState); |
|
290 } |
|
291 if (skipThisByte) |
|
292 { |
|
293 if (pointerToCurrentByte==pointerToLastByte) // pointerToCurrentByte may already be greater than pointerToLastByte, in which case lastIteration will already be ETrue |
|
294 { |
|
295 lastIteration=ETrue; |
|
296 } |
|
297 ++pointerToCurrentByte; |
|
298 } |
|
299 pointerToStartOfNextRunToConvert=pointerToCurrentByte; |
|
300 if (lastIteration) // check this first as pointerToCurrentByte may be greater than pointerToLastByte (but it will only be if lastIteration is EFalse) |
|
301 { |
|
302 break; |
|
303 } |
|
304 __ASSERT_DEBUG(pointerToCurrentByte<=pointerToLastByte, Panic(EPanicBadPointers4)); |
|
305 if (pointerToCurrentByte>=pointerToLastByte) |
|
306 { |
|
307 break; |
|
308 } |
|
309 } |
|
310 ++pointerToCurrentByte; |
|
311 } |
|
312 // no checking with outputConversionFlags need to be done here |
|
313 return pointerToLastByte-(pointerToCurrentByte-1); |
|
314 } |
|
315 |
|
316 EXPORT_C const SCnvConversionData& CnvJisBase::HalfWidthKatakana7ConversionData() |
|
317 { |
|
318 return halfWidthKatakana7ConversionData; |
|
319 } |
|
320 |
|
321 EXPORT_C void CnvJisBase::IsCharacterJISBased(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
322 { |
|
323 // JIS is modal... so start off with a confidence of 0 and to begin with look |
|
324 // for JIS escape sequences....Escape sequences defined above in the KLITs |
|
325 // For each escape sequence, increase the confidenceLevel ..... |
|
326 aConfidenceLevel = 55; |
|
327 TInt jisRomanResult = 0; |
|
328 TInt asciiResult = 0; |
|
329 TInt jisX0208Result = 0; |
|
330 TInt jisC6226Result = 0; |
|
331 TInt jixX0212Result = 0; |
|
332 TInt hwKanaResult = 0; |
|
333 |
|
334 TInt EscSequences = 0; |
|
335 |
|
336 TInt sampleLength = aSample.Length(); |
|
337 for (TInt i = 0; i < sampleLength; ++i) |
|
338 { |
|
339 |
|
340 // JIS is 7 bit encoding |
|
341 if((aSample[i]&0x80)!=0x00) |
|
342 { |
|
343 aConfidenceLevel=0; |
|
344 break; |
|
345 } |
|
346 // JIS supports the following character sets |
|
347 if (i > jisC6226Result) |
|
348 { |
|
349 jisC6226Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisC6226_1978); |
|
350 if (jisC6226Result!=KErrNotFound) |
|
351 EscSequences += 15; |
|
352 } |
|
353 |
|
354 if (i > jisRomanResult) |
|
355 { |
|
356 jisRomanResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisRoman); |
|
357 if (jisRomanResult!=KErrNotFound) |
|
358 EscSequences += 15; |
|
359 } |
|
360 |
|
361 if (i > asciiResult) |
|
362 { |
|
363 asciiResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForAscii); |
|
364 if (asciiResult!=KErrNotFound) |
|
365 EscSequences += 15; |
|
366 } |
|
367 |
|
368 if (i > jisX0208Result) |
|
369 { |
|
370 jisX0208Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0208_1983); |
|
371 if (jisX0208Result!=KErrNotFound) |
|
372 EscSequences += 15; |
|
373 } |
|
374 |
|
375 if (i > jixX0212Result) |
|
376 { |
|
377 jixX0212Result=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForJisX0212_1990); |
|
378 if (jixX0212Result!=KErrNotFound) |
|
379 EscSequences += 15; |
|
380 } |
|
381 |
|
382 if (i > hwKanaResult) |
|
383 { |
|
384 hwKanaResult=(aSample.Right(sampleLength-i)).Find(KLit8EscapeSequenceForHalfWidthKatakana); |
|
385 if (hwKanaResult!=KErrNotFound) |
|
386 EscSequences += 15; |
|
387 } |
|
388 } |
|
389 |
|
390 aConfidenceLevel = 0 < sampleLength? |
|
391 aConfidenceLevel + ((EscSequences*100)/sampleLength) : 90; |
|
392 aConfidenceLevel=(aConfidenceLevel >100)?100:aConfidenceLevel; |
|
393 } |