|
1 /* |
|
2 * Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include <e32std.h> |
|
20 #include <convgeneratedcpp.h> |
|
21 #include "gb2312.h" |
|
22 |
|
23 struct SCnvConversionData; |
|
24 |
|
25 EXPORT_C const TDesC8& CnvGb2312::ReplacementForUnconvertibleUnicodeCharacters() |
|
26 { |
|
27 return ReplacementForUnconvertibleUnicodeCharacters_internal(); |
|
28 } |
|
29 |
|
30 EXPORT_C const SCnvConversionData& CnvGb2312::ConversionData() |
|
31 { |
|
32 return conversionData; |
|
33 } |
|
34 |
|
35 EXPORT_C TBool CnvGb2312::IsCharGBBased(TInt& aConfidenceLevel, const TDesC8& aSample) |
|
36 { |
|
37 TInt sampleLength = aSample.Length(); |
|
38 aConfidenceLevel = 0; |
|
39 //WBB the following is for distiguish between big5 and GBK |
|
40 TInt totalWeight=0; //sum of the weights of 20 most frequent chars |
|
41 TInt sumOfGoodChar=0; //the number of chars whose first byte and second are both in the range |
|
42 TInt sumOfWeight=0; //sum of the weights of the chars which are included in the sample |
|
43 TInt sumOutChar=0; //the number of chars which are not common |
|
44 TInt sumOfBadSecondByte=0;//the number of chars whose first byte is in the range but not the second |
|
45 struct referenceChar |
|
46 { |
|
47 TUint charGBK; |
|
48 TInt weight; |
|
49 }; |
|
50 |
|
51 referenceChar refGbk[20]; |
|
52 static const TInt iniWeight[20]= |
|
53 { |
|
54 //occurence per 1000 chars |
|
55 30,20,20,10,10,10,10,10,5,5, |
|
56 5,5,5,5,5,5,5,5,5,5 |
|
57 }; |
|
58 |
|
59 static const TUint iniChar[20]= |
|
60 { |
|
61 0xa3ac,0xb5c4,0xc1cb,0xb8f6,0xb2bb,0xb0d1,0xd2bb,0xcac7,0xd2aa,0xbecd, |
|
62 0xd2b2,0xccec,0xc9cf,0xbacd,0xd6d0,0xd4da,0xd0a1,0xc8cb,0xcfc2,0xd6d0, |
|
63 }; |
|
64 |
|
65 for (TInt k=0; k<20; k++) |
|
66 { |
|
67 refGbk[k].charGBK=iniChar[k]; |
|
68 refGbk[k].weight=iniWeight[k]; |
|
69 totalWeight=totalWeight+iniWeight[k]; |
|
70 } |
|
71 |
|
72 |
|
73 //WBB |
|
74 for (TInt i = 0; i < sampleLength; ++i) |
|
75 { |
|
76 //GBK encoding first byte range 0x81-0xfe |
|
77 // second byte range 0x40-0x7e, 0x80-0xfe |
|
78 if((aSample[i] >= 0x81) && (aSample[i] <= 0xfe)) |
|
79 { |
|
80 TInt increment1 = i+1; |
|
81 if (increment1 >= sampleLength) |
|
82 break; |
|
83 if (((aSample[increment1] >=0x40) && (aSample[increment1] <= 0x7e)) || |
|
84 ((aSample[increment1] >=0x80) && (aSample[increment1] <= 0xfe))) |
|
85 { |
|
86 //WBB |
|
87 TUint charGbk=(aSample[i]<<8)|(aSample[increment1]); |
|
88 TInt j; |
|
89 for (j=0; j<20; j++) |
|
90 { |
|
91 if (charGbk==refGbk[j].charGBK) |
|
92 { |
|
93 sumOfWeight=sumOfWeight+refGbk[j].weight; |
|
94 break; |
|
95 } |
|
96 } |
|
97 if ((aSample[i]>=0xa4)&&(aSample[i]<=0xaf)) |
|
98 sumOutChar++; |
|
99 sumOfGoodChar++; |
|
100 i++; |
|
101 //WBB |
|
102 } |
|
103 else |
|
104 { |
|
105 sumOfBadSecondByte++; |
|
106 } |
|
107 } |
|
108 } // for |
|
109 |
|
110 TInt limit; |
|
111 limit = (10*sampleLength)/100; |
|
112 if (sumOfGoodChar > limit) |
|
113 { |
|
114 aConfidenceLevel=sumOfGoodChar*100/(sumOfBadSecondByte+sumOfGoodChar); |
|
115 aConfidenceLevel=aConfidenceLevel-Max(0,((totalWeight-sumOfWeight)*sumOfGoodChar/1000));//against frequent chars |
|
116 aConfidenceLevel=aConfidenceLevel-(sumOutChar*100/sumOfGoodChar);//against gap |
|
117 aConfidenceLevel=(aConfidenceLevel < 0)?0:aConfidenceLevel; |
|
118 } |
|
119 else |
|
120 aConfidenceLevel=0; |
|
121 return ETrue; |
|
122 } |