|
1 // Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of the License "Symbian Foundation License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.symbianfoundation.org/legal/sfl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // e32\euser\unicode\unicode.cpp |
|
15 // The implementation of the base-level Unicode character classification functions. These are members of |
|
16 // a class called TUnicode that contains a Unicode value. |
|
17 // |
|
18 // |
|
19 |
|
20 #include <unicode.h> |
|
21 #include "compareimp.h" |
|
22 |
|
23 static const TUnicodeData TheDefaultUnicodeData = |
|
24 { TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric }; |
|
25 |
|
26 // Fill in a TChar::TCharInfo structure with category information about the character. |
|
27 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const |
|
28 { |
|
29 const TUnicodeData& data = GetData(aOverridingDataSet); |
|
30 aInfo.iCategory = (TChar::TCategory)data.iCategory; |
|
31 aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory; |
|
32 aInfo.iCombiningClass = data.iCombiningClass; |
|
33 aInfo.iLowerCase = iCode; |
|
34 aInfo.iUpperCase = iCode; |
|
35 aInfo.iTitleCase = iCode; |
|
36 if (data.iFlags & TUnicodeData::EHasLowerCase) |
|
37 aInfo.iLowerCase = GetLowerCase(data); |
|
38 if (data.iFlags & TUnicodeData::EHasUpperCase) |
|
39 aInfo.iUpperCase = GetUpperCase(data); |
|
40 if (data.iFlags & TUnicodeData::EHasTitleCase) |
|
41 aInfo.iTitleCase = GetTitleCase(data); |
|
42 aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored; |
|
43 if (data.iFlags & TUnicodeData::ENumericFlags) |
|
44 aInfo.iNumericValue = GetNumericValue(data); |
|
45 else |
|
46 aInfo.iNumericValue = -1; |
|
47 } |
|
48 |
|
49 /* |
|
50 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that |
|
51 data set before searching the standard data set. |
|
52 */ |
|
53 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const |
|
54 { |
|
55 const TUnicodeData *result = NULL; |
|
56 if (aOverridingDataSet) |
|
57 result = GetDataFromDataSet(*aOverridingDataSet); |
|
58 if (result == NULL) |
|
59 { |
|
60 if (0xFFFF < iCode) |
|
61 // for now we have no data for values above U+FFFF |
|
62 return TheDefaultUnicodeData; |
|
63 int index = TheStandardUnicodeDataSet.iIndex1[iCode >> 4]; |
|
64 if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index |
|
65 index &= ~0x8000; |
|
66 else |
|
67 index = TheStandardUnicodeDataSet.iIndex2[index + (iCode & 0x000F)]; |
|
68 return TheStandardUnicodeDataSet.iData[index]; |
|
69 } |
|
70 |
|
71 return *result; |
|
72 } |
|
73 |
|
74 /* |
|
75 Given a character data set, get the data referring to this character. |
|
76 Return NULL if no data is available in this data set. |
|
77 */ |
|
78 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const |
|
79 { |
|
80 // Perform a binary chop to find the range containing this character. |
|
81 TInt n = aDataSet.iRanges; |
|
82 const TUnicodeDataRange *base = aDataSet.iRange; |
|
83 const TUnicodeDataRange *last = base + n - 1; |
|
84 const TUnicodeDataRange *r = base; |
|
85 |
|
86 while (n > 1) |
|
87 { |
|
88 TInt pivot = n / 2; |
|
89 r += pivot; |
|
90 if (iCode < r->iRangeStart) // it's before this range |
|
91 n = pivot; |
|
92 else if (r < last && iCode >= r[1].iRangeStart) // it's after this range |
|
93 { |
|
94 base = r + 1; |
|
95 n -= pivot + 1; |
|
96 } |
|
97 else // it's in this range |
|
98 break; |
|
99 r = base; |
|
100 } |
|
101 |
|
102 if (r->iIndex >= 0) |
|
103 return &aDataSet.iData[r->iIndex]; // index >= 0: data available |
|
104 else |
|
105 return NULL; // index < 0: no data available |
|
106 } |
|
107 |
|
108 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const |
|
109 { |
|
110 return (TChar::TCategory)GetData(aOverridingDataSet).iCategory; |
|
111 } |
|
112 |
|
113 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const |
|
114 { |
|
115 return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory; |
|
116 } |
|
117 |
|
118 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const |
|
119 { |
|
120 return GetData(aOverridingDataSet).iCombiningClass; |
|
121 } |
|
122 |
|
123 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
124 { |
|
125 return GetLowerCase(GetData(aOverridingDataSet)); |
|
126 } |
|
127 |
|
128 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
129 { |
|
130 return GetUpperCase(GetData(aOverridingDataSet)); |
|
131 } |
|
132 |
|
133 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const |
|
134 { |
|
135 if (aData.iFlags & TUnicodeData::EHasLowerCase) |
|
136 return iCode + aData.iCaseOffset; |
|
137 else |
|
138 return iCode; |
|
139 } |
|
140 |
|
141 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const |
|
142 { |
|
143 if (aData.iFlags & TUnicodeData::EHasUpperCase) |
|
144 return iCode - aData.iCaseOffset; |
|
145 else |
|
146 return iCode; |
|
147 } |
|
148 |
|
149 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
150 { |
|
151 return GetTitleCase(GetData(aOverridingDataSet)); |
|
152 } |
|
153 |
|
154 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const |
|
155 { |
|
156 // Handle the very few characters with distinct title case variants. |
|
157 if (aData.iFlags & TUnicodeData::EHasTitleCase) |
|
158 { |
|
159 // If the character has no upper case variant add one to get the title case form. |
|
160 if (!(aData.iFlags & TUnicodeData::EHasUpperCase)) |
|
161 return iCode + 1; |
|
162 // If the character has no lower case variant subtract one to get the title case form. |
|
163 if (!(aData.iFlags & TUnicodeData::EHasLowerCase)) |
|
164 return iCode - 1; |
|
165 // Both upper and lower case forms exist so the character itself must be title case. |
|
166 return iCode; |
|
167 } |
|
168 |
|
169 // All other characters have title case forms that are the same as their upper case forms. |
|
170 return GetUpperCase(aData); |
|
171 } |
|
172 |
|
173 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const |
|
174 { |
|
175 return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored; |
|
176 } |
|
177 |
|
178 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const |
|
179 { |
|
180 return GetNumericValue(GetData(aOverridingDataSet)); |
|
181 } |
|
182 |
|
183 /* |
|
184 Return the integer numeric value of this character. |
|
185 Return -1 if the character is not numeric, or -2 if it has a fractional value. |
|
186 */ |
|
187 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const |
|
188 { |
|
189 switch (aData.iFlags & TUnicodeData::ENumericFlags) |
|
190 { |
|
191 case TUnicodeData::ENonNumeric: return -1; |
|
192 case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF; |
|
193 case TUnicodeData::EFiveHundred: return 500; |
|
194 case TUnicodeData::EOneThousand: return 1000; |
|
195 case TUnicodeData::EFiveThousand: return 5000; |
|
196 case TUnicodeData::ETenThousand: return 10000; |
|
197 case TUnicodeData::EHundredThousand: return 100000; |
|
198 case TUnicodeData::EFraction: return -2; |
|
199 default: return -1; // we should never come here |
|
200 } |
|
201 } |
|
202 |
|
203 struct TWidthInfo |
|
204 { |
|
205 TUint iStart; |
|
206 TUint iEnd; |
|
207 TChar::TCjkWidth iWidth; |
|
208 }; |
|
209 |
|
210 static const TWidthInfo TheWidthInfoTable[] = |
|
211 { |
|
212 { 0x0020, 0x007F, TChar::ENarrow }, |
|
213 { 0x00A2, 0x00A4, TChar::ENarrow }, |
|
214 { 0x00A5, 0x00A7, TChar::ENarrow }, |
|
215 { 0x00AF, 0x00B0, TChar::ENarrow }, |
|
216 { 0x00B1, 0x1100, TChar::ENeutralWidth }, |
|
217 { 0x1100, 0x1160, TChar::EWide }, |
|
218 { 0x1160, 0x2E80, TChar::ENeutralWidth }, |
|
219 { 0x2E80, 0xD7A4, TChar::EWide }, |
|
220 { 0xF900, 0xFA2E, TChar::EWide }, |
|
221 { 0xFE30, 0xFE6C, TChar::EWide }, |
|
222 { 0xFF01, 0xFF5F, TChar::EFullWidth }, |
|
223 { 0xFF61, 0xFFDD, TChar::EHalfWidth }, |
|
224 { 0xFFE0, 0xFFE7, TChar::EFullWidth }, |
|
225 { 0xFFE8, 0xFFEF, TChar::EHalfWidth } |
|
226 }; |
|
227 |
|
228 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]); |
|
229 |
|
230 /* |
|
231 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned. |
|
232 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those |
|
233 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table |
|
234 is too great to justify at the moment. |
|
235 */ |
|
236 TChar::TCjkWidth TUnicode::GetCjkWidth() const |
|
237 { |
|
238 const TWidthInfo* w = TheWidthInfoTable; |
|
239 for (int i = 0; i < TheWidthInfos; i++, w++) |
|
240 if (iCode >= w->iStart && iCode < w->iEnd) |
|
241 return w->iWidth; |
|
242 return TChar::ENeutralWidth; |
|
243 } |
|
244 |
|
245 /* |
|
246 Convert a Unicode character into a form most likely to be equal to another character, while |
|
247 still preserving the essential meaning of the character. Possible folding operations include |
|
248 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others. |
|
249 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done |
|
250 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll, |
|
251 which performs all possible folding operations. |
|
252 |
|
253 Note that the difference between folding and collation is that folding is |
|
254 * character-based |
|
255 * biased towards yielding equality where possible |
|
256 while collation is |
|
257 * string-based |
|
258 * designed to yield a non-equal ordering |
|
259 |
|
260 Typically, folding will be used when searching for a match, while collation will be used when |
|
261 sorting a list. |
|
262 */ |
|
263 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const |
|
264 { |
|
265 TUint result = iCode; |
|
266 |
|
267 /* |
|
268 Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use |
|
269 a built-in table. |
|
270 */ |
|
271 if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth)) |
|
272 result = CjkWidthFoldTable[result & 0xFF]; |
|
273 |
|
274 /* |
|
275 If the character is <= 0x00FF and the flags include folding case and stripping accents, |
|
276 and there is no overriding character data, we can use the built-in fold table. |
|
277 */ |
|
278 const TUnicodeData* data = NULL; |
|
279 if (aOverridingDataSet) |
|
280 data = GetDataFromDataSet(*aOverridingDataSet); |
|
281 if (data == NULL && result < 256 && |
|
282 (aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents)) |
|
283 return FoldTable[result]; |
|
284 |
|
285 /* |
|
286 Other characters have to be dealt with laboriously. |
|
287 The first operations are those that, if successful, tell us that nothing more |
|
288 need be done. If a value is folded to a space or a digit or converted to Katakana |
|
289 it cannot have anything else done to it. |
|
290 */ |
|
291 if (aFlags & TChar::EFoldKana) |
|
292 { |
|
293 if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E) |
|
294 return result += 0x0060; |
|
295 } |
|
296 if (data == NULL) |
|
297 data = &GetData(NULL); |
|
298 if (aFlags & TChar::EFoldSpaces) |
|
299 { |
|
300 if (data->iCategory == TChar::EZsCategory) |
|
301 return 0x0020; |
|
302 } |
|
303 if (aFlags & TChar::EFoldDigits) |
|
304 { |
|
305 TInt n = GetNumericValue(*data); |
|
306 if (n >= 0 && n <= 9) |
|
307 return 0x0030 + n; |
|
308 } |
|
309 |
|
310 /* |
|
311 The final operations are the relatively rare and expensive ones (after the special |
|
312 case dealt with above) of accent removal and case conversion. |
|
313 */ |
|
314 if ((aFlags & TChar::EFoldAccents) && (result < 0x2000)) |
|
315 { |
|
316 /* |
|
317 Throw away characters other than the first if all are accents. For the moment these |
|
318 are defined as characters in the range 0x0300..0x0361. This definition may need |
|
319 to be modified; or I may decide to store a flag in the decomposition table indicating |
|
320 whether or not the decomposition consists of base + accent(s). |
|
321 */ |
|
322 TPtrC16 decomposition; |
|
323 if (::DecomposeChar(iCode, decomposition)) |
|
324 { |
|
325 TBool all_accents = TRUE; |
|
326 for (TInt i = 1; all_accents && i < decomposition.Length(); ++i) |
|
327 { |
|
328 if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361) |
|
329 all_accents = FALSE; |
|
330 } |
|
331 if (all_accents) |
|
332 result = decomposition[0]; |
|
333 } |
|
334 } |
|
335 |
|
336 if (aFlags & TChar::EFoldCase) |
|
337 { |
|
338 if (aOverridingDataSet == NULL && result < 256) |
|
339 result = FoldTable[result]; |
|
340 else |
|
341 result = TUnicode(result).GetLowerCase(aOverridingDataSet); |
|
342 } |
|
343 |
|
344 return result; |
|
345 } |
|
346 |
|
347 /* |
|
348 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison |
|
349 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they |
|
350 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes |
|
351 after the string { 0x0001 }. |
|
352 |
|
353 This function exists to make it easier to search tables of Unicode strings (like the composition |
|
354 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table. |
|
355 |
|
356 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2. |
|
357 */ |
|
358 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2) |
|
359 { |
|
360 for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++) |
|
361 { |
|
362 TInt x = i < aLength1 ? *aString1 : -1; |
|
363 TInt y = i < aLength2 ? *aString2 : -1; |
|
364 if (x != y) |
|
365 return x - y; |
|
366 } |
|
367 return 0; |
|
368 } |
|
369 |