// Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of the License "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
// e32\euser\unicode\unicode.cpp
// The implementation of the base-level Unicode character classification functions. These are members of
// a class called TUnicode that contains a Unicode value.
//
//

20 #include <unicode.h> |
|
21 #include "CompareImp.h" |
|
22 |
|
23 static const TUnicodeData TheDefaultUnicodeData = |
|
24 { TChar::ECnCategory, TChar::EOtherNeutral, 0, 0, 0, TUnicodeData::ENonNumeric }; |
|
25 |
|
26 |
|
27 // Declarations for tables held in unitable.cpp and used by unicode.cpp. |
|
28 #ifndef __KERNEL_MODE__ |
|
29 extern const TStandardUnicodeDataSet TheStandardUnicodeDataSet[]; |
|
30 extern const TUnicodePlane ThePlanes[17]; |
|
31 #endif |
|
32 |
|
33 |
|
34 // Fill in a TChar::TCharInfo structure with category information about the character. |
|
35 void TUnicode::GetInfo(TChar::TCharInfo& aInfo,const TUnicodeDataSet *aOverridingDataSet) const |
|
36 { |
|
37 const TUnicodeData& data = GetData(aOverridingDataSet); |
|
38 aInfo.iCategory = (TChar::TCategory)data.iCategory; |
|
39 aInfo.iBdCategory = (TChar::TBdCategory)data.iBdCategory; |
|
40 aInfo.iCombiningClass = data.iCombiningClass; |
|
41 aInfo.iLowerCase = iCode; |
|
42 aInfo.iUpperCase = iCode; |
|
43 aInfo.iTitleCase = iCode; |
|
44 if (data.iFlags & TUnicodeData::EHasLowerCase) |
|
45 aInfo.iLowerCase = GetLowerCase(data); |
|
46 if (data.iFlags & TUnicodeData::EHasUpperCase) |
|
47 aInfo.iUpperCase = GetUpperCase(data); |
|
48 if (data.iFlags & TUnicodeData::EHasTitleCase) |
|
49 aInfo.iTitleCase = GetTitleCase(data); |
|
50 aInfo.iMirrored = data.iFlags & TUnicodeData::EMirrored; |
|
51 if (data.iFlags & TUnicodeData::ENumericFlags) |
|
52 aInfo.iNumericValue = GetNumericValue(data); |
|
53 else |
|
54 aInfo.iNumericValue = -1; |
|
55 } |
|
56 |
|
57 /* |
|
58 Get the data describing a character. If "aOverridingDataSet" is non-null, look in that |
|
59 data set before searching the standard data set. |
|
60 */ |
|
61 const TUnicodeData& TUnicode::GetData(const TUnicodeDataSet *aOverridingDataSet) const |
|
62 { |
|
63 const TUnicodeData *result = NULL; |
|
64 if (aOverridingDataSet) |
|
65 result = GetDataFromDataSet(*aOverridingDataSet); |
|
66 if (result == NULL) |
|
67 { |
|
68 if (0xFFFF >= iCode) |
|
69 { |
|
70 // optimize for BMP characters (plane 0) |
|
71 TInt index = TheStandardUnicodeDataSet[0].iIndex1[iCode >> 4]; |
|
72 if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index |
|
73 index &= ~0x8000; |
|
74 else |
|
75 index = TheStandardUnicodeDataSet[0].iIndex2[index + (iCode & 0x000F)]; |
|
76 return TheStandardUnicodeDataSet[0].iData[index]; |
|
77 } |
|
78 else |
|
79 { |
|
80 // for non-BMP characters (plane 1-16) |
|
81 TInt plane = (iCode >> 16); |
|
82 if (plane > 16) |
|
83 { |
|
84 // for now we have no data for values above U+10FFFF |
|
85 return TheDefaultUnicodeData; |
|
86 } |
|
87 TInt codesPerBlock = ThePlanes[plane].iCodesPerBlock; |
|
88 TInt maskForCodePoint = ThePlanes[plane].iMaskForCodePoint; |
|
89 |
|
90 TInt low16bit = (iCode & 0xFFFF); |
|
91 TInt index = TheStandardUnicodeDataSet[plane].iIndex1[low16bit >> codesPerBlock]; |
|
92 if (index & 0x8000) // high bit set means all values in block have the same value, and it's in the index |
|
93 index &= ~0x8000; |
|
94 else |
|
95 index = TheStandardUnicodeDataSet[plane].iIndex2[index + (low16bit & maskForCodePoint)]; |
|
96 return TheStandardUnicodeDataSet[plane].iData[index]; |
|
97 } |
|
98 } |
|
99 |
|
100 return *result; |
|
101 } |
|
102 |
|
103 /* |
|
104 Given a character data set, get the data referring to this character. |
|
105 Return NULL if no data is available in this data set. |
|
106 */ |
|
107 const TUnicodeData *TUnicode::GetDataFromDataSet(const TUnicodeDataSet& aDataSet) const |
|
108 { |
|
109 // Perform a binary chop to find the range containing this character. |
|
110 TInt n = aDataSet.iRanges; |
|
111 const TUnicodeDataRange *base = aDataSet.iRange; |
|
112 const TUnicodeDataRange *last = base + n - 1; |
|
113 const TUnicodeDataRange *r = base; |
|
114 |
|
115 while (n > 1) |
|
116 { |
|
117 TInt pivot = n / 2; |
|
118 r += pivot; |
|
119 if (iCode < r->iRangeStart) // it's before this range |
|
120 n = pivot; |
|
121 else if (r < last && iCode >= r[1].iRangeStart) // it's after this range |
|
122 { |
|
123 base = r + 1; |
|
124 n -= pivot + 1; |
|
125 } |
|
126 else // it's in this range |
|
127 break; |
|
128 r = base; |
|
129 } |
|
130 |
|
131 if (r->iIndex >= 0) |
|
132 return &aDataSet.iData[r->iIndex]; // index >= 0: data available |
|
133 else |
|
134 return NULL; // index < 0: no data available |
|
135 } |
|
136 |
|
137 EXPORT_C TChar::TCategory TUnicode::GetCategory(const TUnicodeDataSet *aOverridingDataSet) const |
|
138 { |
|
139 return (TChar::TCategory)GetData(aOverridingDataSet).iCategory; |
|
140 } |
|
141 |
|
142 TChar::TBdCategory TUnicode::GetBdCategory(const TUnicodeDataSet *aOverridingDataSet) const |
|
143 { |
|
144 return (TChar::TBdCategory)GetData(aOverridingDataSet).iBdCategory; |
|
145 } |
|
146 |
|
147 TInt TUnicode::GetCombiningClass(const TUnicodeDataSet *aOverridingDataSet) const |
|
148 { |
|
149 return GetData(aOverridingDataSet).iCombiningClass; |
|
150 } |
|
151 |
|
152 EXPORT_C TUint TUnicode::GetLowerCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
153 { |
|
154 return GetLowerCase(GetData(aOverridingDataSet)); |
|
155 } |
|
156 |
|
157 EXPORT_C TUint TUnicode::GetUpperCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
158 { |
|
159 return GetUpperCase(GetData(aOverridingDataSet)); |
|
160 } |
|
161 |
|
162 TUint TUnicode::GetLowerCase(const TUnicodeData& aData) const |
|
163 { |
|
164 if (aData.iFlags & TUnicodeData::EHasLowerCase) |
|
165 return iCode + aData.iCaseOffset; |
|
166 else |
|
167 return iCode; |
|
168 } |
|
169 |
|
170 TUint TUnicode::GetUpperCase(const TUnicodeData& aData) const |
|
171 { |
|
172 if (aData.iFlags & TUnicodeData::EHasUpperCase) |
|
173 return iCode - aData.iCaseOffset; |
|
174 else |
|
175 return iCode; |
|
176 } |
|
177 |
|
178 TUint TUnicode::GetTitleCase(const TUnicodeDataSet *aOverridingDataSet) const |
|
179 { |
|
180 return GetTitleCase(GetData(aOverridingDataSet)); |
|
181 } |
|
182 |
|
183 TUint TUnicode::GetTitleCase(const TUnicodeData& aData) const |
|
184 { |
|
185 // Handle the very few characters with distinct title case variants. |
|
186 if (aData.iFlags & TUnicodeData::EHasTitleCase) |
|
187 { |
|
188 // If the character has no upper case variant add one to get the title case form. |
|
189 if (!(aData.iFlags & TUnicodeData::EHasUpperCase)) |
|
190 return iCode + 1; |
|
191 // If the character has no lower case variant subtract one to get the title case form. |
|
192 if (!(aData.iFlags & TUnicodeData::EHasLowerCase)) |
|
193 return iCode - 1; |
|
194 // Both upper and lower case forms exist so the character itself must be title case. |
|
195 return iCode; |
|
196 } |
|
197 |
|
198 // All other characters have title case forms that are the same as their upper case forms. |
|
199 return GetUpperCase(aData); |
|
200 } |
|
201 |
|
202 TBool TUnicode::IsMirrored(const TUnicodeDataSet *aOverridingDataSet) const |
|
203 { |
|
204 return GetData(aOverridingDataSet).iFlags & TUnicodeData::EMirrored; |
|
205 } |
|
206 |
|
207 TInt TUnicode::GetNumericValue(const TUnicodeDataSet *aOverridingDataSet) const |
|
208 { |
|
209 return GetNumericValue(GetData(aOverridingDataSet)); |
|
210 } |
|
211 |
|
212 /* |
|
213 Return the integer numeric value of this character. |
|
214 Return -1 if the character is not numeric, or -2 if it has a fractional value. |
|
215 */ |
|
216 TInt TUnicode::GetNumericValue(const TUnicodeData& aData) const |
|
217 { |
|
218 switch (aData.iFlags & TUnicodeData::ENumericFlags) |
|
219 { |
|
220 case TUnicodeData::ENonNumeric: return -1; |
|
221 case TUnicodeData::ESmallNumeric: return (iCode + aData.iDigitOffset) & 0xFF; |
|
222 case TUnicodeData::EFiveHundred: return 500; |
|
223 case TUnicodeData::EOneThousand: return 1000; |
|
224 case TUnicodeData::EFiveThousand: return 5000; |
|
225 case TUnicodeData::ETenThousand: return 10000; |
|
226 case TUnicodeData::EHundredThousand: return 100000; |
|
227 case TUnicodeData::EFraction: return -2; |
|
228 default: return -1; // we should never come here |
|
229 } |
|
230 } |
|
231 |
|
232 struct TWidthInfo |
|
233 { |
|
234 TUint iStart; |
|
235 TUint iEnd; |
|
236 TChar::TCjkWidth iWidth; |
|
237 }; |
|
238 |
|
239 static const TWidthInfo TheWidthInfoTable[] = |
|
240 { |
|
241 { 0x0020, 0x007F, TChar::ENarrow }, |
|
242 { 0x00A2, 0x00A4, TChar::ENarrow }, |
|
243 { 0x00A5, 0x00A7, TChar::ENarrow }, |
|
244 { 0x00AF, 0x00B0, TChar::ENarrow }, |
|
245 { 0x00B1, 0x1100, TChar::ENeutralWidth }, |
|
246 { 0x1100, 0x1160, TChar::EWide }, |
|
247 { 0x1160, 0x2E80, TChar::ENeutralWidth }, |
|
248 { 0x2E80, 0xD7A4, TChar::EWide }, |
|
249 { 0xF900, 0xFA2E, TChar::EWide }, |
|
250 { 0xFE30, 0xFE6C, TChar::EWide }, |
|
251 { 0xFF01, 0xFF5F, TChar::EFullWidth }, |
|
252 { 0xFF61, 0xFFDD, TChar::EHalfWidth }, |
|
253 { 0xFFE0, 0xFFE7, TChar::EFullWidth }, |
|
254 { 0xFFE8, 0xFFEF, TChar::EHalfWidth }, |
|
255 { 0x20000, 0x2A6DF, TChar::EWide }, // CJK Unified Ideographs Extension B |
|
256 { 0x2F800, 0x2FA1F, TChar::EWide }, // CJK Unified Ideographs Supplement |
|
257 }; |
|
258 |
|
259 const TInt TheWidthInfos = sizeof(TheWidthInfoTable) / sizeof(TheWidthInfoTable[0]); |
|
260 |
|
261 /* |
|
262 Get the notional width used by East Asian encoding systems. No check is made that the character is assigned. |
|
263 No separate 'ambiguous width' is returned; ambiguous characters are treated as neutral except for those |
|
264 in the CJK range, which are treated as wide. This is a big simplification, but the cost of an exhaustive table |
|
265 is too great to justify at the moment. |
|
266 */ |
|
267 TChar::TCjkWidth TUnicode::GetCjkWidth() const |
|
268 { |
|
269 const TWidthInfo* w = TheWidthInfoTable; |
|
270 for (TInt i = 0; i < TheWidthInfos; i++, w++) |
|
271 if (iCode >= w->iStart && iCode < w->iEnd) |
|
272 return w->iWidth; |
|
273 return TChar::ENeutralWidth; |
|
274 } |
|
275 |
|
276 /* |
|
277 Convert a Unicode character into a form most likely to be equal to another character, while |
|
278 still preserving the essential meaning of the character. Possible folding operations include |
|
279 converting to lower case (TChar::EFoldCase), stripping accents (TChar::EFoldAccents) and others. |
|
280 The flag value has a default, TChar::EFoldStandard, which performs the folding operations done |
|
281 by calling Fold functions with no flags argument, and there is also TChar::EFoldAll, |
|
282 which performs all possible folding operations. |
|
283 |
|
284 Note that the difference between folding and collation is that folding is |
|
285 * character-based |
|
286 * biased towards yielding equality where possible |
|
287 while collation is |
|
288 * string-based |
|
289 * designed to yield a non-equal ordering |
|
290 |
|
291 Typically, folding will be used when searching for a match, while collation will be used when |
|
292 sorting a list. |
|
293 */ |
|
294 EXPORT_C TUint TUnicode::Fold(TInt aFlags,const TUnicodeDataSet *aOverridingDataSet) const |
|
295 { |
|
296 TUint result = iCode; |
|
297 |
|
298 /* |
|
299 Fold CJK width variants. This only applies to characters 0xFF00 and above so we can use |
|
300 a built-in table. |
|
301 */ |
|
302 if (result >= 0xFF00 && (aFlags & TChar::EFoldWidth)) |
|
303 result = CjkWidthFoldTable[result & 0xFF]; |
|
304 |
|
305 /* |
|
306 If the character is <= 0x00FF and the flags include folding case and stripping accents, |
|
307 and there is no overriding character data, we can use the built-in fold table. |
|
308 */ |
|
309 const TUnicodeData* data = NULL; |
|
310 if (aOverridingDataSet) |
|
311 data = GetDataFromDataSet(*aOverridingDataSet); |
|
312 if (data == NULL && result < 256 && |
|
313 (aFlags & (TChar::EFoldCase | TChar::EFoldAccents)) == (TChar::EFoldCase | TChar::EFoldAccents)) |
|
314 return FoldTable[result]; |
|
315 |
|
316 /* |
|
317 Other characters have to be dealt with laboriously. |
|
318 The first operations are those that, if successful, tell us that nothing more |
|
319 need be done. If a value is folded to a space or a digit or converted to Katakana |
|
320 it cannot have anything else done to it. |
|
321 */ |
|
322 if (aFlags & TChar::EFoldKana) |
|
323 { |
|
324 if ((result >= 0x3041 && result <= 0x3094) || result == 0x309D || result == 0x309E) |
|
325 return result += 0x0060; |
|
326 } |
|
327 if (data == NULL) |
|
328 data = &GetData(NULL); |
|
329 if (aFlags & TChar::EFoldSpaces) |
|
330 { |
|
331 if (data->iCategory == TChar::EZsCategory) |
|
332 return 0x0020; |
|
333 } |
|
334 if (aFlags & TChar::EFoldDigits) |
|
335 { |
|
336 TInt n = GetNumericValue(*data); |
|
337 if (n >= 0 && n <= 9) |
|
338 return 0x0030 + n; |
|
339 } |
|
340 |
|
341 /* |
|
342 The final operations are the relatively rare and expensive ones (after the special |
|
343 case dealt with above) of accent removal and case conversion. |
|
344 */ |
|
345 if ((aFlags & TChar::EFoldAccents) && (result < 0x2000)) |
|
346 { |
|
347 /* |
|
348 Throw away characters other than the first if all are accents. For the moment these |
|
349 are defined as characters in the range 0x0300..0x0361. This definition may need |
|
350 to be modified; or I may decide to store a flag in the decomposition table indicating |
|
351 whether or not the decomposition consists of base + accent(s). |
|
352 */ |
|
353 TPtrC16 decomposition; |
|
354 if (::DecomposeChar(iCode, decomposition)) |
|
355 { |
|
356 TBool all_accents = TRUE; |
|
357 for (TInt i = 1; all_accents && i < decomposition.Length(); ++i) |
|
358 { |
|
359 if (decomposition[i] < 0x0300 || decomposition[i] > 0x0361) |
|
360 all_accents = FALSE; |
|
361 } |
|
362 if (all_accents) |
|
363 result = decomposition[0]; |
|
364 } |
|
365 } |
|
366 |
|
367 if (aFlags & TChar::EFoldCase) |
|
368 { |
|
369 if (aOverridingDataSet == NULL && result < 256) |
|
370 result = FoldTable[result]; |
|
371 else |
|
372 result = TUnicode(result).GetLowerCase(aOverridingDataSet); |
|
373 } |
|
374 |
|
375 return result; |
|
376 } |
|
377 |
|
378 /* |
|
379 Compare two Unicode strings naively by Unicode value. This is NOT the same as a comparison |
|
380 of null-terminated strings; the strings can contain null characters (Unicode 0x0000) and they |
|
381 compare greater than no character. This means that the string { 0x0001 0x0000 } always comes |
|
382 after the string { 0x0001 }. |
|
383 |
|
384 This function exists to make it easier to search tables of Unicode strings (like the composition |
|
385 buffer) using the binary chop method. It is also used by READTYPE when sorting the compose table. |
|
386 |
|
387 The return values are: 0 for equality, < 0 if aString1 < aString2, > 0 if aString1 > aString2. |
|
388 */ |
|
389 TInt TUnicode::Compare(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2) |
|
390 { |
|
391 for (TInt i = 0; i < aLength1 || i < aLength2; i++, aString1++, aString2++) |
|
392 { |
|
393 TInt x = i < aLength1 ? *aString1 : -1; |
|
394 TInt y = i < aLength2 ? *aString2 : -1; |
|
395 if (x != y) |
|
396 return x - y; |
|
397 } |
|
398 return 0; |
|
399 } |
|
400 |