// Copyright (c) 1996-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of the License "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
// e32\include\collate.h
// Definitions needed for Unicode collation.
// Collation is the comparison of two Unicode strings to produce an ordering
// that may be used in a dictionary or other list.
// Collation is implemented using the Standard Unicode Collation algorithm. There
// are four levels of comparison:
// primary: basic character identity
// secondary: accents and diacritics
// tertiary: upper and lower case, and other minor attributes
// quaternary: Unicode character value
// Punctuation is normally ignored but can optionally be taken into account.
// Strings are fully expanded using the standard Unicode canonical expansions before
// they are compared. Thai and Lao vowels are swapped with the following character
// if any.
// EUSER contains the 'basic collation method'. This method assigns the standard Unicode collation key values
// to the characters in the WGL4 repertoire, plus commonly used control characters and fixed-width spaces, plus
// the CJK ideograms (for which the keys can be generated algorithmically). Other characters are collated after
// all the characters for which keys are defined, and ordered by their Unicode values.
// Locales can supply any number of other collation methods. They will usually supply a 'tailoring' of the standard
// method. This is done by using the standard table as the main key table (signalled by placing NULL in
// TCollationMethod::iMainTable) and specifying an override table (TCollationMethod::iOverrideTable).
// Locale-specific collation data resides in ELOCL.
//
// WARNING: This file contains some APIs which are internal and are subject
// to change without notice. Such APIs should therefore not be used
// outside the Kernel and Hardware Services package.
//

42 #ifndef __COLLATE_H__ |
|
43 #define __COLLATE_H__ |
|
44 |
|
45 #ifdef __KERNEL_MODE__ |
|
46 #include <e32cmn.h> |
|
47 #else |
|
48 #include <e32std.h> |
|
49 #endif |
|
50 |
|
51 //This material is used in the Unicode build only. |
|
52 #ifdef _UNICODE |
|
53 |
|
54 /** |
|
55 Collation key table structure. |
|
56 @publishedPartner |
|
57 @released |
|
58 */ |
|
59 struct TCollationKeyTable |
|
60 { |
|
61 public: |
|
62 /** |
|
63 Masks for the various parts of the elements of the iKey array. |
|
64 */ |
|
65 enum |
|
66 { |
|
67 ELevel0Mask = 0xFFFF0000, // primary key - basic character identity |
|
68 ELevel1Mask = 0x0000FF00, // secondary key - accents and diacritics |
|
69 ELevel2Mask = 0x000000FC, // tertiary key - case, etc. |
|
70 EIgnoreFlag = 0x2, // if set, this key is normally ignored |
|
71 EStopFlag = 0x1 // if set, this key is the last in a sequence representing a Unicode value or values |
|
72 }; |
|
73 |
|
74 /** |
|
75 An array containing all of the keys and strings of keys concatenated |
|
76 together. Each key has EStopFlag set only if it is the last key in its |
|
77 string. Eack key contains the keys for levels 0, 1 and 2, and a flag |
|
78 EIgnoreFlag if the key is usually ignored (for punctuation & spaces |
|
79 etc.). |
|
80 */ |
|
81 const TUint32* iKey; |
|
82 /** |
|
83 An array of indices into the iKey array. Each element has its high 16 |
|
84 bits indicating a Unicode value and its low 16 bits indicating an index |
|
85 into the iKey array at which its key starts. For surrogate pairs, high |
|
86 surrogate code is in index[i]:16-31, and low surrogate code is in |
|
87 index[i+1]:16-31. These two elements are combined to represent a surrogate |
|
88 pair. The elements are sorted by Unicode value. |
|
89 */ |
|
90 const TUint32* iIndex; |
|
91 /** |
|
92 The size of the iIndex array. |
|
93 */ |
|
94 TInt iIndices; |
|
95 /** |
|
96 Concatenated Unicode strings. Each is a strings that is to be converted |
|
97 to keys differently from how it would be if each letter were converted |
|
98 independently. An example is "ch" in Spanish, which sorts as though it |
|
99 were a single letter. Each Unicode string is preceeded by a 16-bit value |
|
100 indicating the string's length (in 16-bit). The end of the string is not |
|
101 delimited. A surrogate pair is represented by two ajacent 16-bit values. |
|
102 */ |
|
103 const TUint16* iString; |
|
104 /** |
|
105 An array of elements mapping elements of iString to elements of iIndex. |
|
106 Each element has its high 16 bits indicating the index of the start of |
|
107 an element of iString, and its low 16 bits indicating the corresponding |
|
108 element in iIndex. This array is sorted on the string index. |
|
109 */ |
|
110 const TUint32* iStringIndex; |
|
111 /** |
|
112 The size of the iStringIndex array. |
|
113 */ |
|
114 TInt iStringIndices; |
|
115 }; |
|
116 |
|
117 /** |
|
118 Defines a collation method. |
|
119 |
|
120 Collation means sorting pieces of text. It needs to take into account characters, |
|
121 accents and case; spaces and punctuation are usually ignored. It differs from |
|
122 ordinary methods of sorting in that it is locale-dependent - different |
|
123 languages use different ordering methods. Additionally, multiple collation |
|
124 methods may exist within the same locale. |
|
125 |
|
126 A collation method provides the collation keys and other data needed to customise |
|
127 collation; the Mem and TDesC16 collation functions (e.g. Mem::CompareC()) |
|
128 perform the collation. Note that these functions use the standard collation |
|
129 method for the current locale - you only need to specify an object of class |
|
130 TCollationMethod to customise this collation scheme. Collation methods can |
|
131 be retrieved using member functions of the Mem class. Each one has a unique |
|
132 identifier. |
|
133 |
|
134 A collation method specifies a main table of collation keys, and optionally |
|
135 an overriding table that contains keys for which the values in the main table |
|
136 are overridden. A collation key table (TCollationKeyTable) is the set of collation |
|
137 keys: primary (basic character identity), secondary (accents and diacritics) |
|
138 and tertiary (case). The quaternary key is the Unicode character values themselves. |
|
139 |
|
140 The simplest way to customise a collation method is to create a local copy |
|
141 of the standard collation method and change it. For example, you could use |
|
142 the standard method, but not ignore punctuation and spaces: |
|
143 |
|
144 @code |
|
145 TCollationMethod m = *Mem::CollationMethodByIndex(0); // get the standard method |
|
146 m.iFlags |= TCollationMethod::EIgnoreNone; // dont ignore punctuation and spaces |
|
147 @endcode |
|
148 |
|
149 @publishedPartner |
|
150 @released |
|
151 */ |
|
152 struct TCollationMethod |
|
153 { |
|
154 public: |
|
155 /** |
|
156 The UID of this collation method. |
|
157 */ |
|
158 TUint iId; |
|
159 |
|
160 /** |
|
161 The main collation key table; if NULL, use the standard table. |
|
162 */ |
|
163 const TCollationKeyTable* iMainTable; |
|
164 |
|
165 /** |
|
166 If non-NULL, tailoring for collation keys. |
|
167 */ |
|
168 const TCollationKeyTable* iOverrideTable; |
|
169 enum |
|
170 { |
|
171 /** |
|
172 Don't ignore any keys (punctuation, etc. is normally ignored). |
|
173 */ |
|
174 EIgnoreNone = 1, |
|
175 |
|
176 /** |
|
177 Reverse the normal order for characters differing only in case |
|
178 */ |
|
179 ESwapCase = 2, |
|
180 |
|
181 /** |
|
182 Compare secondary keys which represent accents in reverse |
|
183 order (from right to left); this is needed for French when comparing |
|
184 words that differ only in accents. |
|
185 */ |
|
186 EAccentsBackwards = 4, |
|
187 |
|
188 /** |
|
189 Reverse the normal order for characters differing only in whether they |
|
190 are katakana or hiragana. |
|
191 */ |
|
192 ESwapKana = 8, |
|
193 |
|
194 /** |
|
195 Fold all characters to lower case before extracting keys; needed for |
|
196 comparison of filenames, for which case is ignored but other |
|
197 tertiary (level-2) distinctions are not. |
|
198 */ |
|
199 EFoldCase = 16, |
|
200 |
|
201 /** Flag to indicate a collation method for matching purpose |
|
202 This flag is only needed if we wish to specify a particular collation method |
|
203 to be used for matching purpose. |
|
204 */ |
|
205 EMatchingTable = 32, |
|
206 |
|
207 /** Ignore the check for adjacent combining characters. A combining |
|
208 character effectively changes the character it combines with to something |
|
209 else and so a match doesn't occur. Setting this flag will allow character |
|
210 matching regardless of any combining characters. |
|
211 */ |
|
212 EIgnoreCombining = 64 |
|
213 }; |
|
214 |
|
215 /** |
|
216 Flags. |
|
217 |
|
218 @see TCollationMethod::EIgnoreNone |
|
219 @see TCollationMethod::ESwapCase |
|
220 @see TCollationMethod::EAccentsBackwards |
|
221 @see TCollationMethod::ESwapKana |
|
222 @see TCollationMethod::EFoldCase |
|
223 */ |
|
224 TUint iFlags; |
|
225 }; |
|
226 |
|
227 /** |
|
228 A collation data set provides any collation methods needed by a locale. |
|
229 @publishedPartner |
|
230 @released |
|
231 */ |
|
232 struct TCollationDataSet |
|
233 { |
|
234 public: |
|
235 const TCollationMethod* iMethod; |
|
236 TInt iMethods; |
|
237 }; |
|
238 |
|
239 // Collation method IDs |
|
240 |
|
241 /** |
|
242 A collation data set provides any collation methods needed by a locale. |
|
243 @internalTechnology |
|
244 @released |
|
245 */ |
|
246 const TUint KUidBasicCollationMethod = 0x10004F4E; |
|
247 |
|
248 /** |
|
249 A collation data set provides any collation methods needed by a locale. |
|
250 @internalTechnology |
|
251 @released |
|
252 */ |
|
253 const TUint KUidStandardUnicodeCollationMethod = 0x10004E96; |
|
254 |
|
255 #ifndef __KERNEL_MODE__ |
|
256 |
|
257 //Forward declarations |
|
258 class TUTF32Iterator; |
|
259 struct LCharSet; |
|
260 |
|
261 /** |
|
262 Provides low-level collation functions. |
|
263 @internalComponent |
|
264 @released |
|
265 */ |
|
266 class TCollate |
|
267 { |
|
268 public: |
|
269 /** |
|
270 Construct a TCollate object based on the collation method specified |
|
271 within aCharSet, if any. If there is none, or aCharSet is null, the |
|
272 standard collation method will be used. aMask and aFlags provide a |
|
273 method for overriding the flags in the collation method: Each flag set |
|
274 to 1 in aMask is a flag that will be overridden and set to the |
|
275 corresponding flag value in aFlags. Ownership of aCharSet is not passed. |
|
276 */ |
|
277 TCollate(const LCharSet* aCharSet,TUint aMask = 0,TUint aFlags = 0xFFFFFFFF); |
|
278 /** |
|
279 Construct a TCollate object based on an already constructed |
|
280 TCollationMethod specified in aMethod. Ownership is not passed. |
|
281 */ |
|
282 TCollate(const TCollationMethod& aMethod); |
|
283 |
|
284 enum TComparisonResult |
|
285 { |
|
286 ELeftComparesLessAndIsNotPrefix = -2, |
|
287 ELeftIsPrefixOfRight = -1, |
|
288 EStringsIdentical = 0, |
|
289 ERightIsPrefixOfLeft = 1, |
|
290 ERightComparesLessAndIsNotPrefix = 2 |
|
291 }; |
|
292 |
|
293 /** |
|
294 Compare the string beginning at aString1 of length aLength1 against the |
|
295 string beginning at aString2 of length aLength2. |
|
296 aMaxLevel determines the tightness of the collation. At level 0, only |
|
297 character identities are distinguished. At level 1 accents are |
|
298 distinguished as well. At level 2 case is distinguishes as well. At |
|
299 level 3 all valid different Unicode characters are considered different. |
|
300 */ |
|
301 TComparisonResult Compare(const TUint16* aString1,TInt aLength1, |
|
302 const TUint16* aString2,TInt aLength2, |
|
303 TInt aMaxLevel = 3) const; |
|
304 /** |
|
305 Find the string beginning at aString2 of length aLength2 in the string |
|
306 beginning at aString1 of length aLength1. aMaxLevel determines |
|
307 the tightness of the collation, see Compare for details. |
|
308 */ |
|
309 TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, |
|
310 TInt aMaxLevel,TUint aString2WildChar = 0) const; |
|
311 |
|
312 TInt Find(const TUint16 *aString1,TInt aLength1,const TUint16 *aString2,TInt aLength2, |
|
313 TInt &aLengthFound,TInt aMaxLevel,TUint aString2WildChar = 0) const; |
|
314 |
|
315 /** |
|
316 Test if the string beginning at aSearchTerm of length aSearchTermLength |
|
317 matches the string beginning at aCandidate of length aCandidateLength. |
|
318 aMaxLevel determines the tightness of the collation, see |
|
319 Compare for details. The search term may have wild card characters as |
|
320 specified by aWildChar (for matching a single grapheme- i.e. character |
|
321 and any characters that combine with it, such as accents) and |
|
322 aWildSequenceChar (for matching any sequence of whole graphemes). The |
|
323 return value is KErrNotFound iff the search term does not match the |
|
324 candidate string exactly. To find a match within the candidate string, |
|
325 the search term must begin and end with a wild sequence character. If |
|
326 the search term does match the candidate string, 0 will be returned, |
|
327 unless the first character of the search term is a wild sequence |
|
328 character in which case the value returned will be the index into |
|
329 aCandidate at which the first non-wild sequence character matched. |
|
330 aWildSequenceChar must be a valid (non-surrogate) Unicode character |
|
331 below FFFE. |
|
332 */ |
|
333 TInt Match(const TUint16 *aCandidate, TInt aCandidateLength, |
|
334 const TUint16 *aSearchTerm,TInt aSearchTermLength, |
|
335 TInt aMaxLevel, TUint aWildChar = '?', TUint aWildSequenceChar = '*', TUint aEscapeChar = 0) const; |
|
336 |
|
337 private: |
|
338 /** |
|
339 Compare values output from the iterators. After the comparison, if |
|
340 ERightIsPrefixOfLeft or EStringsIdentical is returned, then aLeft and |
|
341 aRight will be pointing at the next key (at MaxLevel) after the match. |
|
342 If right is shown to be a prefix of left, this means that it has been |
|
343 checked at all requested levels. If it is reported that the right is a |
|
344 prefix of the left, then this will mean also that there are no unmatched |
|
345 combining characters on the left. |
|
346 */ |
|
347 TComparisonResult CompareKeySequences(TUTF32Iterator& aLeft, TUTF32Iterator& aRight, |
|
348 TInt aMaxLevel, TInt aRightStringWildChar, TInt aEscapeChar) const; |
|
349 /** |
|
350 Finds search term inside candidate string. Returns KErrNotFound if there |
|
351 is no match, returns the offset into the candidate string at which the |
|
352 search term was found (note that this is the offset from the start of |
|
353 the iteration, not from where the iteration was when the function was |
|
354 called). If a string was found, the search term iterator is left |
|
355 pointing at the end of the search term, and the candidate iterator is |
|
356 left pointing just after the matched keys. aMatchPos returns where in |
|
357 the candidate string the match was found. |
|
358 */ |
|
359 TInt FindKeySequence(TUTF32Iterator& aCandidate, TUTF32Iterator& aSearchTerm, |
|
360 TInt aMaxLevel, TInt aWildChar, TInt aEscapeChar, TInt& aLengthFound) const; |
|
361 |
|
362 private: |
|
363 TCollationMethod iMethod; |
|
364 }; |
|
365 |
|
366 #endif // __KERNEL_MODE__ |
|
367 |
|
368 #endif // _UNICODE |
|
369 |
|
370 #endif // __COLLATE_H__ |