|
1 /* |
|
2 ******************************************************************************* |
|
3 * |
|
4 * Copyright (C) 2002-2005, International Business Machines |
|
5 * Corporation and others. All Rights Reserved. |
|
6 * |
|
7 ******************************************************************************* |
|
8 * file name: uprops.h |
|
9 * encoding: US-ASCII |
|
10 * tab size: 8 (not used) |
|
11 * indentation:4 |
|
12 * |
|
13 * created on: 2002feb24 |
|
14 * created by: Markus W. Scherer |
|
15 * |
|
16 * Constants for mostly non-core Unicode character properties |
|
17 * stored in uprops.icu. |
|
18 */ |
|
19 |
|
20 #ifndef __UPROPS_H__ |
|
21 #define __UPROPS_H__ |
|
22 |
|
23 #include "unicode/utypes.h" |
|
24 #include "unicode/uset.h" |
|
25 #include "uset_imp.h" |
|
26 #include "udataswp.h" |
|
27 |
|
/*
 * Named slots of the indexes[] array.
 * Slots 7..9 and 12..15 have no name in this enum.
 */
enum {
    UPROPS_PROPS32_INDEX=0,
    UPROPS_EXCEPTIONS_INDEX=1,
    UPROPS_EXCEPTIONS_TOP_INDEX=2,

    UPROPS_ADDITIONAL_TRIE_INDEX=3,
    UPROPS_ADDITIONAL_VECTORS_INDEX=4,
    UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX=5,

    UPROPS_RESERVED_INDEX=6,

    /* maximum values for code values in vector word 0 */
    UPROPS_MAX_VALUES_INDEX=10,
    /* maximum values for code values in vector word 2 */
    UPROPS_MAX_VALUES_2_INDEX=11,

    /* total number of indexes[] slots */
    UPROPS_INDEX_COUNT=16
};
|
47 |
|
/*
 * Layout of the 32-bit main properties word:
 *   bits  4..0   general category (shift 0, 5 bits)
 *   bits  7..5   numeric type     (3 bits)
 *   bits 15..8   numeric value    (8 bits)
 */
enum {
    UPROPS_NUMERIC_TYPE_SHIFT  = 5,
    UPROPS_NUMERIC_VALUE_SHIFT = 8
};

/* Extract the general category from a main properties word. */
#define GET_CATEGORY(props) ((props) & 0x1f)
/* Single-bit mask for the general category; U_MASK comes from outside this header. */
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))

/* Extract the 3-bit numeric type field. */
#define GET_NUMERIC_TYPE(props) (((props) >> UPROPS_NUMERIC_TYPE_SHIFT) & 0x7)
/* Extract the 8-bit numeric value field. */
#define GET_NUMERIC_VALUE(props) (((props) >> UPROPS_NUMERIC_VALUE_SHIFT) & 0xff)
|
60 |
|
/*
 * Internal numeric pseudo-types that mark special encodings of
 * numeric values.
 */
enum {
    /* ==U_NT_COUNT, must not change unless binary format version changes */
    UPROPS_NT_FRACTION = 4,
    UPROPS_NT_LARGE    = 5,
    UPROPS_NT_COUNT    = 6
};
|
67 |
|
/* Constants for the encoding of fractional and large numbers. */
enum {
    UPROPS_MAX_SMALL_NUMBER = 0xff,

    /* fractions: numerator in bits 7..3, denominator in bits 2..0 */
    UPROPS_FRACTION_NUM_SHIFT = 3,
    UPROPS_FRACTION_DEN_MASK  = 7,

    UPROPS_FRACTION_MAX_NUM    = 31,
    UPROPS_FRACTION_DEN_OFFSET = 2,     /* denominator values are 2..9 */

    UPROPS_FRACTION_MIN_DEN = UPROPS_FRACTION_DEN_OFFSET,
    UPROPS_FRACTION_MAX_DEN = UPROPS_FRACTION_DEN_OFFSET + UPROPS_FRACTION_DEN_MASK,

    /* large numbers: mantissa in bits 7..4, exponent in bits 3..0 */
    UPROPS_LARGE_MANT_SHIFT       = 4,
    UPROPS_LARGE_EXP_MASK         = 0xf,
    UPROPS_LARGE_EXP_OFFSET       = 2,  /* regular exponents 2..17 */
    UPROPS_LARGE_EXP_OFFSET_EXTRA = 18, /* extra large exponents 18..33 */

    UPROPS_LARGE_MIN_EXP       = UPROPS_LARGE_EXP_OFFSET,
    UPROPS_LARGE_MAX_EXP       = UPROPS_LARGE_EXP_OFFSET + UPROPS_LARGE_EXP_MASK,
    UPROPS_LARGE_MAX_EXP_EXTRA = UPROPS_LARGE_EXP_OFFSET_EXTRA + UPROPS_LARGE_EXP_MASK
};
|
90 |
|
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 3

/*
 * Bit fields of properties vector word 0:
 *
 *   bits 31..24   DerivedAge version, major/minor one nibble each
 *   bits 23..18   Line Break
 *   bits 17..15   East Asian Width
 *   bits 14.. 7   UBlockCode
 *   bits  6.. 0   UScriptCode
 */

/* derived age: one nibble each for major and minor version numbers */
#define UPROPS_AGE_SHIFT    24
#define UPROPS_AGE_MASK     0xff000000

#define UPROPS_LB_SHIFT     18
#define UPROPS_LB_MASK      0x00fc0000

#define UPROPS_EA_SHIFT     15
#define UPROPS_EA_MASK      0x00038000

#define UPROPS_BLOCK_SHIFT  7
#define UPROPS_BLOCK_MASK   0x00007f80

/* the script field starts at bit 0, so it has no shift constant */
#define UPROPS_SCRIPT_MASK  0x0000007f
|
118 |
|
/*
 * Properties in vector word 1: one binary property per bit.
 * Each constant is a bit number; build the flag with 1<<UPROPS_XYZ.
 * UPROPS_BINARY_1_TOP must stay <=32.
 *
 * Keep this list of property enums in sync with
 * propListNames[] in icu/source/tools/genprops/props2.c!
 *
 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
 */
enum {
    UPROPS_WHITE_SPACE                  = 0,
    UPROPS_WAS_BIDI_CONTROL             = 1,  /* reserved, was used in format version 3 */
    UPROPS_WAS_JOIN_CONTROL             = 2,
    UPROPS_DASH                         = 3,
    UPROPS_HYPHEN                       = 4,
    UPROPS_QUOTATION_MARK               = 5,
    UPROPS_TERMINAL_PUNCTUATION         = 6,
    UPROPS_MATH                         = 7,
    UPROPS_HEX_DIGIT                    = 8,
    UPROPS_ASCII_HEX_DIGIT              = 9,
    UPROPS_ALPHABETIC                   = 10,
    UPROPS_IDEOGRAPHIC                  = 11,
    UPROPS_DIACRITIC                    = 12,
    UPROPS_EXTENDER                     = 13,
    UPROPS_WAS_LOWERCASE                = 14, /* reserved, was used in format version 3 */
    UPROPS_WAS_UPPERCASE                = 15,
    UPROPS_NONCHARACTER_CODE_POINT      = 16,
    UPROPS_GRAPHEME_EXTEND              = 17,
    UPROPS_GRAPHEME_LINK                = 18,
    UPROPS_IDS_BINARY_OPERATOR          = 19,
    UPROPS_IDS_TRINARY_OPERATOR         = 20,
    UPROPS_RADICAL                      = 21,
    UPROPS_UNIFIED_IDEOGRAPH            = 22,
    UPROPS_DEFAULT_IGNORABLE_CODE_POINT = 23,
    UPROPS_DEPRECATED                   = 24,
    UPROPS_WAS_SOFT_DOTTED              = 25, /* reserved, was used in format version 3 */
    UPROPS_LOGICAL_ORDER_EXCEPTION      = 26,
    UPROPS_XID_START                    = 27,
    UPROPS_XID_CONTINUE                 = 28,
    UPROPS_ID_START                     = 29, /* ICU 2.6, uprops format version 3.2 */
    UPROPS_ID_CONTINUE                  = 30,
    UPROPS_GRAPHEME_BASE                = 31,
    UPROPS_BINARY_1_TOP                 = 32  /* ==32 - full! */
};
|
165 |
|
/*
 * Bit fields of properties vector word 2:
 *
 *   bits 31..24   More binary properties
 *   bits 23..19   reserved
 *   bits 18..14   Sentence Break
 *   bits 13..10   Word Break
 *   bits  9.. 5   Grapheme Cluster Break
 *   bits  4.. 0   Decomposition Type
 */
#define UPROPS_SB_SHIFT   14
#define UPROPS_SB_MASK    0x0007c000

#define UPROPS_WB_SHIFT   10
#define UPROPS_WB_MASK    0x00003c00

#define UPROPS_GCB_SHIFT  5
#define UPROPS_GCB_MASK   0x000003e0

/* the decomposition type field starts at bit 0, so it has no shift constant */
#define UPROPS_DT_MASK    0x0000001f
|
186 |
|
/* Bit numbers of the binary properties stored in vector word 2. */
enum {
    UPROPS_V2_S_TERM              = 24, /* new in ICU 3.0 and Unicode 4.0.1 */
    UPROPS_V2_VARIATION_SELECTOR  = 25,
    UPROPS_V2_PATTERN_SYNTAX      = 26, /* new in ICU 3.4 and Unicode 4.1 */
    UPROPS_V2_PATTERN_WHITE_SPACE = 27,
    UPROPS_V2_TOP                 = 28  /* must be <=32 */
};
|
194 |
|
195 /** |
|
196 * Get a properties vector word for a code point. |
|
197 * Implemented in uchar.c for uprops.c. |
|
198 * column==-1 gets the 32-bit main properties word instead. |
|
199 * @return 0 if no data or illegal argument |
|
200 */ |
|
201 U_CFUNC uint32_t |
|
202 u_getUnicodeProperties(UChar32 c, int32_t column); |
|
203 |
|
204 /** |
|
205 * Get the the maximum values for some enum/int properties. |
|
206 * Use the same column numbers as for u_getUnicodeProperties(). |
|
207 * The returned value will contain maximum values stored in the same bit fields |
|
208 * as where the enum values are stored in the u_getUnicodeProperties() |
|
209 * return values for the same columns. |
|
210 * |
|
211 * Valid columns are those for properties words that contain enumerated values. |
|
212 * (ICU 2.6: columns 0 and 2) |
|
213 * For other column numbers, this function will return 0. |
|
214 * |
|
215 * @internal |
|
216 */ |
|
217 U_CFUNC int32_t |
|
218 uprv_getMaxValues(int32_t column); |
|
219 |
|
220 /** |
|
221 * Get the Hangul Syllable Type for c. |
|
222 * @internal |
|
223 */ |
|
224 U_CFUNC UHangulSyllableType |
|
225 uchar_getHST(UChar32 c); |
|
226 |
|
227 /** |
|
228 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. |
|
229 * @internal |
|
230 */ |
|
231 U_CFUNC UBool |
|
232 u_isalnumPOSIX(UChar32 c); |
|
233 |
|
234 /** |
|
235 * Checks if c is in |
|
236 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] |
|
237 * with space=\p{Whitespace} and Control=Cc. |
|
238 * Implements UCHAR_POSIX_GRAPH. |
|
239 * @internal |
|
240 */ |
|
241 U_CFUNC UBool |
|
242 u_isgraphPOSIX(UChar32 c); |
|
243 |
|
244 /** |
|
245 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. |
|
246 * Implements UCHAR_POSIX_PRINT. |
|
247 * @internal |
|
248 */ |
|
249 U_CFUNC UBool |
|
250 u_isprintPOSIX(UChar32 c); |
|
251 |
|
/** Convert a bit index n into a uint32_t with only bit n set. @internal */
#define FLAG(n) (((uint32_t)1) << (n))

/**
 * One flag per general category, listed in the order of UCharCategory.
 * The U_... category constants are declared outside this header.
 * @internal
 */
#define _Cn FLAG(U_GENERAL_OTHER_TYPES)
#define _Lu FLAG(U_UPPERCASE_LETTER)
#define _Ll FLAG(U_LOWERCASE_LETTER)
#define _Lt FLAG(U_TITLECASE_LETTER)
#define _Lm FLAG(U_MODIFIER_LETTER)
#define _Lo FLAG(U_OTHER_LETTER)
#define _Mn FLAG(U_NON_SPACING_MARK)
#define _Me FLAG(U_ENCLOSING_MARK)
#define _Mc FLAG(U_COMBINING_SPACING_MARK)
#define _Nd FLAG(U_DECIMAL_DIGIT_NUMBER)
#define _Nl FLAG(U_LETTER_NUMBER)
#define _No FLAG(U_OTHER_NUMBER)
#define _Zs FLAG(U_SPACE_SEPARATOR)
#define _Zl FLAG(U_LINE_SEPARATOR)
#define _Zp FLAG(U_PARAGRAPH_SEPARATOR)
#define _Cc FLAG(U_CONTROL_CHAR)
#define _Cf FLAG(U_FORMAT_CHAR)
#define _Co FLAG(U_PRIVATE_USE_CHAR)
#define _Cs FLAG(U_SURROGATE)
#define _Pd FLAG(U_DASH_PUNCTUATION)
#define _Ps FLAG(U_START_PUNCTUATION)
#define _Pe FLAG(U_END_PUNCTUATION)
#define _Pc FLAG(U_CONNECTOR_PUNCTUATION)
#define _Po FLAG(U_OTHER_PUNCTUATION)
#define _Sm FLAG(U_MATH_SYMBOL)
#define _Sc FLAG(U_CURRENCY_SYMBOL)
#define _Sk FLAG(U_MODIFIER_SYMBOL)
#define _So FLAG(U_OTHER_SYMBOL)
#define _Pi FLAG(U_INITIAL_PUNCTUATION)
#define _Pf FLAG(U_FINAL_PUNCTUATION)
|
286 |
|
/** Named constants for selected code points. @internal */
enum {
    /* C0/C1 controls and DEL */
    TAB      = 0x09,    /* CHARACTER TABULATION */
    LF       = 0x0a,    /* LINE FEED */
    FF       = 0x0c,    /* FORM FEED */
    CR       = 0x0d,    /* CARRIAGE RETURN */

    /* ASCII letters bounding the A-Z/a-z and hex-digit A-F/a-f ranges */
    U_A      = 0x41,
    U_F      = 0x46,
    U_Z      = 0x5a,
    U_a      = 0x61,
    U_f      = 0x66,
    U_z      = 0x7a,

    DEL      = 0x7f,    /* DELETE */
    NL       = 0x85,    /* NEXT LINE */
    NBSP     = 0xa0,    /* NO-BREAK SPACE */
    CGJ      = 0x34f,   /* COMBINING GRAPHEME JOINER */

    /* General Punctuation block */
    FIGURESP = 0x2007,  /* FIGURE SPACE */
    HAIRSP   = 0x200a,  /* HAIR SPACE */
    ZWNJ     = 0x200c,  /* ZERO WIDTH NON-JOINER */
    ZWJ      = 0x200d,  /* ZERO WIDTH JOINER */
    RLM      = 0x200f,  /* RIGHT-TO-LEFT MARK */
    NNBSP    = 0x202f,  /* NARROW NO-BREAK SPACE */
    WJ       = 0x2060,  /* WORD JOINER */
    INHSWAP  = 0x206a,  /* INHIBIT SYMMETRIC SWAPPING */
    NOMDIG   = 0x206f,  /* NOMINAL DIGIT SHAPES */

    /* fullwidth counterparts of the ASCII letters above */
    U_FW_A   = 0xff21,
    U_FW_F   = 0xff26,
    U_FW_Z   = 0xff3a,
    U_FW_a   = 0xff41,
    U_FW_f   = 0xff46,
    U_FW_z   = 0xff5a,

    ZWNBSP   = 0xfeff   /* ZERO WIDTH NO-BREAK SPACE */
};
|
320 |
|
321 /** |
|
322 * Get the maximum length of a (regular/1.0/extended) character name. |
|
323 * @return 0 if no character names available. |
|
324 */ |
|
325 U_CAPI int32_t U_EXPORT2 |
|
326 uprv_getMaxCharNameLength(void); |
|
327 |
|
#if 0
/*
 * Currently not used but left for future use. Probably by UnicodeSet.
 * urename.h and unames.c changed accordingly.
 */
/**
 * Get the maximum length of an ISO comment.
 * @return 0 if no ISO comments available.
 */
U_CAPI int32_t U_EXPORT2
uprv_getMaxISOCommentLength(void); /* (void): a real prototype, matching uprv_getMaxCharNameLength(void) */
#endif
|
340 |
|
341 /** |
|
342 * Fills set with characters that are used in Unicode character names. |
|
343 * Includes all characters that are used in regular/Unicode 1.0/extended names. |
|
344 * Just empties the set if no character names are available. |
|
345 * @param sa USetAdder to receive characters. |
|
346 */ |
|
347 U_CAPI void U_EXPORT2 |
|
348 uprv_getCharNameCharacters(const USetAdder *sa); |
|
349 |
|
#if 0
/*
 * Currently not used but left for future use. Probably by UnicodeSet.
 * urename.h and unames.c changed accordingly.
 */
/**
 * Fills set with characters that are used in ISO comments.
 * Just empties the set if no ISO comments are available.
 * @param sa USetAdder to receive characters.
 */
U_CAPI void U_EXPORT2
uprv_getISOCommentCharacters(const USetAdder *sa);
#endif
|
364 |
|
/**
 * Identifies which data and implementation files provide a property.
 * UnicodeSet uses these constants for service-specific property
 * enumeration.
 * @internal
 */
typedef enum UPropertySource {
    /** No source, not a supported property. */
    UPROPS_SRC_NONE              = 0,
    /** From uchar.c/uprops.icu main trie */
    UPROPS_SRC_CHAR              = 1,
    /** From uchar.c/uprops.icu properties vectors trie */
    UPROPS_SRC_PROPSVEC          = 2,
    /** Hangul_Syllable_Type, from uchar.c/uprops.icu */
    UPROPS_SRC_HST               = 3,
    /** From unames.c/unames.icu */
    UPROPS_SRC_NAMES             = 4,
    /** From unorm.cpp/unorm.icu */
    UPROPS_SRC_NORM              = 5,
    /** From ucase.c/ucase.icu */
    UPROPS_SRC_CASE              = 6,
    /** From ubidi_props.c/ubidi.icu */
    UPROPS_SRC_BIDI              = 7,
    /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
    UPROPS_SRC_CHAR_AND_PROPSVEC = 8,
    /** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
    UPROPS_SRC_COUNT             = 9
} UPropertySource;
|
393 |
|
394 /** |
|
395 * @see UPropertySource |
|
396 * @internal |
|
397 */ |
|
398 U_CAPI UPropertySource U_EXPORT2 |
|
399 uprops_getSource(UProperty which); |
|
400 |
|
401 /** |
|
402 * Enumerate uprops.icu's main data trie and add the |
|
403 * start of each range of same properties to the set. |
|
404 * @internal |
|
405 */ |
|
406 U_CAPI void U_EXPORT2 |
|
407 uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
|
408 |
|
409 /** |
|
410 * Enumerate uprops.icu's properties vectors trie and add the |
|
411 * start of each range of same properties to the set. |
|
412 * @internal |
|
413 */ |
|
414 U_CAPI void U_EXPORT2 |
|
415 upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
|
416 |
|
417 /** |
|
418 * Same as uchar_addPropertyStarts() but only for Hangul_Syllable_Type. |
|
419 * @internal |
|
420 */ |
|
421 U_CAPI void U_EXPORT2 |
|
422 uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
|
423 |
|
424 /** |
|
425 * Return a set of characters for property enumeration. |
|
426 * For each two consecutive characters (start, limit) in the set, |
|
427 * all of the properties for start..limit-1 are all the same. |
|
428 * |
|
429 * @param sa USetAdder to receive result. Existing contents are lost. |
|
430 * @internal |
|
431 */ |
|
432 U_CAPI void U_EXPORT2 |
|
433 uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode); |
|
434 |
|
435 /** |
|
436 * Swap the ICU Unicode properties file. See uchar.c. |
|
437 * @internal |
|
438 */ |
|
439 U_CAPI int32_t U_EXPORT2 |
|
440 uprops_swap(const UDataSwapper *ds, |
|
441 const void *inData, int32_t length, void *outData, |
|
442 UErrorCode *pErrorCode); |
|
443 |
|
444 /** |
|
445 * Swap the ICU Unicode character names file. See uchar.c. |
|
446 * @internal |
|
447 */ |
|
448 U_CAPI int32_t U_EXPORT2 |
|
449 uchar_swapNames(const UDataSwapper *ds, |
|
450 const void *inData, int32_t length, void *outData, |
|
451 UErrorCode *pErrorCode); |
|
452 |
|
453 #endif |