|
1 /* guniprop.c - Unicode character properties. |
|
2 * |
|
3 * Copyright (C) 1999 Tom Tromey |
|
4 * Copyright (C) 2000 Red Hat, Inc. |
|
5 * Portions copyright (c) 2006 Nokia Corporation. All rights reserved. |
|
6 * |
|
7 * This library is free software; you can redistribute it and/or |
|
8 * modify it under the terms of the GNU Lesser General Public |
|
9 * License as published by the Free Software Foundation; either |
|
10 * version 2 of the License, or (at your option) any later version. |
|
11 * |
|
12 * This library is distributed in the hope that it will be useful, |
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 * Lesser General Public License for more details. |
|
16 * |
|
17 * You should have received a copy of the GNU Lesser General Public |
|
18 * License along with this library; if not, write to the |
|
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
|
20 * Boston, MA 02111-1307, USA. |
|
21 */ |
|
22 |
|
23 #include "config.h" |
|
24 |
|
25 #include <stddef.h> |
|
26 #include <string.h> |
|
27 #include <locale.h> |
|
28 |
|
29 #include "glib.h" |
|
30 #include "gunichartables.h" |
|
31 #include "gmirroringtable.h" |
|
32 #include "gunicodeprivate.h" |
|
33 #include "galias.h" |
|
34 |
|
/* Select the attribute page table entry for @Page.  Pages up to and
 * including G_UNICODE_LAST_PAGE_PART1 are read from attr_table_part1;
 * all later pages are read from attr_table_part2, whose index is the
 * page number minus 0xe00.
 */
#define ATTR_TABLE(Page) \
  (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part1[Page] \
   : attr_table_part2[(Page) - 0xe00])

/* Attribute value stored for character @Char of page @Page.  Evaluates
 * to 0 when the page entry equals G_UNICODE_MAX_TABLE_INDEX, in which
 * case attr_data is not consulted at all.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) \
   ? 0 \
   : (attr_data[ATTR_TABLE(Page)][Char]))

/* Type lookup in the first page table.  A page entry that is at least
 * G_UNICODE_MAX_TABLE_INDEX encodes one type for the whole page (the
 * entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry is a row
 * index into type_data.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same as TTYPE_PART1, but for the second page table. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Unicode type of @Char.  Characters up to G_UNICODE_LAST_CHAR_PART1
 * use the first table, characters in [0xe0000, G_UNICODE_LAST_CHAR]
 * use the second one (rebased to 0xe0000), and everything else is
 * reported as G_UNICODE_UNASSIGNED.  Note that @Char is evaluated
 * several times, so it must not have side effects.
 */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))
|
58 |
|
59 |
|
/* Type-set helpers.  A type value T is represented by the bit (1 << T):
 * OR() adds a type to a set of such bits, IS() tests whether a type is
 * a member of a set (non-zero if it is).
 */
#define IS(Type, Class) (((guint)1 << (Type)) & (Class))
#define OR(Type, Rest)  (((guint)1 << (Type)) | (Rest))

/* Non-zero if @Type is any of the three number types (decimal, letter
 * or other number) - i.e. this is wider than "decimal digit".
 */
#define ISDIGIT(Type) \
  IS ((Type), \
      OR (G_UNICODE_DECIMAL_NUMBER, \
      OR (G_UNICODE_LETTER_NUMBER, \
      OR (G_UNICODE_OTHER_NUMBER, 0))))

/* Non-zero if @Type is one of the five letter types. */
#define ISALPHA(Type) \
  IS ((Type), \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0))))))

/* Non-zero if @Type is a number type or a letter type (the union of
 * the ISDIGIT and ISALPHA sets).
 */
#define ISALDIGIT(Type) \
  IS ((Type), \
      OR (G_UNICODE_DECIMAL_NUMBER, \
      OR (G_UNICODE_LETTER_NUMBER, \
      OR (G_UNICODE_OTHER_NUMBER, \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0)))))))))

/* Non-zero if @Type is a non-spacing, combining or enclosing mark. */
#define ISMARK(Type) \
  IS ((Type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_COMBINING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, 0))))
|
91 |
|
92 /** |
|
93 * g_unichar_isalnum: |
|
94 * @c: a Unicode character |
|
95 * |
|
96 * Determines whether a character is alphanumeric. |
|
97 * Given some UTF-8 text, obtain a character value |
|
98 * with g_utf8_get_char(). |
|
99 * |
|
100 * Return value: %TRUE if @c is an alphanumeric character |
|
101 **/ |
|
102 EXPORT_C gboolean |
|
103 g_unichar_isalnum (gunichar c) |
|
104 { |
|
105 return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; |
|
106 } |
|
107 |
|
108 /** |
|
109 * g_unichar_isalpha: |
|
110 * @c: a Unicode character |
|
111 * |
|
112 * Determines whether a character is alphabetic (i.e. a letter). |
|
113 * Given some UTF-8 text, obtain a character value with |
|
114 * g_utf8_get_char(). |
|
115 * |
|
116 * Return value: %TRUE if @c is an alphabetic character |
|
117 **/ |
|
118 EXPORT_C gboolean |
|
119 g_unichar_isalpha (gunichar c) |
|
120 { |
|
121 return ISALPHA (TYPE (c)) ? TRUE : FALSE; |
|
122 } |
|
123 |
|
124 |
|
125 /** |
|
126 * g_unichar_iscntrl: |
|
127 * @c: a Unicode character |
|
128 * |
|
129 * Determines whether a character is a control character. |
|
130 * Given some UTF-8 text, obtain a character value with |
|
131 * g_utf8_get_char(). |
|
132 * |
|
133 * Return value: %TRUE if @c is a control character |
|
134 **/ |
|
135 EXPORT_C gboolean |
|
136 g_unichar_iscntrl (gunichar c) |
|
137 { |
|
138 return TYPE (c) == G_UNICODE_CONTROL; |
|
139 } |
|
140 |
|
141 /** |
|
142 * g_unichar_isdigit: |
|
143 * @c: a Unicode character |
|
144 * |
|
145 * Determines whether a character is numeric (i.e. a digit). This |
|
146 * covers ASCII 0-9 and also digits in other languages/scripts. Given |
|
147 * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
|
148 * |
|
149 * Return value: %TRUE if @c is a digit |
|
150 **/ |
|
151 EXPORT_C gboolean |
|
152 g_unichar_isdigit (gunichar c) |
|
153 { |
|
154 return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
|
155 } |
|
156 |
|
157 |
|
158 /** |
|
159 * g_unichar_isgraph: |
|
160 * @c: a Unicode character |
|
161 * |
|
162 * Determines whether a character is printable and not a space |
|
163 * (returns %FALSE for control characters, format characters, and |
|
164 * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
|
165 * spaces. Given some UTF-8 text, obtain a character value with |
|
166 * g_utf8_get_char(). |
|
167 * |
|
168 * Return value: %TRUE if @c is printable unless it's a space |
|
169 **/ |
|
170 EXPORT_C gboolean |
|
171 g_unichar_isgraph (gunichar c) |
|
172 { |
|
173 return !IS (TYPE(c), |
|
174 OR (G_UNICODE_CONTROL, |
|
175 OR (G_UNICODE_FORMAT, |
|
176 OR (G_UNICODE_UNASSIGNED, |
|
177 OR (G_UNICODE_PRIVATE_USE, |
|
178 OR (G_UNICODE_SURROGATE, |
|
179 OR (G_UNICODE_SPACE_SEPARATOR, |
|
180 0))))))); |
|
181 } |
|
182 |
|
183 /** |
|
184 * g_unichar_islower: |
|
185 * @c: a Unicode character |
|
186 * |
|
187 * Determines whether a character is a lowercase letter. |
|
188 * Given some UTF-8 text, obtain a character value with |
|
189 * g_utf8_get_char(). |
|
190 * |
|
191 * Return value: %TRUE if @c is a lowercase letter |
|
192 **/ |
|
193 EXPORT_C gboolean |
|
194 g_unichar_islower (gunichar c) |
|
195 { |
|
196 return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
|
197 } |
|
198 |
|
199 |
|
200 /** |
|
201 * g_unichar_isprint: |
|
202 * @c: a Unicode character |
|
203 * |
|
204 * Determines whether a character is printable. |
|
205 * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
|
206 * Given some UTF-8 text, obtain a character value with |
|
207 * g_utf8_get_char(). |
|
208 * |
|
209 * Return value: %TRUE if @c is printable |
|
210 **/ |
|
211 EXPORT_C gboolean |
|
212 g_unichar_isprint (gunichar c) |
|
213 { |
|
214 return !IS (TYPE(c), |
|
215 OR (G_UNICODE_CONTROL, |
|
216 OR (G_UNICODE_FORMAT, |
|
217 OR (G_UNICODE_UNASSIGNED, |
|
218 OR (G_UNICODE_PRIVATE_USE, |
|
219 OR (G_UNICODE_SURROGATE, |
|
220 0)))))); |
|
221 } |
|
222 |
|
223 /** |
|
224 * g_unichar_ispunct: |
|
225 * @c: a Unicode character |
|
226 * |
|
227 * Determines whether a character is punctuation or a symbol. |
|
228 * Given some UTF-8 text, obtain a character value with |
|
229 * g_utf8_get_char(). |
|
230 * |
|
231 * Return value: %TRUE if @c is a punctuation or symbol character |
|
232 **/ |
|
233 EXPORT_C gboolean |
|
234 g_unichar_ispunct (gunichar c) |
|
235 { |
|
236 return IS (TYPE(c), |
|
237 OR (G_UNICODE_CONNECT_PUNCTUATION, |
|
238 OR (G_UNICODE_DASH_PUNCTUATION, |
|
239 OR (G_UNICODE_CLOSE_PUNCTUATION, |
|
240 OR (G_UNICODE_FINAL_PUNCTUATION, |
|
241 OR (G_UNICODE_INITIAL_PUNCTUATION, |
|
242 OR (G_UNICODE_OTHER_PUNCTUATION, |
|
243 OR (G_UNICODE_OPEN_PUNCTUATION, |
|
244 OR (G_UNICODE_CURRENCY_SYMBOL, |
|
245 OR (G_UNICODE_MODIFIER_SYMBOL, |
|
246 OR (G_UNICODE_MATH_SYMBOL, |
|
247 OR (G_UNICODE_OTHER_SYMBOL, |
|
248 0)))))))))))) ? TRUE : FALSE; |
|
249 } |
|
250 |
|
251 /** |
|
252 * g_unichar_isspace: |
|
253 * @c: a Unicode character |
|
254 * |
|
255 * Determines whether a character is a space, tab, or line separator |
|
256 * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
|
257 * character value with g_utf8_get_char(). |
|
258 * |
|
259 * (Note: don't use this to do word breaking; you have to use |
|
260 * Pango or equivalent to get word breaking right, the algorithm |
|
261 * is fairly complex.) |
|
262 * |
|
263 * Return value: %TRUE if @c is a space character |
|
264 **/ |
|
265 EXPORT_C gboolean |
|
266 g_unichar_isspace (gunichar c) |
|
267 { |
|
268 switch (c) |
|
269 { |
|
270 /* special-case these since Unicode thinks they are not spaces */ |
|
271 case '\t': |
|
272 case '\n': |
|
273 case '\r': |
|
274 case '\f': |
|
275 return TRUE; |
|
276 break; |
|
277 |
|
278 default: |
|
279 { |
|
280 return IS (TYPE(c), |
|
281 OR (G_UNICODE_SPACE_SEPARATOR, |
|
282 OR (G_UNICODE_LINE_SEPARATOR, |
|
283 OR (G_UNICODE_PARAGRAPH_SEPARATOR, |
|
284 0)))) ? TRUE : FALSE; |
|
285 } |
|
286 break; |
|
287 } |
|
288 } |
|
289 |
|
290 /** |
|
291 * g_unichar_isupper: |
|
292 * @c: a Unicode character |
|
293 * |
|
294 * Determines if a character is uppercase. |
|
295 * |
|
296 * Return value: %TRUE if @c is an uppercase character |
|
297 **/ |
|
298 EXPORT_C gboolean |
|
299 g_unichar_isupper (gunichar c) |
|
300 { |
|
301 return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
|
302 } |
|
303 |
|
304 /** |
|
305 * g_unichar_istitle: |
|
306 * @c: a Unicode character |
|
307 * |
|
308 * Determines if a character is titlecase. Some characters in |
|
309 * Unicode which are composites, such as the DZ digraph |
|
310 * have three case variants instead of just two. The titlecase |
|
311 * form is used at the beginning of a word where only the |
|
312 * first letter is capitalized. The titlecase form of the DZ |
|
313 * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z. |
|
314 * |
|
315 * Return value: %TRUE if the character is titlecase |
|
316 **/ |
|
317 EXPORT_C gboolean |
|
318 g_unichar_istitle (gunichar c) |
|
319 { |
|
320 unsigned int i; |
|
321 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
322 if (title_table[i][0] == c) |
|
323 return 1; |
|
324 return 0; |
|
325 } |
|
326 |
|
327 /** |
|
328 * g_unichar_isxdigit: |
|
329 * @c: a Unicode character. |
|
330 * |
|
331 * Determines if a character is a hexidecimal digit. |
|
332 * |
|
333 * Return value: %TRUE if the character is a hexadecimal digit |
|
334 **/ |
|
335 EXPORT_C gboolean |
|
336 g_unichar_isxdigit (gunichar c) |
|
337 { |
|
338 return ((c >= 'a' && c <= 'f') |
|
339 || (c >= 'A' && c <= 'F') |
|
340 || ISDIGIT (TYPE (c))); |
|
341 } |
|
342 |
|
343 /** |
|
344 * g_unichar_isdefined: |
|
345 * @c: a Unicode character |
|
346 * |
|
347 * Determines if a given character is assigned in the Unicode |
|
348 * standard. |
|
349 * |
|
350 * Return value: %TRUE if the character has an assigned value |
|
351 **/ |
|
352 EXPORT_C gboolean |
|
353 g_unichar_isdefined (gunichar c) |
|
354 { |
|
355 return TYPE (c) != G_UNICODE_UNASSIGNED; |
|
356 } |
|
357 |
|
358 /** |
|
359 * g_unichar_iswide: |
|
360 * @c: a Unicode character |
|
361 * |
|
362 * Determines if a character is typically rendered in a double-width |
|
363 * cell. |
|
364 * |
|
365 * Return value: %TRUE if the character is wide |
|
366 **/ |
|
367 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>. */ |
|
368 EXPORT_C gboolean |
|
369 g_unichar_iswide (gunichar c) |
|
370 { |
|
371 if (c < 0x1100) |
|
372 return FALSE; |
|
373 |
|
374 return (c <= 0x115f /* Hangul Jamo init. consonants */ |
|
375 || c == 0x2329 || c == 0x232a /* angle brackets */ |
|
376 || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f) |
|
377 && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */ |
|
378 || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */ |
|
379 || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility Ideographs */ |
|
380 || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */ |
|
381 || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */ |
|
382 || (c >= 0xffe0 && c <= 0xffe6) /* Fullwidth Forms */ |
|
383 || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */ |
|
384 || (c >= 0x30000 && c <= 0x3fffd)); |
|
385 } |
|
386 |
|
387 /** |
|
388 * g_unichar_toupper: |
|
389 * @c: a Unicode character |
|
390 * |
|
391 * Converts a character to uppercase. |
|
392 * |
|
393 * Return value: the result of converting @c to uppercase. |
|
394 * If @c is not an lowercase or titlecase character, |
|
395 * or has no upper case equivalent @c is returned unchanged. |
|
396 **/ |
|
397 EXPORT_C gunichar |
|
398 g_unichar_toupper (gunichar c) |
|
399 { |
|
400 int t = TYPE (c); |
|
401 if (t == G_UNICODE_LOWERCASE_LETTER) |
|
402 { |
|
403 gunichar val = ATTTABLE (c >> 8, c & 0xff); |
|
404 if (val >= 0x1000000) |
|
405 { |
|
406 const gchar *p = special_case_table + val - 0x1000000; |
|
407 return g_utf8_get_char (p); |
|
408 } |
|
409 else |
|
410 return val ? val : c; |
|
411 } |
|
412 else if (t == G_UNICODE_TITLECASE_LETTER) |
|
413 { |
|
414 unsigned int i; |
|
415 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
416 { |
|
417 if (title_table[i][0] == c) |
|
418 return title_table[i][1]; |
|
419 } |
|
420 } |
|
421 return c; |
|
422 } |
|
423 |
|
424 /** |
|
425 * g_unichar_tolower: |
|
426 * @c: a Unicode character. |
|
427 * |
|
428 * Converts a character to lower case. |
|
429 * |
|
430 * Return value: the result of converting @c to lower case. |
|
431 * If @c is not an upperlower or titlecase character, |
|
432 * or has no lowercase equivalent @c is returned unchanged. |
|
433 **/ |
|
434 EXPORT_C gunichar |
|
435 g_unichar_tolower (gunichar c) |
|
436 { |
|
437 int t = TYPE (c); |
|
438 if (t == G_UNICODE_UPPERCASE_LETTER) |
|
439 { |
|
440 gunichar val = ATTTABLE (c >> 8, c & 0xff); |
|
441 if (val >= 0x1000000) |
|
442 { |
|
443 const gchar *p = special_case_table + val - 0x1000000; |
|
444 return g_utf8_get_char (p); |
|
445 } |
|
446 else |
|
447 return val ? val : c; |
|
448 } |
|
449 else if (t == G_UNICODE_TITLECASE_LETTER) |
|
450 { |
|
451 unsigned int i; |
|
452 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
453 { |
|
454 if (title_table[i][0] == c) |
|
455 return title_table[i][2]; |
|
456 } |
|
457 } |
|
458 return c; |
|
459 } |
|
460 |
|
461 /** |
|
462 * g_unichar_totitle: |
|
463 * @c: a Unicode character |
|
464 * |
|
465 * Converts a character to the titlecase. |
|
466 * |
|
467 * Return value: the result of converting @c to titlecase. |
|
468 * If @c is not an uppercase or lowercase character, |
|
469 * @c is returned unchanged. |
|
470 **/ |
|
471 EXPORT_C gunichar |
|
472 g_unichar_totitle (gunichar c) |
|
473 { |
|
474 unsigned int i; |
|
475 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
476 { |
|
477 if (title_table[i][0] == c || title_table[i][1] == c |
|
478 || title_table[i][2] == c) |
|
479 return title_table[i][0]; |
|
480 } |
|
481 return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER |
|
482 ? ATTTABLE (c >> 8, c & 0xff) |
|
483 : c); |
|
484 } |
|
485 |
|
486 /** |
|
487 * g_unichar_digit_value: |
|
488 * @c: a Unicode character |
|
489 * |
|
490 * Determines the numeric value of a character as a decimal |
|
491 * digit. |
|
492 * |
|
493 * Return value: If @c is a decimal digit (according to |
|
494 * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
|
495 **/ |
|
496 EXPORT_C int |
|
497 g_unichar_digit_value (gunichar c) |
|
498 { |
|
499 if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
|
500 return ATTTABLE (c >> 8, c & 0xff); |
|
501 return -1; |
|
502 } |
|
503 |
|
504 /** |
|
505 * g_unichar_xdigit_value: |
|
506 * @c: a Unicode character |
|
507 * |
|
508 * Determines the numeric value of a character as a hexidecimal |
|
509 * digit. |
|
510 * |
|
511 * Return value: If @c is a hex digit (according to |
|
512 * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
|
513 **/ |
|
514 EXPORT_C int |
|
515 g_unichar_xdigit_value (gunichar c) |
|
516 { |
|
517 if (c >= 'A' && c <= 'F') |
|
518 return c - 'A' + 10; |
|
519 if (c >= 'a' && c <= 'f') |
|
520 return c - 'a' + 10; |
|
521 if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
|
522 return ATTTABLE (c >> 8, c & 0xff); |
|
523 return -1; |
|
524 } |
|
525 |
|
526 /** |
|
527 * g_unichar_type: |
|
528 * @c: a Unicode character |
|
529 * |
|
530 * Classifies a Unicode character by type. |
|
531 * |
|
532 * Return value: the type of the character. |
|
533 **/ |
|
534 EXPORT_C GUnicodeType |
|
535 g_unichar_type (gunichar c) |
|
536 { |
|
537 return TYPE (c); |
|
538 } |
|
539 |
|
/*
 * Case mapping functions
 */

/* Locale classes that need language-specific case mapping rules. */
typedef enum {
  LOCALE_NORMAL,
  LOCALE_TURKIC,
  LOCALE_LITHUANIAN
} LocaleType;

/*
 * get_locale_type:
 *
 * Classifies the current locale by the first two characters of its
 * name: "az" and "tr" are Turkic, "lt" is Lithuanian, anything else
 * (including an unavailable or empty locale name) is normal.  On
 * Windows the name comes from g_win32_getlocale(), elsewhere from
 * querying the LC_CTYPE category with setlocale().
 *
 * Return value: the locale class to use for case mapping
 */
static LocaleType
get_locale_type (void)
{
#ifdef G_OS_WIN32
  char *tem = g_win32_getlocale ();
  char locale[2];

  locale[0] = tem[0];
  /* Don't read past the terminator of an empty name. */
  locale[1] = tem[0] ? tem[1] : '\0';
  g_free (tem);
#else
  const char *locale = setlocale (LC_CTYPE, NULL);

  /* setlocale() is allowed to return NULL; treat that as "no special
   * rules" instead of dereferencing it.
   */
  if (locale == NULL)
    return LOCALE_NORMAL;
#endif

  switch (locale[0])
    {
    case 'a':
      if (locale[1] == 'z')
        return LOCALE_TURKIC;
      break;
    case 'l':
      if (locale[1] == 't')
        return LOCALE_LITHUANIAN;
      break;
    case 't':
      if (locale[1] == 'r')
        return LOCALE_TURKIC;
      break;
    }

  return LOCALE_NORMAL;
}
|
582 |
|
583 static gint |
|
584 output_marks (const char **p_inout, |
|
585 char *out_buffer, |
|
586 gboolean remove_dot) |
|
587 { |
|
588 const char *p = *p_inout; |
|
589 gint len = 0; |
|
590 |
|
591 while (*p) |
|
592 { |
|
593 gunichar c = g_utf8_get_char (p); |
|
594 |
|
595 if (ISMARK (TYPE (c))) |
|
596 { |
|
597 if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
|
598 len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); |
|
599 p = g_utf8_next_char (p); |
|
600 } |
|
601 else |
|
602 break; |
|
603 } |
|
604 |
|
605 *p_inout = p; |
|
606 return len; |
|
607 } |
|
608 |
|
609 static gint |
|
610 output_special_case (gchar *out_buffer, |
|
611 int offset, |
|
612 int type, |
|
613 int which) |
|
614 { |
|
615 const gchar *p = special_case_table + offset; |
|
616 gint len; |
|
617 |
|
618 if (type != G_UNICODE_TITLECASE_LETTER) |
|
619 p = g_utf8_next_char (p); |
|
620 |
|
621 if (which == 1) |
|
622 p += strlen (p) + 1; |
|
623 |
|
624 len = strlen (p); |
|
625 if (out_buffer) |
|
626 memcpy (out_buffer, p, len); |
|
627 |
|
628 return len; |
|
629 } |
|
630 |
|
631 static gsize |
|
632 real_toupper (const gchar *str, |
|
633 gssize max_len, |
|
634 gchar *out_buffer, |
|
635 LocaleType locale_type) |
|
636 { |
|
637 const gchar *p = str; |
|
638 const char *last = NULL; |
|
639 gsize len = 0; |
|
640 gboolean last_was_i = FALSE; |
|
641 |
|
642 while ((max_len < 0 || p < str + max_len) && *p) |
|
643 { |
|
644 gunichar c = g_utf8_get_char (p); |
|
645 int t = TYPE (c); |
|
646 gunichar val; |
|
647 |
|
648 last = p; |
|
649 p = g_utf8_next_char (p); |
|
650 |
|
651 if (locale_type == LOCALE_LITHUANIAN) |
|
652 { |
|
653 if (c == 'i') |
|
654 last_was_i = TRUE; |
|
655 else |
|
656 { |
|
657 if (last_was_i) |
|
658 { |
|
659 /* Nasty, need to remove any dot above. Though |
|
660 * I think only E WITH DOT ABOVE occurs in practice |
|
661 * which could simplify this considerably. |
|
662 */ |
|
663 gsize decomp_len, i; |
|
664 gunichar *decomp; |
|
665 |
|
666 decomp = g_unicode_canonical_decomposition (c, &decomp_len); |
|
667 for (i=0; i < decomp_len; i++) |
|
668 { |
|
669 if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
|
670 len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); |
|
671 } |
|
672 g_free (decomp); |
|
673 |
|
674 len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); |
|
675 |
|
676 continue; |
|
677 } |
|
678 |
|
679 if (!ISMARK (t)) |
|
680 last_was_i = FALSE; |
|
681 } |
|
682 } |
|
683 |
|
684 if (locale_type == LOCALE_TURKIC && c == 'i') |
|
685 { |
|
686 /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
|
687 len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); |
|
688 } |
|
689 else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
|
690 { |
|
691 /* Nasty, need to move it after other combining marks .. this would go away if |
|
692 * we normalized first. |
|
693 */ |
|
694 len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); |
|
695 |
|
696 /* And output as GREEK CAPITAL LETTER IOTA */ |
|
697 len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); |
|
698 } |
|
699 else if (IS (t, |
|
700 OR (G_UNICODE_LOWERCASE_LETTER, |
|
701 OR (G_UNICODE_TITLECASE_LETTER, |
|
702 0)))) |
|
703 { |
|
704 val = ATTTABLE (c >> 8, c & 0xff); |
|
705 |
|
706 if (val >= 0x1000000) |
|
707 { |
|
708 len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, |
|
709 t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
|
710 } |
|
711 else |
|
712 { |
|
713 if (t == G_UNICODE_TITLECASE_LETTER) |
|
714 { |
|
715 unsigned int i; |
|
716 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
717 { |
|
718 if (title_table[i][0] == c) |
|
719 val = title_table[i][1]; |
|
720 } |
|
721 } |
|
722 |
|
723 len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
|
724 } |
|
725 } |
|
726 else |
|
727 { |
|
728 gsize char_len = g_utf8_skip[*(guchar *)last]; |
|
729 |
|
730 if (out_buffer) |
|
731 memcpy (out_buffer + len, last, char_len); |
|
732 |
|
733 len += char_len; |
|
734 } |
|
735 |
|
736 } |
|
737 |
|
738 return len; |
|
739 } |
|
740 |
|
741 /** |
|
742 * g_utf8_strup: |
|
743 * @str: a UTF-8 encoded string |
|
744 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
|
745 * |
|
746 * Converts all Unicode characters in the string that have a case |
|
747 * to uppercase. The exact manner that this is done depends |
|
748 * on the current locale, and may result in the number of |
|
749 * characters in the string increasing. (For instance, the |
|
750 * German ess-zet will be changed to SS.) |
|
751 * |
|
752 * Return value: a newly allocated string, with all characters |
|
753 * converted to uppercase. |
|
754 **/ |
|
755 EXPORT_C gchar * |
|
756 g_utf8_strup (const gchar *str, |
|
757 gssize len) |
|
758 { |
|
759 gsize result_len; |
|
760 LocaleType locale_type; |
|
761 gchar *result; |
|
762 |
|
763 g_return_val_if_fail (str != NULL, NULL); |
|
764 |
|
765 locale_type = get_locale_type (); |
|
766 |
|
767 /* |
|
768 * We use a two pass approach to keep memory management simple |
|
769 */ |
|
770 result_len = real_toupper (str, len, NULL, locale_type); |
|
771 result = g_malloc (result_len + 1); |
|
772 real_toupper (str, len, result, locale_type); |
|
773 result[result_len] = '\0'; |
|
774 |
|
775 return result; |
|
776 } |
|
777 |
|
778 /* traverses the string checking for characters with combining class == 230 |
|
779 * until a base character is found */ |
|
780 static gboolean |
|
781 has_more_above (const gchar *str) |
|
782 { |
|
783 const gchar *p = str; |
|
784 gint combining_class; |
|
785 |
|
786 while (*p) |
|
787 { |
|
788 combining_class = _g_unichar_combining_class (g_utf8_get_char (p)); |
|
789 if (combining_class == 230) |
|
790 return TRUE; |
|
791 else if (combining_class == 0) |
|
792 break; |
|
793 |
|
794 p = g_utf8_next_char (p); |
|
795 } |
|
796 |
|
797 return FALSE; |
|
798 } |
|
799 |
|
800 static gsize |
|
801 real_tolower (const gchar *str, |
|
802 gssize max_len, |
|
803 gchar *out_buffer, |
|
804 LocaleType locale_type) |
|
805 { |
|
806 const gchar *p = str; |
|
807 const char *last = NULL; |
|
808 gsize len = 0; |
|
809 |
|
810 while ((max_len < 0 || p < str + max_len) && *p) |
|
811 { |
|
812 gunichar c = g_utf8_get_char (p); |
|
813 int t = TYPE (c); |
|
814 gunichar val; |
|
815 |
|
816 last = p; |
|
817 p = g_utf8_next_char (p); |
|
818 |
|
819 if (locale_type == LOCALE_TURKIC && c == 'I') |
|
820 { |
|
821 if (g_utf8_get_char (p) == 0x0307) |
|
822 { |
|
823 /* I + COMBINING DOT ABOVE => i (U+0069) */ |
|
824 len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
|
825 p = g_utf8_next_char (p); |
|
826 } |
|
827 else |
|
828 { |
|
829 /* I => LATIN SMALL LETTER DOTLESS I */ |
|
830 len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); |
|
831 } |
|
832 } |
|
833 /* Introduce an explicit dot above when lowercasing capital I's and J's |
|
834 * whenever there are more accents above. [SpecialCasing.txt] */ |
|
835 else if (locale_type == LOCALE_LITHUANIAN && |
|
836 (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
|
837 { |
|
838 len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
|
839 len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
|
840 |
|
841 switch (c) |
|
842 { |
|
843 case 0x00cc: |
|
844 len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); |
|
845 break; |
|
846 case 0x00cd: |
|
847 len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); |
|
848 break; |
|
849 case 0x0128: |
|
850 len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); |
|
851 break; |
|
852 } |
|
853 } |
|
854 else if (locale_type == LOCALE_LITHUANIAN && |
|
855 (c == 'I' || c == 'J' || c == 0x012e) && |
|
856 has_more_above (p)) |
|
857 { |
|
858 len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); |
|
859 len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
|
860 } |
|
861 else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
|
862 { |
|
863 if ((max_len < 0 || p < str + max_len) && *p) |
|
864 { |
|
865 gunichar next_c = g_utf8_get_char (p); |
|
866 int next_type = TYPE(next_c); |
|
867 |
|
868 /* SIGMA mapps differently depending on whether it is |
|
869 * final or not. The following simplified test would |
|
870 * fail in the case of combining marks following the |
|
871 * sigma, but I don't think that occurs in real text. |
|
872 * The test here matches that in ICU. |
|
873 */ |
|
874 if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
|
875 val = 0x3c3; /* GREEK SMALL SIGMA */ |
|
876 else |
|
877 val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
|
878 } |
|
879 else |
|
880 val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
|
881 |
|
882 len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
|
883 } |
|
884 else if (IS (t, |
|
885 OR (G_UNICODE_UPPERCASE_LETTER, |
|
886 OR (G_UNICODE_TITLECASE_LETTER, |
|
887 0)))) |
|
888 { |
|
889 val = ATTTABLE (c >> 8, c & 0xff); |
|
890 |
|
891 if (val >= 0x1000000) |
|
892 { |
|
893 len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); |
|
894 } |
|
895 else |
|
896 { |
|
897 if (t == G_UNICODE_TITLECASE_LETTER) |
|
898 { |
|
899 unsigned int i; |
|
900 for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
|
901 { |
|
902 if (title_table[i][0] == c) |
|
903 val = title_table[i][2]; |
|
904 } |
|
905 } |
|
906 |
|
907 len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
|
908 } |
|
909 } |
|
910 else |
|
911 { |
|
912 gsize char_len = g_utf8_skip[*(guchar *)last]; |
|
913 |
|
914 if (out_buffer) |
|
915 memcpy (out_buffer + len, last, char_len); |
|
916 |
|
917 len += char_len; |
|
918 } |
|
919 |
|
920 } |
|
921 |
|
922 return len; |
|
923 } |
|
924 |
|
925 /** |
|
926 * g_utf8_strdown: |
|
927 * @str: a UTF-8 encoded string |
|
928 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
|
929 * |
|
930 * Converts all Unicode characters in the string that have a case |
|
931 * to lowercase. The exact manner that this is done depends |
|
932 * on the current locale, and may result in the number of |
|
933 * characters in the string changing. |
|
934 * |
|
935 * Return value: a newly allocated string, with all characters |
|
936 * converted to lowercase. |
|
937 **/ |
|
938 EXPORT_C gchar * |
|
939 g_utf8_strdown (const gchar *str, |
|
940 gssize len) |
|
941 { |
|
942 gsize result_len; |
|
943 LocaleType locale_type; |
|
944 gchar *result; |
|
945 |
|
946 g_return_val_if_fail (str != NULL, NULL); |
|
947 |
|
948 locale_type = get_locale_type (); |
|
949 |
|
950 /* |
|
951 * We use a two pass approach to keep memory management simple |
|
952 */ |
|
953 result_len = real_tolower (str, len, NULL, locale_type); |
|
954 result = g_malloc (result_len + 1); |
|
955 real_tolower (str, len, result, locale_type); |
|
956 result[result_len] = '\0'; |
|
957 |
|
958 return result; |
|
959 } |
|
960 |
|
961 /** |
|
962 * g_utf8_casefold: |
|
963 * @str: a UTF-8 encoded string |
|
964 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
|
965 * |
|
966 * Converts a string into a form that is independent of case. The |
|
967 * result will not correspond to any particular case, but can be |
|
968 * compared for equality or ordered with the results of calling |
|
969 * g_utf8_casefold() on other strings. |
|
970 * |
|
971 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
|
972 * only an approximation to the correct linguistic case insensitive |
|
973 * ordering, though it is a fairly good one. Getting this exactly |
|
974 * right would require a more sophisticated collation function that |
|
975 * takes case sensitivity into account. GLib does not currently |
|
976 * provide such a function. |
|
977 * |
|
978 * Return value: a newly allocated string, that is a |
|
979 * case independent form of @str. |
|
980 **/ |
|
981 EXPORT_C gchar * |
|
982 g_utf8_casefold (const gchar *str, |
|
983 gssize len) |
|
984 { |
|
985 GString *result; |
|
986 const char *p; |
|
987 |
|
988 g_return_val_if_fail (str != NULL, NULL); |
|
989 |
|
990 result = g_string_new (NULL); |
|
991 p = str; |
|
992 while ((len < 0 || p < str + len) && *p) |
|
993 { |
|
994 gunichar ch = g_utf8_get_char (p); |
|
995 |
|
996 int start = 0; |
|
997 int end = G_N_ELEMENTS (casefold_table); |
|
998 |
|
999 if (ch >= casefold_table[start].ch && |
|
1000 ch <= casefold_table[end - 1].ch) |
|
1001 { |
|
1002 while (TRUE) |
|
1003 { |
|
1004 int half = (start + end) / 2; |
|
1005 if (ch == casefold_table[half].ch) |
|
1006 { |
|
1007 g_string_append (result, casefold_table[half].data); |
|
1008 goto next; |
|
1009 } |
|
1010 else if (half == start) |
|
1011 break; |
|
1012 else if (ch > casefold_table[half].ch) |
|
1013 start = half; |
|
1014 else |
|
1015 end = half; |
|
1016 } |
|
1017 } |
|
1018 |
|
1019 g_string_append_unichar (result, g_unichar_tolower (ch)); |
|
1020 |
|
1021 next: |
|
1022 p = g_utf8_next_char (p); |
|
1023 } |
|
1024 |
|
1025 return g_string_free (result, FALSE); |
|
1026 } |
|
1027 |
|
1028 /** |
|
1029 * g_unichar_get_mirror_char: |
|
1030 * @ch: a Unicode character |
|
1031 * @mirrored_ch: location to store the mirrored character |
|
1032 * |
|
1033 * In Unicode, some characters are <firstterm>mirrored</firstterm>. This |
|
1034 * means that their images are mirrored horizontally in text that is laid |
|
1035 * out from right to left. For instance, "(" would become its mirror image, |
|
1036 * ")", in right-to-left text. |
|
1037 * |
|
1038 * If @ch has the Unicode mirrored property and there is another unicode |
|
1039 * character that typically has a glyph that is the mirror image of @ch's |
|
1040 * glyph and @mirrored_ch is set, it puts that character in the address |
|
1041 * pointed to by @mirrored_ch. Otherwise the original character is put. |
|
1042 * |
|
1043 * Return value: %TRUE if @ch has a mirrored character, %FALSE otherwise |
|
1044 * |
|
1045 * Since: 2.4 |
|
1046 **/ |
|
1047 EXPORT_C gboolean |
|
1048 g_unichar_get_mirror_char (gunichar ch, |
|
1049 gunichar *mirrored_ch) |
|
1050 { |
|
1051 gboolean found; |
|
1052 gunichar mirrored; |
|
1053 |
|
1054 mirrored = GLIB_GET_MIRRORING(ch); |
|
1055 |
|
1056 found = ch != mirrored; |
|
1057 if (mirrored_ch) |
|
1058 *mirrored_ch = mirrored; |
|
1059 |
|
1060 return found; |
|
1061 |
|
1062 } |
|
1063 |
|
1064 #define __G_UNIPROP_C__ |
|
1065 #include "galiasdef.c" |