/* gunicode.h - Unicode manipulation functions
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000, 2005 Red Hat, Inc.
 *  Portions copyright (c) 2006 Nokia Corporation.  All rights reserved.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */
|
22 |
|
23 #ifndef __G_UNICODE_H__ |
|
24 #define __G_UNICODE_H__ |
|
25 |
|
26 #include <_ansi.h> |
|
27 #include <glib/gerror.h> |
|
28 #include <glib/gtypes.h> |
|
29 |
|
30 G_BEGIN_DECLS |
|
31 |
|
32 typedef guint32 gunichar; |
|
33 typedef guint16 gunichar2; |
|
34 |
|
/* These are the possible character classifications.
 * See http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively from 0 in declaration order; do not reorder them.
 */
typedef enum
{
  G_UNICODE_CONTROL,
  G_UNICODE_FORMAT,
  G_UNICODE_UNASSIGNED,
  G_UNICODE_PRIVATE_USE,
  G_UNICODE_SURROGATE,
  G_UNICODE_LOWERCASE_LETTER,
  G_UNICODE_MODIFIER_LETTER,
  G_UNICODE_OTHER_LETTER,
  G_UNICODE_TITLECASE_LETTER,
  G_UNICODE_UPPERCASE_LETTER,
  G_UNICODE_COMBINING_MARK,
  G_UNICODE_ENCLOSING_MARK,
  G_UNICODE_NON_SPACING_MARK,
  G_UNICODE_DECIMAL_NUMBER,
  G_UNICODE_LETTER_NUMBER,
  G_UNICODE_OTHER_NUMBER,
  G_UNICODE_CONNECT_PUNCTUATION,
  G_UNICODE_DASH_PUNCTUATION,
  G_UNICODE_CLOSE_PUNCTUATION,
  G_UNICODE_FINAL_PUNCTUATION,
  G_UNICODE_INITIAL_PUNCTUATION,
  G_UNICODE_OTHER_PUNCTUATION,
  G_UNICODE_OPEN_PUNCTUATION,
  G_UNICODE_CURRENCY_SYMBOL,
  G_UNICODE_MODIFIER_SYMBOL,
  G_UNICODE_MATH_SYMBOL,
  G_UNICODE_OTHER_SYMBOL,
  G_UNICODE_LINE_SEPARATOR,
  G_UNICODE_PARAGRAPH_SEPARATOR,
  G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;
|
71 |
|
/* These are the possible line break classifications.
 * Note that new types may be added in the future.
 * Implementations may treat values they do not recognize like
 * G_UNICODE_BREAK_UNKNOWN.
 * See http://www.unicode.org/unicode/reports/tr14/
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively from 0 in declaration order; do not reorder them.
 */
typedef enum
{
  G_UNICODE_BREAK_MANDATORY,
  G_UNICODE_BREAK_CARRIAGE_RETURN,
  G_UNICODE_BREAK_LINE_FEED,
  G_UNICODE_BREAK_COMBINING_MARK,
  G_UNICODE_BREAK_SURROGATE,
  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
  G_UNICODE_BREAK_INSEPARABLE,
  G_UNICODE_BREAK_NON_BREAKING_GLUE,
  G_UNICODE_BREAK_CONTINGENT,
  G_UNICODE_BREAK_SPACE,
  G_UNICODE_BREAK_AFTER,
  G_UNICODE_BREAK_BEFORE,
  G_UNICODE_BREAK_BEFORE_AND_AFTER,
  G_UNICODE_BREAK_HYPHEN,
  G_UNICODE_BREAK_NON_STARTER,
  G_UNICODE_BREAK_OPEN_PUNCTUATION,
  G_UNICODE_BREAK_CLOSE_PUNCTUATION,
  G_UNICODE_BREAK_QUOTATION,
  G_UNICODE_BREAK_EXCLAMATION,
  G_UNICODE_BREAK_IDEOGRAPHIC,
  G_UNICODE_BREAK_NUMERIC,
  G_UNICODE_BREAK_INFIX_SEPARATOR,
  G_UNICODE_BREAK_SYMBOL,
  G_UNICODE_BREAK_ALPHABETIC,
  G_UNICODE_BREAK_PREFIX,
  G_UNICODE_BREAK_POSTFIX,
  G_UNICODE_BREAK_COMPLEX_CONTEXT,
  G_UNICODE_BREAK_AMBIGUOUS,
  G_UNICODE_BREAK_UNKNOWN,
  G_UNICODE_BREAK_NEXT_LINE,
  G_UNICODE_BREAK_WORD_JOINER,
  G_UNICODE_BREAK_HANGUL_L_JAMO,
  G_UNICODE_BREAK_HANGUL_V_JAMO,
  G_UNICODE_BREAK_HANGUL_T_JAMO,
  G_UNICODE_BREAK_HANGUL_LV_SYLLABLE,
  G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE
} GUnicodeBreakType;
|
116 |
|
117 /* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is |
|
118 * not null, sets *CHARSET to the name of the current locale's |
|
119 * charset. This value is statically allocated, and should be copied |
|
120 * in case the locale's charset will be changed later using setlocale() |
|
121 * or in some other way. |
|
122 */ |
|
123 IMPORT_C gboolean g_get_charset (G_CONST_RETURN char **charset); |
|
124 |
|
125 /* These are all analogs of the <ctype.h> functions. |
|
126 */ |
|
127 IMPORT_C gboolean g_unichar_isalnum (gunichar c) G_GNUC_CONST; |
|
128 IMPORT_C gboolean g_unichar_isalpha (gunichar c) G_GNUC_CONST; |
|
129 IMPORT_C gboolean g_unichar_iscntrl (gunichar c) G_GNUC_CONST; |
|
130 IMPORT_C gboolean g_unichar_isdigit (gunichar c) G_GNUC_CONST; |
|
131 IMPORT_C gboolean g_unichar_isgraph (gunichar c) G_GNUC_CONST; |
|
132 IMPORT_C gboolean g_unichar_islower (gunichar c) G_GNUC_CONST; |
|
133 IMPORT_C gboolean g_unichar_isprint (gunichar c) G_GNUC_CONST; |
|
134 IMPORT_C gboolean g_unichar_ispunct (gunichar c) G_GNUC_CONST; |
|
135 IMPORT_C gboolean g_unichar_isspace (gunichar c) G_GNUC_CONST; |
|
136 IMPORT_C gboolean g_unichar_isupper (gunichar c) G_GNUC_CONST; |
|
137 IMPORT_C gboolean g_unichar_isxdigit (gunichar c) G_GNUC_CONST; |
|
138 IMPORT_C gboolean g_unichar_istitle (gunichar c) G_GNUC_CONST; |
|
139 IMPORT_C gboolean g_unichar_isdefined (gunichar c) G_GNUC_CONST; |
|
140 IMPORT_C gboolean g_unichar_iswide (gunichar c) G_GNUC_CONST; |
|
141 |
|
142 /* More <ctype.h> functions. These convert between the three cases. |
|
143 * See the Unicode book to understand title case. */ |
|
144 IMPORT_C gunichar g_unichar_toupper (gunichar c) G_GNUC_CONST; |
|
145 IMPORT_C gunichar g_unichar_tolower (gunichar c) G_GNUC_CONST; |
|
146 IMPORT_C gunichar g_unichar_totitle (gunichar c) G_GNUC_CONST; |
|
147 |
|
148 /* If C is a digit (according to `g_unichar_isdigit'), then return its |
|
149 numeric value. Otherwise return -1. */ |
|
150 IMPORT_C gint g_unichar_digit_value (gunichar c) G_GNUC_CONST; |
|
151 |
|
152 IMPORT_C gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST; |
|
153 |
|
154 /* Return the Unicode character type of a given character. */ |
|
155 IMPORT_C GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST; |
|
156 |
|
157 /* Return the line break property for a given character */ |
|
158 IMPORT_C GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST; |
|
159 |
|
160 |
|
161 /* Compute canonical ordering of a string in-place. This rearranges |
|
162 decomposed characters in the string according to their combining |
|
163 classes. See the Unicode manual for more information. */ |
|
164 IMPORT_C void g_unicode_canonical_ordering (gunichar *string, |
|
165 gsize len); |
|
166 |
|
167 /* Compute canonical decomposition of a character. Returns g_malloc()d |
|
168 string of Unicode characters. RESULT_LEN is set to the resulting |
|
169 length of the string. */ |
|
170 IMPORT_C gunichar *g_unicode_canonical_decomposition (gunichar ch, |
|
171 gsize *result_len) G_GNUC_MALLOC; |
|
172 |
|
173 /* Array of skip-bytes-per-initial character. |
|
174 */ |
|
175 #ifdef __SYMBIAN32__ |
|
176 IMPORT_C const gchar * const * _g_utf8_skip(); |
|
177 #endif /* __SYMBIAN32__ */ |
|
178 GLIB_VAR const gchar * const g_utf8_skip; |
|
179 |
|
/* Yield a char * pointing just past the character that P points at, by
 * adding the g_utf8_skip entry for the byte at P.
 *
 * P is evaluated twice, so it must not have side effects.  The expansion
 * is wrapped in parentheses so that a postfix operator applied to the
 * result (e.g. g_utf8_next_char (p)[0]) acts on the char * value rather
 * than binding before the cast.  The byte is read through a
 * const-qualified pointer so that the lookup does not cast away const.
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(const guchar *)(p)]))
|
181 |
|
182 IMPORT_C gunichar g_utf8_get_char (const gchar *p); |
|
183 IMPORT_C gunichar g_utf8_get_char_validated (const gchar *p, |
|
184 gssize max_len); |
|
185 |
|
186 IMPORT_C gchar* g_utf8_offset_to_pointer (const gchar *str, |
|
187 glong offset); |
|
188 IMPORT_C glong g_utf8_pointer_to_offset (const gchar *str, |
|
189 const gchar *pos); |
|
190 IMPORT_C gchar* g_utf8_prev_char (const gchar *p); |
|
191 IMPORT_C gchar* g_utf8_find_next_char (const gchar *p, |
|
192 const gchar *end); |
|
193 IMPORT_C gchar* g_utf8_find_prev_char (const gchar *str, |
|
194 const gchar *p); |
|
195 |
|
196 IMPORT_C glong g_utf8_strlen (const gchar *p, |
|
197 gssize max); |
|
198 |
|
199 /* Copies n characters from src to dest */ |
|
200 IMPORT_C gchar* g_utf8_strncpy (gchar *dest, |
|
201 const gchar *src, |
|
202 gsize n); |
|
203 |
|
204 /* Find the UTF-8 character corresponding to ch, in string p. These |
|
205 functions are equivalants to strchr and strrchr */ |
|
206 IMPORT_C gchar* g_utf8_strchr (const gchar *p, |
|
207 gssize len, |
|
208 gunichar c); |
|
209 IMPORT_C gchar* g_utf8_strrchr (const gchar *p, |
|
210 gssize len, |
|
211 gunichar c); |
|
212 IMPORT_C gchar* g_utf8_strreverse (const gchar *str, |
|
213 gssize len); |
|
214 |
|
215 IMPORT_C gunichar2 *g_utf8_to_utf16 (const gchar *str, |
|
216 glong len, |
|
217 glong *items_read, |
|
218 glong *items_written, |
|
219 GError **error) G_GNUC_MALLOC; |
|
220 IMPORT_C gunichar * g_utf8_to_ucs4 (const gchar *str, |
|
221 glong len, |
|
222 glong *items_read, |
|
223 glong *items_written, |
|
224 GError **error) G_GNUC_MALLOC; |
|
225 IMPORT_C gunichar * g_utf8_to_ucs4_fast (const gchar *str, |
|
226 glong len, |
|
227 glong *items_written) G_GNUC_MALLOC; |
|
228 IMPORT_C gunichar * g_utf16_to_ucs4 (const gunichar2 *str, |
|
229 glong len, |
|
230 glong *items_read, |
|
231 glong *items_written, |
|
232 GError **error) G_GNUC_MALLOC; |
|
233 IMPORT_C gchar* g_utf16_to_utf8 (const gunichar2 *str, |
|
234 glong len, |
|
235 glong *items_read, |
|
236 glong *items_written, |
|
237 GError **error) G_GNUC_MALLOC; |
|
238 IMPORT_C gunichar2 *g_ucs4_to_utf16 (const gunichar *str, |
|
239 glong len, |
|
240 glong *items_read, |
|
241 glong *items_written, |
|
242 GError **error) G_GNUC_MALLOC; |
|
243 IMPORT_C gchar* g_ucs4_to_utf8 (const gunichar *str, |
|
244 glong len, |
|
245 glong *items_read, |
|
246 glong *items_written, |
|
247 GError **error) G_GNUC_MALLOC; |
|
248 |
|
249 /* Convert a single character into UTF-8. outbuf must have at |
|
250 * least 6 bytes of space. Returns the number of bytes in the |
|
251 * result. |
|
252 */ |
|
253 IMPORT_C gint g_unichar_to_utf8 (gunichar c, |
|
254 gchar *outbuf); |
|
255 |
|
256 /* Validate a UTF8 string, return TRUE if valid, put pointer to |
|
257 * first invalid char in **end |
|
258 */ |
|
259 |
|
260 IMPORT_C gboolean g_utf8_validate (const gchar *str, |
|
261 gssize max_len, |
|
262 const gchar **end); |
|
263 |
|
264 /* Validate a Unicode character */ |
|
265 IMPORT_C gboolean g_unichar_validate (gunichar ch); |
|
266 |
|
267 IMPORT_C gchar *g_utf8_strup (const gchar *str, |
|
268 gssize len) G_GNUC_MALLOC; |
|
269 IMPORT_C gchar *g_utf8_strdown (const gchar *str, |
|
270 gssize len) G_GNUC_MALLOC; |
|
271 IMPORT_C gchar *g_utf8_casefold (const gchar *str, |
|
272 gssize len) G_GNUC_MALLOC; |
|
273 |
|
/* Normalization modes accepted by g_utf8_normalize().  There are four
 * distinct values (0..3); each G_NORMALIZE_NF* name is an alias for the
 * enumerator declared immediately before it.
 */
typedef enum {
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
|
284 |
|
285 IMPORT_C gchar *g_utf8_normalize (const gchar *str, |
|
286 gssize len, |
|
287 GNormalizeMode mode) G_GNUC_MALLOC; |
|
288 |
|
289 IMPORT_C gint g_utf8_collate (const gchar *str1, |
|
290 const gchar *str2); |
|
291 IMPORT_C gchar *g_utf8_collate_key (const gchar *str, |
|
292 gssize len) G_GNUC_MALLOC; |
|
293 IMPORT_C gchar *g_utf8_collate_key_for_filename (const gchar *str, |
|
294 gssize len) G_GNUC_MALLOC; |
|
295 |
|
296 IMPORT_C gboolean g_unichar_get_mirror_char (gunichar ch, |
|
297 gunichar *mirrored_ch); |
|
298 |
|
299 G_END_DECLS |
|
300 |
|
301 #endif /* __G_UNICODE_H__ */ |