|
1 /* gutf8.c - Operations on UTF-8 strings. |
|
2 * |
|
3 * Copyright (C) 1999 Tom Tromey |
|
4 * Copyright (C) 2000 Red Hat, Inc. |
|
5 * Portions copyright (c) 2006 Nokia Corporation. All rights reserved. |
|
6 * |
|
7 * This library is free software; you can redistribute it and/or |
|
8 * modify it under the terms of the GNU Lesser General Public |
|
9 * License as published by the Free Software Foundation; either |
|
10 * version 2 of the License, or (at your option) any later version. |
|
11 * |
|
12 * This library is distributed in the hope that it will be useful, |
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 * Lesser General Public License for more details. |
|
16 * |
|
17 * You should have received a copy of the GNU Lesser General Public |
|
18 * License along with this library; if not, write to the |
|
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
|
20 * Boston, MA 02111-1307, USA. |
|
21 */ |
|
22 |
|
23 #include <config.h> |
|
24 |
|
25 #include <stdlib.h> |
|
26 #ifdef HAVE_CODESET |
|
27 #include <langinfo.h> |
|
28 #endif |
|
29 #include <string.h> |
|
30 |
|
31 #include "glib.h" |
|
32 #include "galias.h" |
|
33 |
|
34 #ifdef G_PLATFORM_WIN32 |
|
35 #include <stdio.h> |
|
36 #define STRICT |
|
37 #include <windows.h> |
|
38 #undef STRICT |
|
39 #endif |
|
40 |
|
41 #include "libcharset/libcharset.h" |
|
42 |
|
43 #include "glibintl.h" |
|
44 |
|
45 #ifdef __SYMBIAN32__ |
|
46 #include <glib_wsd.h> |
|
47 #include "glibbackend.h" |
|
48 #endif /* __SYMBIAN32__ */ |
|
49 |
|
50 #if EMULATOR |
|
51 #define g_thread_functions_for_glib_use (*_g_thread_functions_for_glib_use()) |
|
52 #define g_thread_use_default_impl (*_g_thread_use_default_impl()) |
|
53 #endif /* EMULATOR */ |
|
54 |
|
/* UTF8_COMPUTE:
 * Classifies Char, the first byte of a UTF-8 sequence.  Len receives the
 * total length of the sequence in bytes (1 to 6) and Mask receives the bit
 * mask that selects the payload bits of that first byte.  When Char cannot
 * start a sequence (a continuation byte 0x80-0xbf, or 0xfe/0xff), Len is
 * set to -1 and Mask is left untouched.
 *
 * Char is evaluated several times, so it must be free of side effects.
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement and can safely be used as the body of an if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
|
88 |
|
/* UTF8_LENGTH:
 * Number of bytes (1 to 6) needed to encode the code point Char in UTF-8.
 * Char is evaluated more than once.
 */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80      ? 1 : \
   (Char) < 0x800     ? 2 : \
   (Char) < 0x10000   ? 3 : \
   (Char) < 0x200000  ? 4 : \
   (Char) < 0x4000000 ? 5 : 6)
|
95 |
|
96 |
|
/* UTF8_GET:
 * Decodes the Len-byte UTF-8 sequence starting at Chars into Result.
 * Mask selects the payload bits of the first byte (see UTF8_COMPUTE) and
 * Count is a scratch integer lvalue used as the byte index.  If any of the
 * trailing bytes is not a continuation byte (10xxxxxx), Result is set to
 * -1 and decoding stops.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement; the inner break only leaves the for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
|
109 |
|
/* UNICODE_VALID:
 * Non-zero when Char is accepted as a Unicode character: it must be below
 * 0x110000, must not lie in the surrogate block 0xD800-0xDFFF, must not
 * lie in 0xFDD0-0xFDEF, and its low 16 bits must be neither 0xFFFE nor
 * 0xFFFF.  Char is evaluated more than once.
 */
#define UNICODE_VALID(Char) \
  (((Char) < 0x110000) && \
   (((Char) & 0xFFFFF800) != 0xD800) && \
   (((Char) < 0xFDD0) || ((Char) > 0xFDEF)) && \
   (((Char) & 0xFFFE) != 0xFFFE))
|
115 |
|
116 |
|
117 static const gchar utf8_skip_data[256] = { |
|
118 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
119 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
120 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
121 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
122 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
123 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
124 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
|
125 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 |
|
126 }; |
|
127 |
|
128 const gchar * const g_utf8_skip = utf8_skip_data; |
|
129 |
|
130 #ifdef __SYMBIAN32__ |
|
131 EXPORT_C const gchar * const * _g_utf8_skip() |
|
132 { |
|
133 return &g_utf8_skip; |
|
134 } |
|
135 #endif /* __SYMBIAN32__ */ |
|
136 |
|
137 /** |
|
138 * g_utf8_find_prev_char: |
|
139 * @str: pointer to the beginning of a UTF-8 encoded string |
|
140 * @p: pointer to some position within @str |
|
141 * |
|
142 * Given a position @p with a UTF-8 encoded string @str, find the start |
|
143 * of the previous UTF-8 character starting before @p. Returns %NULL if no |
|
144 * UTF-8 characters are present in @str before @p. |
|
145 * |
|
146 * @p does not have to be at the beginning of a UTF-8 character. No check |
|
147 * is made to see if the character found is actually valid other than |
|
148 * it starts with an appropriate byte. |
|
149 * |
|
150 * Return value: a pointer to the found character or %NULL. |
|
151 **/ |
|
152 EXPORT_C gchar * |
|
153 g_utf8_find_prev_char (const char *str, |
|
154 const char *p) |
|
155 { |
|
156 for (--p; p >= str; --p) |
|
157 { |
|
158 if ((*p & 0xc0) != 0x80) |
|
159 return (gchar *)p; |
|
160 } |
|
161 return NULL; |
|
162 } |
|
163 |
|
164 /** |
|
165 * g_utf8_find_next_char: |
|
166 * @p: a pointer to a position within a UTF-8 encoded string |
|
167 * @end: a pointer to the end of the string, or %NULL to indicate |
|
168 * that the string is nul-terminated, in which case |
|
169 * the returned value will be |
|
170 * |
|
171 * Finds the start of the next UTF-8 character in the string after @p. |
|
172 * |
|
173 * @p does not have to be at the beginning of a UTF-8 character. No check |
|
174 * is made to see if the character found is actually valid other than |
|
175 * it starts with an appropriate byte. |
|
176 * |
|
177 * Return value: a pointer to the found character or %NULL |
|
178 **/ |
|
179 EXPORT_C gchar * |
|
180 g_utf8_find_next_char (const gchar *p, |
|
181 const gchar *end) |
|
182 { |
|
183 if (*p) |
|
184 { |
|
185 if (end) |
|
186 for (++p; p < end && (*p & 0xc0) == 0x80; ++p) |
|
187 ; |
|
188 else |
|
189 for (++p; (*p & 0xc0) == 0x80; ++p) |
|
190 ; |
|
191 } |
|
192 return (p == end) ? NULL : (gchar *)p; |
|
193 } |
|
194 |
|
195 /** |
|
196 * g_utf8_prev_char: |
|
197 * @p: a pointer to a position within a UTF-8 encoded string |
|
198 * |
|
199 * Finds the previous UTF-8 character in the string before @p. |
|
200 * |
|
201 * @p does not have to be at the beginning of a UTF-8 character. No check |
|
202 * is made to see if the character found is actually valid other than |
|
203 * it starts with an appropriate byte. If @p might be the first |
|
204 * character of the string, you must use g_utf8_find_prev_char() instead. |
|
205 * |
|
206 * Return value: a pointer to the found character. |
|
207 **/ |
|
208 EXPORT_C gchar * |
|
209 g_utf8_prev_char (const gchar *p) |
|
210 { |
|
211 while (TRUE) |
|
212 { |
|
213 p--; |
|
214 if ((*p & 0xc0) != 0x80) |
|
215 return (gchar *)p; |
|
216 } |
|
217 } |
|
218 |
|
219 /** |
|
220 * g_utf8_strlen: |
|
221 * @p: pointer to the start of a UTF-8 encoded string. |
|
222 * @max: the maximum number of bytes to examine. If @max |
|
223 * is less than 0, then the string is assumed to be |
|
224 * nul-terminated. If @max is 0, @p will not be examined and |
|
225 * may be %NULL. |
|
226 * |
|
227 * Returns the length of the string in characters. |
|
228 * |
|
229 * Return value: the length of the string in characters |
|
230 **/ |
|
231 EXPORT_C glong |
|
232 g_utf8_strlen (const gchar *p, |
|
233 gssize max) |
|
234 { |
|
235 glong len = 0; |
|
236 const gchar *start = p; |
|
237 g_return_val_if_fail (p != NULL || max == 0, 0); |
|
238 |
|
239 if (max < 0) |
|
240 { |
|
241 while (*p) |
|
242 { |
|
243 p = g_utf8_next_char (p); |
|
244 ++len; |
|
245 } |
|
246 } |
|
247 else |
|
248 { |
|
249 if (max == 0 || !*p) |
|
250 return 0; |
|
251 |
|
252 p = g_utf8_next_char (p); |
|
253 |
|
254 while (p - start < max && *p) |
|
255 { |
|
256 ++len; |
|
257 p = g_utf8_next_char (p); |
|
258 } |
|
259 |
|
260 /* only do the last len increment if we got a complete |
|
261 * char (don't count partial chars) |
|
262 */ |
|
263 if (p - start <= max) |
|
264 ++len; |
|
265 } |
|
266 |
|
267 return len; |
|
268 } |
|
269 |
|
270 /** |
|
271 * g_utf8_get_char: |
|
272 * @p: a pointer to Unicode character encoded as UTF-8 |
|
273 * |
|
274 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. |
|
275 * If @p does not point to a valid UTF-8 encoded character, results are |
|
276 * undefined. If you are not sure that the bytes are complete |
|
277 * valid Unicode characters, you should use g_utf8_get_char_validated() |
|
278 * instead. |
|
279 * |
|
280 * Return value: the resulting character |
|
281 **/ |
|
282 EXPORT_C gunichar |
|
283 g_utf8_get_char (const gchar *p) |
|
284 { |
|
285 int i, mask = 0, len; |
|
286 gunichar result; |
|
287 unsigned char c = (unsigned char) *p; |
|
288 |
|
289 UTF8_COMPUTE (c, mask, len); |
|
290 if (len == -1) |
|
291 return (gunichar)-1; |
|
292 UTF8_GET (result, p, i, mask, len); |
|
293 |
|
294 return result; |
|
295 } |
|
296 |
|
297 /** |
|
298 * g_utf8_offset_to_pointer: |
|
299 * @str: a UTF-8 encoded string |
|
300 * @offset: a character offset within @str |
|
301 * |
|
302 * Converts from an integer character offset to a pointer to a position |
|
303 * within the string. |
|
304 * |
|
305 * Since 2.10, this function allows to pass a negative @offset to |
|
306 * step backwards. It is usually worth stepping backwards from the end |
|
307 * instead of forwards if @offset is in the last fourth of the string, |
|
308 * since moving forward is about 3 times faster than moving backward. |
|
309 * |
|
310 * Return value: the resulting pointer |
|
311 **/ |
|
312 EXPORT_C gchar * |
|
313 g_utf8_offset_to_pointer (const gchar *str, |
|
314 glong offset) |
|
315 { |
|
316 const gchar *s = str; |
|
317 |
|
318 if (offset > 0) |
|
319 while (offset--) |
|
320 s = g_utf8_next_char (s); |
|
321 else |
|
322 { |
|
323 const char *s1; |
|
324 |
|
325 /* This nice technique for fast backwards stepping |
|
326 * through a UTF-8 string was dubbed "stutter stepping" |
|
327 * by its inventor, Larry Ewing. |
|
328 */ |
|
329 while (offset) |
|
330 { |
|
331 s1 = s; |
|
332 s += offset; |
|
333 while ((*s & 0xc0) == 0x80) |
|
334 s--; |
|
335 |
|
336 offset += g_utf8_pointer_to_offset (s, s1); |
|
337 } |
|
338 } |
|
339 |
|
340 return (gchar *)s; |
|
341 } |
|
342 |
|
343 /** |
|
344 * g_utf8_pointer_to_offset: |
|
345 * @str: a UTF-8 encoded string |
|
346 * @pos: a pointer to a position within @str |
|
347 * |
|
348 * Converts from a pointer to position within a string to a integer |
|
349 * character offset. |
|
350 * |
|
351 * Since 2.10, this function allows @pos to be before @str, and returns |
|
352 * a negative offset in this case. |
|
353 * |
|
354 * Return value: the resulting character offset |
|
355 **/ |
|
356 EXPORT_C glong |
|
357 g_utf8_pointer_to_offset (const gchar *str, |
|
358 const gchar *pos) |
|
359 { |
|
360 const gchar *s = str; |
|
361 glong offset = 0; |
|
362 |
|
363 if (pos < str) |
|
364 offset = - g_utf8_pointer_to_offset (pos, str); |
|
365 else |
|
366 while (s < pos) |
|
367 { |
|
368 s = g_utf8_next_char (s); |
|
369 offset++; |
|
370 } |
|
371 |
|
372 return offset; |
|
373 } |
|
374 |
|
375 |
|
376 /** |
|
377 * g_utf8_strncpy: |
|
378 * @dest: buffer to fill with characters from @src |
|
379 * @src: UTF-8 encoded string |
|
380 * @n: character count |
|
381 * |
|
382 * Like the standard C strncpy() function, but |
|
383 * copies a given number of characters instead of a given number of |
|
384 * bytes. The @src string must be valid UTF-8 encoded text. |
|
385 * (Use g_utf8_validate() on all text before trying to use UTF-8 |
|
386 * utility functions with it.) |
|
387 * |
|
388 * Return value: @dest |
|
389 **/ |
|
390 EXPORT_C gchar * |
|
391 g_utf8_strncpy (gchar *dest, |
|
392 const gchar *src, |
|
393 gsize n) |
|
394 { |
|
395 const gchar *s = src; |
|
396 while (n && *s) |
|
397 { |
|
398 s = g_utf8_next_char(s); |
|
399 n--; |
|
400 } |
|
401 strncpy(dest, src, s - src); |
|
402 dest[s - src] = 0; |
|
403 return dest; |
|
404 } |
|
405 |
|
406 #if EMULATOR |
|
407 |
|
408 PLS_MACRO(aliases,gutf8,GStaticMutex) |
|
409 #define g__aliases_lock (*FUNCTION_NAME_MACRO(aliases,gutf8)()) |
|
410 |
|
411 PLS(alias_hash,get_alias_hash,GHashTable *) |
|
412 #define alias_hash (*FUNCTION_NAME(alias_hash,get_alias_hash)()) |
|
413 |
|
414 #else |
|
415 |
|
416 G_LOCK_DEFINE_STATIC (aliases); |
|
417 |
|
418 #endif /* EMULATOR */ |
|
419 |
|
420 static GHashTable * |
|
421 get_alias_hash (void) |
|
422 { |
|
423 #if !(EMULATOR) |
|
424 static GHashTable *alias_hash = NULL; |
|
425 #endif /* EMULATOR */ |
|
426 const char *aliases; |
|
427 |
|
428 G_LOCK (aliases); |
|
429 |
|
430 if (!alias_hash) |
|
431 { |
|
432 alias_hash = g_hash_table_new (g_str_hash, g_str_equal); |
|
433 |
|
434 aliases = _g_locale_get_charset_aliases (); |
|
435 while (*aliases != '\0') |
|
436 { |
|
437 const char *canonical; |
|
438 const char *alias; |
|
439 const char **alias_array; |
|
440 int count = 0; |
|
441 |
|
442 alias = aliases; |
|
443 aliases += strlen (aliases) + 1; |
|
444 canonical = aliases; |
|
445 aliases += strlen (aliases) + 1; |
|
446 |
|
447 alias_array = g_hash_table_lookup (alias_hash, canonical); |
|
448 if (alias_array) |
|
449 { |
|
450 while (alias_array[count]) |
|
451 count++; |
|
452 } |
|
453 alias_array = g_renew (const char *, alias_array, count + 2); |
|
454 alias_array[count] = alias; |
|
455 alias_array[count + 1] = NULL; |
|
456 |
|
457 g_hash_table_insert (alias_hash, (char *)canonical, alias_array); |
|
458 } |
|
459 } |
|
460 |
|
461 G_UNLOCK (aliases); |
|
462 |
|
463 return alias_hash; |
|
464 } |
|
465 |
|
466 #if EMULATOR |
|
467 #undef alias_hash |
|
468 #endif /* EMULATOR */ |
|
469 |
|
470 /* As an abuse of the alias table, the following routines gets |
|
471 * the charsets that are aliases for the canonical name. |
|
472 */ |
|
473 const char ** G_GNUC_INTERNAL |
|
474 _g_charset_get_aliases (const char *canonical_name) |
|
475 { |
|
476 GHashTable *alias_hash = get_alias_hash (); |
|
477 |
|
478 return g_hash_table_lookup (alias_hash, canonical_name); |
|
479 } |
|
480 |
|
481 static gboolean |
|
482 g_utf8_get_charset_internal (const char *raw_data, |
|
483 const char **a) |
|
484 { |
|
485 const char *charset = getenv("CHARSET"); |
|
486 |
|
487 if (charset && *charset) |
|
488 { |
|
489 *a = charset; |
|
490 |
|
491 if (charset && strstr (charset, "UTF-8")) |
|
492 return TRUE; |
|
493 else |
|
494 return FALSE; |
|
495 } |
|
496 |
|
497 /* The libcharset code tries to be thread-safe without |
|
498 * a lock, but has a memory leak and a missing memory |
|
499 * barrier, so we lock for it |
|
500 */ |
|
501 G_LOCK (aliases); |
|
502 charset = _g_locale_charset_unalias (raw_data); |
|
503 G_UNLOCK (aliases); |
|
504 |
|
505 if (charset && *charset) |
|
506 { |
|
507 *a = charset; |
|
508 |
|
509 if (charset && strstr (charset, "UTF-8")) |
|
510 return TRUE; |
|
511 else |
|
512 return FALSE; |
|
513 } |
|
514 |
|
515 /* Assume this for compatibility at present. */ |
|
516 *a = "US-ASCII"; |
|
517 |
|
518 return FALSE; |
|
519 } |
|
520 |
|
521 typedef struct _GCharsetCache GCharsetCache; |
|
522 |
|
523 struct _GCharsetCache { |
|
524 gboolean is_utf8; |
|
525 gchar *raw; |
|
526 gchar *charset; |
|
527 }; |
|
528 |
|
529 static void |
|
530 charset_cache_free (gpointer data) |
|
531 { |
|
532 GCharsetCache *cache = data; |
|
533 #ifndef __SYMBIAN32__ |
|
534 g_free (cache->raw); |
|
535 g_free (cache->charset); |
|
536 g_free (cache); |
|
537 #else |
|
538 pFree(cache->raw); |
|
539 pFree(cache->charset); |
|
540 pFree(cache); |
|
541 #endif |
|
542 } |
|
543 |
|
544 /** |
|
545 * g_get_charset: |
|
546 * @charset: return location for character set name |
|
547 * |
|
548 * Obtains the character set for the current locale; you might use |
|
549 * this character set as an argument to g_convert(), to convert from |
|
550 * the current locale's encoding to some other encoding. (Frequently |
|
551 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts, |
|
552 * though.) |
|
553 * |
|
554 * The return value is %TRUE if the locale's encoding is UTF-8, in that |
|
555 * case you can perhaps avoid calling g_convert(). |
|
556 * |
|
557 * The string returned in @charset is not allocated, and should not be |
|
558 * freed. |
|
559 * |
|
560 * Return value: %TRUE if the returned charset is UTF-8 |
|
561 **/ |
|
562 |
|
563 #if EMULATOR |
|
564 |
|
565 PLS(cache_private,g_get_charset,GStaticPrivate) |
|
566 #define cache_private (*FUNCTION_NAME(cache_private,g_get_charset)()) |
|
567 |
|
568 #endif /* EMULATOR */ |
|
569 |
|
570 /* |
|
571 #ifdef __SYMBIAN32__ |
|
572 IMPORT_C void *pAlloc(size_t nBytes); |
|
573 #endif //__SYMBIAN32__ |
|
574 */ |
|
575 |
|
576 EXPORT_C gboolean |
|
577 g_get_charset (G_CONST_RETURN char **charset) |
|
578 { |
|
579 #if !(EMULATOR) |
|
580 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; |
|
581 #endif /* EMULATOR */ |
|
582 GCharsetCache *cache = g_static_private_get (&cache_private); |
|
583 const gchar *raw; |
|
584 |
|
585 if (!cache) |
|
586 { |
|
587 #ifndef __SYMBIAN32__ |
|
588 cache = g_new0 (GCharsetCache, 1); |
|
589 #else |
|
590 cache = (GCharsetCache *)pAlloc(sizeof(GCharsetCache)); |
|
591 memset(cache,'\0',sizeof(GCharsetCache)); |
|
592 #endif //__SYMBIAN32__ |
|
593 g_static_private_set (&cache_private, cache, charset_cache_free); |
|
594 } |
|
595 |
|
596 raw = _g_locale_charset_raw (); |
|
597 |
|
598 if (!(cache->raw && strcmp (cache->raw, raw) == 0)) |
|
599 { |
|
600 const gchar *new_charset; |
|
601 |
|
602 #ifndef __SYMBIAN32__ |
|
603 g_free (cache->raw); |
|
604 g_free (cache->charset); |
|
605 cache->raw = g_strdup (raw); |
|
606 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset); |
|
607 cache->charset = g_strdup (new_charset); |
|
608 #else |
|
609 gchar *temp,*temp1; |
|
610 |
|
611 pFree(cache->raw); |
|
612 pFree(cache->charset); |
|
613 |
|
614 temp = (gchar *)pAlloc(strlen(raw) * sizeof(gchar) + 1); |
|
615 strncpy(temp,raw,strlen(raw)); |
|
616 temp[strlen(raw)] = '\0'; |
|
617 |
|
618 //cache->raw = g_strdup (raw); |
|
619 cache->raw = temp; |
|
620 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset); |
|
621 |
|
622 temp1 = (gchar *)pAlloc(strlen(new_charset) * sizeof(gchar) + 1); |
|
623 strncpy(temp1,new_charset,strlen(new_charset)); |
|
624 temp1[strlen(new_charset)] = '\0'; |
|
625 |
|
626 //cache->charset = g_strdup (new_charset); |
|
627 cache->charset = temp1; |
|
628 #endif //__SYMBIAN32__ |
|
629 } |
|
630 |
|
631 if (charset) |
|
632 *charset = cache->charset; |
|
633 |
|
634 return cache->is_utf8; |
|
635 } |
|
636 |
|
637 #if EMULATOR |
|
638 #undef cache_private |
|
639 #endif /* EMULATOR */ |
|
640 |
|
641 /* unicode_strchr */ |
|
642 |
|
643 /** |
|
644 * g_unichar_to_utf8: |
|
645 * @c: a Unicode character code |
|
646 * @outbuf: output buffer, must have at least 6 bytes of space. |
|
647 * If %NULL, the length will be computed and returned |
|
648 * and nothing will be written to @outbuf. |
|
649 * |
|
650 * Converts a single character to UTF-8. |
|
651 * |
|
652 * Return value: number of bytes written |
|
653 **/ |
|
654 EXPORT_C int |
|
655 g_unichar_to_utf8 (gunichar c, |
|
656 gchar *outbuf) |
|
657 { |
|
658 /* If this gets modified, also update the copy in g_string_insert_unichar() */ |
|
659 guint len = 0; |
|
660 int first; |
|
661 int i; |
|
662 |
|
663 if (c < 0x80) |
|
664 { |
|
665 first = 0; |
|
666 len = 1; |
|
667 } |
|
668 else if (c < 0x800) |
|
669 { |
|
670 first = 0xc0; |
|
671 len = 2; |
|
672 } |
|
673 else if (c < 0x10000) |
|
674 { |
|
675 first = 0xe0; |
|
676 len = 3; |
|
677 } |
|
678 else if (c < 0x200000) |
|
679 { |
|
680 first = 0xf0; |
|
681 len = 4; |
|
682 } |
|
683 else if (c < 0x4000000) |
|
684 { |
|
685 first = 0xf8; |
|
686 len = 5; |
|
687 } |
|
688 else |
|
689 { |
|
690 first = 0xfc; |
|
691 len = 6; |
|
692 } |
|
693 |
|
694 if (outbuf) |
|
695 { |
|
696 for (i = len - 1; i > 0; --i) |
|
697 { |
|
698 outbuf[i] = (c & 0x3f) | 0x80; |
|
699 c >>= 6; |
|
700 } |
|
701 outbuf[0] = c | first; |
|
702 } |
|
703 |
|
704 return len; |
|
705 } |
|
706 |
|
707 /** |
|
708 * g_utf8_strchr: |
|
709 * @p: a nul-terminated UTF-8 encoded string |
|
710 * @len: the maximum length of @p |
|
711 * @c: a Unicode character |
|
712 * |
|
713 * Finds the leftmost occurrence of the given Unicode character |
|
714 * in a UTF-8 encoded string, while limiting the search to @len bytes. |
|
715 * If @len is -1, allow unbounded search. |
|
716 * |
|
717 * Return value: %NULL if the string does not contain the character, |
|
718 * otherwise, a pointer to the start of the leftmost occurrence of |
|
719 * the character in the string. |
|
720 **/ |
|
721 EXPORT_C gchar * |
|
722 g_utf8_strchr (const char *p, |
|
723 gssize len, |
|
724 gunichar c) |
|
725 { |
|
726 gchar ch[10]; |
|
727 |
|
728 gint charlen = g_unichar_to_utf8 (c, ch); |
|
729 ch[charlen] = '\0'; |
|
730 |
|
731 return g_strstr_len (p, len, ch); |
|
732 } |
|
733 |
|
734 |
|
735 /** |
|
736 * g_utf8_strrchr: |
|
737 * @p: a nul-terminated UTF-8 encoded string |
|
738 * @len: the maximum length of @p |
|
739 * @c: a Unicode character |
|
740 * |
|
741 * Find the rightmost occurrence of the given Unicode character |
|
742 * in a UTF-8 encoded string, while limiting the search to @len bytes. |
|
743 * If @len is -1, allow unbounded search. |
|
744 * |
|
745 * Return value: %NULL if the string does not contain the character, |
|
746 * otherwise, a pointer to the start of the rightmost occurrence of the |
|
747 * character in the string. |
|
748 **/ |
|
749 EXPORT_C gchar * |
|
750 g_utf8_strrchr (const char *p, |
|
751 gssize len, |
|
752 gunichar c) |
|
753 { |
|
754 gchar ch[10]; |
|
755 |
|
756 gint charlen = g_unichar_to_utf8 (c, ch); |
|
757 ch[charlen] = '\0'; |
|
758 |
|
759 return g_strrstr_len (p, len, ch); |
|
760 } |
|
761 |
|
762 |
|
763 /* Like g_utf8_get_char, but take a maximum length |
|
764 * and return (gunichar)-2 on incomplete trailing character |
|
765 */ |
|
766 static inline gunichar |
|
767 g_utf8_get_char_extended (const gchar *p, |
|
768 gssize max_len) |
|
769 { |
|
770 guint i, len; |
|
771 gunichar wc = (guchar) *p; |
|
772 |
|
773 if (wc < 0x80) |
|
774 { |
|
775 return wc; |
|
776 } |
|
777 else if (wc < 0xc0) |
|
778 { |
|
779 return (gunichar)-1; |
|
780 } |
|
781 else if (wc < 0xe0) |
|
782 { |
|
783 len = 2; |
|
784 wc &= 0x1f; |
|
785 } |
|
786 else if (wc < 0xf0) |
|
787 { |
|
788 len = 3; |
|
789 wc &= 0x0f; |
|
790 } |
|
791 else if (wc < 0xf8) |
|
792 { |
|
793 len = 4; |
|
794 wc &= 0x07; |
|
795 } |
|
796 else if (wc < 0xfc) |
|
797 { |
|
798 len = 5; |
|
799 wc &= 0x03; |
|
800 } |
|
801 else if (wc < 0xfe) |
|
802 { |
|
803 len = 6; |
|
804 wc &= 0x01; |
|
805 } |
|
806 else |
|
807 { |
|
808 return (gunichar)-1; |
|
809 } |
|
810 |
|
811 if (max_len >= 0 && len > max_len) |
|
812 { |
|
813 for (i = 1; i < max_len; i++) |
|
814 { |
|
815 if ((((guchar *)p)[i] & 0xc0) != 0x80) |
|
816 return (gunichar)-1; |
|
817 } |
|
818 return (gunichar)-2; |
|
819 } |
|
820 |
|
821 for (i = 1; i < len; ++i) |
|
822 { |
|
823 gunichar ch = ((guchar *)p)[i]; |
|
824 |
|
825 if ((ch & 0xc0) != 0x80) |
|
826 { |
|
827 if (ch) |
|
828 return (gunichar)-1; |
|
829 else |
|
830 return (gunichar)-2; |
|
831 } |
|
832 |
|
833 wc <<= 6; |
|
834 wc |= (ch & 0x3f); |
|
835 } |
|
836 |
|
837 if (UTF8_LENGTH(wc) != len) |
|
838 return (gunichar)-1; |
|
839 |
|
840 return wc; |
|
841 } |
|
842 |
|
843 /** |
|
844 * g_utf8_get_char_validated: |
|
845 * @p: a pointer to Unicode character encoded as UTF-8 |
|
846 * @max_len: the maximum number of bytes to read, or -1, for no maximum. |
|
847 * |
|
848 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character. |
|
849 * This function checks for incomplete characters, for invalid characters |
|
850 * such as characters that are out of the range of Unicode, and for |
|
851 * overlong encodings of valid characters. |
|
852 * |
|
853 * Return value: the resulting character. If @p points to a partial |
|
854 * sequence at the end of a string that could begin a valid |
|
855 * character, returns (gunichar)-2; otherwise, if @p does not point |
|
856 * to a valid UTF-8 encoded Unicode character, returns (gunichar)-1. |
|
857 **/ |
|
858 EXPORT_C gunichar |
|
859 g_utf8_get_char_validated (const gchar *p, |
|
860 gssize max_len) |
|
861 { |
|
862 gunichar result = g_utf8_get_char_extended (p, max_len); |
|
863 |
|
864 if (result & 0x80000000) |
|
865 return result; |
|
866 else if (!UNICODE_VALID (result)) |
|
867 return (gunichar)-1; |
|
868 else |
|
869 return result; |
|
870 } |
|
871 |
|
872 /** |
|
873 * g_utf8_to_ucs4_fast: |
|
874 * @str: a UTF-8 encoded string |
|
875 * @len: the maximum length of @str to use. If @len < 0, then |
|
876 * the string is nul-terminated. |
|
877 * @items_written: location to store the number of characters in the |
|
878 * result, or %NULL. |
|
879 * |
|
880 * Convert a string from UTF-8 to a 32-bit fixed width |
|
881 * representation as UCS-4, assuming valid UTF-8 input. |
|
882 * This function is roughly twice as fast as g_utf8_to_ucs4() |
|
883 * but does no error checking on the input. |
|
884 * |
|
885 * Return value: a pointer to a newly allocated UCS-4 string. |
|
886 * This value must be freed with g_free(). |
|
887 **/ |
|
888 EXPORT_C gunichar * |
|
889 g_utf8_to_ucs4_fast (const gchar *str, |
|
890 glong len, |
|
891 glong *items_written) |
|
892 { |
|
893 gint j, charlen; |
|
894 gunichar *result; |
|
895 gint n_chars, i; |
|
896 const gchar *p; |
|
897 |
|
898 g_return_val_if_fail (str != NULL, NULL); |
|
899 |
|
900 p = str; |
|
901 n_chars = 0; |
|
902 if (len < 0) |
|
903 { |
|
904 while (*p) |
|
905 { |
|
906 p = g_utf8_next_char (p); |
|
907 ++n_chars; |
|
908 } |
|
909 } |
|
910 else |
|
911 { |
|
912 while (p < str + len && *p) |
|
913 { |
|
914 p = g_utf8_next_char (p); |
|
915 ++n_chars; |
|
916 } |
|
917 } |
|
918 result = g_new (gunichar, n_chars + 1); |
|
919 |
|
920 p = str; |
|
921 for (i=0; i < n_chars; i++) |
|
922 { |
|
923 gunichar wc = ((unsigned char *)p)[0]; |
|
924 |
|
925 if (wc < 0x80) |
|
926 { |
|
927 result[i] = wc; |
|
928 p++; |
|
929 } |
|
930 else |
|
931 { |
|
932 if (wc < 0xe0) |
|
933 { |
|
934 charlen = 2; |
|
935 wc &= 0x1f; |
|
936 } |
|
937 else if (wc < 0xf0) |
|
938 { |
|
939 charlen = 3; |
|
940 wc &= 0x0f; |
|
941 } |
|
942 else if (wc < 0xf8) |
|
943 { |
|
944 charlen = 4; |
|
945 wc &= 0x07; |
|
946 } |
|
947 else if (wc < 0xfc) |
|
948 { |
|
949 charlen = 5; |
|
950 wc &= 0x03; |
|
951 } |
|
952 else |
|
953 { |
|
954 charlen = 6; |
|
955 wc &= 0x01; |
|
956 } |
|
957 |
|
958 for (j = 1; j < charlen; j++) |
|
959 { |
|
960 wc <<= 6; |
|
961 wc |= ((unsigned char *)p)[j] & 0x3f; |
|
962 } |
|
963 |
|
964 result[i] = wc; |
|
965 p += charlen; |
|
966 } |
|
967 } |
|
968 result[i] = 0; |
|
969 |
|
970 if (items_written) |
|
971 *items_written = i; |
|
972 |
|
973 return result; |
|
974 } |
|
975 |
|
976 /** |
|
977 * g_utf8_to_ucs4: |
|
978 * @str: a UTF-8 encoded string |
|
979 * @len: the maximum length of @str to use. If @len < 0, then |
|
980 * the string is nul-terminated. |
|
981 * @items_read: location to store number of bytes read, or %NULL. |
|
982 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be |
|
983 * returned in case @str contains a trailing partial |
|
984 * character. If an error occurs then the index of the |
|
985 * invalid input is stored here. |
|
986 * @items_written: location to store number of characters written or %NULL. |
|
987 * The value here stored does not include the trailing 0 |
|
988 * character. |
|
989 * @error: location to store the error occuring, or %NULL to ignore |
|
990 * errors. Any of the errors in #GConvertError other than |
|
991 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
992 * |
|
993 * Convert a string from UTF-8 to a 32-bit fixed width |
|
994 * representation as UCS-4. A trailing 0 will be added to the |
|
995 * string after the converted text. |
|
996 * |
|
997 * Return value: a pointer to a newly allocated UCS-4 string. |
|
998 * This value must be freed with g_free(). If an |
|
999 * error occurs, %NULL will be returned and |
|
1000 * @error set. |
|
1001 **/ |
|
1002 EXPORT_C gunichar * |
|
1003 g_utf8_to_ucs4 (const gchar *str, |
|
1004 glong len, |
|
1005 glong *items_read, |
|
1006 glong *items_written, |
|
1007 GError **error) |
|
1008 { |
|
1009 gunichar *result = NULL; |
|
1010 gint n_chars, i; |
|
1011 const gchar *in; |
|
1012 |
|
1013 in = str; |
|
1014 n_chars = 0; |
|
1015 while ((len < 0 || str + len - in > 0) && *in) |
|
1016 { |
|
1017 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); |
|
1018 if (wc & 0x80000000) |
|
1019 { |
|
1020 if (wc == (gunichar)-2) |
|
1021 { |
|
1022 if (items_read) |
|
1023 break; |
|
1024 else |
|
1025 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
|
1026 _("Partial character sequence at end of input")); |
|
1027 } |
|
1028 else |
|
1029 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1030 _("Invalid byte sequence in conversion input")); |
|
1031 |
|
1032 goto err_out; |
|
1033 } |
|
1034 |
|
1035 n_chars++; |
|
1036 |
|
1037 in = g_utf8_next_char (in); |
|
1038 } |
|
1039 result = g_new (gunichar, n_chars + 1); |
|
1040 |
|
1041 in = str; |
|
1042 for (i=0; i < n_chars; i++) |
|
1043 { |
|
1044 result[i] = g_utf8_get_char (in); |
|
1045 in = g_utf8_next_char (in); |
|
1046 } |
|
1047 result[i] = 0; |
|
1048 |
|
1049 if (items_written) |
|
1050 *items_written = n_chars; |
|
1051 |
|
1052 err_out: |
|
1053 if (items_read) |
|
1054 *items_read = in - str; |
|
1055 |
|
1056 return result; |
|
1057 } |
|
1058 |
|
1059 /** |
|
1060 * g_ucs4_to_utf8: |
|
1061 * @str: a UCS-4 encoded string |
|
1062 * @len: the maximum length (number of characters) of @str to use. |
|
1063 * If @len < 0, then the string is terminated with a 0 character. |
|
1064 * @items_read: location to store number of characters read, or %NULL. |
|
1065 * @items_written: location to store number of bytes written or %NULL. |
|
1066 * The value here stored does not include the trailing 0 |
|
1067 * byte. |
|
1068 * @error: location to store the error occuring, or %NULL to ignore |
|
1069 * errors. Any of the errors in #GConvertError other than |
|
1070 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
1071 * |
|
1072 * Convert a string from a 32-bit fixed width representation as UCS-4. |
|
1073 * to UTF-8. The result will be terminated with a 0 byte. |
|
1074 * |
|
1075 * Return value: a pointer to a newly allocated UTF-8 string. |
|
1076 * This value must be freed with g_free(). If an |
|
1077 * error occurs, %NULL will be returned and |
|
1078 * @error set. In that case, @items_read will be |
|
1079 * set to the position of the first invalid input |
|
1080 * character. |
|
1081 **/ |
|
1082 EXPORT_C gchar * |
|
1083 g_ucs4_to_utf8 (const gunichar *str, |
|
1084 glong len, |
|
1085 glong *items_read, |
|
1086 glong *items_written, |
|
1087 GError **error) |
|
1088 { |
|
1089 gint result_length; |
|
1090 gchar *result = NULL; |
|
1091 gchar *p; |
|
1092 gint i; |
|
1093 |
|
1094 result_length = 0; |
|
1095 for (i = 0; len < 0 || i < len ; i++) |
|
1096 { |
|
1097 if (!str[i]) |
|
1098 break; |
|
1099 |
|
1100 if (str[i] >= 0x80000000) |
|
1101 { |
|
1102 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1103 _("Character out of range for UTF-8")); |
|
1104 goto err_out; |
|
1105 } |
|
1106 |
|
1107 result_length += UTF8_LENGTH (str[i]); |
|
1108 } |
|
1109 result = g_malloc (result_length + 1); |
|
1110 p = result; |
|
1111 |
|
1112 i = 0; |
|
1113 while (p < result + result_length) |
|
1114 p += g_unichar_to_utf8 (str[i++], p); |
|
1115 |
|
1116 *p = '\0'; |
|
1117 |
|
1118 if (items_written) |
|
1119 *items_written = p - result; |
|
1120 |
|
1121 err_out: |
|
1122 if (items_read) |
|
1123 *items_read = i; |
|
1124 |
|
1125 return result; |
|
1126 } |
|
1127 |
|
/* Combine a UTF-16 high surrogate h (0xd800..0xdbff) and low surrogate
 * l (0xdc00..0xdfff) into the code point they encode (0x10000 and up).
 */
#define SURROGATE_VALUE(h,l) (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))
|
1129 |
|
1130 /** |
|
1131 * g_utf16_to_utf8: |
|
1132 * @str: a UTF-16 encoded string |
|
1133 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. |
|
1134 * If @len < 0, then the string is terminated with a 0 character. |
|
1135 * @items_read: location to store number of words read, or %NULL. |
|
1136 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be |
|
1137 * returned in case @str contains a trailing partial |
|
1138 * character. If an error occurs then the index of the |
|
1139 * invalid input is stored here. |
|
1140 * @items_written: location to store number of bytes written, or %NULL. |
|
1141 * The value stored here does not include the trailing |
|
1142 * 0 byte. |
|
1143 * @error: location to store the error occuring, or %NULL to ignore |
|
1144 * errors. Any of the errors in #GConvertError other than |
|
1145 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
1146 * |
|
1147 * Convert a string from UTF-16 to UTF-8. The result will be |
|
1148 * terminated with a 0 byte. |
|
1149 * |
|
1150 * Note that the input is expected to be already in native endianness, |
|
1151 * an initial byte-order-mark character is not handled specially. |
|
1152 * g_convert() can be used to convert a byte buffer of UTF-16 data of |
|
1153 * ambiguous endianess. |
|
1154 * |
|
1155 * Return value: a pointer to a newly allocated UTF-8 string. |
|
1156 * This value must be freed with g_free(). If an |
|
1157 * error occurs, %NULL will be returned and |
|
1158 * @error set. |
|
1159 **/ |
|
1160 EXPORT_C gchar * |
|
1161 g_utf16_to_utf8 (const gunichar2 *str, |
|
1162 glong len, |
|
1163 glong *items_read, |
|
1164 glong *items_written, |
|
1165 GError **error) |
|
1166 { |
|
1167 /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ |
|
1168 * are marked. |
|
1169 */ |
|
1170 const gunichar2 *in; |
|
1171 gchar *out; |
|
1172 gchar *result = NULL; |
|
1173 gint n_bytes; |
|
1174 gunichar high_surrogate; |
|
1175 |
|
1176 g_return_val_if_fail (str != 0, NULL); |
|
1177 |
|
1178 n_bytes = 0; |
|
1179 in = str; |
|
1180 high_surrogate = 0; |
|
1181 while ((len < 0 || in - str < len) && *in) |
|
1182 { |
|
1183 gunichar2 c = *in; |
|
1184 gunichar wc; |
|
1185 |
|
1186 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
|
1187 { |
|
1188 if (high_surrogate) |
|
1189 { |
|
1190 wc = SURROGATE_VALUE (high_surrogate, c); |
|
1191 high_surrogate = 0; |
|
1192 } |
|
1193 else |
|
1194 { |
|
1195 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1196 _("Invalid sequence in conversion input")); |
|
1197 goto err_out; |
|
1198 } |
|
1199 } |
|
1200 else |
|
1201 { |
|
1202 if (high_surrogate) |
|
1203 { |
|
1204 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1205 _("Invalid sequence in conversion input")); |
|
1206 goto err_out; |
|
1207 } |
|
1208 |
|
1209 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
|
1210 { |
|
1211 high_surrogate = c; |
|
1212 goto next1; |
|
1213 } |
|
1214 else |
|
1215 wc = c; |
|
1216 } |
|
1217 |
|
1218 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1219 n_bytes += UTF8_LENGTH (wc); |
|
1220 |
|
1221 next1: |
|
1222 in++; |
|
1223 } |
|
1224 |
|
1225 if (high_surrogate && !items_read) |
|
1226 { |
|
1227 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
|
1228 _("Partial character sequence at end of input")); |
|
1229 goto err_out; |
|
1230 } |
|
1231 |
|
1232 /* At this point, everything is valid, and we just need to convert |
|
1233 */ |
|
1234 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1235 result = g_malloc (n_bytes + 1); |
|
1236 |
|
1237 high_surrogate = 0; |
|
1238 out = result; |
|
1239 in = str; |
|
1240 while (out < result + n_bytes) |
|
1241 { |
|
1242 gunichar2 c = *in; |
|
1243 gunichar wc; |
|
1244 |
|
1245 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
|
1246 { |
|
1247 wc = SURROGATE_VALUE (high_surrogate, c); |
|
1248 high_surrogate = 0; |
|
1249 } |
|
1250 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
|
1251 { |
|
1252 high_surrogate = c; |
|
1253 goto next2; |
|
1254 } |
|
1255 else |
|
1256 wc = c; |
|
1257 |
|
1258 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1259 out += g_unichar_to_utf8 (wc, out); |
|
1260 |
|
1261 next2: |
|
1262 in++; |
|
1263 } |
|
1264 |
|
1265 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1266 *out = '\0'; |
|
1267 |
|
1268 if (items_written) |
|
1269 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1270 *items_written = out - result; |
|
1271 |
|
1272 err_out: |
|
1273 if (items_read) |
|
1274 *items_read = in - str; |
|
1275 |
|
1276 return result; |
|
1277 } |
|
1278 |
|
1279 /** |
|
1280 * g_utf16_to_ucs4: |
|
1281 * @str: a UTF-16 encoded string |
|
1282 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use. |
|
1283 * If @len < 0, then the string is terminated with a 0 character. |
|
1284 * @items_read: location to store number of words read, or %NULL. |
|
1285 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be |
|
1286 * returned in case @str contains a trailing partial |
|
1287 * character. If an error occurs then the index of the |
|
1288 * invalid input is stored here. |
|
1289 * @items_written: location to store number of characters written, or %NULL. |
|
1290 * The value stored here does not include the trailing |
|
1291 * 0 character. |
|
1292 * @error: location to store the error occuring, or %NULL to ignore |
|
1293 * errors. Any of the errors in #GConvertError other than |
|
1294 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
1295 * |
|
1296 * Convert a string from UTF-16 to UCS-4. The result will be |
|
1297 * terminated with a 0 character. |
|
1298 * |
|
1299 * Return value: a pointer to a newly allocated UCS-4 string. |
|
1300 * This value must be freed with g_free(). If an |
|
1301 * error occurs, %NULL will be returned and |
|
1302 * @error set. |
|
1303 **/ |
|
1304 EXPORT_C gunichar * |
|
1305 g_utf16_to_ucs4 (const gunichar2 *str, |
|
1306 glong len, |
|
1307 glong *items_read, |
|
1308 glong *items_written, |
|
1309 GError **error) |
|
1310 { |
|
1311 const gunichar2 *in; |
|
1312 gchar *out; |
|
1313 gchar *result = NULL; |
|
1314 gint n_bytes; |
|
1315 gunichar high_surrogate; |
|
1316 |
|
1317 g_return_val_if_fail (str != 0, NULL); |
|
1318 |
|
1319 n_bytes = 0; |
|
1320 in = str; |
|
1321 high_surrogate = 0; |
|
1322 while ((len < 0 || in - str < len) && *in) |
|
1323 { |
|
1324 gunichar2 c = *in; |
|
1325 gunichar wc; |
|
1326 |
|
1327 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
|
1328 { |
|
1329 if (high_surrogate) |
|
1330 { |
|
1331 wc = SURROGATE_VALUE (high_surrogate, c); |
|
1332 high_surrogate = 0; |
|
1333 } |
|
1334 else |
|
1335 { |
|
1336 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1337 _("Invalid sequence in conversion input")); |
|
1338 goto err_out; |
|
1339 } |
|
1340 } |
|
1341 else |
|
1342 { |
|
1343 if (high_surrogate) |
|
1344 { |
|
1345 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1346 _("Invalid sequence in conversion input")); |
|
1347 goto err_out; |
|
1348 } |
|
1349 |
|
1350 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
|
1351 { |
|
1352 high_surrogate = c; |
|
1353 goto next1; |
|
1354 } |
|
1355 else |
|
1356 wc = c; |
|
1357 } |
|
1358 |
|
1359 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1360 n_bytes += sizeof (gunichar); |
|
1361 |
|
1362 next1: |
|
1363 in++; |
|
1364 } |
|
1365 |
|
1366 if (high_surrogate && !items_read) |
|
1367 { |
|
1368 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
|
1369 _("Partial character sequence at end of input")); |
|
1370 goto err_out; |
|
1371 } |
|
1372 |
|
1373 /* At this point, everything is valid, and we just need to convert |
|
1374 */ |
|
1375 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1376 result = g_malloc (n_bytes + 4); |
|
1377 |
|
1378 high_surrogate = 0; |
|
1379 out = result; |
|
1380 in = str; |
|
1381 while (out < result + n_bytes) |
|
1382 { |
|
1383 gunichar2 c = *in; |
|
1384 gunichar wc; |
|
1385 |
|
1386 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ |
|
1387 { |
|
1388 wc = SURROGATE_VALUE (high_surrogate, c); |
|
1389 high_surrogate = 0; |
|
1390 } |
|
1391 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ |
|
1392 { |
|
1393 high_surrogate = c; |
|
1394 goto next2; |
|
1395 } |
|
1396 else |
|
1397 wc = c; |
|
1398 |
|
1399 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1400 *(gunichar *)out = wc; |
|
1401 out += sizeof (gunichar); |
|
1402 |
|
1403 next2: |
|
1404 in++; |
|
1405 } |
|
1406 |
|
1407 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1408 *(gunichar *)out = 0; |
|
1409 |
|
1410 if (items_written) |
|
1411 /********** DIFFERENT for UTF8/UCS4 **********/ |
|
1412 *items_written = (out - result) / sizeof (gunichar); |
|
1413 |
|
1414 err_out: |
|
1415 if (items_read) |
|
1416 *items_read = in - str; |
|
1417 |
|
1418 return (gunichar *)result; |
|
1419 } |
|
1420 |
|
1421 /** |
|
1422 * g_utf8_to_utf16: |
|
1423 * @str: a UTF-8 encoded string |
|
1424 * @len: the maximum length (number of characters) of @str to use. |
|
1425 * If @len < 0, then the string is nul-terminated. |
|
1426 * @items_read: location to store number of bytes read, or %NULL. |
|
1427 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be |
|
1428 * returned in case @str contains a trailing partial |
|
1429 * character. If an error occurs then the index of the |
|
1430 * invalid input is stored here. |
|
1431 * @items_written: location to store number of <type>gunichar2</type> written, |
|
1432 * or %NULL. |
|
1433 * The value stored here does not include the trailing 0. |
|
1434 * @error: location to store the error occuring, or %NULL to ignore |
|
1435 * errors. Any of the errors in #GConvertError other than |
|
1436 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
1437 * |
|
1438 * Convert a string from UTF-8 to UTF-16. A 0 character will be |
|
1439 * added to the result after the converted text. |
|
1440 * |
|
1441 * Return value: a pointer to a newly allocated UTF-16 string. |
|
1442 * This value must be freed with g_free(). If an |
|
1443 * error occurs, %NULL will be returned and |
|
1444 * @error set. |
|
1445 **/ |
|
1446 EXPORT_C gunichar2 * |
|
1447 g_utf8_to_utf16 (const gchar *str, |
|
1448 glong len, |
|
1449 glong *items_read, |
|
1450 glong *items_written, |
|
1451 GError **error) |
|
1452 { |
|
1453 gunichar2 *result = NULL; |
|
1454 gint n16; |
|
1455 const gchar *in; |
|
1456 gint i; |
|
1457 |
|
1458 g_return_val_if_fail (str != NULL, NULL); |
|
1459 |
|
1460 in = str; |
|
1461 n16 = 0; |
|
1462 while ((len < 0 || str + len - in > 0) && *in) |
|
1463 { |
|
1464 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in); |
|
1465 if (wc & 0x80000000) |
|
1466 { |
|
1467 if (wc == (gunichar)-2) |
|
1468 { |
|
1469 if (items_read) |
|
1470 break; |
|
1471 else |
|
1472 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
|
1473 _("Partial character sequence at end of input")); |
|
1474 } |
|
1475 else |
|
1476 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1477 _("Invalid byte sequence in conversion input")); |
|
1478 |
|
1479 goto err_out; |
|
1480 } |
|
1481 |
|
1482 if (wc < 0xd800) |
|
1483 n16 += 1; |
|
1484 else if (wc < 0xe000) |
|
1485 { |
|
1486 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1487 _("Invalid sequence in conversion input")); |
|
1488 |
|
1489 goto err_out; |
|
1490 } |
|
1491 else if (wc < 0x10000) |
|
1492 n16 += 1; |
|
1493 else if (wc < 0x110000) |
|
1494 n16 += 2; |
|
1495 else |
|
1496 { |
|
1497 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1498 _("Character out of range for UTF-16")); |
|
1499 |
|
1500 goto err_out; |
|
1501 } |
|
1502 |
|
1503 in = g_utf8_next_char (in); |
|
1504 } |
|
1505 result = g_new (gunichar2, n16 + 1); |
|
1506 |
|
1507 in = str; |
|
1508 for (i = 0; i < n16;) |
|
1509 { |
|
1510 gunichar wc = g_utf8_get_char (in); |
|
1511 |
|
1512 if (wc < 0x10000) |
|
1513 { |
|
1514 result[i++] = wc; |
|
1515 } |
|
1516 else |
|
1517 { |
|
1518 result[i++] = (wc - 0x10000) / 0x400 + 0xd800; |
|
1519 result[i++] = (wc - 0x10000) % 0x400 + 0xdc00; |
|
1520 } |
|
1521 |
|
1522 in = g_utf8_next_char (in); |
|
1523 } |
|
1524 |
|
1525 result[i] = 0; |
|
1526 |
|
1527 if (items_written) |
|
1528 *items_written = n16; |
|
1529 |
|
1530 err_out: |
|
1531 if (items_read) |
|
1532 *items_read = in - str; |
|
1533 |
|
1534 return result; |
|
1535 } |
|
1536 |
|
1537 /** |
|
1538 * g_ucs4_to_utf16: |
|
1539 * @str: a UCS-4 encoded string |
|
1540 * @len: the maximum length (number of characters) of @str to use. |
|
1541 * If @len < 0, then the string is terminated with a 0 character. |
|
1542 * @items_read: location to store number of bytes read, or %NULL. |
|
1543 * If an error occurs then the index of the invalid input |
|
1544 * is stored here. |
|
1545 * @items_written: location to store number of <type>gunichar2</type> |
|
1546 * written, or %NULL. The value stored here does not |
|
1547 * include the trailing 0. |
|
1548 * @error: location to store the error occuring, or %NULL to ignore |
|
1549 * errors. Any of the errors in #GConvertError other than |
|
1550 * %G_CONVERT_ERROR_NO_CONVERSION may occur. |
|
1551 * |
|
1552 * Convert a string from UCS-4 to UTF-16. A 0 character will be |
|
1553 * added to the result after the converted text. |
|
1554 * |
|
1555 * Return value: a pointer to a newly allocated UTF-16 string. |
|
1556 * This value must be freed with g_free(). If an |
|
1557 * error occurs, %NULL will be returned and |
|
1558 * @error set. |
|
1559 **/ |
|
1560 EXPORT_C gunichar2 * |
|
1561 g_ucs4_to_utf16 (const gunichar *str, |
|
1562 glong len, |
|
1563 glong *items_read, |
|
1564 glong *items_written, |
|
1565 GError **error) |
|
1566 { |
|
1567 gunichar2 *result = NULL; |
|
1568 gint n16; |
|
1569 gint i, j; |
|
1570 |
|
1571 n16 = 0; |
|
1572 i = 0; |
|
1573 while ((len < 0 || i < len) && str[i]) |
|
1574 { |
|
1575 gunichar wc = str[i]; |
|
1576 |
|
1577 if (wc < 0xd800) |
|
1578 n16 += 1; |
|
1579 else if (wc < 0xe000) |
|
1580 { |
|
1581 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1582 _("Invalid sequence in conversion input")); |
|
1583 |
|
1584 goto err_out; |
|
1585 } |
|
1586 else if (wc < 0x10000) |
|
1587 n16 += 1; |
|
1588 else if (wc < 0x110000) |
|
1589 n16 += 2; |
|
1590 else |
|
1591 { |
|
1592 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
|
1593 _("Character out of range for UTF-16")); |
|
1594 |
|
1595 goto err_out; |
|
1596 } |
|
1597 |
|
1598 i++; |
|
1599 } |
|
1600 result = g_new (gunichar2, n16 + 1); |
|
1601 |
|
1602 for (i = 0, j = 0; j < n16; i++) |
|
1603 { |
|
1604 gunichar wc = str[i]; |
|
1605 |
|
1606 if (wc < 0x10000) |
|
1607 { |
|
1608 result[j++] = wc; |
|
1609 } |
|
1610 else |
|
1611 { |
|
1612 result[j++] = (wc - 0x10000) / 0x400 + 0xd800; |
|
1613 result[j++] = (wc - 0x10000) % 0x400 + 0xdc00; |
|
1614 } |
|
1615 } |
|
1616 result[j] = 0; |
|
1617 |
|
1618 if (items_written) |
|
1619 *items_written = n16; |
|
1620 |
|
1621 err_out: |
|
1622 if (items_read) |
|
1623 *items_read = i; |
|
1624 |
|
1625 return result; |
|
1626 } |
|
1627 |
|
/* Consume one UTF-8 continuation byte (10xxxxxx) at *p.
 *
 * Relies on three names from the expanding scope: the byte pointer `p`,
 * the accumulator `val` that receives the six payload bits, and an
 * `error` label that is jumped to when *p is not a continuation byte.
 */
#define CONTINUATION_CHAR                                   \
 G_STMT_START {                                             \
  if ((*(guchar *)p & 0xc0) != 0x80) /* 10xxxxxx */         \
    goto error;                                             \
  val = (val << 6) | ((*(guchar *)p) & 0x3f);               \
 } G_STMT_END
|
1635 |
|
1636 static const gchar * |
|
1637 fast_validate (const char *str) |
|
1638 |
|
1639 { |
|
1640 gunichar val = 0; |
|
1641 gunichar min = 0; |
|
1642 const gchar *p; |
|
1643 |
|
1644 for (p = str; *p; p++) |
|
1645 { |
|
1646 if (*(guchar *)p < 128) |
|
1647 /* done */; |
|
1648 else |
|
1649 { |
|
1650 const gchar *last; |
|
1651 |
|
1652 last = p; |
|
1653 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ |
|
1654 { |
|
1655 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) |
|
1656 goto error; |
|
1657 p++; |
|
1658 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ |
|
1659 goto error; |
|
1660 } |
|
1661 else |
|
1662 { |
|
1663 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ |
|
1664 { |
|
1665 min = (1 << 11); |
|
1666 val = *(guchar *)p & 0x0f; |
|
1667 goto TWO_REMAINING; |
|
1668 } |
|
1669 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ |
|
1670 { |
|
1671 min = (1 << 16); |
|
1672 val = *(guchar *)p & 0x07; |
|
1673 } |
|
1674 else |
|
1675 goto error; |
|
1676 |
|
1677 p++; |
|
1678 CONTINUATION_CHAR; |
|
1679 TWO_REMAINING: |
|
1680 p++; |
|
1681 CONTINUATION_CHAR; |
|
1682 p++; |
|
1683 CONTINUATION_CHAR; |
|
1684 |
|
1685 if (G_UNLIKELY (val < min)) |
|
1686 goto error; |
|
1687 |
|
1688 if (G_UNLIKELY (!UNICODE_VALID(val))) |
|
1689 goto error; |
|
1690 } |
|
1691 |
|
1692 continue; |
|
1693 |
|
1694 error: |
|
1695 return last; |
|
1696 } |
|
1697 } |
|
1698 |
|
1699 return p; |
|
1700 } |
|
1701 |
|
1702 static const gchar * |
|
1703 fast_validate_len (const char *str, |
|
1704 gssize max_len) |
|
1705 |
|
1706 { |
|
1707 gunichar val = 0; |
|
1708 gunichar min = 0; |
|
1709 const gchar *p; |
|
1710 |
|
1711 for (p = str; (max_len < 0 || (p - str) < max_len) && *p; p++) |
|
1712 { |
|
1713 if (*(guchar *)p < 128) |
|
1714 /* done */; |
|
1715 else |
|
1716 { |
|
1717 const gchar *last; |
|
1718 |
|
1719 last = p; |
|
1720 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */ |
|
1721 { |
|
1722 if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 2)) |
|
1723 goto error; |
|
1724 |
|
1725 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0)) |
|
1726 goto error; |
|
1727 p++; |
|
1728 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */ |
|
1729 goto error; |
|
1730 } |
|
1731 else |
|
1732 { |
|
1733 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */ |
|
1734 { |
|
1735 if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 3)) |
|
1736 goto error; |
|
1737 |
|
1738 min = (1 << 11); |
|
1739 val = *(guchar *)p & 0x0f; |
|
1740 goto TWO_REMAINING; |
|
1741 } |
|
1742 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */ |
|
1743 { |
|
1744 if (G_UNLIKELY (max_len >= 0 && max_len - (p - str) < 4)) |
|
1745 goto error; |
|
1746 |
|
1747 min = (1 << 16); |
|
1748 val = *(guchar *)p & 0x07; |
|
1749 } |
|
1750 else |
|
1751 goto error; |
|
1752 |
|
1753 p++; |
|
1754 CONTINUATION_CHAR; |
|
1755 TWO_REMAINING: |
|
1756 p++; |
|
1757 CONTINUATION_CHAR; |
|
1758 p++; |
|
1759 CONTINUATION_CHAR; |
|
1760 |
|
1761 if (G_UNLIKELY (val < min)) |
|
1762 goto error; |
|
1763 if (G_UNLIKELY (!UNICODE_VALID(val))) |
|
1764 goto error; |
|
1765 } |
|
1766 |
|
1767 continue; |
|
1768 |
|
1769 error: |
|
1770 return last; |
|
1771 } |
|
1772 } |
|
1773 |
|
1774 return p; |
|
1775 } |
|
1776 |
|
1777 /** |
|
1778 * g_utf8_validate: |
|
1779 * @str: a pointer to character data |
|
1780 * @max_len: max bytes to validate, or -1 to go until NUL |
|
1781 * @end: return location for end of valid data |
|
1782 * |
|
1783 * Validates UTF-8 encoded text. @str is the text to validate; |
|
1784 * if @str is nul-terminated, then @max_len can be -1, otherwise |
|
1785 * @max_len should be the number of bytes to validate. |
|
1786 * If @end is non-%NULL, then the end of the valid range |
|
1787 * will be stored there (i.e. the start of the first invalid |
|
1788 * character if some bytes were invalid, or the end of the text |
|
1789 * being validated otherwise). |
|
1790 * |
|
1791 * Note that g_utf8_validate() returns %FALSE if @max_len is |
|
1792 * positive and NUL is met before @max_len bytes have been read. |
|
1793 * |
|
1794 * Returns %TRUE if all of @str was valid. Many GLib and GTK+ |
|
1795 * routines <emphasis>require</emphasis> valid UTF-8 as input; |
|
1796 * so data read from a file or the network should be checked |
|
1797 * with g_utf8_validate() before doing anything else with it. |
|
1798 * |
|
1799 * Return value: %TRUE if the text was valid UTF-8 |
|
1800 **/ |
|
1801 EXPORT_C gboolean |
|
1802 g_utf8_validate (const char *str, |
|
1803 gssize max_len, |
|
1804 const gchar **end) |
|
1805 |
|
1806 { |
|
1807 const gchar *p; |
|
1808 |
|
1809 if (max_len < 0) |
|
1810 p = fast_validate (str); |
|
1811 else |
|
1812 p = fast_validate_len (str, max_len); |
|
1813 |
|
1814 if (end) |
|
1815 *end = p; |
|
1816 |
|
1817 if ((max_len >= 0 && p != str + max_len) || |
|
1818 (max_len < 0 && *p != '\0')) |
|
1819 return FALSE; |
|
1820 else |
|
1821 return TRUE; |
|
1822 } |
|
1823 |
|
1824 |
|
1825 /** |
|
1826 * g_unichar_validate: |
|
1827 * @ch: a Unicode character |
|
1828 * |
|
1829 * Checks whether @ch is a valid Unicode character. Some possible |
|
1830 * integer values of @ch will not be valid. 0 is considered a valid |
|
1831 * character, though it's normally a string terminator. |
|
1832 * |
|
1833 * Return value: %TRUE if @ch is a valid Unicode character |
|
1834 **/ |
|
1835 EXPORT_C gboolean |
|
1836 g_unichar_validate (gunichar ch) |
|
1837 { |
|
1838 return UNICODE_VALID (ch); |
|
1839 } |
|
1840 |
|
1841 /** |
|
1842 * g_utf8_strreverse: |
|
1843 * @str: a UTF-8 encoded string |
|
1844 * @len: the maximum length of @str to use. If @len < 0, then |
|
1845 * the string is nul-terminated. |
|
1846 * |
|
1847 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text. |
|
1848 * (Use g_utf8_validate() on all text before trying to use UTF-8 |
|
1849 * utility functions with it.) |
|
1850 * |
|
1851 * Note that unlike g_strreverse(), this function returns |
|
1852 * newly-allocated memory, which should be freed with g_free() when |
|
1853 * no longer needed. |
|
1854 * |
|
1855 * Returns: a newly-allocated string which is the reverse of @str. |
|
1856 * |
|
1857 * Since: 2.2 |
|
1858 */ |
|
1859 EXPORT_C gchar * |
|
1860 g_utf8_strreverse (const gchar *str, |
|
1861 gssize len) |
|
1862 { |
|
1863 gchar *result; |
|
1864 const gchar *p; |
|
1865 gchar *m, *r, skip; |
|
1866 |
|
1867 if (len < 0) |
|
1868 len = strlen (str); |
|
1869 result = g_new (gchar, len + 1); |
|
1870 r = result + len; |
|
1871 p = str; |
|
1872 while (*p) |
|
1873 { |
|
1874 skip = g_utf8_skip[*(guchar*)p]; |
|
1875 r -= skip; |
|
1876 for (m = r; skip; skip--) |
|
1877 *m++ = *p++; |
|
1878 } |
|
1879 result[len] = 0; |
|
1880 |
|
1881 return result; |
|
1882 } |
|
1883 |
|
1884 #define __G_UTF8_C__ |
|
1885 #include "galiasdef.c" |