diff -r 000000000000 -r e4d67989cc36 glib/libglib/src/gunidecomp.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/glib/libglib/src/gunidecomp.c Tue Feb 02 02:01:42 2010 +0200 @@ -0,0 +1,525 @@ +/* decomp.c - Character decomposition. + * + * Copyright (C) 1999, 2000 Tom Tromey + * Copyright 2000 Red Hat, Inc. + * Portions copyright (c) 2006 Nokia Corporation. All rights reserved. + * + * The Gnome Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The Gnome Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include "config.h" + +#include + +#include "glib.h" +#include "gunidecomp.h" +#include "gunicomp.h" +#include "gunicodeprivate.h" +#include "galias.h" + + +#define CC_PART1(Page, Char) \ + ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ + ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ + : (cclass_data[combining_class_table_part1[Page]][Char])) + +#define CC_PART2(Page, Char) \ + ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ + ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ + : (cclass_data[combining_class_table_part2[Page]][Char])) + +#define COMBINING_CLASS(Char) \ + (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ + ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \ + : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ + ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ + : 0)) + +gint +_g_unichar_combining_class (gunichar uc) +{ + return COMBINING_CLASS (uc); +} + +/* constants for hangul syllable [de]composition */ +#define SBase 0xAC00 +#define LBase 0x1100 +#define VBase 0x1161 +#define TBase 0x11A7 +#define LCount 19 +#define VCount 21 +#define TCount 28 +#define NCount (VCount * TCount) +#define SCount (LCount * NCount) + +/** + * g_unicode_canonical_ordering: + * @string: a UCS-4 encoded string. + * @len: the maximum length of @string to use. + * + * Computes the canonical ordering of a string in-place. + * This rearranges decomposed characters in the string + * according to their combining classes. See the Unicode + * manual for more information. + **/ +EXPORT_C void +g_unicode_canonical_ordering (gunichar *string, + gsize len) +{ + gsize i; + int swap = 1; + + while (swap) + { + int last; + swap = 0; + last = COMBINING_CLASS (string[0]); + for (i = 0; i < len - 1; ++i) + { + int next = COMBINING_CLASS (string[i + 1]); + if (next != 0 && last > next) + { + gsize j; + /* Percolate item leftward through string. */ + for (j = i + 1; j > 0; --j) + { + gunichar t; + if (COMBINING_CLASS (string[j - 1]) <= next) + break; + t = string[j]; + string[j] = string[j - 1]; + string[j - 1] = t; + swap = 1; + } + /* We're re-entering the loop looking at the old + character again. */ + next = last; + } + last = next; + } + } +} + +/* http://www.unicode.org/unicode/reports/tr15/#Hangul + * r should be null or have sufficient space. Calling with r == NULL will + * only calculate the result_len; however, a buffer with space for three + * characters will always be big enough. */ +static void +decompose_hangul (gunichar s, + gunichar *r, + gsize *result_len) +{ + gint SIndex = s - SBase; + + /* not a hangul syllable */ + if (SIndex < 0 || SIndex >= SCount) + { + if (r) + r[0] = s; + *result_len = 1; + } + else + { + gunichar L = LBase + SIndex / NCount; + gunichar V = VBase + (SIndex % NCount) / TCount; + gunichar T = TBase + SIndex % TCount; + + if (r) + { + r[0] = L; + r[1] = V; + } + + if (T != TBase) + { + if (r) + r[2] = T; + *result_len = 3; + } + else + *result_len = 2; + } +} + +/* returns a pointer to a null-terminated UTF-8 string */ +static const gchar * +find_decomposition (gunichar ch, + gboolean compat) +{ + int start = 0; + int end = G_N_ELEMENTS (decomp_table); + + if (ch >= decomp_table[start].ch && + ch <= decomp_table[end - 1].ch) + { + while (TRUE) + { + int half = (start + end) / 2; + if (ch == decomp_table[half].ch) + { + int offset; + + if (compat) + { + offset = decomp_table[half].compat_offset; + if (offset == G_UNICODE_NOT_PRESENT_OFFSET) + offset = decomp_table[half].canon_offset; + } + else + { + offset = decomp_table[half].canon_offset; + if (offset == G_UNICODE_NOT_PRESENT_OFFSET) + return NULL; + } + + return &(decomp_expansion_string[offset]); + } + else if (half == start) + break; + else if (ch > decomp_table[half].ch) + start = half; + else + end = half; + } + } + + return NULL; +} + +/** + * g_unicode_canonical_decomposition: + * @ch: a Unicode character. + * @result_len: location to store the length of the return value. + * + * Computes the canonical decomposition of a Unicode character. + * + * Return value: a newly allocated string of Unicode characters. + * @result_len is set to the resulting length of the string. + **/ +EXPORT_C gunichar * +g_unicode_canonical_decomposition (gunichar ch, + gsize *result_len) +{ + const gchar *decomp; + const gchar *p; + gunichar *r; + + /* Hangul syllable */ + if (ch >= 0xac00 && ch <= 0xd7a3) + { + decompose_hangul (ch, NULL, result_len); + r = g_malloc (*result_len * sizeof (gunichar)); + decompose_hangul (ch, r, result_len); + } + else if ((decomp = find_decomposition (ch, FALSE)) != NULL) + { + /* Found it. */ + int i; + + *result_len = g_utf8_strlen (decomp, -1); + r = g_malloc (*result_len * sizeof (gunichar)); + + for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) + r[i] = g_utf8_get_char (p); + } + else + { + /* Not in our table. */ + r = g_malloc (sizeof (gunichar)); + *r = ch; + *result_len = 1; + } + + /* Supposedly following the Unicode 2.1.9 table means that the + decompositions come out in canonical order. I haven't tested + this, but we rely on it here. */ + return r; +} + +/* L,V => LV and LV,T => LVT */ +static gboolean +combine_hangul (gunichar a, + gunichar b, + gunichar *result) +{ + gint LIndex = a - LBase; + gint SIndex = a - SBase; + + gint VIndex = b - VBase; + gint TIndex = b - TBase; + + if (0 <= LIndex && LIndex < LCount + && 0 <= VIndex && VIndex < VCount) + { + *result = SBase + (LIndex * VCount + VIndex) * TCount; + return TRUE; + } + else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 + && 0 < TIndex && TIndex < TCount) + { + *result = a + TIndex; + return TRUE; + } + + return FALSE; +} + +#define CI(Page, Char) \ + ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ + ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ + : (compose_data[compose_table[Page]][Char])) + +#define COMPOSE_INDEX(Char) \ + (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) + +static gboolean +combine (gunichar a, + gunichar b, + gunichar *result) +{ + gushort index_a, index_b; + + if (combine_hangul (a, b, result)) + return TRUE; + + index_a = COMPOSE_INDEX(a); + + if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) + { + if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) + { + *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; + return TRUE; + } + else + return FALSE; + } + + index_b = COMPOSE_INDEX(b); + + if (index_b >= COMPOSE_SECOND_SINGLE_START) + { + if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) + { + *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; + return TRUE; + } + else + return FALSE; + } + + if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && + index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) + { + gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; + + if (res) + { + *result = res; + return TRUE; + } + } + + return FALSE; +} + +gunichar * +_g_utf8_normalize_wc (const gchar *str, + gssize max_len, + GNormalizeMode mode) +{ + gsize n_wc; + gunichar *wc_buffer; + const char *p; + gsize last_start; + gboolean do_compat = (mode == G_NORMALIZE_NFKC || + mode == G_NORMALIZE_NFKD); + gboolean do_compose = (mode == G_NORMALIZE_NFC || + mode == G_NORMALIZE_NFKC); + + n_wc = 0; + p = str; + while ((max_len < 0 || p < str + max_len) && *p) + { + const gchar *decomp; + gunichar wc = g_utf8_get_char (p); + + if (wc >= 0xac00 && wc <= 0xd7a3) + { + gsize result_len; + decompose_hangul (wc, NULL, &result_len); + n_wc += result_len; + } + else + { + decomp = find_decomposition (wc, do_compat); + + if (decomp) + n_wc += g_utf8_strlen (decomp, -1); + else + n_wc++; + } + + p = g_utf8_next_char (p); + } + wc_buffer = g_new (gunichar, n_wc + 1); + + last_start = 0; + n_wc = 0; + p = str; + while ((max_len < 0 || p < str + max_len) && *p) + { + gunichar wc = g_utf8_get_char (p); + const gchar *decomp; + int cc; + gsize old_n_wc = n_wc; + + if (wc >= 0xac00 && wc <= 0xd7a3) + { + gsize result_len; + decompose_hangul (wc, wc_buffer + n_wc, &result_len); + n_wc += result_len; + } + else + { + decomp = find_decomposition (wc, do_compat); + + if (decomp) + { + const char *pd; + for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) + wc_buffer[n_wc++] = g_utf8_get_char (pd); + } + else + wc_buffer[n_wc++] = wc; + } + + if (n_wc > 0) + { + cc = COMBINING_CLASS (wc_buffer[old_n_wc]); + + if (cc == 0) + { + g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); + last_start = old_n_wc; + } + } + + p = g_utf8_next_char (p); + } + + if (n_wc > 0) + { + g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); + last_start = n_wc; + } + + wc_buffer[n_wc] = 0; + + /* All decomposed and reordered */ + + if (do_compose && n_wc > 0) + { + gsize i, j; + int last_cc = 0; + last_start = 0; + + for (i = 0; i < n_wc; i++) + { + int cc = COMBINING_CLASS (wc_buffer[i]); + + if (i > 0 && + (last_cc == 0 || last_cc != cc) && + combine (wc_buffer[last_start], wc_buffer[i], + &wc_buffer[last_start])) + { + for (j = i + 1; j < n_wc; j++) + wc_buffer[j-1] = wc_buffer[j]; + n_wc--; + i--; + + if (i == last_start) + last_cc = 0; + else + last_cc = COMBINING_CLASS (wc_buffer[i-1]); + + continue; + } + + if (cc == 0) + last_start = i; + + last_cc = cc; + } + } + + wc_buffer[n_wc] = 0; + + return wc_buffer; +} + +/** + * g_utf8_normalize: + * @str: a UTF-8 encoded string. + * @len: length of @str, in bytes, or -1 if @str is nul-terminated. + * @mode: the type of normalization to perform. + * + * Converts a string into canonical form, standardizing + * such issues as whether a character with an accent + * is represented as a base character and combining + * accent or as a single precomposed character. You + * should generally call g_utf8_normalize() before + * comparing two Unicode strings. + * + * The normalization mode %G_NORMALIZE_DEFAULT only + * standardizes differences that do not affect the + * text content, such as the above-mentioned accent + * representation. %G_NORMALIZE_ALL also standardizes + * the "compatibility" characters in Unicode, such + * as SUPERSCRIPT THREE to the standard forms + * (in this case DIGIT THREE). Formatting information + * may be lost but for most text operations such + * characters should be considered the same. + * For example, g_utf8_collate() normalizes + * with %G_NORMALIZE_ALL as its first step. + * + * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE + * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, + * but returned a result with composed forms rather + * than a maximally decomposed form. This is often + * useful if you intend to convert the string to + * a legacy encoding or pass it to a system with + * less capable Unicode handling. + * + * Return value: a newly allocated string, that is the + * normalized form of @str. + **/ +EXPORT_C gchar * +g_utf8_normalize (const gchar *str, + gssize len, + GNormalizeMode mode) +{ + gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); + gchar *result; + + result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); + g_free (result_wc); + + return result; +} + +#define __G_UNIDECOMP_C__ +#include "galiasdef.c"