FCL/sf/os/ossrv: comparison glib/libglib/src/gunidecomp.c

equal deleted inserted replaced

--1:000000000000
+:e4d67989cc36
+/* decomp.c - Character decomposition.
+*
+*  Copyright (C) 1999, 2000 Tom Tromey
+*  Copyright 2000 Red Hat, Inc.
+* Portions copyright (c) 2006 Nokia Corporation.  All rights reserved.
+*
+* The Gnome Library is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* The Gnome Library is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with the Gnome Library; see the file COPYING.LIB.  If not,
+* write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+*   Boston, MA 02111-1307, USA.
+*/
+#include "config.h"
+#include <stdlib.h>
+#include "glib.h"
+#include "gunidecomp.h"
+#include "gunicomp.h"
+#include "gunicodeprivate.h"
+#include "galias.h"
+#define CC_PART1(Page, Char) \
+((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+: (cclass_data[combining_class_table_part1[Page]][Char]))
+#define CC_PART2(Page, Char) \
+((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+: (cclass_data[combining_class_table_part2[Page]][Char]))
+#define COMBINING_CLASS(Char) \
+(((Char) <= G_UNICODE_LAST_CHAR_PART1) \
+? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
+: (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
+? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
+: 0))
+gint
+_g_unichar_combining_class (gunichar uc)
+{
+return COMBINING_CLASS (uc);
+}
+/* constants for hangul syllable [de]composition */
+#define SBase 0xAC00
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define NCount (VCount * TCount)
+#define SCount (LCount * NCount)
+/**
+* g_unicode_canonical_ordering:
+* @string: a UCS-4 encoded string.
+* @len: the maximum length of @string to use.
+*
+* Computes the canonical ordering of a string in-place.
+* This rearranges decomposed characters in the string
+* according to their combining classes.  See the Unicode
+* manual for more information.
+**/
+EXPORT_C void
+g_unicode_canonical_ordering (gunichar *string,
+			      gsize     len)
+{
+gsize i;
+int swap = 1;
+while (swap)
+{
+int last;
+swap = 0;
+last = COMBINING_CLASS (string[0]);
+for (i = 0; i < len - 1; ++i)
+	{
+	  int next = COMBINING_CLASS (string[i + 1]);
+	  if (next != 0 && last > next)
+	    {
+	      gsize j;
+	      /* Percolate item leftward through string.  */
+	      for (j = i + 1; j > 0; --j)
+		{
+		  gunichar t;
+		  if (COMBINING_CLASS (string[j - 1]) <= next)
+		    break;
+		  t = string[j];
+		  string[j] = string[j - 1];
+		  string[j - 1] = t;
+		  swap = 1;
+		}
+	      /* We're re-entering the loop looking at the old
+		 character again.  */
+	      next = last;
+	    }
+	  last = next;
+	}
+}
+}
+/* http://www.unicode.org/unicode/reports/tr15/#Hangul
+* r should be null or have sufficient space. Calling with r == NULL will
+* only calculate the result_len; however, a buffer with space for three
+* characters will always be big enough. */
+static void
+decompose_hangul (gunichar s,
+gunichar *r,
+gsize *result_len)
+{
+gint SIndex = s - SBase;
+/* not a hangul syllable */
+if (SIndex < 0 || SIndex >= SCount)
+{
+if (r)
+r[0] = s;
+*result_len = 1;
+}
+else
+{
+gunichar L = LBase + SIndex / NCount;
+gunichar V = VBase + (SIndex % NCount) / TCount;
+gunichar T = TBase + SIndex % TCount;
+if (r)
+{
+r[0] = L;
+r[1] = V;
+}
+if (T != TBase)
+{
+if (r)
+r[2] = T;
+*result_len = 3;
+}
+else
+*result_len = 2;
+}
+}
+/* returns a pointer to a null-terminated UTF-8 string */
+static const gchar *
+find_decomposition (gunichar ch,
+		    gboolean compat)
+{
+int start = 0;
+int end = G_N_ELEMENTS (decomp_table);
+if (ch >= decomp_table[start].ch &&
+ch <= decomp_table[end - 1].ch)
+{
+while (TRUE)
+	{
+	  int half = (start + end) / 2;
+	  if (ch == decomp_table[half].ch)
+	    {
+	      int offset;
+	      if (compat)
+		{
+		  offset = decomp_table[half].compat_offset;
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
+		    offset = decomp_table[half].canon_offset;
+		}
+	      else
+		{
+		  offset = decomp_table[half].canon_offset;
+		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
+		    return NULL;
+		}
+	      return &(decomp_expansion_string[offset]);
+	    }
+	  else if (half == start)
+	    break;
+	  else if (ch > decomp_table[half].ch)
+	    start = half;
+	  else
+	    end = half;
+	}
+}
+return NULL;
+}
+/**
+* g_unicode_canonical_decomposition:
+* @ch: a Unicode character.
+* @result_len: location to store the length of the return value.
+*
+* Computes the canonical decomposition of a Unicode character.
+*
+* Return value: a newly allocated string of Unicode characters.
+*   @result_len is set to the resulting length of the string.
+**/
+EXPORT_C gunichar *
+g_unicode_canonical_decomposition (gunichar ch,
+				   gsize   *result_len)
+{
+const gchar *decomp;
+const gchar *p;
+gunichar *r;
+/* Hangul syllable */
+if (ch >= 0xac00 && ch <= 0xd7a3)
+{
+decompose_hangul (ch, NULL, result_len);
+r = g_malloc (*result_len * sizeof (gunichar));
+decompose_hangul (ch, r, result_len);
+}
+else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
+{
+/* Found it.  */
+int i;
+*result_len = g_utf8_strlen (decomp, -1);
+r = g_malloc (*result_len * sizeof (gunichar));
+for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
+r[i] = g_utf8_get_char (p);
+}
+else
+{
+/* Not in our table.  */
+r = g_malloc (sizeof (gunichar));
+*r = ch;
+*result_len = 1;
+}
+/* Supposedly following the Unicode 2.1.9 table means that the
+decompositions come out in canonical order.  I haven't tested
+this, but we rely on it here.  */
+return r;
+}
+/* L,V => LV and LV,T => LVT  */
+static gboolean
+combine_hangul (gunichar a,
+gunichar b,
+gunichar *result)
+{
+gint LIndex = a - LBase;
+gint SIndex = a - SBase;
+gint VIndex = b - VBase;
+gint TIndex = b - TBase;
+if (0 <= LIndex && LIndex < LCount
+&& 0 <= VIndex && VIndex < VCount)
+{
+*result = SBase + (LIndex * VCount + VIndex) * TCount;
+return TRUE;
+}
+else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
+&& 0 < TIndex && TIndex < TCount)
+{
+*result = a + TIndex;
+return TRUE;
+}
+return FALSE;
+}
+#define CI(Page, Char) \
+((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
+? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
+: (compose_data[compose_table[Page]][Char]))
+#define COMPOSE_INDEX(Char) \
+(((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
+static gboolean
+combine (gunichar  a,
+	 gunichar  b,
+	 gunichar *result)
+{
+gushort index_a, index_b;
+if (combine_hangul (a, b, result))
+return TRUE;
+index_a = COMPOSE_INDEX(a);
+if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
+{
+if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
+	{
+	  *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
+	  return TRUE;
+	}
+else
+return FALSE;
+}
+index_b = COMPOSE_INDEX(b);
+if (index_b >= COMPOSE_SECOND_SINGLE_START)
+{
+if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
+	{
+	  *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
+	  return TRUE;
+	}
+else
+return FALSE;
+}
+if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
+index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
+{
+gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
+if (res)
+	{
+	  *result = res;
+	  return TRUE;
+	}
+}
+return FALSE;
+}
+gunichar *
+_g_utf8_normalize_wc (const gchar    *str,
+		      gssize          max_len,
+		      GNormalizeMode  mode)
+{
+gsize n_wc;
+gunichar *wc_buffer;
+const char *p;
+gsize last_start;
+gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
+			mode == G_NORMALIZE_NFKD);
+gboolean do_compose = (mode == G_NORMALIZE_NFC ||
+			 mode == G_NORMALIZE_NFKC);
+n_wc = 0;
+p = str;
+while ((max_len < 0 || p < str + max_len) && *p)
+{
+const gchar *decomp;
+gunichar wc = g_utf8_get_char (p);
+if (wc >= 0xac00 && wc <= 0xd7a3)
+{
+gsize result_len;
+decompose_hangul (wc, NULL, &result_len);
+n_wc += result_len;
+}
+else
+{
+decomp = find_decomposition (wc, do_compat);
+if (decomp)
+n_wc += g_utf8_strlen (decomp, -1);
+else
+n_wc++;
+}
+p = g_utf8_next_char (p);
+}
+wc_buffer = g_new (gunichar, n_wc + 1);
+last_start = 0;
+n_wc = 0;
+p = str;
+while ((max_len < 0 || p < str + max_len) && *p)
+{
+gunichar wc = g_utf8_get_char (p);
+const gchar *decomp;
+int cc;
+gsize old_n_wc = n_wc;
+if (wc >= 0xac00 && wc <= 0xd7a3)
+{
+gsize result_len;
+decompose_hangul (wc, wc_buffer + n_wc, &result_len);
+n_wc += result_len;
+}
+else
+{
+decomp = find_decomposition (wc, do_compat);
+if (decomp)
+{
+const char *pd;
+for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
+wc_buffer[n_wc++] = g_utf8_get_char (pd);
+}
+else
+wc_buffer[n_wc++] = wc;
+}
+if (n_wc > 0)
+	{
+	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
+	  if (cc == 0)
+	    {
+	      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
+	      last_start = old_n_wc;
+	    }
+	}
+p = g_utf8_next_char (p);
+}
+if (n_wc > 0)
+{
+g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
+last_start = n_wc;
+}
+wc_buffer[n_wc] = 0;
+/* All decomposed and reordered */
+if (do_compose && n_wc > 0)
+{
+gsize i, j;
+int last_cc = 0;
+last_start = 0;
+for (i = 0; i < n_wc; i++)
+	{
+	  int cc = COMBINING_CLASS (wc_buffer[i]);
+	  if (i > 0 &&
+	      (last_cc == 0 || last_cc != cc) &&
+	      combine (wc_buffer[last_start], wc_buffer[i],
+		       &wc_buffer[last_start]))
+	    {
+	      for (j = i + 1; j < n_wc; j++)
+		wc_buffer[j-1] = wc_buffer[j];
+	      n_wc--;
+	      i--;
+	      if (i == last_start)
+		last_cc = 0;
+	      else
+		last_cc = COMBINING_CLASS (wc_buffer[i-1]);
+	      continue;
+	    }
+	  if (cc == 0)
+	    last_start = i;
+	  last_cc = cc;
+	}
+}
+wc_buffer[n_wc] = 0;
+return wc_buffer;
+}
+/**
+* g_utf8_normalize:
+* @str: a UTF-8 encoded string.
+* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
+* @mode: the type of normalization to perform.
+*
+* Converts a string into canonical form, standardizing
+* such issues as whether a character with an accent
+* is represented as a base character and combining
+* accent or as a single precomposed character. You
+* should generally call g_utf8_normalize() before
+* comparing two Unicode strings.
+*
+* The normalization mode %G_NORMALIZE_DEFAULT only
+* standardizes differences that do not affect the
+* text content, such as the above-mentioned accent
+* representation. %G_NORMALIZE_ALL also standardizes
+* the "compatibility" characters in Unicode, such
+* as SUPERSCRIPT THREE to the standard forms
+* (in this case DIGIT THREE). Formatting information
+* may be lost but for most text operations such
+* characters should be considered the same.
+* For example, g_utf8_collate() normalizes
+* with %G_NORMALIZE_ALL as its first step.
+*
+* %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
+* are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
+* but returned a result with composed forms rather
+* than a maximally decomposed form. This is often
+* useful if you intend to convert the string to
+* a legacy encoding or pass it to a system with
+* less capable Unicode handling.
+*
+* Return value: a newly allocated string, that is the
+*   normalized form of @str.
+**/
+EXPORT_C gchar *
+g_utf8_normalize (const gchar    *str,
+		  gssize          len,
+		  GNormalizeMode  mode)
+{
+gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
+gchar *result;
+result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
+g_free (result_wc);
+return result;
+}
+#define __G_UNIDECOMP_C__
+#include "galiasdef.c"