glib/libglib/src/guniprop.c
branchRCL_3
changeset 57 2efc27d87e1c
parent 0 e4d67989cc36
equal deleted inserted replaced
56:acd3cd4aaceb 57:2efc27d87e1c
       
     1 /* guniprop.c - Unicode character properties.
       
     2  *
       
     3  * Copyright (C) 1999 Tom Tromey
       
     4  * Copyright (C) 2000 Red Hat, Inc.
       
     5  * Portions copyright (c) 2006 Nokia Corporation.  All rights reserved.
       
     6  *
       
     7  * This library is free software; you can redistribute it and/or
       
     8  * modify it under the terms of the GNU Lesser General Public
       
     9  * License as published by the Free Software Foundation; either
       
    10  * version 2 of the License, or (at your option) any later version.
       
    11  *
       
    12  * This library is distributed in the hope that it will be useful,
       
    13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
       
    15  * Lesser General Public License for more details.
       
    16  *
       
    17  * You should have received a copy of the GNU Lesser General Public
       
    18  * License along with this library; if not, write to the
       
    19  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
       
    20  * Boston, MA 02111-1307, USA.
       
    21  */
       
    22 
       
    23 #include "config.h"
       
    24 
       
    25 #include <stddef.h>
       
    26 #include <string.h>
       
    27 #include <locale.h>
       
    28 
       
    29 #include "glib.h"
       
    30 #include "gunichartables.h"
       
    31 #include "gmirroringtable.h"
       
    32 #include "gunicodeprivate.h"
       
    33 #include "galias.h"
       
    34 
       
/* Selects the attribute-table index for a 256-character page.  Pages up
 * to G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; any
 * other page is looked up in attr_table_part2, whose first entry
 * corresponds to page 0xe00.
 * NOTE(review): Page is not range-checked here; the callers visible in
 * this file only use it after TYPE() has classified the character. */
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])

/* Attribute value stored for character Char (low 8 bits) on page Page.
 * Evaluates to 0 when the page index equals G_UNICODE_MAX_TABLE_INDEX,
 * otherwise to the matching attr_data entry. */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))

/* Type of a character in the first table.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes a single type for the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry is an index
 * into type_data, which is then read per character. */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same lookup as TTYPE_PART1, but against type_table_part2. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Type of character Char.  Characters up to G_UNICODE_LAST_CHAR_PART1
 * use the first table; characters from 0xe0000 through
 * G_UNICODE_LAST_CHAR use the second table (page counted from 0xe0000);
 * everything else is reported as G_UNICODE_UNASSIGNED.
 * Char is evaluated several times, so it must be free of side effects. */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))
       
    58 
       
    59 
       
/* IS: non-zero when the bit for Type is set in the bit mask Class.
 * OR: adds the bit for Type to the bit mask Rest; nest calls and end
 *     with 0 to build a mask from a list of types. */
#define IS(Type, Class)	(((guint)1 << (Type)) & (Class))
#define OR(Type, Rest)	(((guint)1 << (Type)) | (Rest))



/* Non-zero for any of the three number types (decimal, letter, other). */
#define ISDIGIT(Type)	IS ((Type),				\
			    OR (G_UNICODE_DECIMAL_NUMBER,	\
			    OR (G_UNICODE_LETTER_NUMBER,	\
			    OR (G_UNICODE_OTHER_NUMBER,		0))))

/* Non-zero for any of the five letter types. */
#define ISALPHA(Type)	IS ((Type),				\
			    OR (G_UNICODE_LOWERCASE_LETTER,	\
			    OR (G_UNICODE_UPPERCASE_LETTER,	\
			    OR (G_UNICODE_TITLECASE_LETTER,	\
			    OR (G_UNICODE_MODIFIER_LETTER,	\
			    OR (G_UNICODE_OTHER_LETTER,		0))))))

/* Non-zero for any type accepted by ISDIGIT or ISALPHA. */
#define ISALDIGIT(Type)	IS ((Type),				\
			    OR (G_UNICODE_DECIMAL_NUMBER,	\
			    OR (G_UNICODE_LETTER_NUMBER,	\
			    OR (G_UNICODE_OTHER_NUMBER,		\
			    OR (G_UNICODE_LOWERCASE_LETTER,	\
			    OR (G_UNICODE_UPPERCASE_LETTER,	\
			    OR (G_UNICODE_TITLECASE_LETTER,	\
			    OR (G_UNICODE_MODIFIER_LETTER,	\
			    OR (G_UNICODE_OTHER_LETTER,		0)))))))))

/* Non-zero for non-spacing, combining and enclosing marks. */
#define ISMARK(Type)	IS ((Type),				\
			    OR (G_UNICODE_NON_SPACING_MARK,	\
			    OR (G_UNICODE_COMBINING_MARK,	\
			    OR (G_UNICODE_ENCLOSING_MARK,	0))))
       
    91 
       
    92 /**
       
    93  * g_unichar_isalnum:
       
    94  * @c: a Unicode character
       
    95  * 
       
    96  * Determines whether a character is alphanumeric.
       
    97  * Given some UTF-8 text, obtain a character value
       
    98  * with g_utf8_get_char().
       
    99  * 
       
   100  * Return value: %TRUE if @c is an alphanumeric character
       
   101  **/
       
   102 EXPORT_C gboolean
       
   103 g_unichar_isalnum (gunichar c)
       
   104 {
       
   105   return ISALDIGIT (TYPE (c)) ? TRUE : FALSE;
       
   106 }
       
   107 
       
   108 /**
       
   109  * g_unichar_isalpha:
       
   110  * @c: a Unicode character
       
   111  * 
       
   112  * Determines whether a character is alphabetic (i.e. a letter).
       
   113  * Given some UTF-8 text, obtain a character value with
       
   114  * g_utf8_get_char().
       
   115  * 
       
   116  * Return value: %TRUE if @c is an alphabetic character
       
   117  **/
       
   118 EXPORT_C gboolean
       
   119 g_unichar_isalpha (gunichar c)
       
   120 {
       
   121   return ISALPHA (TYPE (c)) ? TRUE : FALSE;
       
   122 }
       
   123 
       
   124 
       
   125 /**
       
   126  * g_unichar_iscntrl:
       
   127  * @c: a Unicode character
       
   128  * 
       
   129  * Determines whether a character is a control character.
       
   130  * Given some UTF-8 text, obtain a character value with
       
   131  * g_utf8_get_char().
       
   132  * 
       
   133  * Return value: %TRUE if @c is a control character
       
   134  **/
       
   135 EXPORT_C gboolean
       
   136 g_unichar_iscntrl (gunichar c)
       
   137 {
       
   138   return TYPE (c) == G_UNICODE_CONTROL;
       
   139 }
       
   140 
       
   141 /**
       
   142  * g_unichar_isdigit:
       
   143  * @c: a Unicode character
       
   144  * 
       
   145  * Determines whether a character is numeric (i.e. a digit).  This
       
   146  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
       
   147  * some UTF-8 text, obtain a character value with g_utf8_get_char().
       
   148  * 
       
   149  * Return value: %TRUE if @c is a digit
       
   150  **/
       
   151 EXPORT_C gboolean
       
   152 g_unichar_isdigit (gunichar c)
       
   153 {
       
   154   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
       
   155 }
       
   156 
       
   157 
       
   158 /**
       
   159  * g_unichar_isgraph:
       
   160  * @c: a Unicode character
       
   161  * 
       
   162  * Determines whether a character is printable and not a space
       
   163  * (returns %FALSE for control characters, format characters, and
       
   164  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
       
   165  * spaces. Given some UTF-8 text, obtain a character value with
       
   166  * g_utf8_get_char().
       
   167  * 
       
   168  * Return value: %TRUE if @c is printable unless it's a space
       
   169  **/
       
   170 EXPORT_C gboolean
       
   171 g_unichar_isgraph (gunichar c)
       
   172 {
       
   173   return !IS (TYPE(c),
       
   174 	      OR (G_UNICODE_CONTROL,
       
   175 	      OR (G_UNICODE_FORMAT,
       
   176 	      OR (G_UNICODE_UNASSIGNED,
       
   177 	      OR (G_UNICODE_PRIVATE_USE,
       
   178 	      OR (G_UNICODE_SURROGATE,
       
   179 	      OR (G_UNICODE_SPACE_SEPARATOR,
       
   180 	     0)))))));
       
   181 }
       
   182 
       
   183 /**
       
   184  * g_unichar_islower:
       
   185  * @c: a Unicode character
       
   186  * 
       
   187  * Determines whether a character is a lowercase letter.
       
   188  * Given some UTF-8 text, obtain a character value with
       
   189  * g_utf8_get_char().
       
   190  * 
       
   191  * Return value: %TRUE if @c is a lowercase letter
       
   192  **/
       
   193 EXPORT_C gboolean
       
   194 g_unichar_islower (gunichar c)
       
   195 {
       
   196   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
       
   197 }
       
   198 
       
   199 
       
   200 /**
       
   201  * g_unichar_isprint:
       
   202  * @c: a Unicode character
       
   203  * 
       
   204  * Determines whether a character is printable.
       
   205  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
       
   206  * Given some UTF-8 text, obtain a character value with
       
   207  * g_utf8_get_char().
       
   208  * 
       
   209  * Return value: %TRUE if @c is printable
       
   210  **/
       
   211 EXPORT_C gboolean
       
   212 g_unichar_isprint (gunichar c)
       
   213 {
       
   214   return !IS (TYPE(c),
       
   215 	      OR (G_UNICODE_CONTROL,
       
   216 	      OR (G_UNICODE_FORMAT,
       
   217 	      OR (G_UNICODE_UNASSIGNED,
       
   218 	      OR (G_UNICODE_PRIVATE_USE,
       
   219 	      OR (G_UNICODE_SURROGATE,
       
   220 	     0))))));
       
   221 }
       
   222 
       
   223 /**
       
   224  * g_unichar_ispunct:
       
   225  * @c: a Unicode character
       
   226  * 
       
   227  * Determines whether a character is punctuation or a symbol.
       
   228  * Given some UTF-8 text, obtain a character value with
       
   229  * g_utf8_get_char().
       
   230  * 
       
   231  * Return value: %TRUE if @c is a punctuation or symbol character
       
   232  **/
       
   233 EXPORT_C gboolean
       
   234 g_unichar_ispunct (gunichar c)
       
   235 {
       
   236   return IS (TYPE(c),
       
   237 	     OR (G_UNICODE_CONNECT_PUNCTUATION,
       
   238 	     OR (G_UNICODE_DASH_PUNCTUATION,
       
   239 	     OR (G_UNICODE_CLOSE_PUNCTUATION,
       
   240 	     OR (G_UNICODE_FINAL_PUNCTUATION,
       
   241 	     OR (G_UNICODE_INITIAL_PUNCTUATION,
       
   242 	     OR (G_UNICODE_OTHER_PUNCTUATION,
       
   243 	     OR (G_UNICODE_OPEN_PUNCTUATION,
       
   244 	     OR (G_UNICODE_CURRENCY_SYMBOL,
       
   245 	     OR (G_UNICODE_MODIFIER_SYMBOL,
       
   246 	     OR (G_UNICODE_MATH_SYMBOL,
       
   247 	     OR (G_UNICODE_OTHER_SYMBOL,
       
   248 	    0)))))))))))) ? TRUE : FALSE;
       
   249 }
       
   250 
       
   251 /**
       
   252  * g_unichar_isspace:
       
   253  * @c: a Unicode character
       
   254  * 
       
   255  * Determines whether a character is a space, tab, or line separator
       
   256  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
       
   257  * character value with g_utf8_get_char().
       
   258  *
       
   259  * (Note: don't use this to do word breaking; you have to use
       
   260  * Pango or equivalent to get word breaking right, the algorithm
       
   261  * is fairly complex.)
       
   262  *  
       
   263  * Return value: %TRUE if @c is a space character
       
   264  **/
       
   265 EXPORT_C gboolean
       
   266 g_unichar_isspace (gunichar c)
       
   267 {
       
   268   switch (c)
       
   269     {
       
   270       /* special-case these since Unicode thinks they are not spaces */
       
   271     case '\t':
       
   272     case '\n':
       
   273     case '\r':
       
   274     case '\f':
       
   275       return TRUE;
       
   276       break;
       
   277       
       
   278     default:
       
   279       {
       
   280 	return IS (TYPE(c),
       
   281 	           OR (G_UNICODE_SPACE_SEPARATOR,
       
   282 	           OR (G_UNICODE_LINE_SEPARATOR,
       
   283                    OR (G_UNICODE_PARAGRAPH_SEPARATOR,
       
   284 		  0)))) ? TRUE : FALSE;
       
   285       }
       
   286       break;
       
   287     }
       
   288 }
       
   289 
       
   290 /**
       
   291  * g_unichar_isupper:
       
   292  * @c: a Unicode character
       
   293  * 
       
   294  * Determines if a character is uppercase.
       
   295  * 
       
   296  * Return value: %TRUE if @c is an uppercase character
       
   297  **/
       
   298 EXPORT_C gboolean
       
   299 g_unichar_isupper (gunichar c)
       
   300 {
       
   301   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
       
   302 }
       
   303 
       
   304 /**
       
   305  * g_unichar_istitle:
       
   306  * @c: a Unicode character
       
   307  * 
       
   308  * Determines if a character is titlecase. Some characters in
       
   309  * Unicode which are composites, such as the DZ digraph
       
   310  * have three case variants instead of just two. The titlecase
       
   311  * form is used at the beginning of a word where only the
       
   312  * first letter is capitalized. The titlecase form of the DZ
       
   313  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
       
   314  * 
       
   315  * Return value: %TRUE if the character is titlecase
       
   316  **/
       
   317 EXPORT_C gboolean
       
   318 g_unichar_istitle (gunichar c)
       
   319 {
       
   320   unsigned int i;
       
   321   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   322     if (title_table[i][0] == c)
       
   323       return 1;
       
   324   return 0;
       
   325 }
       
   326 
       
   327 /**
       
   328  * g_unichar_isxdigit:
       
   329  * @c: a Unicode character.
       
   330  * 
       
   331  * Determines if a character is a hexidecimal digit.
       
   332  * 
       
   333  * Return value: %TRUE if the character is a hexadecimal digit
       
   334  **/
       
   335 EXPORT_C gboolean
       
   336 g_unichar_isxdigit (gunichar c)
       
   337 {
       
   338   return ((c >= 'a' && c <= 'f')
       
   339 	  || (c >= 'A' && c <= 'F')
       
   340 	  || ISDIGIT (TYPE (c)));
       
   341 }
       
   342 
       
   343 /**
       
   344  * g_unichar_isdefined:
       
   345  * @c: a Unicode character
       
   346  * 
       
   347  * Determines if a given character is assigned in the Unicode
       
   348  * standard.
       
   349  *
       
   350  * Return value: %TRUE if the character has an assigned value
       
   351  **/
       
   352 EXPORT_C gboolean
       
   353 g_unichar_isdefined (gunichar c)
       
   354 {
       
   355   return TYPE (c) != G_UNICODE_UNASSIGNED;
       
   356 }
       
   357 
       
   358 /**
       
   359  * g_unichar_iswide:
       
   360  * @c: a Unicode character
       
   361  * 
       
   362  * Determines if a character is typically rendered in a double-width
       
   363  * cell.
       
   364  * 
       
   365  * Return value: %TRUE if the character is wide
       
   366  **/
       
   367 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
       
   368 EXPORT_C gboolean
       
   369 g_unichar_iswide (gunichar c)
       
   370 {
       
   371   if (c < 0x1100)
       
   372     return FALSE;
       
   373 
       
   374   return (c <= 0x115f  /* Hangul Jamo init. consonants */ 
       
   375           || c == 0x2329 || c == 0x232a     /* angle brackets */
       
   376           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f) 
       
   377               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
       
   378           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
       
   379           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
       
   380           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
       
   381           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
       
   382           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
       
   383           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
       
   384           || (c >= 0x30000 && c <= 0x3fffd));
       
   385 }
       
   386 
       
   387 /**
       
   388  * g_unichar_toupper:
       
   389  * @c: a Unicode character
       
   390  * 
       
   391  * Converts a character to uppercase.
       
   392  * 
       
   393  * Return value: the result of converting @c to uppercase.
       
   394  *               If @c is not an lowercase or titlecase character,
       
   395  *               or has no upper case equivalent @c is returned unchanged.
       
   396  **/
       
   397 EXPORT_C gunichar
       
   398 g_unichar_toupper (gunichar c)
       
   399 {
       
   400   int t = TYPE (c);
       
   401   if (t == G_UNICODE_LOWERCASE_LETTER)
       
   402     {
       
   403       gunichar val = ATTTABLE (c >> 8, c & 0xff);
       
   404       if (val >= 0x1000000)
       
   405 	{
       
   406 	  const gchar *p = special_case_table + val - 0x1000000;
       
   407 	  return g_utf8_get_char (p);
       
   408 	}
       
   409       else
       
   410 	return val ? val : c;
       
   411     }
       
   412   else if (t == G_UNICODE_TITLECASE_LETTER)
       
   413     {
       
   414       unsigned int i;
       
   415       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   416 	{
       
   417 	  if (title_table[i][0] == c)
       
   418 	    return title_table[i][1];
       
   419 	}
       
   420     }
       
   421   return c;
       
   422 }
       
   423 
       
   424 /**
       
   425  * g_unichar_tolower:
       
   426  * @c: a Unicode character.
       
   427  * 
       
   428  * Converts a character to lower case.
       
   429  * 
       
   430  * Return value: the result of converting @c to lower case.
       
   431  *               If @c is not an upperlower or titlecase character,
       
   432  *               or has no lowercase equivalent @c is returned unchanged.
       
   433  **/
       
   434 EXPORT_C gunichar
       
   435 g_unichar_tolower (gunichar c)
       
   436 {
       
   437   int t = TYPE (c);
       
   438   if (t == G_UNICODE_UPPERCASE_LETTER)
       
   439     {
       
   440       gunichar val = ATTTABLE (c >> 8, c & 0xff);
       
   441       if (val >= 0x1000000)
       
   442 	{
       
   443 	  const gchar *p = special_case_table + val - 0x1000000;
       
   444 	  return g_utf8_get_char (p);
       
   445 	}
       
   446       else
       
   447 	return val ? val : c;
       
   448     }
       
   449   else if (t == G_UNICODE_TITLECASE_LETTER)
       
   450     {
       
   451       unsigned int i;
       
   452       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   453 	{
       
   454 	  if (title_table[i][0] == c)
       
   455 	    return title_table[i][2];
       
   456 	}
       
   457     }
       
   458   return c;
       
   459 }
       
   460 
       
   461 /**
       
   462  * g_unichar_totitle:
       
   463  * @c: a Unicode character
       
   464  * 
       
   465  * Converts a character to the titlecase.
       
   466  * 
       
   467  * Return value: the result of converting @c to titlecase.
       
   468  *               If @c is not an uppercase or lowercase character,
       
   469  *               @c is returned unchanged.
       
   470  **/
       
   471 EXPORT_C gunichar
       
   472 g_unichar_totitle (gunichar c)
       
   473 {
       
   474   unsigned int i;
       
   475   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   476     {
       
   477       if (title_table[i][0] == c || title_table[i][1] == c
       
   478 	  || title_table[i][2] == c)
       
   479 	return title_table[i][0];
       
   480     }
       
   481   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
       
   482 	  ? ATTTABLE (c >> 8, c & 0xff)
       
   483 	  : c);
       
   484 }
       
   485 
       
   486 /**
       
   487  * g_unichar_digit_value:
       
   488  * @c: a Unicode character
       
   489  *
       
   490  * Determines the numeric value of a character as a decimal
       
   491  * digit.
       
   492  *
       
   493  * Return value: If @c is a decimal digit (according to
       
   494  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
       
   495  **/
       
   496 EXPORT_C int
       
   497 g_unichar_digit_value (gunichar c)
       
   498 {
       
   499   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
       
   500     return ATTTABLE (c >> 8, c & 0xff);
       
   501   return -1;
       
   502 }
       
   503 
       
   504 /**
       
   505  * g_unichar_xdigit_value:
       
   506  * @c: a Unicode character
       
   507  *
       
   508  * Determines the numeric value of a character as a hexidecimal
       
   509  * digit.
       
   510  *
       
   511  * Return value: If @c is a hex digit (according to
       
   512  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
       
   513  **/
       
   514 EXPORT_C int
       
   515 g_unichar_xdigit_value (gunichar c)
       
   516 {
       
   517   if (c >= 'A' && c <= 'F')
       
   518     return c - 'A' + 10;
       
   519   if (c >= 'a' && c <= 'f')
       
   520     return c - 'a' + 10;
       
   521   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
       
   522     return ATTTABLE (c >> 8, c & 0xff);
       
   523   return -1;
       
   524 }
       
   525 
       
   526 /**
       
   527  * g_unichar_type:
       
   528  * @c: a Unicode character
       
   529  * 
       
   530  * Classifies a Unicode character by type.
       
   531  * 
       
   532  * Return value: the type of the character.
       
   533  **/
       
   534 EXPORT_C GUnicodeType
       
   535 g_unichar_type (gunichar c)
       
   536 {
       
   537   return TYPE (c);
       
   538 }
       
   539 
       
   540 /*
       
   541  * Case mapping functions
       
   542  */
       
   543 
       
/* Locale families that need special treatment during case conversion;
 * the value is chosen by get_locale_type() from the first two letters
 * of the current locale name. */
typedef enum {
  LOCALE_NORMAL,      /* no locale-specific tailoring */
  LOCALE_TURKIC,      /* locale name starts with "tr" or "az" */
  LOCALE_LITHUANIAN   /* locale name starts with "lt" */
} LocaleType;
       
   549 
       
   550 static LocaleType
       
   551 get_locale_type (void)
       
   552 {
       
   553 #ifdef G_OS_WIN32
       
   554   char *tem = g_win32_getlocale ();
       
   555   char locale[2];
       
   556 
       
   557   locale[0] = tem[0];
       
   558   locale[1] = tem[1];
       
   559   g_free (tem);
       
   560 #else
       
   561   const char *locale = setlocale (LC_CTYPE, NULL);
       
   562 #endif
       
   563 
       
   564   switch (locale[0])
       
   565     {
       
   566    case 'a':
       
   567       if (locale[1] == 'z')
       
   568 	return LOCALE_TURKIC;
       
   569       break;
       
   570     case 'l':
       
   571       if (locale[1] == 't')
       
   572 	return LOCALE_LITHUANIAN;
       
   573       break;
       
   574     case 't':
       
   575       if (locale[1] == 'r')
       
   576 	return LOCALE_TURKIC;
       
   577       break;
       
   578     }
       
   579 
       
   580   return LOCALE_NORMAL;
       
   581 }
       
   582 
       
   583 static gint
       
   584 output_marks (const char **p_inout,
       
   585 	      char        *out_buffer,
       
   586 	      gboolean     remove_dot)
       
   587 {
       
   588   const char *p = *p_inout;
       
   589   gint len = 0;
       
   590   
       
   591   while (*p)
       
   592     {
       
   593       gunichar c = g_utf8_get_char (p);
       
   594       
       
   595       if (ISMARK (TYPE (c)))
       
   596 	{
       
   597 	  if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
       
   598 	    len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
       
   599 	  p = g_utf8_next_char (p);
       
   600 	}
       
   601       else
       
   602 	break;
       
   603     }
       
   604 
       
   605   *p_inout = p;
       
   606   return len;
       
   607 }
       
   608 
       
   609 static gint
       
   610 output_special_case (gchar *out_buffer,
       
   611 		     int    offset,
       
   612 		     int    type,
       
   613 		     int    which)
       
   614 {
       
   615   const gchar *p = special_case_table + offset;
       
   616   gint len;
       
   617 
       
   618   if (type != G_UNICODE_TITLECASE_LETTER)
       
   619     p = g_utf8_next_char (p);
       
   620 
       
   621   if (which == 1)
       
   622     p += strlen (p) + 1;
       
   623 
       
   624   len = strlen (p);
       
   625   if (out_buffer)
       
   626     memcpy (out_buffer, p, len);
       
   627 
       
   628   return len;
       
   629 }
       
   630 
       
   631 static gsize
       
   632 real_toupper (const gchar *str,
       
   633 	      gssize       max_len,
       
   634 	      gchar       *out_buffer,
       
   635 	      LocaleType   locale_type)
       
   636 {
       
   637   const gchar *p = str;
       
   638   const char *last = NULL;
       
   639   gsize len = 0;
       
   640   gboolean last_was_i = FALSE;
       
   641 
       
   642   while ((max_len < 0 || p < str + max_len) && *p)
       
   643     {
       
   644       gunichar c = g_utf8_get_char (p);
       
   645       int t = TYPE (c);
       
   646       gunichar val;
       
   647 
       
   648       last = p;
       
   649       p = g_utf8_next_char (p);
       
   650 
       
   651       if (locale_type == LOCALE_LITHUANIAN)
       
   652 	{
       
   653 	  if (c == 'i')
       
   654 	    last_was_i = TRUE;
       
   655 	  else 
       
   656 	    {
       
   657 	      if (last_was_i)
       
   658 		{
       
   659 		  /* Nasty, need to remove any dot above. Though
       
   660 		   * I think only E WITH DOT ABOVE occurs in practice
       
   661 		   * which could simplify this considerably.
       
   662 		   */
       
   663 		  gsize decomp_len, i;
       
   664 		  gunichar *decomp;
       
   665 
       
   666 		  decomp = g_unicode_canonical_decomposition (c, &decomp_len);
       
   667 		  for (i=0; i < decomp_len; i++)
       
   668 		    {
       
   669 		      if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
       
   670 			len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
       
   671 		    }
       
   672 		  g_free (decomp);
       
   673 		  
       
   674 		  len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
       
   675 
       
   676 		  continue;
       
   677 		}
       
   678 
       
   679 	      if (!ISMARK (t))
       
   680 		last_was_i = FALSE;
       
   681 	    }
       
   682 	}
       
   683       
       
   684       if (locale_type == LOCALE_TURKIC && c == 'i')
       
   685 	{
       
   686 	  /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
       
   687 	  len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); 
       
   688 	}
       
   689       else if (c == 0x0345)	/* COMBINING GREEK YPOGEGRAMMENI */
       
   690 	{
       
   691 	  /* Nasty, need to move it after other combining marks .. this would go away if
       
   692 	   * we normalized first.
       
   693 	   */
       
   694 	  len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
       
   695 
       
   696 	  /* And output as GREEK CAPITAL LETTER IOTA */
       
   697 	  len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); 	  
       
   698 	}
       
   699       else if (IS (t,
       
   700 		   OR (G_UNICODE_LOWERCASE_LETTER,
       
   701 		   OR (G_UNICODE_TITLECASE_LETTER,
       
   702 		  0))))
       
   703 	{
       
   704 	  val = ATTTABLE (c >> 8, c & 0xff);
       
   705 
       
   706 	  if (val >= 0x1000000)
       
   707 	    {
       
   708 	      len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t,
       
   709 					  t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
       
   710 	    }
       
   711 	  else
       
   712 	    {
       
   713 	      if (t == G_UNICODE_TITLECASE_LETTER)
       
   714 		{
       
   715 		  unsigned int i;
       
   716 		  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   717 		    {
       
   718 		      if (title_table[i][0] == c)
       
   719 			val = title_table[i][1];
       
   720 		    }
       
   721 		}
       
   722 
       
   723 	      len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
       
   724 	    }
       
   725 	}
       
   726       else
       
   727 	{
       
   728 	  gsize char_len = g_utf8_skip[*(guchar *)last];
       
   729 
       
   730 	  if (out_buffer)
       
   731 	    memcpy (out_buffer + len, last, char_len);
       
   732 
       
   733 	  len += char_len;
       
   734 	}
       
   735 
       
   736     }
       
   737 
       
   738   return len;
       
   739 }
       
   740 
       
   741 /**
       
   742  * g_utf8_strup:
       
   743  * @str: a UTF-8 encoded string
       
   744  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
       
   745  * 
       
   746  * Converts all Unicode characters in the string that have a case
       
   747  * to uppercase. The exact manner that this is done depends
       
   748  * on the current locale, and may result in the number of
       
   749  * characters in the string increasing. (For instance, the
       
   750  * German ess-zet will be changed to SS.)
       
   751  * 
       
   752  * Return value: a newly allocated string, with all characters
       
   753  *    converted to uppercase.  
       
   754  **/
       
   755 EXPORT_C gchar *
       
   756 g_utf8_strup (const gchar *str,
       
   757 	      gssize       len)
       
   758 {
       
   759   gsize result_len;
       
   760   LocaleType locale_type;
       
   761   gchar *result;
       
   762 
       
   763   g_return_val_if_fail (str != NULL, NULL);
       
   764 
       
   765   locale_type = get_locale_type ();
       
   766   
       
   767   /*
       
   768    * We use a two pass approach to keep memory management simple
       
   769    */
       
   770   result_len = real_toupper (str, len, NULL, locale_type);
       
   771   result = g_malloc (result_len + 1);
       
   772   real_toupper (str, len, result, locale_type);
       
   773   result[result_len] = '\0';
       
   774 
       
   775   return result;
       
   776 }
       
   777 
       
   778 /* traverses the string checking for characters with combining class == 230
       
   779  * until a base character is found */
       
   780 static gboolean
       
   781 has_more_above (const gchar *str)
       
   782 {
       
   783   const gchar *p = str;
       
   784   gint combining_class;
       
   785 
       
   786   while (*p)
       
   787     {
       
   788       combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
       
   789       if (combining_class == 230)
       
   790         return TRUE;
       
   791       else if (combining_class == 0)
       
   792         break;
       
   793 
       
   794       p = g_utf8_next_char (p);
       
   795     }
       
   796 
       
   797   return FALSE;
       
   798 }
       
   799 
       
   800 static gsize
       
   801 real_tolower (const gchar *str,
       
   802 	      gssize       max_len,
       
   803 	      gchar       *out_buffer,
       
   804 	      LocaleType   locale_type)
       
   805 {
       
   806   const gchar *p = str;
       
   807   const char *last = NULL;
       
   808   gsize len = 0;
       
   809 
       
   810   while ((max_len < 0 || p < str + max_len) && *p)
       
   811     {
       
   812       gunichar c = g_utf8_get_char (p);
       
   813       int t = TYPE (c);
       
   814       gunichar val;
       
   815 
       
   816       last = p;
       
   817       p = g_utf8_next_char (p);
       
   818 
       
   819       if (locale_type == LOCALE_TURKIC && c == 'I')
       
   820 	{
       
   821           if (g_utf8_get_char (p) == 0x0307)
       
   822             {
       
   823               /* I + COMBINING DOT ABOVE => i (U+0069) */
       
   824               len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 
       
   825               p = g_utf8_next_char (p);
       
   826             }
       
   827           else
       
   828             {
       
   829               /* I => LATIN SMALL LETTER DOTLESS I */
       
   830               len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); 
       
   831             }
       
   832         }
       
   833       /* Introduce an explicit dot above when lowercasing capital I's and J's
       
   834        * whenever there are more accents above. [SpecialCasing.txt] */
       
   835       else if (locale_type == LOCALE_LITHUANIAN && 
       
   836                (c == 0x00cc || c == 0x00cd || c == 0x0128))
       
   837         {
       
   838           len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); 
       
   839           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 
       
   840 
       
   841           switch (c)
       
   842             {
       
   843             case 0x00cc: 
       
   844               len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); 
       
   845               break;
       
   846             case 0x00cd: 
       
   847               len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); 
       
   848               break;
       
   849             case 0x0128: 
       
   850               len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); 
       
   851               break;
       
   852             }
       
   853         }
       
   854       else if (locale_type == LOCALE_LITHUANIAN && 
       
   855                (c == 'I' || c == 'J' || c == 0x012e) && 
       
   856                has_more_above (p))
       
   857         {
       
   858           len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); 
       
   859           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); 
       
   860         }
       
   861       else if (c == 0x03A3)	/* GREEK CAPITAL LETTER SIGMA */
       
   862 	{
       
   863 	  if ((max_len < 0 || p < str + max_len) && *p)
       
   864 	    {
       
   865 	      gunichar next_c = g_utf8_get_char (p);
       
   866 	      int next_type = TYPE(next_c);
       
   867 
       
   868 	      /* SIGMA mapps differently depending on whether it is
       
   869 	       * final or not. The following simplified test would
       
   870 	       * fail in the case of combining marks following the
       
   871 	       * sigma, but I don't think that occurs in real text.
       
   872 	       * The test here matches that in ICU.
       
   873 	       */
       
   874 	      if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */
       
   875 		val = 0x3c3;	/* GREEK SMALL SIGMA */
       
   876 	      else
       
   877 		val = 0x3c2;	/* GREEK SMALL FINAL SIGMA */
       
   878 	    }
       
   879 	  else
       
   880 	    val = 0x3c2;	/* GREEK SMALL FINAL SIGMA */
       
   881 
       
   882 	  len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
       
   883 	}
       
   884       else if (IS (t,
       
   885 		   OR (G_UNICODE_UPPERCASE_LETTER,
       
   886 		   OR (G_UNICODE_TITLECASE_LETTER,
       
   887 		  0))))
       
   888 	{
       
   889 	  val = ATTTABLE (c >> 8, c & 0xff);
       
   890 
       
   891 	  if (val >= 0x1000000)
       
   892 	    {
       
   893 	      len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0);
       
   894 	    }
       
   895 	  else
       
   896 	    {
       
   897 	      if (t == G_UNICODE_TITLECASE_LETTER)
       
   898 		{
       
   899 		  unsigned int i;
       
   900 		  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
       
   901 		    {
       
   902 		      if (title_table[i][0] == c)
       
   903 			val = title_table[i][2];
       
   904 		    }
       
   905 		}
       
   906 
       
   907 	      len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
       
   908 	    }
       
   909 	}
       
   910       else
       
   911 	{
       
   912 	  gsize char_len = g_utf8_skip[*(guchar *)last];
       
   913 
       
   914 	  if (out_buffer)
       
   915 	    memcpy (out_buffer + len, last, char_len);
       
   916 
       
   917 	  len += char_len;
       
   918 	}
       
   919 
       
   920     }
       
   921 
       
   922   return len;
       
   923 }
       
   924 
       
   925 /**
       
   926  * g_utf8_strdown:
       
   927  * @str: a UTF-8 encoded string
       
   928  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
       
   929  * 
       
   930  * Converts all Unicode characters in the string that have a case
       
   931  * to lowercase. The exact manner that this is done depends
       
   932  * on the current locale, and may result in the number of
       
   933  * characters in the string changing.
       
   934  * 
       
   935  * Return value: a newly allocated string, with all characters
       
   936  *    converted to lowercase.  
       
   937  **/
       
   938 EXPORT_C gchar *
       
   939 g_utf8_strdown (const gchar *str,
       
   940 		gssize       len)
       
   941 {
       
   942   gsize result_len;
       
   943   LocaleType locale_type;
       
   944   gchar *result;
       
   945 
       
   946   g_return_val_if_fail (str != NULL, NULL);
       
   947 
       
   948   locale_type = get_locale_type ();
       
   949   
       
   950   /*
       
   951    * We use a two pass approach to keep memory management simple
       
   952    */
       
   953   result_len = real_tolower (str, len, NULL, locale_type);
       
   954   result = g_malloc (result_len + 1);
       
   955   real_tolower (str, len, result, locale_type);
       
   956   result[result_len] = '\0';
       
   957 
       
   958   return result;
       
   959 }
       
   960 
       
   961 /**
       
   962  * g_utf8_casefold:
       
   963  * @str: a UTF-8 encoded string
       
   964  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
       
   965  * 
       
   966  * Converts a string into a form that is independent of case. The
       
   967  * result will not correspond to any particular case, but can be
       
   968  * compared for equality or ordered with the results of calling
       
   969  * g_utf8_casefold() on other strings.
       
   970  * 
       
   971  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
       
   972  * only an approximation to the correct linguistic case insensitive
       
   973  * ordering, though it is a fairly good one. Getting this exactly
       
   974  * right would require a more sophisticated collation function that
       
   975  * takes case sensitivity into account. GLib does not currently
       
   976  * provide such a function.
       
   977  * 
       
   978  * Return value: a newly allocated string, that is a
       
   979  *   case independent form of @str.
       
   980  **/
       
   981 EXPORT_C gchar *
       
   982 g_utf8_casefold (const gchar *str,
       
   983 		 gssize       len)
       
   984 {
       
   985   GString *result;
       
   986   const char *p;
       
   987 
       
   988   g_return_val_if_fail (str != NULL, NULL);
       
   989 
       
   990   result = g_string_new (NULL);
       
   991   p = str;
       
   992   while ((len < 0 || p < str + len) && *p)
       
   993     {
       
   994       gunichar ch = g_utf8_get_char (p);
       
   995 
       
   996       int start = 0;
       
   997       int end = G_N_ELEMENTS (casefold_table);
       
   998 
       
   999       if (ch >= casefold_table[start].ch &&
       
  1000           ch <= casefold_table[end - 1].ch)
       
  1001 	{
       
  1002 	  while (TRUE)
       
  1003 	    {
       
  1004 	      int half = (start + end) / 2;
       
  1005 	      if (ch == casefold_table[half].ch)
       
  1006 		{
       
  1007 		  g_string_append (result, casefold_table[half].data);
       
  1008 		  goto next;
       
  1009 		}
       
  1010 	      else if (half == start)
       
  1011 		break;
       
  1012 	      else if (ch > casefold_table[half].ch)
       
  1013 		start = half;
       
  1014 	      else
       
  1015 		end = half;
       
  1016 	    }
       
  1017 	}
       
  1018 
       
  1019       g_string_append_unichar (result, g_unichar_tolower (ch));
       
  1020       
       
  1021     next:
       
  1022       p = g_utf8_next_char (p);
       
  1023     }
       
  1024 
       
  1025   return g_string_free (result, FALSE); 
       
  1026 }
       
  1027 
       
  1028 /**
       
  1029  * g_unichar_get_mirror_char:
       
  1030  * @ch: a Unicode character
       
  1031  * @mirrored_ch: location to store the mirrored character
       
  1032  * 
       
  1033  * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
       
  1034  * means that their images are mirrored horizontally in text that is laid
       
  1035  * out from right to left. For instance, "(" would become its mirror image,
       
  1036  * ")", in right-to-left text.
       
  1037  *
       
  1038  * If @ch has the Unicode mirrored property and there is another unicode
       
  1039  * character that typically has a glyph that is the mirror image of @ch's
       
  1040  * glyph and @mirrored_ch is set, it puts that character in the address
       
  1041  * pointed to by @mirrored_ch.  Otherwise the original character is put.
       
  1042  *
       
  1043  * Return value: %TRUE if @ch has a mirrored character, %FALSE otherwise
       
  1044  *
       
  1045  * Since: 2.4
       
  1046  **/
       
  1047 EXPORT_C gboolean
       
  1048 g_unichar_get_mirror_char (gunichar ch,
       
  1049                            gunichar *mirrored_ch)
       
  1050 {
       
  1051   gboolean found;
       
  1052   gunichar mirrored;
       
  1053 
       
  1054   mirrored = GLIB_GET_MIRRORING(ch);
       
  1055 
       
  1056   found = ch != mirrored;
       
  1057   if (mirrored_ch)
       
  1058     *mirrored_ch = mirrored;
       
  1059 
       
  1060   return found;
       
  1061 
       
  1062 }
       
  1063 
       
  1064 #define __G_UNIPROP_C__
       
  1065 #include "galiasdef.c"