glib/libglib/src/gunicollate.c
changeset 0 e4d67989cc36
equal deleted inserted replaced
-1:000000000000 0:e4d67989cc36
       
     1 /* gunicollate.c - Collation
       
     2  *
       
     3  *  Copyright 2001,2005 Red Hat, Inc.
       
     4  * Portions copyright (c) 2006 Nokia Corporation.  All rights reserved.
       
     5  *
       
     6  * The Gnome Library is free software; you can redistribute it and/or
       
     7  * modify it under the terms of the GNU Lesser General Public License as
       
     8  * published by the Free Software Foundation; either version 2 of the
       
     9  * License, or (at your option) any later version.
       
    10  *
       
    11  * The Gnome Library is distributed in the hope that it will be useful,
       
    12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
    14  * Lesser General Public License for more details.
       
    15  *
       
    16  * You should have received a copy of the GNU Lesser General Public
       
    17  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
       
    18  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
       
    19  *   Boston, MA 02111-1307, USA.
       
    20  */
       
    21 
       
    22 #include "config.h"
       
    23 
       
    24 #include <locale.h>
       
    25 #include <string.h>
       
    26 #ifdef __STDC_ISO_10646__
       
    27 #include <wchar.h>
       
    28 #endif
       
    29 
       
    30 #include "glib.h"
       
    31 #include "gunicodeprivate.h"
       
    32 #include "galias.h"
       
    33 
       
    34 /**
       
    35  * g_utf8_collate:
       
    36  * @str1: a UTF-8 encoded string
       
    37  * @str2: a UTF-8 encoded string
       
    38  * 
       
    39  * Compares two strings for ordering using the linguistically
       
    40  * correct rules for the current locale. When sorting a large
       
    41  * number of strings, it will be significantly faster to
       
    42  * obtain collation keys with g_utf8_collate_key() and 
       
    43  * compare the keys with strcmp() when 
       
    44  * sorting instead of sorting the original strings.
       
    45  * 
       
    46  * Return value: &lt; 0 if @str1 compares before @str2, 
       
    47  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
       
    48  **/
       
    49 EXPORT_C gint
       
    50 g_utf8_collate (const gchar *str1,
       
    51 		const gchar *str2)
       
    52 {
       
    53   gint result;
       
    54   
       
    55 #ifdef __STDC_ISO_10646__
       
    56 
       
    57   gunichar *str1_norm;
       
    58   gunichar *str2_norm;
       
    59 
       
    60   g_return_val_if_fail (str1 != NULL, 0);
       
    61   g_return_val_if_fail (str2 != NULL, 0);
       
    62 
       
    63   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
       
    64   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
       
    65 
       
    66   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
       
    67 
       
    68   g_free (str1_norm);
       
    69   g_free (str2_norm);
       
    70 
       
    71 #else /* !__STDC_ISO_10646__ */
       
    72 
       
    73   const gchar *charset;
       
    74   gchar *str1_norm;
       
    75   gchar *str2_norm;
       
    76 
       
    77   g_return_val_if_fail (str1 != NULL, 0);
       
    78   g_return_val_if_fail (str2 != NULL, 0);
       
    79 
       
    80   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
       
    81   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
       
    82 
       
    83   if (g_get_charset (&charset))
       
    84     {
       
    85       result = strcoll (str1_norm, str2_norm);
       
    86     }
       
    87   else
       
    88     {
       
    89 #ifndef __SYMBIAN32__    
       
    90       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
       
    91       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
       
    92 #else
       
    93       gchar *str1_locale = g_convert (str1_norm, -1, "UTF-8", charset, NULL, NULL, NULL);
       
    94       gchar *str2_locale = g_convert (str2_norm, -1, "UTF-8", charset, NULL, NULL, NULL);
       
    95 #endif	//!__SYMBIAN32__  
       
    96 
       
    97       if (str1_locale && str2_locale)
       
    98 	result =  strcoll (str1_locale, str2_locale);
       
    99       else if (str1_locale)
       
   100 	result = -1;
       
   101       else if (str2_locale)
       
   102 	result = 1;
       
   103       else
       
   104 	result = strcmp (str1_norm, str2_norm);
       
   105 
       
   106       g_free (str1_locale);
       
   107       g_free (str2_locale);
       
   108     }
       
   109 
       
   110   g_free (str1_norm);
       
   111   g_free (str2_norm);
       
   112 
       
   113 #endif /* __STDC_ISO_10646__ */
       
   114 
       
   115   return result;
       
   116 }
       
   117 
       
   118 #ifdef __STDC_ISO_10646__
       
   119 /* We need UTF-8 encoding of numbers to encode the weights if
       
   120  * we are using wcsxfrm. However, we aren't encoding Unicode
       
   121  * characters, so we can't simply use g_unichar_to_utf8.
       
   122  *
       
   123  * The following routine is taken (with modification) from GNU
       
   124  * libc's strxfrm routine:
       
   125  *
       
   126  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
       
   127  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
       
   128  */
       
   129 static inline int
       
   130 utf8_encode (char *buf, wchar_t val)
       
   131 {
       
   132   int retval;
       
   133 
       
   134   if (val < 0x80)
       
   135     {
       
   136       if (buf)
       
   137 	*buf++ = (char) val;
       
   138       retval = 1;
       
   139     }
       
   140   else
       
   141     {
       
   142       int step;
       
   143 
       
   144       for (step = 2; step < 6; ++step)
       
   145         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
       
   146           break;
       
   147       retval = step;
       
   148 
       
   149       if (buf)
       
   150 	{
       
   151 	  *buf = (unsigned char) (~0xff >> step);
       
   152 	  --step;
       
   153 	  do
       
   154 	    {
       
   155 	      buf[step] = 0x80 | (val & 0x3f);
       
   156 	      val >>= 6;
       
   157 	    }
       
   158 	  while (--step > 0);
       
   159 	  *buf |= val;
       
   160 	}
       
   161     }
       
   162 
       
   163   return retval;
       
   164 }
       
   165 #endif /* __STDC_ISO_10646__ */
       
   166 
       
   167 /**
       
   168  * g_utf8_collate_key:
       
   169  * @str: a UTF-8 encoded string.
       
   170  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
       
   171  *
       
   172  * Converts a string into a collation key that can be compared
       
   173  * with other collation keys produced by the same function using 
       
   174  * strcmp(). 
       
   175  * The results of comparing the collation keys of two strings 
       
   176  * with strcmp() will always be the same as 
       
   177  * comparing the two original keys with g_utf8_collate().
       
   178  * 
       
   179  * Return value: a newly allocated string. This string should
       
   180  *   be freed with g_free() when you are done with it.
       
   181  **/
       
   182 EXPORT_C gchar *
       
   183 g_utf8_collate_key (const gchar *str,
       
   184 		    gssize       len)
       
   185 {
       
   186   gchar *result;
       
   187   size_t xfrm_len = 0;
       
   188   
       
   189 #ifdef __STDC_ISO_10646__
       
   190 
       
   191   gunichar *str_norm;
       
   192   wchar_t *result_wc;
       
   193   size_t i;
       
   194   size_t result_len = 0;
       
   195 
       
   196   g_return_val_if_fail (str != NULL, NULL);
       
   197 
       
   198   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
       
   199 
       
   200   setlocale (LC_COLLATE, "");
       
   201 
       
   202   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
       
   203   result_wc = g_new (wchar_t, xfrm_len + 1);
       
   204   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
       
   205 
       
   206   for (i=0; i < xfrm_len; i++)
       
   207     result_len += utf8_encode (NULL, result_wc[i]);
       
   208   result = g_malloc (result_len + 1);
       
   209   result_len = 0;
       
   210   for (i=0; i < xfrm_len; i++)
       
   211     result_len += utf8_encode (result + result_len, result_wc[i]);
       
   212 
       
   213   result[result_len] = '\0';
       
   214 
       
   215   g_free (result_wc);
       
   216   g_free (str_norm);
       
   217 
       
   218   return result;
       
   219 #else /* !__STDC_ISO_10646__ */
       
   220 
       
   221   const gchar *charset;
       
   222   gchar *str_norm;
       
   223 
       
   224   g_return_val_if_fail (str != NULL, NULL);
       
   225 
       
   226   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
       
   227 
       
   228   if (g_get_charset (&charset))
       
   229     {
       
   230       xfrm_len = strxfrm (NULL, str_norm, 0);
       
   231       result = g_malloc (xfrm_len + 1);
       
   232       strxfrm (result, str_norm, xfrm_len + 1);
       
   233     }
       
   234   else
       
   235     {
       
   236       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
       
   237 
       
   238       if (str_locale)
       
   239 	{
       
   240 	  xfrm_len = strxfrm (NULL, str_locale, 0);
       
   241 	  if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
       
   242 	    {
       
   243 	      g_free (str_locale);
       
   244 	      str_locale = NULL;
       
   245 	    }
       
   246 	}
       
   247       if (str_locale)
       
   248 	{
       
   249 	  result = g_malloc (xfrm_len + 2);
       
   250 	  result[0] = 'A';
       
   251 	  strxfrm (result + 1, str_locale, xfrm_len + 1);
       
   252 #ifdef __SYMBIAN32__ 		  
       
   253 		  result[xfrm_len + 1] = '\0';
       
   254 #endif //__SYMBIAN32__
       
   255 	  g_free (str_locale);
       
   256 	}
       
   257       else
       
   258 	{
       
   259 	  xfrm_len = strlen (str_norm);
       
   260 	  result = g_malloc (xfrm_len + 2);
       
   261 	  result[0] = 'B';
       
   262 	  memcpy (result + 1, str_norm, xfrm_len);
       
   263 	  result[xfrm_len+1] = '\0';
       
   264 	}
       
   265     }
       
   266 
       
   267   g_free (str_norm);
       
   268 #endif /* __STDC_ISO_10646__ */
       
   269 
       
   270   return result;
       
   271 }
       
   272 
       
   273 /* This is a collation key that is very very likely to sort before any
       
   274    collation key that libc strxfrm generates. We use this before any
       
   275    special case (dot or number) to make sure that its sorted before
       
   276    anything else.
       
   277  */
       
   278 #define COLLATION_SENTINEL "\1\1\1"
       
   279 
       
   280 /**
       
   281  * g_utf8_collate_key_for_filename:
       
   282  * @str: a UTF-8 encoded string.
       
   283  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
       
   284  *
       
   285  * Converts a string into a collation key that can be compared
       
   286  * with other collation keys produced by the same function using strcmp(). 
       
   287  * 
       
   288  * In order to sort filenames correctly, this function treats the dot '.' 
       
   289  * as a special case. Most dictionary orderings seem to consider it
       
   290  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
       
   291  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
       
   292  * would like to treat numbers intelligently so that "file1" "file10" "file5"
       
   293  * is sorted as "file1" "file5" "file10".
       
   294  *
       
   295  * Return value: a newly allocated string. This string should
       
   296  *   be freed with g_free() when you are done with it.
       
   297  *
       
   298  * Since: 2.8
       
   299  */
       
   300 EXPORT_C gchar*
       
   301 g_utf8_collate_key_for_filename (const gchar *str,
       
   302 				 gssize       len)
       
   303 {
       
   304   GString *result;
       
   305   GString *append;
       
   306   const gchar *p;
       
   307   const gchar *prev;
       
   308   const gchar *end;
       
   309   gchar *collate_key;
       
   310   gint digits;
       
   311   gint leading_zeros;
       
   312 
       
   313   /*
       
   314    * How it works:
       
   315    *
       
   316    * Split the filename into collatable substrings which do
       
   317    * not contain [.0-9] and special-cased substrings. The collatable 
       
   318    * substrings are run through the normal g_utf8_collate_key() and the 
       
   319    * resulting keys are concatenated with keys generated from the 
       
   320    * special-cased substrings.
       
   321    *
       
   322    * Special cases: Dots are handled by replacing them with '\1' which 
       
   323    * implies that short dot-delimited substrings are before long ones, 
       
   324    * e.g.
       
   325    * 
       
   326    *   a\1a   (a.a)
       
   327    *   a-\1a  (a-.a)
       
   328    *   aa\1a  (aa.a)
       
   329    * 
       
   330    * Numbers are handled by prepending to each number d-1 superdigits 
       
   331    * where d = number of digits in the number and SUPERDIGIT is a 
       
   332    * character with an integer value higher than any digit (for instance 
       
   333    * ':'). This ensures that single-digit numbers are sorted before 
       
   334    * double-digit numbers which in turn are sorted separately from 
       
   335    * triple-digit numbers, etc. To avoid strange side-effects when 
       
   336    * sorting strings that already contain SUPERDIGITs, a '\2'
       
   337    * is also prepended, like this
       
   338    *
       
   339    *   file\21      (file1)
       
   340    *   file\25      (file5)
       
   341    *   file\2:10    (file10)
       
   342    *   file\2:26    (file26)
       
   343    *   file\2::100  (file100)
       
   344    *   file:foo     (file:foo)
       
   345    * 
       
   346    * This has the side-effect of sorting numbers before everything else (except
       
   347    * dots), but this is probably OK.
       
   348    *
       
   349    * Leading digits are ignored when doing the above. To discriminate
       
   350    * numbers which differ only in the number of leading digits, we append
       
   351    * the number of leading digits as a byte at the very end of the collation
       
   352    * key.
       
   353    *
       
   354    * To try avoid conflict with any collation key sequence generated by libc we
       
   355    * start each switch to a special cased part with a sentinel that hopefully
       
   356    * will sort before anything libc will generate.
       
   357    */
       
   358 
       
   359   if (len < 0)
       
   360     len = strlen (str);
       
   361 
       
   362   result = g_string_sized_new (len * 2);
       
   363   append = g_string_sized_new (0);
       
   364 
       
   365   end = str + len;
       
   366 
       
   367   /* No need to use utf8 functions, since we're only looking for ascii chars */
       
   368   for (prev = p = str; p < end; p++)
       
   369     {
       
   370       switch (*p)
       
   371 	{
       
   372 	case '.':
       
   373 	  if (prev != p) 
       
   374 	    {
       
   375 	      collate_key = g_utf8_collate_key (prev, p - prev);
       
   376 	      g_string_append (result, collate_key);
       
   377 	      g_free (collate_key);
       
   378 	    }
       
   379 	  
       
   380 	  g_string_append (result, COLLATION_SENTINEL "\1");
       
   381 	  
       
   382 	  /* skip the dot */
       
   383 	  prev = p + 1;
       
   384 	  break;
       
   385 	  
       
   386 	case '0':
       
   387 	case '1':
       
   388 	case '2':
       
   389 	case '3':
       
   390 	case '4':
       
   391 	case '5':
       
   392 	case '6':
       
   393 	case '7':
       
   394 	case '8':
       
   395 	case '9':
       
   396 	  if (prev != p) 
       
   397 	    {
       
   398 	      collate_key = g_utf8_collate_key (prev, p - prev);
       
   399 	      g_string_append (result, collate_key);
       
   400 	      g_free (collate_key);
       
   401 	    }
       
   402 	  
       
   403 	  g_string_append (result, COLLATION_SENTINEL "\2");
       
   404 	  
       
   405 	  prev = p;
       
   406 	  
       
   407 	  /* write d-1 colons */
       
   408 	  if (*p == '0')
       
   409 	    {
       
   410 	      leading_zeros = 1;
       
   411 	      digits = 0;
       
   412 	    }
       
   413 	  else
       
   414 	    {
       
   415 	      leading_zeros = 0;
       
   416 	      digits = 1;
       
   417 	    }
       
   418 	  
       
   419 	  while (++p < end)
       
   420 	    {
       
   421 	      if (*p == '0' && !digits)
       
   422 		++leading_zeros;
       
   423 	      else if (g_ascii_isdigit(*p))
       
   424 		++digits;
       
   425 	      else
       
   426                 {
       
   427  		  /* count an all-zero sequence as
       
   428                    * one digit plus leading zeros
       
   429                    */
       
   430           	  if (!digits)
       
   431                     {
       
   432                       ++digits;
       
   433                       --leading_zeros;
       
   434                     }        
       
   435 		  break;
       
   436                 }
       
   437 	    }
       
   438 
       
   439 	  while (digits > 1)
       
   440 	    {
       
   441 	      g_string_append_c (result, ':');
       
   442 	      --digits;
       
   443 	    }
       
   444 
       
   445 	  if (leading_zeros > 0)
       
   446 	    {
       
   447 	      g_string_append_c (append, (char)leading_zeros);
       
   448 	      prev += leading_zeros;
       
   449 	    }
       
   450 	  
       
   451 	  /* write the number itself */
       
   452 	  g_string_append_len (result, prev, p - prev);
       
   453 	  
       
   454 	  prev = p;
       
   455 	  --p;	  /* go one step back to avoid disturbing outer loop */
       
   456 	  break;
       
   457 	  
       
   458 	default:
       
   459 	  /* other characters just accumulate */
       
   460 	  break;
       
   461 	}
       
   462     }
       
   463   
       
   464   if (prev != p) 
       
   465     {
       
   466       collate_key = g_utf8_collate_key (prev, p - prev);
       
   467       g_string_append (result, collate_key);
       
   468       g_free (collate_key);
       
   469     }
       
   470   
       
   471   g_string_append (result, append->str);
       
   472   g_string_free (append, TRUE);
       
   473 
       
   474   return g_string_free (result, FALSE);
       
   475 }
       
   476 
       
   477 
       
   478 #define __G_UNICOLLATE_C__
       
   479 #include "galiasdef.c"