glib/libglib/src/libcharset/localcharset.c
changeset 0 e4d67989cc36
equal deleted inserted replaced
-1:000000000000 0:e4d67989cc36
       
     1 /* Determine a canonical name for the current locale's character encoding.
       
     2 
       
     3    Copyright (C) 2000-2002 Free Software Foundation, Inc.
       
     4    Portions copyright (c) 2006 Nokia Corporation.  All rights reserved.
       
     5 
       
     6    This program is free software; you can redistribute it and/or modify it
       
     7    under the terms of the GNU Library General Public License as published
       
     8    by the Free Software Foundation; either version 2, or (at your option)
       
     9    any later version.
       
    10 
       
    11    This program is distributed in the hope that it will be useful,
       
    12    but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
    14    Library General Public License for more details.
       
    15 
       
    16    You should have received a copy of the GNU Library General Public
       
    17    License along with this program; if not, write to the Free Software
       
    18    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
       
    19    USA.  */
       
    20 
       
    21 /* Written by Bruno Haible <bruno@clisp.org>.  */
       
    22 
       
    23 #ifdef HAVE_CONFIG_H
       
    24 # include <config.h>
       
    25 #endif
       
    26 
       
    27 #include <glibconfig.h>
       
    28 
       
    29 #ifdef __SYMBIAN32__
       
    30 #ifndef LIBDIR
       
    31 	
       
    32 #define LIBDIR "c:\\sys\\bin" 
       
    33 
       
    34 #endif /* LIBDIR */
       
    35 #endif /* __SYMBIAN32__ */
       
    36 
       
    37 #if defined G_PLATFORM_WIN32
       
    38 /* Want to use Win32-specific code in this file also on Cygwin */
       
    39 # define _WIN32 1		
       
    40 #endif
       
    41 
       
    42 #if HAVE_STDDEF_H
       
    43 # include <stddef.h>
       
    44 #endif
       
    45 
       
    46 #include <stdio.h>
       
    47 #if HAVE_STRING_H
       
    48 # include <string.h>
       
    49 #else
       
    50 # include <strings.h>
       
    51 #endif
       
    52 #if HAVE_STDLIB_H
       
    53 # include <stdlib.h>
       
    54 #endif
       
    55 
       
    56 #if (defined _WIN32 || defined __WIN32__ ) && (!defined __SYMBIAN32__)
       
    57 # undef WIN32   /* avoid warning on mingw32 */
       
    58 # define WIN32
       
    59 #endif
       
    60 
       
    61 #if defined __EMX__
       
    62 /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
       
    63 # define OS2
       
    64 #endif
       
    65 
       
    66 #if !defined WIN32
       
    67 # if HAVE_LANGINFO_CODESET
       
    68 #  include <langinfo.h>
       
    69 # else
       
    70 #  if HAVE_SETLOCALE
       
    71 #   include <locale.h>
       
    72 #  endif
       
    73 # endif
       
    74 #elif defined WIN32
       
    75 # define WIN32_LEAN_AND_MEAN
       
    76 # include <windows.h>
       
    77 #endif
       
    78 #if defined OS2
       
    79 # define INCL_DOS
       
    80 # include <os2.h>
       
    81 #endif
       
    82 
       
    83 #ifdef __SYMBIAN32__
       
    84 #include <glib_wsd.h>
       
    85 #endif /* __SYMBIAN32__ */
       
    86 
       
    87 
       
    88 #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
       
    89   /* Win32, OS/2, DOS */
       
    90 # define ISSLASH(C) ((C) == '/' || (C) == '\\')
       
    91 #endif
       
    92 
       
    93 #ifndef DIRECTORY_SEPARATOR
       
    94 # define DIRECTORY_SEPARATOR '/'
       
    95 #endif
       
    96 
       
    97 #ifndef ISSLASH
       
    98 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
       
    99 #endif
       
   100 
       
   101 #ifdef HAVE_GETC_UNLOCKED
       
   102 # undef getc
       
   103 # define getc getc_unlocked
       
   104 #endif
       
   105 
       
   106 #ifdef __cplusplus
       
   107 /* When compiling with "gcc -x c++", produce a function with C linkage.  */
       
   108 extern "C" const char * locale_charset (void);
       
   109 #endif
       
   110 
       
   111 /* The following static variable is declared 'volatile' to avoid a
       
   112    possible multithread problem in the function get_charset_aliases. If we
       
   113    are running in a threaded environment, and if two threads initialize
       
   114    'charset_aliases' simultaneously, both will produce the same value,
       
   115    and everything will be ok if the two assignments to 'charset_aliases'
       
   116    are atomic. But I don't know what will happen if the two assignments mix.  */
       
   117 #if __STDC__ != 1
       
   118 # define volatile /* empty */
       
   119 #endif
       
   120 /* Pointer to the contents of the charset.alias file, if it has already been
       
   121    read, else NULL.  Its format is:
       
   122    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
       
   123 
       
   124 #if EMULATOR
       
   125 
       
   126 PLS(charset_aliases,localcharset,const char * volatile)
       
   127 #define charset_aliases (*FUNCTION_NAME(charset_aliases,localcharset)())
       
   128 
       
   129 #else
       
   130 
       
   131 static const char * volatile charset_aliases;
       
   132 
       
   133 #endif /* EMULATOR */
       
   134 
       
   135 
       
   136 
       
   137 /* Return a pointer to the contents of the charset.alias file.  */
       
   138 const char *
       
   139 _g_locale_get_charset_aliases (void)
       
   140 {
       
   141   const char *cp;
       
   142 
       
   143   cp = charset_aliases;
       
   144   if (cp == NULL)
       
   145     {
       
   146 #if !defined WIN32
       
   147       FILE *fp;
       
   148       const char *dir = getenv ("LIBCHARSET_ALIAS_DIR");
       
   149       const char *base = "charset.alias";
       
   150       char *file_name;
       
   151 
       
   152       if (dir == NULL)
       
   153 	dir = LIBDIR;
       
   154 
       
   155       /* Concatenate dir and base into freshly allocated file_name.  */
       
   156       {
       
   157 	size_t dir_len = strlen (dir);
       
   158 	size_t base_len = strlen (base);
       
   159 	int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
       
   160 	file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
       
   161 	if (file_name != NULL)
       
   162 	  {
       
   163 	    memcpy (file_name, dir, dir_len);
       
   164 	    if (add_slash)
       
   165 	      file_name[dir_len] = DIRECTORY_SEPARATOR;
       
   166 	    memcpy (file_name + dir_len + add_slash, base, base_len + 1);
       
   167 	  }
       
   168       }
       
   169 
       
   170       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
       
   171 	/* Out of memory or file not found, treat it as empty.  */
       
   172 	cp = "";
       
   173       else
       
   174 	{
       
   175 	  /* Parse the file's contents.  */
       
   176 	  int c;
       
   177 	  char buf1[50+1];
       
   178 	  char buf2[50+1];
       
   179 	  char *res_ptr = NULL;
       
   180 	  size_t res_size = 0;
       
   181 	  size_t l1, l2;
       
   182 
       
   183 	  for (;;)
       
   184 	    {
       
   185 	      c = getc (fp);
       
   186 	      if (c == EOF)
       
   187 		break;
       
   188 	      if (c == '\n' || c == ' ' || c == '\t')
       
   189 		continue;
       
   190 	      if (c == '#')
       
   191 		{
       
   192 		  /* Skip comment, to end of line.  */
       
   193 		  do
       
   194 		    c = getc (fp);
       
   195 		  while (!(c == EOF || c == '\n'));
       
   196 		  if (c == EOF)
       
   197 		    break;
       
   198 		  continue;
       
   199 		}
       
   200 	      ungetc (c, fp);
       
   201 	      if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
       
   202 		break;
       
   203 	      l1 = strlen (buf1);
       
   204 	      l2 = strlen (buf2);
       
   205 	      if (res_size == 0)
       
   206 		{
       
   207 		  res_size = l1 + 1 + l2 + 1;
       
   208 		  res_ptr = (char *) malloc (res_size + 1);
       
   209 		}
       
   210 	      else
       
   211 		{
       
   212 		  res_size += l1 + 1 + l2 + 1;
       
   213 		  res_ptr = (char *) realloc (res_ptr, res_size + 1);
       
   214 		}
       
   215 	      if (res_ptr == NULL)
       
   216 		{
       
   217 		  /* Out of memory. */
       
   218 		  res_size = 0;
       
   219 		  break;
       
   220 		}
       
   221 	      strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
       
   222 	      strcpy (res_ptr + res_size - (l2 + 1), buf2);
       
   223 	    }
       
   224 	  fclose (fp);
       
   225 	  if (res_size == 0)
       
   226 	    cp = "";
       
   227 	  else
       
   228 	    {
       
   229 	      *(res_ptr + res_size) = '\0';
       
   230 	      cp = res_ptr;
       
   231 	    }
       
   232 	}
       
   233 
       
   234       if (file_name != NULL)
       
   235 	free (file_name);
       
   236 
       
   237 #else
       
   238 
       
   239       /* To avoid the troubles of installing a separate file in the same
       
   240 	 directory as the DLL and of retrieving the DLL's directory at
       
   241 	 runtime, simply inline the aliases here.  */
       
   242 
       
   243 # if defined WIN32
       
   244       cp = "CP936" "\0" "GBK" "\0"
       
   245 	   "CP1361" "\0" "JOHAB" "\0"
       
   246 	   "CP20127" "\0" "ASCII" "\0"
       
   247 	   "CP20866" "\0" "KOI8-R" "\0"
       
   248 	   "CP21866" "\0" "KOI8-RU" "\0"
       
   249 	   "CP28591" "\0" "ISO-8859-1" "\0"
       
   250 	   "CP28592" "\0" "ISO-8859-2" "\0"
       
   251 	   "CP28593" "\0" "ISO-8859-3" "\0"
       
   252 	   "CP28594" "\0" "ISO-8859-4" "\0"
       
   253 	   "CP28595" "\0" "ISO-8859-5" "\0"
       
   254 	   "CP28596" "\0" "ISO-8859-6" "\0"
       
   255 	   "CP28597" "\0" "ISO-8859-7" "\0"
       
   256 	   "CP28598" "\0" "ISO-8859-8" "\0"
       
   257 	   "CP28599" "\0" "ISO-8859-9" "\0"
       
   258 	   "CP28605" "\0" "ISO-8859-15" "\0";
       
   259 # endif
       
   260 #endif
       
   261 
       
   262       charset_aliases = cp;
       
   263     }
       
   264 
       
   265   return cp;
       
   266 }
       
   267 
       
   268 /* Determine the current locale's character encoding, and canonicalize it
       
   269    into one of the canonical names listed in config.charset.
       
   270    The result must not be freed; it is statically allocated.
       
   271    If the canonical name cannot be determined, the result is a non-canonical
       
   272    name.  */
       
   273 
       
   274 const char *
       
   275 _g_locale_charset_raw (void)
       
   276 {
       
   277   const char *codeset;
       
   278 
       
   279 #if !(defined WIN32 || defined OS2)
       
   280 
       
   281 # if HAVE_LANGINFO_CODESET
       
   282 
       
   283   /* Most systems support nl_langinfo (CODESET) nowadays.  */
       
   284   codeset = nl_langinfo (CODESET);
       
   285 
       
   286 # else
       
   287 
       
   288   /* On old systems which lack it, use setlocale or getenv.  */
       
   289   const char *locale = NULL;
       
   290 
       
   291   /* But most old systems don't have a complete set of locales.  Some
       
   292      (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
       
   293      use setlocale here; it would return "C" when it doesn't support the
       
   294      locale name the user has set.  */
       
   295 #  if HAVE_SETLOCALE && 0
       
   296   locale = setlocale (LC_CTYPE, NULL);
       
   297 #  endif
       
   298   if (locale == NULL || locale[0] == '\0')
       
   299     {
       
   300       locale = getenv ("LC_ALL");
       
   301       if (locale == NULL || locale[0] == '\0')
       
   302 	{
       
   303 	  locale = getenv ("LC_CTYPE");
       
   304 	  if (locale == NULL || locale[0] == '\0')
       
   305 	    locale = getenv ("LANG");
       
   306 	}
       
   307     }
       
   308 
       
   309   /* On some old systems, one used to set locale = "iso8859_1". On others,
       
   310      you set it to "language_COUNTRY.charset". In any case, we resolve it
       
   311      through the charset.alias file.  */
       
   312   codeset = locale;
       
   313 
       
   314 # endif
       
   315 
       
   316 #elif defined WIN32
       
   317 
       
   318   static char buf[2 + 10 + 1];
       
   319 
       
   320   /* Woe32 has a function returning the locale's codepage as a number.  */
       
   321   sprintf (buf, "CP%u", GetACP ());
       
   322   codeset = buf;
       
   323 
       
   324 #elif defined OS2
       
   325 
       
   326   const char *locale;
       
   327   static char buf[2 + 10 + 1];
       
   328   ULONG cp[3];
       
   329   ULONG cplen;
       
   330 
       
   331   /* Allow user to override the codeset, as set in the operating system,
       
   332      with standard language environment variables.  */
       
   333   locale = getenv ("LC_ALL");
       
   334   if (locale == NULL || locale[0] == '\0')
       
   335     {
       
   336       locale = getenv ("LC_CTYPE");
       
   337       if (locale == NULL || locale[0] == '\0')
       
   338 	locale = getenv ("LANG");
       
   339     }
       
   340   if (locale != NULL && locale[0] != '\0')
       
   341     {
       
   342       /* If the locale name contains an encoding after the dot, return it.  */
       
   343       const char *dot = strchr (locale, '.');
       
   344 
       
   345       if (dot != NULL)
       
   346 	{
       
   347 	  const char *modifier;
       
   348 
       
   349 	  dot++;
       
   350 	  /* Look for the possible @... trailer and remove it, if any.  */
       
   351 	  modifier = strchr (dot, '@');
       
   352 	  if (modifier == NULL)
       
   353 	    return dot;
       
   354 	  if (modifier - dot < sizeof (buf))
       
   355 	    {
       
   356 	      memcpy (buf, dot, modifier - dot);
       
   357 	      buf [modifier - dot] = '\0';
       
   358 	      return buf;
       
   359 	    }
       
   360 	}
       
   361 
       
   362       /* Resolve through the charset.alias file.  */
       
   363       codeset = locale;
       
   364     }
       
   365   else
       
   366     {
       
   367       /* OS/2 has a function returning the locale's codepage as a number.  */
       
   368       if (DosQueryCp (sizeof (cp), cp, &cplen))
       
   369 	codeset = "";
       
   370       else
       
   371 	{
       
   372 	  sprintf (buf, "CP%u", cp[0]);
       
   373 	  codeset = buf;
       
   374 	}
       
   375     }
       
   376 
       
   377 #endif
       
   378 
       
   379   return codeset;
       
   380 }
       
   381 
       
   382 #ifdef STATIC
       
   383 STATIC
       
   384 #endif
       
   385 const char *
       
   386 _g_locale_charset_unalias (const char *codeset)
       
   387 {
       
   388   const char *aliases;
       
   389 
       
   390   if (codeset == NULL)
       
   391     /* The canonical name cannot be determined.  */
       
   392     codeset = "";
       
   393 
       
   394   /* Resolve alias. */
       
   395   for (aliases = _g_locale_get_charset_aliases ();
       
   396        *aliases != '\0';
       
   397        aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
       
   398     if (strcmp (codeset, aliases) == 0
       
   399 	|| (aliases[0] == '*' && aliases[1] == '\0'))
       
   400       {
       
   401 	codeset = aliases + strlen (aliases) + 1;
       
   402 	break;
       
   403       }
       
   404 
       
   405   /* Don't return an empty string.  GNU libc and GNU libiconv interpret
       
   406      the empty string as denoting "the locale's character encoding",
       
   407      thus GNU libiconv would call this function a second time.  */
       
   408   if (codeset[0] == '\0')
       
   409     codeset = "ASCII";
       
   410 
       
   411   return codeset;
       
   412 }