glib/tests/utf8-validate.c
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /* GLIB - Library of useful routines for C programming
       
     2  * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
       
     3  * Portion Copyright © 2008-09 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
       
     4  * This library is free software; you can redistribute it and/or
       
     5  * modify it under the terms of the GNU Lesser General Public
       
     6  * License as published by the Free Software Foundation; either
       
     7  * version 2 of the License, or (at your option) any later version.
       
     8  *
       
     9  * This library is distributed in the hope that it will be useful,
       
    10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
    12  * Lesser General Public License for more details.
       
    13  *
       
    14  * You should have received a copy of the GNU Lesser General Public
       
    15  * License along with this library; if not, write to the
       
    16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
       
    17  * Boston, MA 02111-1307, USA.
       
    18  */
       
    19 
       
    20 #include "glib.h"
       
    21 #ifdef __SYMBIAN32__
       
    22 #include "mrt2_glib2_test.h"
       
    23 #endif /*__SYMBIAN32__*/
       
    24 
       
    25 #define UNICODE_VALID(Char)                   \
       
    26     ((Char) < 0x110000 &&                     \
       
    27      (((Char) & 0xFFFFF800) != 0xD800) &&     \
       
    28      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
       
    29      ((Char) & 0xFFFE) != 0xFFFE)
       
    30 
       
    31 
       
    32 
       
/* Set to TRUE by do_test() as soon as one case does not match its
 * expectation; main() derives the process exit status from it. */
static gboolean any_failed = FALSE;
       
    34 
       
    35 struct {
       
    36   const gchar *text;
       
    37   gint max_len;
       
    38   gint offset;
       
    39   gboolean valid;
       
    40 } test[] = {  
       
    41   /* some tests to check max_len handling */
       
    42   /* length 1 */
       
    43   { "abcde", -1, 5, TRUE },
       
    44   { "abcde", 3, 3, TRUE },
       
    45   { "abcde", 5, 5, TRUE },
       
    46   { "abcde", 7, 5, FALSE },
       
    47   /* length 2 */
       
    48   { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE }, 
       
    49   { "\xc2\xa9\xc2\xa9\xc2\xa9",  1, 0, FALSE }, 
       
    50   { "\xc2\xa9\xc2\xa9\xc2\xa9",  2, 2, TRUE }, 
       
    51   { "\xc2\xa9\xc2\xa9\xc2\xa9",  3, 2, FALSE }, 
       
    52   { "\xc2\xa9\xc2\xa9\xc2\xa9",  4, 4, TRUE }, 
       
    53   { "\xc2\xa9\xc2\xa9\xc2\xa9",  5, 4, FALSE }, 
       
    54   { "\xc2\xa9\xc2\xa9\xc2\xa9",  6, 6, TRUE }, 
       
    55   { "\xc2\xa9\xc2\xa9\xc2\xa9",  7, 6, FALSE }, 
       
    56   /* length 3 */
       
    57   { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
       
    58   { "\xe2\x89\xa0\xe2\x89\xa0",  1, 0, FALSE },
       
    59   { "\xe2\x89\xa0\xe2\x89\xa0",  2, 0, FALSE },
       
    60   { "\xe2\x89\xa0\xe2\x89\xa0",  3, 3, TRUE },
       
    61   { "\xe2\x89\xa0\xe2\x89\xa0",  4, 3, FALSE },
       
    62   { "\xe2\x89\xa0\xe2\x89\xa0",  5, 3, FALSE },
       
    63   { "\xe2\x89\xa0\xe2\x89\xa0",  6, 6, TRUE },
       
    64   { "\xe2\x89\xa0\xe2\x89\xa0",  7, 6, FALSE },
       
    65 
       
    66   /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
       
    67   /* greek 'kosme' */
       
    68   { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
       
    69   /* first sequence of each length */
       
    70   { "\x00", -1, 0, TRUE },
       
    71   { "\xc2\x80", -1, 2, TRUE },
       
    72   { "\xe0\xa0\x80", -1, 3, TRUE },
       
    73   { "\xf0\x90\x80\x80", -1, 4, TRUE },
       
    74   { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
       
    75   { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
       
    76   /* last sequence of each length */
       
    77   { "\x7f", -1, 1, TRUE },
       
    78   { "\xdf\xbf", -1, 2, TRUE },
       
    79   { "\xef\xbf\xbf", -1, 0, FALSE },
       
    80   { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
       
    81   { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
       
    82   { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
       
    83   /* other boundary conditions */
       
    84   { "\xed\x9f\xbf", -1, 3, TRUE },
       
    85   { "\xee\x80\x80", -1, 3, TRUE },
       
    86   { "\xef\xbf\xbd", -1, 3, TRUE },
       
    87   { "\xf4\x8f\xbf\xbf", -1, 0, FALSE },
       
    88   { "\xf4\x90\x80\x80", -1, 0, FALSE },
       
    89   /* malformed sequences */
       
    90   /* continuation bytes */
       
    91   { "\x80", -1, 0, FALSE },
       
    92   { "\xbf", -1, 0, FALSE },
       
    93   { "\x80\xbf", -1, 0, FALSE },
       
    94   { "\x80\xbf\x80", -1, 0, FALSE },
       
    95   { "\x80\xbf\x80\xbf", -1, 0, FALSE },
       
    96   { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
       
    97   { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
       
    98   { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
       
    99 
       
   100   /* all possible continuation byte */
       
   101   { "\x80", -1, 0, FALSE },
       
   102   { "\x81", -1, 0, FALSE },
       
   103   { "\x82", -1, 0, FALSE },
       
   104   { "\x83", -1, 0, FALSE },
       
   105   { "\x84", -1, 0, FALSE },
       
   106   { "\x85", -1, 0, FALSE },
       
   107   { "\x86", -1, 0, FALSE },
       
   108   { "\x87", -1, 0, FALSE },
       
   109   { "\x88", -1, 0, FALSE },
       
   110   { "\x89", -1, 0, FALSE },
       
   111   { "\x8a", -1, 0, FALSE },
       
   112   { "\x8b", -1, 0, FALSE },
       
   113   { "\x8c", -1, 0, FALSE },
       
   114   { "\x8d", -1, 0, FALSE },
       
   115   { "\x8e", -1, 0, FALSE },
       
   116   { "\x8f", -1, 0, FALSE },
       
   117   { "\x90", -1, 0, FALSE },
       
   118   { "\x91", -1, 0, FALSE },
       
   119   { "\x92", -1, 0, FALSE },
       
   120   { "\x93", -1, 0, FALSE },
       
   121   { "\x94", -1, 0, FALSE },
       
   122   { "\x95", -1, 0, FALSE },
       
   123   { "\x96", -1, 0, FALSE },
       
   124   { "\x97", -1, 0, FALSE },
       
   125   { "\x98", -1, 0, FALSE },
       
   126   { "\x99", -1, 0, FALSE },
       
   127   { "\x9a", -1, 0, FALSE },
       
   128   { "\x9b", -1, 0, FALSE },
       
   129   { "\x9c", -1, 0, FALSE },
       
   130   { "\x9d", -1, 0, FALSE },
       
   131   { "\x9e", -1, 0, FALSE },
       
   132   { "\x9f", -1, 0, FALSE },
       
   133   { "\xa0", -1, 0, FALSE },
       
   134   { "\xa1", -1, 0, FALSE },
       
   135   { "\xa2", -1, 0, FALSE },
       
   136   { "\xa3", -1, 0, FALSE },
       
   137   { "\xa4", -1, 0, FALSE },
       
   138   { "\xa5", -1, 0, FALSE },
       
   139   { "\xa6", -1, 0, FALSE },
       
   140   { "\xa7", -1, 0, FALSE },
       
   141   { "\xa8", -1, 0, FALSE },
       
   142   { "\xa9", -1, 0, FALSE },
       
   143   { "\xaa", -1, 0, FALSE },
       
   144   { "\xab", -1, 0, FALSE },
       
   145   { "\xac", -1, 0, FALSE },
       
   146   { "\xad", -1, 0, FALSE },
       
   147   { "\xae", -1, 0, FALSE },
       
   148   { "\xaf", -1, 0, FALSE },
       
   149   { "\xb0", -1, 0, FALSE },
       
   150   { "\xb1", -1, 0, FALSE },
       
   151   { "\xb2", -1, 0, FALSE },
       
   152   { "\xb3", -1, 0, FALSE },
       
   153   { "\xb4", -1, 0, FALSE },
       
   154   { "\xb5", -1, 0, FALSE },
       
   155   { "\xb6", -1, 0, FALSE },
       
   156   { "\xb7", -1, 0, FALSE },
       
   157   { "\xb8", -1, 0, FALSE },
       
   158   { "\xb9", -1, 0, FALSE },
       
   159   { "\xba", -1, 0, FALSE },
       
   160   { "\xbb", -1, 0, FALSE },
       
   161   { "\xbc", -1, 0, FALSE },
       
   162   { "\xbd", -1, 0, FALSE },
       
   163   { "\xbe", -1, 0, FALSE },
       
   164   { "\xbf", -1, 0, FALSE },
       
   165   /* lone start characters */
       
   166   { "\xc0\x20", -1, 0, FALSE },
       
   167   { "\xc1\x20", -1, 0, FALSE },
       
   168   { "\xc2\x20", -1, 0, FALSE },
       
   169   { "\xc3\x20", -1, 0, FALSE },
       
   170   { "\xc4\x20", -1, 0, FALSE },
       
   171   { "\xc5\x20", -1, 0, FALSE },
       
   172   { "\xc6\x20", -1, 0, FALSE },
       
   173   { "\xc7\x20", -1, 0, FALSE },
       
   174   { "\xc8\x20", -1, 0, FALSE },
       
   175   { "\xc9\x20", -1, 0, FALSE },
       
   176   { "\xca\x20", -1, 0, FALSE },
       
   177   { "\xcb\x20", -1, 0, FALSE },
       
   178   { "\xcc\x20", -1, 0, FALSE },
       
   179   { "\xcd\x20", -1, 0, FALSE },
       
   180   { "\xce\x20", -1, 0, FALSE },
       
   181   { "\xcf\x20", -1, 0, FALSE },
       
   182   { "\xd0\x20", -1, 0, FALSE },
       
   183   { "\xd1\x20", -1, 0, FALSE },
       
   184   { "\xd2\x20", -1, 0, FALSE },
       
   185   { "\xd3\x20", -1, 0, FALSE },
       
   186   { "\xd4\x20", -1, 0, FALSE },
       
   187   { "\xd5\x20", -1, 0, FALSE },
       
   188   { "\xd6\x20", -1, 0, FALSE },
       
   189   { "\xd7\x20", -1, 0, FALSE },
       
   190   { "\xd8\x20", -1, 0, FALSE },
       
   191   { "\xd9\x20", -1, 0, FALSE },
       
   192   { "\xda\x20", -1, 0, FALSE },
       
   193   { "\xdb\x20", -1, 0, FALSE },
       
   194   { "\xdc\x20", -1, 0, FALSE },
       
   195   { "\xdd\x20", -1, 0, FALSE },
       
   196   { "\xde\x20", -1, 0, FALSE },
       
   197   { "\xdf\x20", -1, 0, FALSE },
       
   198   { "\xe0\x20", -1, 0, FALSE },
       
   199   { "\xe1\x20", -1, 0, FALSE },
       
   200   { "\xe2\x20", -1, 0, FALSE },
       
   201   { "\xe3\x20", -1, 0, FALSE },
       
   202   { "\xe4\x20", -1, 0, FALSE },
       
   203   { "\xe5\x20", -1, 0, FALSE },
       
   204   { "\xe6\x20", -1, 0, FALSE },
       
   205   { "\xe7\x20", -1, 0, FALSE },
       
   206   { "\xe8\x20", -1, 0, FALSE },
       
   207   { "\xe9\x20", -1, 0, FALSE },
       
   208   { "\xea\x20", -1, 0, FALSE },
       
   209   { "\xeb\x20", -1, 0, FALSE },
       
   210   { "\xec\x20", -1, 0, FALSE },
       
   211   { "\xed\x20", -1, 0, FALSE },
       
   212   { "\xee\x20", -1, 0, FALSE },
       
   213   { "\xef\x20", -1, 0, FALSE },
       
   214   { "\xf0\x20", -1, 0, FALSE },
       
   215   { "\xf1\x20", -1, 0, FALSE },
       
   216   { "\xf2\x20", -1, 0, FALSE },
       
   217   { "\xf3\x20", -1, 0, FALSE },
       
   218   { "\xf4\x20", -1, 0, FALSE },
       
   219   { "\xf5\x20", -1, 0, FALSE },
       
   220   { "\xf6\x20", -1, 0, FALSE },
       
   221   { "\xf7\x20", -1, 0, FALSE },
       
   222   { "\xf8\x20", -1, 0, FALSE },
       
   223   { "\xf9\x20", -1, 0, FALSE },
       
   224   { "\xfa\x20", -1, 0, FALSE },
       
   225   { "\xfb\x20", -1, 0, FALSE },
       
   226   { "\xfc\x20", -1, 0, FALSE },
       
   227   { "\xfd\x20", -1, 0, FALSE },
       
   228   /* missing continuation bytes */
       
   229   { "\x20\xc0", -1, 1, FALSE },
       
   230   { "\x20\xe0\x80", -1, 1, FALSE },
       
   231   { "\x20\xf0\x80\x80", -1, 1, FALSE },
       
   232   { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
       
   233   { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
       
   234   { "\x20\xdf", -1, 1, FALSE },
       
   235   { "\x20\xef\xbf", -1, 1, FALSE },
       
   236   { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
       
   237   { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
       
   238   { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
       
   239   /* impossible bytes */
       
   240   { "\x20\xfe\x20", -1, 1, FALSE },
       
   241   { "\x20\xff\x20", -1, 1, FALSE },
       
   242   /* overlong sequences */
       
   243   { "\x20\xc0\xaf\x20", -1, 1, FALSE },
       
   244   { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
       
   245   { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
       
   246   { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
       
   247   { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
       
   248   { "\x20\xc1\xbf\x20", -1, 1, FALSE },
       
   249   { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
       
   250   { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
       
   251   { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
       
   252   { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
       
   253   { "\x20\xc0\x80\x20", -1, 1, FALSE },
       
   254   { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
       
   255   { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
       
   256   { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
       
   257   { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
       
   258   /* illegal code positions */
       
   259   { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
       
   260   { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
       
   261   { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
       
   262   { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
       
   263   { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
       
   264   { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
       
   265   { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
       
   266   { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
       
   267   { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
       
   268   { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
       
   269   { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
       
   270   { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
       
   271   { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
       
   272   { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
       
   273   { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
       
   274   { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
       
   275   { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
       
   276 
       
   277   { NULL, }
       
   278 };
       
   279 
       
   280 static void 
       
   281 do_test (gint         index,
       
   282 	 const gchar *text, 
       
   283 	 gint         max_len,
       
   284 	 gint         offset,
       
   285 	 gboolean     valid)
       
   286 {
       
   287   const gchar *end;
       
   288   gboolean result;
       
   289   
       
   290   result = g_utf8_validate (text, max_len, &end);
       
   291 
       
   292   if (result != valid || end - text != offset)
       
   293     {
       
   294       GString *str;
       
   295       const gchar *p;
       
   296 
       
   297       any_failed = TRUE;
       
   298       
       
   299       str = g_string_new (0);
       
   300       for (p = text; *p; p++)
       
   301 	g_string_append_printf (str, "\\x%02hhx", *p);
       
   302       g_print ("%d: g_utf8_validate (\"%s\", %d) failed, "
       
   303 	       "expected %s %d, got %s %d\n",
       
   304 	       index,
       
   305 	       str->str, max_len, 
       
   306 	       valid ? "TRUE" : "FALSE", offset,
       
   307 	       result ? "TRUE" : "FALSE", (gint) (end - text));
       
   308       g_string_free (str, FALSE);
       
   309     }
       
   310 }
       
   311 
       
   312 int
       
   313 main (int argc, char *argv[])
       
   314 {
       
   315   gint i;
       
   316 
       
   317   #ifdef __SYMBIAN32__
       
   318   g_log_set_handler (NULL,  G_LOG_FLAG_FATAL| G_LOG_FLAG_RECURSION | G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING | G_LOG_LEVEL_MESSAGE | G_LOG_LEVEL_INFO | G_LOG_LEVEL_DEBUG, &mrtLogHandler, NULL);
       
   319   g_set_print_handler(mrtPrintHandler);
       
   320   #endif /*__SYMBIAN32__*/
       
   321 	  
       
   322 
       
   323   for (i = 0; test[i].text; i++)
       
   324     do_test (i, test[i].text, test[i].max_len, 
       
   325 	     test[i].offset, test[i].valid);
       
   326 
       
   327   #ifdef __SYMBIAN32__
       
   328   assert_failed = any_failed;
       
   329   testResultXml("utf8-validate");
       
   330   #endif /* EMULATOR */
       
   331   
       
   332   return any_failed ? 1 : 0;
       
   333 }