textinput/ptihangulcore/src/hanja.c
branchRCL_3
changeset 3 f5a1e66df979
equal deleted inserted replaced
0:eb1f2e154e89 3:f5a1e66df979
       
     1 /* libhangul
       
     2  * Copyright (c) 2005,2006 Choe Hwanjin
       
     3  * All rights reserved.
       
     4  *
       
     5  * This library is free software; you can redistribute it and/or
       
     6  * modify it under the terms of the GNU Lesser General Public
       
     7  * License as published by the Free Software Foundation; either
       
     8  * version 2.1 of the License, or (at your option) any later version.
       
     9  *
       
    10  * This library is distributed in the hope that it will be useful,
       
    11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
    13  * Lesser General Public License for more details.
       
    14  *
       
    15  * You should have received a copy of the GNU Lesser General Public
       
    16  * License along with this library; if not, write to the Free Software
       
    17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
       
    18  */
       
    19 
       
    20 #ifdef HAVE_CONFIG_H
       
    21 #include <config.h>
       
    22 #endif
       
    23 
       
    24 #include <sys/types.h>
       
    25 #include <sys/stat.h>
       
    26 #include <unistd.h>
       
    27 
       
    28 #ifdef HAVE_MMAP
       
    29 #include <sys/mman.h>
       
    30 #endif
       
    31 
       
    32 #include <limits.h>
       
    33 #include <stdio.h>
       
    34 #include <stdlib.h>
       
    35 #include <string.h>
       
    36 
       
    37 #include "hangul.h"
       
    38 #include "hangulinternals.h"
       
    39 
       
    40 #ifndef TRUE
       
    41 #define TRUE  1
       
    42 #endif
       
    43 
       
    44 #ifndef FALSE
       
    45 #define FALSE 0
       
    46 #endif
       
    47 
       
    48 typedef struct _HanjaIndex     HanjaIndex;
       
    49 
       
    50 typedef struct _HanjaPair      HanjaPair;
       
    51 typedef struct _HanjaPairArray HanjaPairArray;
       
    52 
       
    53 struct _Hanja {
       
    54     uint32_t key_offset;
       
    55     uint32_t value_offset;
       
    56     uint32_t comment_offset;
       
    57 };
       
    58 
       
    59 struct _HanjaList {
       
    60     char*         key;
       
    61     size_t        len;
       
    62     size_t        alloc;
       
    63     const Hanja** items; 
       
    64 };
       
    65 
       
    66 struct _HanjaIndex {
       
    67     unsigned offset;
       
    68     char     key[8];
       
    69 };
       
    70 
       
    71 struct _HanjaTable {
       
    72     HanjaIndex*    keytable;
       
    73     unsigned       nkeys;
       
    74     unsigned       key_size;
       
    75     FILE*          file;
       
    76 };
       
    77 
       
    78 struct _HanjaPair {
       
    79     ucschar first;
       
    80     ucschar second;
       
    81 };
       
    82 
       
    83 struct _HanjaPairArray {
       
    84     ucschar          key;
       
    85     const HanjaPair* pairs;
       
    86 };
       
    87 
       
    88 #include "hanjacompatible.h"
       
    89 
       
    90 static const char utf8_skip_table[256] = {
       
    91     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    92     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    93     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    94     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    95     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    96     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
       
    97     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
       
    98     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
       
    99 };
       
   100 
       
   101 static 
       
   102 #ifndef __SYMBIAN32__
       
   103 inline 
       
   104 #endif
       
   105 int utf8_char_len(const char *p)
       
   106 {
       
   107     return utf8_skip_table[*(const unsigned char*)p];
       
   108 }
       
   109 
       
   110 static 
       
   111 #ifndef __SYMBIAN32__
       
   112 inline 
       
   113 #endif
       
   114 const char* utf8_next(const char *str)
       
   115 {
       
   116     int n = utf8_char_len(str);
       
   117 
       
   118     while (n > 0) {
       
   119 	str++;
       
   120 	if (*str == '\0')
       
   121 	    return str;
       
   122 	n--;
       
   123     }
       
   124 
       
   125     return str;
       
   126 }
       
   127 
       
   128 static 
       
   129 #ifndef __SYMBIAN32__
       
   130 inline 
       
   131 #endif
       
   132 char* utf8_prev(const char *str, const char *p)
       
   133 {
       
   134     for (--p; p >= str; --p) {
       
   135 	if ((*p & 0xc0) != 0x80)
       
   136 	    break;
       
   137     }
       
   138     return (char*)p;
       
   139 }
       
   140 
       
   141 /* hanja searching functions */
       
   142 static Hanja *
       
   143 hanja_new(const char *key, const char *value, const char *comment)
       
   144 {
       
   145     Hanja* hanja;
       
   146     size_t size;
       
   147     size_t keylen;
       
   148     size_t valuelen;
       
   149     size_t commentlen;
       
   150     char*  p;
       
   151 
       
   152     keylen = strlen(key) + 1;
       
   153     valuelen = strlen(value) + 1;
       
   154     if (comment != NULL)
       
   155 	commentlen = strlen(comment) + 1;
       
   156     else
       
   157 	commentlen = 1;
       
   158 
       
   159     size = sizeof(*hanja) + keylen + valuelen + commentlen;
       
   160     hanja = malloc(size);
       
   161     if (hanja == NULL)
       
   162 	return NULL;
       
   163 
       
   164     p = (char*)hanja + sizeof(*hanja);
       
   165     strcpy(p, key);
       
   166     p += keylen;
       
   167     strcpy(p, value);
       
   168     p += valuelen;
       
   169     if (comment != NULL)
       
   170 	strcpy(p, comment);
       
   171     else
       
   172 	*p = '\0';
       
   173     p += valuelen;
       
   174 
       
   175     hanja->key_offset     = sizeof(*hanja);
       
   176     hanja->value_offset   = sizeof(*hanja) + keylen;
       
   177     hanja->comment_offset = sizeof(*hanja) + keylen + valuelen;
       
   178 
       
   179     return hanja;
       
   180 }
       
   181 
       
   182 static void
       
   183 hanja_delete(Hanja* hanja)
       
   184 {
       
   185     free(hanja);
       
   186 }
       
   187 
       
   188 const char*
       
   189 hanja_get_key(const Hanja* hanja)
       
   190 {
       
   191     if (hanja != NULL) {
       
   192 	const char* p  = (const char*)hanja;
       
   193 	return p + hanja->key_offset;
       
   194     }
       
   195     return NULL;
       
   196 }
       
   197 
       
   198 const char*
       
   199 hanja_get_value(const Hanja* hanja)
       
   200 {
       
   201     if (hanja != NULL) {
       
   202 	const char* p  = (const char*)hanja;
       
   203 	return p + hanja->value_offset;
       
   204     }
       
   205     return NULL;
       
   206 }
       
   207 
       
   208 const char*
       
   209 hanja_get_comment(const Hanja* hanja)
       
   210 {
       
   211     if (hanja != NULL) {
       
   212 	const char* p  = (const char*)hanja;
       
   213 	return p + hanja->comment_offset;
       
   214     }
       
   215     return NULL;
       
   216 }
       
   217 
       
   218 static HanjaList *
       
   219 hanja_list_new(const char *key)
       
   220 {
       
   221     HanjaList *list;
       
   222 
       
   223     list = malloc(sizeof(*list));
       
   224     if (list != NULL) {
       
   225 	list->key = strdup(key);
       
   226 	list->len = 0;
       
   227 	list->alloc = 1;
       
   228 	list->items = malloc(list->alloc * sizeof(list->items[0]));
       
   229 	if (list->items == NULL) {
       
   230 	    free(list);
       
   231 	    list = NULL;
       
   232 	}
       
   233     }
       
   234 
       
   235     return list;
       
   236 }
       
   237 
       
   238 static void
       
   239 hanja_list_reserve(HanjaList* list, size_t n)
       
   240 {
       
   241     size_t size = list->alloc;
       
   242 
       
   243     if (n > SIZE_MAX / sizeof(list->items[0]) - list->len)
       
   244 	return;
       
   245 
       
   246     while (size < list->len + n)
       
   247 	size *= 2;
       
   248 
       
   249     if (size > SIZE_MAX / sizeof(list->items[0]))
       
   250 	return;
       
   251 
       
   252     if (list->alloc < list->len + n) {
       
   253 	const Hanja** data;
       
   254 
       
   255 	data = realloc(list->items, size * sizeof(list->items[0]));
       
   256 	if (data != NULL) {
       
   257 	    list->alloc = size;
       
   258 	    list->items = data;
       
   259 	}
       
   260     }
       
   261 }
       
   262 
       
   263 static void
       
   264 hanja_list_append_n(HanjaList* list, const Hanja* hanja, int n)
       
   265 {
       
   266     hanja_list_reserve(list, n);
       
   267 
       
   268     if (list->alloc >= list->len + n) {
       
   269 	unsigned int i;
       
   270 	for (i = 0; i < n ; i++)
       
   271 	    list->items[list->len + i] = hanja + i;
       
   272 	list->len += n;
       
   273     }
       
   274 }
       
   275 
       
   276 static void
       
   277 hanja_table_match(const HanjaTable* table,
       
   278 		  const char* key, HanjaList** list)
       
   279 {
       
   280     int low, high, mid = 0;
       
   281     int res = -1;
       
   282 
       
   283     low = 0;
       
   284     high = table->nkeys - 1;
       
   285 
       
   286     while (low < high) {
       
   287 	mid = (low + high) / 2;
       
   288 	res = strncmp(table->keytable[mid].key, key, table->key_size);
       
   289 	if (res < 0) {
       
   290 	    low = mid + 1;
       
   291 	} else if (res > 0) {
       
   292 	    high = mid - 1;
       
   293 	} else {
       
   294 	    break;
       
   295 	}
       
   296     }
       
   297 
       
   298     if (res != 0) {
       
   299 	mid = low;
       
   300 	res = strncmp(table->keytable[mid].key, key, table->key_size);
       
   301     }
       
   302 
       
   303     if (res == 0) {
       
   304 	unsigned offset;
       
   305 	char buf[512];
       
   306 
       
   307 	offset = table->keytable[mid].offset;
       
   308 	fseek(table->file, offset, SEEK_SET);
       
   309 
       
   310 	while (fgets(buf, sizeof(buf), table->file) != NULL) {
       
   311 	    char* save = NULL;
       
   312 	    char* p = strtok_r(buf, ":", &save);
       
   313 	    res = strcmp(p, key);
       
   314 	    if (res == 0) {
       
   315 		char* value   = strtok_r(NULL, ":", &save);
       
   316 		char* comment = strtok_r(NULL, "\r\n", &save);
       
   317 
       
   318 		Hanja* hanja = hanja_new(p, value, comment);
       
   319 
       
   320 		if (*list == NULL) {
       
   321 		    *list = hanja_list_new(key);
       
   322 		}
       
   323 
       
   324 		hanja_list_append_n(*list, hanja, 1);
       
   325 	    } else if (res > 0) {
       
   326 		break;
       
   327 	    }
       
   328 	}
       
   329     }
       
   330 }
       
   331 
       
   332 HanjaTable*
       
   333 hanja_table_load(const char* filename)
       
   334 {
       
   335     unsigned nkeys;
       
   336     char buf[512];
       
   337     int key_size = 5;
       
   338     char last_key[8] = { '\0', };
       
   339     char* save_ptr = NULL;
       
   340     char* key;
       
   341     long offset;
       
   342     unsigned i;
       
   343     FILE* file;
       
   344     HanjaIndex* keytable;
       
   345     HanjaTable* table;
       
   346 
       
   347     if (filename == NULL)
       
   348 	filename = LIBHANGUL_DEFAULT_HANJA_DIC;
       
   349 
       
   350     file = fopen(filename, "r");
       
   351     if (file == NULL) {
       
   352 	return NULL;
       
   353     }
       
   354 
       
   355     nkeys = 0;
       
   356     while (fgets(buf, sizeof(buf), file) != NULL) {
       
   357 	/* skip comments and empty lines */
       
   358 	if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0')
       
   359 	    continue;
       
   360 
       
   361 	save_ptr = NULL;
       
   362 	key = strtok_r(buf, ":", &save_ptr);
       
   363 
       
   364 	if (key == NULL || strlen(key) == 0)
       
   365 	    continue;
       
   366 
       
   367 	if (strncmp(last_key, key, key_size) != 0) {
       
   368 	    nkeys++;
       
   369 	    strncpy(last_key, key, key_size);
       
   370 	}
       
   371     }
       
   372 
       
   373     rewind(file);
       
   374     keytable = malloc(nkeys * sizeof(keytable[0]));
       
   375     memset(keytable, 0, nkeys * sizeof(keytable[0]));
       
   376 
       
   377     i = 0;
       
   378     offset = ftell(file);
       
   379     while (fgets(buf, sizeof(buf), file) != NULL) {
       
   380 	/* skip comments and empty lines */
       
   381 	if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0')
       
   382 	    continue;
       
   383 
       
   384 	save_ptr = NULL;
       
   385 	key = strtok_r(buf, ":", &save_ptr);
       
   386 
       
   387 	if (key == NULL || strlen(key) == 0)
       
   388 	    continue;
       
   389 
       
   390 	if (strncmp(last_key, key, key_size) != 0) {
       
   391 	    keytable[i].offset = offset;
       
   392 	    strncpy(keytable[i].key, key, key_size);
       
   393 	    strncpy(last_key, key, key_size);
       
   394 	    i++;
       
   395 	}
       
   396 	offset = ftell(file);
       
   397     }
       
   398 
       
   399     table = malloc(sizeof(*table));
       
   400     if (table == NULL) {
       
   401 	free(keytable);
       
   402 	fclose(file);
       
   403 	return NULL;
       
   404     }
       
   405 
       
   406     table->keytable = keytable;
       
   407     table->nkeys = nkeys;
       
   408     table->key_size = key_size;
       
   409     table->file = file;
       
   410 
       
   411     return table;
       
   412 }
       
   413 
       
   414 void
       
   415 hanja_table_delete(HanjaTable *table)
       
   416 {
       
   417     if (table != NULL) {
       
   418 	free(table->keytable);
       
   419 	fclose(table->file);
       
   420 	free(table);
       
   421     }
       
   422 }
       
   423 
       
   424 HanjaList*
       
   425 hanja_table_match_exact(const HanjaTable* table, const char *key)
       
   426 {
       
   427     HanjaList* ret = NULL;
       
   428 
       
   429     if (key == NULL || key[0] == '\0' || table == NULL)
       
   430 	return NULL;
       
   431 
       
   432     hanja_table_match(table, key, &ret);
       
   433 
       
   434     return ret;
       
   435 }
       
   436 
       
   437 HanjaList*
       
   438 hanja_table_match_prefix(const HanjaTable* table, const char *key)
       
   439 {
       
   440     char* p;
       
   441     char* newkey;
       
   442     HanjaList* ret = NULL;
       
   443 
       
   444     if (key == NULL || key[0] == '\0' || table == NULL)
       
   445 	return NULL;
       
   446 
       
   447     newkey = strdup(key);
       
   448     if (newkey == NULL)
       
   449 	return NULL;
       
   450 
       
   451     p = strchr(newkey, '\0');
       
   452     while (newkey[0] != '\0') {
       
   453 	hanja_table_match(table, newkey, &ret);
       
   454 	p = utf8_prev(newkey, p);
       
   455 	p[0] = '\0';
       
   456     }
       
   457     free(newkey);
       
   458 
       
   459     return ret;
       
   460 }
       
   461 
       
   462 HanjaList*
       
   463 hanja_table_match_suffix(const HanjaTable* table, const char *key)
       
   464 {
       
   465     const char* p;
       
   466     HanjaList* ret = NULL;
       
   467 
       
   468     if (key == NULL || key[0] == '\0' || table == NULL)
       
   469 	return NULL;
       
   470 
       
   471     p = key;
       
   472     while (p[0] != '\0') {
       
   473 	hanja_table_match(table, p, &ret);
       
   474 	p = utf8_next(p);
       
   475     }
       
   476 
       
   477     return ret;
       
   478 }
       
   479 
       
   480 int
       
   481 hanja_list_get_size(const HanjaList *list)
       
   482 {
       
   483     if (list != NULL)
       
   484 	return list->len;
       
   485     return 0;
       
   486 }
       
   487 
       
   488 const char*
       
   489 hanja_list_get_key(const HanjaList *list)
       
   490 {
       
   491     if (list != NULL)
       
   492 	return list->key;
       
   493     return NULL;
       
   494 }
       
   495 
       
   496 const Hanja*
       
   497 hanja_list_get_nth(const HanjaList *list, unsigned int n)
       
   498 {
       
   499     if (list != NULL) {
       
   500 	if (n < list->len)
       
   501 	    return list->items[n];
       
   502     }
       
   503     return NULL;
       
   504 }
       
   505 
       
   506 const char*
       
   507 hanja_list_get_nth_key(const HanjaList *list, unsigned int n)
       
   508 {
       
   509     const Hanja* hanja = hanja_list_get_nth(list, n);
       
   510     return hanja_get_key(hanja);
       
   511 }
       
   512 
       
   513 const char*
       
   514 hanja_list_get_nth_value(const HanjaList *list, unsigned int n)
       
   515 {
       
   516     const Hanja* hanja = hanja_list_get_nth(list, n);
       
   517     return hanja_get_value(hanja);
       
   518 }
       
   519 
       
   520 const char*
       
   521 hanja_list_get_nth_comment(const HanjaList *list, unsigned int n)
       
   522 {
       
   523     const Hanja* hanja = hanja_list_get_nth(list, n);
       
   524     return hanja_get_comment(hanja);
       
   525 }
       
   526 
       
   527 void
       
   528 hanja_list_delete(HanjaList *list)
       
   529 {
       
   530     if (list) {
       
   531 	size_t i;
       
   532 	for (i = 0; i < list->len; i++) {
       
   533 	    hanja_delete((Hanja*)list->items[i]);
       
   534 	}
       
   535 	free(list->items);
       
   536 	free(list->key);
       
   537 	free(list);
       
   538     }
       
   539 }
       
   540 
       
   541 static int
       
   542 compare_pair(const void* a, const void* b)
       
   543 {
       
   544     const ucschar*   c = a;
       
   545     const HanjaPair* y = b;
       
   546 
       
   547     return *c - y->first;
       
   548 }
       
   549 
       
   550 size_t
       
   551 hanja_compatibility_form(ucschar* hanja, const ucschar* hangul, size_t n)
       
   552 {
       
   553     size_t i;
       
   554     size_t nconverted;
       
   555 
       
   556     if (hangul == NULL || hanja == NULL)
       
   557 	return 0;
       
   558 
       
   559     nconverted = 0;
       
   560     for (i = 0; i < n && hangul[i] != 0 && hanja[i] != 0; i++) {
       
   561 	HanjaPairArray* p;
       
   562 
       
   563 	p = bsearch(&hanja[i],
       
   564 		    hanja_unified_to_compat_table,
       
   565 		    N_ELEMENTS(hanja_unified_to_compat_table),
       
   566 		    sizeof(hanja_unified_to_compat_table[0]),
       
   567 		    compare_pair);
       
   568 	if (p != NULL) {
       
   569 	    const HanjaPair* pair = p->pairs;
       
   570 	    while (pair->first != 0) {
       
   571 		if (pair->first == hangul[i]) {
       
   572 		    hanja[i] = pair->second;
       
   573 		    nconverted++;
       
   574 		    break;
       
   575 		}
       
   576 		pair++;
       
   577 	    }
       
   578 	}
       
   579     }
       
   580 
       
   581     return nconverted;
       
   582 }
       
   583 
       
   584 size_t
       
   585 hanja_unified_form(ucschar* str, size_t n)
       
   586 {
       
   587     size_t i;
       
   588     size_t nconverted;
       
   589 
       
   590     if (str == NULL)
       
   591 	return 0;
       
   592 
       
   593     nconverted = 0;
       
   594     for (i = 0; i < n && str[i] != 0; i++) {
       
   595 	if (str[i] >= 0xF900 && str[i] <= 0xFA0B) {
       
   596 	    str[i] = hanja_compat_to_unified_table[str[i] - 0xF900];
       
   597 	    nconverted++;
       
   598 	}
       
   599     }
       
   600 
       
   601     return nconverted;
       
   602 }
       
   603