textinput/ptihangulcore/src/hanja.c
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Fri, 19 Feb 2010 23:09:27 +0200
branchRCL_3
changeset 3 f5a1e66df979
permissions -rw-r--r--
Revision: 201003 Kit: 201007

/* libhangul
 * Copyright (c) 2005,2006 Choe Hwanjin
 * All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "hangul.h"
#include "hangulinternals.h"

#ifndef TRUE
#define TRUE  1
#endif

#ifndef FALSE
#define FALSE 0
#endif

typedef struct _HanjaIndex     HanjaIndex;

typedef struct _HanjaPair      HanjaPair;
typedef struct _HanjaPairArray HanjaPairArray;

struct _Hanja {
    uint32_t key_offset;
    uint32_t value_offset;
    uint32_t comment_offset;
};

struct _HanjaList {
    char*         key;
    size_t        len;
    size_t        alloc;
    const Hanja** items; 
};

struct _HanjaIndex {
    unsigned offset;
    char     key[8];
};

struct _HanjaTable {
    HanjaIndex*    keytable;
    unsigned       nkeys;
    unsigned       key_size;
    FILE*          file;
};

struct _HanjaPair {
    ucschar first;
    ucschar second;
};

struct _HanjaPairArray {
    ucschar          key;
    const HanjaPair* pairs;
};

#include "hanjacompatible.h"

static const char utf8_skip_table[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

static 
#ifndef __SYMBIAN32__
inline 
#endif
int utf8_char_len(const char *p)
{
    return utf8_skip_table[*(const unsigned char*)p];
}

static 
#ifndef __SYMBIAN32__
inline 
#endif
const char* utf8_next(const char *str)
{
    int n = utf8_char_len(str);

    while (n > 0) {
	str++;
	if (*str == '\0')
	    return str;
	n--;
    }

    return str;
}

static 
#ifndef __SYMBIAN32__
inline 
#endif
char* utf8_prev(const char *str, const char *p)
{
    for (--p; p >= str; --p) {
	if ((*p & 0xc0) != 0x80)
	    break;
    }
    return (char*)p;
}

/* hanja searching functions */
static Hanja *
hanja_new(const char *key, const char *value, const char *comment)
{
    Hanja* hanja;
    size_t size;
    size_t keylen;
    size_t valuelen;
    size_t commentlen;
    char*  p;

    keylen = strlen(key) + 1;
    valuelen = strlen(value) + 1;
    if (comment != NULL)
	commentlen = strlen(comment) + 1;
    else
	commentlen = 1;

    size = sizeof(*hanja) + keylen + valuelen + commentlen;
    hanja = malloc(size);
    if (hanja == NULL)
	return NULL;

    p = (char*)hanja + sizeof(*hanja);
    strcpy(p, key);
    p += keylen;
    strcpy(p, value);
    p += valuelen;
    if (comment != NULL)
	strcpy(p, comment);
    else
	*p = '\0';
    p += valuelen;

    hanja->key_offset     = sizeof(*hanja);
    hanja->value_offset   = sizeof(*hanja) + keylen;
    hanja->comment_offset = sizeof(*hanja) + keylen + valuelen;

    return hanja;
}

static void
hanja_delete(Hanja* hanja)
{
    free(hanja);
}

const char*
hanja_get_key(const Hanja* hanja)
{
    if (hanja != NULL) {
	const char* p  = (const char*)hanja;
	return p + hanja->key_offset;
    }
    return NULL;
}

const char*
hanja_get_value(const Hanja* hanja)
{
    if (hanja != NULL) {
	const char* p  = (const char*)hanja;
	return p + hanja->value_offset;
    }
    return NULL;
}

const char*
hanja_get_comment(const Hanja* hanja)
{
    if (hanja != NULL) {
	const char* p  = (const char*)hanja;
	return p + hanja->comment_offset;
    }
    return NULL;
}

static HanjaList *
hanja_list_new(const char *key)
{
    HanjaList *list;

    list = malloc(sizeof(*list));
    if (list != NULL) {
	list->key = strdup(key);
	list->len = 0;
	list->alloc = 1;
	list->items = malloc(list->alloc * sizeof(list->items[0]));
	if (list->items == NULL) {
	    free(list);
	    list = NULL;
	}
    }

    return list;
}

static void
hanja_list_reserve(HanjaList* list, size_t n)
{
    size_t size = list->alloc;

    if (n > SIZE_MAX / sizeof(list->items[0]) - list->len)
	return;

    while (size < list->len + n)
	size *= 2;

    if (size > SIZE_MAX / sizeof(list->items[0]))
	return;

    if (list->alloc < list->len + n) {
	const Hanja** data;

	data = realloc(list->items, size * sizeof(list->items[0]));
	if (data != NULL) {
	    list->alloc = size;
	    list->items = data;
	}
    }
}

static void
hanja_list_append_n(HanjaList* list, const Hanja* hanja, int n)
{
    hanja_list_reserve(list, n);

    if (list->alloc >= list->len + n) {
	unsigned int i;
	for (i = 0; i < n ; i++)
	    list->items[list->len + i] = hanja + i;
	list->len += n;
    }
}

static void
hanja_table_match(const HanjaTable* table,
		  const char* key, HanjaList** list)
{
    int low, high, mid = 0;
    int res = -1;

    low = 0;
    high = table->nkeys - 1;

    while (low < high) {
	mid = (low + high) / 2;
	res = strncmp(table->keytable[mid].key, key, table->key_size);
	if (res < 0) {
	    low = mid + 1;
	} else if (res > 0) {
	    high = mid - 1;
	} else {
	    break;
	}
    }

    if (res != 0) {
	mid = low;
	res = strncmp(table->keytable[mid].key, key, table->key_size);
    }

    if (res == 0) {
	unsigned offset;
	char buf[512];

	offset = table->keytable[mid].offset;
	fseek(table->file, offset, SEEK_SET);

	while (fgets(buf, sizeof(buf), table->file) != NULL) {
	    char* save = NULL;
	    char* p = strtok_r(buf, ":", &save);
	    res = strcmp(p, key);
	    if (res == 0) {
		char* value   = strtok_r(NULL, ":", &save);
		char* comment = strtok_r(NULL, "\r\n", &save);

		Hanja* hanja = hanja_new(p, value, comment);

		if (*list == NULL) {
		    *list = hanja_list_new(key);
		}

		hanja_list_append_n(*list, hanja, 1);
	    } else if (res > 0) {
		break;
	    }
	}
    }
}

HanjaTable*
hanja_table_load(const char* filename)
{
    unsigned nkeys;
    char buf[512];
    int key_size = 5;
    char last_key[8] = { '\0', };
    char* save_ptr = NULL;
    char* key;
    long offset;
    unsigned i;
    FILE* file;
    HanjaIndex* keytable;
    HanjaTable* table;

    if (filename == NULL)
	filename = LIBHANGUL_DEFAULT_HANJA_DIC;

    file = fopen(filename, "r");
    if (file == NULL) {
	return NULL;
    }

    nkeys = 0;
    while (fgets(buf, sizeof(buf), file) != NULL) {
	/* skip comments and empty lines */
	if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0')
	    continue;

	save_ptr = NULL;
	key = strtok_r(buf, ":", &save_ptr);

	if (key == NULL || strlen(key) == 0)
	    continue;

	if (strncmp(last_key, key, key_size) != 0) {
	    nkeys++;
	    strncpy(last_key, key, key_size);
	}
    }

    rewind(file);
    keytable = malloc(nkeys * sizeof(keytable[0]));
    memset(keytable, 0, nkeys * sizeof(keytable[0]));

    i = 0;
    offset = ftell(file);
    while (fgets(buf, sizeof(buf), file) != NULL) {
	/* skip comments and empty lines */
	if (buf[0] == '#' || buf[0] == '\r' || buf[0] == '\n' || buf[0] == '\0')
	    continue;

	save_ptr = NULL;
	key = strtok_r(buf, ":", &save_ptr);

	if (key == NULL || strlen(key) == 0)
	    continue;

	if (strncmp(last_key, key, key_size) != 0) {
	    keytable[i].offset = offset;
	    strncpy(keytable[i].key, key, key_size);
	    strncpy(last_key, key, key_size);
	    i++;
	}
	offset = ftell(file);
    }

    table = malloc(sizeof(*table));
    if (table == NULL) {
	free(keytable);
	fclose(file);
	return NULL;
    }

    table->keytable = keytable;
    table->nkeys = nkeys;
    table->key_size = key_size;
    table->file = file;

    return table;
}

void
hanja_table_delete(HanjaTable *table)
{
    if (table != NULL) {
	free(table->keytable);
	fclose(table->file);
	free(table);
    }
}

HanjaList*
hanja_table_match_exact(const HanjaTable* table, const char *key)
{
    HanjaList* ret = NULL;

    if (key == NULL || key[0] == '\0' || table == NULL)
	return NULL;

    hanja_table_match(table, key, &ret);

    return ret;
}

HanjaList*
hanja_table_match_prefix(const HanjaTable* table, const char *key)
{
    char* p;
    char* newkey;
    HanjaList* ret = NULL;

    if (key == NULL || key[0] == '\0' || table == NULL)
	return NULL;

    newkey = strdup(key);
    if (newkey == NULL)
	return NULL;

    p = strchr(newkey, '\0');
    while (newkey[0] != '\0') {
	hanja_table_match(table, newkey, &ret);
	p = utf8_prev(newkey, p);
	p[0] = '\0';
    }
    free(newkey);

    return ret;
}

HanjaList*
hanja_table_match_suffix(const HanjaTable* table, const char *key)
{
    const char* p;
    HanjaList* ret = NULL;

    if (key == NULL || key[0] == '\0' || table == NULL)
	return NULL;

    p = key;
    while (p[0] != '\0') {
	hanja_table_match(table, p, &ret);
	p = utf8_next(p);
    }

    return ret;
}

int
hanja_list_get_size(const HanjaList *list)
{
    if (list != NULL)
	return list->len;
    return 0;
}

const char*
hanja_list_get_key(const HanjaList *list)
{
    if (list != NULL)
	return list->key;
    return NULL;
}

const Hanja*
hanja_list_get_nth(const HanjaList *list, unsigned int n)
{
    if (list != NULL) {
	if (n < list->len)
	    return list->items[n];
    }
    return NULL;
}

const char*
hanja_list_get_nth_key(const HanjaList *list, unsigned int n)
{
    const Hanja* hanja = hanja_list_get_nth(list, n);
    return hanja_get_key(hanja);
}

const char*
hanja_list_get_nth_value(const HanjaList *list, unsigned int n)
{
    const Hanja* hanja = hanja_list_get_nth(list, n);
    return hanja_get_value(hanja);
}

const char*
hanja_list_get_nth_comment(const HanjaList *list, unsigned int n)
{
    const Hanja* hanja = hanja_list_get_nth(list, n);
    return hanja_get_comment(hanja);
}

void
hanja_list_delete(HanjaList *list)
{
    if (list) {
	size_t i;
	for (i = 0; i < list->len; i++) {
	    hanja_delete((Hanja*)list->items[i]);
	}
	free(list->items);
	free(list->key);
	free(list);
    }
}

static int
compare_pair(const void* a, const void* b)
{
    const ucschar*   c = a;
    const HanjaPair* y = b;

    return *c - y->first;
}

size_t
hanja_compatibility_form(ucschar* hanja, const ucschar* hangul, size_t n)
{
    size_t i;
    size_t nconverted;

    if (hangul == NULL || hanja == NULL)
	return 0;

    nconverted = 0;
    for (i = 0; i < n && hangul[i] != 0 && hanja[i] != 0; i++) {
	HanjaPairArray* p;

	p = bsearch(&hanja[i],
		    hanja_unified_to_compat_table,
		    N_ELEMENTS(hanja_unified_to_compat_table),
		    sizeof(hanja_unified_to_compat_table[0]),
		    compare_pair);
	if (p != NULL) {
	    const HanjaPair* pair = p->pairs;
	    while (pair->first != 0) {
		if (pair->first == hangul[i]) {
		    hanja[i] = pair->second;
		    nconverted++;
		    break;
		}
		pair++;
	    }
	}
    }

    return nconverted;
}

size_t
hanja_unified_form(ucschar* str, size_t n)
{
    size_t i;
    size_t nconverted;

    if (str == NULL)
	return 0;

    nconverted = 0;
    for (i = 0; i < n && str[i] != 0; i++) {
	if (str[i] >= 0xF900 && str[i] <= 0xFA0B) {
	    str[i] = hanja_compat_to_unified_table[str[i] - 0xF900];
	    nconverted++;
	}
    }

    return nconverted;
}