persistentstorage/sqlite3api/SQLite/fts3_icu.c
changeset 0 08ec8eefde2f
equal deleted inserted replaced
-1:000000000000 0:08ec8eefde2f
       
     1 /*
       
     2 ** 2007 June 22
       
     3 **
       
     4 ** The author disclaims copyright to this source code.  In place of
       
     5 ** a legal notice, here is a blessing:
       
     6 **
       
     7 **    May you do good and not evil.
       
     8 **    May you find forgiveness for yourself and forgive others.
       
     9 **    May you share freely, never taking more than you give.
       
    10 **
       
    11 *************************************************************************
       
    12 ** This file implements a tokenizer for fts3 based on the ICU library.
       
    13 ** 
       
    14 ** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $
       
    15 */
       
    16 
       
    17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
       
    18 #ifdef SQLITE_ENABLE_ICU
       
    19 
       
    20 #include <assert.h>
       
    21 #include <string.h>
       
    22 #include "fts3_tokenizer.h"
       
    23 
       
    24 #include <unicode/ubrk.h>
       
    25 #include <unicode/ucol.h>
       
    26 #include <unicode/ustring.h>
       
    27 #include <unicode/utf16.h>
       
    28 
       
    29 typedef struct IcuTokenizer IcuTokenizer;
       
    30 typedef struct IcuCursor IcuCursor;
       
    31 
       
    32 struct IcuTokenizer {
       
    33   sqlite3_tokenizer base;
       
    34   char *zLocale;
       
    35 };
       
    36 
       
    37 struct IcuCursor {
       
    38   sqlite3_tokenizer_cursor base;
       
    39 
       
    40   UBreakIterator *pIter;      /* ICU break-iterator object */
       
    41   int nChar;                  /* Number of UChar elements in pInput */
       
    42   UChar *aChar;               /* Copy of input using utf-16 encoding */
       
    43   int *aOffset;               /* Offsets of each character in utf-8 input */
       
    44 
       
    45   int nBuffer;
       
    46   char *zBuffer;
       
    47 
       
    48   int iToken;
       
    49 };
       
    50 
       
    51 /*
       
    52 ** Create a new tokenizer instance.
       
    53 */
       
    54 static int icuCreate(
       
    55   int argc,                            /* Number of entries in argv[] */
       
    56   const char * const *argv,            /* Tokenizer creation arguments */
       
    57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
       
    58 ){
       
    59   IcuTokenizer *p;
       
    60   int n = 0;
       
    61 
       
    62   if( argc>0 ){
       
    63     n = strlen(argv[0])+1;
       
    64   }
       
    65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
       
    66   if( !p ){
       
    67     return SQLITE_NOMEM;
       
    68   }
       
    69   memset(p, 0, sizeof(IcuTokenizer));
       
    70 
       
    71   if( n ){
       
    72     p->zLocale = (char *)&p[1];
       
    73     memcpy(p->zLocale, argv[0], n);
       
    74   }
       
    75 
       
    76   *ppTokenizer = (sqlite3_tokenizer *)p;
       
    77 
       
    78   return SQLITE_OK;
       
    79 }
       
    80 
       
    81 /*
       
    82 ** Destroy a tokenizer
       
    83 */
       
    84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
       
    85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
       
    86   sqlite3_free(p);
       
    87   return SQLITE_OK;
       
    88 }
       
    89 
       
    90 /*
       
    91 ** Prepare to begin tokenizing a particular string.  The input
       
    92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
       
    93 ** used to incrementally tokenize this string is returned in 
       
    94 ** *ppCursor.
       
    95 */
       
    96 static int icuOpen(
       
    97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
       
    98   const char *zInput,                    /* Input string */
       
    99   int nInput,                            /* Length of zInput in bytes */
       
   100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
       
   101 ){
       
   102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
       
   103   IcuCursor *pCsr;
       
   104 
       
   105   const int32_t opt = U_FOLD_CASE_DEFAULT;
       
   106   UErrorCode status = U_ZERO_ERROR;
       
   107   int nChar;
       
   108 
       
   109   UChar32 c;
       
   110   int iInput = 0;
       
   111   int iOut = 0;
       
   112 
       
   113   *ppCursor = 0;
       
   114 
       
   115   if( nInput<0 ){
       
   116     nInput = strlen(zInput);
       
   117   }
       
   118   nChar = nInput+1;
       
   119   pCsr = (IcuCursor *)sqlite3_malloc(
       
   120       sizeof(IcuCursor) +                /* IcuCursor */
       
   121       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
       
   122       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
       
   123   );
       
   124   if( !pCsr ){
       
   125     return SQLITE_NOMEM;
       
   126   }
       
   127   memset(pCsr, 0, sizeof(IcuCursor));
       
   128   pCsr->aChar = (UChar *)&pCsr[1];
       
   129   pCsr->aOffset = (int *)&pCsr->aChar[nChar];
       
   130 
       
   131   pCsr->aOffset[iOut] = iInput;
       
   132   U8_NEXT(zInput, iInput, nInput, c); 
       
   133   while( c>0 ){
       
   134     int isError = 0;
       
   135     c = u_foldCase(c, opt);
       
   136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
       
   137     if( isError ){
       
   138       sqlite3_free(pCsr);
       
   139       return SQLITE_ERROR;
       
   140     }
       
   141     pCsr->aOffset[iOut] = iInput;
       
   142 
       
   143     if( iInput<nInput ){
       
   144       U8_NEXT(zInput, iInput, nInput, c);
       
   145     }else{
       
   146       c = 0;
       
   147     }
       
   148   }
       
   149 
       
   150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
       
   151   if( !U_SUCCESS(status) ){
       
   152     sqlite3_free(pCsr);
       
   153     return SQLITE_ERROR;
       
   154   }
       
   155   pCsr->nChar = iOut;
       
   156 
       
   157   ubrk_first(pCsr->pIter);
       
   158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
       
   159   return SQLITE_OK;
       
   160 }
       
   161 
       
   162 /*
       
   163 ** Close a tokenization cursor previously opened by a call to icuOpen().
       
   164 */
       
   165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
       
   166   IcuCursor *pCsr = (IcuCursor *)pCursor;
       
   167   ubrk_close(pCsr->pIter);
       
   168   sqlite3_free(pCsr->zBuffer);
       
   169   sqlite3_free(pCsr);
       
   170   return SQLITE_OK;
       
   171 }
       
   172 
       
   173 /*
       
   174 ** Extract the next token from a tokenization cursor.
       
   175 */
       
   176 static int icuNext(
       
   177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
       
   178   const char **ppToken,               /* OUT: *ppToken is the token text */
       
   179   int *pnBytes,                       /* OUT: Number of bytes in token */
       
   180   int *piStartOffset,                 /* OUT: Starting offset of token */
       
   181   int *piEndOffset,                   /* OUT: Ending offset of token */
       
   182   int *piPosition                     /* OUT: Position integer of token */
       
   183 ){
       
   184   IcuCursor *pCsr = (IcuCursor *)pCursor;
       
   185 
       
   186   int iStart = 0;
       
   187   int iEnd = 0;
       
   188   int nByte = 0;
       
   189 
       
   190   while( iStart==iEnd ){
       
   191     UChar32 c;
       
   192 
       
   193     iStart = ubrk_current(pCsr->pIter);
       
   194     iEnd = ubrk_next(pCsr->pIter);
       
   195     if( iEnd==UBRK_DONE ){
       
   196       return SQLITE_DONE;
       
   197     }
       
   198 
       
   199     while( iStart<iEnd ){
       
   200       int iWhite = iStart;
       
   201       U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
       
   202       if( u_isspace(c) ){
       
   203         iStart = iWhite;
       
   204       }else{
       
   205         break;
       
   206       }
       
   207     }
       
   208     assert(iStart<=iEnd);
       
   209   }
       
   210 
       
   211   do {
       
   212     UErrorCode status = U_ZERO_ERROR;
       
   213     if( nByte ){
       
   214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
       
   215       if( !zNew ){
       
   216         return SQLITE_NOMEM;
       
   217       }
       
   218       pCsr->zBuffer = zNew;
       
   219       pCsr->nBuffer = nByte;
       
   220     }
       
   221 
       
   222     u_strToUTF8(
       
   223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
       
   224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
       
   225         &status                                  /* Output success/failure */
       
   226     );
       
   227   } while( nByte>pCsr->nBuffer );
       
   228 
       
   229   *ppToken = pCsr->zBuffer;
       
   230   *pnBytes = nByte;
       
   231   *piStartOffset = pCsr->aOffset[iStart];
       
   232   *piEndOffset = pCsr->aOffset[iEnd];
       
   233   *piPosition = pCsr->iToken++;
       
   234 
       
   235   return SQLITE_OK;
       
   236 }
       
   237 
       
   238 /*
       
   239 ** The set of routines that implement the simple tokenizer
       
   240 */
       
   241 static const sqlite3_tokenizer_module icuTokenizerModule = {
       
   242   0,                           /* iVersion */
       
   243   icuCreate,                   /* xCreate  */
       
   244   icuDestroy,                  /* xCreate  */
       
   245   icuOpen,                     /* xOpen    */
       
   246   icuClose,                    /* xClose   */
       
   247   icuNext,                     /* xNext    */
       
   248 };
       
   249 
       
   250 /*
       
   251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
       
   252 */
       
   253 void sqlite3Fts3IcuTokenizerModule(
       
   254   sqlite3_tokenizer_module const**ppModule
       
   255 ){
       
   256   *ppModule = &icuTokenizerModule;
       
   257 }
       
   258 
       
   259 #endif /* defined(SQLITE_ENABLE_ICU) */
       
   260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */