|
1 /* |
|
2 ** 2007 June 22 |
|
3 ** |
|
4 ** The author disclaims copyright to this source code. In place of |
|
5 ** a legal notice, here is a blessing: |
|
6 ** |
|
7 ** May you do good and not evil. |
|
8 ** May you find forgiveness for yourself and forgive others. |
|
9 ** May you share freely, never taking more than you give. |
|
10 ** |
|
11 ************************************************************************* |
|
12 ** This file implements a tokenizer for fts2 based on the ICU library. |
|
13 ** |
|
14 ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $ |
|
15 */ |
|
16 |
|
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) |
|
18 #ifdef SQLITE_ENABLE_ICU |
|
19 |
|
20 #include <assert.h> |
|
21 #include <string.h> |
|
22 #include "fts2_tokenizer.h" |
|
23 |
|
24 #include <unicode/ubrk.h> |
|
25 #include <unicode/ucol.h> |
|
26 #include <unicode/ustring.h> |
|
27 #include <unicode/utf16.h> |
|
28 |
|
29 typedef struct IcuTokenizer IcuTokenizer; |
|
30 typedef struct IcuCursor IcuCursor; |
|
31 |
|
32 struct IcuTokenizer { |
|
33 sqlite3_tokenizer base; |
|
34 char *zLocale; |
|
35 }; |
|
36 |
|
37 struct IcuCursor { |
|
38 sqlite3_tokenizer_cursor base; |
|
39 |
|
40 UBreakIterator *pIter; /* ICU break-iterator object */ |
|
41 int nChar; /* Number of UChar elements in pInput */ |
|
42 UChar *aChar; /* Copy of input using utf-16 encoding */ |
|
43 int *aOffset; /* Offsets of each character in utf-8 input */ |
|
44 |
|
45 int nBuffer; |
|
46 char *zBuffer; |
|
47 |
|
48 int iToken; |
|
49 }; |
|
50 |
|
51 /* |
|
52 ** Create a new tokenizer instance. |
|
53 */ |
|
54 static int icuCreate( |
|
55 int argc, /* Number of entries in argv[] */ |
|
56 const char * const *argv, /* Tokenizer creation arguments */ |
|
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ |
|
58 ){ |
|
59 IcuTokenizer *p; |
|
60 int n = 0; |
|
61 |
|
62 if( argc>0 ){ |
|
63 n = strlen(argv[0])+1; |
|
64 } |
|
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); |
|
66 if( !p ){ |
|
67 return SQLITE_NOMEM; |
|
68 } |
|
69 memset(p, 0, sizeof(IcuTokenizer)); |
|
70 |
|
71 if( n ){ |
|
72 p->zLocale = (char *)&p[1]; |
|
73 memcpy(p->zLocale, argv[0], n); |
|
74 } |
|
75 |
|
76 *ppTokenizer = (sqlite3_tokenizer *)p; |
|
77 |
|
78 return SQLITE_OK; |
|
79 } |
|
80 |
|
81 /* |
|
82 ** Destroy a tokenizer |
|
83 */ |
|
84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |
|
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
|
86 sqlite3_free(p); |
|
87 return SQLITE_OK; |
|
88 } |
|
89 |
|
90 /* |
|
91 ** Prepare to begin tokenizing a particular string. The input |
|
92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor |
|
93 ** used to incrementally tokenize this string is returned in |
|
94 ** *ppCursor. |
|
95 */ |
|
96 static int icuOpen( |
|
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
|
98 const char *zInput, /* Input string */ |
|
99 int nInput, /* Length of zInput in bytes */ |
|
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
|
101 ){ |
|
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
|
103 IcuCursor *pCsr; |
|
104 |
|
105 const int32_t opt = U_FOLD_CASE_DEFAULT; |
|
106 UErrorCode status = U_ZERO_ERROR; |
|
107 int nChar; |
|
108 |
|
109 UChar32 c; |
|
110 int iInput = 0; |
|
111 int iOut = 0; |
|
112 |
|
113 *ppCursor = 0; |
|
114 |
|
115 if( -1 == nInput ) nInput = strlen(nInput); |
|
116 nChar = nInput+1; |
|
117 pCsr = (IcuCursor *)sqlite3_malloc( |
|
118 sizeof(IcuCursor) + /* IcuCursor */ |
|
119 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ |
|
120 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ |
|
121 ); |
|
122 if( !pCsr ){ |
|
123 return SQLITE_NOMEM; |
|
124 } |
|
125 memset(pCsr, 0, sizeof(IcuCursor)); |
|
126 pCsr->aChar = (UChar *)&pCsr[1]; |
|
127 pCsr->aOffset = (int *)&pCsr->aChar[nChar]; |
|
128 |
|
129 pCsr->aOffset[iOut] = iInput; |
|
130 U8_NEXT(zInput, iInput, nInput, c); |
|
131 while( c>0 ){ |
|
132 int isError = 0; |
|
133 c = u_foldCase(c, opt); |
|
134 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |
|
135 if( isError ){ |
|
136 sqlite3_free(pCsr); |
|
137 return SQLITE_ERROR; |
|
138 } |
|
139 pCsr->aOffset[iOut] = iInput; |
|
140 |
|
141 if( iInput<nInput ){ |
|
142 U8_NEXT(zInput, iInput, nInput, c); |
|
143 }else{ |
|
144 c = 0; |
|
145 } |
|
146 } |
|
147 |
|
148 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |
|
149 if( !U_SUCCESS(status) ){ |
|
150 sqlite3_free(pCsr); |
|
151 return SQLITE_ERROR; |
|
152 } |
|
153 pCsr->nChar = iOut; |
|
154 |
|
155 ubrk_first(pCsr->pIter); |
|
156 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |
|
157 return SQLITE_OK; |
|
158 } |
|
159 |
|
160 /* |
|
161 ** Close a tokenization cursor previously opened by a call to icuOpen(). |
|
162 */ |
|
163 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |
|
164 IcuCursor *pCsr = (IcuCursor *)pCursor; |
|
165 ubrk_close(pCsr->pIter); |
|
166 sqlite3_free(pCsr->zBuffer); |
|
167 sqlite3_free(pCsr); |
|
168 return SQLITE_OK; |
|
169 } |
|
170 |
|
171 /* |
|
172 ** Extract the next token from a tokenization cursor. |
|
173 */ |
|
174 static int icuNext( |
|
175 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ |
|
176 const char **ppToken, /* OUT: *ppToken is the token text */ |
|
177 int *pnBytes, /* OUT: Number of bytes in token */ |
|
178 int *piStartOffset, /* OUT: Starting offset of token */ |
|
179 int *piEndOffset, /* OUT: Ending offset of token */ |
|
180 int *piPosition /* OUT: Position integer of token */ |
|
181 ){ |
|
182 IcuCursor *pCsr = (IcuCursor *)pCursor; |
|
183 |
|
184 int iStart = 0; |
|
185 int iEnd = 0; |
|
186 int nByte = 0; |
|
187 |
|
188 while( iStart==iEnd ){ |
|
189 UChar32 c; |
|
190 |
|
191 iStart = ubrk_current(pCsr->pIter); |
|
192 iEnd = ubrk_next(pCsr->pIter); |
|
193 if( iEnd==UBRK_DONE ){ |
|
194 return SQLITE_DONE; |
|
195 } |
|
196 |
|
197 while( iStart<iEnd ){ |
|
198 int iWhite = iStart; |
|
199 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |
|
200 if( u_isspace(c) ){ |
|
201 iStart = iWhite; |
|
202 }else{ |
|
203 break; |
|
204 } |
|
205 } |
|
206 assert(iStart<=iEnd); |
|
207 } |
|
208 |
|
209 do { |
|
210 UErrorCode status = U_ZERO_ERROR; |
|
211 if( nByte ){ |
|
212 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |
|
213 if( !zNew ){ |
|
214 return SQLITE_NOMEM; |
|
215 } |
|
216 pCsr->zBuffer = zNew; |
|
217 pCsr->nBuffer = nByte; |
|
218 } |
|
219 |
|
220 u_strToUTF8( |
|
221 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ |
|
222 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ |
|
223 &status /* Output success/failure */ |
|
224 ); |
|
225 } while( nByte>pCsr->nBuffer ); |
|
226 |
|
227 *ppToken = pCsr->zBuffer; |
|
228 *pnBytes = nByte; |
|
229 *piStartOffset = pCsr->aOffset[iStart]; |
|
230 *piEndOffset = pCsr->aOffset[iEnd]; |
|
231 *piPosition = pCsr->iToken++; |
|
232 |
|
233 return SQLITE_OK; |
|
234 } |
|
235 |
|
236 /* |
|
237 ** The set of routines that implement the simple tokenizer |
|
238 */ |
|
239 static const sqlite3_tokenizer_module icuTokenizerModule = { |
|
240 0, /* iVersion */ |
|
241 icuCreate, /* xCreate */ |
|
242 icuDestroy, /* xCreate */ |
|
243 icuOpen, /* xOpen */ |
|
244 icuClose, /* xClose */ |
|
245 icuNext, /* xNext */ |
|
246 }; |
|
247 |
|
248 /* |
|
249 ** Set *ppModule to point at the implementation of the ICU tokenizer. |
|
250 */ |
|
251 void sqlite3Fts2IcuTokenizerModule( |
|
252 sqlite3_tokenizer_module const**ppModule |
|
253 ){ |
|
254 *ppModule = &icuTokenizerModule; |
|
255 } |
|
256 |
|
257 #endif /* defined(SQLITE_ENABLE_ICU) */ |
|
258 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ |