|
1 /* |
|
2 ** 2007 June 22 |
|
3 ** |
|
4 ** The author disclaims copyright to this source code. In place of |
|
5 ** a legal notice, here is a blessing: |
|
6 ** |
|
7 ** May you do good and not evil. |
|
8 ** May you find forgiveness for yourself and forgive others. |
|
9 ** May you share freely, never taking more than you give. |
|
10 ** |
|
11 ************************************************************************* |
|
12 ** This file implements a tokenizer for fts3 based on the ICU library. |
|
13 ** |
|
14 ** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $ |
|
15 */ |
|
16 |
|
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
|
18 #ifdef SQLITE_ENABLE_ICU |
|
19 |
|
20 #include <assert.h> |
|
21 #include <string.h> |
|
22 #include "fts3_tokenizer.h" |
|
23 |
|
24 #include <unicode/ubrk.h> |
|
25 #include <unicode/ucol.h> |
|
26 #include <unicode/ustring.h> |
|
27 #include <unicode/utf16.h> |
|
28 |
|
/*
** Short type names for the two structures defined in this file: the
** per-iteration cursor and the tokenizer instance itself.
*/
typedef struct IcuCursor IcuCursor;
typedef struct IcuTokenizer IcuTokenizer;
|
31 |
|
32 struct IcuTokenizer { |
|
33 sqlite3_tokenizer base; |
|
34 char *zLocale; |
|
35 }; |
|
36 |
|
37 struct IcuCursor { |
|
38 sqlite3_tokenizer_cursor base; |
|
39 |
|
40 UBreakIterator *pIter; /* ICU break-iterator object */ |
|
41 int nChar; /* Number of UChar elements in pInput */ |
|
42 UChar *aChar; /* Copy of input using utf-16 encoding */ |
|
43 int *aOffset; /* Offsets of each character in utf-8 input */ |
|
44 |
|
45 int nBuffer; |
|
46 char *zBuffer; |
|
47 |
|
48 int iToken; |
|
49 }; |
|
50 |
|
51 /* |
|
52 ** Create a new tokenizer instance. |
|
53 */ |
|
54 static int icuCreate( |
|
55 int argc, /* Number of entries in argv[] */ |
|
56 const char * const *argv, /* Tokenizer creation arguments */ |
|
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ |
|
58 ){ |
|
59 IcuTokenizer *p; |
|
60 int n = 0; |
|
61 |
|
62 if( argc>0 ){ |
|
63 n = strlen(argv[0])+1; |
|
64 } |
|
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); |
|
66 if( !p ){ |
|
67 return SQLITE_NOMEM; |
|
68 } |
|
69 memset(p, 0, sizeof(IcuTokenizer)); |
|
70 |
|
71 if( n ){ |
|
72 p->zLocale = (char *)&p[1]; |
|
73 memcpy(p->zLocale, argv[0], n); |
|
74 } |
|
75 |
|
76 *ppTokenizer = (sqlite3_tokenizer *)p; |
|
77 |
|
78 return SQLITE_OK; |
|
79 } |
|
80 |
|
81 /* |
|
82 ** Destroy a tokenizer |
|
83 */ |
|
84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |
|
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
|
86 sqlite3_free(p); |
|
87 return SQLITE_OK; |
|
88 } |
|
89 |
|
90 /* |
|
91 ** Prepare to begin tokenizing a particular string. The input |
|
92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor |
|
93 ** used to incrementally tokenize this string is returned in |
|
94 ** *ppCursor. |
|
95 */ |
|
96 static int icuOpen( |
|
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
|
98 const char *zInput, /* Input string */ |
|
99 int nInput, /* Length of zInput in bytes */ |
|
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
|
101 ){ |
|
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
|
103 IcuCursor *pCsr; |
|
104 |
|
105 const int32_t opt = U_FOLD_CASE_DEFAULT; |
|
106 UErrorCode status = U_ZERO_ERROR; |
|
107 int nChar; |
|
108 |
|
109 UChar32 c; |
|
110 int iInput = 0; |
|
111 int iOut = 0; |
|
112 |
|
113 *ppCursor = 0; |
|
114 |
|
115 if( nInput<0 ){ |
|
116 nInput = strlen(zInput); |
|
117 } |
|
118 nChar = nInput+1; |
|
119 pCsr = (IcuCursor *)sqlite3_malloc( |
|
120 sizeof(IcuCursor) + /* IcuCursor */ |
|
121 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ |
|
122 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ |
|
123 ); |
|
124 if( !pCsr ){ |
|
125 return SQLITE_NOMEM; |
|
126 } |
|
127 memset(pCsr, 0, sizeof(IcuCursor)); |
|
128 pCsr->aChar = (UChar *)&pCsr[1]; |
|
129 pCsr->aOffset = (int *)&pCsr->aChar[nChar]; |
|
130 |
|
131 pCsr->aOffset[iOut] = iInput; |
|
132 U8_NEXT(zInput, iInput, nInput, c); |
|
133 while( c>0 ){ |
|
134 int isError = 0; |
|
135 c = u_foldCase(c, opt); |
|
136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |
|
137 if( isError ){ |
|
138 sqlite3_free(pCsr); |
|
139 return SQLITE_ERROR; |
|
140 } |
|
141 pCsr->aOffset[iOut] = iInput; |
|
142 |
|
143 if( iInput<nInput ){ |
|
144 U8_NEXT(zInput, iInput, nInput, c); |
|
145 }else{ |
|
146 c = 0; |
|
147 } |
|
148 } |
|
149 |
|
150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |
|
151 if( !U_SUCCESS(status) ){ |
|
152 sqlite3_free(pCsr); |
|
153 return SQLITE_ERROR; |
|
154 } |
|
155 pCsr->nChar = iOut; |
|
156 |
|
157 ubrk_first(pCsr->pIter); |
|
158 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |
|
159 return SQLITE_OK; |
|
160 } |
|
161 |
|
162 /* |
|
163 ** Close a tokenization cursor previously opened by a call to icuOpen(). |
|
164 */ |
|
165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |
|
166 IcuCursor *pCsr = (IcuCursor *)pCursor; |
|
167 ubrk_close(pCsr->pIter); |
|
168 sqlite3_free(pCsr->zBuffer); |
|
169 sqlite3_free(pCsr); |
|
170 return SQLITE_OK; |
|
171 } |
|
172 |
|
173 /* |
|
174 ** Extract the next token from a tokenization cursor. |
|
175 */ |
|
176 static int icuNext( |
|
177 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ |
|
178 const char **ppToken, /* OUT: *ppToken is the token text */ |
|
179 int *pnBytes, /* OUT: Number of bytes in token */ |
|
180 int *piStartOffset, /* OUT: Starting offset of token */ |
|
181 int *piEndOffset, /* OUT: Ending offset of token */ |
|
182 int *piPosition /* OUT: Position integer of token */ |
|
183 ){ |
|
184 IcuCursor *pCsr = (IcuCursor *)pCursor; |
|
185 |
|
186 int iStart = 0; |
|
187 int iEnd = 0; |
|
188 int nByte = 0; |
|
189 |
|
190 while( iStart==iEnd ){ |
|
191 UChar32 c; |
|
192 |
|
193 iStart = ubrk_current(pCsr->pIter); |
|
194 iEnd = ubrk_next(pCsr->pIter); |
|
195 if( iEnd==UBRK_DONE ){ |
|
196 return SQLITE_DONE; |
|
197 } |
|
198 |
|
199 while( iStart<iEnd ){ |
|
200 int iWhite = iStart; |
|
201 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |
|
202 if( u_isspace(c) ){ |
|
203 iStart = iWhite; |
|
204 }else{ |
|
205 break; |
|
206 } |
|
207 } |
|
208 assert(iStart<=iEnd); |
|
209 } |
|
210 |
|
211 do { |
|
212 UErrorCode status = U_ZERO_ERROR; |
|
213 if( nByte ){ |
|
214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |
|
215 if( !zNew ){ |
|
216 return SQLITE_NOMEM; |
|
217 } |
|
218 pCsr->zBuffer = zNew; |
|
219 pCsr->nBuffer = nByte; |
|
220 } |
|
221 |
|
222 u_strToUTF8( |
|
223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ |
|
224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ |
|
225 &status /* Output success/failure */ |
|
226 ); |
|
227 } while( nByte>pCsr->nBuffer ); |
|
228 |
|
229 *ppToken = pCsr->zBuffer; |
|
230 *pnBytes = nByte; |
|
231 *piStartOffset = pCsr->aOffset[iStart]; |
|
232 *piEndOffset = pCsr->aOffset[iEnd]; |
|
233 *piPosition = pCsr->iToken++; |
|
234 |
|
235 return SQLITE_OK; |
|
236 } |
|
237 |
|
238 /* |
|
239 ** The set of routines that implement the simple tokenizer |
|
240 */ |
|
241 static const sqlite3_tokenizer_module icuTokenizerModule = { |
|
242 0, /* iVersion */ |
|
243 icuCreate, /* xCreate */ |
|
244 icuDestroy, /* xCreate */ |
|
245 icuOpen, /* xOpen */ |
|
246 icuClose, /* xClose */ |
|
247 icuNext, /* xNext */ |
|
248 }; |
|
249 |
|
250 /* |
|
251 ** Set *ppModule to point at the implementation of the ICU tokenizer. |
|
252 */ |
|
253 void sqlite3Fts3IcuTokenizerModule( |
|
254 sqlite3_tokenizer_module const**ppModule |
|
255 ){ |
|
256 *ppModule = &icuTokenizerModule; |
|
257 } |
|
258 |
|
259 #endif /* defined(SQLITE_ENABLE_ICU) */ |
|
260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |