|
1 /* |
|
2 ** 2007 June 22 |
|
3 ** |
|
4 ** The author disclaims copyright to this source code. In place of |
|
5 ** a legal notice, here is a blessing: |
|
6 ** |
|
7 ** May you do good and not evil. |
|
8 ** May you find forgiveness for yourself and forgive others. |
|
9 ** May you share freely, never taking more than you give. |
|
10 ** |
|
11 ****************************************************************************** |
|
12 ** |
|
13 ** This is part of an SQLite module implementing full-text search. |
|
14 ** This particular file implements the generic tokenizer interface. |
|
15 */ |
|
16 |
|
17 /* |
|
18 ** The code in this file is only compiled if: |
|
19 ** |
|
20 ** * The FTS2 module is being built as an extension |
|
21 ** (in which case SQLITE_CORE is not defined), or |
|
22 ** |
|
23 ** * The FTS2 module is being built into the core of |
|
24 ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined). |
|
25 */ |
|
26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) |
|
27 |
|
28 |
|
29 #include "sqlite3.h" |
|
30 #include "sqlite3ext.h" |
|
31 SQLITE_EXTENSION_INIT1 |
|
32 |
|
33 #include "fts2_hash.h" |
|
34 #include "fts2_tokenizer.h" |
|
35 #include <assert.h> |
|
36 |
|
37 /* |
|
38 ** Implementation of the SQL scalar function for accessing the underlying |
|
39 ** hash table. This function may be called as follows: |
|
40 ** |
|
41 ** SELECT <function-name>(<key-name>); |
|
42 ** SELECT <function-name>(<key-name>, <pointer>); |
|
43 ** |
|
44 ** where <function-name> is the name passed as the second argument |
|
45 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer'). |
|
46 ** |
|
47 ** If the <pointer> argument is specified, it must be a blob value |
|
48 ** containing a pointer to be stored as the hash data corresponding |
|
49 ** to the string <key-name>. If <pointer> is not specified, then |
|
50 ** the string <key-name> must already exist in the has table. Otherwise, |
|
51 ** an error is returned. |
|
52 ** |
|
53 ** Whether or not the <pointer> argument is specified, the value returned |
|
54 ** is a blob containing the pointer stored as the hash data corresponding |
|
55 ** to string <key-name> (after the hash-table is updated, if applicable). |
|
56 */ |
|
57 static void scalarFunc( |
|
58 sqlite3_context *context, |
|
59 int argc, |
|
60 sqlite3_value **argv |
|
61 ){ |
|
62 fts2Hash *pHash; |
|
63 void *pPtr = 0; |
|
64 const unsigned char *zName; |
|
65 int nName; |
|
66 |
|
67 assert( argc==1 || argc==2 ); |
|
68 |
|
69 pHash = (fts2Hash *)sqlite3_user_data(context); |
|
70 |
|
71 zName = sqlite3_value_text(argv[0]); |
|
72 nName = sqlite3_value_bytes(argv[0])+1; |
|
73 |
|
74 if( argc==2 ){ |
|
75 void *pOld; |
|
76 int n = sqlite3_value_bytes(argv[1]); |
|
77 if( n!=sizeof(pPtr) ){ |
|
78 sqlite3_result_error(context, "argument type mismatch", -1); |
|
79 return; |
|
80 } |
|
81 pPtr = *(void **)sqlite3_value_blob(argv[1]); |
|
82 pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr); |
|
83 if( pOld==pPtr ){ |
|
84 sqlite3_result_error(context, "out of memory", -1); |
|
85 return; |
|
86 } |
|
87 }else{ |
|
88 pPtr = sqlite3Fts2HashFind(pHash, zName, nName); |
|
89 if( !pPtr ){ |
|
90 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); |
|
91 sqlite3_result_error(context, zErr, -1); |
|
92 sqlite3_free(zErr); |
|
93 return; |
|
94 } |
|
95 } |
|
96 |
|
97 sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT); |
|
98 } |
|
99 |
|
100 #ifdef SQLITE_TEST |
|
101 |
|
102 #include "tcl.h" |
|
103 #include <string.h> |
|
104 |
|
105 /* |
|
106 ** Implementation of a special SQL scalar function for testing tokenizers |
|
107 ** designed to be used in concert with the Tcl testing framework. This |
|
108 ** function must be called with two arguments: |
|
109 ** |
|
110 ** SELECT <function-name>(<key-name>, <input-string>); |
|
111 ** SELECT <function-name>(<key-name>, <pointer>); |
|
112 ** |
|
113 ** where <function-name> is the name passed as the second argument |
|
114 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer') |
|
115 ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test'). |
|
116 ** |
|
117 ** The return value is a string that may be interpreted as a Tcl |
|
118 ** list. For each token in the <input-string>, three elements are |
|
119 ** added to the returned list. The first is the token position, the |
|
120 ** second is the token text (folded, stemmed, etc.) and the third is the |
|
121 ** substring of <input-string> associated with the token. For example, |
|
122 ** using the built-in "simple" tokenizer: |
|
123 ** |
|
124 ** SELECT fts_tokenizer_test('simple', 'I don't see how'); |
|
125 ** |
|
126 ** will return the string: |
|
127 ** |
|
128 ** "{0 i I 1 dont don't 2 see see 3 how how}" |
|
129 ** |
|
130 */ |
|
131 static void testFunc( |
|
132 sqlite3_context *context, |
|
133 int argc, |
|
134 sqlite3_value **argv |
|
135 ){ |
|
136 fts2Hash *pHash; |
|
137 sqlite3_tokenizer_module *p; |
|
138 sqlite3_tokenizer *pTokenizer = 0; |
|
139 sqlite3_tokenizer_cursor *pCsr = 0; |
|
140 |
|
141 const char *zErr = 0; |
|
142 |
|
143 const char *zName; |
|
144 int nName; |
|
145 const char *zInput; |
|
146 int nInput; |
|
147 |
|
148 const char *zArg = 0; |
|
149 |
|
150 const char *zToken; |
|
151 int nToken; |
|
152 int iStart; |
|
153 int iEnd; |
|
154 int iPos; |
|
155 |
|
156 Tcl_Obj *pRet; |
|
157 |
|
158 assert( argc==2 || argc==3 ); |
|
159 |
|
160 nName = sqlite3_value_bytes(argv[0]); |
|
161 zName = (const char *)sqlite3_value_text(argv[0]); |
|
162 nInput = sqlite3_value_bytes(argv[argc-1]); |
|
163 zInput = (const char *)sqlite3_value_text(argv[argc-1]); |
|
164 |
|
165 if( argc==3 ){ |
|
166 zArg = (const char *)sqlite3_value_text(argv[1]); |
|
167 } |
|
168 |
|
169 pHash = (fts2Hash *)sqlite3_user_data(context); |
|
170 p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1); |
|
171 |
|
172 if( !p ){ |
|
173 char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName); |
|
174 sqlite3_result_error(context, zErr, -1); |
|
175 sqlite3_free(zErr); |
|
176 return; |
|
177 } |
|
178 |
|
179 pRet = Tcl_NewObj(); |
|
180 Tcl_IncrRefCount(pRet); |
|
181 |
|
182 if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ |
|
183 zErr = "error in xCreate()"; |
|
184 goto finish; |
|
185 } |
|
186 pTokenizer->pModule = p; |
|
187 if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){ |
|
188 zErr = "error in xOpen()"; |
|
189 goto finish; |
|
190 } |
|
191 pCsr->pTokenizer = pTokenizer; |
|
192 |
|
193 while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ |
|
194 Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); |
|
195 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); |
|
196 zToken = &zInput[iStart]; |
|
197 nToken = iEnd-iStart; |
|
198 Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); |
|
199 } |
|
200 |
|
201 if( SQLITE_OK!=p->xClose(pCsr) ){ |
|
202 zErr = "error in xClose()"; |
|
203 goto finish; |
|
204 } |
|
205 if( SQLITE_OK!=p->xDestroy(pTokenizer) ){ |
|
206 zErr = "error in xDestroy()"; |
|
207 goto finish; |
|
208 } |
|
209 |
|
210 finish: |
|
211 if( zErr ){ |
|
212 sqlite3_result_error(context, zErr, -1); |
|
213 }else{ |
|
214 sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT); |
|
215 } |
|
216 Tcl_DecrRefCount(pRet); |
|
217 } |
|
218 |
|
219 static |
|
220 int registerTokenizer( |
|
221 sqlite3 *db, |
|
222 char *zName, |
|
223 const sqlite3_tokenizer_module *p |
|
224 ){ |
|
225 int rc; |
|
226 sqlite3_stmt *pStmt; |
|
227 const char zSql[] = "SELECT fts2_tokenizer(?, ?)"; |
|
228 |
|
229 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); |
|
230 if( rc!=SQLITE_OK ){ |
|
231 return rc; |
|
232 } |
|
233 |
|
234 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); |
|
235 sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC); |
|
236 sqlite3_step(pStmt); |
|
237 |
|
238 return sqlite3_finalize(pStmt); |
|
239 } |
|
240 |
|
241 static |
|
242 int queryTokenizer( |
|
243 sqlite3 *db, |
|
244 char *zName, |
|
245 const sqlite3_tokenizer_module **pp |
|
246 ){ |
|
247 int rc; |
|
248 sqlite3_stmt *pStmt; |
|
249 const char zSql[] = "SELECT fts2_tokenizer(?)"; |
|
250 |
|
251 *pp = 0; |
|
252 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0); |
|
253 if( rc!=SQLITE_OK ){ |
|
254 return rc; |
|
255 } |
|
256 |
|
257 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC); |
|
258 if( SQLITE_ROW==sqlite3_step(pStmt) ){ |
|
259 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ |
|
260 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); |
|
261 } |
|
262 } |
|
263 |
|
264 return sqlite3_finalize(pStmt); |
|
265 } |
|
266 |
|
267 void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule); |
|
268 |
|
269 /* |
|
270 ** Implementation of the scalar function fts2_tokenizer_internal_test(). |
|
271 ** This function is used for testing only, it is not included in the |
|
272 ** build unless SQLITE_TEST is defined. |
|
273 ** |
|
274 ** The purpose of this is to test that the fts2_tokenizer() function |
|
275 ** can be used as designed by the C-code in the queryTokenizer and |
|
276 ** registerTokenizer() functions above. These two functions are repeated |
|
277 ** in the README.tokenizer file as an example, so it is important to |
|
278 ** test them. |
|
279 ** |
|
280 ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar |
|
281 ** function with no arguments. An assert() will fail if a problem is |
|
282 ** detected. i.e.: |
|
283 ** |
|
284 ** SELECT fts2_tokenizer_internal_test(); |
|
285 ** |
|
286 */ |
|
287 static void intTestFunc( |
|
288 sqlite3_context *context, |
|
289 int argc, |
|
290 sqlite3_value **argv |
|
291 ){ |
|
292 int rc; |
|
293 const sqlite3_tokenizer_module *p1; |
|
294 const sqlite3_tokenizer_module *p2; |
|
295 sqlite3 *db = (sqlite3 *)sqlite3_user_data(context); |
|
296 |
|
297 /* Test the query function */ |
|
298 sqlite3Fts2SimpleTokenizerModule(&p1); |
|
299 rc = queryTokenizer(db, "simple", &p2); |
|
300 assert( rc==SQLITE_OK ); |
|
301 assert( p1==p2 ); |
|
302 rc = queryTokenizer(db, "nosuchtokenizer", &p2); |
|
303 assert( rc==SQLITE_ERROR ); |
|
304 assert( p2==0 ); |
|
305 assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") ); |
|
306 |
|
307 /* Test the storage function */ |
|
308 rc = registerTokenizer(db, "nosuchtokenizer", p1); |
|
309 assert( rc==SQLITE_OK ); |
|
310 rc = queryTokenizer(db, "nosuchtokenizer", &p2); |
|
311 assert( rc==SQLITE_OK ); |
|
312 assert( p2==p1 ); |
|
313 |
|
314 sqlite3_result_text(context, "ok", -1, SQLITE_STATIC); |
|
315 } |
|
316 |
|
317 #endif |
|
318 |
|
319 /* |
|
320 ** Set up SQL objects in database db used to access the contents of |
|
321 ** the hash table pointed to by argument pHash. The hash table must |
|
322 ** been initialised to use string keys, and to take a private copy |
|
323 ** of the key when a value is inserted. i.e. by a call similar to: |
|
324 ** |
|
325 ** sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1); |
|
326 ** |
|
327 ** This function adds a scalar function (see header comment above |
|
328 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is |
|
329 ** defined at compilation time, a temporary virtual table (see header |
|
330 ** comment above struct HashTableVtab) to the database schema. Both |
|
331 ** provide read/write access to the contents of *pHash. |
|
332 ** |
|
333 ** The third argument to this function, zName, is used as the name |
|
334 ** of both the scalar and, if created, the virtual table. |
|
335 */ |
|
336 int sqlite3Fts2InitHashTable( |
|
337 sqlite3 *db, |
|
338 fts2Hash *pHash, |
|
339 const char *zName |
|
340 ){ |
|
341 int rc = SQLITE_OK; |
|
342 void *p = (void *)pHash; |
|
343 const int any = SQLITE_ANY; |
|
344 char *zTest = 0; |
|
345 char *zTest2 = 0; |
|
346 |
|
347 #ifdef SQLITE_TEST |
|
348 void *pdb = (void *)db; |
|
349 zTest = sqlite3_mprintf("%s_test", zName); |
|
350 zTest2 = sqlite3_mprintf("%s_internal_test", zName); |
|
351 if( !zTest || !zTest2 ){ |
|
352 rc = SQLITE_NOMEM; |
|
353 } |
|
354 #endif |
|
355 |
|
356 if( rc!=SQLITE_OK |
|
357 || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0)) |
|
358 || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0)) |
|
359 #ifdef SQLITE_TEST |
|
360 || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0)) |
|
361 || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0)) |
|
362 || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0)) |
|
363 #endif |
|
364 ); |
|
365 |
|
366 sqlite3_free(zTest); |
|
367 sqlite3_free(zTest2); |
|
368 return rc; |
|
369 } |
|
370 |
|
371 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ |