persistentstorage/sqlite3api/SQLite/fts2_tokenizer.c
changeset 0 08ec8eefde2f
equal deleted inserted replaced
-1:000000000000 0:08ec8eefde2f
       
     1 /*
       
     2 ** 2007 June 22
       
     3 **
       
     4 ** The author disclaims copyright to this source code.  In place of
       
     5 ** a legal notice, here is a blessing:
       
     6 **
       
     7 **    May you do good and not evil.
       
     8 **    May you find forgiveness for yourself and forgive others.
       
     9 **    May you share freely, never taking more than you give.
       
    10 **
       
    11 ******************************************************************************
       
    12 **
       
    13 ** This is part of an SQLite module implementing full-text search.
       
    14 ** This particular file implements the generic tokenizer interface.
       
    15 */
       
    16 
       
    17 /*
       
    18 ** The code in this file is only compiled if:
       
    19 **
       
    20 **     * The FTS2 module is being built as an extension
       
    21 **       (in which case SQLITE_CORE is not defined), or
       
    22 **
       
    23 **     * The FTS2 module is being built into the core of
       
    24 **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
       
    25 */
       
    26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
       
    27 
       
    28 
       
    29 #include "sqlite3.h"
       
    30 #include "sqlite3ext.h"
       
    31 SQLITE_EXTENSION_INIT1
       
    32 
       
    33 #include "fts2_hash.h"
       
    34 #include "fts2_tokenizer.h"
       
    35 #include <assert.h>
       
    36 
       
    37 /*
       
    38 ** Implementation of the SQL scalar function for accessing the underlying 
       
    39 ** hash table. This function may be called as follows:
       
    40 **
       
    41 **   SELECT <function-name>(<key-name>);
       
    42 **   SELECT <function-name>(<key-name>, <pointer>);
       
    43 **
       
    44 ** where <function-name> is the name passed as the second argument
       
    45 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
       
    46 **
       
    47 ** If the <pointer> argument is specified, it must be a blob value
       
    48 ** containing a pointer to be stored as the hash data corresponding
       
    49 ** to the string <key-name>. If <pointer> is not specified, then
       
     50 ** the string <key-name> must already exist in the hash table. Otherwise,
       
    51 ** an error is returned.
       
    52 **
       
    53 ** Whether or not the <pointer> argument is specified, the value returned
       
    54 ** is a blob containing the pointer stored as the hash data corresponding
       
    55 ** to string <key-name> (after the hash-table is updated, if applicable).
       
    56 */
       
    57 static void scalarFunc(
       
    58   sqlite3_context *context,
       
    59   int argc,
       
    60   sqlite3_value **argv
       
    61 ){
       
    62   fts2Hash *pHash;
       
    63   void *pPtr = 0;
       
    64   const unsigned char *zName;
       
    65   int nName;
       
    66 
       
    67   assert( argc==1 || argc==2 );
       
    68 
       
    69   pHash = (fts2Hash *)sqlite3_user_data(context);
       
    70 
       
    71   zName = sqlite3_value_text(argv[0]);
       
    72   nName = sqlite3_value_bytes(argv[0])+1;
       
    73 
       
    74   if( argc==2 ){
       
    75     void *pOld;
       
    76     int n = sqlite3_value_bytes(argv[1]);
       
    77     if( n!=sizeof(pPtr) ){
       
    78       sqlite3_result_error(context, "argument type mismatch", -1);
       
    79       return;
       
    80     }
       
    81     pPtr = *(void **)sqlite3_value_blob(argv[1]);
       
    82     pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
       
    83     if( pOld==pPtr ){
       
    84       sqlite3_result_error(context, "out of memory", -1);
       
    85       return;
       
    86     }
       
    87   }else{
       
    88     pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
       
    89     if( !pPtr ){
       
    90       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
       
    91       sqlite3_result_error(context, zErr, -1);
       
    92       sqlite3_free(zErr);
       
    93       return;
       
    94     }
       
    95   }
       
    96 
       
    97   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
       
    98 }
       
    99 
       
   100 #ifdef SQLITE_TEST
       
   101 
       
   102 #include "tcl.h"
       
   103 #include <string.h>
       
   104 
       
   105 /*
       
   106 ** Implementation of a special SQL scalar function for testing tokenizers 
       
   107 ** designed to be used in concert with the Tcl testing framework. This
       
    108 ** function must be called with two or three arguments:
       
    109 **
       
    110 **   SELECT <function-name>(<key-name>, <input-string>);
       
    111 **   SELECT <function-name>(<key-name>, <arg>, <input-string>);
       
   112 **
       
   113 ** where <function-name> is the name passed as the second argument
       
   114 ** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
       
   115 ** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
       
   116 **
       
   117 ** The return value is a string that may be interpreted as a Tcl
       
   118 ** list. For each token in the <input-string>, three elements are
       
   119 ** added to the returned list. The first is the token position, the 
       
   120 ** second is the token text (folded, stemmed, etc.) and the third is the
       
   121 ** substring of <input-string> associated with the token. For example, 
       
   122 ** using the built-in "simple" tokenizer:
       
   123 **
       
    124 **   SELECT fts2_tokenizer_test('simple', 'I don''t see how');
       
   125 **
       
   126 ** will return the string:
       
   127 **
       
   128 **   "{0 i I 1 dont don't 2 see see 3 how how}"
       
   129 **   
       
   130 */
       
   131 static void testFunc(
       
   132   sqlite3_context *context,
       
   133   int argc,
       
   134   sqlite3_value **argv
       
   135 ){
       
   136   fts2Hash *pHash;
       
   137   sqlite3_tokenizer_module *p;
       
   138   sqlite3_tokenizer *pTokenizer = 0;
       
   139   sqlite3_tokenizer_cursor *pCsr = 0;
       
   140 
       
   141   const char *zErr = 0;
       
   142 
       
   143   const char *zName;
       
   144   int nName;
       
   145   const char *zInput;
       
   146   int nInput;
       
   147 
       
   148   const char *zArg = 0;
       
   149 
       
   150   const char *zToken;
       
   151   int nToken;
       
   152   int iStart;
       
   153   int iEnd;
       
   154   int iPos;
       
   155 
       
   156   Tcl_Obj *pRet;
       
   157 
       
   158   assert( argc==2 || argc==3 );
       
   159 
       
   160   nName = sqlite3_value_bytes(argv[0]);
       
   161   zName = (const char *)sqlite3_value_text(argv[0]);
       
   162   nInput = sqlite3_value_bytes(argv[argc-1]);
       
   163   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
       
   164 
       
   165   if( argc==3 ){
       
   166     zArg = (const char *)sqlite3_value_text(argv[1]);
       
   167   }
       
   168 
       
   169   pHash = (fts2Hash *)sqlite3_user_data(context);
       
   170   p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);
       
   171 
       
   172   if( !p ){
       
   173     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
       
   174     sqlite3_result_error(context, zErr, -1);
       
   175     sqlite3_free(zErr);
       
   176     return;
       
   177   }
       
   178 
       
   179   pRet = Tcl_NewObj();
       
   180   Tcl_IncrRefCount(pRet);
       
   181 
       
   182   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
       
   183     zErr = "error in xCreate()";
       
   184     goto finish;
       
   185   }
       
   186   pTokenizer->pModule = p;
       
   187   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
       
   188     zErr = "error in xOpen()";
       
   189     goto finish;
       
   190   }
       
   191   pCsr->pTokenizer = pTokenizer;
       
   192 
       
   193   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
       
   194     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
       
   195     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
       
   196     zToken = &zInput[iStart];
       
   197     nToken = iEnd-iStart;
       
   198     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
       
   199   }
       
   200 
       
   201   if( SQLITE_OK!=p->xClose(pCsr) ){
       
   202     zErr = "error in xClose()";
       
   203     goto finish;
       
   204   }
       
   205   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
       
   206     zErr = "error in xDestroy()";
       
   207     goto finish;
       
   208   }
       
   209 
       
   210 finish:
       
   211   if( zErr ){
       
   212     sqlite3_result_error(context, zErr, -1);
       
   213   }else{
       
   214     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
       
   215   }
       
   216   Tcl_DecrRefCount(pRet);
       
   217 }
       
   218 
       
   219 static
       
   220 int registerTokenizer(
       
   221   sqlite3 *db, 
       
   222   char *zName, 
       
   223   const sqlite3_tokenizer_module *p
       
   224 ){
       
   225   int rc;
       
   226   sqlite3_stmt *pStmt;
       
   227   const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
       
   228 
       
   229   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
       
   230   if( rc!=SQLITE_OK ){
       
   231     return rc;
       
   232   }
       
   233 
       
   234   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
       
   235   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
       
   236   sqlite3_step(pStmt);
       
   237 
       
   238   return sqlite3_finalize(pStmt);
       
   239 }
       
   240 
       
   241 static
       
   242 int queryTokenizer(
       
   243   sqlite3 *db, 
       
   244   char *zName,  
       
   245   const sqlite3_tokenizer_module **pp
       
   246 ){
       
   247   int rc;
       
   248   sqlite3_stmt *pStmt;
       
   249   const char zSql[] = "SELECT fts2_tokenizer(?)";
       
   250 
       
   251   *pp = 0;
       
   252   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
       
   253   if( rc!=SQLITE_OK ){
       
   254     return rc;
       
   255   }
       
   256 
       
   257   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
       
   258   if( SQLITE_ROW==sqlite3_step(pStmt) ){
       
   259     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
       
   260       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
       
   261     }
       
   262   }
       
   263 
       
   264   return sqlite3_finalize(pStmt);
       
   265 }
       
   266 
       
   267 void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
       
   268 
       
   269 /*
       
   270 ** Implementation of the scalar function fts2_tokenizer_internal_test().
       
   271 ** This function is used for testing only, it is not included in the
       
   272 ** build unless SQLITE_TEST is defined.
       
   273 **
       
   274 ** The purpose of this is to test that the fts2_tokenizer() function
       
   275 ** can be used as designed by the C-code in the queryTokenizer and
       
   276 ** registerTokenizer() functions above. These two functions are repeated
       
   277 ** in the README.tokenizer file as an example, so it is important to
       
   278 ** test them.
       
   279 **
       
   280 ** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
       
   281 ** function with no arguments. An assert() will fail if a problem is
       
   282 ** detected. i.e.:
       
   283 **
       
   284 **     SELECT fts2_tokenizer_internal_test();
       
   285 **
       
   286 */
       
   287 static void intTestFunc(
       
   288   sqlite3_context *context,
       
   289   int argc,
       
   290   sqlite3_value **argv
       
   291 ){
       
   292   int rc;
       
   293   const sqlite3_tokenizer_module *p1;
       
   294   const sqlite3_tokenizer_module *p2;
       
   295   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
       
   296 
       
   297   /* Test the query function */
       
   298   sqlite3Fts2SimpleTokenizerModule(&p1);
       
   299   rc = queryTokenizer(db, "simple", &p2);
       
   300   assert( rc==SQLITE_OK );
       
   301   assert( p1==p2 );
       
   302   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
       
   303   assert( rc==SQLITE_ERROR );
       
   304   assert( p2==0 );
       
   305   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
       
   306 
       
   307   /* Test the storage function */
       
   308   rc = registerTokenizer(db, "nosuchtokenizer", p1);
       
   309   assert( rc==SQLITE_OK );
       
   310   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
       
   311   assert( rc==SQLITE_OK );
       
   312   assert( p2==p1 );
       
   313 
       
   314   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
       
   315 }
       
   316 
       
   317 #endif
       
   318 
       
   319 /*
       
   320 ** Set up SQL objects in database db used to access the contents of
       
   321 ** the hash table pointed to by argument pHash. The hash table must
       
    322 ** have been initialised to use string keys, and to take a private copy
       
   323 ** of the key when a value is inserted. i.e. by a call similar to:
       
   324 **
       
   325 **    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
       
   326 **
       
   327 ** This function adds a scalar function (see header comment above
       
   328 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
       
   329 ** defined at compilation time, a temporary virtual table (see header 
       
   330 ** comment above struct HashTableVtab) to the database schema. Both 
       
   331 ** provide read/write access to the contents of *pHash.
       
   332 **
       
   333 ** The third argument to this function, zName, is used as the name
       
   334 ** of both the scalar and, if created, the virtual table.
       
   335 */
       
   336 int sqlite3Fts2InitHashTable(
       
   337   sqlite3 *db, 
       
   338   fts2Hash *pHash, 
       
   339   const char *zName
       
   340 ){
       
   341   int rc = SQLITE_OK;
       
   342   void *p = (void *)pHash;
       
   343   const int any = SQLITE_ANY;
       
   344   char *zTest = 0;
       
   345   char *zTest2 = 0;
       
   346 
       
   347 #ifdef SQLITE_TEST
       
   348   void *pdb = (void *)db;
       
   349   zTest = sqlite3_mprintf("%s_test", zName);
       
   350   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
       
   351   if( !zTest || !zTest2 ){
       
   352     rc = SQLITE_NOMEM;
       
   353   }
       
   354 #endif
       
   355 
       
   356   if( rc!=SQLITE_OK
       
   357    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
       
   358    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
       
   359 #ifdef SQLITE_TEST
       
   360    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
       
   361    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
       
   362    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
       
   363 #endif
       
   364   );
       
   365 
       
   366   sqlite3_free(zTest);
       
   367   sqlite3_free(zTest2);
       
   368   return rc;
       
   369 }
       
   370 
       
   371 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */