persistentstorage/sqlite3api/SQLite/fts3_tokenizer.h
changeset 0 08ec8eefde2f
equal deleted inserted replaced
-1:000000000000 0:08ec8eefde2f
       
     1 /*
       
     2 ** 2006 July 10
       
     3 **
       
     4 ** The author disclaims copyright to this source code.
       
     5 **
       
     6 *************************************************************************
       
     7 ** Defines the interface to tokenizers used by fulltext-search.  There
       
     8 ** are three basic components:
       
     9 **
       
    10 ** sqlite3_tokenizer_module is a singleton defining the tokenizer
       
    11 ** interface functions.  This is essentially the class structure for
       
    12 ** tokenizers.
       
    13 **
       
    14 ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
       
    15 ** including customization information defined at creation time.
       
    16 **
       
    17 ** sqlite3_tokenizer_cursor is created by a tokenizer to generate
       
    18 ** tokens from a particular input.
       
    19 */
       
    20 #ifndef _FTS3_TOKENIZER_H_
       
    21 #define _FTS3_TOKENIZER_H_
       
    22 
       
    23 /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
       
    24 ** If tokenizers are to be allowed to call sqlite3_*() functions, then
       
    25 ** we will need a way to register the API consistently.
       
    26 */
       
    27 #include "sqlite3.h"
       
    28 
       
    29 /*
       
    30 ** Structures used by the tokenizer interface. When a new tokenizer
       
    31 ** implementation is registered, the caller provides a pointer to
       
    32 ** an sqlite3_tokenizer_module containing pointers to the callback
       
    33 ** functions that make up an implementation.
       
    34 **
       
    35 ** When an fts3 table is created, it passes any arguments passed to
       
    36 ** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
       
    37 ** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
       
    38 ** implementation. The xCreate() function in turn returns an 
       
    39 ** sqlite3_tokenizer structure representing the specific tokenizer to
       
    40 ** be used for the fts3 table (customized by the tokenizer clause arguments).
       
    41 **
       
    42 ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
       
    43 ** method is called. It returns an sqlite3_tokenizer_cursor object
       
    44 ** that may be used to tokenize a specific input buffer based on
       
    45 ** the tokenization rules supplied by a specific sqlite3_tokenizer
       
    46 ** object.
       
    47 */
       
    48 typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;  /* Callback table for one tokenizer implementation */
       
    49 typedef struct sqlite3_tokenizer sqlite3_tokenizer;  /* A specific tokenizer, as returned by xCreate() */
       
    50 typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;  /* Tokenizing state for one input buffer, as returned by xOpen() */
       
    51 
       
    52 struct sqlite3_tokenizer_module {
       
    53   /* Table of callback pointers that together make up one tokenizer implementation. */
       
    54   /*
       
    55   ** Structure version. Should always be set to 0.
       
    56   */
       
    57   int iVersion;
       
    58 
       
    59   /*
       
    60   ** Create a new tokenizer. The values in the argv[] array are the
       
    61   ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
       
    62   ** TABLE statement that created the fts3 table. For example, if
       
    63   ** the following SQL is executed:
       
    64   **
       
    65   **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
       
    66   **
       
    67   ** then argc is set to 2, and the argv[] array contains pointers
       
    68   ** to the strings "arg1" and "arg2".
       
    69   **
       
    70   ** This method should return either SQLITE_OK (0), or an SQLite error
       
    71   ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
       
    72   ** to point at the newly created tokenizer structure. The generic
       
    73   ** sqlite3_tokenizer.pModule variable should not be initialised by
       
    74   ** this callback. The caller will do so.
       
    75   */
       
    76   int (*xCreate)(
       
    77     int argc,                           /* Number of entries in argv[] */
       
    78     const char *const*argv,             /* Tokenizer argument strings */
       
    79     sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
       
    80   );
       
    81 
       
    82   /*
       
    83   ** Destroy an existing tokenizer (pTokenizer, as produced by xCreate()).
       
    84   ** The fts3 module calls this method exactly once for each successful
       
    85   ** call to xCreate(). */
       
    86   int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
       
    87 
       
    88   /*
       
    89   ** Create a tokenizer cursor to tokenize the input buffer (pInput, nBytes).
       
    90   ** The caller is responsible for ensuring that the input buffer remains
       
    91   ** valid until the cursor is closed (using the xClose() method).
       
    92   */
       
    93   int (*xOpen)(
       
    94     sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
       
    95     const char *pInput, int nBytes,      /* Input buffer; see the TODO above xNext */
       
    96     sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
       
    97   );
       
    98 
       
    99   /*
       
   100   ** Destroy an existing tokenizer cursor (pCursor, as produced by xOpen()).
       
   101   ** The fts3 module calls this method exactly once for each successful
       
   102   ** call to xOpen(). */
       
   103   int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
       
   104 
       
   105   /*
       
   106   ** Retrieve the next token from the tokenizer cursor pCursor. This
       
   107   ** method should either return SQLITE_OK and set the values of the
       
   108   ** "OUT" variables identified below, or SQLITE_DONE to indicate that
       
   109   ** the end of the buffer has been reached, or an SQLite error code.
       
   110   **
       
   111   ** *ppToken should be set to point at a buffer containing the
       
   112   ** normalized version of the token (i.e. after any case-folding and/or
       
   113   ** stemming has been performed). *pnBytes should be set to the length
       
   114   ** of this buffer in bytes. The input text that generated the token is
       
   115   ** identified by the byte offsets returned in *piStartOffset and
       
   116   ** *piEndOffset. *piPosition is the number of tokens returned before it.
       
   117   **
       
   118   ** The buffer *ppToken is set to point at is managed by the tokenizer
       
   119   ** implementation. It is only required to be valid until the next call
       
   120   ** to xNext() or xClose().
       
   121   */
       
   122   /* TODO(shess) current implementation requires pInput to be
       
   123   ** nul-terminated.  This should either be fixed, or pInput/nBytes
       
   124   ** should be converted to zInput.
       
   125   */
       
   126   int (*xNext)(
       
   127     sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
       
   128     const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
       
   129     int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
       
   130     int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
       
   131     int *piPosition      /* OUT: Number of tokens returned before this one */
       
   132   );
       
   133 };
       
   134 
       
   135 struct sqlite3_tokenizer {  /* A specific tokenizer, as created by sqlite3_tokenizer_module.xCreate() */
       
   136   const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer; set by the caller of xCreate(), not by xCreate() itself */
       
   137   /* Tokenizer implementations will typically add additional fields */
       
   138 };
       
   139 
       
   140 struct sqlite3_tokenizer_cursor {  /* Tokenizing state for one input buffer, as created by sqlite3_tokenizer_module.xOpen() */
       
   141   sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. TODO confirm: who assigns this field is not stated here */
       
   142   /* Tokenizer implementations will typically add additional fields */
       
   143 };
       
   144 
       
   145 #endif /* _FTS3_TOKENIZER_H_ */