/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS3 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS3 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
*/

/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists. The
** structures are like b+trees in layout, but are constructed from the
** bottom up in optimal fashion and are not updatable. Since trees
** are built from the bottom up, things will be described from the
** bottom up.
**
**
**** Varints ****
** The basic unit of encoding is a variable-length integer called a
** varint. We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
**
** KEY:
**   A = 0xxxxxxx    7 bits of data and one flag bit
**   B = 1xxxxxxx    7 bits of data and one flag bit
**
**    7 bits - A
**   14 bits - BA
**   21 bits - BBA
** and so on.
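**
** For example (an illustrative value, not taken from real data), the
** integer 1000000 is encoded as the three bytes 0xC0 0x84 0x3D:
** pattern BBA, least-significant seven bits first.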
**
** This is similar to how SQLite itself encodes varints (see util.c),
** except that the core encoding is big-endian (most-significant bits
** first) where this one is little-endian.
**
**
**** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a
** given term. Doclists hold docids, and can optionally associate
** token positions and offsets with docids.
**
** A DL_POSITIONS_OFFSETS doclist is stored like this:
**
** array {
**   varint docid;
**   array {                (position list for column 0)
**     varint position;     (delta from previous position plus POS_BASE)
**     varint startOffset;  (delta from previous startOffset)
**     varint endOffset;    (delta from startOffset)
**   }
**   array {
**     varint POS_COLUMN;   (marks start of position list for new column)
**     varint column;       (index of new column)
**     array {
**       varint position;   (delta from previous position plus POS_BASE)
**       varint startOffset;(delta from previous startOffset)
**       varint endOffset;  (delta from startOffset)
**     }
**   }
**   varint POS_END;        (marks end of positions for this document.)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory. A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset,
** both based at 0. Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
** ending a position list array.
**
** A DL_POSITIONS doclist omits the startOffset and endOffset
** information. A DL_DOCIDS doclist omits both the position and
** offset information, becoming an array of varint-encoded docids.
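**
** For example (an illustrative encoding, not taken from real data), a
** DL_POSITIONS doclist holding docid 5 with positions 1 and 4 in
** column 0 is the varint sequence 5 3 5 0: the docid, then
** 1-0+POS_BASE, 4-1+POS_BASE, and POS_END (POS_BASE==2 and POS_END==0;
** see the enum further down).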
**
** On-disk data is stored as type DL_DEFAULT, so we don't serialize
** the type. Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
**
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term. Leaf
** nodes are written using LeafWriter, and read using LeafReader (to
** iterate through a single leaf node's data) and LeavesReader (to
** iterate through a segment's entire leaf layer). Leaf nodes have
** the format:
**
** varint iHeight;             (height from leaf level, always 0)
** varint nTerm;               (length of first term)
** char pTerm[nTerm];          (content of first term)
** varint nDoclist;            (length of term's associated doclist)
** char pDoclist[nDoclist];    (content of doclist)
** array {
**                             (further terms are delta-encoded)
**   varint nPrefix;           (length of prefix shared with previous term)
**   varint nSuffix;           (length of unshared suffix)
**   char pTermSuffix[nSuffix];(unshared suffix of next term)
**   varint nDoclist;          (length of term's associated doclist)
**   char pDoclist[nDoclist];  (content of doclist)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
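**
** For example (an illustrative pair of terms, not taken from real
** data), if "something" is followed by "somewhere", the second term
** is stored as nPrefix=4 ("some"), nSuffix=5, pTermSuffix="where".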
**
** Leaf nodes are broken into blocks which are stored contiguously in
** the %_segments table in sorted order. This means that when the end
** of a node is reached, the next term is in the node with the next
** greater node id.
**
** New data is spilled to a new leaf node when the current node
** exceeds LEAF_MAX bytes (default 2048). New data which itself is
** larger than STANDALONE_MIN (default 1024) is placed in a standalone
** node (a leaf node with a single term and doclist). The goal of
** these settings is to pack together groups of small doclists while
** making it efficient to directly access large doclists. The
** assumption is that large doclists represent terms which are more
** likely to be query targets.
**
** TODO(shess) It may be useful for blocking decisions to be more
** dynamic. For instance, it may make more sense to have a 2.5k leaf
** node rather than splitting into 2k and .5k nodes. My intuition is
** that this might extend through 2x or 4x the pagesize.
**
**
**** Segment interior nodes ****
** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree. Interior
** nodes are written using InteriorWriter, and read using
** InteriorReader. InteriorWriters are created as needed when
** SegmentWriter creates new leaf nodes, or when an interior node
** itself grows too big and must be split. The format of interior
** nodes:
**
** varint iHeight;           (height from leaf level, always >0)
** varint iBlockid;          (block id of node's leftmost subtree)
** optional {
**   varint nTerm;           (length of first term)
**   char pTerm[nTerm];      (content of first term)
**   array {
**                           (further terms are delta-encoded)
**     varint nPrefix;       (length of shared prefix with previous term)
**     varint nSuffix;       (length of unshared suffix)
**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
**   }
** }
**
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees. The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded. The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i. Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny. The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
**
**
**** Segment directory ****
** The segment directory in table %_segdir stores meta-information for
** merging and deleting segments, and also the root node of the
** segment's tree.
**
** The root node is the top node of the segment's tree after encoding
** the entire segment, restricted to ROOT_MAX bytes (default 1024).
** This could be either a leaf node or an interior node. If the top
** node requires more than ROOT_MAX bytes, it is flushed to %_segments
** and a new root interior node is generated (which should always fit
** within ROOT_MAX because it only needs space for 2 varints, the
** height and the blockid of the previous root).
**
** The meta-information in the segment directory is:
**   level             - segment level (see below)
**   idx               - index within level
**                     - (level,idx uniquely identify a segment)
**   start_block       - first leaf node
**   leaves_end_block  - last leaf node
**   end_block         - last block (including interior nodes)
**   root              - contents of root node
**
** If the root node is a leaf node, then start_block,
** leaves_end_block, and end_block are all 0.
**
**
**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches. Each increase in level represents exponentially
** more documents.
**
** New documents (actually, document updates) are tokenized and
** written individually (using LeafWriter) to a level 0 segment, with
** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
** level 0 segments are merged into a single level 1 segment. Level 1
** is populated like level 0, and eventually MERGE_COUNT level 1
** segments are merged to a single level 2 segment (representing
** MERGE_COUNT^2 updates), and so on.
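** (With the default MERGE_COUNT of 16, a level-2 segment thus stands
** for 16^2 = 256 document updates, a level-3 segment for 4096, and so
** on.)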
**
** A segment merge traverses all segments at a given level in
** parallel, performing a straightforward sorted merge. Since segment
** leaf nodes are written into the %_segments table in order, this
** merge traverses the underlying sqlite disk structures efficiently.
** After the merge, all segment blocks from the merged level are
** deleted.
**
** MERGE_COUNT controls how often we merge segments. 16 seems to be
** somewhat of a sweet spot for insertion performance. 32 and 64 show
** very similar performance numbers to 16 on insertion, though they're
** a tiny bit slower (perhaps due to more overhead in merge-time
** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
** 16, 2 about 66% slower than 16.
**
** At query time, high MERGE_COUNT increases the number of segments
** which need to be scanned and merged. For instance, with 100k docs
** inserted:
**
**    MERGE_COUNT   segments
**       16           25
**        8           12
**        4           10
**        2            6
**
** This appears to have only a moderate impact on queries for very
** frequent terms (which are somewhat dominated by segment merge
** costs), and infrequent and non-existent terms still seem to be fast
** even with many segments.
**
** TODO(shess) That said, it would be nice to have a better query-side
** argument for MERGE_COUNT of 16. Also, it is possible/likely that
** optimizations to things like doclist merging will swing the sweet
** spot around.
**
**
**
**** Handling of deletions and updates ****
** Since we're using a segmented structure, with no docid-oriented
** index into the term index, we clearly cannot simply update the term
** index when a document is deleted or updated. For deletions, we
** write an empty doclist (varint(docid) varint(POS_END)), for updates
** we simply write the new doclist. Segment merges overwrite older
** data for a particular docid with newer data, so deletes or updates
** will eventually overtake the earlier data and knock it out. The
** query logic likewise merges doclists so that newer data knocks out
** older data.
**
** TODO(shess) Provide a VACUUM type operation to clear out all
** deletions and duplications. This would basically be a forced merge
** into a single segment.
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts3.h"
#include "fts3_hash.h"
#include "fts3_tokenizer.h"
#ifndef SQLITE_CORE
# include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#endif


/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
** would be nice to order the file better, perhaps something along the
** lines of:
**
**   - utility functions
**   - table setup functions
**   - table update functions
**   - table query functions
**
** Put the query functions last because they're likely to reference
** typedefs or functions from the table update section.
*/

#if 0
# define FTSTRACE(A) printf A; fflush(stdout)
#else
# define FTSTRACE(A)
#endif

/*
** Default span for NEAR operators.
*/
#define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters. This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  return (c&0x80)==0 ? isspace(c) : 0;
}
static int safe_tolower(char c){
  return (c&0x80)==0 ? tolower(c) : c;
}
static int safe_isalnum(char c){
  return (c&0x80)==0 ? isalnum(c) : 0;
}

typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**    -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

enum {
  POS_END = 0,    /* end of this position list */
  POS_COLUMN,     /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16

/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
** my_function(SomeType *b){
**   memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
** }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))

#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10

/* Write a 64-bit variable-length integer to memory starting at p[0].
 * The length of data written will be between 1 and VARINT_MAX bytes.
 * The number of bytes written is returned. */
static int fts3PutVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}

/* Read a 64-bit variable-length integer from memory starting at p[0].
 * Return the number of bytes read, or 0 on error.
 * The value is stored in *v. */
static int fts3GetVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}
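
/* For illustration, a round-trip through these two functions looks
** something like (a sketch, with the byte values worked out by hand):
**
**   char buf[VARINT_MAX];
**   sqlite_int64 v;
**   int nWrite = fts3PutVarint(buf, 1000000);  // 3 bytes: C0 84 3D
**   int nRead = fts3GetVarint(buf, &v);        // nRead==3, v==1000000
**   assert( nWrite==nRead && v==1000000 );
*/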

static int fts3GetVarint32(const char *p, int *pi){
  sqlite_int64 i;
  int ret = fts3GetVarint(p, &i);
  *pi = (int) i;
  assert( *pi==i );
  return ret;
}

/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion. It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferSwap - swap contents of two buffers.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;

static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
  DataBuffer tmp = *pBuffer1;
  *pBuffer1 = *pBuffer2;
  *pBuffer2 = tmp;
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively. Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
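
/* Expected usage is something like (a sketch; error handling for a
** failed allocation is omitted, as in the functions above):
**
**   DataBuffer buf;
**   dataBufferInit(&buf, 0);
**   dataBufferAppend(&buf, "abc", 3);
**   dataBufferAppend2(&buf, "de", 2, "f", 1);  // buf holds "abcdef"
**   dataBufferReplace(&buf, "xyz", 3);         // data replaced, capacity kept
**   dataBufferDestroy(&buf);
*/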

/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}

static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* Append a list of strings separated by commas. */
static void appendList(StringBuffer *sb, int nString, char **azString){
  int i;
  for(i=0; i<nString; ++i){
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
         safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
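
/* Expected usage is something like (a sketch):
**
**   StringBuffer sb;
**   initStringBuffer(&sb);
**   append(&sb, "hello");
**   appendWhiteSpace(&sb);
**   append(&sb, "world  ");
**   trimWhiteSpace(&sb);
**   // stringBufferData(&sb)=="hello world", stringBufferLength(&sb)==11
**   stringBufferDestroy(&sb);
*/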

/*******************************************************************/
/* DLReader is used to read document elements from a doclist. The
** current docid is cached, so dlrDocid() is fast. DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to the next document.
** dlrInit - initialize for a doclist of the given type over the given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, DL_DEFAULT, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;
  const char *pData;
  int nData;

  sqlite_int64 iDocid;
  int nElement;
} DLReader;

static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster. This might matter (a tiny bit)
** for queries.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = fts3GetVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = fts3GetVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = fts3GetVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += fts3GetVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += fts3GetVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          n += fts3GetVarint32(pReader->pData+n, &iDummy);
          n += fts3GetVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data. There must be a first element. */
  dlrStep(pReader);
}
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}

#ifndef NDEBUG
/* Verify that the doclist can be validly decoded. Also returns the
** last docid found because it is convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = fts3GetVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += fts3GetVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += fts3GetVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += fts3GetVarint32(pData+n, &iDummy);
          n += fts3GetVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif

/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a given type doclist to a buffer.
** dlwDestroy - clear the writer's memory. Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;
  DataBuffer *b;
  sqlite_int64 iPrevDocid;
#ifndef NDEBUG
  int has_iPrevDocid;
#endif
} DLWriter;

static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* iFirstDocid is the first docid in the doclist in pData. It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData. It is
** needed to create the new iPrevDocid for future delta-encoding. The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;    /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = fts3GetVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = fts3PutVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid. This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else. Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = fts3PutVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
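
/* For example (a sketch), writing two docids through dlwAdd() stores
** the second as a delta from the first:
**
**   DataBuffer buf;
**   DLWriter writer;
**   dataBufferInit(&buf, 0);
**   dlwInit(&writer, DL_DOCIDS, &buf);
**   dlwAdd(&writer, 10);   // appends varint 10
**   dlwAdd(&writer, 15);   // appends varint 5 (delta from 10)
**   dlwDestroy(&writer);
**   dataBufferDestroy(&buf);
*/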

/*******************************************************************/
/* PLReader is used to read data from a document's position list. As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data. nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;

static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = fts3GetVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    n += fts3GetVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += fts3GetVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += fts3GetVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += fts3GetVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}

static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}

/*******************************************************************/
/* PLWriter is used in constructing a document's position list. As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information. Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;

/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += fts3PutVarint(c+n, POS_COLUMN);
    n += fts3PutVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += fts3PutVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += fts3PutVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += fts3PutVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = fts3PutVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* TODO(shess) Should plwDestroy() also terminate the doclist? But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct. But that would
** add incremental work to the common case with the only benefit being
** API elegance. Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = fts3PutVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
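
/* Expected usage is something like (a sketch; assumes dlw is a
** DLWriter of type DL_POSITIONS already attached to a buffer):
**
**   PLWriter plw;
**   plwInit(&plw, &dlw, 42);     // begin the poslist for docid 42
**   plwAdd(&plw, 0, 3, 0, 0);    // column 0, position 3
**   plwAdd(&plw, 1, 0, 0, 0);    // switch to column 1, position 0
**   plwTerminate(&plw);          // append POS_END
**   plwDestroy(&plw);
*/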

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;
  DLWriter dlw;
  PLWriter plw;
} DLCollector;

/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend(). I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way. I suspect it is the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = fts3PutVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  sqlite3_free(pCollector);
}
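
/* Expected usage during tokenization is something like (a sketch;
** doclist is assumed to be an already-initialized DataBuffer):
**
**   DLCollector *pCollector = dlcNew(1, DL_POSITIONS);
**   dlcAddPos(pCollector, 0, 0, 0, 0);    // docid 1: token at position 0
**   dlcNext(pCollector, 2);               // close docid 1, open docid 2
**   dlcAddPos(pCollector, 0, 5, 0, 0);    // docid 2: token at position 5
**   dlcAddDoclist(pCollector, &doclist);  // flush, adding final POS_END
**   dlcDelete(pCollector);
*/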

/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go. Only columns matching iColumn are
** copied, all columns copied if iColumn is -1. Elements with no
** matching columns are dropped. The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
/* Used by docListMerge() to keep doclists in ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;
} OrderedDLReader;

/* Order eof to end, then by docid asc, idx desc. */
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}

/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
** p[1..n-1] is already sorted.
*/
/* TODO(shess) Is this frequent enough to warrant a binary search?
** Before implementing that, instrument the code to check. In most
** current usage, I expect that p[0] will be less than p[1] a very
** high proportion of the time.
*/
static void orderedDLReaderReorder(OrderedDLReader *p, int n){
  while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
    OrderedDLReader tmp = p[0];
    p[0] = p[1];
    p[1] = tmp;
    n--;
    p++;
  }
}

/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid. pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs. Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer. memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
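
/* For example (a sketch; pOld/nOld and pNew/nNew are hypothetical
** doclists, oldest reader first):
**
**   DLReader readers[2];
**   DataBuffer out;
**   dataBufferInit(&out, 0);
**   dlrInit(&readers[0], DL_DEFAULT, pOld, nOld);
**   dlrInit(&readers[1], DL_DEFAULT, pNew, nNew);
**   docListMerge(&out, readers, 2);   // newer elements win on docid ties
**   dlrDestroy(&readers[0]);
**   dlrDestroy(&readers[1]);
*/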

/* Helper function for posListUnion(). Compares the current positions
** of left and right, returning, per the standard C idiom, <0 if
** left<right, >0 if left>right, and 0 if left==right. "End" always
** compares greater.
*/
static int posListCmp(PLReader *pLeft, PLReader *pRight){
  assert( pLeft->iType==pRight->iType );
  if( pLeft->iType==DL_DOCIDS ) return 0;

  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
  if( plrAtEnd(pRight) ) return -1;

  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;

  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
  if( pLeft->iType==DL_POSITIONS ) return 0;

  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;

  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;

  return 0;
}

/* Write the union of position lists in pLeft and pRight to pOut.
** "Union" in this case meaning "All unique position tuples". Should
** work with any doclist type, though both inputs and the output
** should be the same type.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}

/* Write the union of doclists in pLeft and pRight to pOut. For
** docids in common between the inputs, the union of the position
** lists is written. Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
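
/* For example (a sketch; pLeft/nLeft holding docids 1 and 3 and
** pRight/nRight holding docids 2 and 3 are hypothetical DL_DEFAULT
** doclists):
**
**   DataBuffer out;
**   dataBufferInit(&out, 0);
**   docListUnion(pLeft, nLeft, pRight, nRight, &out);
**   // out holds docids 1, 2, 3; position lists for docid 3 merged.
**   dataBufferDestroy(&out);
*/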

/*
** This function is used as part of the implementation of phrase and
** NEAR matching.
**
** pLeft and pRight are DLReaders positioned to the same docid in
** lists of type DL_POSITIONS. This function writes an entry to the
** DLWriter pOut for each position in pRight that is less than
** (nNear+1) greater (but not equal to or smaller) than a position
** in pLeft. For example, if nNear is 0, and the positions contained
** by pLeft and pRight are:
**
**   pLeft:  5 10 15 20
**   pRight: 6  9 17 21
**
** then the docid is added to pOut. If pOut is of type DL_POSITIONS,
** then the position ids "6" and "21" are also added to pOut.
**
** If boolean argument isSaveLeft is true, then position ids are copied
1398 ** from pLeft instead of pRight. In the example above, the positions "5" |
|
1399 ** and "20" would be added instead of "6" and "21". |
|
1400 */ |
|
1401 static void posListPhraseMerge( |
|
1402 DLReader *pLeft, |
|
1403 DLReader *pRight, |
|
1404 int nNear, |
|
1405 int isSaveLeft, |
|
1406 DLWriter *pOut |
|
1407 ){ |
|
1408 PLReader left, right; |
|
1409 PLWriter writer; |
|
1410 int match = 0; |
|
1411 |
|
1412 assert( dlrDocid(pLeft)==dlrDocid(pRight) ); |
|
1413 assert( pOut->iType!=DL_POSITIONS_OFFSETS ); |
|
1414 |
|
1415 plrInit(&left, pLeft); |
|
1416 plrInit(&right, pRight); |
|
1417 |
|
1418 while( !plrAtEnd(&left) && !plrAtEnd(&right) ){ |
|
1419 if( plrColumn(&left)<plrColumn(&right) ){ |
|
1420 plrStep(&left); |
|
1421 }else if( plrColumn(&left)>plrColumn(&right) ){ |
|
1422 plrStep(&right); |
|
1423 }else if( plrPosition(&left)>=plrPosition(&right) ){ |
|
1424 plrStep(&right); |
|
1425 }else{ |
|
1426 if( (plrPosition(&right)-plrPosition(&left))<=(nNear+1) ){ |
|
1427 if( !match ){ |
|
1428 plwInit(&writer, pOut, dlrDocid(pLeft)); |
|
1429 match = 1; |
|
1430 } |
|
1431 if( !isSaveLeft ){ |
|
1432 plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); |
|
1433 }else{ |
|
1434 plwAdd(&writer, plrColumn(&left), plrPosition(&left), 0, 0); |
|
1435 } |
|
1436 plrStep(&right); |
|
1437 }else{ |
|
1438 plrStep(&left); |
|
1439 } |
|
1440 } |
|
1441 } |
|
1442 |
|
1443 if( match ){ |
|
1444 plwTerminate(&writer); |
|
1445 plwDestroy(&writer); |
|
1446 } |
|
1447 |
|
1448 plrDestroy(&left); |
|
1449 plrDestroy(&right); |
|
1450 } |
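
/* Illustrative trace of the example in the header comment above
** (nNear==0, isSaveLeft==0, so the threshold is right-left<=1):
**
**   left=5,  right=6  : 6-5==1   -> emit "6", step right
**   left=5,  right=9  : 9-5==4   -> step left
**   left=10, right=9  : 10>=9    -> step right
**   left=10, right=17 : 17-10==7 -> step left
**   left=15, right=17 : 17-15==2 -> step left
**   left=20, right=21 : 21-20==1 -> emit "21", step right
**
** Only positions 6 and 21 are written, as described above.
*/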

/*
** Compare the values pointed to by the PLReaders passed as arguments.
** Return -1 if the value pointed to by pLeft is considered less than
** the value pointed to by pRight, +1 if it is considered greater
** than it, or 0 if it is equal. i.e.
**
**     (*pLeft - *pRight)
**
** A PLReader that is in the EOF condition is considered greater than
** any other. If neither argument is in EOF state, the return value of
** plrColumn() is used. If the plrColumn() values are equal, the
** comparison is on the basis of plrPosition().
*/
static int plrCompare(PLReader *pLeft, PLReader *pRight){
  assert(!plrAtEnd(pLeft) || !plrAtEnd(pRight));

  if( plrAtEnd(pRight) || plrAtEnd(pLeft) ){
    return (plrAtEnd(pRight) ? -1 : 1);
  }
  if( plrColumn(pLeft)!=plrColumn(pRight) ){
    return ((plrColumn(pLeft)<plrColumn(pRight)) ? -1 : 1);
  }
  if( plrPosition(pLeft)!=plrPosition(pRight) ){
    return ((plrPosition(pLeft)<plrPosition(pRight)) ? -1 : 1);
  }
  return 0;
}

/* We have two doclists with positions: pLeft and pRight. Depending
** on the value of the nNear parameter, perform either a phrase
** intersection (if nNear==0) or a NEAR intersection (if nNear>0)
** and write the results into pOut.
**
** A phrase intersection means that two documents only match
** if pLeft.iPos+1==pRight.iPos.
**
** A NEAR intersection means that two documents only match if
** (abs(pLeft.iPos-pRight.iPos)<nNear).
**
** If a NEAR intersection is requested, then the nPhrase argument should
** be passed the number of tokens in the two operands to the NEAR operator
** combined. For example:
**
**       Query syntax               nPhrase
**      ------------------------------------
**       "A B C" NEAR "D E"         5
**       A NEAR B                   2
**
** iType controls the type of data written to pOut. If iType is
** DL_POSITIONS, the positions are those from pRight.
*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  int nNear,            /* 0 for a phrase merge, non-zero for a NEAR merge */
  int nPhrase,          /* Number of tokens in left+right operands to NEAR */
  DocListType iType,    /* Type of doclist to write to pOut */
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      if( nNear==0 ){
        posListPhraseMerge(&left, &right, 0, 0, &writer);
      }else{
        /* This case occurs when two terms (simple terms or phrases) are
         * connected by a NEAR operator with span (nNear+1), e.g.:
         *
         *     '"terrible company" NEAR widget'
         */
        DataBuffer one = {0, 0, 0};
        DataBuffer two = {0, 0, 0};

        DLWriter dlwriter2;
        DLReader dr1 = {0, 0, 0, 0, 0};
        DLReader dr2 = {0, 0, 0, 0, 0};

        dlwInit(&dlwriter2, iType, &one);
        posListPhraseMerge(&right, &left, nNear-3+nPhrase, 1, &dlwriter2);
        dlwInit(&dlwriter2, iType, &two);
        posListPhraseMerge(&left, &right, nNear-1, 0, &dlwriter2);

        if( one.nData ) dlrInit(&dr1, iType, one.pData, one.nData);
        if( two.nData ) dlrInit(&dr2, iType, two.pData, two.nData);

        if( !dlrAtEnd(&dr1) || !dlrAtEnd(&dr2) ){
          PLReader pr1 = {0};
          PLReader pr2 = {0};

          PLWriter plwriter;
          plwInit(&plwriter, &writer, dlrDocid(dlrAtEnd(&dr1)?&dr2:&dr1));

          if( one.nData ) plrInit(&pr1, &dr1);
          if( two.nData ) plrInit(&pr2, &dr2);
          while( !plrAtEnd(&pr1) || !plrAtEnd(&pr2) ){
            int iCompare = plrCompare(&pr1, &pr2);
            switch( iCompare ){
              case -1:
                plwCopy(&plwriter, &pr1);
                plrStep(&pr1);
                break;
              case 1:
                plwCopy(&plwriter, &pr2);
                plrStep(&pr2);
                break;
              case 0:
                plwCopy(&plwriter, &pr1);
                plrStep(&pr1);
                plrStep(&pr2);
                break;
            }
          }
          plwTerminate(&plwriter);
        }
        dataBufferDestroy(&one);
        dataBufferDestroy(&two);
      }
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
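
/* Note: in the phrase case (nNear==0), the posListPhraseMerge() call
** above admits right positions with right>left and right-left<=1,
** i.e. exactly pLeft.iPos+1==pRight.iPos, matching the header comment.
** For example, if "full" occurs at positions {3, 9} and "text" at
** {4, 12}, only position 4 survives, so the document matches the
** phrase "full text".
*/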

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the intersection of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListAndMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
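
/* e.g. merging docid lists {1, 3, 5, 8} and {3, 4, 8} writes {3, 8}:
** the two readers advance in lock-step and a docid is emitted only
** when both are positioned on it.
*/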

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the union of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListOrMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write into pOut a DL_DOCIDS doclist containing all documents that
** occur in pLeft but not in pRight.
*/
static void docListExceptMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ) return;
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) ){
    while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }
    if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
    }
    dlrStep(&left);
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}
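
/* e.g. with pLeft={1, 2, 4, 7} and pRight={2, 3, 7}, the output is
** {1, 4}: each left docid is emitted unless the right list contains it.
*/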

static char *string_dup_n(const char *s, int n){
  char *str = sqlite3_malloc(n + 1);
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}

/* Duplicate a string; the caller must sqlite3_free() the returned string.
 * (We don't use strdup() since it is not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  return string_dup_n(s, strlen(s));
}

/* Format a string, replacing each occurrence of the % character with
 * zDb.zName. This may be more convenient than sqlite3_mprintf()
 * when one string is used repeatedly in a format string.
 * The caller must sqlite3_free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;
  char *result;
  char *r;

  /* first compute length needed */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = sqlite3_malloc(len);
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
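
/* e.g. string_format("delete from %_content", "main", "ft") returns
** "delete from main.ft_content"; every '%' in the format expands to
** "<zDb>.<zName>".
*/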

static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
                    const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  FTSTRACE(("FTS3 sql: %s\n", zCommand));
  rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  sqlite3_free(zCommand);
  return rc;
}

static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
                       sqlite3_stmt **ppStmt, const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  FTSTRACE(("FTS3 prepare: %s\n", zCommand));
  rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
  sqlite3_free(zCommand);
  return rc;
}

/* end utility functions */

/* Forward reference */
typedef struct fulltext_vtab fulltext_vtab;

/* A single term in a query is represented by an instance of
** the following structure. Each word which may match against
** document content is a term. Operators, like NEAR or OR, are
** not terms. Query terms are organized as a flat list stored
** in the Query.pTerms array.
**
** If the QueryTerm.nPhrase variable is non-zero, then the QueryTerm
** is the first in a contiguous string of terms that are either part
** of the same phrase, or connected by the NEAR operator.
**
** If the QueryTerm.nNear variable is non-zero, then the token is followed
** by a NEAR operator with span set to (nNear-1).
**
** The QueryTerm.iPhrase variable stores the index of the token within
** its phrase, indexed starting at 1, or 1 if the token is not part
** of any phrase.
**
** For example, the data structure used to represent the following query:
**
**     ... MATCH 'sqlite NEAR/5 google NEAR/2 "search engine"'
**
** is:
**
**     {nPhrase=4, iPhrase=1, nNear=6, pTerm="sqlite"},
**     {nPhrase=0, iPhrase=1, nNear=3, pTerm="google"},
**     {nPhrase=0, iPhrase=1, nNear=0, pTerm="search"},
**     {nPhrase=0, iPhrase=2, nNear=0, pTerm="engine"},
**
** Compiling the FTS3 syntax to Query structures is done by the parseQuery()
** function.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  short int nNear;   /* term followed by a NEAR operator with span=(nNear-1) */
  signed char isOr;     /* this term is preceded by "OR" */
  signed char isNot;    /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;          /* text of the term. '\000' terminated. malloced */
  int nTerm;            /* Number of bytes in pTerm[] */
} QueryTerm;


/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this. (Perhaps the
 * feeling is that a parenthesized expression is too complex an
 * idea for the average user to grasp.) Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do. So, for example:
 *
 *    [one two OR three] ==> one AND (two OR three)
 *    [one OR two three] ==> (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space. This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR. If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two] ==> one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms. Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextIsNear;       /* Set the nNear value on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;


/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    int iToken;          /* The index of the matching document token */
    short int nByte;     /* Number of bytes in the term */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;


typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_DOCID,     /* lookup by docid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;

typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,
  CONTENT_EXISTS_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,
  BLOCK_DELETE_ALL_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,
  SEGDIR_COUNT_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;

/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g. if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ NULL,  /* generated in contentSelectStatement() */
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where docid = ?",
  /* CONTENT_EXISTS */ "select docid from %_content limit 1",

  /* BLOCK_INSERT */
  "insert into %_segments (blockid, block) values (null, ?)",
  /* BLOCK_SELECT */ "select block from %_segments where blockid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?",
  /* BLOCK_DELETE_ALL */ "delete from %_segments",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT_LEVEL */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",

  /* NOTE(shess): The first three results of the following two
  ** statements must match.
  */
  /* SEGDIR_SELECT_SEGMENT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? and idx = ?",
  /* SEGDIR_SELECT_ALL */
  "select start_block, leaves_end_block, root from %_segdir "
  " order by level desc, idx asc",
  /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
  /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
};

/*
** A connection to a fulltext index is an instance of the following
** structure. The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names. malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open.
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges. We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where blockid between ? and ? order by blockid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data. It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead. When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized). iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts3Hash pendingTerms;
};

/*
** When the core wants to do a query, it creates a cursor using a
** call to xOpen. This structure is an instance of a cursor. It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;  /* Base class used by SQLite core */
  QueryType iCursorType;     /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;       /* Prepared statement in use by the cursor */
  int eof;                   /* True if at End Of Results */
  Query q;                   /* Parsed query string */
  Snippet snippet;           /* Cached snippet for the current row */
  int iColumn;               /* Column being searched */
  DataBuffer result;         /* Doclist results from fulltextQuery */
  DLReader reader;           /* Result reader if result not empty */
} fulltext_cursor;

static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
  return (fulltext_vtab *) c->base.pVtab;
}

static const sqlite3_module fts3Module;   /* forward declaration */

/* Return a dynamically generated statement of the form
 *   insert into %_content (docid, ...) values (?, ...)
 */
static const char *contentInsertStatement(fulltext_vtab *v){
  StringBuffer sb;
  int i;

  initStringBuffer(&sb);
  append(&sb, "insert into %_content (docid, ");
  appendList(&sb, v->nColumn, v->azContentColumn);
  append(&sb, ") values (?");
  for(i=0; i<v->nColumn; ++i)
    append(&sb, ", ?");
  append(&sb, ")");
  return stringBufferData(&sb);
}
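
/* For example, for a table declared with columns (subject, body), whose
** content columns are named c0subject and c1body (see parseSpec() below),
** the generated statement would be:
**
**   insert into %_content (docid, c0subject, c1body) values (?, ?, ?)
**
** (Illustrative only; the actual column names depend on the arguments
** given to CREATE VIRTUAL TABLE.)
*/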

/* Return a dynamically generated statement of the form
 *   select <content columns> from %_content where docid = ?
 */
static const char *contentSelectStatement(fulltext_vtab *v){
  StringBuffer sb;
  initStringBuffer(&sb);
  append(&sb, "SELECT ");
  appendList(&sb, v->nColumn, v->azContentColumn);
  append(&sb, " FROM %_content WHERE docid = ?");
  return stringBufferData(&sb);
}

/* Return a dynamically generated statement of the form
 *   update %_content set [col_0] = ?, [col_1] = ?, ...
 *                    where docid = ?
 */
static const char *contentUpdateStatement(fulltext_vtab *v){
  StringBuffer sb;
  int i;

  initStringBuffer(&sb);
  append(&sb, "update %_content set ");
  for(i=0; i<v->nColumn; ++i) {
    if( i>0 ){
      append(&sb, ", ");
    }
    append(&sb, v->azContentColumn[i]);
    append(&sb, " = ?");
  }
  append(&sb, " where docid = ?");
  return stringBufferData(&sb);
}

/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_SELECT_STMT:
        zStmt = contentSelectStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                     zStmt);
    if( zStmt != fulltext_zStatement[iStmt] ) sqlite3_free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}

/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK. Any
** other result, including SQLITE_ROW, is passed through unchanged.
** Useful for statements like UPDATE, where we expect no results.
*/
static int sql_single_step(sqlite3_stmt *s){
  int rc = sqlite3_step(s);
  return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}

/* Like sql_get_statement(), but for special replicated LEAF_SELECT
** statements. idx -1 is a special case for an uncached version of
** the statement (used in the optimize implementation).
*/
/* TODO(shess) Write version for generic statements and then share
** that between the cached-statement functions.
*/
static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                                  sqlite3_stmt **ppStmt){
  assert( idx>=-1 && idx<MERGE_COUNT );
  if( idx==-1 ){
    return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
  }else if( v->pLeafSelectStmts[idx]==NULL ){
    int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                         LEAF_SELECT);
    if( rc!=SQLITE_OK ) return rc;
  }else{
    int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pLeafSelectStmts[idx];
  return SQLITE_OK;
}

/* insert into %_content (docid, ...) values ([docid], [pValues])
** If the docid contains SQL NULL, then a unique docid will be
** generated.
*/
static int content_insert(fulltext_vtab *v, sqlite3_value *docid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_value(s, 1, docid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step(s);
}

/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 *                  where docid = [iDocid] */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iDocid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

static void freeStringArray(int nString, const char **pString){
  int i;

  for (i=0 ; i < nString ; ++i) {
    if( pString[i]!=NULL ) sqlite3_free((void *) pString[i]);
  }
  sqlite3_free((void *) pString);
}

/* select * from %_content where docid = [iDocid]
 * The caller must delete the returned array and all strings in it.
 * null fields will be NULL in the returned array.
 *
 * TODO: Perhaps we should return pointer/length strings here for consistency
 * with other code which uses pointer/length. */
static int content_select(fulltext_vtab *v, sqlite_int64 iDocid,
                          const char ***pValues){
  sqlite3_stmt *s;
  const char **values;
  int i;
  int rc;

  *pValues = NULL;

  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  values = (const char **) sqlite3_malloc(v->nColumn * sizeof(const char *));
  for(i=0; i<v->nColumn; ++i){
    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
      values[i] = NULL;
    }else{
      values[i] = string_dup((char*)sqlite3_column_text(s, i));
    }
  }

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ){
    *pValues = values;
    return SQLITE_OK;
  }

  freeStringArray(v->nColumn, values);
  return rc;
}

/* delete from %_content where docid = [iDocid] */
static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iDocid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
** no rows exist, and any error in case of failure.
*/
static int content_exists(fulltext_vtab *v){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ROW;
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  return rc;
}

/* insert into %_segments values ([pData])
** returns assigned blockid in *piBlockid
*/
static int block_insert(fulltext_vtab *v, const char *pData, int nData,
                        sqlite_int64 *piBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  /* blockid column is an alias for rowid. */
  *piBlockid = sqlite3_last_insert_rowid(v->db);
  return SQLITE_OK;
}

/* delete from %_segments
**   where blockid between [iStartBlockid] and [iEndBlockid]
**
** Deletes the inclusive range of blocks; used to delete the blocks
** which form a segment.
*/
static int block_delete(fulltext_vtab *v,
                        sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 2, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel. Returns SQLITE_DONE if there are no segments at
** iLevel. Otherwise returns an error.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}

/* insert into %_segdir values (
**   [iLevel], [idx],
**   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
**   [pRootData]
** )
*/
static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
                      sqlite_int64 iStartBlockid,
                      sqlite_int64 iLeavesEndBlockid,
                      sqlite_int64 iEndBlockid,
                      const char *pRootData, int nRootData){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, idx);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 3, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 5, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Queries %_segdir for the block span of the segments in level
** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row. We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2;
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}

/* Delete the segment blocks and segment directory records for all
** segments at iLevel.
*/
static int segdir_delete(fulltext_vtab *v, int iLevel){
  sqlite3_stmt *s;
  sqlite_int64 iStartBlockid, iEndBlockid;
  int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
  if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;

  if( rc==SQLITE_ROW ){
    rc = block_delete(v, iStartBlockid, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* Delete the segment directory itself. */
  rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Delete the entire fts index. Returns SQLITE_OK on success, the
** relevant error code on failure.
*/
static int segdir_delete_all(fulltext_vtab *v){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_single_step(s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Returns SQLITE_OK with *pnSegments set to the number of entries in
** %_segdir and *piMaxLevel set to the highest level which has a
** segment. Otherwise returns the SQLite error which caused failure.
*/
static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* TODO(shess): This case should not be possible? Should stronger
  ** measures be taken if it happens?
  */
  if( rc==SQLITE_DONE ){
    *pnSegments = 0;
    *piMaxLevel = 0;
    return SQLITE_OK;
  }
  if( rc!=SQLITE_ROW ) return rc;

  *pnSegments = sqlite3_column_int(s, 0);
  *piMaxLevel = sqlite3_column_int(s, 1);

  /* We expect only one row. We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_OK;
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  return rc;
}

/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file. Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/
static int clearPendingTerms(fulltext_vtab *v);

/*
** Free the memory used to contain a fulltext_vtab structure.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  FTSTRACE(("FTS3 Destroy %p\n", v));
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  for( i=0; i<MERGE_COUNT; i++ ){
    if( v->pLeafSelectStmts[i]!=NULL ){
      sqlite3_finalize(v->pLeafSelectStmts[i]);
      v->pLeafSelectStmts[i] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  clearPendingTerms(v);

  sqlite3_free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  sqlite3_free(v->azContentColumn);
  sqlite3_free(v);
}

/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** ftsIdChar(X) will be true. Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier. For 7-bit characters,
** isFtsIdChar[X] must be 1.
**
** Ticket #1066. The SQL standard does not allow '$' in the
** middle of identifiers. But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
static const char isFtsIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define ftsIdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isFtsIdChar[c-0x20]))


/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
*/
static int ftsGetToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      return i + (c!=0);
    }
    case '[': {
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !ftsIdChar(*z) ){
        break;
      }
      for(i=1; ftsIdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  *tokenType = TOKEN_PUNCT;
  return 1;
}
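
/* Examples (illustrative): ftsGetToken("tokenize(...)", &t) sets t to
** TOKEN_ID and returns 8; ftsGetToken("'ab''cd'", &t) sets t to
** TOKEN_STRING and returns 8, keeping the doubled quote inside the
** token (dequoteString() collapses it later); ftsGetToken(",", &t)
** sets t to TOKEN_PUNCT and returns 1.
*/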

/*
** A token extracted from a string is an instance of the following
** structure.
*/
typedef struct FtsToken {
  const char *z;  /* Pointer to token text. Not '\000' terminated */
  short int n;    /* Length of the token text in bytes. */
} FtsToken;

/*
** Given an input string (which is really one of the argv[] parameters
** passed into xConnect or xCreate) split the string up into tokens.
** Return an array of pointers to '\000' terminated strings, one string
** for each non-whitespace token.
**
** The returned array is terminated by a single NULL pointer.
**
** Space to hold the returned array is obtained from a single
** malloc and should be freed by passing the return value to
** sqlite3_free(). The individual strings within the token list are
** all a part of the single memory allocation and will all be freed
** at once.
*/
static char **tokenizeString(const char *z, int *pnToken){
  int nToken = 0;
  FtsToken *aToken = sqlite3_malloc( strlen(z) * sizeof(aToken[0]) );
  int n = 1;
  int e, i;
  int totalSize = 0;
  char **azToken;
  char *zCopy;
  while( n>0 ){
    n = ftsGetToken(z, &e);
    if( e!=TOKEN_SPACE ){
      aToken[nToken].z = z;
      aToken[nToken].n = n;
      nToken++;
      totalSize += n+1;
    }
    z += n;
  }
  azToken = (char**)sqlite3_malloc( nToken*sizeof(char*) + totalSize );
  zCopy = (char*)&azToken[nToken];
  nToken--;  /* Drop the zero-length TOKEN_EOF entry recorded above. */
  for(i=0; i<nToken; i++){
    azToken[i] = zCopy;
    n = aToken[i].n;
    memcpy(zCopy, aToken[i].z, n);
    zCopy[n] = 0;
    zCopy += n+1;
  }
  azToken[nToken] = 0;
  sqlite3_free(aToken);
  *pnToken = nToken;
  return azToken;
}

/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters. The conversion is done in-place. If the
** input does not begin with a quote character, then this routine
** is a no-op.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int quote;
  int i, j;
  if( z==0 ) return;
  quote = z[0];
  switch( quote ){
    case '\'':  break;
    case '"':   break;
    case '`':   break;                /* For MySQL compatibility */
    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
    default:    return;
  }
  for(i=1, j=0; z[i]; i++){
    if( z[i]==quote ){
      if( z[i+1]==quote ){
        z[j++] = quote;
        i++;
      }else{
        z[j++] = 0;
        break;
      }
    }else{
      z[j++] = z[i];
    }
  }
}

/*
** The input azIn is a NULL-terminated list of tokens. Remove the first
** token and all punctuation tokens. Remove the quotes from
** around string literal tokens.
**
** Example:
**
**     input:   tokenize chinese ( 'simplified' , 'mixed' )
**     output:  chinese simplified mixed
**
** Another example:
**
**     input:   delimiters ( '[' , ']' , '...' )
**     output:  [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    azIn[j] = 0;
  }
}


/*
** Find the first alphanumeric token in the string zIn. Null-terminate
** this token. Remove any quotation marks. And return a pointer to
** the result.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = ftsGetToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;
      *pzTail = &zIn[1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}

/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not alphanumeric
**      and is not '_'
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ){ s++; }
  while( *t ){
    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  }
  return *s!='_' && !safe_isalnum(*s);
}
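
/* e.g. startsWith(" tokenize porter", "tokenize") is true (the next
** character is a space), while startsWith("tokenizer", "tokenize") is
** false because the character after the prefix is alphanumeric.
*/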

/*
** An instance of this structure defines the "spec" of a
** full text index. This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;

/*
** Reclaim all of the memory used by a TableSpec
*/
static void clearTableSpec(TableSpec *p) {
  sqlite3_free(p->azColumn);
  sqlite3_free(p->azContentColumn);
  sqlite3_free(p->azTokenizer);
}

/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 *   CREATE VIRTUAL TABLE email
 *        USING fts3(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 *
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient. We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = sqlite3_malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    azArg[0] = "content";
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name. "sanitized" means that special characters are
  ** converted to "_". The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary. It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
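  /* e.g. a column declared as "user name" at index 2 becomes the
  ** content column c2user_name: the space fails safe_isalnum() in the
  ** loop below and is rewritten as '_'.
  */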
  pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}

/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table. Return a pointer to this schema string.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *zSchema, *zNext;
  const char *zSep = "(";
  zSchema = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
    sqlite3_free(zSchema);
    zSchema = zNext;
    zSep = ",";
  }
  zNext = sqlite3_mprintf("%s,%Q HIDDEN", zSchema, zTableName);
  sqlite3_free(zSchema);
  zSchema = zNext;
  zNext = sqlite3_mprintf("%s,docid HIDDEN)", zSchema);
  sqlite3_free(zSchema);
  return zNext;
}
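
/* e.g. for columns (subject, body) in a table named "email", the
** generated schema string would be (illustrative; %Q supplies the
** quoting):
**
**   CREATE TABLE x('subject','body','email' HIDDEN,docid HIDDEN)
*/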

/*
** Build a new sqlite3_vtab structure that will describe the
** fulltext index defined by spec.
*/
static int constructVtab(
  sqlite3 *db,              /* The SQLite database connection */
  fts3Hash *pHash,          /* Hash table containing tokenizers */
  TableSpec *spec,          /* Parsed spec information from parseSpec() */
  sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
  char **pzErr              /* Write any error message here */
){
  int rc;
  int n;
  fulltext_vtab *v = 0;
  const sqlite3_tokenizer_module *m = NULL;
  char *schema;

  char const *zTok;         /* Name of tokenizer to use for this fts table */
  int nTok;                 /* Length of zTok, including nul terminator */

  v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
  if( v==0 ) return SQLITE_NOMEM;
  CLEAR(v);
  /* sqlite will initialize v->base */
  v->db = db;
  v->zDb = spec->zDb;       /* Freed when azColumn is freed */
  v->zName = spec->zName;   /* Freed when azColumn is freed */
  v->nColumn = spec->nColumn;
  v->azContentColumn = spec->azContentColumn;
  spec->azContentColumn = 0;
  v->azColumn = spec->azColumn;
  spec->azColumn = 0;

  if( spec->azTokenizer==0 ){
    rc = SQLITE_NOMEM;
    goto err;   /* release v and the strings already transferred from spec */
  }

  zTok = spec->azTokenizer[0];
  if( !zTok ){
    zTok = "simple";
  }
  nTok = strlen(zTok)+1;

  m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok);
  if( !m ){
    *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
    rc = SQLITE_ERROR;
    goto err;
  }

  for(n=0; spec->azTokenizer[n]; n++){}
  if( n ){
    rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
                    &v->pTokenizer);
  }else{
    rc = m->xCreate(0, 0, &v->pTokenizer);
  }
  if( rc!=SQLITE_OK ) goto err;
  v->pTokenizer->pModule = m;

  /* TODO: verify the existence of backing tables foo_content, foo_term */

  schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
                          spec->zName);
  rc = sqlite3_declare_vtab(db, schema);
  sqlite3_free(schema);
  if( rc!=SQLITE_OK ) goto err;

  memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));

  /* Indicate that the buffer is not live. */
  v->nPendingData = -1;

  *ppVTab = &v->base;
  FTSTRACE(("FTS3 Connect %p\n", v));

  return rc;

err:
  fulltext_vtab_destroy(v);
  return rc;
}

static int fulltextConnect(
  sqlite3 *db,
  void *pAux,
  int argc, const char *const*argv,
  sqlite3_vtab **ppVTab,
  char **pzErr
){
  TableSpec spec;
  int rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
  clearTableSpec(&spec);
  return rc;
}

/* The %_content table holds the text of each document, with
** the docid column exposed as the SQLite rowid for the table.
*/
/* TODO(shess) This comment needs elaboration to match the updated
** code.  Work it into the top-of-file comment at that time.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  FTSTRACE(("FTS3 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  append(&schema, " docid INTEGER PRIMARY KEY,");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
  stringBufferDestroy(&schema);
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segments("
                " blockid INTEGER PRIMARY KEY,"
                " block blob"
                ");"
                );
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segdir("
                " level integer,"
                " idx integer,"
                " start_block integer,"
                " leaves_end_block integer,"
                " end_block integer,"
                " root blob,"
                " primary key(level, idx)"
                ");");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
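/* For example, CREATE VIRTUAL TABLE pages USING fts3(title, body)
** creates three backing tables alongside the virtual table:
**
**   pages_content(docid INTEGER PRIMARY KEY, c0title, c1body)
**   pages_segments(blockid INTEGER PRIMARY KEY, block blob)
**   pages_segdir(level, idx, start_block, leaves_end_block,
**                end_block, root, primary key(level, idx))
*/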

/* Decide how to handle an SQL query. */
static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  fulltext_vtab *v = (fulltext_vtab *)pVTab;
  int i;
  FTSTRACE(("FTS3 BestIndex\n"));

  for(i=0; i<pInfo->nConstraint; ++i){
    const struct sqlite3_index_constraint *pConstraint;
    pConstraint = &pInfo->aConstraint[i];
    if( pConstraint->usable ) {
      if( (pConstraint->iColumn==-1 || pConstraint->iColumn==v->nColumn+1) &&
          pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
        pInfo->idxNum = QUERY_DOCID;      /* lookup by docid */
        FTSTRACE(("FTS3 QUERY_DOCID\n"));
      } else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn &&
                 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
        /* full-text search */
        pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
        FTSTRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
      } else continue;

      pInfo->aConstraintUsage[i].argvIndex = 1;
      pInfo->aConstraintUsage[i].omit = 1;

      /* An arbitrary value for now.
      * TODO: Perhaps docid matches should be considered cheaper than
      * full-text searches. */
      pInfo->estimatedCost = 1.0;

      return SQLITE_OK;
    }
  }
  pInfo->idxNum = QUERY_GENERIC;
  return SQLITE_OK;
}
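/* For example, against a two-column table t(a,b):
**
**   SELECT * FROM t WHERE rowid=5       -> idxNum==QUERY_DOCID
**   SELECT * FROM t WHERE b MATCH 'x'   -> idxNum==QUERY_FULLTEXT+1
**   SELECT * FROM t WHERE t MATCH 'x'   -> idxNum==QUERY_FULLTEXT+2
**   SELECT * FROM t                     -> idxNum==QUERY_GENERIC
*/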

static int fulltextDisconnect(sqlite3_vtab *pVTab){
  FTSTRACE(("FTS3 Disconnect %p\n", pVTab));
  fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  return SQLITE_OK;
}

static int fulltextDestroy(sqlite3_vtab *pVTab){
  fulltext_vtab *v = (fulltext_vtab *)pVTab;
  int rc;

  FTSTRACE(("FTS3 Destroy %p\n", pVTab));
  rc = sql_exec(v->db, v->zDb, v->zName,
                "drop table if exists %_content;"
                "drop table if exists %_segments;"
                "drop table if exists %_segdir;"
                );
  if( rc!=SQLITE_OK ) return rc;

  fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  return SQLITE_OK;
}

static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
  fulltext_cursor *c;

  c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
  if( c ){
    memset(c, 0, sizeof(fulltext_cursor));
    /* sqlite will initialize c->base */
    *ppCursor = &c->base;
    FTSTRACE(("FTS3 Open %p: %p\n", pVTab, c));
    return SQLITE_OK;
  }else{
    return SQLITE_NOMEM;
  }
}


/* Free all of the dynamically allocated memory held by *q
*/
static void queryClear(Query *q){
  int i;
  for(i = 0; i < q->nTerms; ++i){
    sqlite3_free(q->pTerms[i].pTerm);
  }
  sqlite3_free(q->pTerms);
  CLEAR(q);
}

/* Free all of the dynamically allocated memory held by the
** Snippet
*/
static void snippetClear(Snippet *p){
  sqlite3_free(p->aMatch);
  sqlite3_free(p->zOffset);
  sqlite3_free(p->zSnippet);
  CLEAR(p);
}
/*
** Append a single entry to the p->aMatch[] log.
*/
static void snippetAppendMatch(
  Snippet *p,               /* Append the entry to this snippet */
  int iCol, int iTerm,      /* The column and query term */
  int iToken,               /* Matching token in document */
  int iStart, int nByte     /* Offset and size of the match */
){
  int i;
  struct snippetMatch *pMatch;
  if( p->nMatch+1>=p->nAlloc ){
    p->nAlloc = p->nAlloc*2 + 10;
    p->aMatch = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
    if( p->aMatch==0 ){
      p->nMatch = 0;
      p->nAlloc = 0;
      return;
    }
  }
  i = p->nMatch++;
  pMatch = &p->aMatch[i];
  pMatch->iCol = iCol;
  pMatch->iTerm = iTerm;
  pMatch->iToken = iToken;
  pMatch->iStart = iStart;
  pMatch->nByte = nByte;
}

/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
*/
#define FTS3_ROTOR_SZ (32)
#define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1)
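/* FTS3_ROTOR_SZ must be a power of two so that (i & FTS3_ROTOR_MASK)
** is equivalent to (i % FTS3_ROTOR_SZ).  The rotor retains the byte
** offsets of the most recent 32 tokens so that, when the final token
** of a phrase matches, the offsets of the earlier phrase tokens can
** still be recovered.
*/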

/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                      /* The full text index */
  int nColumn;                               /* Number of columns in the index */
  const QueryTerm *aTerm;                    /* Query string terms */
  int nTerm;                                 /* Number of query string terms */
  int i, j;                                  /* Loop counters */
  int rc;                                    /* Return code */
  unsigned int match, prevMatch;             /* Phrase search bitmasks */
  const char *zToken;                        /* Next token from the tokenizer */
  int nToken;                                /* Size of zToken */
  int iBegin, iEnd, iPos;                    /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;                   /* Index of current token */
  int iRotorBegin[FTS3_ROTOR_SZ];            /* Beginning offset of token */
  int iRotorLen[FTS3_ROTOR_SZ];              /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  if( nTerm>=FTS3_ROTOR_SZ ){
    nTerm = FTS3_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS3_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j,
                             iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
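/* A worked example of the bitmask logic above: for the phrase query
** "cold fusion", aTerm[0] is "cold" and aTerm[1] is "fusion" (with
** iPhrase==2).  When "cold" matches token N, bit 0 is set in match;
** prevMatch for token N+1 therefore has bit 1 set, so "fusion" at
** token N+1 passes the (prevMatch & (1<<i)) test and both token
** offsets are logged from the rotor by snippetAppendMatch().
*/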

/*
** Remove entries from the pSnippet structure to account for the NEAR
** operator.  When this is called, pSnippet contains the list of token
** offsets produced by treating all NEAR operators as AND operators.
** This function removes any entries that should not be present after
** accounting for the NEAR restriction.  For example, if the queried
** document is:
**
**     "A B C D E A"
**
** and the query is:
**
**     A NEAR/0 E
**
** then when this function is called the Snippet contains token offsets
** 0, 4 and 5.  This function removes the "0" entry (because the first A
** is not near enough to an E).
*/
static void trimSnippetOffsetsForNear(Query *pQuery, Snippet *pSnippet){
  int ii;
  int iDir = 1;

  while(iDir>-2) {
    assert( iDir==1 || iDir==-1 );
    for(ii=0; ii<pSnippet->nMatch; ii++){
      int jj;
      int nNear;
      struct snippetMatch *pMatch = &pSnippet->aMatch[ii];
      QueryTerm *pQueryTerm = &pQuery->pTerms[pMatch->iTerm];

      if( (pMatch->iTerm+iDir)<0
       || (pMatch->iTerm+iDir)>=pQuery->nTerms
      ){
        continue;
      }

      nNear = pQueryTerm->nNear;
      if( iDir<0 ){
        nNear = pQueryTerm[-1].nNear;
      }

      if( pMatch->iTerm>=0 && nNear ){
        int isOk = 0;
        int iNextTerm = pMatch->iTerm+iDir;
        int iPrevTerm = iNextTerm;

        int iEndToken;
        int iStartToken;

        if( iDir<0 ){
          int nPhrase = 1;
          iStartToken = pMatch->iToken;
          while( (pMatch->iTerm+nPhrase)<pQuery->nTerms
              && pQuery->pTerms[pMatch->iTerm+nPhrase].iPhrase>1
          ){
            nPhrase++;
          }
          iEndToken = iStartToken + nPhrase - 1;
        }else{
          iEndToken = pMatch->iToken;
          iStartToken = pMatch->iToken+1-pQueryTerm->iPhrase;
        }

        while( pQuery->pTerms[iNextTerm].iPhrase>1 ){
          iNextTerm--;
        }
        while( (iPrevTerm+1)<pQuery->nTerms &&
               pQuery->pTerms[iPrevTerm+1].iPhrase>1
        ){
          iPrevTerm++;
        }

        for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){
          struct snippetMatch *p = &pSnippet->aMatch[jj];
          if( p->iCol==pMatch->iCol && ((
                p->iTerm==iNextTerm &&
                p->iToken>iEndToken &&
                p->iToken<=iEndToken+nNear
              ) || (
                p->iTerm==iPrevTerm &&
                p->iToken<iStartToken &&
                p->iToken>=iStartToken-nNear
          ))){
            isOk = 1;
          }
        }
        if( !isOk ){
          for(jj=1-pQueryTerm->iPhrase; jj<=0; jj++){
            pMatch[jj].iTerm = -1;
          }
          ii = -1;
          iDir = 1;
        }
      }
    }
    iDir -= 2;
  }
}

/*
** Compute all offsets for the current row of the query.
** If the offsets have already been computed, this routine is a no-op.
*/
static void snippetAllOffsets(fulltext_cursor *p){
  int nColumn;
  int iColumn, i;
  int iFirst, iLast;
  fulltext_vtab *pFts;

  if( p->snippet.nMatch ) return;
  if( p->q.nTerms==0 ) return;
  pFts = p->q.pFts;
  nColumn = pFts->nColumn;
  iColumn = (p->iCursorType - QUERY_FULLTEXT);
  if( iColumn<0 || iColumn>=nColumn ){
    iFirst = 0;
    iLast = nColumn-1;
  }else{
    iFirst = iColumn;
    iLast = iColumn;
  }
  for(i=iFirst; i<=iLast; i++){
    const char *zDoc;
    int nDoc;
    zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
    nDoc = sqlite3_column_bytes(p->pStmt, i+1);
    snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
  }

  trimSnippetOffsetsForNear(&p->q, &p->snippet);
}

/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1].
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;
  initStringBuffer(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    if( pMatch->iTerm>=0 ){
      /* If snippetMatch.iTerm is less than 0, then the match was
      ** discarded as part of processing the NEAR operator (see the
      ** trimSnippetOffsetsForNear() function for details).  Ignore
      ** it in this case.
      */
      zBuf[0] = ' ';
      sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
          pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
      append(&sb, zBuf);
      cnt++;
    }
  }
  p->zOffset = stringBufferData(&sb);
  p->nOffset = stringBufferLength(&sb);
}
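/* The zOffset string built above is the format returned by the
** offsets() SQL function: one "column term start bytes" quadruple per
** match, all space-separated.  For example, "0 0 4 5 1 0 22 5" records
** two matches for term 0, one in column 0 at byte 4 and one in
** column 1 at byte 22, each 5 bytes long.
*/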

/*
** zDoc[0..nDoc-1] is a phrase of text.  aMatch[0..nMatch-1] is a set
** of matching words, some of which may be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is a suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}



/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */

/*
** Generate the text of a snippet.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol;
  int tailOffset;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int tailEllipsis = 0;
  int iMatch;


  sqlite3_free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
          && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
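/* For example, with zStartMark="<b>", zEndMark="</b>" and
** zEllipsis="<b>...</b>" (the values the snippet() SQL function
** passes by default), a match on "database" might render as:
**
**   <b>...</b>an embedded SQL <b>database</b> engine<b>...</b>
**
** The 40-byte windows and the wordBoundary() adjustment keep each
** excerpt to roughly 80 bytes of context around a match.
*/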


/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
*/
static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  FTSTRACE(("FTS3 Close %p\n", c));
  sqlite3_finalize(c->pStmt);
  queryClear(&c->q);
  snippetClear(&c->snippet);
  if( c->result.nData!=0 ) dlrDestroy(&c->reader);
  dataBufferDestroy(&c->result);
  sqlite3_free(c);
  return SQLITE_OK;
}

static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  FTSTRACE(("FTS3 Next %p\n", pCursor));
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){  /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* an error occurred; abort */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}


/* TODO(shess) If we pushed LeafReader to the top of the file, or to
** another file, term_select() could be pushed above
** docListOfTerm().
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out);

/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.
*/
static int docListOfTerm(
  fulltext_vtab *v,    /* The full text index */
  int iColumn,         /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult  /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  (0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS), &left);
  if( rc ) return rc;
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    /* If this token is connected to the next by a NEAR operator, and
    ** the next token is the start of a phrase, then set nPhraseRight
    ** to the number of tokens in the phrase.  Otherwise leave it at 1.
    */
    int nPhraseRight = 1;
    while( (i+nPhraseRight)<=pQTerm->nPhrase
        && pQTerm[i+nPhraseRight].nNear==0
    ){
      nPhraseRight++;
    }

    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       pQTerm[i-1].nNear, pQTerm[i-1].iPhrase + nPhraseRight,
                       ((i<pQTerm->nPhrase) ? DL_POSITIONS : DL_DOCIDS),
                       &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  *pResult = left;
  return SQLITE_OK;
}
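/* For example, for the phrase query "full text search", the loop above
** first computes a DL_POSITIONS doclist for "full", then phrase-merges
** it with the doclist for "text" (keeping only docids where "text"
** appears at the next position), and finally merges "search", at which
** point the positions are dropped and a DL_DOCIDS doclist is returned.
*/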

/* Add a new term pTerm[0..nTerm-1] to the query *q.
*/
static void queryAdd(Query *q, const char *pTerm, int nTerm){
  QueryTerm *t;
  ++q->nTerms;
  q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
  if( q->pTerms==0 ){
    q->nTerms = 0;
    return;
  }
  t = &q->pTerms[q->nTerms - 1];
  CLEAR(t);
  t->pTerm = sqlite3_malloc(nTerm+1);
  if( t->pTerm==0 ){
    q->nTerms--;   /* out of memory: drop the term rather than write
                   ** through a NULL pointer */
    return;
  }
  memcpy(t->pTerm, pTerm, nTerm);
  t->pTerm[nTerm] = 0;
  t->nTerm = nTerm;
  t->isOr = q->nextIsOr;
  t->isPrefix = 0;
  q->nextIsOr = 0;
  t->iColumn = q->nextColumn;
  q->nextColumn = q->dfltColumn;
}

/*
** Check to see if the string zToken[0...nToken-1] matches any
** column name in the virtual table.  If it does,
** return the zero-indexed column number.  If not, return -1.
*/
static int checkColumnSpecifier(
  fulltext_vtab *pVtab,    /* The virtual table */
  const char *zToken,      /* Text of the token */
  int nToken               /* Number of characters in the token */
){
  int i;
  for(i=0; i<pVtab->nColumn; i++){
    if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
        && pVtab->azColumn[i][nToken]==0 ){
      return i;
    }
  }
  return -1;
}

/*
** Parse the text at zSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if zSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,       /* The tokenizer to use */
  const char *zSegment, int nSegment,  /* Query expression being parsed */
  int inPhrase,                        /* True if within "..." */
  Query *pQuery                        /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;
  int iCol;
  int nTerm = 1;

  int rc = pModule->xOpen(pTokenizer, zSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *zToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &zToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    if( !inPhrase &&
        zSegment[iEnd]==':' &&
        (iCol = checkColumnSpecifier(pQuery->pFts, zToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
        && zSegment[iBegin+0]=='O'
        && zSegment[iBegin+1]=='R'
    ){
      pQuery->nextIsOr = 1;
      continue;
    }
    if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4
        && memcmp(&zSegment[iBegin], "NEAR", 4)==0
    ){
      QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1];
      if( (iBegin+6)<nSegment
          && zSegment[iBegin+4] == '/'
          && isdigit(zSegment[iBegin+5])
      ){
        int k;
        pTerm->nNear = 0;
        for(k=5; (iBegin+k)<=nSegment && isdigit(zSegment[iBegin+k]); k++){
          pTerm->nNear = pTerm->nNear*10 + (zSegment[iBegin+k] - '0');
        }
        pModule->xNext(pCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
      } else {
        pTerm->nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM;
      }
      pTerm->nNear++;
      continue;
    }

    queryAdd(pQuery, zToken, nToken);
    if( !inPhrase && iBegin>0 && zSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    if( iEnd<nSegment && zSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
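/* Informally, the query syntax recognized above is:
**
**   apple banana            -- both terms required (implicit AND)
**   apple OR banana         -- either term
**   apple -banana           -- "apple" but not "banana"
**   "apple pie" title:tart  -- a quoted phrase; a term restricted to a column
**   data*                   -- prefix match
**   apple NEAR/2 pie        -- terms within 2 tokens of each other (a bare
**                              NEAR uses SQLITE_FTS3_DEFAULT_NEAR_PARAM)
*/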

/* Parse a query string, yielding a Query object pQuery.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
static int parseQuery(
  fulltext_vtab *v,      /* The fulltext index */
  const char *zInput,    /* Input text of the query string */
  int nInput,            /* Size of the input text */
  int dfltColumn,        /* Default column of the index to match against */
  Query *pQuery          /* Write the parse results here. */
){
  int iInput, inPhrase = 0;
  int ii;
  QueryTerm *aTerm;

  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                      pQuery);
    }
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }

  /* Modify the values of the QueryTerm.nPhrase variables to account for
  ** the NEAR operator.  For the purposes of QueryTerm.nPhrase, phrases
  ** and tokens connected by the NEAR operator are handled as a single
  ** phrase.  See comments above the QueryTerm structure for details.
  */
  aTerm = pQuery->pTerms;
  for(ii=0; ii<pQuery->nTerms; ii++){
    if( aTerm[ii].nNear || aTerm[ii].nPhrase ){
      while (aTerm[ii+aTerm[ii].nPhrase].nNear) {
        aTerm[ii].nPhrase += (1 + aTerm[ii+aTerm[ii].nPhrase+1].nPhrase);
      }
    }
  }

  return SQLITE_OK;
}
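/* For example, for the query ["sqlite db" NEAR/2 index], the fix-up
** loop above folds the NEAR-connected tokens into one logical phrase:
** the first term's nPhrase grows from 1 (covering "db") to 2 (covering
** "db" and "index"), so that docListOfTerm() evaluates all three
** tokens together, applying the NEAR distance between "db" and
** "index".
*/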

/* TODO(shess) Refactor the code to remove this forward decl. */
static int flushPendingTerms(fulltext_vtab *v);

/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;
  int nNot = 0;
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  *pResult = left;
  return rc;
}
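/* For example, the query [apple OR pear banana -cherry] is evaluated
** above as:
**
**   left = doclist(apple) UNION doclist(pear)    (OR pass)
**   left = left INTERSECT doclist(banana)        (AND pass)
**   left = left EXCEPT doclist(cherry)           (NOT pass)
*/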

/*
** This is the xFilter interface for the virtual table.  See
** the virtual table xFilter method documentation for additional
** information.
**
** If idxNum==QUERY_GENERIC then do a full table scan against
** the %_content table.
**
** If idxNum==QUERY_DOCID then do a docid lookup for a single entry
** in the %_content table.
**
** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
** column on the left-hand side of the MATCH operator is column
** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
** side of the MATCH operator.
*/
/* TODO(shess) Upgrade the cursor initialization and destruction to
** account for fulltextFilter() being called multiple times on the
** same cursor.  The current solution is very fragile.  Apply fix to
** fts3 as appropriate.
*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;

  FTSTRACE(("FTS3 Filter %p\n",pCursor));

  /* If the cursor has a statement that was not prepared according to
  ** idxNum, clear it.  I believe all calls to fulltextFilter with a
  ** given cursor will have the same idxNum, but in this case it's
  ** easy to be safe.
  */
  if( c->pStmt && c->iCursorType!=idxNum ){
    sqlite3_finalize(c->pStmt);
    c->pStmt = NULL;
  }

  /* Get a fresh statement appropriate to idxNum. */
  /* TODO(shess): Add a prepared-statement cache in the vt structure.
  ** The cache must handle multiple open cursors.  Easier to cache the
  ** statement variants at the vt to reduce malloc/realloc/free here.
  ** Or we could have a StringBuffer variant which allowed stack
  ** construction for small values.
  */
  if( !c->pStmt ){
    StringBuffer sb;
    initStringBuffer(&sb);
    append(&sb, "SELECT docid, ");
    appendList(&sb, v->nColumn, v->azContentColumn);
    append(&sb, " FROM %_content");
    if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
    rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt,
                     stringBufferData(&sb));
    stringBufferDestroy(&sb);
    if( rc!=SQLITE_OK ) return rc;
    c->iCursorType = idxNum;
  }else{
    sqlite3_reset(c->pStmt);
    assert( c->iCursorType==idxNum );
  }

  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_DOCID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn );
      assert( argc==1 );
      queryClear(&c->q);
      if( c->result.nData!=0 ){
        /* This case happens if the same cursor is used repeatedly. */
        dlrDestroy(&c->reader);
        dataBufferReset(&c->result);
      }else{
        dataBufferInit(&c->result, 0);
      }
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1,
                         &c->result, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.nData!=0 ){
        dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
      }
      break;
    }
  }

  return fulltextNext(pCursor);
}
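/* For a two-column table named "pages", for example, the statement
** built above reads:
**
**   SELECT docid, c0title, c1body FROM pages_content WHERE docid = ?
**
** (without the WHERE clause for QUERY_GENERIC); sql_prepare() expands
** the "%" prefix to the name of the fts table.
*/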

/* This is the xEof method of the virtual table.  The SQLite core
** calls this routine to find out if it has reached the end of
** a query's results set.
*/
static int fulltextEof(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  return c->eof;
}

/* This is the xColumn method of the virtual table.  The SQLite
** core calls this method during a query when it needs the value
** of a column from the virtual table.  This method needs to use
** one of the sqlite3_result_*() routines to store the requested
** value back in the pContext.
*/
static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
                          sqlite3_context *pContext, int idxCol){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);

  if( idxCol<v->nColumn ){
    sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
    sqlite3_result_value(pContext, pVal);
  }else if( idxCol==v->nColumn ){
    /* The extra column whose name is the same as the table.
    ** Return a blob which is a pointer to the cursor
    */
    sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
  }else if( idxCol==v->nColumn+1 ){
    /* The docid column, which is an alias for rowid. */
    sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0);
    sqlite3_result_value(pContext, pVal);
  }
  return SQLITE_OK;
}

/* This is the xRowid method.  The SQLite core calls this routine to
** retrieve the rowid for the current row of the result set.  fts3
** exposes %_content.docid as the rowid for the virtual table.  The
** rowid should be written to *pRowid.
*/
static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;

  *pRowid = sqlite3_column_int64(c->pStmt, 0);
  return SQLITE_OK;
}

/* Add all terms in [zText] to pendingTerms table.  If [iColumn] >= 0,
** we also store positions and offsets in the hash table using that
** column number.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
                                                   &pToken, &nTokenBytes,
                                                   &iStartOffset, &iEndOffset,
                                                   &iPosition)) ){
    DLCollector *p;
    int nData;   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator
    ** internally.  Token can't be NULL or empty. */
    if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
      rc = SQLITE_ERROR;
      break;
    }

    p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  if( SQLITE_DONE == rc ) return SQLITE_OK;
  return rc;
}
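/* For example, tokenizing "the quick fox" for column 1 of docid 7 adds
** (or extends) three pendingTerms entries, one per token; each entry
** accumulates a doclist fragment recording docid 7, column 1, and the
** token's position (plus byte offsets, when DL_DEFAULT includes them).
*/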

/* Add doclists for all terms in [pValues] to pendingTerms table. */
static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                       sqlite3_value **pValues){
  int i;
  for(i = 0; i < v->nColumn ; ++i){
    char *zText = (char*)sqlite3_value_text(pValues[i]);
    int rc = buildTerms(v, iDocid, zText, i);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}

/* Add empty doclists for all terms in the given row's content to
** pendingTerms.
*/
static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){
  const char **pValues;
  int i, rc;

  /* TODO(shess) Should we allow such tables at all? */
  if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;

  rc = content_select(v, iDocid, &pValues);
  if( rc!=SQLITE_OK ) return rc;

  for(i = 0 ; i < v->nColumn; ++i) {
    rc = buildTerms(v, iDocid, pValues[i], -1);
    if( rc!=SQLITE_OK ) break;
  }

  freeStringArray(v->nColumn, pValues);
  return rc;   /* propagate any error from buildTerms() */
|
4313 } |
|
4314 |
|
4315 /* TODO(shess) Refactor the code to remove this forward decl. */ |
|
4316 static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid); |
|
4317 |
|
4318 /* Insert a row into the %_content table; set *piDocid to be the ID of the |
|
4319 ** new row. Add doclists for terms to pendingTerms. |
|
4320 */ |
|
4321 static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestDocid, |
|
4322 sqlite3_value **pValues, sqlite_int64 *piDocid){ |
|
4323 int rc; |
|
4324 |
|
4325 rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */ |
|
4326 if( rc!=SQLITE_OK ) return rc; |
|
4327 |
|
4328 /* docid column is an alias for rowid. */ |
|
4329 *piDocid = sqlite3_last_insert_rowid(v->db); |
|
4330 rc = initPendingTerms(v, *piDocid); |
|
4331 if( rc!=SQLITE_OK ) return rc; |
|
4332 |
|
4333 return insertTerms(v, *piDocid, pValues); |
|
4334 } |
|
4335 |
|
4336 /* Delete a row from the %_content table; add empty doclists for terms |
|
4337 ** to pendingTerms. |
|
4338 */ |
|
4339 static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ |
|
4340 int rc = initPendingTerms(v, iRow); |
|
4341 if( rc!=SQLITE_OK ) return rc; |
|
4342 |
|
4343 rc = deleteTerms(v, iRow); |
|
4344 if( rc!=SQLITE_OK ) return rc; |
|
4345 |
|
4346 return content_delete(v, iRow); /* execute an SQL DELETE */ |
|
4347 } |
|
4348 |
|
4349 /* Update a row in the %_content table; add delete doclists to |
|
4350 ** pendingTerms for old terms not in the new data, add insert doclists |
|
4351 ** to pendingTerms for terms in the new data. |
|
4352 */ |
|
4353 static int index_update(fulltext_vtab *v, sqlite_int64 iRow, |
|
4354 sqlite3_value **pValues){ |
|
4355 int rc = initPendingTerms(v, iRow); |
|
4356 if( rc!=SQLITE_OK ) return rc; |
|
4357 |
|
4358 /* Generate an empty doclist for each term that previously appeared in this |
|
4359 * row. */ |
|
4360 rc = deleteTerms(v, iRow); |
|
4361 if( rc!=SQLITE_OK ) return rc; |
|
4362 |
|
4363 rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */ |
|
4364 if( rc!=SQLITE_OK ) return rc; |
|
4365 |
|
4366 /* Now add positions for terms which appear in the updated row. */ |
|
4367 return insertTerms(v, iRow, pValues); |
|
4368 } |
|
4369 |
|
4370 /*******************************************************************/ |
|
4371 /* InteriorWriter is used to collect terms and block references into |
|
4372 ** interior nodes in %_segments. See commentary at top of file for |
|
4373 ** format. |
|
4374 */ |
|
4375 |
|
4376 /* How large interior nodes can grow. */ |
|
4377 #define INTERIOR_MAX 2048 |
|
4378 |
|
4379 /* Minimum number of terms per interior node (except the root). This |
|
4380 ** prevents large terms from making the tree too skinny - must be >0 |
|
4381 ** so that the tree always makes progress. Note that the min tree |
|
4382 ** fanout will be INTERIOR_MIN_TERMS+1. |
|
4383 */ |
|
4384 #define INTERIOR_MIN_TERMS 7 |
|
4385 #if INTERIOR_MIN_TERMS<1 |
|
4386 # error INTERIOR_MIN_TERMS must be greater than 0. |
|
4387 #endif |
|
4388 |
|
4389 /* ROOT_MAX controls how much data is stored inline in the segment |
|
4390 ** directory. |
|
4391 */ |
|
4392 /* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's |
|
4393 ** only here so that interiorWriterRootInfo() and leafWriterRootInfo() |
|
4394 ** can both see it, but if the caller passed it in, we wouldn't even |
|
4395 ** need a define. |
|
4396 */ |
|
4397 #define ROOT_MAX 1024 |
|
4398 #if ROOT_MAX<VARINT_MAX*2 |
|
4399 # error ROOT_MAX must have enough space for a header. |
|
4400 #endif |
|
4401 |
|
4402 /* InteriorBlock stores a linked-list of interior blocks while a lower |
|
4403 ** layer is being constructed. |
|
4404 */ |
|
4405 typedef struct InteriorBlock { |
|
4406 DataBuffer term; /* Leftmost term in block's subtree. */ |
|
4407 DataBuffer data; /* Accumulated data for the block. */ |
|
4408 struct InteriorBlock *next; |
|
4409 } InteriorBlock; |
|
4410 |
|
4411 static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock, |
|
4412 const char *pTerm, int nTerm){ |
|
4413 InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock)); |
|
4414 char c[VARINT_MAX+VARINT_MAX]; |
|
4415 int n; |
|
4416 |
|
4417 if( block ){ |
|
4418 memset(block, 0, sizeof(*block)); |
|
4419 dataBufferInit(&block->term, 0); |
|
4420 dataBufferReplace(&block->term, pTerm, nTerm); |
|
4421 |
|
4422 n = fts3PutVarint(c, iHeight); |
|
4423 n += fts3PutVarint(c+n, iChildBlock); |
|
4424 dataBufferInit(&block->data, INTERIOR_MAX); |
|
4425 dataBufferReplace(&block->data, c, n); |
|
4426 } |
|
4427 return block; |
|
4428 } |
|
4429 |
|
4430 #ifndef NDEBUG |
|
4431 /* Verify that the data is readable as an interior node. */ |
|
4432 static void interiorBlockValidate(InteriorBlock *pBlock){ |
|
4433 const char *pData = pBlock->data.pData; |
|
4434 int nData = pBlock->data.nData; |
|
4435 int n, iDummy; |
|
4436 sqlite_int64 iBlockid; |
|
4437 |
|
4438 assert( nData>0 ); |
|
4439 assert( pData!=0 ); |
|
4440 assert( pData+nData>pData ); |
|
4441 |
|
4442 /* Must lead with height of node as a varint(n), n>0 */ |
|
4443 n = fts3GetVarint32(pData, &iDummy); |
|
4444 assert( n>0 ); |
|
4445 assert( iDummy>0 ); |
|
4446 assert( n<nData ); |
|
4447 pData += n; |
|
4448 nData -= n; |
|
4449 |
|
4450 /* Must contain iBlockid. */ |
|
4451 n = fts3GetVarint(pData, &iBlockid); |
|
4452 assert( n>0 ); |
|
4453 assert( n<=nData ); |
|
4454 pData += n; |
|
4455 nData -= n; |
|
4456 |
|
4457 /* Zero or more terms of positive length */ |
|
4458 if( nData!=0 ){ |
|
4459 /* First term is not delta-encoded. */ |
|
4460 n = fts3GetVarint32(pData, &iDummy); |
|
4461 assert( n>0 ); |
|
4462 assert( iDummy>0 ); |
|
4463 assert( n+iDummy>0); |
|
4464 assert( n+iDummy<=nData ); |
|
4465 pData += n+iDummy; |
|
4466 nData -= n+iDummy; |
|
4467 |
|
4468 /* Following terms delta-encoded. */ |
|
4469 while( nData!=0 ){ |
|
4470 /* Length of shared prefix. */ |
|
4471 n = fts3GetVarint32(pData, &iDummy); |
|
4472 assert( n>0 ); |
|
4473 assert( iDummy>=0 ); |
|
4474 assert( n<nData ); |
|
4475 pData += n; |
|
4476 nData -= n; |
|
4477 |
|
4478 /* Length and data of distinct suffix. */ |
|
4479 n = fts3GetVarint32(pData, &iDummy); |
|
4480 assert( n>0 ); |
|
4481 assert( iDummy>0 ); |
|
4482 assert( n+iDummy>0); |
|
4483 assert( n+iDummy<=nData ); |
|
4484 pData += n+iDummy; |
|
4485 nData -= n+iDummy; |
|
4486 } |
|
4487 } |
|
4488 } |
|
4489 #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x) |
|
4490 #else |
|
4491 #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 ) |
|
4492 #endif |
|

typedef struct InteriorWriter {
  int iHeight;                    /* from 0 at leaves. */
  InteriorBlock *first, *last;
  struct InteriorWriter *parentWriter;

  DataBuffer term;                /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;   /* for consistency checks. */
#endif
} InteriorWriter;

/* Initialize an interior node where pTerm[nTerm] marks the leftmost
** term in the tree.  iChildBlock is the leftmost child block at the
** next level down the tree.
*/
static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
                               sqlite_int64 iChildBlock,
                               InteriorWriter *pWriter){
  InteriorBlock *block;
  assert( iHeight>0 );
  CLEAR(pWriter);

  pWriter->iHeight = iHeight;
  pWriter->iOpeningChildBlock = iChildBlock;
#ifndef NDEBUG
  pWriter->iLastChildBlock = iChildBlock;
#endif
  block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
  pWriter->last = pWriter->first = block;
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  dataBufferInit(&pWriter->term, 0);
}

/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = fts3PutVarint(c, nTerm);
  }else{
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = fts3PutVarint(c, nPrefix);
    n += fts3PutVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
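
/* For illustration, a hypothetical trace: appending the sorted terms
** "apple" then "apply" to the same interior block shares a four-byte
** prefix, so the encodings are
**
**   interiorWriterAppend(&w, "apple", 5, 101);  // varint(5) "apple"
**   interiorWriterAppend(&w, "apply", 5, 102);  // varint(4) varint(1) "y"
**
** The first call writes its term straight up because pWriter->term is
** still empty; the second writes only the prefix/suffix lengths plus
** the one distinct suffix byte.
*/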

/* Free the space used by pWriter, including the linked-list of
** InteriorBlocks, and parentWriter, if present.
*/
static int interiorWriterDestroy(InteriorWriter *pWriter){
  InteriorBlock *block = pWriter->first;

  while( block!=NULL ){
    InteriorBlock *b = block;
    block = block->next;
    dataBufferDestroy(&b->term);
    dataBufferDestroy(&b->data);
    sqlite3_free(b);
  }
  if( pWriter->parentWriter!=NULL ){
    interiorWriterDestroy(pWriter->parentWriter);
    sqlite3_free(pWriter->parentWriter);
  }
  dataBufferDestroy(&pWriter->term);
  SCRAMBLE(pWriter);
  return SQLITE_OK;
}

/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
*/
static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
                                  char **ppRootInfo, int *pnRootInfo,
                                  sqlite_int64 *piEndBlockid){
  InteriorBlock *block = pWriter->first;
  sqlite_int64 iBlockid = 0;
  int rc;

  /* If we can fit the segment inline */
  if( block==pWriter->last && block->data.nData<ROOT_MAX ){
    *ppRootInfo = block->data.pData;
    *pnRootInfo = block->data.nData;
    return SQLITE_OK;
  }

  /* Flush the first block to %_segments, and create a new level of
  ** interior node.
  */
  ASSERT_VALID_INTERIOR_BLOCK(block);
  rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  *piEndBlockid = iBlockid;

  pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
  interiorWriterInit(pWriter->iHeight+1,
                     block->term.pData, block->term.nData,
                     iBlockid, pWriter->parentWriter);

  /* Flush additional blocks and append to the higher interior
  ** node.
  */
  for(block=block->next; block!=NULL; block=block->next){
    ASSERT_VALID_INTERIOR_BLOCK(block);
    rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
    if( rc!=SQLITE_OK ) return rc;
    *piEndBlockid = iBlockid;

    interiorWriterAppend(pWriter->parentWriter,
                         block->term.pData, block->term.nData, iBlockid);
  }

  /* Parent node gets the chance to be the root. */
  return interiorWriterRootInfo(v, pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
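
/* For illustration: if the leaves produce three height-1 interior
** blocks, the recursion above flushes all three to %_segments and
** collects them under a single height-2 parentWriter; assuming that
** parent block fits in ROOT_MAX, it is returned as the root info and
** *piEndBlockid is left at the blockid of the last height-1 block
** written.
*/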

/****************************************************************/
/* InteriorReader is used to read off the data from an interior node
** (see comment at top of file for the format).
*/
typedef struct InteriorReader {
  const char *pData;
  int nData;

  DataBuffer term;          /* previous term, for decoding term delta. */

  sqlite_int64 iBlockid;
} InteriorReader;

static void interiorReaderDestroy(InteriorReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}

/* TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term. */
  n = fts3GetVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    n = fts3GetVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
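
/* For illustration, a sketch of the bytes such a node might hold
** (assuming single-byte varints): height 1, leftmost child block 23,
** and the terms "apple" and "apply" would decode as
**
**   0x01 0x17 0x05 "apple" 0x04 0x01 "y"
**
** where 0x01 is the leading flag/height byte, 0x17 is varint(23), and
** the trailing pair is the prefix/suffix delta encoding of "apply".
*/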

static int interiorReaderAtEnd(InteriorReader *pReader){
  return pReader->term.nData==0;
}

static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  return pReader->iBlockid;
}

static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}

/* Step forward to the next term in the node. */
static void interiorReaderStep(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );

  /* If the last term has been read, signal eof, else construct the
  ** next term.
  */
  if( pReader->nData==0 ){
    dataBufferReset(&pReader->term);
  }else{
    int n, nPrefix, nSuffix;

    n = fts3GetVarint32(pReader->pData, &nPrefix);
    n += fts3GetVarint32(pReader->pData+n, &nSuffix);

    /* Truncate the current term and append suffix data. */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    assert( n+nSuffix<=pReader->nData );
    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
  pReader->iBlockid++;
}

/* Compare the current term to pTerm[nTerm], returning strcmp-style
** results.  If isPrefix, equality means equal through nTerm bytes.
*/
static int interiorReaderTermCmp(InteriorReader *pReader,
                                 const char *pTerm, int nTerm, int isPrefix){
  const char *pReaderTerm = interiorReaderTerm(pReader);
  int nReaderTerm = interiorReaderTermBytes(pReader);
  int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;

  if( n==0 ){
    if( nReaderTerm>0 ) return -1;
    if( nTerm>0 ) return 1;
    return 0;
  }

  c = memcmp(pReaderTerm, pTerm, n);
  if( c!=0 ) return c;
  if( isPrefix && n==nTerm ) return 0;
  return nReaderTerm - nTerm;
}
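
/* For illustration: with isPrefix set, a reader positioned on
** "applesauce" compares equal to pTerm="apple", nTerm=5, since the
** two agree through the first nTerm bytes; with isPrefix clear the
** same comparison returns a positive value (10-5).
*/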

/****************************************************************/
/* LeafWriter is used to collect terms and associated doclist data
** into leaf blocks in %_segments (see top of file for format info).
** Expected usage is:
**
**   LeafWriter writer;
**   leafWriterInit(0, 0, &writer);
**   while( sorted_terms_left_to_process ){
**     // data is doclist data for that term.
**     rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
**     if( rc!=SQLITE_OK ) goto err;
**   }
**   rc = leafWriterFinalize(v, &writer);
**  err:
**   leafWriterDestroy(&writer);
**   return rc;
**
** leafWriterStep() may write a collected leaf out to %_segments.
** leafWriterFinalize() finishes writing any buffered data and stores
** a root node in %_segdir.  leafWriterDestroy() frees all buffers and
** InteriorWriters allocated as part of writing this segment.
**
** TODO(shess) Document leafWriterStepMerge().
*/

/* Put terms with data this big in their own block. */
#define STANDALONE_MIN 1024

/* Keep leaf blocks below this size. */
#define LEAF_MAX 2048

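/* For illustration: a term whose doclist runs past STANDALONE_MIN is
** flushed as a leaf of its own even though LEAF_MAX would allow more
** data in the block; leafWriterStepMerge() relies on
** 2*STANDALONE_MIN<=LEAF_MAX (asserted there) to keep its
** header-rebuilding memcpy() safe.
*/
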
typedef struct LeafWriter {
  int iLevel;
  int idx;
  sqlite_int64 iStartBlockid;     /* needed to create the root info */
  sqlite_int64 iEndBlockid;       /* when we're done writing. */

  DataBuffer term;                /* previous encoded term */
  DataBuffer data;                /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter;    /* if we overflow */
  int has_parent;
} LeafWriter;

static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  CLEAR(pWriter);
  pWriter->iLevel = iLevel;
  pWriter->idx = idx;

  dataBufferInit(&pWriter->term, 32);

  /* Start out with a reasonably sized block, though it can grow. */
  dataBufferInit(&pWriter->data, LEAF_MAX);
}

#ifndef NDEBUG
/* Verify that the data is readable as a leaf node. */
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = fts3GetVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = fts3GetVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = fts3GetVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    n = fts3GetVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    n = fts3GetVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    n = fts3GetVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif

/* Flush the current leaf node to %_segments, and add the resulting
** blockid and the starting term to the interior node which will
** contain it.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = fts3GetVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
  int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
  if( rc!=SQLITE_OK ) return rc;

  /* Re-initialize the output buffer. */
  dataBufferReset(&pWriter->data);

  return SQLITE_OK;
}

/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    *piEndBlockid = 0;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}

/* Collect the rootInfo data and store it into the segment directory.
** This has the effect of flushing the segment's leaf data to
** %_segments, and also flushing any interior nodes to %_segments.
*/
static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
  sqlite_int64 iEndBlockid;
  char *pRootInfo;
  int rc, nRootInfo;

  rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  /* Don't bother storing an entirely empty segment. */
  if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;

  return segdir_set(v, pWriter->iLevel, pWriter->idx,
                    pWriter->iStartBlockid, pWriter->iEndBlockid,
                    iEndBlockid, pRootInfo, nRootInfo);
}

static void leafWriterDestroy(LeafWriter *pWriter){
  if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  dataBufferDestroy(&pWriter->term);
  dataBufferDestroy(&pWriter->data);
}

/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  assert( nTerm>0 );
  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
    n = fts3PutVarint(c, '\0');
    n += fts3PutVarint(c+n, nTerm);
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    n = fts3PutVarint(c, nPrefix);
    n += fts3PutVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  return nPrefix+1;
}
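
/* For illustration: writing the sorted terms "link" then "linkage"
** into an empty leaf produces, sketching single-byte varints and
** eliding the doclists,
**
**   0x00 0x04 "link" ...  0x04 0x03 "age" ...
**
** The second call returns nPrefix+1 == 5: the first five bytes of
** "linkage" are what distinguish it from "link".
*/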

/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 int iDoclistData){
  char c[VARINT_MAX+VARINT_MAX];
  int iData, n = fts3PutVarint(c, 0);
  n += fts3PutVarint(c+n, nTerm);

  /* There should always be room for the header.  Even if pTerm shared
  ** a substantial prefix with the previous term, the entire prefix
  ** could be constructed from earlier data in the doclist, so there
  ** should be room.
  */
  assert( iDoclistData>=n+nTerm );

  iData = iDoclistData-(n+nTerm);
  memcpy(pWriter->data.pData+iData, c, n);
  memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);

  return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
}

/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = fts3PutVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = fts3PutVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = fts3PutVarint(pWriter->data.pData, 0);
    n += fts3PutVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
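
/* For illustration of the length patching above: if the readers'
** data totals 200 bytes (reserving a two-byte varint) but the merged
** doclist comes to only 120 bytes (a one-byte varint), the memmove()
** closes the one-byte gap before the actual length is copied in;
** when both lengths happen to encode in the same number of bytes,
** only the length bytes themselves are rewritten.
*/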

/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
/* TODO(shess) Revise writeZeroSegment() so that doclists are
** constructed directly in pWriter->data.
*/
static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
                          const char *pTerm, int nTerm,
                          const char *pData, int nData){
  int rc;
  DLReader reader;

  dlrInit(&reader, DL_DEFAULT, pData, nData);
  rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
  dlrDestroy(&reader);

  return rc;
}


/****************************************************************/
/* LeafReader is used to iterate over an individual leaf node. */
typedef struct LeafReader {
  DataBuffer term;          /* copy of current term. */

  const char *pData;        /* data for current term. */
  int nData;
} LeafReader;

static void leafReaderDestroy(LeafReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}

static int leafReaderAtEnd(LeafReader *pReader){
  return pReader->nData<=0;
}

/* Access the current term. */
static int leafReaderTermBytes(LeafReader *pReader){
  return pReader->term.nData;
}
static const char *leafReaderTerm(LeafReader *pReader){
  assert( pReader->term.nData>0 );
  return pReader->term.pData;
}

/* Access the doclist data for the current term. */
static int leafReaderDataBytes(LeafReader *pReader){
  int nData;
  assert( pReader->term.nData>0 );
  fts3GetVarint32(pReader->pData, &nData);
  return nData;
}
static const char *leafReaderData(LeafReader *pReader){
  int n, nData;
  assert( pReader->term.nData>0 );
  n = fts3GetVarint32(pReader->pData, &nData);
  return pReader->pData+n;
}

static void leafReaderInit(const char *pData, int nData,
                           LeafReader *pReader){
  int nTerm, n;

  assert( nData>0 );
  assert( pData[0]=='\0' );

  CLEAR(pReader);

  /* Read the first term, skipping the header byte. */
  n = fts3GetVarint32(pData+1, &nTerm);
  dataBufferInit(&pReader->term, nTerm);
  dataBufferReplace(&pReader->term, pData+1+n, nTerm);

  /* Position after the first term. */
  assert( 1+n+nTerm<nData );
  pReader->pData = pData+1+n+nTerm;
  pReader->nData = nData-1-n-nTerm;
}

/* Step the reader forward to the next term. */
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block. */
  n = fts3GetVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.
    */
    n = fts3GetVarint32(pReader->pData, &nPrefix);
    n += fts3GetVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}

/* strcmp-style comparison of pReader's current term against pTerm.
** If isPrefix, equality means equal through nTerm bytes.
*/
static int leafReaderTermCmp(LeafReader *pReader,
                             const char *pTerm, int nTerm, int isPrefix){
  int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
  if( n==0 ){
    if( pReader->term.nData>0 ) return -1;
    if( nTerm>0 ) return 1;
    return 0;
  }

  c = memcmp(pReader->term.pData, pTerm, n);
  if( c!=0 ) return c;
  if( isPrefix && n==nTerm ) return 0;
  return pReader->term.nData - nTerm;
}

/****************************************************************/
/* LeavesReader wraps LeafReader to allow iterating over the entire
** leaf layer of the tree.
*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline. */
} LeavesReader;

/* Access the current term. */
static int leavesReaderTermBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTermBytes(&pReader->leafReader);
}
static const char *leavesReaderTerm(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTerm(&pReader->leafReader);
}

/* Access the doclist data for the current term. */
static int leavesReaderDataBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderDataBytes(&pReader->leafReader);
}
static const char *leavesReaderData(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderData(&pReader->leafReader);
}

static int leavesReaderAtEnd(LeavesReader *pReader){
  return pReader->eof;
}

/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem hasn't arisen
** in this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}

static void leavesReaderDestroy(LeavesReader *pReader){
  /* If idx is -1, that means we're using a non-cached statement
  ** handle in the optimize() case, so we need to release it.
  */
  if( pReader->pStmt!=NULL && pReader->idx==-1 ){
    sqlite3_finalize(pReader->pStmt);
  }
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}

/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}

/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}

/* Order LeavesReaders by their term, ignoring idx.  Readers at eof
** always sort to the end.
*/
static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
  if( leavesReaderAtEnd(lr1) ){
    if( leavesReaderAtEnd(lr2) ) return 0;
    return 1;
  }
  if( leavesReaderAtEnd(lr2) ) return -1;

  return leafReaderTermCmp(&lr1->leafReader,
                           leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
                           0);
}

/* Similar to leavesReaderTermCmp(), with additional ordering by idx
** so that older segments sort before newer segments.
*/
static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
  int c = leavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;
  return lr1->idx-lr2->idx;
}

/* Assume that pLr[1]..pLr[nLr-1] are sorted.  Bubble pLr[0] into its
** sorted position.
*/
static void leavesReaderReorder(LeavesReader *pLr, int nLr){
  while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
    LeavesReader tmp = pLr[0];
    pLr[0] = pLr[1];
    pLr[1] = tmp;
    nLr--;
    pLr++;
  }
}
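
/* For illustration: if pLr[0] holds term "pear" and pLr[1..3] hold
** "apple", "quail", and an at-eof reader, one call bubbles "pear"
** past "apple", leaving the order "apple", "pear", "quail", eof.
** Only pLr[0] can be out of place, so a single insertion pass
** restores sorted order.
*/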

/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  if( rc!=SQLITE_DONE ){
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age. */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}

/* Merge doclists from pReaders[nReaders] into a single doclist, which
** is written to pWriter.  Assumes pReaders is ordered oldest to
** newest.
*/
/* TODO(shess) Consider putting this inline in segmentMerge(). */
static int leavesReadersMerge(fulltext_vtab *v,
                              LeavesReader *pReaders, int nReaders,
                              LeafWriter *pWriter){
  DLReader dlReaders[MERGE_COUNT];
  const char *pTerm = leavesReaderTerm(pReaders);
  int i, nTerm = leavesReaderTermBytes(pReaders);

  assert( nReaders<=MERGE_COUNT );

  for(i=0; i<nReaders; i++){
    dlrInit(&dlReaders[i], DL_DEFAULT,
            leavesReaderData(pReaders+i),
            leavesReaderDataBytes(pReaders+i));
  }

  return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
}

/* Forward ref due to mutual recursion with segdirNextIndex(). */
static int segmentMerge(fulltext_vtab *v, int iLevel);

/* Put the next available index at iLevel into *pidx.  If iLevel
** already has MERGE_COUNT segments, they are merged to a higher
** level to make room.
*/
static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
  int rc = segdir_max_index(v, iLevel, pidx);
  if( rc==SQLITE_DONE ){              /* No segments at iLevel. */
    *pidx = 0;
  }else if( rc==SQLITE_ROW ){
    if( *pidx==(MERGE_COUNT-1) ){
      rc = segmentMerge(v, iLevel);
      if( rc!=SQLITE_OK ) return rc;
      *pidx = 0;
    }else{
      (*pidx)++;
    }
  }else{
    return rc;
  }
  return SQLITE_OK;
}
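
/* For illustration: once indexes 0..MERGE_COUNT-1 at level 0 are all
** occupied, the next request triggers segmentMerge(v, 0), folding
** those segments into one segment at level 1 and handing back index
** 0.  Levels fill and carry upward much like digits in a
** base-MERGE_COUNT counter.
*/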

/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}

/* Accumulate the union of *acc and *pData into *acc. */
static void docListAccumulateUnion(DataBuffer *acc,
                                   const char *pData, int nData){
  DataBuffer tmp = *acc;
  dataBufferInit(acc, tmp.nData+nData);
  docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
  dataBufferDestroy(&tmp);
}

/* TODO(shess) It might be interesting to explore different merge
** strategies, here.  For instance, since this is a sorted merge, we
** could easily merge many doclists in parallel.  With some
** comprehension of the storage format, we could merge all of the
** doclists within a leaf node directly from the leaf node's storage.
** It may be worthwhile to merge smaller doclists before larger
** doclists, since they can be traversed more quickly - but the
** results may have less overlap, making them more expensive in a
** different way.
*/

/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  /* doclist data is accumulated into pBuffers similar to how one does
  ** increment in binary arithmetic.  If index 0 is empty, the data is
  ** stored there.  If there is data there, it is merged and the
  ** results carried into position 1, with further merge-and-carry
  ** until an empty position is found.
  */
  DataBuffer *pBuffers = NULL;
  int nBuffers = 0, nMaxBuffers = 0, rc;

  assert( nTerm>0 );

  for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
      rc=leavesReaderStep(v, pReader)){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c>0 ) break;      /* Past any possible matches. */
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int iBuffer, nData = leavesReaderDataBytes(pReader);

      /* Find the first empty buffer. */
      for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
        if( 0==pBuffers[iBuffer].nData ) break;
      }

      /* Out of buffers, add an empty one. */
      if( iBuffer==nBuffers ){
        if( nBuffers==nMaxBuffers ){
          DataBuffer *p;
          nMaxBuffers += 20;

          /* Manual realloc so we can handle NULL appropriately. */
          p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
          if( p==NULL ){
            rc = SQLITE_NOMEM;
            break;
          }

          if( nBuffers>0 ){
            assert( pBuffers!=NULL );
            memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
            sqlite3_free(pBuffers);
          }
          pBuffers = p;
        }
        dataBufferInit(&(pBuffers[nBuffers]), 0);
        nBuffers++;
      }

      /* At this point, must have an empty at iBuffer. */
      assert( iBuffer<nBuffers && pBuffers[iBuffer].nData==0 );

      /* If empty was first buffer, no need for merge logic. */
      if( iBuffer==0 ){
        dataBufferReplace(&(pBuffers[0]), pData, nData);
      }else{
        /* pAcc is the empty buffer the merged data will end up in. */
        DataBuffer *pAcc = &(pBuffers[iBuffer]);
        DataBuffer *p = &(pBuffers[0]);

        /* Handle position 0 specially to avoid need to prime pAcc
        ** with pData/nData.
        */
        dataBufferSwap(p, pAcc);
        docListAccumulateUnion(pAcc, pData, nData);

        /* Accumulate remaining doclists into pAcc. */
        for(++p; p<pAcc; ++p){
          docListAccumulateUnion(pAcc, p->pData, p->nData);

          /* dataBufferReset() could allow a large doclist to blow up
          ** our memory requirements.
          */
          if( p->nCapacity<1024 ){
            dataBufferReset(p);
          }else{
            dataBufferDestroy(p);
            dataBufferInit(p, 0);
          }
        }
      }
    }
  }

  /* Union all the doclists together into *out. */
  /* TODO(shess) What if *out is big?  Sigh. */
  if( rc==SQLITE_OK && nBuffers>0 ){
    int iBuffer;
    for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
      if( pBuffers[iBuffer].nData>0 ){
        if( out->nData==0 ){
          dataBufferSwap(out, &(pBuffers[iBuffer]));
        }else{
          docListAccumulateUnion(out, pBuffers[iBuffer].pData,
                                 pBuffers[iBuffer].nData);
        }
      }
    }
  }

  while( nBuffers-- ){
    dataBufferDestroy(&(pBuffers[nBuffers]));
  }
  if( pBuffers!=NULL ) sqlite3_free(pBuffers);

  return rc;
}
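
/* For illustration of the binary-counter scheme above: doclists
** D1..D4 for the same term (or prefix) land as follows.  D1 fills
** buffer 0; D2 finds it occupied, so union(D1,D2) carries into
** buffer 1; D3 refills buffer 0; D4 carries union(D3,D4), then
** unions with buffer 1, into buffer 2.  Each doclist therefore
** participates in O(log n) unions rather than n.
*/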

/* Call loadSegmentLeavesInt() with pData/nData as input. */
static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
                           const char *pTerm, int nTerm, int isPrefix,
                           DataBuffer *out){
  LeavesReader reader;
  int rc;

  assert( nData>1 );
  assert( *pData=='\0' );
  rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
  if( rc!=SQLITE_OK ) return rc;

  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
  leavesReaderReset(&reader);
  leavesReaderDestroy(&reader);
  return rc;
}

/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
** iEndLeaf (inclusive) as input, and merge the resulting doclist into
** out.
*/
static int loadSegmentLeaves(fulltext_vtab *v,
                             sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
                             const char *pTerm, int nTerm, int isPrefix,
                             DataBuffer *out){
  int rc;
  LeavesReader reader;

  assert( iStartLeaf<=iEndLeaf );
  rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
  if( rc!=SQLITE_OK ) return rc;

  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
  leavesReaderReset(&reader);
  leavesReaderDestroy(&reader);
  return rc;
}

/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it is not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  assert( nData>1 );
  assert( *pData!='\0' );
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm. */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
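
/* For illustration: suppose an interior node has leftmost child 10
** and the terms "cat", "dog", "fox" (so children 10..13).  Searching
** for pTerm="do", nTerm=2 with isPrefix set, the first scan stops at
** "dog" (greater than "do"), yielding *piStartChild==11; the prefix
** scan steps past "dog" (equal through 2 bytes) and stops at "fox",
** yielding *piEndChild==12.
*/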

/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
  ** to complete the iteration; otherwise the table will remain
  ** locked.
  */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}

/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf. */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too. */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}

/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}

/* Scan the database and merge together the posting lists for the term
** into *out.
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 2);
    const int nData = sqlite3_column_bytes(s, 2);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn ) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

 err:
  dataBufferDestroy(&doclist);
  return rc;
}
|
6086 |
|
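/* A hedged sketch of a call into termSelect() (variable names are
** assumptions): fetch the position-format doclist for "term",
** restricted to column 0.  Passing iColumn==v->nColumn would instead
** select across all columns, per the v->nColumn check above.
*/
#if 0
  DataBuffer dl;
  dataBufferInit(&dl, 0);
  rc = termSelect(v, 0, "term", 4, 0, DL_POSITIONS, &dl);
  /* On SQLITE_OK, dl holds docids with per-column position lists. */
  dataBufferDestroy(&dl);
#endif
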
/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;
  int nTerm;
  DLCollector *pCollector;
} TermData;

/* Orders TermData elements in strcmp fashion (<0 for less-than, 0
** for equal, >0 for greater-than).
*/
static int termDataCmp(const void *av, const void *bv){
  const TermData *a = (const TermData *)av;
  const TermData *b = (const TermData *)bv;
  int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
  int c = memcmp(a->pTerm, b->pTerm, n);
  if( c!=0 ) return c;
  return a->nTerm-b->nTerm;
}

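/* Worked example: termDataCmp() compares the shared prefix with
** memcmp(), then breaks ties by length, so "ab" sorts before "abc"
** and "abc" before "abd" -- the byte-wise order in which
** writeZeroSegment() below feeds terms to LeafWriter.
*/
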
/* Order pTerms data by term, then write a new level 0 segment using
** LeafWriter.
*/
static int writeZeroSegment(fulltext_vtab *v, fts3Hash *pTerms){
  fts3HashElem *e;
  int idx, rc, i, n;
  TermData *pData;
  LeafWriter writer;
  DataBuffer dl;

  /* Determine the next index at level 0, merging as necessary. */
  rc = segdirNextIndex(v, 0, &idx);
  if( rc!=SQLITE_OK ) return rc;

  n = fts3HashCount(pTerms);
  pData = sqlite3_malloc(n*sizeof(TermData));
  if( pData==NULL ) return SQLITE_NOMEM;

  for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){
    assert( i<n );
    pData[i].pTerm = fts3HashKey(e);
    pData[i].nTerm = fts3HashKeysize(e);
    pData[i].pCollector = fts3HashData(e);
  }
  assert( i==n );

  /* TODO(shess) Should we allow user-defined collation sequences,
  ** here?  I think we only need that once we support prefix searches.
  */
  if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);

  /* TODO(shess) Refactor so that we can write directly to the segment
  ** DataBuffer, as happens for segment merges.
  */
  leafWriterInit(0, idx, &writer);
  dataBufferInit(&dl, 0);
  for(i=0; i<n; i++){
    dataBufferReset(&dl);
    dlcAddDoclist(pData[i].pCollector, &dl);
    rc = leafWriterStep(v, &writer,
                        pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
    if( rc!=SQLITE_OK ) goto err;
  }
  rc = leafWriterFinalize(v, &writer);

 err:
  dataBufferDestroy(&dl);
  sqlite3_free(pData);
  leafWriterDestroy(&writer);
  return rc;
}

/* If pendingTerms has data, free it. */
static int clearPendingTerms(fulltext_vtab *v){
  if( v->nPendingData>=0 ){
    fts3HashElem *e;
    for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){
      dlcDelete(fts3HashData(e));
    }
    fts3HashClear(&v->pendingTerms);
    v->nPendingData = -1;
  }
  return SQLITE_OK;
}

/* If pendingTerms has data, flush it to a level-zero segment, and
** free it.
*/
static int flushPendingTerms(fulltext_vtab *v){
  if( v->nPendingData>=0 ){
    int rc = writeZeroSegment(v, &v->pendingTerms);
    if( rc==SQLITE_OK ) clearPendingTerms(v);
    return rc;
  }
  return SQLITE_OK;
}

/* If pendingTerms is "too big", or docid is out of order, flush it.
** Regardless, be certain that pendingTerms is initialized for use.
*/
static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
  /* TODO(shess) Explore whether partially flushing the buffer on
  ** forced-flush would provide better performance.  I suspect that if
  ** we ordered the doclists by size and flushed the largest until the
  ** buffer was half empty, that would let the less frequent terms
  ** generate longer doclists.
  */
  if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
    int rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) return rc;
  }
  if( v->nPendingData<0 ){
    fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1);
    v->nPendingData = 0;
  }
  v->iPrevDocid = iDocid;
  return SQLITE_OK;
}

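/* A hedged sketch of the buffering contract above (docid values are
** hypothetical): ascending docids accumulate in pendingTerms until
** kPendingThreshold is exceeded; a non-ascending docid forces a flush
** to a level-0 segment before buffering resumes.
*/
#if 0
  rc = initPendingTerms(v, 42);  /* initializes, iPrevDocid==42 */
  rc = initPendingTerms(v, 43);  /* in order: keeps buffering */
  rc = initPendingTerms(v, 17);  /* 17<=43: flush, then buffer */
#endif
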
/* This function implements the xUpdate callback; it is the top-level entry
 * point for inserting, deleting or updating a row in a full-text table. */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                          sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  FTSTRACE(("FTS3 Update %p\n", pVtab));

  if( nArg<2 ){
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
    if( rc==SQLITE_OK ){
      /* If we just deleted the last row in the table, clear out the
      ** index data.
      */
      rc = content_exists(v);
      if( rc==SQLITE_ROW ){
        rc = SQLITE_OK;
      }else if( rc==SQLITE_DONE ){
        /* Clear the pending terms so we don't flush a useless level-0
        ** segment when the transaction closes.
        */
        rc = clearPendingTerms(v);
        if( rc==SQLITE_OK ){
          rc = segdir_delete_all(v);
        }
      }
    }
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    }else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER ||
              sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the docid */
    }else{
      assert( nArg==2+v->nColumn+2 );
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     * ppArg[2+v->nColumn+1] = value for docid
     */
    sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1];
    assert( nArg==2+v->nColumn+2 );
    if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) &&
        SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){
      /* TODO(shess) Consider allowing this to work if the values are
      ** identical.  I'm inclined to discourage that usage, though,
      ** given that both rowid and docid are special columns.  Better
      ** would be to define one or the other as the default winner,
      ** but should it be fts3-centric (docid) or SQLite-centric
      ** (rowid)?
      */
      rc = SQLITE_ERROR;
    }else{
      if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){
        pRequestDocid = ppArg[1];
      }
      rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid);
    }
  }

  return rc;
}

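/* Illustrative SQL for the three cases above, assuming a table built
** with CREATE VIRTUAL TABLE t USING fts3(content):
**
**   DELETE FROM t WHERE rowid=1;                  -- nArg==1
**   UPDATE t SET content='x' WHERE rowid=1;       -- ppArg[0] non-NULL
**   INSERT INTO t(docid, content) VALUES(5, 'x'); -- ppArg[0] is NULL
*/
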
static int fulltextSync(sqlite3_vtab *pVtab){
  FTSTRACE(("FTS3 xSync()\n"));
  return flushPendingTerms((fulltext_vtab *)pVtab);
}

static int fulltextBegin(sqlite3_vtab *pVtab){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  FTSTRACE(("FTS3 xBegin()\n"));

  /* Any buffered updates should have been cleared by the previous
  ** transaction.
  */
  assert( v->nPendingData<0 );
  return clearPendingTerms(v);
}

static int fulltextCommit(sqlite3_vtab *pVtab){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  FTSTRACE(("FTS3 xCommit()\n"));

  /* Buffered updates should have been cleared by fulltextSync(). */
  assert( v->nPendingData<0 );
  return clearPendingTerms(v);
}

static int fulltextRollback(sqlite3_vtab *pVtab){
  FTSTRACE(("FTS3 xRollback()\n"));
  return clearPendingTerms((fulltext_vtab *)pVtab);
}

/*
** Implementation of the snippet() function for FTS3
*/
static void snippetFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to snippet",-1);
  }else{
    const char *zStart = "<b>";
    const char *zEnd = "</b>";
    const char *zEllipsis = "<b>...</b>";
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    if( argc>=2 ){
      zStart = (const char*)sqlite3_value_text(argv[1]);
      if( argc>=3 ){
        zEnd = (const char*)sqlite3_value_text(argv[2]);
        if( argc>=4 ){
          zEllipsis = (const char*)sqlite3_value_text(argv[3]);
        }
      }
    }
    snippetAllOffsets(pCursor);
    snippetText(pCursor, zStart, zEnd, zEllipsis);
    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
                        pCursor->snippet.nSnippet, SQLITE_STATIC);
  }
}

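/* Example use of snippet(), assuming CREATE VIRTUAL TABLE t USING
** fts3(c):
**
**   SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
**   SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';
**
** The first form uses the <b>/</b>/<b>...</b> defaults above; the
** optional arguments override zStart, zEnd, and zEllipsis in order.
*/
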
/*
** Implementation of the offsets() function for FTS3
*/
static void snippetOffsetsFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
  }else{
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    snippetAllOffsets(pCursor);
    snippetOffsetText(&pCursor->snippet);
    sqlite3_result_text(pContext,
                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                        SQLITE_STATIC);
  }
}

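/* Example use of offsets():
**
**   SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';
**
** The result text produced by snippetOffsetText() holds groups of four
** space-separated integers per hit: column number, query term number,
** byte offset, and byte size of the matching token.
*/
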
/* OptLeavesReader is nearly identical to LeavesReader, except that
** where LeavesReader is geared towards the merging of complete
** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
** is geared towards implementation of the optimize() function, and
** can merge all segments simultaneously.  This version may be
** somewhat less efficient than LeavesReader because it merges into an
** accumulator rather than doing an N-way merge, but since segment
** size grows exponentially (so segment count grows logarithmically)
** this is probably not an immediate problem.
*/
/* TODO(shess): Prove that assertion, or extend the merge code to
** merge in tree fashion (like the prefix-searching code does).
*/
/* TODO(shess): OptLeavesReader and LeavesReader could probably be
** merged with little or no loss of performance for LeavesReader.  The
** merged code would need to handle >MERGE_COUNT segments, and would
** also need to be able to optionally optimize away deletes.
*/
typedef struct OptLeavesReader {
  /* Segment number, to order readers by age. */
  int segment;
  LeavesReader reader;
} OptLeavesReader;

static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
  return leavesReaderAtEnd(&pReader->reader);
}
static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
  return leavesReaderTermBytes(&pReader->reader);
}
static const char *optLeavesReaderData(OptLeavesReader *pReader){
  return leavesReaderData(&pReader->reader);
}
static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
  return leavesReaderDataBytes(&pReader->reader);
}
static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
  return leavesReaderTerm(&pReader->reader);
}
static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
  return leavesReaderStep(v, &pReader->reader);
}
static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
}
/* Order by term ascending, segment ascending (oldest to newest), with
** exhausted readers to the end.
*/
static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  int c = optLeavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;
  return lr1->segment-lr2->segment;
}
/* Bubble pLr[0] to appropriate place in pLr[1..nLr-1].  Assumes that
** pLr[1..nLr-1] is already sorted.
*/
static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
  while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
    OptLeavesReader tmp = pLr[0];
    pLr[0] = pLr[1];
    pLr[1] = tmp;
    nLr--;
    pLr++;
  }
}

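/* Worked example: if pLr[0..2] hold terms "cat","ant","bee" (with
** pLr[1..2] already sorted), one call swaps "cat" past "ant" and then
** past "bee", leaving "ant","bee","cat" -- a single insertion-sort
** step costing at most nLr-1 swaps.
*/
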
/* optimize() helper function.  Put the readers in order and iterate
** through them, merging doclists for matching terms into pWriter.
** Returns SQLITE_OK on success, or the SQLite error code which
** prevented success.
*/
static int optimizeInternal(fulltext_vtab *v,
                            OptLeavesReader *readers, int nReaders,
                            LeafWriter *pWriter){
  int i, rc = SQLITE_OK;
  DataBuffer doclist, merged, tmp;

  /* Order the readers. */
  i = nReaders;
  while( i-- > 0 ){
    optLeavesReaderReorder(&readers[i], nReaders-i);
  }

  dataBufferInit(&doclist, LEAF_MAX);
  dataBufferInit(&merged, LEAF_MAX);

  /* Exhausted readers bubble to the end, so when the first reader is
  ** at eof, all are at eof.
  */
  while( !optLeavesReaderAtEnd(&readers[0]) ){

    /* Figure out how many readers share the next term. */
    for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
      if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
    }

    /* Special-case for no merge. */
    if( i==1 ){
      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT,
                  optLeavesReaderData(&readers[0]),
                  optLeavesReaderDataBytes(&readers[0]),
                  -1, DL_DEFAULT, &merged);
    }else{
      DLReader dlReaders[MERGE_COUNT];
      int iReader, nReaders;

      /* Prime the pipeline with the first reader's doclist.  After
      ** one pass index 0 will reference the accumulated doclist.
      */
      dlrInit(&dlReaders[0], DL_DEFAULT,
              optLeavesReaderData(&readers[0]),
              optLeavesReaderDataBytes(&readers[0]));
      iReader = 1;

      assert( iReader<i );  /* Must execute the loop at least once. */
      while( iReader<i ){
        /* Merge MERGE_COUNT inputs per pass. */
        for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
             iReader++, nReaders++ ){
          dlrInit(&dlReaders[nReaders], DL_DEFAULT,
                  optLeavesReaderData(&readers[iReader]),
                  optLeavesReaderDataBytes(&readers[iReader]));
        }

        /* Merge doclists and swap result into accumulator. */
        dataBufferReset(&merged);
        docListMerge(&merged, dlReaders, nReaders);
        tmp = merged;
        merged = doclist;
        doclist = tmp;

        while( nReaders-- > 0 ){
          dlrDestroy(&dlReaders[nReaders]);
        }

        /* Accumulated doclist to reader 0 for next pass. */
        dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
      }

      /* Destroy reader that was left in the pipeline. */
      dlrDestroy(&dlReaders[0]);

      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  -1, DL_DEFAULT, &merged);
    }

    /* Only pass doclists with hits (skip if all hits deleted). */
    if( merged.nData>0 ){
      rc = leafWriterStep(v, pWriter,
                          optLeavesReaderTerm(&readers[0]),
                          optLeavesReaderTermBytes(&readers[0]),
                          merged.pData, merged.nData);
      if( rc!=SQLITE_OK ) goto err;
    }

    /* Step merged readers to next term and reorder. */
    while( i-- > 0 ){
      rc = optLeavesReaderStep(v, &readers[i]);
      if( rc!=SQLITE_OK ) goto err;

      optLeavesReaderReorder(&readers[i], nReaders-i);
    }
  }

 err:
  dataBufferDestroy(&doclist);
  dataBufferDestroy(&merged);
  return rc;
}

/* Implement optimize() function for FTS3.  optimize(t) merges all
** segments in the fts index into a single segment.  't' is the magic
** table-named column.
*/
static void optimizeFunc(sqlite3_context *pContext,
                         int argc, sqlite3_value **argv){
  fulltext_cursor *pCursor;
  if( argc>1 ){
    sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
  }else{
    fulltext_vtab *v;
    int i, rc, iMaxLevel;
    OptLeavesReader *readers;
    int nReaders;
    LeafWriter writer;
    sqlite3_stmt *s;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    /* Flush any buffered updates before optimizing. */
    rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) goto err;

    rc = segdir_count(v, &nReaders, &iMaxLevel);
    if( rc!=SQLITE_OK ) goto err;
    if( nReaders==0 || nReaders==1 ){
      sqlite3_result_text(pContext, "Index already optimal", -1,
                          SQLITE_STATIC);
      return;
    }

    rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
    if( rc!=SQLITE_OK ) goto err;

    readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
    if( readers==NULL ) goto err;

    /* Note that there will already be a segment at this position
    ** until we call segdir_delete() on iMaxLevel.
    */
    leafWriterInit(iMaxLevel, 0, &writer);

    i = 0;
    while( (rc = sqlite3_step(s))==SQLITE_ROW ){
      sqlite_int64 iStart = sqlite3_column_int64(s, 0);
      sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
      const char *pRootData = sqlite3_column_blob(s, 2);
      int nRootData = sqlite3_column_bytes(s, 2);

      assert( i<nReaders );
      rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
                            &readers[i].reader);
      if( rc!=SQLITE_OK ) break;

      readers[i].segment = i;
      i++;
    }

    /* If we managed to successfully read them all, optimize them. */
    if( rc==SQLITE_DONE ){
      assert( i==nReaders );
      rc = optimizeInternal(v, readers, nReaders, &writer);
    }

    while( i-- > 0 ){
      leavesReaderDestroy(&readers[i].reader);
    }
    sqlite3_free(readers);

    /* If we've successfully gotten to here, delete the old segments
    ** and flush the interior structure of the new segment.
    */
    if( rc==SQLITE_OK ){
      for( i=0; i<=iMaxLevel; i++ ){
        rc = segdir_delete(v, i);
        if( rc!=SQLITE_OK ) break;
      }

      if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
    }

    leafWriterDestroy(&writer);

    if( rc!=SQLITE_OK ) goto err;

    sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
    return;

    /* TODO(shess): Error-handling needs to be improved along the
    ** lines of the dump_ functions.
    */
 err:
    {
      char buf[512];
      sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
                       sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
      sqlite3_result_error(pContext, buf, -1);
    }
  }
}

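/* Example use, assuming CREATE VIRTUAL TABLE t USING fts3(c) and at
** least one row in t:
**
**   SELECT optimize(t) FROM t LIMIT 1;
**
** This returns 'Index optimized', or 'Index already optimal' when the
** index already contains at most one segment.
*/
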
#ifdef SQLITE_TEST
/* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
** pull the error from the context's db handle.
*/
static void generateError(sqlite3_context *pContext,
                          const char *prefix, const char *msg){
  char buf[512];
  if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
  sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
  sqlite3_result_error(pContext, buf, -1);
}

/* Helper function to collect the set of terms in the segment into
** pTerms.  The segment is defined by the leaf nodes between
** iStartBlockid and iEndBlockid, inclusive, or by the contents of
** pRootData if iStartBlockid is 0 (in which case the entire segment
** fit in a leaf).
*/
static int collectSegmentTerms(fulltext_vtab *v, sqlite3_stmt *s,
                               fts3Hash *pTerms){
  const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
  const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
  const char *pRootData = sqlite3_column_blob(s, 2);
  const int nRootData = sqlite3_column_bytes(s, 2);
  LeavesReader reader;
  int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
                            pRootData, nRootData, &reader);
  if( rc!=SQLITE_OK ) return rc;

  while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
    const char *pTerm = leavesReaderTerm(&reader);
    const int nTerm = leavesReaderTermBytes(&reader);
    void *oldValue = sqlite3Fts3HashFind(pTerms, pTerm, nTerm);
    void *newValue = (void *)((char *)oldValue+1);

    /* From the comment before sqlite3Fts3HashInsert in fts3_hash.c,
    ** the data value passed is returned in case of malloc failure.
    */
    if( newValue==sqlite3Fts3HashInsert(pTerms, pTerm, nTerm, newValue) ){
      rc = SQLITE_NOMEM;
    }else{
      rc = leavesReaderStep(v, &reader);
    }
  }

  leavesReaderDestroy(&reader);
  return rc;
}

/* Helper function to build the result string for dump_terms(). */
static int generateTermsResult(sqlite3_context *pContext, fts3Hash *pTerms){
  int iTerm, nTerms, nResultBytes, iByte;
  char *result;
  TermData *pData;
  fts3HashElem *e;

  /* Iterate pTerms to generate an array of terms in pData for
  ** sorting.
  */
  nTerms = fts3HashCount(pTerms);
  assert( nTerms>0 );
  pData = sqlite3_malloc(nTerms*sizeof(TermData));
  if( pData==NULL ) return SQLITE_NOMEM;

  nResultBytes = 0;
  for(iTerm = 0, e = fts3HashFirst(pTerms); e; iTerm++, e = fts3HashNext(e)){
    nResultBytes += fts3HashKeysize(e)+1;   /* Term plus trailing space */
    assert( iTerm<nTerms );
    pData[iTerm].pTerm = fts3HashKey(e);
    pData[iTerm].nTerm = fts3HashKeysize(e);
    pData[iTerm].pCollector = fts3HashData(e);  /* unused */
  }
  assert( iTerm==nTerms );

  assert( nResultBytes>0 );   /* nTerms>0, so nResultBytes must be, too. */
  result = sqlite3_malloc(nResultBytes);
  if( result==NULL ){
    sqlite3_free(pData);
    return SQLITE_NOMEM;
  }

  if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);

  /* Read the terms in order to build the result. */
  iByte = 0;
  for(iTerm=0; iTerm<nTerms; ++iTerm){
    memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
    iByte += pData[iTerm].nTerm;
    result[iByte++] = ' ';
  }
  assert( iByte==nResultBytes );
  assert( result[nResultBytes-1]==' ' );
  result[nResultBytes-1] = '\0';

  /* Passes away ownership of result. */
  sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
  sqlite3_free(pData);
  return SQLITE_OK;
}

/* Implements dump_terms() for use in inspecting the fts3 index from
** tests.  TEXT result containing the ordered list of terms joined by
** spaces.  dump_terms(t, level, idx) dumps the terms for the segment
** specified by level, idx (in %_segdir), while dump_terms(t) dumps
** all terms in the index.  In both cases t is the fts table's magic
** table-named column.
*/
static void dumpTermsFunc(
  sqlite3_context *pContext,
  int argc, sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc!=3 && argc!=1 ){
    generateError(pContext, "dump_terms", "incorrect arguments");
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    generateError(pContext, "dump_terms", "illegal first argument");
  }else{
    fulltext_vtab *v;
    fts3Hash terms;
    sqlite3_stmt *s = NULL;
    int rc;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    /* If passed only the cursor column, get all segments.  Otherwise
    ** get the segment described by the following two arguments.
    */
    if( argc==1 ){
      rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
    }else{
      rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
      if( rc==SQLITE_OK ){
        rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
        if( rc==SQLITE_OK ){
          rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
        }
      }
    }

    if( rc!=SQLITE_OK ){
      generateError(pContext, "dump_terms", NULL);
      return;
    }

    /* Collect the terms for each segment. */
    sqlite3Fts3HashInit(&terms, FTS3_HASH_STRING, 1);
    while( (rc = sqlite3_step(s))==SQLITE_ROW ){
      rc = collectSegmentTerms(v, s, &terms);
      if( rc!=SQLITE_OK ) break;
    }

    if( rc!=SQLITE_DONE ){
      sqlite3_reset(s);
      generateError(pContext, "dump_terms", NULL);
    }else{
      const int nTerms = fts3HashCount(&terms);
      if( nTerms>0 ){
        rc = generateTermsResult(pContext, &terms);
        if( rc==SQLITE_NOMEM ){
          generateError(pContext, "dump_terms", "out of memory");
        }else{
          assert( rc==SQLITE_OK );
        }
      }else if( argc==3 ){
        /* The specific segment asked for could not be found. */
        generateError(pContext, "dump_terms", "segment not found");
      }else{
        /* No segments found. */
        /* TODO(shess): It should be impossible to reach this.  This
        ** case can only happen for an empty table, in which case
        ** SQLite has no rows to call this function on.
        */
        sqlite3_result_null(pContext);
      }
    }
    sqlite3Fts3HashClear(&terms);
  }
}

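/* Example test usage:
**
**   SELECT dump_terms(t) FROM t LIMIT 1;        -- all terms
**   SELECT dump_terms(t, 0, 0) FROM t LIMIT 1;  -- level-0 segment 0
*/
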
/* Expand the DL_DEFAULT doclist in pData into a text result in
** pContext.
*/
static void createDoclistResult(sqlite3_context *pContext,
                                const char *pData, int nData){
  DataBuffer dump;
  DLReader dlReader;

  assert( pData!=NULL && nData>0 );

  dataBufferInit(&dump, 0);
  dlrInit(&dlReader, DL_DEFAULT, pData, nData);
  for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
    char buf[256];
    PLReader plReader;

    plrInit(&plReader, &dlReader);
    if( DL_DEFAULT==DL_DOCIDS || plrAtEnd(&plReader) ){
      sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
      dataBufferAppend(&dump, buf, strlen(buf));
    }else{
      int iColumn = plrColumn(&plReader);

      sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
                       dlrDocid(&dlReader), iColumn);
      dataBufferAppend(&dump, buf, strlen(buf));

      for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
        if( plrColumn(&plReader)!=iColumn ){
          iColumn = plrColumn(&plReader);
          sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
          assert( dump.nData>0 );
          dump.nData--;                     /* Overwrite trailing space. */
          assert( dump.pData[dump.nData]==' ' );
          dataBufferAppend(&dump, buf, strlen(buf));
        }
        if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
                           plrPosition(&plReader),
                           plrStartOffset(&plReader), plrEndOffset(&plReader));
        }else if( DL_DEFAULT==DL_POSITIONS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
        }else{
          assert( NULL=="Unhandled DL_DEFAULT value");
        }
        dataBufferAppend(&dump, buf, strlen(buf));
      }
      plrDestroy(&plReader);

      assert( dump.nData>0 );
      dump.nData--;                     /* Overwrite trailing space. */
      assert( dump.pData[dump.nData]==' ' );
      dataBufferAppend(&dump, "]] ", 3);
    }
  }
  dlrDestroy(&dlReader);

  assert( dump.nData>0 );
  dump.nData--;                     /* Overwrite trailing space. */
  assert( dump.pData[dump.nData]==' ' );
  dump.pData[dump.nData] = '\0';
  assert( dump.nData>0 );

  /* Passes ownership of dump's buffer to pContext. */
  sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
  dump.pData = NULL;
  dump.nData = dump.nCapacity = 0;
}

/* Implements dump_doclist() for use in inspecting the fts3 index from
** tests.  TEXT result containing a string representation of the
** doclist for the indicated term.  dump_doclist(t, term, level, idx)
** dumps the doclist for term from the segment specified by level, idx
** (in %_segdir), while dump_doclist(t, term) dumps the logical
** doclist for the term across all segments.  The per-segment doclist
** can contain deletions, while the full-index doclist will not
** (deletions are omitted).
**
** Result formats differ with the setting of DL_DEFAULT.  Examples:
**
** DL_DOCIDS: [1] [3] [7]
** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
**
** In each case the number after the outer '[' is the docid.  In the
** latter two cases, the number before the inner '[' is the column
** associated with the values within.  For DL_POSITIONS the numbers
** within are the positions, for DL_POSITIONS_OFFSETS they are the
** position, the start offset, and the end offset.
*/
static void dumpDoclistFunc(
  sqlite3_context *pContext,
  int argc, sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc!=2 && argc!=4 ){
    generateError(pContext, "dump_doclist", "incorrect arguments");
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    generateError(pContext, "dump_doclist", "illegal first argument");
  }else if( sqlite3_value_text(argv[1])==NULL ||
            sqlite3_value_text(argv[1])[0]=='\0' ){
    generateError(pContext, "dump_doclist", "empty second argument");
  }else{
    const char *pTerm = (const char *)sqlite3_value_text(argv[1]);
    const int nTerm = strlen(pTerm);
    fulltext_vtab *v;
    int rc;
    DataBuffer doclist;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    dataBufferInit(&doclist, 0);

    /* termSelect() yields the same logical doclist that queries are
    ** run against.
    */
    if( argc==2 ){
      rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
    }else{
      sqlite3_stmt *s = NULL;

      /* Get our specific segment's information. */
      rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
      if( rc==SQLITE_OK ){
        rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
        if( rc==SQLITE_OK ){
          rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
        }
      }

      if( rc==SQLITE_OK ){
        rc = sqlite3_step(s);

        if( rc==SQLITE_DONE ){
          dataBufferDestroy(&doclist);
          generateError(pContext, "dump_doclist", "segment not found");
          return;
        }

        /* Found a segment, load it into doclist. */
        if( rc==SQLITE_ROW ){
          const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
          const char *pData = sqlite3_column_blob(s, 2);
          const int nData = sqlite3_column_bytes(s, 2);

          /* loadSegment() is used by termSelect() to load each
          ** segment's data.
          */
          rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
                           &doclist);
          if( rc==SQLITE_OK ){
            rc = sqlite3_step(s);

            /* Should not have more than one matching segment. */
            if( rc!=SQLITE_DONE ){
              sqlite3_reset(s);
              dataBufferDestroy(&doclist);
              generateError(pContext, "dump_doclist", "invalid segdir");
              return;
            }
            rc = SQLITE_OK;
          }
        }
      }

      sqlite3_reset(s);
    }

    if( rc==SQLITE_OK ){
      if( doclist.nData>0 ){
        createDoclistResult(pContext, doclist.pData, doclist.nData);
      }else{
        /* TODO(shess): This can happen if the term is not present, or
        ** if all instances of the term have been deleted and this is
        ** an all-index dump.  It may be interesting to distinguish
        ** these cases.
        */
        sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
      }
    }else if( rc==SQLITE_NOMEM ){
      /* Handle out-of-memory cases specially because if they are
      ** generated in fts3 code they may not be reflected in the db
      ** handle.
      */
      /* TODO(shess): Handle this more comprehensively.
      ** sqlite3ErrStr() has what I need, but is internal.
      */
      generateError(pContext, "dump_doclist", "out of memory");
    }else{
      generateError(pContext, "dump_doclist", NULL);
    }

    dataBufferDestroy(&doclist);
  }
}
#endif

/*
** This routine implements the xFindFunction method for the FTS3
** virtual table.
*/
static int fulltextFindFunction(
  sqlite3_vtab *pVtab,
  int nArg,
  const char *zName,
  void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
  void **ppArg
){
  if( strcmp(zName,"snippet")==0 ){
    *pxFunc = snippetFunc;
    return 1;
  }else if( strcmp(zName,"offsets")==0 ){
    *pxFunc = snippetOffsetsFunc;
    return 1;
  }else if( strcmp(zName,"optimize")==0 ){
    *pxFunc = optimizeFunc;
    return 1;
#ifdef SQLITE_TEST
    /* NOTE(shess): These functions are present only for testing
    ** purposes.  No particular effort is made to optimize their
    ** execution or how they build their results.
    */
  }else if( strcmp(zName,"dump_terms")==0 ){
    /* fprintf(stderr, "Found dump_terms\n"); */
    *pxFunc = dumpTermsFunc;
    return 1;
  }else if( strcmp(zName,"dump_doclist")==0 ){
    /* fprintf(stderr, "Found dump_doclist\n"); */
    *pxFunc = dumpDoclistFunc;
    return 1;
#endif
  }
  return 0;
}

/*
** Rename an fts3 table.
*/
static int fulltextRename(
  sqlite3_vtab *pVtab,
  const char *zName
){
  fulltext_vtab *p = (fulltext_vtab *)pVtab;
  int rc = SQLITE_NOMEM;
  char *zSql = sqlite3_mprintf(
    "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
    "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
    "ALTER TABLE %Q.'%q_segdir'   RENAME TO '%q_segdir';"
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
  );
  if( zSql ){
    rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
    sqlite3_free(zSql);
  }
  return rc;
}

static const sqlite3_module fts3Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};

static void hashDestroy(void *p){
  fts3Hash *pHash = (fts3Hash *)p;
  sqlite3Fts3HashClear(pHash);
  sqlite3_free(pHash);
}

/*
** The fts3 built-in tokenizers - "simple" and "porter" - are implemented
** in files fts3_tokenizer1.c and fts3_porter.c respectively.  The following
** two forward declarations are for functions declared in these files
** used to retrieve the respective implementations.
**
** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
** Function ...PorterTokenizerModule() sets *pModule to point to the
** porter tokenizer/stemmer implementation.
*/
void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);

int sqlite3Fts3InitHashTable(sqlite3 *, fts3Hash *, const char *);

/*
** Initialise the fts3 extension.  If this extension is built as part
** of the sqlite library, then this function is called directly by
** SQLite.  If fts3 is built as a dynamically loadable extension, this
** function is called by the sqlite3_extension_init() entry point.
*/
int sqlite3Fts3Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts3Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;

  sqlite3Fts3SimpleTokenizerModule(&pSimple);
  sqlite3Fts3PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  sqlite3Fts3IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialise the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts3Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table */
  if( rc==SQLITE_OK ){
    if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the scalar functions.  If this is successful, register the
  ** module with sqlite.
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
#ifdef SQLITE_TEST
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
#endif
  ){
    return sqlite3_create_module_v2(
        db, "fts3", &fts3Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred.  Delete the hash table and return the error
  ** code.
  */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts3HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}

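/* A hedged sketch (not part of the module) of loading fts3 as a
** run-time extension; the shared-library name "fts3.so" is an
** assumption, and error handling is elided.
*/
#if 0
  sqlite3 *db = 0;
  char *zErrMsg = 0;
  sqlite3_open(":memory:", &db);
  sqlite3_enable_load_extension(db, 1);
  sqlite3_load_extension(db, "fts3.so", "sqlite3_extension_init", &zErrMsg);
#endif
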
#if !SQLITE_CORE
int sqlite3_extension_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts3Init(db);
}
#endif

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */