|
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
16 |
|
17 #ifndef _lucene_search_highlight_tokensources_ |
|
18 #define _lucene_search_highlight_tokensources_ |
|
19 |
|
20 #if defined(_LUCENE_PRAGMA_ONCE) |
|
21 # pragma once |
|
22 #endif |
|
23 |
|
24 #include "CLucene/analysis/AnalysisHeader.h" |
|
25 #include "CLucene/index/IndexReader.h" |
|
26 #include "CLucene/index/TermVector.h" |
|
27 #include "CLucene/analysis/AnalysisHeader.h" |
|
28 |
|
29 CL_NS_DEF2(search,highlight) |
|
30 |
|
31 class TokenSources: LUCENE_BASE |
|
32 { |
|
33 //an object used to iterate across an array of tokens |
|
34 class StoredTokenStream:public CL_NS(analysis)::TokenStream |
|
35 { |
|
36 public: |
|
37 CL_NS(analysis)::Token** tokens; |
|
38 size_t length; |
|
39 int32_t currentToken; |
|
40 StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len); |
|
41 bool next(CL_NS(analysis)::Token* token); |
|
42 void close(); |
|
43 }; |
|
44 public: |
|
45 TokenSources(void); |
|
46 ~TokenSources(void); |
|
47 |
|
48 /** |
|
49 * A convenience method that tries a number of approaches to getting a token stream. |
|
50 * The cost of finding there are no termVectors in the index is minimal (1000 invocations still |
|
51 * registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable |
|
52 * @param reader |
|
53 * @param docId |
|
54 * @param field |
|
55 * @param analyzer |
|
56 * @return null if field not stored correctly |
|
57 * @throws IOException |
|
58 */ |
|
59 static CL_NS(analysis)::TokenStream* getAnyTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field, CL_NS(analysis)::Analyzer* analyzer); |
|
60 |
|
61 static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv); |
|
62 |
|
63 /** |
|
64 * Low level api. |
|
65 * Returns a token stream or null if no offset info available in index. |
|
66 * This can be used to feed the highlighter with a pre-parsed token stream |
|
67 * |
|
68 * In my tests the speeds to recreate 1000 token streams using this method are: |
|
69 * - with TermVector offset only data stored - 420 milliseconds |
|
70 * - with TermVector offset AND position data stored - 271 milliseconds |
|
71 * (nb timings for TermVector with position data are based on a tokenizer with contiguous |
|
72 * positions - no overlaps or gaps) |
|
73 * The cost of not using TermPositionVector to store |
|
74 * pre-parsed content and using an analyzer to re-parse the original content: |
|
75 * - reanalyzing the original content - 980 milliseconds |
|
76 * |
|
77 * The re-analyze timings will typically vary depending on - |
|
78 * 1) The complexity of the analyzer code (timings above were using a |
|
79 * stemmer/lowercaser/stopword combo) |
|
80 * 2) The number of other fields (Lucene reads ALL fields off the disk |
|
81 * when accessing just one document field - can cost dear!) |
|
82 * 3) Use of compression on field storage - could be faster cos of compression (less disk IO) |
|
83 * or slower (more CPU burn) depending on the content. |
|
84 * |
|
85 * @param tpv |
|
86 * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking |
|
87 * to eek out the last drops of performance, set to true. If in doubt, set to false. |
|
88 */ |
|
89 static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous); |
|
90 |
|
91 static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field); |
|
92 |
|
93 //convenience method |
|
94 static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field,CL_NS(analysis)::Analyzer* analyzer); |
|
95 }; |
|
96 |
|
97 CL_NS_END2 |
|
98 #endif |