|
1 /** |
|
2 * Copyright 2002-2004 The Apache Software Foundation |
|
3 * |
|
4 * Licensed under the Apache License, Version 2.0 (the "License"); |
|
5 * you may not use this file except in compliance with the License. |
|
6 * You may obtain a copy of the License at |
|
7 * |
|
8 * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 * |
|
10 * Unless required by applicable law or agreed to in writing, software |
|
11 * distributed under the License is distributed on an "AS IS" BASIS, |
|
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
13 * See the License for the specific language governing permissions and |
|
14 * limitations under the License. |
|
15 */ |
|
16 |
|
17 #ifndef _lucene_search_highlight_highlighter_ |
|
18 #define _lucene_search_highlight_highlighter_ |
|
19 |
|
20 #if defined(_LUCENE_PRAGMA_ONCE) |
|
21 # pragma once |
|
22 #endif |
|
23 |
|
24 #include "CLucene/util/StringBuffer.h" |
|
25 #include "CLucene/util/PriorityQueue.h" |
|
26 #include "CLucene/util/VoidList.h" |
|
27 #include "CLucene/highlighter/Formatter.h" |
|
28 #include "CLucene/highlighter/Encoder.h" |
|
29 #include "CLucene/highlighter/SimpleHTMLFormatter.h" |
|
30 #include "CLucene/highlighter/Fragmenter.h" |
|
31 #include "CLucene/highlighter/HighlightScorer.h" |
|
32 #include "CLucene/highlighter/SimpleFragmenter.h" |
|
33 #include "CLucene/highlighter/TextFragment.h" |
|
34 |
|
35 CL_NS_DEF2(search,highlight) |
|
36 |
|
37 /** |
|
38 * Class used to markup highlighted terms found in the best sections of a |
|
39 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, |
|
40 * and tokenizers. |
|
41 * {@link Encoder} and tokenizers. |
|
42 */ |
|
43 class Highlighter :LUCENE_BASE |
|
44 { |
|
45 private: |
|
46 int32_t maxDocBytesToAnalyze; |
|
47 |
|
48 Formatter * _formatter; |
|
49 bool delete_formatter; |
|
50 |
|
51 Encoder* _encoder; |
|
52 bool delete_encoder; |
|
53 |
|
54 Fragmenter * _textFragmenter; |
|
55 bool delete_textFragmenter; |
|
56 |
|
57 HighlightScorer * _fragmentScorer; |
|
58 bool delete_fragmentScorer; |
|
59 |
|
60 /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
|
61 * that were contiguous in the original text into one larger fragment with the correct order. |
|
62 * This will leave a "null" in the array entry for the lesser scored fragment. |
|
63 * |
|
64 * @param frag An array of document fragments in descending score |
|
65 */ |
|
66 void _mergeContiguousFragments(TextFragment** frag, int32_t fragsLen); |
|
67 |
|
68 public: |
|
69 LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024); |
|
70 |
|
71 /** |
|
72 * Constructs a Highlighter object with the provided scorer. The HighlightScorer object is owned |
|
73 * by the Highlighter object, and it will freed in the destructor. |
|
74 */ |
|
75 Highlighter(HighlightScorer * fragmentScorer); |
|
76 |
|
77 Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer); |
|
78 |
|
79 Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer); |
|
80 |
|
81 |
|
82 /** |
|
83 * Destructor for Highlighter. It deletes the owned HighlightScorer, formatter and textFragmenter. |
|
84 */ |
|
85 ~Highlighter(); |
|
86 |
|
87 /** |
|
88 * Highlights chosen terms in a text, extracting the most relevant section. |
|
89 * The document text is analysed in chunks to record hit statistics |
|
90 * across the document. After accumulating stats, the fragment with the highest score |
|
91 * is returned |
|
92 * |
|
93 * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
|
94 * This is typically produced by an analyzer re-parsing a document's |
|
95 * text. Some work may be done on retrieving TokenStreams more efficently |
|
96 * by adding support for storing original text position data in the Lucene |
|
97 * index but this support is not currently available (as of Lucene 1.4 rc2). |
|
98 * @param text text to highlight terms in |
|
99 * |
|
100 * @return highlighted text fragment or null if no terms found |
|
101 */ |
|
102 TCHAR* getBestFragment(CL_NS(analysis)::TokenStream * tokenStream, const TCHAR* text); |
|
103 |
|
104 /** |
|
105 * Highlights chosen terms in a text, extracting the most relevant section. |
|
106 * This is a convenience method that calls |
|
107 * {@link #getBestFragment(TokenStream, const TCHAR*)} |
|
108 * |
|
109 * @param analyzer the analyzer that will be used to split <code>text</code> |
|
110 * into chunks |
|
111 * @param text text to highlight terms in |
|
112 * @param fieldName Name of field used to influence analyzer's tokenization policy |
|
113 * |
|
114 * @return highlighted text fragment or null if no terms found |
|
115 */ |
|
116 TCHAR* getBestFragment(CL_NS(analysis)::Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text); |
|
117 |
|
118 /** |
|
119 * Highlights chosen terms in a text, extracting the most relevant sections. |
|
120 * This is a convenience method that calls |
|
121 * {@link #getBestFragments(TokenStream, const TCHAR*, int)} |
|
122 * |
|
123 * @param analyzer the analyzer that will be used to split <code>text</code> |
|
124 * into chunks |
|
125 * @param text text to highlight terms in |
|
126 * @param maxNumFragments the maximum number of fragments. |
|
127 * |
|
128 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
|
129 */ |
|
130 TCHAR** getBestFragments( |
|
131 CL_NS(analysis)::Analyzer* analyzer, |
|
132 const TCHAR* text, |
|
133 int32_t maxNumFragments); |
|
134 |
|
135 /** |
|
136 * Highlights chosen terms in a text, extracting the most relevant sections. |
|
137 * The document text is analysed in chunks to record hit statistics |
|
138 * across the document. After accumulating stats, the fragments with the highest scores |
|
139 * are returned as an array of strings in order of score (contiguous fragments are merged into |
|
140 * one in their original order to improve readability) |
|
141 * |
|
142 * @param text text to highlight terms in |
|
143 * @param maxNumFragments the maximum number of fragments. |
|
144 * |
|
145 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
|
146 */ |
|
147 TCHAR** getBestFragments( |
|
148 CL_NS(analysis)::TokenStream * tokenStream, |
|
149 const TCHAR* text, |
|
150 int32_t maxNumFragments); |
|
151 |
|
152 /** |
|
153 * Low level api to get the most relevant (formatted) sections of the document. |
|
154 * This method has been made public to allow visibility of score information held in TextFragment objects. |
|
155 * Thanks to Jason Calabrese for help in redefining the interface. |
|
156 * @param tokenStream |
|
157 * @param text |
|
158 * @param maxNumFragments |
|
159 * @param mergeContiguousFragments |
|
160 */ |
|
161 TextFragment** getBestTextFragments( |
|
162 CL_NS(util)::StringBuffer* writeTo, |
|
163 CL_NS(analysis)::TokenStream * tokenStream, |
|
164 const TCHAR* text, |
|
165 bool mergeContiguousFragments, |
|
166 int32_t maxNumFragments); |
|
167 |
|
168 /** |
|
169 * Highlights terms in the text , extracting the most relevant sections |
|
170 * and concatenating the chosen fragments with a separator (typically "..."). |
|
171 * The document text is analysed in chunks to record hit statistics |
|
172 * across the document. After accumulating stats, the fragments with the highest scores |
|
173 * are returned in order as "separator" delimited strings. |
|
174 * |
|
175 * @param text text to highlight terms in |
|
176 * @param maxNumFragments the maximum number of fragments. |
|
177 * @param separator the separator used to intersperse the document fragments (typically "...") |
|
178 * |
|
179 * @return highlighted text |
|
180 */ |
|
181 TCHAR* getBestFragments( |
|
182 CL_NS(analysis)::TokenStream * tokenStream, |
|
183 const TCHAR* text, |
|
184 int32_t maxNumFragments, |
|
185 const TCHAR* separator); |
|
186 |
|
187 /** |
|
188 * @return the maximum number of bytes to be tokenized per doc |
|
189 */ |
|
190 int32_t getMaxDocBytesToAnalyze() |
|
191 { |
|
192 return maxDocBytesToAnalyze; |
|
193 } |
|
194 |
|
195 /** |
|
196 * @param byteCount the maximum number of bytes to be tokenized per doc |
|
197 * (This can improve performance with large documents) |
|
198 */ |
|
199 void setMaxDocBytesToAnalyze(int32_t byteCount) |
|
200 { |
|
201 maxDocBytesToAnalyze = byteCount; |
|
202 } |
|
203 |
|
204 /** |
|
205 */ |
|
206 Fragmenter * getTextFragmenter() |
|
207 { |
|
208 return _textFragmenter; |
|
209 } |
|
210 |
|
211 /** |
|
212 * @param fragmenter |
|
213 */ |
|
214 void setTextFragmenter(Fragmenter * fragmenter) |
|
215 { |
|
216 if ( delete_textFragmenter ){ |
|
217 _CLDELETE(_textFragmenter); |
|
218 delete_textFragmenter = false; |
|
219 } |
|
220 _textFragmenter = fragmenter; |
|
221 } |
|
222 |
|
223 /** |
|
224 * @return Object used to score each text fragment |
|
225 */ |
|
226 HighlightScorer * getFragmentScorer() |
|
227 { |
|
228 return _fragmentScorer; |
|
229 } |
|
230 |
|
231 |
|
232 /** |
|
233 * @param HighlightScorer |
|
234 */ |
|
235 void setFragmentScorer(HighlightScorer * scorer) |
|
236 { |
|
237 if ( delete_fragmentScorer ){ |
|
238 delete_fragmentScorer = false; |
|
239 _CLDELETE(scorer); |
|
240 } |
|
241 _fragmentScorer = scorer; |
|
242 } |
|
243 |
|
244 |
|
245 Encoder* getEncoder() |
|
246 { |
|
247 return _encoder; |
|
248 } |
|
249 void setEncoder(Encoder* encoder) |
|
250 { |
|
251 if ( delete_encoder ){ |
|
252 _CLDELETE(encoder); |
|
253 delete_encoder = false; |
|
254 } |
|
255 this->_encoder = encoder; |
|
256 } |
|
257 |
|
258 |
|
259 }; |
|
260 |
|
261 |
|
262 CL_NS_END2 |
|
263 |
|
264 #endif |
|
265 |