|
1 /*------------------------------------------------------------------------------ |
|
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
|
3 * |
|
4 * Distributable under the terms of either the Apache License (Version 2.0) or |
|
5 * the GNU Lesser General Public License, as specified in the COPYING file. |
|
6 ------------------------------------------------------------------------------*/ |
|
7 #ifndef _lucene_analysis_Analyzers_ |
|
8 #define _lucene_analysis_Analyzers_ |
|
9 |
|
10 #if defined(_LUCENE_PRAGMA_ONCE) |
|
11 # pragma once |
|
12 #endif |
|
13 |
|
#include "CLucene/util/Reader.h"
#include "CLucene/util/Misc.h"
#include "CLucene/analysis/AnalysisHeader.h"
|
17 |
|
18 CL_NS_DEF(analysis) |
|
19 |
|
20 /** An abstract base class for simple, character-oriented tokenizers.*/ |
|
21 class CharTokenizer:public Tokenizer { |
|
22 private: |
|
23 int32_t offset, bufferIndex, dataLen; |
|
24 TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; |
|
25 const TCHAR* ioBuffer; |
|
26 protected: |
|
27 |
|
28 /** Returns true iff a character should be included in a token. This |
|
29 * tokenizer generates as tokens adjacent sequences of characters which |
|
30 * satisfy this predicate. Characters for which this is false are used to |
|
31 * define token boundaries and are not included in tokens. */ |
|
32 virtual bool isTokenChar(const TCHAR c) const = 0; |
|
33 |
|
34 /** Called on each token character to normalize it before it is added to the |
|
35 * token. The default implementation does nothing. Subclasses may use this |
|
36 * to, e.g., lowercase tokens. */ |
|
37 virtual TCHAR normalize(const TCHAR c) const; |
|
38 |
|
39 public: |
|
40 CharTokenizer(CL_NS(util)::Reader* in); |
|
41 virtual ~CharTokenizer(){ |
|
42 } |
|
43 bool next(Token* token); |
|
44 }; |
|
45 |
|
46 |
|
47 /** A LetterTokenizer is a tokenizer that divides text at non-letters. That's |
|
48 to say, it defines tokens as maximal strings of adjacent letters, as defined |
|
49 by java.lang.Character.isLetter() predicate. |
|
50 |
|
51 Note: this does a decent job for most European languages, but does a terrible |
|
52 job for some Asian languages, where words are not separated by spaces. */ |
|
53 class LetterTokenizer:public CharTokenizer { |
|
54 public: |
|
55 // Construct a new LetterTokenizer. |
|
56 LetterTokenizer(CL_NS(util)::Reader* in): |
|
57 CharTokenizer(in) {} |
|
58 |
|
59 ~LetterTokenizer(){} |
|
60 protected: |
|
61 /** Collects only characters which satisfy _istalpha.*/ |
|
62 bool isTokenChar(const TCHAR c) const; |
|
63 }; |
|
64 |
|
65 |
|
66 |
|
67 /** |
|
68 * LowerCaseTokenizer performs the function of LetterTokenizer |
|
69 * and LowerCaseFilter together. It divides text at non-letters and converts |
|
70 * them to lower case. While it is functionally equivalent to the combination |
|
71 * of LetterTokenizer and LowerCaseFilter, there is a performance advantage |
|
72 * to doing the two tasks at once, hence this (redundant) implementation. |
|
73 * <P> |
|
74 * Note: this does a decent job for most European languages, but does a terrible |
|
75 * job for some Asian languages, where words are not separated by spaces. |
|
76 */ |
|
77 class LowerCaseTokenizer:public LetterTokenizer { |
|
78 public: |
|
79 /** Construct a new LowerCaseTokenizer. */ |
|
80 LowerCaseTokenizer(CL_NS(util)::Reader* in): |
|
81 LetterTokenizer(in) {} |
|
82 |
|
83 ~LowerCaseTokenizer(){} |
|
84 protected: |
|
85 /** Collects only characters which satisfy _totlower. */ |
|
86 TCHAR normalize(const TCHAR chr) const; |
|
87 }; |
|
88 |
|
89 |
|
90 /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace. |
|
91 * Adjacent sequences of non-Whitespace characters form tokens. */ |
|
92 class WhitespaceTokenizer: public CharTokenizer { |
|
93 public: |
|
94 /** Construct a new WhitespaceTokenizer. */ |
|
95 WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {} |
|
96 ~WhitespaceTokenizer(){} |
|
97 protected: |
|
98 /** Collects only characters which do not satisfy _istspace. |
|
99 */ |
|
100 bool isTokenChar(const TCHAR c) const; |
|
101 }; |
|
102 |
|
103 |
|
104 /** An Analyzer that uses WhitespaceTokenizer. */ |
|
105 class WhitespaceAnalyzer: public Analyzer { |
|
106 public: |
|
107 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
108 ~WhitespaceAnalyzer(){} |
|
109 }; |
|
110 |
|
111 /** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */ |
|
112 class SimpleAnalyzer: public Analyzer { |
|
113 public: |
|
114 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
115 ~SimpleAnalyzer(){} |
|
116 }; |
|
117 |
|
118 |
|
119 |
|
120 /** |
|
121 * Normalizes token text to lower case. |
|
122 */ |
|
123 class LowerCaseFilter: public TokenFilter { |
|
124 public: |
|
125 LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {} |
|
126 ~LowerCaseFilter(){} |
|
127 bool next(Token* token); |
|
128 }; |
|
129 |
|
130 |
|
131 /** |
|
132 * Removes stop words from a token stream. |
|
133 */ |
|
134 class StopFilter: public TokenFilter { |
|
135 private: |
|
136 //bvk: i found this to work faster with a non-hash table. the number of items |
|
137 //in the stop table is not like to make it worth having hashing. |
|
138 CL_NS(util)::CLSetList<const TCHAR*>* table; |
|
139 bool ownTable; |
|
140 public: |
|
141 // Constructs a filter which removes words from the input |
|
142 // TokenStream that are named in the array of words. |
|
143 StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords); |
|
144 |
|
145 ~StopFilter(); |
|
146 |
|
147 /** Constructs a filter which removes words from the input |
|
148 * TokenStream that are named in the CLSetList. |
|
149 */ |
|
150 StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable): |
|
151 TokenFilter(in, deleteTokenStream), |
|
152 table(stopTable), |
|
153 ownTable(false) |
|
154 {} |
|
155 |
|
156 |
|
157 /** |
|
158 * Builds a Hashtable from an array of stop words, appropriate for passing |
|
159 * into the StopFilter constructor. This permits this table construction to |
|
160 * be cached once when an Analyzer is constructed. |
|
161 * Note: the stopWords list must be a static list because the strings are not copied |
|
162 */ |
|
163 static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable, |
|
164 const TCHAR** stopWords); |
|
165 |
|
166 /** |
|
167 * Returns the next input Token whose termText() is not a stop word. |
|
168 */ |
|
169 bool next(Token* token); |
|
170 }; |
|
171 |
|
172 |
|
173 |
|
174 |
|
175 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */ |
|
176 class StopAnalyzer: public Analyzer { |
|
177 CL_NS(util)::CLSetList<const TCHAR*> stopTable; |
|
178 |
|
179 public: |
|
180 /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */ |
|
181 StopAnalyzer(); |
|
182 ~StopAnalyzer(); |
|
183 |
|
184 /** Builds an analyzer which removes words in the provided array. */ |
|
185 StopAnalyzer( const TCHAR** stopWords ); |
|
186 /** Filters LowerCaseTokenizer with StopFilter. */ |
|
187 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
188 |
|
189 /** An array containing some common English words that are not usually useful |
|
190 for searching. */ |
|
191 static const TCHAR* ENGLISH_STOP_WORDS[]; |
|
192 }; |
|
193 |
|
194 |
|
195 |
|
196 /** |
|
197 * This analyzer is used to facilitate scenarios where different |
|
198 * fields require different analysis techniques. Use {@link #addAnalyzer} |
|
199 * to add a non-default analyzer on a field name basis. |
|
200 * |
|
201 * <p>Example usage: |
|
202 * |
|
203 * <pre> |
|
204 * PerFieldAnalyzerWrapper aWrapper = |
|
205 * new PerFieldAnalyzerWrapper(new StandardAnalyzer()); |
|
206 * aWrapper.addAnalyzer("firstname", new KeywordAnalyzer()); |
|
207 * aWrapper.addAnalyzer("lastname", new KeywordAnalyzer()); |
|
208 * </pre> |
|
209 * |
|
210 * <p>In this example, StandardAnalyzer will be used for all fields except "firstname" |
|
211 * and "lastname", for which KeywordAnalyzer will be used. |
|
212 * |
|
213 * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing |
|
214 * and query parsing. |
|
215 */ |
|
216 class PerFieldAnalyzerWrapper : public Analyzer { |
|
217 private: |
|
218 Analyzer* defaultAnalyzer; |
|
219 CL_NS(util)::LHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar, |
|
220 CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap; |
|
221 public: |
|
222 /** |
|
223 * Constructs with default analyzer. |
|
224 * |
|
225 * @param defaultAnalyzer Any fields not specifically |
|
226 * defined to use a different analyzer will use the one provided here. |
|
227 */ |
|
228 PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer); |
|
229 ~PerFieldAnalyzerWrapper(); |
|
230 |
|
231 /** |
|
232 * Defines an analyzer to use for the specified field. |
|
233 * |
|
234 * @param fieldName field name requiring a non-default analyzer |
|
235 * @param analyzer non-default analyzer to use for field |
|
236 */ |
|
237 void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer); |
|
238 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
239 }; |
|
240 |
|
241 |
|
242 /** |
|
243 * A filter that replaces accented characters in the ISO Latin 1 character set |
|
244 * (ISO-8859-1) by their unaccented equivalent. The case will not be altered. |
|
245 * <p> |
|
246 * For instance, 'à' will be replaced by 'a'. |
|
247 * <p> |
|
248 */ |
|
249 class ISOLatin1AccentFilter: public TokenFilter { |
|
250 public: |
|
251 ISOLatin1AccentFilter(TokenStream* input, bool deleteTs): |
|
252 TokenFilter(input,deleteTs) |
|
253 { |
|
254 } |
|
255 |
|
256 /** |
|
257 * To replace accented characters in a |
|
258 * String by unaccented equivalents. |
|
259 */ |
|
260 bool next(Token* token); |
|
261 }; |
|
262 |
|
263 |
|
264 /** |
|
265 * Emits the entire input as a single token. |
|
266 */ |
|
267 class KeywordTokenizer: public Tokenizer { |
|
268 private: |
|
269 LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256); |
|
270 bool done; |
|
271 int bufferSize; |
|
272 public: |
|
273 KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1); |
|
274 virtual ~KeywordTokenizer(); |
|
275 bool next(Token* token); |
|
276 }; |
|
277 |
|
278 /** |
|
279 * "Tokenizes" the entire stream as a single token. This is useful |
|
280 * for data like zip codes, ids, and some product names. |
|
281 */ |
|
282 class KeywordAnalyzer: public Analyzer { |
|
283 public: |
|
284 TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
285 virtual ~KeywordAnalyzer(){} |
|
286 }; |
|
287 |
|
288 |
|
289 /** |
|
290 * Removes words that are too long and too short from the stream. |
|
291 * |
|
292 */ |
|
293 class LengthFilter: public TokenFilter { |
|
294 private: |
|
295 int _min; |
|
296 int _max; |
|
297 public: |
|
298 /** |
|
299 * Build a filter that removes words that are too long or too |
|
300 * short from the text. |
|
301 */ |
|
302 LengthFilter(TokenStream* in, int _min, int _max); |
|
303 LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max); |
|
304 |
|
305 /** |
|
306 * Returns the next input Token whose termText() is the right len |
|
307 */ |
|
308 bool next(Token* token); |
|
309 }; |
|
310 |
|
311 |
|
312 CL_NS_END |
|
313 #endif |