|
1 /*------------------------------------------------------------------------------ |
|
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team |
|
3 * |
|
4 * Distributable under the terms of either the Apache License (Version 2.0) or |
|
5 * the GNU Lesser General Public License, as specified in the COPYING file. |
|
6 ------------------------------------------------------------------------------*/ |
|
7 #include "clucene/stdheader.h" |
|
8 #include "Analyzers.h" |
|
9 #include "clucene/util/stringbuffer.h" |
|
10 |
|
11 CL_NS_USE(util) |
|
12 CL_NS_DEF(analysis) |
|
13 |
|
// Construct over the given reader.  Scan state starts at zero:
// 'offset' is the absolute character position, 'bufferIndex'/'dataLen'
// track consumption of the IO buffer, and 'ioBuffer' is only a pointer
// that read() fills in later (NOTE(review): presumably points at the
// reader's internal buffer — confirm against Reader::read).
CharTokenizer::CharTokenizer(Reader* in) :
	Tokenizer(in),
	offset(0),
	bufferIndex(0),
	dataLen(0),
	ioBuffer(NULL)
{
	// Start with an empty token buffer.
	buffer[0]=0;
}
|
23 |
|
24 TCHAR CharTokenizer::normalize(const TCHAR c) const |
|
25 { |
|
26 return c; |
|
27 } |
|
// Scan the reader for the next maximal run of token characters (as
// classified by isTokenChar), normalizing each one via normalize().
// Returns false at end of stream; otherwise fills 'token' with the text
// and its [start, start+length) character offsets and returns true.
bool CharTokenizer::next(Token* token){
	int32_t length = 0;        // chars accumulated in 'buffer' so far
	int32_t start = offset;    // provisional start offset of the token
	while (true) {
		TCHAR c;
		offset++;
		// Refill the IO buffer once the previous fill is consumed.
		if (bufferIndex >= dataLen) {
			dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE);
			if (dataLen == -1)      // -1 signals end of stream
				dataLen = 0;
			bufferIndex = 0;
		}
		if (dataLen <= 0 ) {
			if (length > 0)         // EOS: flush what we buffered
				break;
			else
				return false;       // EOS with nothing buffered: done
		}else
			c = ioBuffer[bufferIndex++];
		if (isTokenChar(c)) { // if it's a token TCHAR

			if (length == 0) // start of token
				start = offset-1;

			buffer[length++] = normalize(c); // buffer it, normalized

			if (length == LUCENE_MAX_WORD_LEN) // buffer overflow!
				break;                         // truncate over-long words

		} else if (length > 0) // at non-Letter w/ chars
			break; // return 'em

	}
	// NUL-terminate and hand the token its text and offsets.
	buffer[length]=0;
	token->set( buffer, start, start+length);
	return true;
}
|
65 |
|
66 bool LetterTokenizer::isTokenChar(const TCHAR c) const { |
|
67 return _istalpha(c)!=0; |
|
68 } |
|
69 |
|
70 |
|
71 TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const { |
|
72 return _totlower(chr); |
|
73 } |
|
74 |
|
75 bool WhitespaceTokenizer::isTokenChar(const TCHAR c) const{ |
|
76 return _istspace(c)==0; //(return true if NOT a space) |
|
77 } |
|
78 |
|
79 TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR* , Reader* reader) { |
|
80 return _CLNEW WhitespaceTokenizer(reader); |
|
81 } |
|
82 |
|
83 TokenStream* SimpleAnalyzer::tokenStream(const TCHAR* , Reader* reader) { |
|
84 return _CLNEW LowerCaseTokenizer(reader); |
|
85 } |
|
86 |
|
87 bool LowerCaseFilter::next(Token* t){ |
|
88 if (!input->next(t)) |
|
89 return false; |
|
90 stringCaseFold( t->_termText ); |
|
91 return true; |
|
92 } |
|
93 |
|
// Wrap 'in', building a private stop-word set from the NULL-terminated
// 'stopWords' array.  The set is created non-owning (false): it keeps
// only pointers to the caller's strings.  This filter owns the set
// itself (ownTable=true) and deletes it in the destructor.
StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords):
	TokenFilter(in, deleteTokenStream),
	table(_CLNEW CLSetList<const TCHAR*>(false)),
	ownTable(true)
{
	fillStopTable( table,stopWords );
}
|
101 |
|
102 StopFilter::~StopFilter() |
|
103 { |
|
104 if (ownTable) { |
|
105 _CLDELETE( table ); |
|
106 } |
|
107 } |
|
108 |
|
109 |
|
110 void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable, |
|
111 const TCHAR** stopWords) { |
|
112 for (int32_t i = 0; stopWords[i]!=NULL; i++) |
|
113 stopTable->insert(stopWords[i]); |
|
114 } |
|
115 |
|
116 bool StopFilter::next(Token* token) { |
|
117 // return the first non-stop word found |
|
118 while (input->next(token)){ |
|
119 if (table->find(token->_termText)==table->end()){ |
|
120 return true; |
|
121 } |
|
122 } |
|
123 |
|
124 // reached EOS -- return nothing |
|
125 return false; |
|
126 } |
|
127 |
|
// Build an analyzer preloaded with the built-in English stop words.
// stopTable(false): the set does not take ownership of the string
// constants it stores.
StopAnalyzer::StopAnalyzer():stopTable(false)
{
	StopFilter::fillStopTable(&stopTable,ENGLISH_STOP_WORDS);
}
|
132 StopAnalyzer::~StopAnalyzer() |
|
133 { |
|
134 } |
|
135 StopAnalyzer::StopAnalyzer( const TCHAR** stopWords) { |
|
136 StopFilter::fillStopTable(&stopTable,stopWords); |
|
137 } |
|
138 TokenStream* StopAnalyzer::tokenStream(const TCHAR* , Reader* reader) { |
|
139 return _CLNEW StopFilter(_CLNEW LowerCaseTokenizer(reader),true, &stopTable); |
|
140 } |
|
141 |
|
// Default English stop-word list, NULL-terminated.  NOTE(review): the
// single-letter entries "s" and "t" presumably catch fragments left by
// tokenizers that split on apostrophes ("it's" -> "it", "s") — confirm.
const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[] =
{
	_T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"),
	_T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"),
	_T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"),
	_T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"),
	_T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL
};
|
150 |
|
// Wrap a fallback analyzer used for fields with no specific mapping.
// Ownership of 'defaultAnalyzer' is taken (deleted in the destructor).
// analyzerMap(true,true): presumably the map deletes both its keys and
// its analyzer values on clear — confirm against the map class.
PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer):
	analyzerMap(true,true)
{
	this->defaultAnalyzer = defaultAnalyzer;
}
|
// Release the per-field analyzers via the owning map, then the fallback.
PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){
	analyzerMap.clear();
	_CLDELETE(defaultAnalyzer);
}
|
160 |
|
// Register 'analyzer' for 'fieldName'.  The key is duplicated (the map
// stores its own copy) and the analyzer is handed over to the map.
void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) {
	analyzerMap.put(STRDUP_TtoT(fieldName), analyzer);
}
|
164 |
|
165 TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) { |
|
166 Analyzer* analyzer = (fieldName==NULL?defaultAnalyzer:analyzerMap.get(fieldName)); |
|
167 if (analyzer == NULL) { |
|
168 analyzer = defaultAnalyzer; |
|
169 } |
|
170 |
|
171 return analyzer->tokenStream(fieldName, reader); |
|
172 } |
|
173 |
|
174 |
|
175 |
|
// Replace ISO-Latin-1 accented characters in the token's text with
// their unaccented ASCII equivalents (e.g. 'é' -> 'e', 'Æ' -> "AE").
// Fast path: a pre-scan leaves the token untouched when no character
// falls in the accented range.  Returns false at end of stream.
bool ISOLatin1AccentFilter::next(Token* token){
	if ( input->next(token) ){
		int32_t l = token->termTextLength();
		const TCHAR* chars = token->termText();
		bool doProcess = false;
		// Probe for the first character that may need folding.
		for (int32_t i = 0; i < l; ++i) {
#ifdef _UCS2
			// Wide build: 0xC0..0x178 covers Latin-1 accents plus Œ/œ/Ÿ.
			if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) {
#else
			// Narrow build: TCHAR may be signed, so accented bytes can
			// appear as negative values — test for that too.
			if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) {
#endif
				doProcess = true;
				break;
			}

		}
		if ( !doProcess ) {
			return true;   // nothing to fold: pass the token through
		}

		// Worst case every char expands to two ("AE", "ss", ...).
		StringBuffer output(l*2);
		for (int32_t j = 0; j < l; j++) {
#ifdef _UCS2
			TCHAR c = chars[j];
#else
			unsigned char c = chars[j];
#endif
			switch (c) {
				case 0xC0 : // À
				case 0xC1 : // Á
				case 0xC2 : // Â
				case 0xC3 : // Ã
				case 0xC4 : // Ä
				case 0xC5 : // Å
					output.appendChar('A');
					break;
				case 0xC6 : // Æ
					output.append(_T("AE"));
					break;
				case 0xC7 : // Ç
					output.appendChar('C');
					break;
				case 0xC8 : // È
				case 0xC9 : // É
				case 0xCA : // Ê
				case 0xCB : // Ë
					output.appendChar('E');
					break;
				case 0xCC : // Ì
				case 0xCD : // Í
				case 0xCE : // Î
				case 0xCF : // Ï
					output.appendChar('I');
					break;
				case 0xD0 : // Ð
					output.appendChar('D');
					break;
				case 0xD1 : // Ñ
					output.appendChar('N');
					break;
				case 0xD2 : // Ò
				case 0xD3 : // Ó
				case 0xD4 : // Ô
				case 0xD5 : // Õ
				case 0xD6 : // Ö
				case 0xD8 : // Ø
					output.appendChar('O');
					break;
				case 0xDE : // Þ
					output.append(_T("TH"));
					break;
				case 0xD9 : // Ù
				case 0xDA : // Ú
				case 0xDB : // Û
				case 0xDC : // Ü
					output.appendChar('U');
					break;
				case 0xDD : // Ý
					output.appendChar('Y');
					break;
				case 0xE0 : // à
				case 0xE1 : // á
				case 0xE2 : // â
				case 0xE3 : // ã
				case 0xE4 : // ä
				case 0xE5 : // å
					output.appendChar('a');
					break;
				case 0xE6 : // æ
					output.append(_T("ae"));
					break;
				case 0xE7 : // ç
					output.appendChar('c');
					break;
				case 0xE8 : // è
				case 0xE9 : // é
				case 0xEA : // ê
				case 0xEB : // ë
					output.appendChar('e');
					break;
				case 0xEC : // ì
				case 0xED : // í
				case 0xEE : // î
				case 0xEF : // ï
					output.appendChar('i');
					break;
				case 0xF0 : // ð
					output.appendChar('d');
					break;
				case 0xF1 : // ñ
					output.appendChar('n');
					break;
				case 0xF2 : // ò
				case 0xF3 : // ó
				case 0xF4 : // ô
				case 0xF5 : // õ
				case 0xF6 : // ö
				case 0xF8 : // ø
					output.appendChar('o');
					break;
				case 0xDF : // ß
					output.append(_T("ss"));
					break;
				case 0xFE : // þ
					output.append(_T("th"));
					break;
				case 0xF9 : // ù
				case 0xFA : // ú
				case 0xFB : // û
				case 0xFC : // ü
					output.appendChar('u');
					break;
				case 0xFD : // ý
				case 0xFF : // ÿ
					output.appendChar('y');
					break;

#ifdef _UCS2
				case 0x152 : // Œ
					output.append(_T("OE"));
					break;
				case 0x153 : // œ
					output.append(_T("oe"));
					break;
				case 0x178 : // Ÿ
					output.appendChar('Y');
					break;
#endif
				default :
					// Unaccented (or unmapped) characters pass through.
					output.appendChar(c);
					break;
			}
		}
		token->setText(output.getBuffer());
		return true;
	}
	return false;
}
|
334 |
|
335 |
|
336 TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){ |
|
337 return _CLNEW KeywordTokenizer(reader); |
|
338 } |
|
339 |
|
340 KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize): |
|
341 Tokenizer(input) |
|
342 { |
|
343 this->done = false; |
|
344 if ( bufferSize < 0 ) |
|
345 this->bufferSize = DEFAULT_BUFFER_SIZE; |
|
346 } |
|
347 KeywordTokenizer::~KeywordTokenizer(){ |
|
348 } |
|
349 |
|
350 bool KeywordTokenizer::next(Token* token){ |
|
351 if (!done) { |
|
352 done = true; |
|
353 int32_t rd; |
|
354 const TCHAR* buffer=0; |
|
355 while (true) { |
|
356 rd = input->read(buffer, bufferSize); |
|
357 if (rd == -1) |
|
358 break; |
|
359 token->growBuffer(token->_termTextLen +rd+1); |
|
360 |
|
361 int32_t cp = rd; |
|
362 if ( token->_termTextLen + cp > token->bufferLength() ) |
|
363 cp = token->bufferLength() - token->_termTextLen; |
|
364 _tcsncpy(token->_termText+token->_termTextLen,buffer,cp); |
|
365 token->_termTextLen+=rd; |
|
366 } |
|
367 token->_termText[token->_termTextLen]=0; |
|
368 token->set(token->_termText,0,token->_termTextLen); |
|
369 return true; |
|
370 } |
|
371 return false; |
|
372 } |
|
373 |
|
374 |
|
375 LengthFilter::LengthFilter(TokenStream* in, int _min, int _max): |
|
376 TokenFilter(in) |
|
377 { |
|
378 this->_min = _min; |
|
379 this->_max = _max; |
|
380 } |
|
381 |
|
382 LengthFilter::LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max): |
|
383 TokenFilter(in, deleteTs) |
|
384 { |
|
385 this->_min = _min; |
|
386 this->_max = _max; |
|
387 } |
|
388 |
|
389 bool LengthFilter::next(Token* token) |
|
390 { |
|
391 // return the first non-stop word found |
|
392 while ( input->next(token) ) |
|
393 { |
|
394 size_t len = token->termTextLength(); |
|
395 if (len >= _min && len <= _max) |
|
396 return true; |
|
397 // note: else we ignore it but should we index each part of it? |
|
398 } |
|
399 // reached EOS -- return null |
|
400 return false; |
|
401 } |
|
402 |
|
403 |
|
404 CL_NS_END |