|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 // system library |
|
20 #include "wchar.h" |
|
21 #include <string> |
|
22 #include <vector> |
|
23 #include <sstream> |
|
24 #include <iostream> |
|
25 #include <glib.h> |
|
26 |
|
27 // clucene |
|
28 #include "CLucene.h" |
|
29 #include "CLucene/analysis/AnalysisHeader.h" |
|
30 #include "CLucene/analysis/Analyzers.h" |
|
31 |
|
// local library
|
33 #include "thaianalysis.h" |
|
34 #include "ngram.h" |
|
35 #include "koreananalyzer.h" |
|
36 #include "cjkanalyzer.h" |
|
37 #include "cpixparsetools.h" |
|
38 #include "prefixfilter.h" |
|
39 |
|
40 // cpix internal |
|
41 #include "customanalyzer.h" |
|
42 #include "cpixanalyzer.h" |
|
43 #include "analyzer.h" |
|
44 #include "cluceneext.h" |
|
45 #include "analyzerexp.h" |
|
46 #include "indevicecfg.h" |
|
47 #include "cpixexc.h" |
|
48 #include "localization.h" |
|
49 |
|
50 namespace Cpix { |
|
51 |
|
52 // |
|
53 // Following sections provide the glue code for connecting the |
|
54 // analyzer definition syntax with analyzer, tokenizers and filter |
|
55 // implementations. |
|
56 // |
|
// The glue code is template heavy with the intent of providing
|
58 // automation for associating specific keywords with specific |
|
59 // analyzers, tokenizers and filters implementing corresponding |
|
60 // CLucene abstractions. Additional classes are needed only if |
|
61 // filters, tokenizers, etc. accept parameters. |
|
62 // |
|
63 // NOTE: To understand the analyzers, it is sufficient to understand |
|
// that an analyzer transforms a character stream into a specific token stream
|
65 // (e.g. character stream 'foobarmetawords' can be transformed into token |
|
66 // stream 'foo', 'bar' 'meta' 'words'). Analysis consist of two main |
|
67 // parts which are tokenization and filtering. Tokenization converts |
|
68 // the character stream into token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr') |
|
69 // and filtering modifies the tokens (e.g. lowercase filtering 'FoO' -> |
|
70 // 'foo', 'bAr' -> 'bar'). Analyzer as an object is responsible for |
|
71 // constructing a tokenizer and a sequence of filters to perform |
|
72 // these required tasks. |
|
73 // |
|
74 // See the documentation around TokenizerClassEntries and |
|
75 // FilterClassEntries to see how implementations not taking parameters |
|
76 // can be easily added. |
|
77 // |
|
78 |
|
79 using namespace Cpix::AnalyzerExp; |
|
80 |
|
81 // Safe assumption |
|
82 #define MAX_LANGCODE_LENGTH 256 |
|
83 |
|
/**
 * TokenStreamFactory that selects a CustomAnalyzer based on the
 * currently active locale (language) each time a token stream is
 * requested.  Built from a LocaleSwitch expression: each case maps one
 * or more language names to an analyzer definition, and a default
 * analyzer handles unmatched languages.
 */
class LocaleSwitchStreamFactory : public TokenStreamFactory {
public:

    LocaleSwitchStreamFactory(const AnalyzerExp::LocaleSwitch& sw, const wchar_t* config);

    ~LocaleSwitchStreamFactory();

    // Resolves the active language list via Localization and delegates
    // to the language-aware overload below.
    virtual lucene::analysis::TokenStream* tokenStream(const wchar_t * fieldName,
                                                       lucene::util::Reader * reader);

    // Returns the token stream of the first analyzer whose language
    // name appears in 'languages'; falls back to the default analyzer.
    lucene::analysis::TokenStream* tokenStream(std::vector<std::wstring>& languages,
                                               const wchar_t * fieldName,
                                               lucene::util::Reader * reader);

private:
    // Language name -> analyzer; values are owned and released in the destructor.
    std::map<std::wstring, CustomAnalyzer*> analyzers_;
    // Analyzer used when no language matches; owned.
    std::auto_ptr<CustomAnalyzer> default_;
};
|
102 |
|
103 |
|
// Out-of-line empty destructor; anchors the TokenStreamFactory vtable.
TokenStreamFactory::~TokenStreamFactory() {};
|
105 |
|
106 LocaleSwitchStreamFactory::LocaleSwitchStreamFactory(const LocaleSwitch& sw, const wchar_t* config) { |
|
107 for (int i = 0; i < sw.cases().size(); i++) { |
|
108 const Case& cs = *sw.cases()[i]; |
|
109 for (int j = 0; j < cs.cases().size(); j++) { |
|
110 std::wstring c = cs.cases()[j]; |
|
111 if (analyzers_.count(c)) delete analyzers_[c]; |
|
112 analyzers_[c] = new CustomAnalyzer(cs.piping(), config); |
|
113 } |
|
114 } |
|
115 default_.reset(new CustomAnalyzer(sw.def())); |
|
116 } |
|
117 |
|
118 LocaleSwitchStreamFactory::~LocaleSwitchStreamFactory() { |
|
119 typedef std::map<std::wstring, CustomAnalyzer*>::iterator iter; |
|
120 for (iter i = analyzers_.begin(); i != analyzers_.end(); i++) { |
|
121 delete i->second; |
|
122 } |
|
123 } |
|
124 |
|
125 lucene::analysis::TokenStream* |
|
126 LocaleSwitchStreamFactory::tokenStream(const wchar_t * fieldName, |
|
127 lucene::util::Reader * reader) { |
|
128 std::vector<std::wstring> languages = |
|
129 Localization::instance().getLanguageNames(); |
|
130 |
|
131 return tokenStream(languages, fieldName, reader); |
|
132 } |
|
133 |
|
134 lucene::analysis::TokenStream* |
|
135 LocaleSwitchStreamFactory::tokenStream(std::vector<std::wstring>& languages, |
|
136 const wchar_t * fieldName, |
|
137 lucene::util::Reader * reader) { |
|
138 for (int i = 0; i < languages.size(); i++) { |
|
139 if ( analyzers_.count(languages[i]) ) { |
|
140 return analyzers_[languages[i]]->tokenStream( fieldName, reader ); |
|
141 } |
|
142 } |
|
143 return default_->tokenStream( fieldName, reader ); |
|
144 } |
|
145 |
|
146 class DefaultTokenStreamFactory : public TokenStreamFactory { |
|
147 public: |
|
148 |
|
149 enum Target { |
|
150 NORMAL, |
|
151 INDEXING, |
|
152 QUERY, |
|
153 PREFIX |
|
154 }; |
|
155 |
|
156 DefaultTokenStreamFactory(const Invokation& invokation) { |
|
157 if (invokation.params().size() == 1) { |
|
158 const Identifier* id = dynamic_cast<const Identifier*>( invokation.params()[0] ); |
|
159 if ( id ) { |
|
160 if ( id->id() == CPIX_ID_INDEXING ) { |
|
161 target_ = INDEXING; |
|
162 } else if ( id->id() == CPIX_ID_QUERY ) { |
|
163 target_ = QUERY; |
|
164 } else if ( id->id() == CPIX_ID_PREFIX ) { |
|
165 target_ = PREFIX; |
|
166 } else { |
|
167 THROW_CPIXEXC(L"Default analyzer does not accept %S for parameter", id->id().c_str()); |
|
168 } |
|
169 } else { |
|
170 THROW_CPIXEXC(L"Default accepts only identifier as a parameter."); |
|
171 } |
|
172 } else if (invokation.params().size() > 1) { |
|
173 THROW_CPIXEXC(L"Default analyzer does not accept more than one parameter"); |
|
174 } else { |
|
175 target_ = NORMAL; |
|
176 } |
|
177 } |
|
178 |
|
179 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
180 lucene::util::Reader * reader) { |
|
181 switch (target_) { |
|
182 case QUERY: |
|
183 return Analysis::getQueryAnalyzer().tokenStream( fieldName, reader ); |
|
184 case PREFIX: |
|
185 return Analysis::getPrefixAnalyzer().tokenStream( fieldName, reader ); |
|
186 } |
|
187 return Analysis::getDefaultAnalyzer().tokenStream( fieldName, reader ); |
|
188 } |
|
189 |
|
190 private: |
|
191 |
|
192 Target target_; |
|
193 |
|
194 }; |
|
195 |
|
196 /** |
|
197 * Template class used to create CLucene tokenizers. Template |
|
198 * parameter T must implement lucene::analysis::Tokenizer abstraction. |
|
199 */ |
|
200 template<class T> |
|
201 class TokenizerFactory : public TokenStreamFactory |
|
202 { |
|
203 public: |
|
204 TokenizerFactory(const Invokation& invokation) { |
|
205 if (invokation.params().size() > 0) { |
|
206 THROW_CPIXEXC(L"Tokenizer %S does not accept parameters", |
|
207 invokation.id().c_str()); |
|
208 } |
|
209 } |
|
210 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/, |
|
211 lucene::util::Reader * reader) { |
|
212 return _CLNEW T(reader); |
|
213 } |
|
214 }; |
|
215 |
|
216 template<> |
|
217 class TokenizerFactory<analysis::CjkNGramTokenizer> : public TokenStreamFactory |
|
218 { |
|
219 public: |
|
220 static const int DefaultNgramSize = 1; |
|
221 TokenizerFactory(const Invokation& invokation) { |
|
222 using namespace Cpix::AnalyzerExp; |
|
223 if (invokation.params().size() > 1) { |
|
224 THROW_CPIXEXC(L"Cjk Ngram tokenizer does not accept more than one parameter", |
|
225 invokation.id().c_str()); |
|
226 } |
|
227 if (invokation.params().size() == DefaultNgramSize) { |
|
228 IntegerLit* ngramSize = dynamic_cast<IntegerLit*>(invokation.params()[0]); |
|
229 if ( ngramSize ) { |
|
230 ngramSize_ = ngramSize->value(); |
|
231 } else { |
|
232 THROW_CPIXEXC(L"Cjk Ngram tokenizer parameter must be an integer"); |
|
233 } |
|
234 } else { |
|
235 ngramSize_ = 1; |
|
236 } |
|
237 } |
|
238 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/, |
|
239 lucene::util::Reader * reader) { |
|
240 return _CLNEW analysis::CjkNGramTokenizer(reader, ngramSize_); |
|
241 } |
|
242 |
|
243 private: |
|
244 |
|
245 int ngramSize_; |
|
246 }; |
|
247 |
|
248 |
|
249 /** |
|
250 * Template class wrapping CLucene analyzers. Template parameter T must |
|
251 * implement lucene::analysis::Analyzer abstraction. |
|
252 */ |
|
253 template<class T> |
|
254 class AnalyzerWrap : public TokenStreamFactory |
|
255 { |
|
256 public: |
|
257 AnalyzerWrap(const Invokation& invokation) : analyzer_() { |
|
258 if (invokation.params().size() > 0) { |
|
259 THROW_CPIXEXC(L"Tokenizer %S does not accept parameters", |
|
260 invokation.id().c_str()); |
|
261 } |
|
262 } |
|
263 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
264 lucene::util::Reader * reader) { |
|
265 return analyzer_.tokenStream(fieldName, reader); |
|
266 } |
|
267 private: |
|
268 T analyzer_; |
|
269 }; |
|
270 |
|
271 /** |
|
272 * Template class associated with CLucene filter and a TokenStreamFactory. |
|
273 * Uses TokenStreamFactory to transform given character stream into tokenstream |
|
274 * and then applies the given Clucene filter to the token stream. |
|
275 * The template parameter T must implement lucene::analysis::Filter abstraction. |
|
276 */ |
|
277 template<class T> |
|
278 class FilterFactory : public TokenStreamFactory |
|
279 { |
|
280 public: |
|
281 FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) { |
|
282 if (invokation.params().size() > 0) { |
|
283 THROW_CPIXEXC(L"Filter %S does not accept parameters", |
|
284 invokation.id().c_str()); |
|
285 } |
|
286 } |
|
287 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
288 lucene::util::Reader * reader) { |
|
289 return _CLNEW T(factory_->tokenStream(fieldName, reader), true); |
|
290 } |
|
291 private: |
|
292 std::auto_ptr<TokenStreamFactory> factory_; |
|
293 }; |
|
294 |
|
295 /** |
|
296 * Specialized Analyzer wrap for CLucene's PerFieldAnalyzer. Specialized |
|
297 * template is needed because perfield analyzer accepts parameters |
|
298 * (specific analyzers for different field plus default analyzer) |
|
299 */ |
|
300 template<> |
|
301 class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory { |
|
302 public: |
|
303 AnalyzerWrap(const Switch& sw, const wchar_t* config) : analyzer_(0) { |
|
304 using namespace Cpt::Parser; |
|
305 using namespace lucene::analysis; |
|
306 |
|
307 analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def())); |
|
308 |
|
309 for (int i = 0; i < sw.cases().size(); i++) { |
|
310 const Case& cs = *sw.cases()[i]; |
|
311 for (int j = 0; j < cs.cases().size(); j++) { |
|
312 analyzer_->addAnalyzer( cs.cases()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping(), config ) ); |
|
313 } |
|
314 } |
|
315 } |
|
316 virtual ~AnalyzerWrap() { |
|
317 _CLDELETE(analyzer_); |
|
318 } |
|
319 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
320 lucene::util::Reader * reader) { |
|
321 return analyzer_->tokenStream(fieldName, reader); |
|
322 } |
|
323 private: |
|
324 lucene::analysis::PerFieldAnalyzerWrapper* analyzer_; |
|
325 }; |
|
326 |
|
327 |
|
328 |
|
329 /** |
|
330 * Specialized StopFilter factory. Specialized filter is needed |
|
331 * because StopFilter needs parameters (stop word list or a language) |
|
332 */ |
|
333 template<> |
|
334 class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory |
|
335 { |
|
336 public: |
|
337 FilterFactory(const Invokation& invokation, |
|
338 auto_ptr<TokenStreamFactory> factory) |
|
339 :words_(0), ownWords_(0), factory_(factory) { |
|
340 using namespace Cpt::Parser; |
|
341 if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) { |
|
342 Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]); |
|
343 //cpix_LangCode lang; |
|
344 if (id->id() == CPIX_WLANG_EN) { |
|
345 words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS; |
|
346 } else if (id->id() == CPIX_WLANG_FR) { |
|
347 words_ = analysis::NonEnglishStopWords::FRENCH_STOP_WORDS; |
|
348 } else { |
|
349 THROW_CPIXEXC(L"No prepared stopword list for language code '%S'", |
|
350 id->id().c_str()); |
|
351 } |
|
352 } else { |
|
353 ownWords_ = new wchar_t*[invokation.params().size()+1]; |
|
354 memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); |
|
355 // FIXE: args may leak |
|
356 for (int i = 0; i < invokation.params().size(); i++) { |
|
357 StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]); |
|
358 if (lit) { |
|
359 const wstring& str = lit->text(); |
|
360 ownWords_[i] = new wchar_t[str.length()+1]; |
|
361 wcscpy(ownWords_[i], str.c_str()); |
|
362 } else { |
|
363 THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters."); |
|
364 } |
|
365 } |
|
366 } |
|
367 |
|
368 } |
|
369 virtual ~FilterFactory() { |
|
370 if (ownWords_) { |
|
371 for (int i = 0; ownWords_[i]; i++) { |
|
372 delete[] ownWords_[i]; |
|
373 } |
|
374 delete[] ownWords_; |
|
375 } |
|
376 } |
|
377 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
378 lucene::util::Reader * reader) { |
|
379 return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_); |
|
380 } |
|
381 private: |
|
382 const wchar_t **words_; |
|
383 wchar_t **ownWords_; // owned |
|
384 std::auto_ptr<TokenStreamFactory> factory_; |
|
385 }; |
|
386 |
|
387 /** |
|
388 * Specialized SnowballFilter factory is needed, because SnowballFilter |
|
389 * accepts parameters (the language). |
|
390 */ |
|
391 template<> |
|
392 class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory |
|
393 { |
|
394 public: |
|
395 FilterFactory(const Invokation& invokation, |
|
396 auto_ptr<TokenStreamFactory> factory) |
|
397 : factory_(factory) { |
|
398 using namespace Cpt::Parser; |
|
399 if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) { |
|
400 THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." ); |
|
401 } |
|
402 Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]); |
|
403 if (id->id() == CPIX_WLANG_EN) { |
|
404 lang_ = cpix_LANG_EN; |
|
405 } else { |
|
406 THROW_CPIXEXC(L"Language identifier %S is not supported for stemming", |
|
407 id->id().c_str()); |
|
408 } |
|
409 } |
|
410 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
411 lucene::util::Reader * reader) { |
|
412 return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_); |
|
413 } |
|
414 private: |
|
415 cpix_LangCode lang_; |
|
416 std::auto_ptr<TokenStreamFactory> factory_; |
|
417 }; |
|
418 |
|
419 /** |
|
420 * Specialized LengthFilter factory is needed, because length filter |
|
421 * accepts parameters (minimum length and maximum length) |
|
422 */ |
|
423 template<> |
|
424 class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory |
|
425 { |
|
426 public: |
|
427 FilterFactory(const Invokation& invokation, |
|
428 auto_ptr<TokenStreamFactory> factory) |
|
429 : factory_(factory) { |
|
430 using namespace Cpt::Parser; |
|
431 if (invokation.params().size() != 2 || |
|
432 !dynamic_cast<IntegerLit*>(invokation.params()[0]) || |
|
433 !dynamic_cast<IntegerLit*>(invokation.params()[1])) { |
|
434 THROW_CPIXEXC("Length filter takes exactly two integer parameters"); |
|
435 } |
|
436 min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value(); |
|
437 max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value(); |
|
438 } |
|
439 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
440 lucene::util::Reader * reader) { |
|
441 return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ ); |
|
442 } |
|
443 private: |
|
444 int min_, max_; |
|
445 std::auto_ptr<TokenStreamFactory> factory_; |
|
446 }; |
|
447 |
|
448 /** |
|
449 * Specialized PrefixGenerator factory is needed, because PrefixGenerator |
|
450 * requires the max prefix size. |
|
451 */ |
|
452 template<> |
|
453 class FilterFactory<PrefixGenerator> : public TokenStreamFactory |
|
454 { |
|
455 public: |
|
456 FilterFactory(const Invokation& invokation, |
|
457 auto_ptr<TokenStreamFactory> factory) |
|
458 : factory_(factory) { |
|
459 using namespace Cpt::Parser; |
|
460 if (invokation.params().size() != 1 || |
|
461 !dynamic_cast<IntegerLit*>(invokation.params()[0])) { |
|
462 THROW_CPIXEXC("Prefix generator takes exactly one integer parameter"); |
|
463 } |
|
464 maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value(); |
|
465 } |
|
466 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
467 lucene::util::Reader * reader) { |
|
468 return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); |
|
469 } |
|
470 private: |
|
471 int maxPrefixLength_; |
|
472 std::auto_ptr<TokenStreamFactory> factory_; |
|
473 }; |
|
474 |
|
475 /** |
|
476 * Specialized PrefixFilter factory is needed, because prefix filter |
|
477 * accepts parameters (language set or prefixes) |
|
478 */ |
|
479 template<> |
|
480 class FilterFactory<analysis::PrefixFilter> : public TokenStreamFactory |
|
481 { |
|
482 public: |
|
483 FilterFactory(const Invokation& invokation, |
|
484 auto_ptr<TokenStreamFactory> factory) |
|
485 : prefixes_(0), ownPrefixes_(0), factory_(factory) { |
|
486 using namespace Cpt::Parser; |
|
487 if (invokation.params().size() == 1 && |
|
488 dynamic_cast<Identifier*>(invokation.params()[0])) { |
|
489 Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]); |
|
490 //cpix_LangCode lang; |
|
491 if (id->id() == CPIX_WLANG_HE) { |
|
492 prefixes_ = analysis::HebrewPrefixes; |
|
493 } else { |
|
494 THROW_CPIXEXC(L"No prepared prefix list for language code '%S'", |
|
495 id->id().c_str()); |
|
496 } |
|
497 } else { |
|
498 ownPrefixes_ = new wchar_t*[invokation.params().size()+1]; |
|
499 memset(ownPrefixes_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); |
|
500 // FIXE: args may leak |
|
501 for (int i = 0; i < invokation.params().size(); i++) { |
|
502 StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]); |
|
503 if (lit) { |
|
504 const wstring& str = lit->text(); |
|
505 ownPrefixes_[i] = new wchar_t[str.length()+1]; |
|
506 wcscpy(ownPrefixes_[i], str.c_str()); |
|
507 } else { |
|
508 THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters."); |
|
509 } |
|
510 } |
|
511 } |
|
512 } |
|
513 virtual ~FilterFactory() { |
|
514 if (ownPrefixes_) { |
|
515 for (int i = 0; ownPrefixes_[i]; i++) { |
|
516 delete[] ownPrefixes_[i]; |
|
517 } |
|
518 delete[] ownPrefixes_; |
|
519 } |
|
520 } |
|
521 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
522 lucene::util::Reader * reader) { |
|
523 return _CLNEW analysis::PrefixFilter(factory_->tokenStream(fieldName, reader), true, ownPrefixes_ ? const_cast<const wchar_t**>(ownPrefixes_) : prefixes_); |
|
524 } |
|
525 private: |
|
526 const wchar_t **prefixes_; |
|
527 wchar_t **ownPrefixes_; // owned |
|
528 std::auto_ptr<TokenStreamFactory> factory_; |
|
529 }; |
|
530 |
|
531 /** |
|
532 * Specialized ElisionFilter factory is needed, because elision filter |
|
533 * accepts parameters (language set or articles) |
|
534 */ |
|
535 template<> |
|
536 class FilterFactory<analysis::ElisionFilter> : public TokenStreamFactory |
|
537 { |
|
538 public: |
|
539 FilterFactory(const Invokation& invokation, |
|
540 auto_ptr<TokenStreamFactory> factory) |
|
541 : articles_(0), ownArticles_(0), factory_(factory) { |
|
542 using namespace Cpt::Parser; |
|
543 if (invokation.params().size() == 1 && |
|
544 dynamic_cast<Identifier*>(invokation.params()[0])) { |
|
545 Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]); |
|
546 //cpix_LangCode lang; |
|
547 if (id->id() == CPIX_WLANG_FR) { |
|
548 articles_ = analysis::FrenchArticles; |
|
549 } else { |
|
550 THROW_CPIXEXC(L"No prepared article list for language code '%S'", |
|
551 id->id().c_str()); |
|
552 } |
|
553 } else { |
|
554 ownArticles_ = new wchar_t*[invokation.params().size()+1]; |
|
555 memset(ownArticles_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); |
|
556 // FIXE: args may leak |
|
557 for (int i = 0; i < invokation.params().size(); i++) { |
|
558 StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]); |
|
559 if (lit) { |
|
560 const wstring& str = lit->text(); |
|
561 ownArticles_[i] = new wchar_t[str.length()+1]; |
|
562 wcscpy(ownArticles_[i], str.c_str()); |
|
563 } else { |
|
564 THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters."); |
|
565 } |
|
566 } |
|
567 } |
|
568 } |
|
569 virtual ~FilterFactory() { |
|
570 if (ownArticles_) { |
|
571 for (int i = 0; ownArticles_[i]; i++) { |
|
572 delete[] ownArticles_[i]; |
|
573 } |
|
574 delete[] ownArticles_; |
|
575 } |
|
576 } |
|
577 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
578 lucene::util::Reader * reader) { |
|
579 return _CLNEW analysis::ElisionFilter(factory_->tokenStream(fieldName, reader), true, ownArticles_ ? const_cast<const wchar_t**>(ownArticles_) : articles_); |
|
580 } |
|
581 private: |
|
582 const wchar_t **articles_; |
|
583 wchar_t **ownArticles_; // owned |
|
584 std::auto_ptr<TokenStreamFactory> factory_; |
|
585 }; |
|
586 |
|
// Factory-function signatures used by the keyword tables below: a
// tokenizer creator builds a TokenStreamFactory from an invokation; a
// filter creator additionally takes ownership of an upstream factory.
typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation,
                                                             auto_ptr<TokenStreamFactory> factory);
|
590 |
|
// Sets up an arbitrary TokenStreamFactory subclass T directly from the
// given invokation parameters (used e.g. for the default analyzer).
template<class T>
struct TokenStreamFactoryCtor
{
    static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
        return auto_ptr<TokenStreamFactory>(new T(invokation));
    }
};
|
598 |
|
/**
 * Sets up a tokenizer factory with given invokation parameters.
 * T is the CLucene tokenizer type to be wrapped by TokenizerFactory.
 */
template<class T>
struct TokenizerFactoryCtor
{
    static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
        return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation));
    }
};
|
609 |
|
/**
 * Sets up an analyzer wrap with given invokation parameters.
 * T is the CLucene analyzer type to be wrapped by AnalyzerWrap.
 */
template<class T>
struct AnalyzerWrapCtor
{
    static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
        return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation));
    }
};
|
620 |
|
/**
 * Sets up a filter factory with given invokation parameters.
 * T is the CLucene filter type; 'factory' is the upstream token stream
 * factory whose output the filter post-processes (ownership transfers).
 */
template<class T>
struct FilterFactoryCtor
{
    static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
                                               auto_ptr<TokenStreamFactory> factory) {
        return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory));
    }
};
|
632 |
|
// One row of the tokenizer/analyzer keyword table: the definition-
// syntax keyword and the factory function it maps to.
struct TokenizerClassEntry {
    const wchar_t *id_;
    TokenizerFactoryCreator createFactory_;
};
|
637 |
|
638 // |
|
639 // Following TokenizerClassEntries and FilterClassEntries contain |
|
640 // the mapping from tokenizer/analyzer/filter names into glue code |
|
641 // templates providing the implementations. |
|
642 // |
|
643 |
|
// Keyword -> factory table for tokenizers and analyzers; terminated by
// a {0, 0} sentinel and searched linearly by getTokenizerEntry().
TokenizerClassEntry TokenizerClassEntries[] = {
    {CPIX_TOKENIZER_STANDARD, TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
    {CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
    {CPIX_TOKENIZER_LETTER, TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
    {CPIX_TOKENIZER_KEYWORD, TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
    {CPIX_TOKENIZER_CJK, TokenizerFactoryCtor<lucene::analysis::cjk::CJKTokenizer>::create},
    {CPIX_TOKENIZER_NGRAM, TokenizerFactoryCtor<analysis::CjkNGramTokenizer>::create},
    {CPIX_TOKENIZER_KOREAN, TokenizerFactoryCtor<analysis::KoreanTokenizer>::create},
    {CPIX_TOKENIZER_KOREAN_QUERY,TokenizerFactoryCtor<analysis::KoreanQueryTokenizer>::create},

    {CPIX_ANALYZER_STANDARD, AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},
    {CPIX_ANALYZER_DEFAULT, TokenStreamFactoryCtor<DefaultTokenStreamFactory>::create},

    // TODO: Add more Tokenizers/Analyzers

    // Example tokenizer (works as such if tokenizers don't take parameters)
    // {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},

    // Example analyzer (works as such if analyzer don't take parameters)
    // {CPIX_ANALYZER_MYANALYZER, AnalyzerWrapCtor<MyAnalyzer>::create},

    {0, 0}
};
|
667 |
|
// One row of the filter keyword table: the definition-syntax keyword
// and the filter factory function it maps to.
struct FilterClassEntry {
    const wchar_t *id_;
    FilterFactoryCreator createFactory_;
};
|
672 |
|
// Keyword -> factory table for filters; terminated by a {0, 0}
// sentinel and searched linearly by getFilterEntry().
FilterClassEntry FilterClassEntries[] = {
    {CPIX_FILTER_STANDARD, FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
    {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
    {CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
    {CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create},
    {CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
    {CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
    {CPIX_FILTER_PREFIXES, FilterFactoryCtor<PrefixGenerator>::create},
    {CPIX_FILTER_THAI, FilterFactoryCtor<analysis::ThaiWordFilter>::create},
    {CPIX_FILTER_PREFIX, FilterFactoryCtor<analysis::PrefixFilter>::create},
    {CPIX_FILTER_ELISION, FilterFactoryCtor<analysis::ElisionFilter>::create},

    // TODO: Add more Filters

    // Example filter (works as such if filter don't take parameters)
    // {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create},

    {0, 0}
};
|
692 |
|
693 CustomAnalyzer::CustomAnalyzer(const wchar_t* definition, const wchar_t* config) { |
|
694 std::auto_ptr<Piping> piping = AnalyzerExp::ParsePiping( definition ); |
|
695 setup( *piping, config ); |
|
696 } |
|
697 |
|
// Builds the analyzer directly from an already-parsed definition tree.
CustomAnalyzer::CustomAnalyzer(const Piping& definition, const wchar_t* config) {
    setup(definition, config);
}
|
701 |
|
702 using namespace Cpt::Parser; |
|
703 |
|
704 void CustomAnalyzer::setup(const Piping& piping, const wchar_t* config) { |
|
705 |
|
706 // If the first item is invokation, create corresponding analyzer/tokenizer |
|
707 if (dynamic_cast<const Invokation*>(&piping.tokenizer())) { |
|
708 const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer()); |
|
709 TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() ); |
|
710 factory_ = tokenizerEntry.createFactory_( tokenizer ); |
|
711 } else if (dynamic_cast<const Switch*>(&piping.tokenizer())) { |
|
712 // If the first item is switch statement, create per-field analyzer |
|
713 const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer()); |
|
714 factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer, config ); |
|
715 } else if (dynamic_cast<const LocaleSwitch*>(&piping.tokenizer())) { |
|
716 const LocaleSwitch& tokenizer = dynamic_cast<const LocaleSwitch&>(piping.tokenizer()); |
|
717 factory_ = new LocaleSwitchStreamFactory( tokenizer, config ); |
|
718 } else if (dynamic_cast<const ConfigSwitch*>(&piping.tokenizer())) { |
|
719 const ConfigSwitch& tokenizer = dynamic_cast<const ConfigSwitch&>(piping.tokenizer()); |
|
720 factory_ = resolveConfigSwitch( tokenizer, config ); |
|
721 } else { |
|
722 THROW_CPIXEXC(L"Analyzer definition syntax did not begin with valid tokenizer"); |
|
723 } |
|
724 |
|
725 // Add filters |
|
726 const std::vector<Invokation*>& filters = piping.filters(); |
|
727 for (int i = 0; i < filters.size(); i++) { |
|
728 FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() ); |
|
729 factory_ = filterEntry.createFactory_( *filters[i], factory_ ); |
|
730 } |
|
731 } |
|
732 |
|
733 std::auto_ptr<TokenStreamFactory> CustomAnalyzer::resolveConfigSwitch(const ConfigSwitch& csw, const wchar_t* config) { |
|
734 if (config) { |
|
735 for (int i = 0; i < csw.cases().size(); i++) { |
|
736 const Case& cs = *csw.cases()[i]; |
|
737 for (int j = 0; j < cs.cases().size(); j++) { |
|
738 if (wcscmp(config, cs.cases()[j].c_str()) == 0) { |
|
739 return std::auto_ptr<TokenStreamFactory>( |
|
740 new CustomAnalyzer(cs.piping(), config)); |
|
741 } |
|
742 } |
|
743 } |
|
744 } |
|
745 return std::auto_ptr<TokenStreamFactory>(new CustomAnalyzer(csw.def(), config)); |
|
746 } |
|
747 |
|
748 TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) { |
|
749 |
|
750 // Looks for a match in the TokenizerClassEntries. After finding |
|
751 // a match it returns a proper tokenizer/analyzer implementation provider |
|
752 // |
|
753 for (int i = 0; TokenizerClassEntries[i].id_; i++) { |
|
754 if (id == std::wstring(TokenizerClassEntries[i].id_)) { |
|
755 return TokenizerClassEntries[i]; |
|
756 } |
|
757 } |
|
758 |
|
759 THROW_CPIXEXC(L"Unknown tokenizer '%S'.", |
|
760 id.c_str()); |
|
761 } |
|
762 |
|
763 FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) { |
|
764 |
|
765 // Looks for a match in the FilterClassEntries. After finding |
|
766 // a match it returns a proper tokenizer/analyzer implementation |
|
767 // provider |
|
768 // |
|
769 for (int i = 0; FilterClassEntries[i].id_; i++) { |
|
770 if (id == std::wstring(FilterClassEntries[i].id_)) { |
|
771 return FilterClassEntries[i]; |
|
772 } |
|
773 } |
|
774 |
|
775 THROW_CPIXEXC(L"Unknown filter '%S'.", |
|
776 id.c_str()); |
|
777 } |
|
778 |
|
// Empty: the owned factory chain is released by the smart-pointer member.
CustomAnalyzer::~CustomAnalyzer() {}
|
780 |
|
// Produces the token stream for a field by delegating to the token
// stream factory chain.  The chain was assembled during construction
// from the analyzer definition string, so this call only forwards.
lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t * fieldName,
                                                           lucene::util::Reader * reader) {
    return factory_->tokenStream(fieldName, reader);
}
|
789 |
|
790 std::auto_ptr<lucene::analysis::Analyzer> CreateDefaultAnalyzer() |
|
791 { |
|
792 return |
|
793 std::auto_ptr<lucene::analysis::Analyzer>( |
|
794 new SystemAnalyzer(_CLNEW lucene::analysis::standard::StandardAnalyzer())); |
|
795 } |
|
796 |
|
797 } |