13 * |
13 * |
14 * Description: |
14 * Description: |
15 * |
15 * |
16 */ |
16 */ |
17 |
17 |
18 |
// general utilities
#include "wchar.h"
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <climits>

// clucene
#include "CLucene.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/Analyzers.h"

// support
#include "cpixparsetools.h"
#include "cpixfstools.h"

// internal
#include "analyzer.h"
#include "analyzerexp.h"
#include "cpixanalyzer.h"
#include "cluceneext.h"
#include "cpixexc.h"
#include "document.h"
#include "indevicecfg.h"
#include "initparams.h"
#include "thaianalysis.h"
#include "customanalyzer.h"
#include "common/cpixlog.h"

namespace
{
    // Error messages used when aggregating field values.
    const char AGGR_NONFILEREADERPROXY_ERR[]
    = "Aggregated reader field should be FileReaderProxy instance";

    const char AGGR_STREAMREADER_ERR[]
    = "Aggregating streamValue-fields not implemented";

    // Thai dictionary file, looked up under the resource directory.
    const char THAI_LANGUAGE_FILE[]
    = "thaidict.sm";

    // Analyzer definition file, looked up under the resource directory.
    const char ANALYZER_FILE[]
    = "analyzer.loc";

    // Names of the analyzer configurations read from ANALYZER_FILE.
    const wchar_t DEFAULT_ANALYZER_CONFIG[]
    = L"default";

    const wchar_t QUERY_ANALYZER_CONFIG[]
    = L"query";

    const wchar_t PREFIX_ANALYZER_CONFIG[]
    = L"prefix";

    // const wchar_t CPIX_ANALYZER_FALLBACK[]
    // = CPIX_ANALYZER_STANDARD;
    //
    // const wchar_t CPIX_PREFIX_ANALYZER_FALLBACK[]
    // = CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE;

}
50 |
79 |
51 |
80 |
52 namespace Cpix { |
81 namespace Cpix { |
|
82 |
|
83 |
|
    // Singleton instance: created by Analysis::init(), released by
    // Analysis::shutdown().  NULL outside the init()..shutdown() window.
    Analysis* Analysis::theInstance_ = NULL;
|
85 |
|
86 void Analysis::init(InitParams& ip) { |
|
87 // Init thai analysis with thai dictionary |
|
88 std::string thai( Cpt::appendpath(ip.getResourceDir(), |
|
89 THAI_LANGUAGE_FILE) ); |
|
90 |
|
91 if ( Cpt::filesize( thai.c_str() ) ) { |
|
92 analysis::InitThaiAnalysis(thai.c_str()); |
|
93 } else { |
|
94 logMsg(CPIX_LL_WARNING, |
|
95 "Thai dictionary could not be found. Thai analysis will NOT work."); |
|
96 } |
|
97 |
|
98 // Setup the analysis instance |
|
99 theInstance_ = new Analysis(ip); |
|
100 } |
|
101 |
|
102 Analysis::Analysis(InitParams& ip) |
|
103 : defaultAnalyzer_(), |
|
104 queryAnalyzer_(), |
|
105 prefixAnalyzer_() { |
|
106 |
|
107 auto_ptr<AnalyzerExp::Piping> p = parse( Cpt::appendpath( ip.getResourceDir(), ANALYZER_FILE ) ); |
|
108 |
|
109 defaultAnalyzer_.reset( new CustomAnalyzer( *p, DEFAULT_ANALYZER_CONFIG ) ); |
|
110 queryAnalyzer_.reset( new CustomAnalyzer( *p, QUERY_ANALYZER_CONFIG ) ); |
|
111 prefixAnalyzer_.reset( new CustomAnalyzer( *p, PREFIX_ANALYZER_CONFIG ) ); |
|
112 } |
|
113 |
|
    /**
     * Reads and parses the analyzer definition file.
     *
     * @param path full path of the definition file (e.g. analyzer.loc)
     * @return parsed piping definition; never NULL
     * @throws CpixExc if the file cannot be opened or does not parse
     */
    auto_ptr<AnalyzerExp::Piping> Analysis::parse(std::string path) {
        std::wifstream in(path.c_str());
        auto_ptr<AnalyzerExp::Piping> ret;
        if ( in ) {

            // Reserve constant size buffer and populate it with definition
            //
            // NOTE(review): Cpt::filesize presumably returns the size in
            // bytes; the buffer is filesize+1 wchar_t elements, so it can
            // only over-allocate -- confirm for multibyte encodings.
            int filesize = Cpt::filesize(path.c_str());
            Cpt::auto_array<wchar_t> buf( new wchar_t[filesize+1] );
            in.read(buf.get(), filesize);
            // NUL-terminate: read() does not do it and may read fewer
            // characters than requested.
            buf.get()[filesize] = '\0';
            if ( !in.fail() ) {
                try {
                    ret = AnalyzerExp::ParsePiping( buf.get() );
                } catch (...) {} // best effort: a parse error falls through
                                 // to the THROW_CPIXEXC below
            }
            in.close();
        }

        // Covers all failure modes: missing file, read failure, parse error.
        if ( !ret.get() ) {
            THROW_CPIXEXC("Analyzer definition not found. %s could not be opened. ", path.c_str());
        }
        return ret;
    }
|
138 |
|
    /**
     * Tears down the analysis subsystem: releases Thai analysis
     * resources first, then destroys the singleton created by init().
     * Safe to call if init() never ran (deleting NULL is a no-op).
     */
    void Analysis::shutdown() {
        analysis::ShutdownThaiAnalysis();
        delete theInstance_;
        theInstance_ = NULL;
    }
|
144 |
|
    /**
     * Returns the analyzer configured for indexing ("default" config).
     * Precondition: init() has completed (theInstance_ is non-NULL).
     */
    lucene::analysis::Analyzer& Analysis::getDefaultAnalyzer() {
        // TODO: Assert( theInstance_ );
        return *theInstance_->defaultAnalyzer_;
    }
|
149 |
|
    /**
     * Returns the analyzer configured for query parsing ("query" config).
     * Precondition: init() has completed (theInstance_ is non-NULL).
     */
    lucene::analysis::Analyzer& Analysis::getQueryAnalyzer() {
        // TODO: Assert( theInstance_ );
        return *theInstance_->queryAnalyzer_;
    }
|
154 |
|
    /**
     * Returns the analyzer configured for prefix terms ("prefix" config).
     * Precondition: init() has completed (theInstance_ is non-NULL).
     */
    lucene::analysis::Analyzer& Analysis::getPrefixAnalyzer() {
        // TODO: Assert( theInstance_ );
        return *theInstance_->prefixAnalyzer_;
    }
53 |
159 |
54 PrefixGenerator::PrefixGenerator( |
160 PrefixGenerator::PrefixGenerator( |
55 lucene::analysis::TokenStream* in, |
161 lucene::analysis::TokenStream* in, |
56 bool deleteTS, |
162 bool deleteTS, |
57 size_t maxPrefixLength) |
163 size_t maxPrefixLength) |
219 return ret; |
325 return ret; |
220 } else { |
326 } else { |
221 return analyzer_->tokenStream( fieldName, reader ); |
327 return analyzer_->tokenStream( fieldName, reader ); |
222 } |
328 } |
223 } |
329 } |
224 |
|
225 // |
|
226 // Following sections provide the glue code for connecting the |
|
227 // analyzer definition syntax with analyzer, tokenizers and filter |
|
228 // implementations. |
|
229 // |
|
    // The glue code is template heavy with the intent of providing
|
231 // automation for associating specific keywords with specific |
|
232 // analyzers, tokenizers and filters implementing corresponding |
|
233 // CLucene abstractions. Additional classes are needed only if |
|
234 // filters, tokenizers, etc. accept parameters. |
|
235 // |
|
236 // NOTE: To understand the analyzers, it is sufficient to understand |
|
237 // that an analyzer transforms characters stream into specific token streams |
|
238 // (e.g. character stream 'foobarmetawords' can be transformed into token |
|
239 // stream 'foo', 'bar' 'meta' 'words'). Analysis consist of two main |
|
240 // parts which are tokenization and filtering. Tokenization converts |
|
241 // the character stream into token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr') |
|
242 // and filtering modifies the tokens (e.g. lowercase filtering 'FoO' -> |
|
243 // 'foo', 'bAr' -> 'bar'). Analyzer as an object is responsible for |
|
244 // constructing a tokenizer and a sequence of filters to perform |
|
245 // these required tasks. |
|
246 // |
|
247 // See the documentation around TokenizerClassEntries and |
|
248 // FilterClassEntries to see how implementations not taking parameters |
|
249 // can be easily added. |
|
250 // |
|
251 |
|
252 using namespace Cpix::AnalyzerExp; |
|
253 |
|
254 /** |
|
255 * Creates token stream for the given reader and fieldName. |
|
256 * This class in in many ways similar to CLucene analyzer class |
|
257 * definition. |
|
258 */ |
|
259 class TokenStreamFactory { |
|
260 public: |
|
261 virtual ~TokenStreamFactory(); |
|
262 virtual lucene::analysis::TokenStream* tokenStream(const wchar_t * fieldName, |
|
263 lucene::util::Reader * reader) = 0; |
|
264 }; |
|
265 |
|
266 TokenStreamFactory::~TokenStreamFactory() {}; |
|
267 |
|
268 /** |
|
269 * Template class used to create CLucene tokenizers. Template |
|
270 * parameter T must implement lucene::analysis::Tokenizer abstraction. |
|
271 */ |
|
272 template<class T> |
|
273 class TokenizerFactory : public TokenStreamFactory |
|
274 { |
|
275 public: |
|
276 TokenizerFactory(const Invokation& invokation) { |
|
277 if (invokation.params().size() > 0) { |
|
278 THROW_CPIXEXC(L"Tokenizer %S does not accept parameters", |
|
279 invokation.id().c_str()); |
|
280 } |
|
281 } |
|
282 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/, |
|
283 lucene::util::Reader * reader) { |
|
284 return _CLNEW T(reader); |
|
285 } |
|
286 }; |
|
287 |
|
    /**
     * Template class wrapping CLucene analyzers. Template parameter T must
     * implement the lucene::analysis::Analyzer abstraction and be
     * default-constructible (no parameters come from the definition).
     */
    template<class T>
    class AnalyzerWrap : public TokenStreamFactory
    {
    public:
        AnalyzerWrap(const Invokation& invokation) : analyzer_() {
            // NOTE(review): message says "Tokenizer" although this wraps an
            // analyzer; analyzers occupy the tokenizer slot of the
            // definition syntax, so the wording may be intentional -- confirm.
            if (invokation.params().size() > 0) {
                THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
                              invokation.id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                           lucene::util::Reader * reader) {
            return analyzer_.tokenStream(fieldName, reader);
        }
    private:
        T analyzer_; // one shared analyzer instance serves all requests
    };
|
309 |
|
310 /** |
|
311 * Template class associated with CLucene filter and a TokenStreamFactory. |
|
312 * Uses TokenStreamFactory to transform given character stream into tokenstream |
|
313 * and then applies the given Clucene filter to the token stream. |
|
314 * The template parameter T must implement lucene::analysis::Filter abstraction. |
|
315 */ |
|
316 template<class T> |
|
317 class FilterFactory : public TokenStreamFactory |
|
318 { |
|
319 public: |
|
320 FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) { |
|
321 if (invokation.params().size() > 0) { |
|
322 THROW_CPIXEXC(L"Filter %S does not accept parameters", |
|
323 invokation.id().c_str()); |
|
324 } |
|
325 } |
|
326 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
327 lucene::util::Reader * reader) { |
|
328 return _CLNEW T(factory_->tokenStream(fieldName, reader), true); |
|
329 } |
|
330 private: |
|
331 std::auto_ptr<TokenStreamFactory> factory_; |
|
332 }; |
|
333 |
|
    /**
     * Specialized analyzer wrap for CLucene's PerFieldAnalyzerWrapper.
     * A specialized template is needed because the per-field analyzer
     * accepts parameters (specific analyzers for the listed fields plus
     * a default analyzer for everything else).
     */
    template<>
    class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
    public:
        AnalyzerWrap(const Switch& sw) : analyzer_(0) {
            using namespace Cpt::Parser;
            using namespace lucene::analysis;

            // The switch default becomes the wrapper's fallback analyzer.
            analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));

            // Each case maps one or more field names to its own analyzer.
            // NOTE(review): if a CustomAnalyzer constructor throws here,
            // analyzer_ leaks (destructor of a half-built object never
            // runs) -- confirm whether definitions are trusted input.
            for (int i = 0; i < sw.cases().size(); i++) {
                const Case& cs = *sw.cases()[i];
                for (int j = 0; j < cs.fields().size(); j++) {
                    analyzer_->addAnalyzer( cs.fields()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping() ) );
                }
            }
        }
        virtual ~AnalyzerWrap() {
            _CLDELETE(analyzer_);
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                           lucene::util::Reader * reader) {
            return analyzer_->tokenStream(fieldName, reader);
        }
    private:
        lucene::analysis::PerFieldAnalyzerWrapper* analyzer_; // owned
    };
|
365 |
|
366 |
|
367 |
|
368 /** |
|
369 * Specialized StopFilter factory. Specialized filter is needed |
|
370 * because StopFilter needs parameters (stop word list or a language) |
|
371 */ |
|
372 template<> |
|
373 class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory |
|
374 { |
|
375 public: |
|
376 FilterFactory(const Invokation& invokation, |
|
377 auto_ptr<TokenStreamFactory> factory) |
|
378 :words_(0), ownWords_(0), factory_(factory) { |
|
379 using namespace Cpt::Parser; |
|
380 if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) { |
|
381 Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]); |
|
382 //cpix_LangCode lang; |
|
383 if (id->id() == CPIX_WLANG_EN) { |
|
384 words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS; |
|
385 } else { |
|
386 THROW_CPIXEXC(L"No prepared stopword list for language code '%S'", |
|
387 id->id().c_str()); |
|
388 } |
|
389 } else { |
|
390 ownWords_ = new wchar_t*[invokation.params().size()+1]; |
|
391 memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); |
|
392 // FIXE: args may leak |
|
393 for (int i = 0; i < invokation.params().size(); i++) { |
|
394 StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]); |
|
395 if (lit) { |
|
396 const wstring& str = lit->text(); |
|
397 ownWords_[i] = new wchar_t[str.length()+1]; |
|
398 wcscpy(ownWords_[i], str.c_str()); |
|
399 } else { |
|
400 THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters."); |
|
401 } |
|
402 } |
|
403 } |
|
404 |
|
405 } |
|
406 virtual ~FilterFactory() { |
|
407 if (ownWords_) { |
|
408 for (int i = 0; ownWords_[i]; i++) { |
|
409 delete[] ownWords_[i]; |
|
410 } |
|
411 delete[] ownWords_; |
|
412 } |
|
413 } |
|
414 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
415 lucene::util::Reader * reader) { |
|
416 return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_); |
|
417 } |
|
418 private: |
|
419 const wchar_t **words_; |
|
420 wchar_t **ownWords_; // owned |
|
421 std::auto_ptr<TokenStreamFactory> factory_; |
|
422 }; |
|
423 |
|
    /**
     * Specialized SnowballFilter factory is needed, because SnowballFilter
     * accepts a parameter (the stemming language).
     */
    template<>
    class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory
    {
    public:
        /**
         * @param invokation parsed invokation; exactly one language Identifier
         * @param factory upstream token stream producer (ownership taken)
         * @throws CpixExc on wrong arity or unsupported language
         */
        FilterFactory(const Invokation& invokation,
                      auto_ptr<TokenStreamFactory> factory)
        : factory_(factory) {
            using namespace Cpt::Parser;
            if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
                THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
            }
            Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
            // Only English stemming is wired up so far.
            if (id->id() == CPIX_WLANG_EN) {
                lang_ = cpix_LANG_EN;
            } else {
                THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
                              id->id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                           lucene::util::Reader * reader) {
            // 'true' hands ownership of the upstream stream to the filter.
            return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_);
        }
    private:
        cpix_LangCode lang_; // always set: constructor throws otherwise
        std::auto_ptr<TokenStreamFactory> factory_;
    };
|
455 |
|
456 /** |
|
457 * Specialized LengthFilter factory is needed, because length filter |
|
458 * accepts parameters (minimum length and maximum length) |
|
459 */ |
|
460 template<> |
|
461 class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory |
|
462 { |
|
463 public: |
|
464 FilterFactory(const Invokation& invokation, |
|
465 auto_ptr<TokenStreamFactory> factory) |
|
466 : factory_(factory) { |
|
467 using namespace Cpt::Parser; |
|
468 if (!(invokation.params().empty())) { |
|
469 if (invokation.params().size() != 2 || |
|
470 !dynamic_cast<IntegerLit*>(invokation.params()[0]) || |
|
471 !dynamic_cast<IntegerLit*>(invokation.params()[1])) { |
|
472 THROW_CPIXEXC("Length filter takes exactly two integer parameters"); |
|
473 } |
|
474 min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value(); |
|
475 max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value(); |
|
476 } |
|
477 } |
|
478 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
479 lucene::util::Reader * reader) { |
|
480 return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ ); |
|
481 } |
|
482 private: |
|
483 int min_, max_; |
|
484 std::auto_ptr<TokenStreamFactory> factory_; |
|
485 }; |
|
486 |
|
487 /** |
|
488 * Specialized PrefixGenerator factory is needed, because PrefixGenerator |
|
489 * requires the max prefix size. |
|
490 */ |
|
491 template<> |
|
492 class FilterFactory<PrefixGenerator> : public TokenStreamFactory |
|
493 { |
|
494 public: |
|
495 FilterFactory(const Invokation& invokation, |
|
496 auto_ptr<TokenStreamFactory> factory) |
|
497 : factory_(factory) { |
|
498 using namespace Cpt::Parser; |
|
499 if (invokation.params().empty()) { |
|
500 if (invokation.params().size() != 1 || |
|
501 !dynamic_cast<IntegerLit*>(invokation.params()[0])) { |
|
502 THROW_CPIXEXC("Prefix generator takes exactly one integer parameter"); |
|
503 } |
|
504 maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value(); |
|
505 } |
|
506 } |
|
507 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
508 lucene::util::Reader * reader) { |
|
509 return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); |
|
510 } |
|
511 private: |
|
512 int maxPrefixLength_; |
|
513 std::auto_ptr<TokenStreamFactory> factory_; |
|
514 }; |
|
515 |
|
516 |
|
517 typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation); |
|
518 typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, |
|
519 auto_ptr<TokenStreamFactory> factory); |
|
    /**
     * Sets up a tokenizer factory with the given invokation parameters.
     * Provides a uniform function-pointer entry (see TokenizerClassEntry)
     * for each tokenizer type T.
     */
    template<class T>
    struct TokenizerFactoryCtor
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
            return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation));
        }
    };
|
530 |
|
    /**
     * Sets up an analyzer wrap with the given invokation parameters.
     * Provides a uniform function-pointer entry (see TokenizerClassEntry)
     * for each analyzer type T.
     */
    template<class T>
    struct AnalyzerWrapCtor
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
            return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation));
        }
    };
|
541 |
|
    /**
     * Sets up a filter factory with the given invokation parameters,
     * taking ownership of the upstream factory. Provides a uniform
     * function-pointer entry (see FilterClassEntry) for each filter type T.
     */
    template<class T>
    struct FilterFactoryCtor
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
                                                   auto_ptr<TokenStreamFactory> factory) {
            return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory));
        }
    };
|
553 |
|
    // One row of the tokenizer/analyzer registration table: maps a
    // definition-syntax keyword to the factory creator implementing it.
    struct TokenizerClassEntry {
        const wchar_t *id_;
        TokenizerFactoryCreator createFactory_;
    };
|
558 |
|
    //
    // Following TokenizerClassEntries and FilterClassEntries contain
    // the mapping from tokenizer/analyzer/filter names into glue code
    // templates providing the implementations.
    //

    // Terminated by a {0, 0} sentinel; see CustomAnalyzer::getTokenizerEntry.
    TokenizerClassEntry TokenizerClassEntries[] = {
        {CPIX_TOKENIZER_STANDARD,   TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
        {CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
        {CPIX_TOKENIZER_LETTER,     TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
        {CPIX_TOKENIZER_KEYWORD,    TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
        {CPIX_ANALYZER_STANDARD,    AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},

        // TODO: Add more Tokenizers/Analyzers

        // Example tokenizer (works as such if tokenizers don't take parameters)
        // {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},

        // Example analyzer (works as such if analyzer don't take parameters)
        // {CPIX_ANALYZER_MYANALYZER, AnalyzerWrapCtor<MyAnalyzer>::create},

        {0, 0}
    };
|
582 |
|
    // One row of the filter registration table: maps a definition-syntax
    // keyword to the factory creator implementing it.
    struct FilterClassEntry {
        const wchar_t *id_;
        FilterFactoryCreator createFactory_;
    };
|
587 |
|
    // Terminated by a {0, 0} sentinel; see CustomAnalyzer::getFilterEntry.
    FilterClassEntry FilterClassEntries[] = {
        {CPIX_FILTER_STANDARD,  FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
        {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
        {CPIX_FILTER_ACCENT,    FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
        {CPIX_FILTER_STOP,      FilterFactoryCtor<lucene::analysis::StopFilter>::create},
        {CPIX_FILTER_STEM,      FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
        {CPIX_FILTER_LENGTH,    FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
        {CPIX_FILTER_PREFIXES,  FilterFactoryCtor<PrefixGenerator>::create},

        // TODO: Add more Filters

        // Example filter (works as such if analyzer don't take parameters)
        // {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create},

        {0, 0}
    };
|
604 |
|
605 CustomAnalyzer::CustomAnalyzer(const wchar_t* definition) |
|
606 { |
|
607 using namespace Cpt::Lex; |
|
608 using namespace Cpt::Parser; |
|
609 |
|
610 |
|
611 try |
|
612 { |
|
613 // 1. Setup an tokenizer |
|
614 Cpix::AnalyzerExp::Tokenizer |
|
615 tokenizer; |
|
616 StdLexer |
|
617 lexer(tokenizer, definition); |
|
618 |
|
619 // 2. Parse |
|
620 std::auto_ptr<Piping> |
|
621 def = ParsePiping(lexer); |
|
622 lexer.eatEof(); |
|
623 |
|
624 // 3. Setup this item based on parsed definition |
|
625 setup(*def); |
|
626 } |
|
627 catch (Cpt::ITxtCtxtExc & exc) |
|
628 { |
|
629 // provide addition info for thrown exception |
|
630 exc.setContext(definition); |
|
631 |
|
632 // throw it fwd |
|
633 throw; |
|
634 } |
|
635 } |
|
636 |
|
    /**
     * Builds a custom analyzer from an already-parsed definition tree.
     * Used when several analyzers are created from one parse (see
     * Analysis::Analysis).
     */
    CustomAnalyzer::CustomAnalyzer(const Piping& definition)
    {
        setup(definition);
    }
|
641 using namespace Cpt::Parser; |
|
642 |
|
    /**
     * Builds the token stream factory chain for this analyzer from a
     * parsed definition: first the tokenizer (or per-field switch), then
     * each filter wrapped around the previous stage.
     *
     * @param piping parsed definition: a tokenizer expression followed by
     *        zero or more filter invokations
     * @throws CpixExc (from getTokenizerEntry/getFilterEntry) on unknown
     *         names; std::bad_cast if the head is neither Invokation nor
     *         Switch
     */
    void CustomAnalyzer::setup(const Piping& piping) {

        // If the first item is invokation, create corresponding analyzer/tokenizer
        if (dynamic_cast<const Invokation*>(&piping.tokenizer()))
        {
            const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
            TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() );
            factory_ = tokenizerEntry.createFactory_( tokenizer );
        } else {
            // If the first item is switch statement, create per-field analyzer
            const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
            factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer );
        }

        // Add filters: each createFactory_ call takes ownership of the
        // current factory_ and returns a new factory wrapping it, so the
        // chain grows outward with every iteration.
        const std::vector<Invokation*>& filters = piping.filters();
        for (int i = 0; i < filters.size(); i++) {
            FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() );
            factory_ = filterEntry.createFactory_( *filters[i], factory_ );
        }
    }
|
664 |
|
665 TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) { |
|
666 |
|
667 // Looks for a match in the TokenizerClassEntries. After finding |
|
668 // a match it returns a proper tokenizer/analyzer implementation provider |
|
669 // |
|
670 for (int i = 0; TokenizerClassEntries[i].id_; i++) { |
|
671 if (id == std::wstring(TokenizerClassEntries[i].id_)) { |
|
672 return TokenizerClassEntries[i]; |
|
673 } |
|
674 } |
|
675 |
|
676 THROW_CPIXEXC(L"Unknown tokenizer '%S'.", |
|
677 id.c_str()); |
|
678 } |
|
679 |
|
680 FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) { |
|
681 |
|
682 // Looks for a match in the FilterClassEntries. After finding |
|
683 // a match it returns a proper tokenizer/analyzer implementation |
|
684 // provider |
|
685 // |
|
686 for (int i = 0; FilterClassEntries[i].id_; i++) { |
|
687 if (id == std::wstring(FilterClassEntries[i].id_)) { |
|
688 return FilterClassEntries[i]; |
|
689 } |
|
690 } |
|
691 |
|
692 THROW_CPIXEXC(L"Unknown filter '%S'.", |
|
693 id.c_str()); |
|
694 } |
|
695 |
|
    // Out-of-line for the sake of factory_'s complete type; the factory
    // chain is released by the smart pointer member.
    CustomAnalyzer::~CustomAnalyzer() {}
|
697 |
|
    /**
     * Produces a token stream for one field's character stream.
     * Delegates to the token stream factory chain prepared during
     * construction from the analyzer definition.
     *
     * @param fieldName field being analyzed (may select a per-field analyzer)
     * @param reader character stream to tokenize
     * @return new token stream; caller owns it
     */
    lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t * fieldName,
                                                               lucene::util::Reader * reader) {
        return factory_->tokenStream(fieldName, reader);
    }
|
706 |
|
707 } |
330 } |
708 |
331 |