48 } |
49 } |
49 |
50 |
50 |
51 |
51 namespace Cpix { |
52 namespace Cpix { |
52 |
53 |
|
54 PrefixGenerator::PrefixGenerator( |
|
55 lucene::analysis::TokenStream* in, |
|
56 bool deleteTS, |
|
57 size_t maxPrefixLength) |
|
58 : TokenFilter(in, deleteTS), |
|
59 token_(), |
|
60 prefixLength_(0), |
|
61 maxPrefixLength_(maxPrefixLength) {} |
|
62 |
|
63 |
|
64 PrefixGenerator::~PrefixGenerator() { |
|
65 } |
|
66 |
|
67 |
|
68 bool PrefixGenerator::next(lucene::analysis::Token* token) { |
|
69 token_.setPositionIncrement(0); |
|
70 |
|
71 while (prefixLength_ == 0) { |
|
72 token_.setPositionIncrement(1); // default position increment |
|
73 if (!input->next(&token_)) { |
|
74 return false; |
|
75 } |
|
76 prefixLength_ = std::min(token_.termTextLength(), maxPrefixLength_); |
|
77 } |
|
78 |
|
79 // Clip token |
|
80 std::wstring clipped; |
|
81 clipped = token_.termText(); |
|
82 token_.setText(clipped.substr(0, prefixLength_).c_str()); |
|
83 |
|
84 // Copy |
|
85 token->set(token_.termText(), token_.startOffset(), token_.endOffset(), token_.type()); |
|
86 token->setPositionIncrement(token_.getPositionIncrement()); |
|
87 |
|
88 // Reduce prefixLength_ |
|
89 prefixLength_--; |
|
90 return true; |
|
91 } |
53 |
92 |
54 AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, |
93 AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, |
55 DocumentFieldIterator* fields) |
94 DocumentFieldIterator* fields) |
56 : stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) { |
95 : stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) { |
57 getNextStream(); |
96 getNextStream(); |
134 |
173 |
135 lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR * fieldName, |
174 lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR * fieldName, |
136 lucene::util::Reader * reader) { |
175 lucene::util::Reader * reader) { |
137 if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) { |
176 if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) { |
138 return new AggregateFieldTokenStream( analyzer_, document_.fields()); |
177 return new AggregateFieldTokenStream( analyzer_, document_.fields()); |
|
178 } else if ( wcscmp( fieldName, LCPIX_DEFAULT_PREFIX_FIELD ) == 0 ) { |
|
179 return |
|
180 new PrefixGenerator( |
|
181 new AggregateFieldTokenStream( analyzer_, document_.fields()), |
|
182 true, |
|
183 OPTIMIZED_PREFIX_MAX_LENGTH); |
139 } else { |
184 } else { |
140 return analyzer_.tokenStream( fieldName, reader ); |
185 return analyzer_.tokenStream( fieldName, reader ); |
141 } |
186 } |
142 } |
187 } |
143 |
188 |
426 } |
471 } |
427 private: |
472 private: |
428 int min_, max_; |
473 int min_, max_; |
429 std::auto_ptr<TokenStreamFactory> factory_; |
474 std::auto_ptr<TokenStreamFactory> factory_; |
430 }; |
475 }; |
|
476 |
|
477 /** |
|
478 * Specialized PrefixGenerator factory is needed, because PrefixGenerator |
|
479 * requires the max prefix size. |
|
480 */ |
|
481 template<> |
|
482 class FilterFactory<PrefixGenerator> : public TokenStreamFactory |
|
483 { |
|
484 public: |
|
485 FilterFactory(const Invokation& invokation, |
|
486 auto_ptr<TokenStreamFactory> factory) |
|
487 : factory_(factory) { |
|
488 using namespace Cpt::Parser; |
|
489 if (invokation.params().size() != 1 || |
|
490 !dynamic_cast<IntegerLit*>(invokation.params()[0])) { |
|
491 THROW_CPIXEXC("Prefix generator takes exactly one integer parameter"); |
|
492 } |
|
493 maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value(); |
|
494 } |
|
495 virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName, |
|
496 lucene::util::Reader * reader) { |
|
497 return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); |
|
498 } |
|
499 private: |
|
500 int maxPrefixLength_; |
|
501 std::auto_ptr<TokenStreamFactory> factory_; |
|
502 }; |
|
503 |
431 |
504 |
432 typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation); |
505 typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation); |
433 typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, |
506 typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, |
434 auto_ptr<TokenStreamFactory> factory); |
507 auto_ptr<TokenStreamFactory> factory); |
435 /** |
508 /** |
505 {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create}, |
578 {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create}, |
506 {CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create}, |
579 {CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create}, |
507 {CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create}, |
580 {CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create}, |
508 {CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create}, |
581 {CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create}, |
509 {CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create}, |
582 {CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create}, |
|
583 {CPIX_FILTER_PREFIXES, FilterFactoryCtor<PrefixGenerator>::create}, |
510 |
584 |
511 // TODO: Add more Filters |
585 // TODO: Add more Filters |
512 |
586 |
513 // Example filter (works as such if analyzer don't take parameters) |
587 // Example filter (works as such if analyzer don't take parameters) |
514 // {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create}, |
588 // {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create}, |