searchengine/cpix/cpix/src/analyzer.cpp
changeset 1 6f2c1c46032b
parent 0 671dee74050a
child 2 6c1a2771f4b7
equal deleted inserted replaced
0:671dee74050a 1:6f2c1c46032b
    36 
    36 
    37 #include "document.h"
    37 #include "document.h"
    38 
    38 
    39 #include "indevicecfg.h" 
    39 #include "indevicecfg.h" 
    40 
    40 
       
    41 #include "initparams.h"
    41 namespace
    42 namespace
    42 {
    43 {
    43     const char AGGR_NONFILEREADERPROXY_ERR[] 
    44     const char AGGR_NONFILEREADERPROXY_ERR[] 
    44     = "Aggregated reader field should be FileReaderProxy instance";
    45     = "Aggregated reader field should be FileReaderProxy instance";
    45 
    46 
    48 }
    49 }
    49 
    50 
    50 
    51 
    51 namespace Cpix {
    52 namespace Cpix {
    52 
    53 
       
    54 	PrefixGenerator::PrefixGenerator(
       
    55 		lucene::analysis::TokenStream* in, 
       
    56 		bool deleteTS, 
       
    57 		size_t maxPrefixLength) 
       
    58 	: 	TokenFilter(in, deleteTS),
       
    59 	  	token_(), 
       
    60 	  	prefixLength_(0),
       
    61 	  	maxPrefixLength_(maxPrefixLength) {}
       
    62 	
       
    63 	
       
    64 	PrefixGenerator::~PrefixGenerator() {
       
    65 	}
       
    66 
       
    67 	
       
    68 	bool PrefixGenerator::next(lucene::analysis::Token* token) {
       
    69 		token_.setPositionIncrement(0); 
       
    70 
       
    71 		while (prefixLength_ == 0) {
       
    72 			token_.setPositionIncrement(1); // default position increment
       
    73 			if (!input->next(&token_)) {
       
    74 				return false;
       
    75 			}
       
    76 			prefixLength_ = std::min(token_.termTextLength(), maxPrefixLength_);
       
    77 		}
       
    78 			
       
    79 		// Clip token
       
    80 		std::wstring clipped; 
       
    81 		clipped = token_.termText();
       
    82 		token_.setText(clipped.substr(0, prefixLength_).c_str());
       
    83 		
       
    84 		// Copy
       
    85 		token->set(token_.termText(), token_.startOffset(), token_.endOffset(), token_.type());
       
    86 		token->setPositionIncrement(token_.getPositionIncrement());
       
    87 		
       
    88 		// Reduce prefixLength_
       
    89 		prefixLength_--;
       
    90 		return true; 
       
    91 	}
    53 
    92 
    54     AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, 
    93     AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, 
    55                                                          DocumentFieldIterator* fields) 
    94                                                          DocumentFieldIterator* fields) 
    56 	: stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) {
    95 	: stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) {
    57         getNextStream(); 
    96         getNextStream(); 
   134 	
   173 	
   135     lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR     * fieldName, 
   174     lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR     * fieldName, 
   136                                                                        lucene::util::Reader * reader) {
   175                                                                        lucene::util::Reader * reader) {
   137         if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
   176         if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
   138             return new AggregateFieldTokenStream( analyzer_, document_.fields()); 
   177             return new AggregateFieldTokenStream( analyzer_, document_.fields()); 
       
   178         } else if ( wcscmp( fieldName, LCPIX_DEFAULT_PREFIX_FIELD ) == 0 ) {
       
   179             return
       
   180 				new PrefixGenerator(
       
   181 					new AggregateFieldTokenStream( analyzer_, document_.fields()),
       
   182 					true,
       
   183 					OPTIMIZED_PREFIX_MAX_LENGTH);
   139         } else {
   184         } else {
   140             return analyzer_.tokenStream( fieldName, reader ); 
   185             return analyzer_.tokenStream( fieldName, reader ); 
   141         }
   186         }
   142     }
   187     }
   143 	
   188 	
   426         }
   471         }
   427     private: 
   472     private: 
   428         int min_, max_;
   473         int min_, max_;
   429         std::auto_ptr<TokenStreamFactory> factory_; 
   474         std::auto_ptr<TokenStreamFactory> factory_; 
   430     };
   475     };
       
   476     
       
   477     /**
       
   478      * Specialized PrefixGenerator factory is needed, because PrefixGenerator
       
   479      * requires the max prefix size. 
       
   480      */
       
   481     template<>
       
   482     class FilterFactory<PrefixGenerator> : public TokenStreamFactory 
       
   483     {
       
   484     public:
       
   485         FilterFactory(const Invokation& invokation, 
       
   486                       auto_ptr<TokenStreamFactory> factory) 
       
   487             : factory_(factory) {
       
   488             using namespace Cpt::Parser;
       
   489             if (invokation.params().size() != 1 || 
       
   490                 !dynamic_cast<IntegerLit*>(invokation.params()[0])) {
       
   491                 THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
       
   492             }
       
   493             maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
       
   494         }
       
   495         virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   496                                                            lucene::util::Reader * reader) {
       
   497             return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); 
       
   498         }
       
   499     private: 
       
   500         int maxPrefixLength_;
       
   501         std::auto_ptr<TokenStreamFactory> factory_; 
       
   502     };
       
   503 
   431 
   504 
   432     typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
   505     typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
   433     typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, 
   506     typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, 
   434                                                                  auto_ptr<TokenStreamFactory> factory);
   507                                                                  auto_ptr<TokenStreamFactory> factory);
   435     /**
   508     /**
   505         {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
   578         {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
   506         {CPIX_FILTER_ACCENT, 	FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
   579         {CPIX_FILTER_ACCENT, 	FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
   507         {CPIX_FILTER_STOP, 		FilterFactoryCtor<lucene::analysis::StopFilter>::create},
   580         {CPIX_FILTER_STOP, 		FilterFactoryCtor<lucene::analysis::StopFilter>::create},
   508         {CPIX_FILTER_STEM, 		FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
   581         {CPIX_FILTER_STEM, 		FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
   509         {CPIX_FILTER_LENGTH, 	FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
   582         {CPIX_FILTER_LENGTH, 	FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
       
   583         {CPIX_FILTER_PREFIXES, 	FilterFactoryCtor<PrefixGenerator>::create},
   510 
   584 
   511 // 		TODO: Add more Filters
   585 // 		TODO: Add more Filters
   512 
   586 
   513 // 		Example filter (works as such if the analyzer doesn't take parameters)
   587 // 		Example filter (works as such if the analyzer doesn't take parameters)
   514 //      {CPIX_FILTER_MYFILTER,	FilterFactoryCtor<MyFilter>::create},
   588 //      {CPIX_FILTER_MYFILTER,	FilterFactoryCtor<MyFilter>::create},