searchengine/cpix/cpix/src/analyzer.cpp
changeset 10 afe194b6b1cd
parent 7 a5fbfefd615f
child 19 e3c09e9691e0
9:d575fd691cf9 10:afe194b6b1cd
*
* Description: 
*
*/

// general utilities
#include "wchar.h"
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <climits>     // INT_MAX, used for permissive filter defaults below

// clucene
#include "CLucene.h"

// support
#include "cpixparsetools.h"
#include "cpixfstools.h"

// internal
#include "analyzer.h"
#include "cpixanalyzer.h"
#include "cpixexc.h"
#include "document.h"
#include "cluceneext.h"
#include "indevicecfg.h"
#include "initparams.h"
#include "thaianalysis.h"

#include "analyzerexp.h"
#include "customanalyzer.h"
#include "common/cpixlog.h"

namespace
{
    const char AGGR_NONFILEREADERPROXY_ERR[] 
    = "Aggregated reader field should be FileReaderProxy instance";

    const char AGGR_STREAMREADER_ERR[] 
    = "Aggregating streamValue-fields not implemented";

    const char THAI_LANGUAGE_FILE[] 
    = "thaidict.sm";

    const char ANALYZER_FILE[]
    = "analyzer.loc";

    const wchar_t DEFAULT_ANALYZER_CONFIG[]
        = L"default";

    const wchar_t QUERY_ANALYZER_CONFIG[]
        = L"query";

    const wchar_t PREFIX_ANALYZER_CONFIG[]
        = L"prefix";

//    const wchar_t CPIX_ANALYZER_FALLBACK[]
//    = CPIX_ANALYZER_STANDARD;
//
//    const wchar_t CPIX_PREFIX_ANALYZER_FALLBACK[]
//    = CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE;

}


namespace Cpix {

Analysis* Analysis::theInstance_ = NULL; 

	void Analysis::init(InitParams& ip) {
		// Init Thai analysis with the Thai dictionary
		std::string thai( Cpt::appendpath(ip.getResourceDir(),
										  THAI_LANGUAGE_FILE) );
		
		if ( Cpt::filesize( thai.c_str() ) ) {
			analysis::InitThaiAnalysis(thai.c_str());
		} else {
			logMsg(CPIX_LL_WARNING,
				   "Thai dictionary could not be found. Thai analysis will NOT work.");
		}
	
		// Setup the analysis instance
		theInstance_ = new Analysis(ip);
	}
	
	Analysis::Analysis(InitParams& ip) 
	:	defaultAnalyzer_(),
	 	queryAnalyzer_(), 
		prefixAnalyzer_() {
		
		auto_ptr<AnalyzerExp::Piping> p = parse( Cpt::appendpath( ip.getResourceDir(), ANALYZER_FILE ) );
		
		defaultAnalyzer_.reset( new CustomAnalyzer( *p, DEFAULT_ANALYZER_CONFIG ) ); 
		queryAnalyzer_.reset( new CustomAnalyzer( *p, QUERY_ANALYZER_CONFIG ) ); 
		prefixAnalyzer_.reset( new CustomAnalyzer( *p, PREFIX_ANALYZER_CONFIG ) ); 
	}
	
	auto_ptr<AnalyzerExp::Piping> Analysis::parse(std::string path) {
		std::wifstream in(path.c_str());
		auto_ptr<AnalyzerExp::Piping> ret; 
		if ( in ) {
		
			// Reserve a constant size buffer and populate it with the definition
			//
			int filesize = Cpt::filesize(path.c_str()); 
			Cpt::auto_array<wchar_t> buf( new wchar_t[filesize+1] );
			in.read(buf.get(), filesize);
			buf.get()[filesize] = L'\0'; 
			if ( !in.fail() ) {
				try {
					ret = AnalyzerExp::ParsePiping( buf.get() );
				} catch (...) {
					// parse errors fall through to the THROW below
				}
			} 
			in.close();
		} 
		
		if ( !ret.get() ) { 
			THROW_CPIXEXC("Analyzer definition could not be read or parsed from %s.", path.c_str()); 
		}
		return ret; 
	}
	
	void Analysis::shutdown() {
		analysis::ShutdownThaiAnalysis(); 
		delete theInstance_;
		theInstance_ = NULL; 
	}

	lucene::analysis::Analyzer& Analysis::getDefaultAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->defaultAnalyzer_; 
	}

	lucene::analysis::Analyzer& Analysis::getQueryAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->queryAnalyzer_; 
	}

	lucene::analysis::Analyzer& Analysis::getPrefixAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->prefixAnalyzer_; 
	}
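
	// A minimal usage sketch (illustration only, not part of this file's
	// build): how the Analysis singleton is driven, assuming an InitParams
	// instance whose resource directory contains thaidict.sm and
	// analyzer.loc.
#if 0
	void exampleAnalysisLifecycle(InitParams & ip)
	{
		Analysis::init(ip);    // loads analyzer.loc and inits Thai analysis

		lucene::analysis::Analyzer
			& indexing = Analysis::getDefaultAnalyzer(),
			& querying = Analysis::getQueryAnalyzer();

		// ... index and search using the two analyzers ...

		Analysis::shutdown();  // tears the singleton down again
	}
#endif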

	PrefixGenerator::PrefixGenerator(
		lucene::analysis::TokenStream* in, 
		bool deleteTS, 
		size_t maxPrefixLength) 

	// ...

            return ret;
        } else {
            return analyzer_->tokenStream( fieldName, reader ); 
        }
    }

    //
    // The following sections provide the glue code for connecting the 
    // analyzer definition syntax with the analyzer, tokenizer and filter 
    // implementations. 
    //
    // The glue code is template heavy, with the intent of providing 
    // automation for associating specific keywords with specific
    // analyzers, tokenizers and filters implementing the corresponding 
    // CLucene abstractions. Additional classes are needed only if 
    // filters, tokenizers, etc. accept parameters.
    //
    // NOTE: To understand the analyzers, it is sufficient to understand
    // that an analyzer transforms a character stream into a specific token 
    // stream (e.g. the character stream 'foobarmetawords' can be transformed 
    // into the token stream 'foo', 'bar', 'meta', 'words'). Analysis consists 
    // of two main parts: tokenization and filtering. Tokenization converts
    // the character stream into a token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr')
    // and filtering modifies the tokens (e.g. lowercase filtering 'FoO' -> 
    // 'foo', 'bAr' -> 'bar'). An analyzer object is responsible for
    // constructing a tokenizer and a sequence of filters to perform
    // these tasks.  
    // 
    // See the documentation around TokenizerClassEntries and 
    // FilterClassEntries to see how implementations not taking parameters
    // can be easily added.  
    // 
       
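    // For a concrete taste of the definition syntax, a definition string can
    // be composed from the keyword constants, exactly as the commented-out
    // CPIX_PREFIX_ANALYZER_FALLBACK near the top of this file does. A sketch,
    // relying only on constants already referenced in this file:
#if 0
    const wchar_t EXAMPLE_DEFINITION[] =
        CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE;
    // i.e. one tokenizer keyword followed by '>'-separated filter keywords.
#endif
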
    using namespace Cpix::AnalyzerExp;
    
    /**
     * Creates a token stream for the given reader and fieldName.
     * This class is in many ways similar to the CLucene Analyzer class.
     */
    class TokenStreamFactory {
    public: 
        virtual ~TokenStreamFactory(); 
        virtual lucene::analysis::TokenStream* tokenStream(const wchar_t        * fieldName, 
                                                           lucene::util::Reader * reader) = 0;
    };
	
    TokenStreamFactory::~TokenStreamFactory() {}
	
    /**
     * Template class used to create CLucene tokenizers. Template
     * parameter T must implement lucene::analysis::Tokenizer abstraction.  
     */    
    template<class T>
    class TokenizerFactory : public TokenStreamFactory 
    {
    public:
        TokenizerFactory(const Invokation& invokation) {
            if (invokation.params().size() > 0) {
                THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
                              invokation.id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * /*fieldName*/, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW T(reader); 
        }
    };
       
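    // Usage sketch (illustration only): wrapping a parameterless CLucene
    // tokenizer. Invokation instances normally come from the parsed
    // definition; here one with no parameters is assumed to exist.
#if 0
    lucene::analysis::TokenStream *
    exampleTokenize(const Invokation     & inv,    // e.g. a parsed 'whitespace'
                    lucene::util::Reader * reader)
    {
        TokenizerFactory<lucene::analysis::WhitespaceTokenizer>
            factory(inv);                 // throws if inv carries parameters
        return factory.tokenStream(NULL,  // field name is ignored here
                                   reader);
    }
#endif
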

    /**
     * Template class wrapping CLucene analyzers. Template parameter T must 
     * implement lucene::analysis::Analyzer abstraction.  
     */    
    template<class T>
    class AnalyzerWrap : public TokenStreamFactory 
    {
    public:
        AnalyzerWrap(const Invokation& invokation) : analyzer_() {
            if (invokation.params().size() > 0) {
                THROW_CPIXEXC(L"Analyzer %S does not accept parameters",
                              invokation.id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return analyzer_.tokenStream(fieldName, reader); 
        }
    private: 
        T analyzer_;
    };

    /**
     * Template class associating a CLucene filter with a TokenStreamFactory. 
     * Uses the TokenStreamFactory to transform the given character stream into 
     * a token stream and then applies the given CLucene filter to the token 
     * stream. The template parameter T must implement the 
     * lucene::analysis::TokenFilter abstraction.     
     */    
    template<class T>
    class FilterFactory : public TokenStreamFactory 
    {
    public:
        FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) {
            if (invokation.params().size() > 0) {
                THROW_CPIXEXC(L"Filter %S does not accept parameters",
                              invokation.id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW T(factory_->tokenStream(fieldName, reader), true); 
        }
    private: 
        std::auto_ptr<TokenStreamFactory> factory_; 
    };
       
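    // Usage sketch (illustration only): composing a filter on top of a
    // tokenizer, mirroring what CustomAnalyzer::setup() does further below.
    // The resulting factory yields lowercase(whitespace(input)) streams.
#if 0
    std::auto_ptr<TokenStreamFactory>
    exampleChain(const Invokation & tokenizerInv,
                 const Invokation & filterInv)
    {
        std::auto_ptr<TokenStreamFactory>
            tokens(new TokenizerFactory<lucene::analysis::WhitespaceTokenizer>(tokenizerInv));
        std::auto_ptr<TokenStreamFactory>
            filtered(new FilterFactory<lucene::analysis::LowerCaseFilter>(filterInv, tokens));
        return filtered;
    }
#endif
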

	/**
	 * Specialized AnalyzerWrap for CLucene's PerFieldAnalyzerWrapper. A 
	 * specialized template is needed because the per-field analyzer accepts 
	 * parameters (specific analyzers for different fields plus a default 
	 * analyzer).
	 */
    template<>
    class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
    public:
        AnalyzerWrap(const Switch& sw) : analyzer_(0) {
            using namespace Cpt::Parser;
            using namespace lucene::analysis;
			
            analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));
			
            for (int i = 0; i < sw.cases().size(); i++) {
                const Case& cs = *sw.cases()[i];
                for (int j = 0; j < cs.fields().size(); j++) {
                    analyzer_->addAnalyzer( cs.fields()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping() ) );
                }
            }
        }
        virtual ~AnalyzerWrap() {
            _CLDELETE(analyzer_);
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return analyzer_->tokenStream(fieldName, reader); 
        }
    private: 
        lucene::analysis::PerFieldAnalyzerWrapper* analyzer_;
    };
       
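	// For comparison, a sketch of the same effect written directly against
	// the CLucene API this wrap drives (the field name is made up):
#if 0
	lucene::analysis::PerFieldAnalyzerWrapper * examplePerField()
	{
		using namespace lucene::analysis;

		// the wrapped analyzer handles all fields without a dedicated one ...
		PerFieldAnalyzerWrapper * wrap =
			_CLNEW PerFieldAnalyzerWrapper(_CLNEW SimpleAnalyzer());
		// ... while named fields get dedicated analyzers
		wrap->addAnalyzer(L"_exampleField", _CLNEW WhitespaceAnalyzer());
		return wrap;
	}
#endif
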

	/**
	 * Specialized StopFilter factory. A specialization is needed
	 * because StopFilter takes parameters (a stop word list or a language).
	 */
    template<>
    class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory 
    {
    public:
        FilterFactory(const Invokation& invokation,
                      auto_ptr<TokenStreamFactory> factory)
            :words_(0),  ownWords_(0), factory_(factory) {
            using namespace Cpt::Parser;
            if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) {
                Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
                //cpix_LangCode lang; 
                if (id->id() == CPIX_WLANG_EN) {
                    words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS;
                } else {
                    THROW_CPIXEXC(L"No prepared stopword list for language code '%S'",
                                  id->id().c_str());
                }
            } else {
                ownWords_ = new wchar_t*[invokation.params().size()+1];
                memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); 
                // FIXME: args may leak if an allocation below throws
                for (int i = 0; i < invokation.params().size(); i++) {
                    StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
                    if (lit) {
                        const wstring& str = lit->text(); 
                        ownWords_[i] = new wchar_t[str.length()+1]; 
                        wcscpy(ownWords_[i], str.c_str());
                    } else {
                        THROW_CPIXEXC(L"StopFilter accepts only a language identifier or a list of strings as parameters.");
                    }
                }
            }
		
        }
        virtual ~FilterFactory() { 
            if (ownWords_) {
                for (int i = 0; ownWords_[i]; i++) {
                    delete[] ownWords_[i]; 
                }
                delete[] ownWords_;
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_); 
        }
    private: 
        const wchar_t **words_;
        wchar_t **ownWords_; // owned
        std::auto_ptr<TokenStreamFactory> factory_; 
    };
       
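    // In definition-syntax terms the two accepted parameter shapes are a
    // single language identifier or a list of string literals, along the
    // lines of (a sketch: the actual stop filter keyword literal is the
    // value of CPIX_FILTER_STOP, defined elsewhere):
    //
    //     ...>stop(en)             for the prepared English stopword list
    //     ...>stop('foo', 'bar')   for explicit stopword strings
    //
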
	
    /**
     * Specialized SnowballFilter factory is needed, because SnowballFilter
     * accepts parameters (the language). 
     */
    template<>
    class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory 
    {
    public:
        FilterFactory(const Invokation& invokation, 
                      auto_ptr<TokenStreamFactory> factory)
            : factory_(factory) {
            using namespace Cpt::Parser;
            if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
                THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
            }
            Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
            if (id->id() == CPIX_WLANG_EN) {
                lang_ = cpix_LANG_EN; 
            } else {
                THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
                              id->id().c_str());
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_); 
        }
    private: 
        cpix_LangCode lang_;
        std::auto_ptr<TokenStreamFactory> factory_; 
    };
       

    /**
     * A specialized LengthFilter factory is needed because the length filter 
     * accepts parameters (minimum length and maximum length).
     */
    template<>
    class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory 
    {
    public:
        FilterFactory(const Invokation& invokation, 
                      auto_ptr<TokenStreamFactory> factory) 
            : min_(0), max_(INT_MAX), // permissive defaults when no parameters are given
              factory_(factory) {
            using namespace Cpt::Parser;
            if (!(invokation.params().empty())) {
                if (invokation.params().size() != 2 || 
                        !dynamic_cast<IntegerLit*>(invokation.params()[0]) || 
                        !dynamic_cast<IntegerLit*>(invokation.params()[1])) {
                    THROW_CPIXEXC("Length filter takes exactly two integer parameters");
                }
                min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
                max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value();
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ ); 
        }
    private: 
        int min_, max_;
        std::auto_ptr<TokenStreamFactory> factory_; 
    };
       
    
    /**
     * A specialized PrefixGenerator factory is needed because PrefixGenerator
     * requires the max prefix size. 
     */
    template<>
    class FilterFactory<PrefixGenerator> : public TokenStreamFactory 
    {
    public:
        FilterFactory(const Invokation& invokation, 
                      auto_ptr<TokenStreamFactory> factory) 
            : maxPrefixLength_(INT_MAX), // no limit unless a parameter is given
              factory_(factory) {
            using namespace Cpt::Parser;
            if (!invokation.params().empty()) {
                if (invokation.params().size() != 1 || 
                    !dynamic_cast<IntegerLit*>(invokation.params()[0])) {
                    THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
                }            
                maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
            }
        }
        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
                                                           lucene::util::Reader * reader) {
            return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); 
        }
    private: 
        int maxPrefixLength_;
        std::auto_ptr<TokenStreamFactory> factory_; 
    };
       


    typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
    typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, 
                                                                 auto_ptr<TokenStreamFactory> factory);
    /**
     * Sets up a tokenizer factory with given invokation parameters
     */
    template<class T>
    struct TokenizerFactoryCtor
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
            return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation)); 
        }
    };

    /**
     * Sets up an analyzer wrap with given invokation parameters
     */
    template<class T>
    struct AnalyzerWrapCtor
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
            return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation)); 
        }
    };

    /**
     * Sets up a filter factory with given invokation parameters
     */
    template<class T>
    struct FilterFactoryCtor 
    {
        static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
                                                   auto_ptr<TokenStreamFactory> factory) {
            return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory)); 
        }
    };

    struct TokenizerClassEntry {
        const wchar_t *id_;
        TokenizerFactoryCreator createFactory_;
    };
       
    
    //
    // The following TokenizerClassEntries and FilterClassEntries contain
    // the mapping from tokenizer/analyzer/filter names to the glue code
    // templates providing the implementations. 
    // 
	
    TokenizerClassEntry TokenizerClassEntries[] = { 
        {CPIX_TOKENIZER_STANDARD, 	TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
        {CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
        {CPIX_TOKENIZER_LETTER, 	TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
        {CPIX_TOKENIZER_KEYWORD, 	TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
        {CPIX_ANALYZER_STANDARD, 	AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},

// 		TODO: Add more Tokenizers/Analyzers
        
// 		Example tokenizer (works as such if the tokenizer doesn't take parameters)
//      {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},

// 		Example analyzer (works as such if the analyzer doesn't take parameters)
//      {CPIX_ANALYZER_MYANALYZER,	AnalyzerWrapCtor<MyAnalyzer>::create},

        {0, 						0}
    };
       
	
    struct FilterClassEntry {
        const wchar_t *id_;
        FilterFactoryCreator createFactory_;
    };

    FilterClassEntry FilterClassEntries[] = {
        {CPIX_FILTER_STANDARD, 	FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
        {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
        {CPIX_FILTER_ACCENT, 	FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
        {CPIX_FILTER_STOP, 		FilterFactoryCtor<lucene::analysis::StopFilter>::create},
        {CPIX_FILTER_STEM, 		FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
        {CPIX_FILTER_LENGTH, 	FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
        {CPIX_FILTER_PREFIXES, 	FilterFactoryCtor<PrefixGenerator>::create},

// 		TODO: Add more Filters

// 		Example filter (works as such if the filter doesn't take parameters)
//      {CPIX_FILTER_MYFILTER,	FilterFactoryCtor<MyFilter>::create},

        {0, 					0}
    };
       
	
    CustomAnalyzer::CustomAnalyzer(const wchar_t* definition)
    {
        using namespace Cpt::Lex;
        using namespace Cpt::Parser;


        try
            {
				// 1. Setup a tokenizer
                Cpix::AnalyzerExp::Tokenizer 
                    tokenizer; 
                StdLexer 
                    lexer(tokenizer, definition);
                
                // 2. Parse 
                std::auto_ptr<Piping> 
                    def = ParsePiping(lexer); 
                lexer.eatEof();
                
                // 3. Setup this item based on the parsed definition
                setup(*def);
            }
        catch (Cpt::ITxtCtxtExc & exc)
            {
                // provide additional info for the thrown exception
                exc.setContext(definition);

                // throw it fwd
                throw;
            }
    }
       
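    // Usage sketch (illustration only): building an analyzer from a
    // definition string and tokenizing a field. StringReader is CLucene's
    // in-memory reader; the field name is made up, and the definition is
    // composed from keyword constants referenced elsewhere in this file.
#if 0
    void exampleCustomAnalyzer()
    {
        CustomAnalyzer
            analyzer(CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE);
        lucene::util::StringReader
            reader(L"FoO bAr");
        lucene::analysis::TokenStream *
            tokens = analyzer.tokenStream(L"_exampleField", &reader);
        // ... iterate tokens ('foo', 'bar') ... then clean up:
        _CLDELETE(tokens);
    }
#endif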

    CustomAnalyzer::CustomAnalyzer(const Piping& definition)
    {	
        setup(definition);
    }

    using namespace Cpt::Parser;
	
    void CustomAnalyzer::setup(const Piping& piping) {
    
		// If the first item is an Invokation, create the corresponding analyzer/tokenizer 
        if (dynamic_cast<const Invokation*>(&piping.tokenizer())) 
        {
            const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
            TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() ); 
            factory_ = tokenizerEntry.createFactory_( tokenizer );
        } else {
            // If the first item is a switch statement, create a per-field analyzer 
            const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
            factory_.reset( new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer ) );
        }
        
        // Add filters; each filter factory wraps the one built so far, so the
        // last filter listed in the definition ends up outermost.
        const std::vector<Invokation*>& filters = piping.filters(); 
        for (int i = 0; i < filters.size(); i++) {
            FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() ); 
            factory_ = filterEntry.createFactory_( *filters[i], factory_ );
        }
    }
       

    TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) {
    
		// Looks for a match in the TokenizerClassEntries. On finding 
		// a match it returns the proper tokenizer/analyzer implementation 
		// provider. 
		// 
        for (int i = 0; TokenizerClassEntries[i].id_; i++) {
            if (id == std::wstring(TokenizerClassEntries[i].id_)) {
                return TokenizerClassEntries[i];
            }
        }

        THROW_CPIXEXC(L"Unknown tokenizer '%S'.",
                      id.c_str());
    }

    FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) {
    
		// Looks for a match in the FilterClassEntries. On finding 
		// a match it returns the proper filter implementation 
		// provider. 
		// 
        for (int i = 0; FilterClassEntries[i].id_; i++) {
            if (id == std::wstring(FilterClassEntries[i].id_)) {
                return FilterClassEntries[i];
            }
        }

        THROW_CPIXEXC(L"Unknown filter '%S'.",
                      id.c_str());
    }
       
	
    CustomAnalyzer::~CustomAnalyzer() {} 

    lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t        * fieldName, 
                                                               lucene::util::Reader * reader) {
        // Utilizes the token stream factory to form the token stream. The 
        // token stream factory is prepared during custom analyzer 
        // construction, based on the analyzer definition string.
                                                               
        return factory_->tokenStream(fieldName, reader);
    }

}