searchengine/cpix/cpix/src/customanalyzer.cpp
changeset 10 afe194b6b1cd
child 19 e3c09e9691e0
equal deleted inserted replaced
9:d575fd691cf9 10:afe194b6b1cd
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 // system library
       
    20 #include "wchar.h"
       
    21 #include <string>
       
    22 #include <vector>
       
    23 #include <sstream>
       
    24 #include <iostream>
       
    25 #include <glib.h>
       
    26 
       
    27 // clucene 
       
    28 #include "CLucene.h"
       
    29 #include "CLucene/analysis/AnalysisHeader.h"
       
    30 #include "CLucene/analysis/Analyzers.h"
       
    31 
       
// local library
       
    33 #include "thaianalysis.h"
       
    34 #include "ngram.h"
       
    35 #include "koreananalyzer.h"
       
    36 #include "cjkanalyzer.h"
       
    37 #include "cpixparsetools.h"
       
    38 #include "prefixfilter.h"
       
    39 
       
    40 // cpix internal
       
    41 #include "customanalyzer.h"
       
    42 #include "cpixanalyzer.h"
       
    43 #include "analyzer.h"
       
    44 #include "cluceneext.h"
       
    45 #include "analyzerexp.h"
       
    46 #include "indevicecfg.h"
       
    47 #include "cpixexc.h"
       
    48 #include "localization.h"
       
    49 
       
    50 namespace Cpix {
       
    51 	
       
    52 	//
       
    53 	// Following sections provide the glue code for connecting the 
       
    54 	// analyzer definition syntax with analyzer, tokenizers and filter 
       
    55 	// implementations. 
       
    56 	//
       
	// The glue code is template heavy with the intent of providing 
       
    58 	// automation for associating specific keywords with specific
       
    59 	// analyzers, tokenizers and filters implementing corresponding 
       
    60 	// CLucene abstractions. Additional classes are needed only if 
       
    61 	// filters, tokenizers, etc. accept parameters.
       
    62 	//
       
    63 	// NOTE: To understand the analyzers, it is sufficient to understand
       
    64 	// that an analyzer transforms characters stream into specific token streams 
       
    65 	// (e.g. character stream 'foobarmetawords' can be transformed into token 
       
	// stream 'foo', 'bar' 'meta' 'words'). Analysis consists of two main
       
    67 	// parts which are tokenization and filtering. Tokenization converts
       
    68 	// the character stream into token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr')
       
    69 	// and filtering modifies the tokens (e.g. lowercase filtering 'FoO' -> 
       
    70 	// 'foo', 'bAr' -> 'bar'). Analyzer as an object is responsible for
       
    71 	// constructing a tokenizer and a sequence of filters to perform
       
    72 	// these required tasks.  
       
    73 	// 
       
    74 	// See the documentation around TokenizerClassEntries and 
       
    75 	// FilterClassEntries to see how implementations not taking parameters
       
    76 	// can be easily added.  
       
    77 	// 
       
    78 	
       
    79 	using namespace Cpix::AnalyzerExp;
       
    80 	
       
    81 // Safe assumption
       
    82 #define MAX_LANGCODE_LENGTH 256
       
    83 	
       
	/**
	 * TokenStreamFactory that picks a language-specific CustomAnalyzer at
	 * analysis time. One CustomAnalyzer is built per language code listed
	 * in the locale switch; a default analyzer handles all other locales.
	 */
	class LocaleSwitchStreamFactory : public TokenStreamFactory {
	public: 
		
		// Builds the per-language analyzer map from the parsed switch
		// definition; 'config' is forwarded to each case's CustomAnalyzer.
		LocaleSwitchStreamFactory(const AnalyzerExp::LocaleSwitch& sw, const wchar_t* config);
		
		~LocaleSwitchStreamFactory();
		
		// Resolves the active device languages and delegates to the
		// language-aware overload below.
		virtual lucene::analysis::TokenStream* tokenStream(const wchar_t        * fieldName, 
														   lucene::util::Reader * reader);
		
		// Uses the first language in 'languages' that has a registered
		// analyzer; falls back to the default analyzer otherwise.
		lucene::analysis::TokenStream* tokenStream(std::vector<std::wstring>& languages, 
												   const wchar_t            * fieldName, 
												   lucene::util::Reader     * reader);
		
	private: 
		// Language code -> owned analyzer (deleted in the destructor).
		std::map<std::wstring, CustomAnalyzer*> analyzers_;
		// Fallback analyzer used when no language matches.
		std::auto_ptr<CustomAnalyzer> default_;  
	};
       
   102 
       
   103 
       
   104 	TokenStreamFactory::~TokenStreamFactory() {};
       
   105 
       
   106 	LocaleSwitchStreamFactory::LocaleSwitchStreamFactory(const LocaleSwitch& sw, const wchar_t* config) {
       
   107 		for (int i = 0; i < sw.cases().size(); i++) {
       
   108 			const Case& cs = *sw.cases()[i];
       
   109 			for (int j = 0; j < cs.cases().size(); j++) {
       
   110 				std::wstring c = cs.cases()[j]; 
       
   111 				if (analyzers_.count(c)) delete analyzers_[c]; 
       
   112 				analyzers_[c] = new CustomAnalyzer(cs.piping(), config);
       
   113 			}
       
   114 		}
       
   115 		default_.reset(new CustomAnalyzer(sw.def())); 
       
   116 	}
       
   117 	
       
   118 	LocaleSwitchStreamFactory::~LocaleSwitchStreamFactory() {
       
   119 		typedef std::map<std::wstring, CustomAnalyzer*>::iterator iter;
       
   120 		for (iter i = analyzers_.begin(); i != analyzers_.end(); i++) {
       
   121 			delete i->second;
       
   122 		}
       
   123 	}
       
   124 		
       
   125 	lucene::analysis::TokenStream* 
       
   126 		LocaleSwitchStreamFactory::tokenStream(const wchar_t        * fieldName, 
       
   127 											   lucene::util::Reader * reader) {
       
   128 		std::vector<std::wstring> languages = 
       
   129 				Localization::instance().getLanguageNames();
       
   130 	 
       
   131 		return tokenStream(languages, fieldName, reader); 
       
   132 	}
       
   133 	
       
   134 	lucene::analysis::TokenStream* 
       
   135 		LocaleSwitchStreamFactory::tokenStream(std::vector<std::wstring>& languages, 
       
   136 											   const wchar_t            * fieldName, 
       
   137 											   lucene::util::Reader     * reader) {
       
   138 		for (int i = 0; i < languages.size(); i++) {
       
   139 			if ( analyzers_.count(languages[i]) ) {
       
   140 				return analyzers_[languages[i]]->tokenStream( fieldName, reader );
       
   141 			}
       
   142 		}		
       
   143 		return default_->tokenStream( fieldName, reader ); 
       
   144 	}
       
   145 	
       
	/**
	 * TokenStreamFactory that delegates to one of the globally configured
	 * CPix analyzers (default, query or prefix analyzer), selected by an
	 * optional identifier parameter in the analyzer definition.
	 */
	class DefaultTokenStreamFactory : public TokenStreamFactory {
	public:
	
		// Which global analyzer this factory delegates to.
		enum Target {
			NORMAL, 
			INDEXING,
			QUERY,
			PREFIX
		};
	
		// Accepts zero parameters (-> NORMAL) or exactly one identifier:
		// CPIX_ID_INDEXING, CPIX_ID_QUERY or CPIX_ID_PREFIX.
		// Anything else raises a CpixExc.
		DefaultTokenStreamFactory(const Invokation& invokation) {
			if (invokation.params().size() == 1) {
				const Identifier* id = dynamic_cast<const Identifier*>( invokation.params()[0] ); 
				if ( id ) {
					if ( id->id() == CPIX_ID_INDEXING ) {
						target_ = INDEXING;
					} else if ( id->id() == CPIX_ID_QUERY ) {
						target_ = QUERY;
					} else if ( id->id() == CPIX_ID_PREFIX ) {
						target_ = PREFIX;
					} else {
						THROW_CPIXEXC(L"Default analyzer does not accept %S for parameter", id->id().c_str());
					}
				} else {
					THROW_CPIXEXC(L"Default accepts only identifier as a parameter.");
				}
			} else if (invokation.params().size() > 1) {
				THROW_CPIXEXC(L"Default analyzer does not accept more than one parameter");
			} else {
				target_ = NORMAL;
			}
		}
		
		// Delegates to the query/prefix analyzer for those targets;
		// NORMAL and INDEXING both fall through to the default analyzer.
		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
														   lucene::util::Reader * reader) {
			switch (target_) {
				case QUERY: 
					return Analysis::getQueryAnalyzer().tokenStream( fieldName, reader );
				case PREFIX: 
					return Analysis::getPrefixAnalyzer().tokenStream( fieldName, reader );
			}
			return Analysis::getDefaultAnalyzer().tokenStream( fieldName, reader );
		}
		
	private:
	
		// Selected delegation target, fixed at construction time.
		Target target_;
		
	};
       
   195 		
       
   196 	/**
       
   197 	 * Template class used to create CLucene tokenizers. Template
       
   198 	 * parameter T must implement lucene::analysis::Tokenizer abstraction.  
       
   199 	 */    
       
   200 	template<class T>
       
   201 	class TokenizerFactory : public TokenStreamFactory 
       
   202 	{
       
   203 	public:
       
   204 		TokenizerFactory(const Invokation& invokation) {
       
   205 			if (invokation.params().size() > 0) {
       
   206 				THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
       
   207 							  invokation.id().c_str());
       
   208 			}
       
   209 		}
       
   210 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * /*fieldName*/, 
       
   211 														   lucene::util::Reader * reader) {
       
   212 			return _CLNEW T(reader); 
       
   213 		}
       
   214 	};
       
   215 	
       
   216 	template<>
       
   217     class TokenizerFactory<analysis::CjkNGramTokenizer> : public TokenStreamFactory 
       
   218     {   
       
   219     public:
       
   220         static const int DefaultNgramSize = 1;
       
   221         TokenizerFactory(const Invokation& invokation) {
       
   222             using namespace Cpix::AnalyzerExp;
       
   223             if (invokation.params().size() > 1) {
       
   224                 THROW_CPIXEXC(L"Cjk Ngram tokenizer does not accept more than one parameter",
       
   225                               invokation.id().c_str());
       
   226             }
       
   227             if (invokation.params().size() == DefaultNgramSize) {
       
   228                 IntegerLit* ngramSize = dynamic_cast<IntegerLit*>(invokation.params()[0]);
       
   229                 if ( ngramSize ) {
       
   230                     ngramSize_ = ngramSize->value();
       
   231                 } else {
       
   232                     THROW_CPIXEXC(L"Cjk Ngram tokenizer parameter must be an integer");
       
   233                 }
       
   234             } else {
       
   235                 ngramSize_ = 1;
       
   236             }
       
   237         }
       
   238         virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * /*fieldName*/, 
       
   239                                                            lucene::util::Reader * reader) {
       
   240             return _CLNEW analysis::CjkNGramTokenizer(reader, ngramSize_); 
       
   241         }
       
   242         
       
   243     private:
       
   244         
       
   245         int ngramSize_;
       
   246     };
       
   247 
       
   248 	
       
   249 	/**
       
   250 	 * Template class wrapping CLucene analyzers. Template parameter T must 
       
   251 	 * implement lucene::analysis::Analyzer abstraction.  
       
   252 	 */    
       
   253 	template<class T>
       
   254 	class AnalyzerWrap : public TokenStreamFactory 
       
   255 	{
       
   256 	public:
       
   257 		AnalyzerWrap(const Invokation& invokation) : analyzer_() {
       
   258 			if (invokation.params().size() > 0) {
       
   259 				THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
       
   260 							  invokation.id().c_str());
       
   261 			}
       
   262 		}
       
   263 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   264 														   lucene::util::Reader * reader) {
       
   265 			return analyzer_.tokenStream(fieldName, reader); 
       
   266 		}
       
   267 	private: 
       
   268 		T analyzer_;
       
   269 	};
       
   270 	
       
   271 	/**
       
   272 	 * Template class associated with CLucene filter and a TokenStreamFactory. 
       
   273 	 * Uses TokenStreamFactory to transform given character stream into tokenstream
       
   274 	 * and then applies the given Clucene filter to the token stream. 
       
   275 	 * The template parameter T must implement lucene::analysis::Filter abstraction.     
       
   276 	 */    
       
   277 	template<class T>
       
   278 	class FilterFactory : public TokenStreamFactory 
       
   279 	{
       
   280 	public:
       
   281 		FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) {
       
   282 			if (invokation.params().size() > 0) {
       
   283 				THROW_CPIXEXC(L"Filter %S does not accept parameters",
       
   284 							  invokation.id().c_str());
       
   285 			}
       
   286 		}
       
   287 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   288 														   lucene::util::Reader * reader) {
       
   289 			return _CLNEW T(factory_->tokenStream(fieldName, reader), true); 
       
   290 		}
       
   291 	private: 
       
   292 		std::auto_ptr<TokenStreamFactory> factory_; 
       
   293 	};
       
   294 	
       
   295 	/**
       
   296 	 * Specialized Analyzer wrap for CLucene's PerFieldAnalyzer. Specialized
       
   297 	 * template is needed because perfield analyzer accepts parameters
       
   298 	 * (specific analyzers for different field plus default analyzer)
       
   299 	 */
       
   300 	template<>
       
   301 	class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
       
   302 	public:
       
   303 		AnalyzerWrap(const Switch& sw, const wchar_t* config) : analyzer_(0) {
       
   304 			using namespace Cpt::Parser;
       
   305 			using namespace lucene::analysis;
       
   306 			
       
   307 			analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));
       
   308 			
       
   309 			for (int i = 0; i < sw.cases().size(); i++) {
       
   310 				const Case& cs = *sw.cases()[i];
       
   311 				for (int j = 0; j < cs.cases().size(); j++) {
       
   312 					analyzer_->addAnalyzer( cs.cases()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping(), config ) );
       
   313 				}
       
   314 			}
       
   315 		}
       
   316 		virtual ~AnalyzerWrap() {
       
   317 			_CLDELETE(analyzer_);
       
   318 		}
       
   319 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   320 														   lucene::util::Reader * reader) {
       
   321 			return analyzer_->tokenStream(fieldName, reader); 
       
   322 		}
       
   323 	private: 
       
   324 		lucene::analysis::PerFieldAnalyzerWrapper* analyzer_;
       
   325 	};
       
   326 		
       
   327 	
       
   328 	
       
	/**
	 * Specialized StopFilter factory. Specialized filter is needed
	 * because StopFilter needs parameters (stop word list or a language).
	 * Either a single language identifier selects a prepared stopword
	 * list (English or French), or each string parameter becomes one
	 * stopword in an owned, null-terminated array.
	 */
	template<>
	class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory 
	{
	public:
		FilterFactory(const Invokation& invokation,
					  auto_ptr<TokenStreamFactory> factory)
			:words_(0),  ownWords_(0), factory_(factory) {
			using namespace Cpt::Parser;
			if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) {
				Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
				//cpix_LangCode lang; 
				// Prepared stopword lists: borrowed pointers, never freed here.
				if (id->id() == CPIX_WLANG_EN) {
					words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS;
                } else if (id->id() == CPIX_WLANG_FR) {
                    words_ = analysis::NonEnglishStopWords::FRENCH_STOP_WORDS;
				} else {
					THROW_CPIXEXC(L"No prepared stopword list for language code '%S'",
								  id->id().c_str());
				}
			} else {
				// Build an owned, null-terminated stopword array from the
				// string literal parameters. The array is zeroed first so
				// the destructor can stop at the first null entry.
				ownWords_ = new wchar_t*[invokation.params().size()+1];
				memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); 
				// FIXME: if a non-string parameter makes the loop throw,
				// the destructor never runs (ctor throw) and the strings
				// allocated so far leak.
				for (int i = 0; i < invokation.params().size(); i++) {
					StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
					if (lit) {
						const wstring& str = lit->text(); 
						ownWords_[i] = new wchar_t[str.length()+1]; 
						wcscpy(ownWords_[i], str.c_str());
					} else {
						THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters.");
					}
				}
			}
		
		}
		// Frees the owned stopword array (if any); prepared lists in
		// words_ are borrowed and left alone.
		virtual ~FilterFactory() { 
			if (ownWords_) {
				for (int i = 0; ownWords_[i]; i++) {
					delete[] ownWords_[i]; 
				}
				delete[] ownWords_;
			}
		}
		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
														   lucene::util::Reader * reader) {
			return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_); 
		}
	private: 
		const wchar_t **words_;
		wchar_t **ownWords_; // owned
		std::auto_ptr<TokenStreamFactory> factory_; 
	};
       
   386 	
       
   387 	/**
       
   388 	 * Specialized SnowballFilter factory is needed, because SnowballFilter
       
   389 	 * accepts parameters (the language). 
       
   390 	 */
       
   391 	template<>
       
   392 	class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory 
       
   393 	{
       
   394 	public:
       
   395 		FilterFactory(const Invokation& invokation, 		
       
   396 					  auto_ptr<TokenStreamFactory> factory)
       
   397 			: factory_(factory) {
       
   398 			using namespace Cpt::Parser;
       
   399 			if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
       
   400 				THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
       
   401 			}
       
   402 			Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
       
   403 			if (id->id() == CPIX_WLANG_EN) {
       
   404 				lang_ = cpix_LANG_EN; 
       
   405 			} else {
       
   406 				THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
       
   407 							  id->id().c_str());
       
   408 			}
       
   409 		}
       
   410 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   411 														   lucene::util::Reader * reader) {
       
   412 			return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_); 
       
   413 		}
       
   414 	private: 
       
   415 		cpix_LangCode lang_;
       
   416 		std::auto_ptr<TokenStreamFactory> factory_; 
       
   417 	};
       
   418 	
       
   419 	/**
       
   420 	 * Specialized LengthFilter factory is needed, because length filter 
       
   421 	 * accepts parameters (minimum length and maximum length)
       
   422 	 */
       
   423 	template<>
       
   424 	class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory 
       
   425 	{
       
   426 	public:
       
   427 		FilterFactory(const Invokation& invokation, 
       
   428 					  auto_ptr<TokenStreamFactory> factory) 
       
   429 			: factory_(factory) {
       
   430 			using namespace Cpt::Parser;
       
   431 			if (invokation.params().size() != 2 || 
       
   432 				!dynamic_cast<IntegerLit*>(invokation.params()[0]) || 
       
   433 				!dynamic_cast<IntegerLit*>(invokation.params()[1])) {
       
   434 				THROW_CPIXEXC("Length filter takes exactly two integer parameters");
       
   435 			}
       
   436 			min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
       
   437 			max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value();
       
   438 		}
       
   439 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   440 														   lucene::util::Reader * reader) {
       
   441 			return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ ); 
       
   442 		}
       
   443 	private: 
       
   444 		int min_, max_;
       
   445 		std::auto_ptr<TokenStreamFactory> factory_; 
       
   446 	};
       
   447 	
       
   448 	/**
       
   449 	 * Specialized PrefixGenerator factory is needed, because PrefixGenerator
       
   450 	 * requires the max prefix size. 
       
   451 	 */
       
   452 	template<>
       
   453 	class FilterFactory<PrefixGenerator> : public TokenStreamFactory 
       
   454 	{
       
   455 	public:
       
   456 		FilterFactory(const Invokation& invokation, 
       
   457 					  auto_ptr<TokenStreamFactory> factory) 
       
   458 			: factory_(factory) {
       
   459 			using namespace Cpt::Parser;
       
   460 			if (invokation.params().size() != 1 || 
       
   461 				!dynamic_cast<IntegerLit*>(invokation.params()[0])) {
       
   462 				THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
       
   463 			}
       
   464 			maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
       
   465 		}
       
   466 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   467 														   lucene::util::Reader * reader) {
       
   468 			return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ ); 
       
   469 		}
       
   470 	private: 
       
   471 		int maxPrefixLength_;
       
   472 		std::auto_ptr<TokenStreamFactory> factory_; 
       
   473 	};
       
   474 
       
	/**
	 * Specialized PrefixFilter factory is needed, because prefix filter 
	 * accepts parameters (language set or prefixes). Either a single
	 * language identifier selects a prepared prefix list (Hebrew), or
	 * each string parameter becomes one prefix in an owned,
	 * null-terminated array.
	 */
	template<>
	class FilterFactory<analysis::PrefixFilter> : public TokenStreamFactory 
	{
	public:
		FilterFactory(const Invokation& invokation,
					  auto_ptr<TokenStreamFactory> factory)
			:	prefixes_(0),  ownPrefixes_(0), factory_(factory) {
			using namespace Cpt::Parser;
			if (invokation.params().size() == 1 && 
				dynamic_cast<Identifier*>(invokation.params()[0])) {
				Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
				//cpix_LangCode lang; 
				// Prepared prefix lists: borrowed pointers, never freed here.
				if (id->id() == CPIX_WLANG_HE) {
					prefixes_ = analysis::HebrewPrefixes;
				} else {
					THROW_CPIXEXC(L"No prepared prefix list for language code '%S'",
								  id->id().c_str());
				}
			} else {
				// Build an owned, null-terminated prefix array from the
				// string literal parameters. The array is zeroed first so
				// the destructor can stop at the first null entry.
				ownPrefixes_ = new wchar_t*[invokation.params().size()+1];
				memset(ownPrefixes_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); 
				// FIXME: if a non-string parameter makes the loop throw,
				// the destructor never runs (ctor throw) and the strings
				// allocated so far leak.
				for (int i = 0; i < invokation.params().size(); i++) {
					StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
					if (lit) {
						const wstring& str = lit->text(); 
						ownPrefixes_[i] = new wchar_t[str.length()+1]; 
						wcscpy(ownPrefixes_[i], str.c_str());
					} else {
						THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters.");
					}
				}
			}
		}
		// Frees the owned prefix array (if any); prepared lists in
		// prefixes_ are borrowed and left alone.
		virtual ~FilterFactory() { 
			if (ownPrefixes_) {
				for (int i = 0; ownPrefixes_[i]; i++) {
					delete[] ownPrefixes_[i]; 
				}
				delete[] ownPrefixes_;
			}
		}
		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
														   lucene::util::Reader * reader) {
			return _CLNEW analysis::PrefixFilter(factory_->tokenStream(fieldName, reader), true, ownPrefixes_ ? const_cast<const wchar_t**>(ownPrefixes_) : prefixes_); 
		}
	private: 
		const wchar_t **prefixes_;
		wchar_t **ownPrefixes_; // owned
		std::auto_ptr<TokenStreamFactory> factory_; 
	};
       
   530 	
       
   531 	/**
       
   532 	 * Specialized ElisionFilter factory is needed, because elision filter 
       
   533 	 * accepts parameters (language set or articles)
       
   534 	 */
       
   535 	template<>
       
   536 	class FilterFactory<analysis::ElisionFilter> : public TokenStreamFactory 
       
   537 	{
       
   538 	public:
       
   539 		FilterFactory(const Invokation& invokation,
       
   540 					  auto_ptr<TokenStreamFactory> factory)
       
   541 			:	articles_(0),  ownArticles_(0), factory_(factory) {
       
   542 			using namespace Cpt::Parser;
       
   543 			if (invokation.params().size() == 1 && 
       
   544 				dynamic_cast<Identifier*>(invokation.params()[0])) {
       
   545 				Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
       
   546 				//cpix_LangCode lang; 
       
   547 				if (id->id() == CPIX_WLANG_FR) {
       
   548 					articles_ = analysis::FrenchArticles;
       
   549 				} else {
       
   550 					THROW_CPIXEXC(L"No prepared article list for language code '%S'",
       
   551 								  id->id().c_str());
       
   552 				}
       
   553 			} else {
       
   554 				ownArticles_ = new wchar_t*[invokation.params().size()+1];
       
   555 				memset(ownArticles_, 0, sizeof(wchar_t*)*(invokation.params().size()+1)); 
       
   556 				// FIXE: args may leak
       
   557 				for (int i = 0; i < invokation.params().size(); i++) {
       
   558 					StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
       
   559 					if (lit) {
       
   560 						const wstring& str = lit->text(); 
       
   561 						ownArticles_[i] = new wchar_t[str.length()+1]; 
       
   562 						wcscpy(ownArticles_[i], str.c_str());
       
   563 					} else {
       
   564 						THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters.");
       
   565 					}
       
   566 				}
       
   567 			}
       
   568 		}
       
   569 		virtual ~FilterFactory() { 
       
   570 			if (ownArticles_) {
       
   571 				for (int i = 0; ownArticles_[i]; i++) {
       
   572 					delete[] ownArticles_[i]; 
       
   573 				}
       
   574 				delete[] ownArticles_;
       
   575 			}
       
   576 		}
       
   577 		virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
       
   578 														   lucene::util::Reader * reader) {
       
   579 			return _CLNEW analysis::ElisionFilter(factory_->tokenStream(fieldName, reader), true, ownArticles_ ? const_cast<const wchar_t**>(ownArticles_) : articles_); 
       
   580 		}
       
   581 	private: 
       
   582 		const wchar_t **articles_;
       
   583 		wchar_t **ownArticles_; // owned
       
   584 		std::auto_ptr<TokenStreamFactory> factory_; 
       
   585 	};
       
   586 	
       
	// Factory-function signatures used by the keyword -> implementation
	// tables below: the first creates a tokenizer/analyzer stream factory,
	// the second wraps an upstream factory with a filter.
	typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
	typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, 
																 auto_ptr<TokenStreamFactory> factory);
       
   590 
       
	/**
	 * Sets up a plain TokenStreamFactory subclass T with given invokation
	 * parameters (used e.g. for DefaultTokenStreamFactory).
	 */
	template<class T>
	struct TokenStreamFactoryCtor
	{
		static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
			return auto_ptr<TokenStreamFactory>(new T(invokation)); 
		}
	};
       
   598 
       
	/**
	 * Sets up a tokenizer factory with given invokation parameters
	 */
	template<class T>
	struct TokenizerFactoryCtor
	{
		// Wraps tokenizer type T in a TokenizerFactory and returns it
		// as an owned TokenStreamFactory.
		static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
			return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation)); 
		}
	};
       
   609 	
       
	/**
	 * Sets up an analyzer wrap with given invokation parameters
	 */
	template<class T>
	struct AnalyzerWrapCtor
	{
		// Wraps analyzer type T in an AnalyzerWrap and returns it as an
		// owned TokenStreamFactory.
		static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
			return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation)); 
		}
	};
       
   620 	
       
	/**
	 * Sets up a filter factory with given invokation parameters
	 */
	template<class T>
	struct FilterFactoryCtor 
	{
		// Wraps filter type T in a FilterFactory stacked on top of the
		// given upstream factory (ownership transferred via auto_ptr).
		static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
												   auto_ptr<TokenStreamFactory> factory) {
			return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory)); 
		}
	};
       
   632 	
       
	// One row of the tokenizer/analyzer keyword table: maps a definition
	// keyword to the factory-creator function implementing it.
	struct TokenizerClassEntry {
		const wchar_t *id_;
		TokenizerFactoryCreator createFactory_;
	};
       
   637 	
       
	//
	// Following TokenizerClassEntries and FilterClassEntries contain
	// the mapping from tokenizer/analyzer/filter names into glue code
	// templates providing the implementations. 
	// 
	
	// Keyword -> implementation table for tokenizers and analyzers,
	// terminated by a {0, 0} sentinel entry.
	TokenizerClassEntry TokenizerClassEntries[] = { 
		{CPIX_TOKENIZER_STANDARD, 	TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
		{CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
		{CPIX_TOKENIZER_LETTER, 	TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
		{CPIX_TOKENIZER_KEYWORD, 	TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
		{CPIX_TOKENIZER_CJK, 		TokenizerFactoryCtor<lucene::analysis::cjk::CJKTokenizer>::create},
        {CPIX_TOKENIZER_NGRAM,      TokenizerFactoryCtor<analysis::CjkNGramTokenizer>::create},
        {CPIX_TOKENIZER_KOREAN,     TokenizerFactoryCtor<analysis::KoreanTokenizer>::create},
        {CPIX_TOKENIZER_KOREAN_QUERY,TokenizerFactoryCtor<analysis::KoreanQueryTokenizer>::create},
        
		{CPIX_ANALYZER_STANDARD, 	AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},
		{CPIX_ANALYZER_DEFAULT, 	TokenStreamFactoryCtor<DefaultTokenStreamFactory>::create},
	
	// 	TODO: Add more Tokenizers/Analyzers
		
	// 	Example tokenizer (works as such if tokenizers don't take parameters)
	//  {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},
	
	// 	Example analyzer (works as such if analyzer don't take parameters)
	//  {CPIX_ANALYZER_MYANALYZER,	AnalyzerWrapCtor<MyAnalyzer>::create},
	
		{0, 						0}
	};
       
   667 	
       
	/**
	 * Maps a filter identifier (as it appears in the analyzer definition
	 * string) to the factory function producing its filtering
	 * TokenStreamFactory implementation.
	 */
	struct FilterClassEntry {
		const wchar_t *id_;                  // identifier; 0 marks the table sentinel
		FilterFactoryCreator createFactory_; // glue-template factory hook
	};
       
   672 	
       
	// Identifier -> factory table searched by
	// CustomAnalyzer::getFilterEntry; terminated by the {0, 0} sentinel.
	FilterClassEntry FilterClassEntries[] = {
		{CPIX_FILTER_STANDARD, 	FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
		{CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
		{CPIX_FILTER_ACCENT, 	FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
		{CPIX_FILTER_STOP, 		FilterFactoryCtor<lucene::analysis::StopFilter>::create},
		{CPIX_FILTER_STEM, 		FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
		{CPIX_FILTER_LENGTH, 	FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
		{CPIX_FILTER_PREFIXES, 	FilterFactoryCtor<PrefixGenerator>::create},
		{CPIX_FILTER_THAI, 		FilterFactoryCtor<analysis::ThaiWordFilter>::create},
		{CPIX_FILTER_PREFIX, 	FilterFactoryCtor<analysis::PrefixFilter>::create},
		{CPIX_FILTER_ELISION, 	FilterFactoryCtor<analysis::ElisionFilter>::create},
		
	// 	TODO: Add more Filters
	
	// 	Example filter (works as such if filter don't take parameters)
	//  {CPIX_FILTER_MYFILTER,	FilterFactoryCtor<MyFilter>::create},
	
		{0, 					0}
	};
       
   692 	
       
   693 	CustomAnalyzer::CustomAnalyzer(const wchar_t* definition, const wchar_t* config) {
       
   694 		std::auto_ptr<Piping> piping = AnalyzerExp::ParsePiping( definition );
       
   695 		setup( *piping, config );
       
   696 	}
       
   697 	
       
	/**
	 * Constructs a custom analyzer directly from an already parsed
	 * analyzer definition expression tree.
	 */
	CustomAnalyzer::CustomAnalyzer(const Piping& definition, const wchar_t* config) {	
		setup(definition, config);
	}
       
   701 	
       
   702 	using namespace Cpt::Parser;
       
   703 	
       
   704 	void CustomAnalyzer::setup(const Piping& piping, const wchar_t* config) {
       
   705 	
       
   706 		// If the first item is invokation, create corresponding analyzer/tokenizer 
       
   707 		if (dynamic_cast<const Invokation*>(&piping.tokenizer())) {
       
   708 			const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
       
   709 			TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() ); 
       
   710 			factory_ = tokenizerEntry.createFactory_( tokenizer );
       
   711 		} else if (dynamic_cast<const Switch*>(&piping.tokenizer())) {
       
   712 			// If the first item is switch statement, create per-field analyzer 
       
   713 			const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
       
   714 			factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer, config );
       
   715 		} else if (dynamic_cast<const LocaleSwitch*>(&piping.tokenizer())) {
       
   716 			const LocaleSwitch& tokenizer = dynamic_cast<const LocaleSwitch&>(piping.tokenizer());
       
   717 			factory_ = new LocaleSwitchStreamFactory( tokenizer, config );
       
   718 		} else if (dynamic_cast<const ConfigSwitch*>(&piping.tokenizer())) {
       
   719 			const ConfigSwitch& tokenizer = dynamic_cast<const ConfigSwitch&>(piping.tokenizer());
       
   720 			factory_ = resolveConfigSwitch( tokenizer, config );
       
   721 		} else {
       
   722 			THROW_CPIXEXC(L"Analyzer definition syntax did not begin with valid tokenizer");
       
   723 		}
       
   724 		
       
   725 		// Add filters
       
   726 		const std::vector<Invokation*>& filters = piping.filters(); 
       
   727 		for (int i = 0; i < filters.size(); i++) {
       
   728 			FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() ); 
       
   729 			factory_ = filterEntry.createFactory_( *filters[i], factory_ );
       
   730 		}
       
   731 	}
       
   732 
       
   733 	std::auto_ptr<TokenStreamFactory> CustomAnalyzer::resolveConfigSwitch(const ConfigSwitch& csw, const wchar_t* config) {
       
   734 		if (config) {
       
   735 			for (int i = 0; i < csw.cases().size(); i++) {
       
   736 				const Case& cs = *csw.cases()[i];
       
   737 				for (int j = 0; j < cs.cases().size(); j++) {
       
   738 					if (wcscmp(config, cs.cases()[j].c_str()) == 0) {
       
   739 						return std::auto_ptr<TokenStreamFactory>(
       
   740 							new CustomAnalyzer(cs.piping(), config)); 
       
   741 					}
       
   742 				}
       
   743 			}
       
   744 		}
       
   745 		return std::auto_ptr<TokenStreamFactory>(new CustomAnalyzer(csw.def(), config));
       
   746 	}
       
   747 
       
   748 	TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) {
       
   749 	
       
   750 		// Looks for a match in the TokenizerClassEntries. After finding 
       
   751 		// a match it returns a proper tokenizer/analyzer implementation provider 
       
   752 		// 
       
   753 		for (int i = 0; TokenizerClassEntries[i].id_; i++) {
       
   754 			if (id == std::wstring(TokenizerClassEntries[i].id_)) {
       
   755 				return TokenizerClassEntries[i];
       
   756 			}
       
   757 		}
       
   758 	
       
   759 		THROW_CPIXEXC(L"Unknown tokenizer '%S'.",
       
   760 					  id.c_str());
       
   761 	}
       
   762 	
       
   763 	FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) {
       
   764 	
       
   765 		// Looks for a match in the FilterClassEntries. After finding 
       
   766 		// a match it returns a proper tokenizer/analyzer implementation 
       
   767 		// provider 
       
   768 		// 
       
   769 		for (int i = 0; FilterClassEntries[i].id_; i++) {
       
   770 			if (id == std::wstring(FilterClassEntries[i].id_)) {
       
   771 				return FilterClassEntries[i];
       
   772 			}
       
   773 		}
       
   774 	
       
   775 		THROW_CPIXEXC(L"Unknown filter '%S'.",
       
   776 					  id.c_str());
       
   777 	}
       
   778 	
       
	// Nothing to release explicitly here; factory_ is presumably an owning
	// smart pointer whose destructor frees the factory chain — TODO confirm
	// against customanalyzer.h.
	CustomAnalyzer::~CustomAnalyzer() {} 
       
   780 	
       
   781 	lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t        * fieldName, 
       
   782 															   lucene::util::Reader * reader) {
       
   783 		// Utilizes the the token stream factory to form token stream. 
       
   784 		// token stream factory is prepared during custom analyzer construction
       
   785 		// and based on the analyzer definition string.
       
   786 															   
       
   787 		return factory_->tokenStream(fieldName, reader);
       
   788 	}
       
   789 	
       
   790 	std::auto_ptr<lucene::analysis::Analyzer> CreateDefaultAnalyzer()
       
   791 	{
       
   792 		return 
       
   793 			std::auto_ptr<lucene::analysis::Analyzer>(
       
   794 				new SystemAnalyzer(_CLNEW lucene::analysis::standard::StandardAnalyzer()));  
       
   795 	}
       
   796 
       
   797 }