--- a/searchengine/cpix/cpix/src/analyzer.cpp Fri Jun 11 14:43:47 2010 +0300
+++ b/searchengine/cpix/cpix/src/analyzer.cpp Mon Jun 28 10:34:53 2010 +0530
@@ -15,30 +15,36 @@
*
*/
-
-#include "CLucene.h"
-#include "CLucene/analysis/AnalysisHeader.h"
-#include "CLucene/analysis/Analyzers.h"
-
-#include "analyzer.h"
-#include "analyzerexp.h"
-#include "cpixanalyzer.h"
-#include "cluceneext.h"
-
-#include "cpixexc.h"
-#include "cpixparsetools.h"
-
+// general utilities
#include "wchar.h"
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
+#include <fstream>
+#include <algorithm>
-#include "document.h"
+// clucene
+#include "CLucene.h"
+
+// support
+#include "cpixparsetools.h"
+#include "cpixfstools.h"
-#include "indevicecfg.h"
+// internal
+#include "analyzer.h"
+#include "cpixanalyzer.h"
+#include "cpixexc.h"
+#include "document.h"
+#include "cluceneext.h"
+#include "indevicecfg.h"
+#include "initparams.h"
+#include "thaianalysis.h"
-#include "initparams.h"
+#include "analyzerexp.h"
+#include "customanalyzer.h"
+#include "common/cpixlog.h"
+
namespace
{
const char AGGR_NONFILEREADERPROXY_ERR[]
@@ -46,11 +52,111 @@
const char AGGR_STREAMREADER_ERR[]
= "Aggregating streamValue-fields not implemented";
+
+ const char THAI_LANGUAGE_FILE[]
+ = "thaidict.sm";
+
+ const char ANALYZER_FILE[]
+ = "analyzer.loc";
+
+ const wchar_t DEFAULT_ANALYZER_CONFIG[]
+ = L"default";
+
+ const wchar_t QUERY_ANALYZER_CONFIG[]
+ = L"query";
+
+ const wchar_t PREFIX_ANALYZER_CONFIG[]
+ = L"prefix";
+
+// const wchar_t CPIX_ANALYZER_FALLBACK[]
+// = CPIX_ANALYZER_STANDARD;
+//
+// const wchar_t CPIX_PREFIX_ANALYZER_FALLBACK[]
+// = CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE;
+
+
}
namespace Cpix {
+
+Analysis* Analysis::theInstance_ = NULL;
+
+ void Analysis::init(InitParams& ip) {
+ // Init thai analysis with thai dictionary
+ std::string thai( Cpt::appendpath(ip.getResourceDir(),
+ THAI_LANGUAGE_FILE) );
+
+ if ( Cpt::filesize( thai.c_str() ) ) {
+ analysis::InitThaiAnalysis(thai.c_str());
+ } else {
+ logMsg(CPIX_LL_WARNING,
+ "Thai dictionary could not be found. Thai analysis will NOT work.");
+ }
+
+ // Setup the analysis instance
+ theInstance_ = new Analysis(ip);
+ }
+
+ Analysis::Analysis(InitParams& ip)
+ : defaultAnalyzer_(),
+ queryAnalyzer_(),
+ prefixAnalyzer_() {
+
+ auto_ptr<AnalyzerExp::Piping> p = parse( Cpt::appendpath( ip.getResourceDir(), ANALYZER_FILE ) );
+
+ defaultAnalyzer_.reset( new CustomAnalyzer( *p, DEFAULT_ANALYZER_CONFIG ) );
+ queryAnalyzer_.reset( new CustomAnalyzer( *p, QUERY_ANALYZER_CONFIG ) );
+ prefixAnalyzer_.reset( new CustomAnalyzer( *p, PREFIX_ANALYZER_CONFIG ) );
+ }
+
+ auto_ptr<AnalyzerExp::Piping> Analysis::parse(std::string path) {
+ std::wifstream in(path.c_str());
+ auto_ptr<AnalyzerExp::Piping> ret;
+ if ( in ) {
+
+ // Reserve constant size buffer and populate it with definition
+ //
+ int filesize = Cpt::filesize(path.c_str());
+ Cpt::auto_array<wchar_t> buf( new wchar_t[filesize+1] );
+            in.read(buf.get(), filesize);
+            buf.get()[in.gcount()] = L'\0';
+            if ( in.gcount() > 0 ) {
+ try {
+ ret = AnalyzerExp::ParsePiping( buf.get() );
+ } catch (...) {}
+ }
+ in.close();
+ }
+
+ if ( !ret.get() ) {
+            THROW_CPIXEXC("Analyzer definition %s could not be opened or parsed.", path.c_str());
+ }
+ return ret;
+ }
+
+ void Analysis::shutdown() {
+ analysis::ShutdownThaiAnalysis();
+ delete theInstance_;
+ theInstance_ = NULL;
+ }
+
+ lucene::analysis::Analyzer& Analysis::getDefaultAnalyzer() {
+ // TODO: Assert( theInstance_ );
+ return *theInstance_->defaultAnalyzer_;
+ }
+
+ lucene::analysis::Analyzer& Analysis::getQueryAnalyzer() {
+ // TODO: Assert( theInstance_ );
+ return *theInstance_->queryAnalyzer_;
+ }
+
+ lucene::analysis::Analyzer& Analysis::getPrefixAnalyzer() {
+ // TODO: Assert( theInstance_ );
+ return *theInstance_->prefixAnalyzer_;
+ }
+
PrefixGenerator::PrefixGenerator(
lucene::analysis::TokenStream* in,
bool deleteTS,
@@ -221,488 +327,5 @@
return analyzer_->tokenStream( fieldName, reader );
}
}
-
- //
- // Following sections provide the glue code for connecting the
- // analyzer definition syntax with analyzer, tokenizers and filter
- // implementations.
- //
- // The glue code is template heavy with the indent of providing
- // automation for associating specific keywords with specific
- // analyzers, tokenizers and filters implementing corresponding
- // CLucene abstractions. Additional classes are needed only if
- // filters, tokenizers, etc. accept parameters.
- //
- // NOTE: To understand the analyzers, it is sufficient to understand
- // that an analyzer transforms characters stream into specific token streams
- // (e.g. character stream 'foobarmetawords' can be transformed into token
- // stream 'foo', 'bar' 'meta' 'words'). Analysis consist of two main
- // parts which are tokenization and filtering. Tokenization converts
- // the character stream into token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr')
- // and filtering modifies the tokens (e.g. lowercase filtering 'FoO' ->
- // 'foo', 'bAr' -> 'bar'). Analyzer as an object is responsible for
- // constructing a tokenizer and a sequence of filters to perform
- // these required tasks.
- //
- // See the documentation around TokenizerClassEntries and
- // FilterClassEntries to see how implementations not taking parameters
- // can be easily added.
- //
-
- using namespace Cpix::AnalyzerExp;
-
- /**
- * Creates token stream for the given reader and fieldName.
- * This class in in many ways similar to CLucene analyzer class
- * definition.
- */
- class TokenStreamFactory {
- public:
- virtual ~TokenStreamFactory();
- virtual lucene::analysis::TokenStream* tokenStream(const wchar_t * fieldName,
- lucene::util::Reader * reader) = 0;
- };
-
- TokenStreamFactory::~TokenStreamFactory() {};
-
- /**
- * Template class used to create CLucene tokenizers. Template
- * parameter T must implement lucene::analysis::Tokenizer abstraction.
- */
- template<class T>
- class TokenizerFactory : public TokenStreamFactory
- {
- public:
- TokenizerFactory(const Invokation& invokation) {
- if (invokation.params().size() > 0) {
- THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
- invokation.id().c_str());
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/,
- lucene::util::Reader * reader) {
- return _CLNEW T(reader);
- }
- };
-
- /**
- * Template class wrapping CLucene analyzers. Template parameter T must
- * implement lucene::analysis::Analyzer abstraction.
- */
- template<class T>
- class AnalyzerWrap : public TokenStreamFactory
- {
- public:
- AnalyzerWrap(const Invokation& invokation) : analyzer_() {
- if (invokation.params().size() > 0) {
- THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
- invokation.id().c_str());
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return analyzer_.tokenStream(fieldName, reader);
- }
- private:
- T analyzer_;
- };
-
- /**
- * Template class associated with CLucene filter and a TokenStreamFactory.
- * Uses TokenStreamFactory to transform given character stream into tokenstream
- * and then applies the given Clucene filter to the token stream.
- * The template parameter T must implement lucene::analysis::Filter abstraction.
- */
- template<class T>
- class FilterFactory : public TokenStreamFactory
- {
- public:
- FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) {
- if (invokation.params().size() > 0) {
- THROW_CPIXEXC(L"Filter %S does not accept parameters",
- invokation.id().c_str());
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return _CLNEW T(factory_->tokenStream(fieldName, reader), true);
- }
- private:
- std::auto_ptr<TokenStreamFactory> factory_;
- };
-
- /**
- * Specialized Analyzer wrap for CLucene's PerFieldAnalyzer. Specialized
- * template is needed because perfield analyzer accepts parameters
- * (specific analyzers for different field plus default analyzer)
- */
- template<>
- class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
- public:
- AnalyzerWrap(const Switch& sw) : analyzer_(0) {
- using namespace Cpt::Parser;
- using namespace lucene::analysis;
-
- analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));
-
- for (int i = 0; i < sw.cases().size(); i++) {
- const Case& cs = *sw.cases()[i];
- for (int j = 0; j < cs.fields().size(); j++) {
- analyzer_->addAnalyzer( cs.fields()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping() ) );
- }
- }
- }
- virtual ~AnalyzerWrap() {
- _CLDELETE(analyzer_);
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return analyzer_->tokenStream(fieldName, reader);
- }
- private:
- lucene::analysis::PerFieldAnalyzerWrapper* analyzer_;
- };
-
-
-
- /**
- * Specialized StopFilter factory. Specialized filter is needed
- * because StopFilter needs parameters (stop word list or a language)
- */
- template<>
- class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory
- {
- public:
- FilterFactory(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory)
- :words_(0), ownWords_(0), factory_(factory) {
- using namespace Cpt::Parser;
- if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) {
- Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
- //cpix_LangCode lang;
- if (id->id() == CPIX_WLANG_EN) {
- words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS;
- } else {
- THROW_CPIXEXC(L"No prepared stopword list for language code '%S'",
- id->id().c_str());
- }
- } else {
- ownWords_ = new wchar_t*[invokation.params().size()+1];
- memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1));
- // FIXE: args may leak
- for (int i = 0; i < invokation.params().size(); i++) {
- StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
- if (lit) {
- const wstring& str = lit->text();
- ownWords_[i] = new wchar_t[str.length()+1];
- wcscpy(ownWords_[i], str.c_str());
- } else {
- THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters.");
- }
- }
- }
-
- }
- virtual ~FilterFactory() {
- if (ownWords_) {
- for (int i = 0; ownWords_[i]; i++) {
- delete[] ownWords_[i];
- }
- delete[] ownWords_;
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_);
- }
- private:
- const wchar_t **words_;
- wchar_t **ownWords_; // owned
- std::auto_ptr<TokenStreamFactory> factory_;
- };
-
- /**
- * Specialized SnowballFilter factory is needed, because SnowballFilter
- * accepts parameters (the language).
- */
- template<>
- class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory
- {
- public:
- FilterFactory(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory)
- : factory_(factory) {
- using namespace Cpt::Parser;
- if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
- THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
- }
- Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
- if (id->id() == CPIX_WLANG_EN) {
- lang_ = cpix_LANG_EN;
- } else {
- THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
- id->id().c_str());
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_);
- }
- private:
- cpix_LangCode lang_;
- std::auto_ptr<TokenStreamFactory> factory_;
- };
-
- /**
- * Specialized LengthFilter factory is needed, because length filter
- * accepts parameters (minimum length and maximum length)
- */
- template<>
- class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory
- {
- public:
- FilterFactory(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory)
- : factory_(factory) {
- using namespace Cpt::Parser;
- if (!(invokation.params().empty())) {
- if (invokation.params().size() != 2 ||
- !dynamic_cast<IntegerLit*>(invokation.params()[0]) ||
- !dynamic_cast<IntegerLit*>(invokation.params()[1])) {
- THROW_CPIXEXC("Length filter takes exactly two integer parameters");
- }
- min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
- max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value();
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ );
- }
- private:
- int min_, max_;
- std::auto_ptr<TokenStreamFactory> factory_;
- };
-
- /**
- * Specialized PrefixGenerator factory is needed, because PrefixGenerator
- * requires the max prefix size.
- */
- template<>
- class FilterFactory<PrefixGenerator> : public TokenStreamFactory
- {
- public:
- FilterFactory(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory)
- : factory_(factory) {
- using namespace Cpt::Parser;
- if (invokation.params().empty()) {
- if (invokation.params().size() != 1 ||
- !dynamic_cast<IntegerLit*>(invokation.params()[0])) {
- THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
- }
- maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
- }
- }
- virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
- lucene::util::Reader * reader) {
- return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ );
- }
- private:
- int maxPrefixLength_;
- std::auto_ptr<TokenStreamFactory> factory_;
- };
-
-
- typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
- typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory);
- /**
- * Sets up a tokenizer factory with given invokation parameters
- */
- template<class T>
- struct TokenizerFactoryCtor
- {
- static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
- return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation));
- }
- };
-
- /**
- * Sets up an analyzer wrap with given invokation parameters
- */
- template<class T>
- struct AnalyzerWrapCtor
- {
- static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
- return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation));
- }
- };
-
- /**
- * Sets up a filter factory with given invokation parameters
- */
- template<class T>
- struct FilterFactoryCtor
- {
- static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
- auto_ptr<TokenStreamFactory> factory) {
- return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory));
- }
- };
-
- struct TokenizerClassEntry {
- const wchar_t *id_;
- TokenizerFactoryCreator createFactory_;
- };
-
- //
- // Following TokenizerClassEntries and FilterClassEntries contain
- // the mapping from tokenizer/analyzer/filter names into glue code
- // templates providing the implementations.
- //
-
- TokenizerClassEntry TokenizerClassEntries[] = {
- {CPIX_TOKENIZER_STANDARD, TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
- {CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
- {CPIX_TOKENIZER_LETTER, TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
- {CPIX_TOKENIZER_KEYWORD, TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
- {CPIX_ANALYZER_STANDARD, AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},
-
-// TODO: Add more Tokenizers/Analyzers
-
-// Example tokenizer (works as such if tokenizers don't take parameters)
-// {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},
-
-// Example analyzer (works as such if analyzer don't take parameters)
-// {CPIX_ANALYZER_MYANALYZER, AnalyzerWrapCtor<MyAnalyzer>::create},
-
- {0, 0}
- };
-
- struct FilterClassEntry {
- const wchar_t *id_;
- FilterFactoryCreator createFactory_;
- };
-
- FilterClassEntry FilterClassEntries[] = {
- {CPIX_FILTER_STANDARD, FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
- {CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
- {CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
- {CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create},
- {CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
- {CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
- {CPIX_FILTER_PREFIXES, FilterFactoryCtor<PrefixGenerator>::create},
-
-// TODO: Add more Filters
-
-// Example filter (works as such if analyzer don't take parameters)
-// {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create},
-
- {0, 0}
- };
-
- CustomAnalyzer::CustomAnalyzer(const wchar_t* definition)
- {
- using namespace Cpt::Lex;
- using namespace Cpt::Parser;
-
-
- try
- {
- // 1. Setup an tokenizer
- Cpix::AnalyzerExp::Tokenizer
- tokenizer;
- StdLexer
- lexer(tokenizer, definition);
-
- // 2. Parse
- std::auto_ptr<Piping>
- def = ParsePiping(lexer);
- lexer.eatEof();
-
- // 3. Setup this item based on parsed definition
- setup(*def);
- }
- catch (Cpt::ITxtCtxtExc & exc)
- {
- // provide addition info for thrown exception
- exc.setContext(definition);
-
- // throw it fwd
- throw;
- }
- }
-
- CustomAnalyzer::CustomAnalyzer(const Piping& definition)
- {
- setup(definition);
- }
- using namespace Cpt::Parser;
-
- void CustomAnalyzer::setup(const Piping& piping) {
-
- // If the first item is invokation, create corresponding analyzer/tokenizer
- if (dynamic_cast<const Invokation*>(&piping.tokenizer()))
- {
- const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
- TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() );
- factory_ = tokenizerEntry.createFactory_( tokenizer );
- } else {
- // If the first item is switch statement, create per-field analyzer
- const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
- factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer );
- }
-
- // Add filters
- const std::vector<Invokation*>& filters = piping.filters();
- for (int i = 0; i < filters.size(); i++) {
- FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() );
- factory_ = filterEntry.createFactory_( *filters[i], factory_ );
- }
- }
-
- TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) {
-
- // Looks for a match in the TokenizerClassEntries. After finding
- // a match it returns a proper tokenizer/analyzer implementation provider
- //
- for (int i = 0; TokenizerClassEntries[i].id_; i++) {
- if (id == std::wstring(TokenizerClassEntries[i].id_)) {
- return TokenizerClassEntries[i];
- }
- }
-
- THROW_CPIXEXC(L"Unknown tokenizer '%S'.",
- id.c_str());
- }
-
- FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) {
-
- // Looks for a match in the FilterClassEntries. After finding
- // a match it returns a proper tokenizer/analyzer implementation
- // provider
- //
- for (int i = 0; FilterClassEntries[i].id_; i++) {
- if (id == std::wstring(FilterClassEntries[i].id_)) {
- return FilterClassEntries[i];
- }
- }
-
- THROW_CPIXEXC(L"Unknown filter '%S'.",
- id.c_str());
- }
-
- CustomAnalyzer::~CustomAnalyzer() {}
-
- lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t * fieldName,
- lucene::util::Reader * reader) {
- // Utilizes the the token stream factory to form token stream.
- // token stream factory is prepared during custom analyzer construction
- // and based on the analyzer definition string.
-
- return factory_->tokenStream(fieldName, reader);
- }
-
}