/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
#include "CLucene.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/Analyzers.h"
#include "analyzer.h"
#include "analyzerexp.h"
#include "cpixanalyzer.h"
#include "cluceneext.h"
#include "cpixexc.h"
#include "cpixparsetools.h"
#include "wchar.h"
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
#include "document.h"
#include "indevicecfg.h"
#include "initparams.h"
namespace
{
const char AGGR_NONFILEREADERPROXY_ERR[]
= "Aggregated reader field should be FileReaderProxy instance";
const char AGGR_STREAMREADER_ERR[]
= "Aggregating streamValue-fields not implemented";
}
namespace Cpix {
PrefixGenerator::PrefixGenerator(
lucene::analysis::TokenStream* in,
bool deleteTS,
size_t maxPrefixLength)
: TokenFilter(in, deleteTS),
token_(),
prefixLength_(0),
maxPrefixLength_(maxPrefixLength) {}
PrefixGenerator::~PrefixGenerator() {
}
bool PrefixGenerator::next(lucene::analysis::Token* token) {
// Prefixes of the same term are stacked at the same token position
token_.setPositionIncrement(0);
// Current term is exhausted - fetch the next one from the input stream
while (prefixLength_ == 0) {
token_.setPositionIncrement(1); // default position increment
if (!input->next(&token_)) {
return false;
}
prefixLength_ = std::min(token_.termTextLength(), maxPrefixLength_);
}
// Clip the token to the current prefix length
std::wstring clipped = token_.termText();
token_.setText(clipped.substr(0, prefixLength_).c_str());
// Copy
token->set(token_.termText(), token_.startOffset(), token_.endOffset(), token_.type());
token->setPositionIncrement(token_.getPositionIncrement());
// Reduce prefixLength_
prefixLength_--;
return true;
}
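// Illustrative usage sketch (comment only, not part of the build):
// PrefixGenerator emits, for each input term, its prefixes from length
// min(termLength, maxPrefixLength) down to 1, all at the same token
// position. Assuming 'ts' is a token stream yielding the term "london":
//
//   lucene::analysis::TokenStream* prefixes =
//       _CLNEW PrefixGenerator(ts, true, 3); // deletes 'ts' when done
//   lucene::analysis::Token token;
//   while (prefixes->next(&token)) {
//       // Yields "lon", "lo", "l"; only the first token carries a
//       // position increment of 1, the rest 0.
//   }
//   _CLDELETE(prefixes);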
AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer,
DocumentFieldIterator* fields)
: stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) {
getNextStream();
}
AggregateFieldTokenStream::~AggregateFieldTokenStream() {
_CLDELETE( stream_ );
delete fields_;
}
bool AggregateFieldTokenStream::next(lucene::analysis::Token* token) {
while ( stream_ ) {
if ( stream_->next( token ) ) {
return true;
}
getNextStream();
}
return false;
}
void AggregateFieldTokenStream::close() {
if (stream_) stream_->close();
_CLDELETE( stream_ );
_CLDELETE( reader_ );
}
void AggregateFieldTokenStream::getNextStream()
{
using namespace lucene::document;
using namespace lucene::util;
if ( stream_ ) stream_->close();
_CLDELETE( stream_ );
_CLDELETE( reader_ );
Field* field = 0;
while (*fields_ && field == NULL)
{
field = (*fields_)++;
if (!field->isAggregated())
{
field = 0;
}
}
if (field) {
if (field->stringValue() != NULL)
{
reader_ = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
}
else if (field->native().readerValue() != NULL)
{
Reader* r = field->native().readerValue();
FileReaderProxy* frp = dynamic_cast<FileReaderProxy*>(r);
if (frp == NULL)
{
_CLTHROWA(CL_ERR_IO, AGGR_NONFILEREADERPROXY_ERR);
}
else
{
reader_ = frp->clone();
}
}
else
{
_CLTHROWA(CL_ERR_IO, AGGR_STREAMREADER_ERR);
}
stream_ = analyzer_.tokenStream( field->name(), reader_ );
}
}
AggregateFieldAnalyzer::AggregateFieldAnalyzer(Cpix::Document& document,
lucene::analysis::Analyzer& analyzer)
: analyzer_(analyzer), document_(document)
{
}
lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
return new AggregateFieldTokenStream( analyzer_, document_.fields());
} else if ( wcscmp( fieldName, LCPIX_DEFAULT_PREFIX_FIELD ) == 0 ) {
return
new PrefixGenerator(
new AggregateFieldTokenStream( analyzer_, document_.fields()),
true,
OPTIMIZED_PREFIX_MAX_LENGTH);
} else {
return analyzer_.tokenStream( fieldName, reader );
}
}
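// Note on the aggregate fields: for LCPIX_DEFAULT_FIELD the returned
// stream concatenates the token streams of all aggregated fields of the
// document, each analyzed with the wrapped analyzer's settings for that
// field's name. For LCPIX_DEFAULT_PREFIX_FIELD the same aggregate stream
// is additionally run through PrefixGenerator, capped at
// OPTIMIZED_PREFIX_MAX_LENGTH characters.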
SystemAnalyzer::SystemAnalyzer(lucene::analysis::Analyzer* analyzer) : analyzer_(analyzer) {}
SystemAnalyzer::~SystemAnalyzer() { _CLDELETE(analyzer_); }
lucene::analysis::TokenStream* SystemAnalyzer::tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
using namespace lucene::analysis;
if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
// Use standard analyzer without stop filter for this task
TokenStream* ret = _CLNEW standard::StandardTokenizer(reader);
ret = _CLNEW standard::StandardFilter(ret,true);
ret = _CLNEW LowerCaseFilter(ret,true);
return ret;
} else if (wcscmp( fieldName, LCPIX_DOCUID_FIELD) == 0){
// Index the document uid as a single keyword token
return _CLNEW KeywordTokenizer(reader);
} else if (wcscmp( fieldName, LCPIX_APPCLASS_FIELD ) == 0){
// Tokenize on whitespace and lowercase the tokens
TokenStream* ret = _CLNEW WhitespaceTokenizer(reader);
ret = _CLNEW LowerCaseFilter(ret,true);
return ret;
} else if (wcscmp( fieldName, LCPIX_MIMETYPE_FIELD ) == 0) {
TokenStream* ret = _CLNEW KeywordTokenizer(reader);
return ret;
} else {
return analyzer_->tokenStream( fieldName, reader );
}
}
//
// The following sections provide the glue code for connecting the
// analyzer definition syntax with the analyzer, tokenizer and filter
// implementations.
//
// The glue code is template heavy, with the intent of providing
// automation for associating specific keywords with specific
// analyzers, tokenizers and filters implementing the corresponding
// CLucene abstractions. Additional classes are needed only if
// filters, tokenizers, etc. accept parameters.
//
// NOTE: To understand the analyzers, it is sufficient to understand
// that an analyzer transforms a character stream into a token stream
// (e.g. the character stream 'foobarmetawords' can be transformed into
// the token stream 'foo', 'bar', 'meta', 'words'). Analysis consists
// of two main parts: tokenization and filtering. Tokenization converts
// the character stream into a token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr')
// and filtering modifies the tokens (e.g. lowercase filtering 'FoO' ->
// 'foo', 'bAr' -> 'bar'). An analyzer object is responsible for
// constructing a tokenizer and a sequence of filters that perform
// these tasks.
//
// See the documentation around TokenizerClassEntries and
// FilterClassEntries to see how implementations not taking parameters
// can be easily added.
//
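// As a concrete sketch of the tokenization + filtering split (mirroring
// SystemAnalyzer::tokenStream above; 'reader' is assumed to be a
// lucene::util::Reader over the character stream):
//
//   lucene::analysis::TokenStream* s =
//       _CLNEW lucene::analysis::WhitespaceTokenizer(reader); // 'FoO bAr' -> 'FoO', 'bAr'
//   s = _CLNEW lucene::analysis::LowerCaseFilter(s, true);    // -> 'foo', 'bar'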
using namespace Cpix::AnalyzerExp;
/**
 * Creates a token stream for the given reader and field name.
 * This class is in many ways similar to the CLucene Analyzer
 * class definition.
 */
class TokenStreamFactory {
public:
virtual ~TokenStreamFactory();
virtual lucene::analysis::TokenStream* tokenStream(const wchar_t * fieldName,
lucene::util::Reader * reader) = 0;
};
TokenStreamFactory::~TokenStreamFactory() {}
/**
* Template class used to create CLucene tokenizers. Template
* parameter T must implement lucene::analysis::Tokenizer abstraction.
*/
template<class T>
class TokenizerFactory : public TokenStreamFactory
{
public:
TokenizerFactory(const Invokation& invokation) {
if (invokation.params().size() > 0) {
THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
invokation.id().c_str());
}
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/,
lucene::util::Reader * reader) {
return _CLNEW T(reader);
}
};
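// Usage sketch (hypothetical 'inv', 'fieldName' and 'reader' variables):
// given a parameterless invocation parsed from an analyzer definition,
//
//   TokenizerFactory<lucene::analysis::LetterTokenizer> factory(inv);
//   lucene::analysis::TokenStream* ts =
//       factory.tokenStream(fieldName, reader);
//
// would produce a plain LetterTokenizer for any field.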
/**
* Template class wrapping CLucene analyzers. Template parameter T must
* implement lucene::analysis::Analyzer abstraction.
*/
template<class T>
class AnalyzerWrap : public TokenStreamFactory
{
public:
AnalyzerWrap(const Invokation& invokation) : analyzer_() {
if (invokation.params().size() > 0) {
THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
invokation.id().c_str());
}
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return analyzer_.tokenStream(fieldName, reader);
}
private:
T analyzer_;
};
/**
 * Template class associating a CLucene filter with a TokenStreamFactory.
 * Uses the TokenStreamFactory to transform the given character stream into
 * a token stream and then applies the given CLucene filter to that token
 * stream. The template parameter T must implement the
 * lucene::analysis::TokenFilter abstraction.
 */
template<class T>
class FilterFactory : public TokenStreamFactory
{
public:
FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) {
if (invokation.params().size() > 0) {
THROW_CPIXEXC(L"Filter %S does not accept parameters",
invokation.id().c_str());
}
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return _CLNEW T(factory_->tokenStream(fieldName, reader), true);
}
private:
std::auto_ptr<TokenStreamFactory> factory_;
};
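// Factories compose: wrapping a factory in a FilterFactory appends one
// filter to the chain, which is exactly how CustomAnalyzer::setup (below)
// builds piped definitions. A sketch, with hypothetical parameterless
// Invokation objects 'inv1' and 'inv2':
//
//   auto_ptr<TokenStreamFactory> chain(
//       new TokenizerFactory<lucene::analysis::WhitespaceTokenizer>(inv1));
//   chain = auto_ptr<TokenStreamFactory>(
//       new FilterFactory<lucene::analysis::LowerCaseFilter>(inv2, chain));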
/**
 * Specialized AnalyzerWrap for CLucene's PerFieldAnalyzerWrapper. A
 * specialized template is needed because the per-field analyzer accepts
 * parameters (specific analyzers for different fields plus a default
 * analyzer).
 */
template<>
class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
public:
AnalyzerWrap(const Switch& sw) : analyzer_(0) {
using namespace Cpt::Parser;
using namespace lucene::analysis;
analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));
for (int i = 0; i < sw.cases().size(); i++) {
const Case& cs = *sw.cases()[i];
for (int j = 0; j < cs.fields().size(); j++) {
analyzer_->addAnalyzer( cs.fields()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping() ) );
}
}
}
virtual ~AnalyzerWrap() {
_CLDELETE(analyzer_);
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return analyzer_->tokenStream(fieldName, reader);
}
private:
lucene::analysis::PerFieldAnalyzerWrapper* analyzer_;
};
/**
 * Specialized StopFilter factory. A specialization is needed because
 * StopFilter takes parameters (a stop word list or a language).
 */
template<>
class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory
{
public:
FilterFactory(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory)
:words_(0), ownWords_(0), factory_(factory) {
using namespace Cpt::Parser;
if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) {
Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
//cpix_LangCode lang;
if (id->id() == CPIX_WLANG_EN) {
words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS;
} else {
THROW_CPIXEXC(L"No prepared stopword list for language code '%S'",
id->id().c_str());
}
} else {
ownWords_ = new wchar_t*[invokation.params().size()+1];
memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1));
// FIXME: args may leak
for (int i = 0; i < invokation.params().size(); i++) {
StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
if (lit) {
const wstring& str = lit->text();
ownWords_[i] = new wchar_t[str.length()+1];
wcscpy(ownWords_[i], str.c_str());
} else {
THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters.");
}
}
}
}
virtual ~FilterFactory() {
if (ownWords_) {
for (int i = 0; ownWords_[i]; i++) {
delete[] ownWords_[i];
}
delete[] ownWords_;
}
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_);
}
private:
const wchar_t **words_;
wchar_t **ownWords_; // owned
std::auto_ptr<TokenStreamFactory> factory_;
};
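// The two parameter forms accepted above: a single language identifier
// (currently only CPIX_WLANG_EN, mapping to CLucene's English stop word
// list) or an explicit list of string literals, which is copied into the
// owned, NULL-terminated ownWords_ array.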
/**
 * A specialized SnowballFilter factory is needed because SnowballFilter
 * accepts a parameter (the language).
 */
template<>
class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory
{
public:
FilterFactory(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory)
: factory_(factory) {
using namespace Cpt::Parser;
if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
}
Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
if (id->id() == CPIX_WLANG_EN) {
lang_ = cpix_LANG_EN;
} else {
THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
id->id().c_str());
}
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_);
}
private:
cpix_LangCode lang_;
std::auto_ptr<TokenStreamFactory> factory_;
};
/**
 * A specialized LengthFilter factory is needed because LengthFilter
 * accepts parameters (minimum and maximum token length).
 */
template<>
class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory
{
public:
FilterFactory(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory)
: factory_(factory) {
using namespace Cpt::Parser;
if (invokation.params().size() != 2 ||
!dynamic_cast<IntegerLit*>(invokation.params()[0]) ||
!dynamic_cast<IntegerLit*>(invokation.params()[1])) {
THROW_CPIXEXC("Length filter takes exactly two integer parameters");
}
min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value();
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ );
}
private:
int min_, max_;
std::auto_ptr<TokenStreamFactory> factory_;
};
/**
 * A specialized PrefixGenerator factory is needed because PrefixGenerator
 * requires the maximum prefix length.
 */
template<>
class FilterFactory<PrefixGenerator> : public TokenStreamFactory
{
public:
FilterFactory(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory)
: factory_(factory) {
using namespace Cpt::Parser;
if (invokation.params().size() != 1 ||
!dynamic_cast<IntegerLit*>(invokation.params()[0])) {
THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
}
maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
}
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
lucene::util::Reader * reader) {
return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ );
}
private:
int maxPrefixLength_;
std::auto_ptr<TokenStreamFactory> factory_;
};
typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory);
/**
 * Sets up a tokenizer factory with the given invocation parameters
 */
template<class T>
struct TokenizerFactoryCtor
{
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation));
}
};
/**
 * Sets up an analyzer wrap with the given invocation parameters
 */
template<class T>
struct AnalyzerWrapCtor
{
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation));
}
};
/**
 * Sets up a filter factory with the given invocation parameters
 */
template<class T>
struct FilterFactoryCtor
{
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
auto_ptr<TokenStreamFactory> factory) {
return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory));
}
};
struct TokenizerClassEntry {
const wchar_t *id_;
TokenizerFactoryCreator createFactory_;
};
//
// Following TokenizerClassEntries and FilterClassEntries contain
// the mapping from tokenizer/analyzer/filter names into glue code
// templates providing the implementations.
//
TokenizerClassEntry TokenizerClassEntries[] = {
{CPIX_TOKENIZER_STANDARD, TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
{CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
{CPIX_TOKENIZER_LETTER, TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
{CPIX_TOKENIZER_KEYWORD, TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
{CPIX_ANALYZER_STANDARD, AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},
// TODO: Add more Tokenizers/Analyzers
// Example tokenizer (works as such if the tokenizer doesn't take parameters)
// {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},
// Example analyzer (works as such if the analyzer doesn't take parameters)
// {CPIX_ANALYZER_MYANALYZER, AnalyzerWrapCtor<MyAnalyzer>::create},
{0, 0}
};
struct FilterClassEntry {
const wchar_t *id_;
FilterFactoryCreator createFactory_;
};
FilterClassEntry FilterClassEntries[] = {
{CPIX_FILTER_STANDARD, FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
{CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
{CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
{CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create},
{CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
{CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
{CPIX_FILTER_PREFIXES, FilterFactoryCtor<PrefixGenerator>::create},
// TODO: Add more Filters
// Example filter (works as such if the filter doesn't take parameters)
// {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create},
{0, 0}
};
CustomAnalyzer::CustomAnalyzer(const wchar_t* definition)
{
using namespace Cpt::Lex;
using namespace Cpt::Parser;
try
{
// 1. Set up a tokenizer and a lexer
Cpix::AnalyzerExp::Tokenizer tokenizer;
StdLexer lexer(tokenizer, definition);
// 2. Parse
std::auto_ptr<Piping> def = ParsePiping(lexer);
lexer.eatEof();
// 3. Set up this item based on the parsed definition
setup(*def);
}
catch (Cpt::ITxtCtxtExc & exc)
{
// provide additional context info for the thrown exception
exc.setContext(definition);
// rethrow
throw;
}
}
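// Usage sketch (the keyword spellings and the '>' piping syntax are
// illustrative; the actual keywords are the values of the
// CPIX_TOKENIZER_*/CPIX_FILTER_* constants and the accepted syntax is
// defined by ParsePiping). Assuming a lucene::util::Reader 'reader':
//
//   CustomAnalyzer analyzer(L"whitespace>lowercase>stop(en)");
//   lucene::analysis::TokenStream* ts =
//       analyzer.tokenStream(LCPIX_DEFAULT_FIELD, reader);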
CustomAnalyzer::CustomAnalyzer(const Piping& definition)
{
setup(definition);
}
using namespace Cpt::Parser;
void CustomAnalyzer::setup(const Piping& piping) {
// If the first item is an invocation, create the corresponding analyzer/tokenizer
if (dynamic_cast<const Invokation*>(&piping.tokenizer()))
{
const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() );
factory_ = tokenizerEntry.createFactory_( tokenizer );
} else {
// If the first item is a switch statement, create a per-field analyzer
const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer );
}
// Add filters
const std::vector<Invokation*>& filters = piping.filters();
for (int i = 0; i < filters.size(); i++) {
FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() );
factory_ = filterEntry.createFactory_( *filters[i], factory_ );
}
}
TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) {
// Looks for a match in TokenizerClassEntries. After finding a match,
// it returns the proper tokenizer/analyzer implementation provider.
//
for (int i = 0; TokenizerClassEntries[i].id_; i++) {
if (id == std::wstring(TokenizerClassEntries[i].id_)) {
return TokenizerClassEntries[i];
}
}
THROW_CPIXEXC(L"Unknown tokenizer '%S'.",
id.c_str());
}
FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) {
// Looks for a match in FilterClassEntries. After finding a match,
// it returns the proper filter implementation provider.
//
for (int i = 0; FilterClassEntries[i].id_; i++) {
if (id == std::wstring(FilterClassEntries[i].id_)) {
return FilterClassEntries[i];
}
}
THROW_CPIXEXC(L"Unknown filter '%S'.",
id.c_str());
}
CustomAnalyzer::~CustomAnalyzer() {}
lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t * fieldName,
lucene::util::Reader * reader) {
// Delegates to the token stream factory to form the token stream.
// The factory is prepared during custom analyzer construction, based
// on the analyzer definition string.
return factory_->tokenStream(fieldName, reader);
}
}