diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/inc/public/prefixfilter.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/prefixfilter.h	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,125 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+#ifndef PREFIXANALYSIS_H_
+#define PREFIXANALYSIS_H_
+
+#include "CLucene.h"
+
+namespace analysis {
+
+    extern const wchar_t* HebrewPrefixes[];
+
+    /**
+     * Returns alternative versions of each encountered token. The first
+     * version is the token without modifications; the other versions have
+     * the potential prefixes stripped one by one. So, for a word with three
+     * prefixes P1, P2 and P3 and a stem W, the first returned token is
+     * P1P2P3W, the second P2P3W, the third P3W and the last one W.
+     */
+    class PrefixFilter : public lucene::analysis::TokenFilter {
+
+    public:
+
+        PrefixFilter(lucene::analysis::TokenStream* input,
+                     bool deleteTs,
+                     const wchar_t** prefixes);
+
+        /** Sets token to the next token in the stream; returns false at end of stream. */
+        virtual bool next(lucene::analysis::Token* token);
+
+    private:
+        const wchar_t** prefixes_;
+        lucene::analysis::Token token_;
+
+        bool prefixFound_;
+
+
+
+    };
+
+    /**
+     * Standard tokenizer + standard filter + lowercase filter + Hebrew prefix filter
+     */
+    class HebrewAnalyzer : public lucene::analysis::Analyzer {
+
+    public:
+        virtual lucene::analysis::TokenStream*
+        tokenStream(const wchar_t* fieldName, lucene::util::Reader* reader);
+
+    };
+
+    /**
+     * Standard tokenizer + standard filter + lowercase filter
+     */
+    class HebrewQueryAnalyzer : public lucene::analysis::Analyzer {
+
+    public:
+        virtual lucene::analysis::TokenStream*
+        tokenStream(const wchar_t* fieldName, lucene::util::Reader* reader);
+
+    };
+
+    extern const wchar_t* FrenchArticles[];
+
+    /**
+     * The elision filter drops article+apostrophe pairs from the
+     * beginning of words.
+     */
+    class ElisionFilter : public lucene::analysis::TokenFilter {
+
+    public:
+
+        ElisionFilter(lucene::analysis::TokenStream* input,
+                      bool deleteTs,
+                      const wchar_t** articles);
+
+        /** Sets token to the next token in the stream; returns false at end of stream. */
+        virtual bool next(lucene::analysis::Token* token);
+
+    private:
+
+        const wchar_t** articles_;
+
+    };
+
+    /** Standard tokenizer + standard filter + lowercase filter + elision filter */
+    class FrenchAnalyzer : public lucene::analysis::Analyzer {
+
+    public:
+        virtual lucene::analysis::TokenStream*
+        tokenStream(const wchar_t* fieldName, lucene::util::Reader* reader);
+
+    };
+
+    class NonEnglishStopWords {
+
+    public:
+        static const TCHAR* FRENCH_STOP_WORDS[];
+        static const TCHAR* BRAZILIAN_STOP_WORDS[];
+        static const TCHAR* CZECH_STOP_WORDS[];
+        static const TCHAR* GERMAN_STOP_WORDS[];
+        static const TCHAR* GREEK_STOP_WORDS[];
+        static const TCHAR* DUTCH_STOP_WORDS[];
+        static const TCHAR* RUSSIAN_STOP_WORDS[];
+        static const TCHAR* EXTENDED_ENGLISH_STOP_WORDS[];
+
+    };
+
+}
+
+#endif /* PREFIXANALYSIS_H_ */
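
For illustration, a minimal sketch of driving the analyzers declared above, assuming the bundled CLucene exposes lucene::util::StringReader, Token::termText() and the bool next(Token*) convention used in this header; the field name "contents" and the input string are placeholders, not taken from the patch.

    #include "CLucene.h"
    #include "prefixfilter.h"
    #include <cwchar>

    int main()
    {
        analysis::HebrewAnalyzer analyzer;

        // Wrap the input text in a reader; any lucene::util::Reader works here.
        lucene::util::StringReader reader(L"input text here");

        // tokenStream() builds the tokenizer + filter chain described above.
        lucene::analysis::TokenStream* stream =
            analyzer.tokenStream(L"contents", &reader);

        // next() fills the token and returns false at end of stream. For a
        // word carrying prefixes P1 P2 P3 on stem W, PrefixFilter emits
        // P1P2P3W, then P2P3W, then P3W and finally W.
        lucene::analysis::Token token;
        while (stream->next(&token)) {
            wprintf(L"%ls\n", token.termText());
        }

        stream->close();
        delete stream;
        return 0;
    }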