searchengine/oss/cl/clucene/src/clucene/analysis/analyzers.h
changeset 0 671dee74050a
child 21 2c484ac32ef0
equal deleted inserted replaced
-1:000000000000 0:671dee74050a
       
     1 /*------------------------------------------------------------------------------
       
     2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
       
     3 * 
       
     4 * Distributable under the terms of either the Apache License (Version 2.0) or 
       
     5 * the GNU Lesser General Public License, as specified in the COPYING file.
       
     6 ------------------------------------------------------------------------------*/
       
     7 #ifndef _lucene_analysis_Analyzers_
       
     8 #define _lucene_analysis_Analyzers_
       
     9 
       
    10 #if defined(_LUCENE_PRAGMA_ONCE)
       
    11 # pragma once
       
    12 #endif
       
    13 
       
    14 #include "CLucene/util/Reader.h"
       
    15 #include "CLucene/analysis/AnalysisHeader.h"
       
    16 #include "clucene/util/misc.h"
       
    17 
       
    18 CL_NS_DEF(analysis)
       
    19 
       
    20 /** An abstract base class for simple, character-oriented tokenizers.
        * Subclasses must implement isTokenChar() and may override normalize();
        * next() is declared here and defined outside this header. */
    21 class CharTokenizer:public Tokenizer {
       
    22 private:
       	// NOTE(review): presumably position bookkeeping used by next(); the exact
       	// roles of these three counters are not visible in this header - confirm.
    23 	int32_t offset, bufferIndex, dataLen;
       	// Fixed array of LUCENE_MAX_WORD_LEN+1 characters.
       	// NOTE(review): presumably the token being assembled plus a terminator slot - confirm.
    24 	TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
       	// NOTE(review): presumably the chunk most recently obtained from the Reader - confirm.
    25 	const TCHAR* ioBuffer;
       
    26 protected:
       
    27     
       
    28     /** Returns true iff a character should be included in a token.  This
       
    29     * tokenizer generates as tokens adjacent sequences of characters which
       
    30     * satisfy this predicate.  Characters for which this is false are used to
       
    31     * define token boundaries and are not included in tokens.
           * Pure virtual: every concrete subclass has to provide it. */
    32 	virtual bool isTokenChar(const TCHAR c) const = 0;
       
    33 
       
    34     /** Called on each token character to normalize it before it is added to the
       
    35     * token.  The default implementation does nothing.  Subclasses may use this
       
    36     * to, e.g., lowercase tokens.
           * @return the character to store in the token in place of c. */
    37    	virtual TCHAR normalize(const TCHAR c) const;
       
    38 
       
    39 public:
       	// @param in  Reader handed to this tokenizer.
       	// NOTE(review): ownership of `in` is not visible in this header - confirm in Tokenizer.
    40 	CharTokenizer(CL_NS(util)::Reader* in);
       	// Virtual and empty: nothing is released here.
    41 	virtual ~CharTokenizer(){
       
    42 	}
       	// NOTE(review): presumably writes the next token into *token and returns
       	// false once the input is exhausted - confirm in the implementation file.
    43 	bool next(Token* token);
       
    44 };
       
    45 
       
    46 
       
    47 /** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
       
    48 to say, it defines tokens as maximal strings of adjacent letters, as defined
       
    49 by java.lang.Character.isLetter() predicate.
       NOTE(review): the isLetter() reference appears to be carried over from the Java
       Lucene documentation; the member comment below names _istalpha - confirm which
       predicate the implementation really uses.
    50 
       
    51 Note: this does a decent job for most European languages, but does a terrible
       
    52 job for some Asian languages, where words are not separated by spaces. */
       
    53 class LetterTokenizer:public CharTokenizer {
       
    54 public:
       
    55 	// Construct a new LetterTokenizer; `in` is forwarded unchanged to CharTokenizer.
       
    56 	LetterTokenizer(CL_NS(util)::Reader* in):
       
    57 	CharTokenizer(in) {}
       
    58 
           // Empty destructor: this class declares no data members of its own.
    59     ~LetterTokenizer(){}
       
    60 protected:
       
    61     /** Overrides CharTokenizer::isTokenChar.
           *  Collects only characters which satisfy _istalpha (definition is outside this header). */
    62 	bool isTokenChar(const TCHAR c) const;
       
    63 };
       
    64 
       
    65 
       
    66 
       
    67 /**
       
    68 * LowerCaseTokenizer performs the function of LetterTokenizer
       
    69 * and LowerCaseFilter together.  It divides text at non-letters and converts
       
    70 * the resulting tokens to lower case.  While it is functionally equivalent to the combination
       
    71 * of LetterTokenizer and LowerCaseFilter, there is a performance advantage
       
    72 * to doing the two tasks at once, hence this (redundant) implementation.
       
    73 * <P>
       
    74 * Note: this does a decent job for most European languages, but does a terrible
       
    75 * job for some Asian languages, where words are not separated by spaces.
       *
       * Token boundaries come from the inherited LetterTokenizer::isTokenChar; this
       * class only replaces normalize().
    76 */
       
    77 class LowerCaseTokenizer:public LetterTokenizer {
       
    78 public:
       
    79 	/** Construct a new LowerCaseTokenizer; `in` is forwarded to LetterTokenizer. */
       
    80 	LowerCaseTokenizer(CL_NS(util)::Reader* in):
       
    81 	LetterTokenizer(in) {}
       
    82 
           // Empty destructor: this class declares no data members of its own.
    83     ~LowerCaseTokenizer(){}
       
    84 protected:
       
    85 	/** Overrides CharTokenizer::normalize.
       	*   NOTE(review): presumably returns _totlower(chr); the definition is
       	*   outside this header - confirm. */
    86 	TCHAR normalize(const TCHAR chr) const;
       
    87 };
       
    88 
       
    89 
       
    90 /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
       
    91  * Adjacent sequences of non-Whitespace characters form tokens.
        * Only isTokenChar() is replaced; normalize() is inherited from CharTokenizer. */
    92 class WhitespaceTokenizer: public CharTokenizer {
       
    93 public:
       
    94 	/** Construct a new WhitespaceTokenizer; `in` is forwarded to CharTokenizer. */ 
       
    95 	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
       	// Empty destructor: this class declares no data members of its own.
    96 	~WhitespaceTokenizer(){}
       
    97 protected:
       
    98 	/** Overrides CharTokenizer::isTokenChar.
       	*   Collects only characters which do not satisfy _istspace.
    99 	*/
       
   100 	bool isTokenChar(const TCHAR c) const;
       
   101 };
       
   102 
       
   103 
       
   104 /** An Analyzer that uses WhitespaceTokenizer. */
       
   105 class WhitespaceAnalyzer: public Analyzer {
       
   106  public:
         // @param fieldName  name of the field being analysed
         // @param reader     source of the text to tokenize
         // NOTE(review): presumably returns a newly allocated WhitespaceTokenizer over
         // `reader` that the caller must release - confirm in the implementation file.
   107   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
         // Empty destructor: the class holds no state.
   108   ~WhitespaceAnalyzer(){}
       
   109 };
       
   110 
       
   111 /** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
       
   112 class SimpleAnalyzer: public Analyzer {
       
   113 public:
       	// @param fieldName  name of the field being analysed
       	// @param reader     source of the text to tokenize
       	// NOTE(review): which tokenizer/filter objects are actually created, and who
       	// owns the returned stream, is decided in the implementation file - confirm there.
   114 	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
       	// Empty destructor: the class holds no state.
   115 	~SimpleAnalyzer(){}
       
   116 };
       
   117 
       
   118 
       
   119 
       
   120 /**
       
   121 * Normalizes token text to lower case.
       
   122 */
       
   123 class LowerCaseFilter: public TokenFilter {
       
   124 public:
       	// Both arguments are passed straight through to TokenFilter.
       	// NOTE(review): deleteTokenStream presumably tells TokenFilter whether it
       	// should delete `in` when the filter is destroyed - confirm in AnalysisHeader.h.
   125 	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
       	// Empty destructor: this class declares no data members of its own.
   126 	~LowerCaseFilter(){}
       	// NOTE(review): presumably fetches the next token from the wrapped stream,
       	// lowercases its text and returns false at end of stream - confirm in the .cpp.
   127 	bool next(Token* token);
       
   128 };
       
   129 
       
   130 
       
   131 /**
       
   132  * Removes stop words from a token stream.
       
   133  */
       
   134 class StopFilter: public TokenFilter {
       
   135 private:
       
   136 	//bvk: I found this to work faster with a non-hash table. The number of items
       
   137 	//in the stop table is not likely to make hashing worthwhile.
       
   138 	CL_NS(util)::CLSetList<const TCHAR*>* table;
       	// NOTE(review): presumably true when this filter allocated `table` itself and
       	// must free it in ~StopFilter() - confirm in the implementation file.
   139 	bool ownTable; 
       
   140 public:
       
   141 	// Constructs a filter which removes words from the input
       
   142 	//	TokenStream that are named in the array of words. 
       	// NOTE(review): stopWords is presumably a NULL-terminated array (no length is
       	// passed) - confirm against the definition.
   143 	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);
       
   144 
       
   145 	~StopFilter();
       
   146 
       
   147 	/** Constructs a filter which removes words from the input
       
   148 	*	TokenStream that are named in the CLSetList.
       	*	The pointer is stored as-is and ownTable is set to false.
   149 	*/
       
   150 	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
       
   151 		TokenFilter(in, deleteTokenStream),
       
   152 		table(stopTable),
       
   153 		ownTable(false)
       
   154 	{} 
       
   155 	  
       
   156 	
       
   157 	/**
       
   158 	* Fills the caller-supplied stop table (a CLSetList) from an array of stop words,
       
   159 	* so it can be passed to the StopFilter constructor.  This lets the table be
       
   160 	* built once and cached when an Analyzer is constructed.
       
   161 	* Note: the stopWords list must be a static list because the strings are not copied
       	* NOTE(review): stopWords is presumably NULL-terminated (no length is passed) - confirm.
   162 	*/
       
   163 	static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
       
   164                               const TCHAR** stopWords);
       
   165 
       
   166 	/**
       
   167 	* Returns the next input Token whose termText() is not a stop word.
       	* NOTE(review): the bool result presumably is false at end of stream - confirm.
   168 	*/ 
       
   169 	bool next(Token* token);
       
   170 };
       
   171 
       
   172 
       
   173 
       
   174 
       
   175 /** Filters LetterTokenizer with LowerCaseFilter and StopFilter.
        *  NOTE(review): the tokenStream() comment below instead says LowerCaseTokenizer
        *  with StopFilter; the two descriptions should be reconciled with the .cpp. */
   176 class StopAnalyzer: public Analyzer {
           // Stop-word set held by value (private by default class access).
           // NOTE(review): presumably filled by the constructors and handed to each
           // StopFilter created in tokenStream() - confirm in the implementation file.
   177     CL_NS(util)::CLSetList<const TCHAR*> stopTable;
       
   178 
       
   179 public:
       
   180     /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
       
   181     StopAnalyzer();
       
   182     ~StopAnalyzer();
       
   183     
       
   184     /** Builds an analyzer which removes words in the provided array.
           *  NOTE(review): presumably a NULL-terminated array whose strings must
           *  outlive the analyzer (see StopFilter::fillStopTable) - confirm. */
   185     StopAnalyzer( const TCHAR** stopWords );
       
   186     /** Filters LowerCaseTokenizer with StopFilter. */
       
   187     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
       
   188 	
       
   189 	/** An array containing some common English words that are not usually useful
       
   190     for searching.  Declared here; the definition is outside this header. */
       
   191     static const TCHAR* ENGLISH_STOP_WORDS[];
       
   192 };
       
   193 
       
   194 
       
   195 
       
   196 /**
       
   197  * This analyzer is used to facilitate scenarios where different
       
   198  * fields require different analysis techniques.  Use {@link #addAnalyzer}
       
   199  * to add a non-default analyzer on a field name basis.
       
   200  * 
       
   201  * <p>Example usage:
        * NOTE(review): the snippet below is written Java-style; addAnalyzer() in this
        * class takes const TCHAR* field names, so adapt it for C++ use.
   202  * 
       
   203  * <pre>
       
   204  *   PerFieldAnalyzerWrapper aWrapper =
       
   205  *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
       
   206  *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
       
   207  *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
       
   208  * </pre>
       
   209  * 
       
   210  * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
       
   211  * and "lastname", for which KeywordAnalyzer will be used.
       
   212  * 
       
   213  * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
       
   214  * and query parsing.
       
   215  */
       
   216 class PerFieldAnalyzerWrapper : public Analyzer {
       
   217 private:
           // Analyzer for fields without an explicit entry (see the constructor doc).
   218     Analyzer* defaultAnalyzer;
           // Field name -> analyzer map, compared with Compare::TChar / Equals::TChar.
           // Key deletor is Deletor::tcArray, value deletor is Deletor::Void<Analyzer>.
           // NOTE(review): presumably keys are owned string copies and the map itself
           // does not delete the analyzers - confirm in the util headers and the destructor.
   219     CL_NS(util)::LHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
       
   220     CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
       
   221 public:
       
   222     /**
       
   223     * Constructs with default analyzer.
       
   224     *
       
   225     * @param defaultAnalyzer Any fields not specifically
       
   226     * defined to use a different analyzer will use the one provided here.
           * NOTE(review): whether this wrapper takes ownership of defaultAnalyzer is
           * not visible in this header - confirm in the destructor.
   227     */
       
   228     PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
       
   229     ~PerFieldAnalyzerWrapper();
       
   230     
       
   231     /**
       
   232     * Defines an analyzer to use for the specified field.
       
   233     *
       
   234     * @param fieldName field name requiring a non-default analyzer
       
   235     * @param analyzer non-default analyzer to use for field
       
   236     */
       
   237     void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
           // NOTE(review): presumably looks fieldName up in analyzerMap, falls back to
           // defaultAnalyzer, and delegates to that analyzer's tokenStream() - confirm.
   238     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
       
   239 };
       
   240 
       
   241 
       
   242 /**
       
   243  * A filter that replaces accented characters in the ISO Latin 1 character set 
       
   244  * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
       
   245  * <p>
       
   246  * For instance, a-grave (U+00E0) will be replaced by 'a'.
       
   247  *
       
   248  */
       
   249 class ISOLatin1AccentFilter: public TokenFilter {
       
   250 public:
       	// Both arguments are passed straight through to TokenFilter.
       	// NOTE(review): deleteTs presumably tells TokenFilter whether it should
       	// delete `input` when the filter is destroyed - confirm in AnalysisHeader.h.
   251 	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
       
   252 		TokenFilter(input,deleteTs)
       
   253 	{
       
   254 	}
       
   255 	
       
   256 	/**
       
   257 	 * Replaces accented characters in the token text
       
   258 	 *  by their unaccented equivalents.
       	 * NOTE(review): the bool result presumably is false at end of stream - confirm.
   259 	 */
       
   260 	bool next(Token* token);
       
   261 };
       
   262 
       
   263 
       
   264 /**
       
   265  * Emits the entire input as a single token.
       
   266  */
       
   267 class KeywordTokenizer: public Tokenizer {
       
   268 private:
           // Class constant with value 256.
           // NOTE(review): presumably substituted when the constructor is called with
           // its default bufferSize of -1 - confirm in the implementation file.
   269     LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
           // NOTE(review): presumably set once the single token has been emitted - confirm.
   270     bool done;
           // NOTE(review): presumably the chunk size used when reading the input - confirm.
   271     int bufferSize;
       
   272 public:
           // @param input       Reader supplying the text
           // @param bufferSize  optional size, defaults to -1
   273     KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
       
   274     virtual ~KeywordTokenizer();
           // NOTE(review): presumably returns true exactly once, with the whole input
           // as the token, and false afterwards - confirm in the implementation file.
   275     bool next(Token* token);
       
   276 };
       
   277 
       
   278 /**
       
   279  * "Tokenizes" the entire stream as a single token. This is useful
       
   280  * for data like zip codes, ids, and some product names.
       
   281  */
       
   282 class KeywordAnalyzer: public Analyzer {
       
   283 public:
           // @param fieldName  name of the field being analysed
           // @param reader     source of the text
           // NOTE(review): presumably returns a newly allocated KeywordTokenizer over
           // `reader` that the caller must release - confirm in the implementation file.
   284     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
           // Virtual and empty: the class holds no state.
   285     virtual ~KeywordAnalyzer(){}
       
   286 };
       
   287 
       
   288     
       
   289 /**
       
   290  * Removes words that are too long or too short from the stream.
       
   291  *
       
   292  */
       
   293 class LengthFilter: public TokenFilter {
       
   294 private:
           // Length bounds supplied to the constructors.
           // NOTE(review): whether the bounds are inclusive is decided in next() - confirm.
   295     int _min;
       
   296     int _max;
       
   297 public:
       
   298     /**
       
   299     * Build a filter that removes words that are too long or too
       
   300     * short from the text.
           * The second overload additionally takes deleteTs.
           * NOTE(review): deleteTs presumably controls whether `in` is deleted with the
           * filter, and the first overload's default for it is not visible here - confirm.
   301     */
       
   302     LengthFilter(TokenStream* in, int _min, int _max);
       
   303     LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max);
       
   304     
       
   305     /**
       
   306     * Returns the next input Token whose termText() has an accepted length.
           * NOTE(review): the bool result presumably is false at end of stream - confirm.
   307     */
       
   308     bool next(Token* token);
       
   309 };
       
   310 
       
   311 
       
   312 CL_NS_END
       
   313 #endif