/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_Analyzers_
#define _lucene_analysis_Analyzers_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/util/Reader.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "clucene/util/misc.h"

CL_NS_DEF(analysis)

/** An abstract base class for simple, character-oriented tokenizers.*/
class CharTokenizer:public Tokenizer {
private:
	int32_t offset, bufferIndex, dataLen;
	TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
	const TCHAR* ioBuffer;
protected:
    
    /** Returns true iff a character should be included in a token.  This
    * tokenizer generates tokens from maximal adjacent runs of characters
    * that satisfy this predicate.  Characters for which this returns false
    * define token boundaries and are not included in tokens. */
	virtual bool isTokenChar(const TCHAR c) const = 0;

    /** Called on each token character to normalize it before it is added to the
    * token.  The default implementation does nothing.  Subclasses may use this
    * to, e.g., lowercase tokens. */
	virtual TCHAR normalize(const TCHAR c) const;

public:
	CharTokenizer(CL_NS(util)::Reader* in);
	virtual ~CharTokenizer(){
	}
	bool next(Token* token);
};
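
/* Usage sketch (illustrative, not part of the original header): a minimal
 * CharTokenizer subclass that keeps only digit characters.  DigitTokenizer is
 * a hypothetical name, and _istdigit is assumed to be available alongside the
 * _istalpha and _istspace macros referenced elsewhere in this file.
 *
 *   class DigitTokenizer: public CharTokenizer {
 *   public:
 *       DigitTokenizer(CL_NS(util)::Reader* in): CharTokenizer(in) {}
 *   protected:
 *       // Token characters are digits; everything else is a boundary.
 *       bool isTokenChar(const TCHAR c) const { return _istdigit(c) != 0; }
 *   };
 */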


/** A LetterTokenizer is a tokenizer that divides text at non-letters.  That
is to say, it defines tokens as maximal strings of adjacent letters, as
defined by the java.lang.Character.isLetter() predicate.

Note: this does a decent job for most European languages, but a terrible
job for some Asian languages, where words are not separated by spaces. */
class LetterTokenizer:public CharTokenizer {
public:
	/** Construct a new LetterTokenizer. */
	LetterTokenizer(CL_NS(util)::Reader* in):
	CharTokenizer(in) {}

    ~LetterTokenizer(){}
protected:
    /** Collects only characters which satisfy _istalpha.*/
	bool isTokenChar(const TCHAR c) const;
};



/**
* LowerCaseTokenizer performs the function of LetterTokenizer
* and LowerCaseFilter together.  It divides text at non-letters and converts
* the resulting tokens to lower case.  While it is functionally equivalent to
* the combination of LetterTokenizer and LowerCaseFilter, there is a
* performance advantage to doing the two tasks at once, hence this
* (redundant) implementation.
* <P>
* Note: this does a decent job for most European languages, but a terrible
* job for some Asian languages, where words are not separated by spaces.
*/
class LowerCaseTokenizer:public LetterTokenizer {
public:
	/** Construct a new LowerCaseTokenizer. */
	LowerCaseTokenizer(CL_NS(util)::Reader* in):
	LetterTokenizer(in) {}

    ~LowerCaseTokenizer(){}
protected:
	/** Converts each token character to lower case via _totlower. */
	TCHAR normalize(const TCHAR chr) const;
};


/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
 * Adjacent sequences of non-whitespace characters form tokens. */
class WhitespaceTokenizer: public CharTokenizer {
public:
	/** Construct a new WhitespaceTokenizer. */ 
	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
	~WhitespaceTokenizer(){}
protected:
	/** Collects only characters which do not satisfy _istspace. */
	bool isTokenChar(const TCHAR c) const;
};


/** An Analyzer that uses WhitespaceTokenizer. */
class WhitespaceAnalyzer: public Analyzer {
 public:
  TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
  ~WhitespaceAnalyzer(){}
};

/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
class SimpleAnalyzer: public Analyzer {
public:
	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	~SimpleAnalyzer(){}
};
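
/* Usage sketch (illustrative): obtaining a TokenStream from one of these
 * analyzers and iterating its tokens.  StringReader is assumed to be the
 * in-memory Reader from CLucene/util/Reader.h, and the caller is assumed to
 * own the returned stream.
 *
 *   CL_NS(util)::StringReader reader(_T("The Quick Brown Fox"));
 *   SimpleAnalyzer analyzer;
 *   TokenStream* stream = analyzer.tokenStream(_T("contents"), &reader);
 *   Token token;
 *   while (stream->next(&token)) {
 *       // token.termText() yields "the", "quick", "brown", "fox"
 *   }
 *   _CLDELETE(stream);
 */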



/**
* Normalizes token text to lower case.
*/
class LowerCaseFilter: public TokenFilter {
public:
	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {}
	~LowerCaseFilter(){}
	bool next(Token* token);
};


/**
 * Removes stop words from a token stream.
 */
class StopFilter: public TokenFilter {
private:
	//bvk: I found this to work faster with a non-hash table. The number of
	//items in the stop table is not likely to make hashing worthwhile.
	CL_NS(util)::CLSetList<const TCHAR*>* table;
	bool ownTable; 
public:
	/** Constructs a filter which removes words from the input
	*	TokenStream that are named in the array of words. */
	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);

	~StopFilter();

	/** Constructs a filter which removes words from the input
	*	TokenStream that are named in the CLSetList.
	*/
	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
		TokenFilter(in, deleteTokenStream),
		table(stopTable),
		ownTable(false)
	{} 
	  
	
	/**
	* Builds a stop-word set from an array of stop words, appropriate for
	* passing into the StopFilter constructor.  This permits the table
	* construction to be cached once when an Analyzer is constructed.
	* Note: the stopWords array must be static because the strings are not copied.
	*/
	static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
                              const TCHAR** stopWords);

	/**
	* Returns the next input Token whose termText() is not a stop word.
	*/ 
	bool next(Token* token);
};
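
/* Usage sketch (illustrative, hypothetical names): caching a stop table once
 * and reusing it across filters.  The word array is assumed to be
 * NULL-terminated and must be static because fillStopTable does not copy the
 * strings; the CLSetList deleteValue constructor flag is also an assumption.
 *
 *   static const TCHAR* MY_STOP_WORDS[] = { _T("a"), _T("the"), NULL };
 *   CL_NS(util)::CLSetList<const TCHAR*> stopTable(false);
 *   StopFilter::fillStopTable(&stopTable, MY_STOP_WORDS);
 *   StopFilter filter(tokenizer, true, &stopTable);  // tokenizer: a TokenStream*
 */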




/** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
class StopAnalyzer: public Analyzer {
    CL_NS(util)::CLSetList<const TCHAR*> stopTable;

public:
    /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
    StopAnalyzer();
    ~StopAnalyzer();
    
    /** Builds an analyzer which removes words in the provided array. */
    StopAnalyzer( const TCHAR** stopWords );
    /** Filters LowerCaseTokenizer with StopFilter. */
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
	
	/** An array containing some common English words that are not usually
	useful for searching. */
    static const TCHAR* ENGLISH_STOP_WORDS[];
};
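
/* Usage sketch (illustrative, hypothetical word list): an analyzer with a
 * custom stop list.  The array is assumed to be NULL-terminated, and it must
 * be static because the strings are not copied.
 *
 *   static const TCHAR* MY_STOP_WORDS[] = { _T("foo"), _T("bar"), NULL };
 *   StopAnalyzer analyzer(MY_STOP_WORDS);
 */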



/**
 * This analyzer is used to facilitate scenarios where different
 * fields require different analysis techniques.  Use {@link #addAnalyzer}
 * to add a non-default analyzer on a field name basis.
 * 
 * <p>Example usage:
 * 
 * <pre>
 *   PerFieldAnalyzerWrapper* aWrapper =
 *      _CLNEW PerFieldAnalyzerWrapper(_CLNEW StandardAnalyzer());
 *   aWrapper->addAnalyzer(_T("firstname"), _CLNEW KeywordAnalyzer());
 *   aWrapper->addAnalyzer(_T("lastname"), _CLNEW KeywordAnalyzer());
 * </pre>
 * 
 * <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
 * and "lastname", for which KeywordAnalyzer will be used.
 * 
 * <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
 * and query parsing.
 */
class PerFieldAnalyzerWrapper : public Analyzer {
private:
    Analyzer* defaultAnalyzer;
    CL_NS(util)::LHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
    CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap;
public:
    /**
    * Constructs with default analyzer.
    *
    * @param defaultAnalyzer Any fields not specifically
    * defined to use a different analyzer will use the one provided here.
    */
    PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
    ~PerFieldAnalyzerWrapper();
    
    /**
    * Defines an analyzer to use for the specified field.
    *
    * @param fieldName field name requiring a non-default analyzer
    * @param analyzer non-default analyzer to use for field
    */
    void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
};


/**
 * A filter that replaces accented characters in the ISO Latin 1 character set
 * (ISO-8859-1) by their unaccented equivalents.  The case will not be altered.
 * <p>
 * For instance, '&agrave;' will be replaced by 'a'.
 */
class ISOLatin1AccentFilter: public TokenFilter {
public:
	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs):
		TokenFilter(input,deleteTs)
	{
	}
	
	/**
	 * Replaces accented characters in the token text
	 * with their unaccented equivalents.
	 */
	bool next(Token* token);
};
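
/* Usage sketch (illustrative): stripping accents from whitespace-separated
 * tokens.  StringReader is assumed to be the in-memory Reader from
 * CLucene/util/Reader.h; passing deleteTs=true makes the filter delete the
 * tokenizer when the filter itself is destroyed.
 *
 *   CL_NS(util)::StringReader reader(_T("crème brûlée"));
 *   ISOLatin1AccentFilter filter(_CLNEW WhitespaceTokenizer(&reader), true);
 *   Token token;
 *   while (filter.next(&token)) {
 *       // token.termText() yields "creme", then "brulee"
 *   }
 */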


/**
 * Emits the entire input as a single token.
 */
class KeywordTokenizer: public Tokenizer {
private:
    LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256);
    bool done;
    int bufferSize;
public:
    KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1);
    virtual ~KeywordTokenizer();
    bool next(Token* token);
};

/**
 * "Tokenizes" the entire stream as a single token. This is useful
 * for data like zip codes, ids, and some product names.
 */
class KeywordAnalyzer: public Analyzer {
public:
    TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
    virtual ~KeywordAnalyzer(){}
};
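
/* Usage sketch (illustrative): the whole input comes back as a single token,
 * which suits identifiers such as product codes.  Stream ownership by the
 * caller is assumed, as in the earlier analyzer example.
 *
 *   CL_NS(util)::StringReader reader(_T("AB-1234"));
 *   KeywordAnalyzer analyzer;
 *   TokenStream* stream = analyzer.tokenStream(_T("sku"), &reader);
 *   Token token;
 *   stream->next(&token);   // token.termText() yields "AB-1234"
 *   _CLDELETE(stream);
 */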

    
/**
 * Removes words that are too long or too short from the stream.
 */
class LengthFilter: public TokenFilter {
private:
    int _min;
    int _max;
public:
    /**
    * Build a filter that removes words that are too long or too
    * short from the text.
    */
    LengthFilter(TokenStream* in, int _min, int _max);
    LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max);
    
    /**
    * Returns the next input Token whose termText() is of an acceptable length.
    */
    bool next(Token* token);
};
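
/* Usage sketch (illustrative): keeping only tokens between 2 and 20
 * characters long.  The deleteTs=true overload makes the filter delete the
 * wrapped tokenizer; reader is assumed to be any CL_NS(util)::Reader*.
 *
 *   LengthFilter filter(_CLNEW WhitespaceTokenizer(reader), true, 2, 20);
 */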


CL_NS_END
#endif