FCL/sf/mw/searchsrv: comparison searchengine/oss/cl/clucene/src/clucene/analysis/analyzers.h

equal deleted inserted replaced

--1:000000000000
+:671dee74050a
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_Analyzers_
+#define _lucene_analysis_Analyzers_
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+#include "CLucene/util/Reader.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "clucene/util/misc.h"
+CL_NS_DEF(analysis)
+/** An abstract base class for simple, character-oriented tokenizers.
+* Subclasses supply isTokenChar() to decide which characters belong to a
+* token and may override normalize() to transform each accepted character.
+* The scanning logic itself (the constructor, normalize() and next()) is
+* defined outside this header. */
+class CharTokenizer:public Tokenizer {
+private:
+	int32_t offset, bufferIndex, dataLen; // scan-state counters; their exact meaning is fixed by the implementation (not visible here)
+	TCHAR buffer[LUCENE_MAX_WORD_LEN+1]; // scratch space for LUCENE_MAX_WORD_LEN characters plus one extra slot (presumably the terminator -- confirm in the .cpp)
+	const TCHAR* ioBuffer; // read-only character pointer; presumably the chunk most recently obtained from the reader -- confirm in the .cpp
+protected:
+/** Returns true iff a character should be included in a token.  This
+* tokenizer generates as tokens adjacent sequences of characters which
+* satisfy this predicate.  Characters for which this is false are used to
+* define token boundaries and are not included in tokens.
+* Pure virtual: every concrete subclass must provide it. */
+	virtual bool isTokenChar(const TCHAR c) const = 0;
+/** Called on each token character to normalize it before it is added to the
+* token.  The default implementation does nothing (per this documentation;
+* the body is not in this header).  Subclasses may use this to, e.g.,
+* lowercase tokens. */
+	virtual TCHAR normalize(const TCHAR c) const;
+public:
+	CharTokenizer(CL_NS(util)::Reader* in); // `in` is the character source; ownership of the reader is not expressed in this header
+	virtual ~CharTokenizer(){
+	}
+	bool next(Token* token); // presumably fills `token` with the next token and returns false at end of input -- verify against the implementation
+};
+/** A LetterTokenizer is a tokenizer that divides text at non-letters.  That's
+to say, it defines tokens as maximal strings of adjacent letters.  The
+documentation inherited from the Java original names the
+java.lang.Character.isLetter() predicate; in this port the comment on
+isTokenChar() below names _istalpha instead.
+Note: this does a decent job for most European languages, but does a terrible
+job for some Asian languages, where words are not separated by spaces. */
+class LetterTokenizer:public CharTokenizer {
+public:
+	// Construct a new LetterTokenizer reading from `in`; the reader is simply forwarded to CharTokenizer.
+	LetterTokenizer(CL_NS(util)::Reader* in):
+	CharTokenizer(in) {}
+~LetterTokenizer(){}
+protected:
+/** Token-membership predicate required by CharTokenizer.
+* Collects only characters which satisfy _istalpha (the definition is not in
+* this header). */
+	bool isTokenChar(const TCHAR c) const;
+};
+/**
+* LowerCaseTokenizer performs the function of LetterTokenizer
+* and LowerCaseFilter together.  It divides text at non-letters and converts
+* the resulting tokens to lower case.  While it is functionally equivalent to
+* the combination of LetterTokenizer and LowerCaseFilter, there is a
+* performance advantage to doing the two tasks at once, hence this (redundant)
+* implementation.
+* <P>
+* Note: this does a decent job for most European languages, but does a terrible
+* job for some Asian languages, where words are not separated by spaces.
+*/
+class LowerCaseTokenizer:public LetterTokenizer {
+public:
+	/** Construct a new LowerCaseTokenizer reading from `in`; the reader is
+	* forwarded unchanged to LetterTokenizer. */
+	LowerCaseTokenizer(CL_NS(util)::Reader* in):
+	LetterTokenizer(in) {}
+~LowerCaseTokenizer(){}
+protected:
+	/** Per-character normalization hook inherited from CharTokenizer.
+	* The original comment here mentioned _totlower, so this presumably
+	* returns the lower-case form of `chr`; the definition is not in this
+	* header -- confirm in the implementation. */
+	TCHAR normalize(const TCHAR chr) const;
+};
+/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+* Adjacent sequences of non-whitespace characters form tokens.  No
+* normalize() override is declared here, so characters go through whatever
+* CharTokenizer::normalize() does. */
+class WhitespaceTokenizer: public CharTokenizer {
+public:
+	/** Construct a new WhitespaceTokenizer reading from `in`; the reader is
+	* forwarded unchanged to CharTokenizer. */
+	WhitespaceTokenizer(CL_NS(util)::Reader* in):CharTokenizer(in) {}
+	~WhitespaceTokenizer(){}
+protected:
+	/** Token-membership predicate required by CharTokenizer.
+	* Collects only characters which do not satisfy _istspace (the
+	* definition is not in this header).
+	*/
+	bool isTokenChar(const TCHAR c) const;
+};
+/** An Analyzer that uses WhitespaceTokenizer.
+* It declares no state of its own; only the stream factory below is
+* overridden. */
+class WhitespaceAnalyzer: public Analyzer {
+public:
+TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); // presumably returns a WhitespaceTokenizer over `reader`; who releases the returned stream is not stated in this header
+~WhitespaceAnalyzer(){}
+};
+/** An Analyzer that filters LetterTokenizer with LowerCaseFilter.
+* It declares no state of its own; only the stream factory below is
+* overridden. */
+class SimpleAnalyzer: public Analyzer {
+public:
+	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); // builds the token stream for `reader`; the definition (and whether `fieldName` is used) is not in this header
+	~SimpleAnalyzer(){}
+};
+/**
+* Normalizes token text to lower case.
+* Wraps another TokenStream; both constructor arguments are passed straight
+* through to TokenFilter.
+*/
+class LowerCaseFilter: public TokenFilter {
+public:
+	LowerCaseFilter(TokenStream* in, bool deleteTokenStream):TokenFilter(in,deleteTokenStream) {} // `deleteTokenStream` presumably tells TokenFilter whether it owns `in` -- see TokenFilter
+	~LowerCaseFilter(){}
+	bool next(Token* token); // produces the next token with lower-cased text (per the class description); defined outside this header
+};
+/**
+* Removes stop words from a token stream.
+* The stop words are held in a CLSetList that is either supplied by the
+* caller or set up by the array-based constructor.
+*/
+class StopFilter: public TokenFilter {
+private:
+	//bvk: i found this to work faster with a non-hash table. the number of items
+	//in the stop table is not likely to make it worth having hashing.
+	CL_NS(util)::CLSetList<const TCHAR*>* table;
+	bool ownTable; // false when the table was supplied by the caller (see the CLSetList constructor below); presumably true when the array constructor creates it -- confirm in the .cpp
+public:
+	// Constructs a filter which removes words from the input
+	//	TokenStream that are named in the array of words. Defined outside this header.
+	StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords);
+	~StopFilter();
+	/** Constructs a filter which removes words from the input
+	*	TokenStream that are named in the CLSetList.
+	*	The list pointer is stored as-is and ownTable is set to false,
+	*	so the caller's table is shared rather than copied.
+	*/
+	StopFilter(TokenStream* in, bool deleteTokenStream, CL_NS(util)::CLSetList<const TCHAR*>* stopTable):
+		TokenFilter(in, deleteTokenStream),
+		table(stopTable),
+		ownTable(false)
+	{}
+	/**
+	* Fills a stop-word table (a CLSetList, not a hash table despite the
+	* wording inherited from the Java original) from an array of stop words,
+	* appropriate for passing into the StopFilter constructor.  This permits
+	* this table construction to be cached once when an Analyzer is
+	* constructed.
+	* Note: the stopWords list must be a static list because the strings are not copied
+	*/
+	static void fillStopTable(CL_NS(util)::CLSetList<const TCHAR*>* stopTable,
+const TCHAR** stopWords);
+	/**
+	* Returns the next input Token whose termText() is not a stop word.
+	*/
+	bool next(Token* token);
+};
+/** Filters LetterTokenizer with LowerCaseFilter and StopFilter.
+* The stop-word table is held by value as a member, so it lives exactly as
+* long as the analyzer. */
+class StopAnalyzer: public Analyzer {
+CL_NS(util)::CLSetList<const TCHAR*> stopTable; // private by default (class scope); stores pointers to the stop words
+public:
+/** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
+StopAnalyzer();
+~StopAnalyzer();
+/** Builds an analyzer which removes words in the provided array.
+* How the end of the array is marked is not visible in this header. */
+StopAnalyzer( const TCHAR** stopWords );
+/** Filters LowerCaseTokenizer with StopFilter. */
+TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+	/** An array containing some common English words that are not usually useful
+for searching.  Declared here; the contents are defined elsewhere. */
+static const TCHAR* ENGLISH_STOP_WORDS[];
+};
+/**
+* This analyzer is used to facilitate scenarios where different
+* fields require different analysis techniques.  Use {@link #addAnalyzer}
+* to add a non-default analyzer on a field name basis.
+*
+* <p>Example usage (written in the Java style of the original documentation;
+* in this port field names are TCHAR strings and analyzers are passed as
+* pointers):
+*
+* <pre>
+*   PerFieldAnalyzerWrapper aWrapper =
+*      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
+*   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
+*   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
+* </pre>
+*
+* <p>In this example, StandardAnalyzer will be used for all fields except "firstname"
+* and "lastname", for which KeywordAnalyzer will be used.
+*
+* <p>A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing
+* and query parsing.
+*/
+class PerFieldAnalyzerWrapper : public Analyzer {
+private:
+Analyzer* defaultAnalyzer; // fallback analyzer for fields without a specific entry (per the constructor documentation below)
+CL_NS(util)::LHashMap<const TCHAR*, Analyzer*, CL_NS(util)::Compare::TChar,
+CL_NS(util)::Equals::TChar, CL_NS(util)::Deletor::tcArray,CL_NS(util)::Deletor::Void<Analyzer> > analyzerMap; // field name -> analyzer; the two Deletor arguments select the key and value cleanup policies (see the Deletor definitions for what each actually frees)
+public:
+/**
+* Constructs with default analyzer.
+*
+* @param defaultAnalyzer Any fields not specifically
+* defined to use a different analyzer will use the one provided here.
+* Whether the wrapper takes ownership of it is not visible in this header.
+*/
+PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer);
+~PerFieldAnalyzerWrapper();
+/**
+* Defines an analyzer to use for the specified field.
+*
+* @param fieldName field name requiring a non-default analyzer
+* @param analyzer non-default analyzer to use for field
+*/
+void addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer);
+TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); // presumably dispatches to the analyzer registered for `fieldName`, else to defaultAnalyzer -- defined outside this header
+};
+/**
+* A filter that replaces accented characters in the ISO Latin 1 character set
+* (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
+* <p>
+* For instance, a-grave (written '&agrave;' in the original HTML-flavoured
+* documentation) will be replaced by 'a'.
+* <p>
+*/
+class ISOLatin1AccentFilter: public TokenFilter {
+public:
+	ISOLatin1AccentFilter(TokenStream* input, bool deleteTs): // both arguments are forwarded unchanged to TokenFilter
+		TokenFilter(input,deleteTs)
+	{
+	}
+	/**
+	 * Produces the next token with accented characters replaced by their
+	 * unaccented equivalents (per the class description).  Defined outside
+	 * this header.
+	 */
+	bool next(Token* token);
+};
+/**
+* Emits the entire input as a single token.
+* The members below suggest a one-shot design: a `done` flag plus a
+* configurable buffer size.  The actual reading logic is defined outside
+* this header.
+*/
+class KeywordTokenizer: public Tokenizer {
+private:
+LUCENE_STATIC_CONSTANT(int, DEFAULT_BUFFER_SIZE = 256); // class-level constant: buffer size of 256
+bool done; // presumably set once the single token has been emitted -- confirm in the .cpp
+int bufferSize; // buffer size in effect for this instance
+public:
+KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize=-1); // the -1 default presumably selects DEFAULT_BUFFER_SIZE -- confirm in the .cpp
+virtual ~KeywordTokenizer();
+bool next(Token* token); // emits the whole input as one token (per the class description); defined outside this header
+};
+/**
+* "Tokenizes" the entire stream as a single token. This is useful
+* for data like zip codes, ids, and some product names.
+* It declares no state of its own; only the stream factory below is
+* overridden.
+*/
+class KeywordAnalyzer: public Analyzer {
+public:
+TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); // presumably returns a KeywordTokenizer over `reader` -- defined outside this header
+virtual ~KeywordAnalyzer(){}
+};
+/**
+* Removes words that are too long or too short from the stream.
+* The accepted length range is given by the _min and _max members; whether
+* the bounds are inclusive is decided by the implementation (not in this
+* header).
+*/
+class LengthFilter: public TokenFilter {
+private:
+int _min; // lower length bound
+int _max; // upper length bound
+public:
+/**
+* Build a filter that removes words that are too long or too
+* short from the text.
+* The first overload omits the deleteTs flag; what it passes to TokenFilter
+* in its place is determined by the implementation.
+*/
+LengthFilter(TokenStream* in, int _min, int _max);
+LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max);
+/**
+* Returns the next input Token whose termText() is the right length.
+*/
+bool next(Token* token);
+};
+CL_NS_END
+#endif

changeset 0	671dee74050a
child 21	2c484ac32ef0