--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/analysis/analyzers.cpp Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,404 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "clucene/stdheader.h"
+#include "Analyzers.h"
+#include "clucene/util/stringbuffer.h"
+
+CL_NS_USE(util)
+CL_NS_DEF(analysis)
+
+/** Builds a character-based tokenizer over the given Reader.
+ * offset: absolute position in the input (used for token start/end).
+ * bufferIndex/dataLen: cursor and fill level of the current read chunk.
+ * ioBuffer starts NULL; NOTE(review): it appears to be repointed by
+ * input->read() in next() — confirm Reader::read(const TCHAR*&, n) semantics. */
+CharTokenizer::CharTokenizer(Reader* in) :
+ Tokenizer(in),
+ offset(0),
+ bufferIndex(0),
+ dataLen(0),
+ ioBuffer(NULL)
+{
+ buffer[0]=0;
+}
+
+/** Default normalization hook: the identity mapping. Subclasses such as
+ * LowerCaseTokenizer override this to fold characters before buffering. */
+TCHAR CharTokenizer::normalize(const TCHAR c) const
+{
+	return c; // no transformation by default
+}
+/** Reads the next token: a maximal run of characters accepted by
+ * isTokenChar(), each mapped through normalize(). Sets the token text and
+ * its [start, start+length) offsets; returns false at end of input.
+ * A token is force-terminated once it reaches LUCENE_MAX_WORD_LEN. */
+bool CharTokenizer::next(Token* token){
+ int32_t length = 0;
+ int32_t start = offset;
+ while (true) {
+ TCHAR c;
+ offset++;
+ if (bufferIndex >= dataLen) {
+ // current chunk exhausted — refill from the reader
+ dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE);
+ if (dataLen == -1)
+ dataLen = 0;
+ bufferIndex = 0;
+ }
+ if (dataLen <= 0 ) {
+ // end of input: flush a pending token, otherwise signal EOS
+ if (length > 0)
+ break;
+ else
+ return false;
+ }else
+ c = ioBuffer[bufferIndex++];
+ if (isTokenChar(c)) { // if it's a token TCHAR
+
+ if (length == 0) // start of token
+ start = offset-1;
+
+ buffer[length++] = normalize(c); // buffer it, normalized
+
+ if (length == LUCENE_MAX_WORD_LEN) // buffer overflow!
+ break;
+
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+
+ }
+ buffer[length]=0;
+ token->set( buffer, start, start+length);
+ return true;
+}
+
+/** A character belongs to a token iff it is alphabetic. */
+bool LetterTokenizer::isTokenChar(const TCHAR c) const {
+	return ( _istalpha(c) != 0 );
+}
+
+
+/** Folds every token character to lower case while tokenizing. */
+TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const {
+	return _totlower( chr );
+}
+
+/** Token characters are everything that is not whitespace. */
+bool WhitespaceTokenizer::isTokenChar(const TCHAR c) const{
+	return ( _istspace(c) == 0 ); // true when NOT a space
+}
+
+/** Produces a tokenizer that splits the input at whitespace only.
+ * The field name is irrelevant: every field is analyzed the same way. */
+TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR* /*fieldName*/, Reader* reader) {
+	return _CLNEW WhitespaceTokenizer(reader);
+}
+
+/** Splits at non-letters and lower-cases — the classic "simple" analyzer.
+ * The field name is irrelevant: every field is analyzed the same way. */
+TokenStream* SimpleAnalyzer::tokenStream(const TCHAR* /*fieldName*/, Reader* reader) {
+	return _CLNEW LowerCaseTokenizer(reader);
+}
+
+/** Lower-cases the text of each token produced by the wrapped stream.
+ * Returns false once the underlying stream is exhausted. */
+bool LowerCaseFilter::next(Token* t){
+	if ( input->next(t) ){
+		stringCaseFold( t->_termText ); // fold in place
+		return true;
+	}
+	return false;
+}
+
+/** Wraps `in`, removing every token whose text appears in `stopWords`
+ * (a NULL-terminated array). deleteTokenStream: whether this filter owns
+ * and deletes `in`. The set is built with deleteKey=false — the stop-word
+ * strings themselves remain owned by the caller. */
+StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords):
+ TokenFilter(in, deleteTokenStream),
+ table(_CLNEW CLSetList<const TCHAR*>(false)),
+ ownTable(true)
+{
+ fillStopTable( table,stopWords );
+}
+
+/** Releases the stop-word set when this filter owns it. */
+StopFilter::~StopFilter()
+{
+	if (ownTable)
+		_CLDELETE( table );
+}
+
+
+/** Inserts every entry of the NULL-terminated stopWords array into
+ * stopTable. The string pointers are stored as-is — nothing is copied. */
+void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable,
+				const TCHAR** stopWords) {
+	const TCHAR** word = stopWords;
+	while ( *word != NULL ) {
+		stopTable->insert( *word );
+		++word;
+	}
+}
+
+/** Advances to the next token that is not a stop word.
+ * Returns false when the wrapped stream is exhausted. */
+bool StopFilter::next(Token* token) {
+	for (;;) {
+		if (!input->next(token))
+			return false; // reached EOS — nothing left
+		if (table->find(token->_termText) == table->end())
+			return true;  // not a stop word: keep it
+		// otherwise: stop word — skip and keep scanning
+	}
+}
+
+/** Builds an analyzer preloaded with ENGLISH_STOP_WORDS.
+ * stopTable(false): the static stop strings are not owned by the set. */
+StopAnalyzer::StopAnalyzer():stopTable(false)
+{
+ StopFilter::fillStopTable(&stopTable,ENGLISH_STOP_WORDS);
+}
+StopAnalyzer::~StopAnalyzer()
+{
+ // nothing to free: stopTable is a member and does not own its entries
+}
+/** Builds an analyzer with a caller-supplied NULL-terminated stop list.
+ * BUGFIX/consistency: initialize stopTable with deleteKey=false exactly like
+ * the default constructor above — the caller keeps ownership of the stop
+ * strings, so the set must never try to delete them. */
+StopAnalyzer::StopAnalyzer( const TCHAR** stopWords):stopTable(false) {
+	StopFilter::fillStopTable(&stopTable,stopWords);
+}
+/** Lower-cases letters and then removes the configured stop words.
+ * The StopFilter owns (and will delete) the inner tokenizer. */
+TokenStream* StopAnalyzer::tokenStream(const TCHAR* /*fieldName*/, Reader* reader) {
+	TokenStream* lowercased = _CLNEW LowerCaseTokenizer(reader);
+	return _CLNEW StopFilter(lowercased, true, &stopTable);
+}
+
+/** Default English stop-word list, NULL-terminated. The single letters
+ * "s" and "t" cover possessive/contraction fragments left by tokenizing
+ * on non-letters (e.g. "don't" -> "don", "t"). */
+const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[] =
+{
+ _T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"),
+ _T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"),
+ _T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"),
+ _T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"),
+ _T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL
+};
+
+/** Wraps a fallback analyzer used for fields with no dedicated analyzer.
+ * Takes ownership of defaultAnalyzer (deleted in the destructor).
+ * analyzerMap(true,true): the map deletes both its keys (the duplicated
+ * field names) and its analyzer values when cleared. */
+PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer):
+ analyzerMap(true,true)
+{
+ this->defaultAnalyzer = defaultAnalyzer;
+}
+PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){
+ analyzerMap.clear(); // frees the registered field names and analyzers
+ _CLDELETE(defaultAnalyzer); // we own the fallback analyzer too
+}
+
+/** Registers `analyzer` for `fieldName`. The name is duplicated and the map
+ * takes ownership of both the copy and the analyzer. */
+void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) {
+ analyzerMap.put(STRDUP_TtoT(fieldName), analyzer);
+}
+
+/** Routes the field to its registered analyzer; falls back to the default
+ * analyzer when fieldName is NULL or has no dedicated entry. */
+TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) {
+	Analyzer* chosen = NULL;
+	if (fieldName != NULL)
+		chosen = analyzerMap.get(fieldName);
+	if (chosen == NULL)
+		chosen = defaultAnalyzer;
+	return chosen->tokenStream(fieldName, reader);
+}
+
+
+
+/** Replaces ISO-Latin-1 accented characters (plus OE/oe/Y-diaeresis
+ * ligatures in UCS2 builds) with their unaccented ASCII equivalents,
+ * e.g. 0xC0 'A-grave' -> 'A', 0xC6 'AE' -> "AE", 0xDF 'sz' -> "ss".
+ * Tokens containing no such characters pass through untouched.
+ * Returns false at end of stream. */
+bool ISOLatin1AccentFilter::next(Token* token){
+ if ( input->next(token) ){
+ int32_t l = token->termTextLength();
+ const TCHAR* chars = token->termText();
+ bool doProcess = false;
+ // First pass: cheap scan — only rebuild the token if it actually
+ // contains a character in the accented range.
+ for (int32_t i = 0; i < l; ++i) {
+ #ifdef _UCS2
+ if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) {
+ #else
+ // narrow build: TCHAR may be a signed char, hence the < 0 test
+ if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) {
+ #endif
+ doProcess = true;
+ break;
+ }
+
+ }
+ if ( !doProcess ) {
+ return true;
+ }
+
+ // Second pass: rebuild the text; capacity l*2 because one input
+ // character can expand to two output characters (AE, TH, ss, ...).
+ StringBuffer output(l*2);
+ for (int32_t j = 0; j < l; j++) {
+ #ifdef _UCS2
+ TCHAR c = chars[j];
+ #else
+ unsigned char c = chars[j];
+ #endif
+ switch (c) {
+ case 0xC0 : // À
+ case 0xC1 : // Á
+ case 0xC2 : // Â
+ case 0xC3 : // Ã
+ case 0xC4 : // Ä
+ case 0xC5 : // Å
+ output.appendChar('A');
+ break;
+ case 0xC6 : // Æ
+ output.append(_T("AE"));
+ break;
+ case 0xC7 : // Ç
+ output.appendChar('C');
+ break;
+ case 0xC8 : // È
+ case 0xC9 : // É
+ case 0xCA : // Ê
+ case 0xCB : // Ë
+ output.appendChar('E');
+ break;
+ case 0xCC : // Ì
+ case 0xCD : // Í
+ case 0xCE : // Î
+ case 0xCF : // Ï
+ output.appendChar('I');
+ break;
+ case 0xD0 : // Ð
+ output.appendChar('D');
+ break;
+ case 0xD1 : // Ñ
+ output.appendChar('N');
+ break;
+ case 0xD2 : // Ò
+ case 0xD3 : // Ó
+ case 0xD4 : // Ô
+ case 0xD5 : // Õ
+ case 0xD6 : // Ö
+ case 0xD8 : // Ø
+ output.appendChar('O');
+ break;
+ case 0xDE : // Þ
+ output.append(_T("TH"));
+ break;
+ case 0xD9 : // Ù
+ case 0xDA : // Ú
+ case 0xDB : // Û
+ case 0xDC : // Ü
+ output.appendChar('U');
+ break;
+ case 0xDD : // Ý
+ output.appendChar('Y');
+ break;
+ case 0xE0 : // à
+ case 0xE1 : // á
+ case 0xE2 : // â
+ case 0xE3 : // ã
+ case 0xE4 : // ä
+ case 0xE5 : // å
+ output.appendChar('a');
+ break;
+ case 0xE6 : // æ
+ output.append(_T("ae"));
+ break;
+ case 0xE7 : // ç
+ output.appendChar('c');
+ break;
+ case 0xE8 : // è
+ case 0xE9 : // é
+ case 0xEA : // ê
+ case 0xEB : // ë
+ output.appendChar('e');
+ break;
+ case 0xEC : // ì
+ case 0xED : // í
+ case 0xEE : // î
+ case 0xEF : // ï
+ output.appendChar('i');
+ break;
+ case 0xF0 : // ð
+ output.appendChar('d');
+ break;
+ case 0xF1 : // ñ
+ output.appendChar('n');
+ break;
+ case 0xF2 : // ò
+ case 0xF3 : // ó
+ case 0xF4 : // ô
+ case 0xF5 : // õ
+ case 0xF6 : // ö
+ case 0xF8 : // ø
+ output.appendChar('o');
+ break;
+ case 0xDF : // ß
+ output.append(_T("ss"));
+ break;
+ case 0xFE : // þ
+ output.append(_T("th"));
+ break;
+ case 0xF9 : // ù
+ case 0xFA : // ú
+ case 0xFB : // û
+ case 0xFC : // ü
+ output.appendChar('u');
+ break;
+ case 0xFD : // ý
+ case 0xFF : // ÿ
+ output.appendChar('y');
+ break;
+
+ #ifdef _UCS2
+ case 0x152 : // Œ
+ output.append(_T("OE"));
+ break;
+ case 0x153 : // œ
+ output.append(_T("oe"));
+ break;
+ case 0x178 : // Ÿ
+ output.appendChar('Y');
+ break;
+ #endif
+ default :
+ output.appendChar(c); // unaccented character — copy as-is
+ break;
+ }
+ }
+ token->setText(output.getBuffer());
+ return true;
+ }
+ return false;
+}
+
+
+/** Emits the entire field value as a single token (no splitting). */
+TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){
+	(void)fieldName; // unused: every field is tokenized identically
+	return _CLNEW KeywordTokenizer(reader);
+}
+
+/** Tokenizer that emits the whole input as one token.
+ * bufferSize: read-chunk size; any negative value selects
+ * DEFAULT_BUFFER_SIZE.
+ * BUGFIX: the member was previously assigned only when the caller passed a
+ * negative value, leaving this->bufferSize uninitialized for every normal
+ * (non-negative) argument. */
+KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize):
+	Tokenizer(input)
+{
+	this->done = false;
+	if ( bufferSize < 0 )
+		this->bufferSize = DEFAULT_BUFFER_SIZE;
+	else
+		this->bufferSize = bufferSize;
+}
+// Trivial destructor: this tokenizer allocates nothing of its own.
+KeywordTokenizer::~KeywordTokenizer(){
+}
+
+/** Emits the whole input as a single token. The first call drains the
+ * reader and returns true (even for empty input); later calls return false.
+ * If the token buffer cannot grow far enough, the excess input is dropped. */
+bool KeywordTokenizer::next(Token* token){
+	if (!done) {
+		done = true;
+		int32_t rd;
+		const TCHAR* buffer=0;
+		while (true) {
+			rd = input->read(buffer, bufferSize);
+			if (rd == -1)
+				break;
+			token->growBuffer(token->_termTextLen +rd+1);
+
+			// growBuffer may cap the allocation below the requested size, so
+			// copy at most what fits, keeping one slot for the terminator.
+			int32_t cp = rd;
+			const int32_t avail = token->bufferLength()-1 - token->_termTextLen;
+			if ( cp > avail )
+				cp = avail;
+			if ( cp <= 0 ) // no room left (also guards against rd==0 spinning)
+				break;
+			_tcsncpy(token->_termText+token->_termTextLen,buffer,cp);
+			// BUGFIX: advance by the copied count (cp), not by rd — the old
+			// code set _termTextLen past the end of a clamped buffer and then
+			// wrote the NUL terminator out of bounds.
+			token->_termTextLen+=cp;
+			if ( cp < rd ) // buffer is full: remaining input is discarded
+				break;
+		}
+		token->_termText[token->_termTextLen]=0;
+		token->set(token->_termText,0,token->_termTextLen);
+		return true;
+	}
+	return false;
+}
+
+
+/** Keeps only tokens whose text length lies in [_min, _max].
+ * This overload does not take ownership of the wrapped stream. */
+LengthFilter::LengthFilter(TokenStream* in, int _min, int _max):
+	TokenFilter(in),
+	_min(_min),
+	_max(_max)
+{
+}
+
+/** Same as above; deleteTs additionally controls whether this filter owns
+ * (and deletes) the wrapped stream. */
+LengthFilter::LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max):
+	TokenFilter(in, deleteTs),
+	_min(_min),
+	_max(_max)
+{
+}
+
+/** Returns the next token whose length is within [_min, _max]; tokens
+ * outside the range are silently skipped. False at end of stream. */
+bool LengthFilter::next(Token* token)
+{
+	while ( input->next(token) )
+	{
+		const size_t len = token->termTextLength();
+		if (len >= (size_t)_min && len <= (size_t)_max)
+			return true;
+		// too short or too long — drop it and keep scanning
+		// note: else we ignore it but should we index each part of it?
+	}
+	// reached EOS
+	return false;
+}
+
+
+CL_NS_END