searchengine/oss/cl/clucene/src/clucene/analysis/analyzers.cpp
       
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "clucene/stdheader.h"
#include "Analyzers.h"
#include "clucene/util/stringbuffer.h"

CL_NS_USE(util)
CL_NS_DEF(analysis)
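/**
* CharTokenizer groups consecutive characters accepted by isTokenChar() into
* tokens, passing each character through normalize() before it is buffered.
* Tokens are capped at LUCENE_MAX_WORD_LEN characters.
*/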
       
CharTokenizer::CharTokenizer(Reader* in) :
	Tokenizer(in),
	offset(0),
	bufferIndex(0),
	dataLen(0),
	ioBuffer(NULL)
{
	buffer[0]=0;
}

TCHAR CharTokenizer::normalize(const TCHAR c) const
{
	return c;
}
       
bool CharTokenizer::next(Token* token){
	int32_t length = 0;
	int32_t start = offset;
	while (true) {
		TCHAR c;
		offset++;
		if (bufferIndex >= dataLen) {
			dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE);
			if (dataLen == -1)
				dataLen = 0;
			bufferIndex = 0;
		}
		if (dataLen <= 0) {
			if (length > 0)
				break;
			else
				return false;
		} else
			c = ioBuffer[bufferIndex++];

		if (isTokenChar(c)) {                     // if it's a token TCHAR
			if (length == 0)                      // start of token
				start = offset-1;

			buffer[length++] = normalize(c);      // buffer it, normalized

			if (length == LUCENE_MAX_WORD_LEN)    // buffer overflow!
				break;
		} else if (length > 0)                    // at non-letter with buffered chars
			break;                                // return them
	}
	buffer[length]=0;
	token->set( buffer, start, start+length);
	return true;
}
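/**
* Concrete CharTokenizers: LetterTokenizer accepts alphabetic characters,
* LowerCaseTokenizer additionally lower-cases each character via normalize(),
* and WhitespaceTokenizer accepts anything that is not whitespace.
*/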
       
bool LetterTokenizer::isTokenChar(const TCHAR c) const {
	return _istalpha(c)!=0;
}

TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const {
	return _totlower(chr);
}

bool WhitespaceTokenizer::isTokenChar(const TCHAR c) const {
	return _istspace(c)==0; // return true if NOT a space
}

TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR*, Reader* reader) {
	return _CLNEW WhitespaceTokenizer(reader);
}

TokenStream* SimpleAnalyzer::tokenStream(const TCHAR*, Reader* reader) {
	return _CLNEW LowerCaseTokenizer(reader);
}
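/*
* A minimal usage sketch, assuming CLucene's CL_NS(util)::StringReader helper;
* names outside this file are illustrative, not guaranteed by this source:
*
*   CL_NS(util)::StringReader reader(_T("The Quick Brown Fox"));
*   SimpleAnalyzer analyzer;
*   TokenStream* stream = analyzer.tokenStream(NULL, &reader);
*   Token t;
*   while ( stream->next(&t) ) {
*       // t.termText() yields "the", "quick", "brown", "fox"
*   }
*   _CLDELETE(stream);
*/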
       
bool LowerCaseFilter::next(Token* t){
	if (!input->next(t))
		return false;
	stringCaseFold( t->_termText );
	return true;
}
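/**
* StopFilter removes every token whose text appears in the supplied stop-word
* table; fillStopTable() builds such a table from a NULL-terminated word array.
*/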
       
StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords):
	TokenFilter(in, deleteTokenStream),
	table(_CLNEW CLSetList<const TCHAR*>(false)),
	ownTable(true)
{
	fillStopTable( table, stopWords );
}

StopFilter::~StopFilter()
{
	if (ownTable) {
		_CLDELETE( table );
	}
}

void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable,
                               const TCHAR** stopWords) {
	for (int32_t i = 0; stopWords[i]!=NULL; i++)
		stopTable->insert(stopWords[i]);
}

bool StopFilter::next(Token* token) {
	// return the first non-stop word found
	while (input->next(token)){
		if (table->find(token->_termText)==table->end()){
			return true;
		}
	}
	// reached EOS -- return nothing
	return false;
}
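/*
* A minimal sketch of using a custom stop-word list (the word list below is
* illustrative, not part of this file):
*
*   static const TCHAR* MY_STOP_WORDS[] = { _T("foo"), _T("bar"), NULL };
*   StopAnalyzer analyzer(MY_STOP_WORDS);
*   // analyzer.tokenStream(...) lower-cases its input and drops "foo"/"bar".
*/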
       
StopAnalyzer::StopAnalyzer():stopTable(false)
{
	StopFilter::fillStopTable(&stopTable, ENGLISH_STOP_WORDS);
}
StopAnalyzer::~StopAnalyzer()
{
}
StopAnalyzer::StopAnalyzer( const TCHAR** stopWords) {
	StopFilter::fillStopTable(&stopTable, stopWords);
}
TokenStream* StopAnalyzer::tokenStream(const TCHAR*, Reader* reader) {
	return _CLNEW StopFilter(_CLNEW LowerCaseTokenizer(reader), true, &stopTable);
}

const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[] =
{
	_T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"),
	_T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"),
	_T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"),
	_T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"),
	_T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL
};
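/**
* PerFieldAnalyzerWrapper delegates to a per-field Analyzer registered via
* addAnalyzer(), falling back to the default Analyzer for unknown fields.
*/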
       
PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer):
    analyzerMap(true,true)
{
    this->defaultAnalyzer = defaultAnalyzer;
}
PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){
    analyzerMap.clear();
    _CLDELETE(defaultAnalyzer);
}

void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) {
    analyzerMap.put(STRDUP_TtoT(fieldName), analyzer);
}

TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) {
    Analyzer* analyzer = (fieldName==NULL ? defaultAnalyzer : analyzerMap.get(fieldName));
    if (analyzer == NULL) {
        analyzer = defaultAnalyzer;
    }
    return analyzer->tokenStream(fieldName, reader);
}
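/**
* ISOLatin1AccentFilter replaces accented characters in the ISO Latin-1 range
* (and, in UCS2 builds, the Œ/œ/Ÿ characters) with their unaccented ASCII
* equivalents, e.g. "Müller" becomes "Muller" and "Æon" becomes "AEon".
* Characters outside that range pass through unchanged.
*/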
       
bool ISOLatin1AccentFilter::next(Token* token){
	if ( input->next(token) ){
		int32_t l = token->termTextLength();
		const TCHAR* chars = token->termText();
		bool doProcess = false;
		for (int32_t i = 0; i < l; ++i) {
			#ifdef _UCS2
			if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) {
			#else
			if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) {
			#endif
				doProcess = true;
				break;
			}
		}
		if ( !doProcess ) {
			return true;
		}

		StringBuffer output(l*2);
		for (int32_t j = 0; j < l; j++) {
			#ifdef _UCS2
			TCHAR c = chars[j];
			#else
			unsigned char c = chars[j];
			#endif
			switch (c) {
				case 0xC0 : // À
				case 0xC1 : // Á
				case 0xC2 : // Â
				case 0xC3 : // Ã
				case 0xC4 : // Ä
				case 0xC5 : // Å
					output.appendChar('A');
					break;
				case 0xC6 : // Æ
					output.append(_T("AE"));
					break;
				case 0xC7 : // Ç
					output.appendChar('C');
					break;
				case 0xC8 : // È
				case 0xC9 : // É
				case 0xCA : // Ê
				case 0xCB : // Ë
					output.appendChar('E');
					break;
				case 0xCC : // Ì
				case 0xCD : // Í
				case 0xCE : // Î
				case 0xCF : // Ï
					output.appendChar('I');
					break;
				case 0xD0 : // Ð
					output.appendChar('D');
					break;
				case 0xD1 : // Ñ
					output.appendChar('N');
					break;
				case 0xD2 : // Ò
				case 0xD3 : // Ó
				case 0xD4 : // Ô
				case 0xD5 : // Õ
				case 0xD6 : // Ö
				case 0xD8 : // Ø
					output.appendChar('O');
					break;
				case 0xDE : // Þ
					output.append(_T("TH"));
					break;
				case 0xD9 : // Ù
				case 0xDA : // Ú
				case 0xDB : // Û
				case 0xDC : // Ü
					output.appendChar('U');
					break;
				case 0xDD : // Ý
					output.appendChar('Y');
					break;
				case 0xE0 : // à
				case 0xE1 : // á
				case 0xE2 : // â
				case 0xE3 : // ã
				case 0xE4 : // ä
				case 0xE5 : // å
					output.appendChar('a');
					break;
				case 0xE6 : // æ
					output.append(_T("ae"));
					break;
				case 0xE7 : // ç
					output.appendChar('c');
					break;
				case 0xE8 : // è
				case 0xE9 : // é
				case 0xEA : // ê
				case 0xEB : // ë
					output.appendChar('e');
					break;
				case 0xEC : // ì
				case 0xED : // í
				case 0xEE : // î
				case 0xEF : // ï
					output.appendChar('i');
					break;
				case 0xF0 : // ð
					output.appendChar('d');
					break;
				case 0xF1 : // ñ
					output.appendChar('n');
					break;
				case 0xF2 : // ò
				case 0xF3 : // ó
				case 0xF4 : // ô
				case 0xF5 : // õ
				case 0xF6 : // ö
				case 0xF8 : // ø
					output.appendChar('o');
					break;
				case 0xDF : // ß
					output.append(_T("ss"));
					break;
				case 0xFE : // þ
					output.append(_T("th"));
					break;
				case 0xF9 : // ù
				case 0xFA : // ú
				case 0xFB : // û
				case 0xFC : // ü
					output.appendChar('u');
					break;
				case 0xFD : // ý
				case 0xFF : // ÿ
					output.appendChar('y');
					break;

				#ifdef _UCS2
				case 0x152 : // Œ
					output.append(_T("OE"));
					break;
				case 0x153 : // œ
					output.append(_T("oe"));
					break;
				case 0x178 : // Ÿ
					output.appendChar('Y');
					break;
				#endif
				default :
					output.appendChar(c);
					break;
			}
		}
		token->setText(output.getBuffer());
		return true;
	}
	return false;
}
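/**
* KeywordAnalyzer/KeywordTokenizer emit the entire input of a field as a single
* token, which is useful for fields such as identifiers or paths that must not
* be split.
*/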
       
TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){
    return _CLNEW KeywordTokenizer(reader);
}

KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize):
	Tokenizer(input)
{
	this->done = false;
	// use the default only when no valid size is given; otherwise honour the
	// requested buffer size (it was previously left uninitialised)
	if ( bufferSize < 0 )
		this->bufferSize = DEFAULT_BUFFER_SIZE;
	else
		this->bufferSize = bufferSize;
}
       
KeywordTokenizer::~KeywordTokenizer(){
}

bool KeywordTokenizer::next(Token* token){
	if (!done) {
		done = true;
		int32_t rd;
		const TCHAR* buffer=0;
		while (true) {
			rd = input->read(buffer, bufferSize);
			if (rd == -1)
				break;
			token->growBuffer(token->_termTextLen + rd + 1);

			// copy no more characters than the token buffer can still hold,
			// and advance the length only by what was actually copied
			int32_t cp = rd;
			if ( token->_termTextLen + cp > token->bufferLength() )
				cp = token->bufferLength() - token->_termTextLen;
			_tcsncpy(token->_termText + token->_termTextLen, buffer, cp);
			token->_termTextLen += cp;
		}
		token->_termText[token->_termTextLen]=0;
		token->set(token->_termText, 0, token->_termTextLen);
		return true;
	}
	return false;
}
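/**
* LengthFilter keeps only tokens whose text length lies within [_min, _max]
* and silently drops all others.
*/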
       
LengthFilter::LengthFilter(TokenStream* in, int _min, int _max):
    TokenFilter(in)
{
    this->_min = _min;
    this->_max = _max;
}

LengthFilter::LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max):
    TokenFilter(in, deleteTs)
{
    this->_min = _min;
    this->_max = _max;
}

bool LengthFilter::next(Token* token)
{
    // return the first token whose length is within the configured bounds
    while ( input->next(token) )
    {
        size_t len = token->termTextLength();
        if (len >= _min && len <= _max)
            return true;
        // note: else we ignore it, but should we index each part of it?
    }
    // reached EOS -- return false
    return false;
}
       
CL_NS_END