searchengine/oss/cl/clucene/src/clucene/analysis/analyzers.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
parent 0 671dee74050a
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     1
/*------------------------------------------------------------------------------
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     2
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     3
* 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     4
* Distributable under the terms of either the Apache License (Version 2.0) or 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     5
* the GNU Lesser General Public License, as specified in the COPYING file.
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     6
------------------------------------------------------------------------------*/
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     7
#include "clucene/stdheader.h"
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     8
#include "Analyzers.h"
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
     9
#include "clucene/util/stringbuffer.h"
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    10
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    11
CL_NS_USE(util)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    12
CL_NS_DEF(analysis)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    13
		
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    14
CharTokenizer::CharTokenizer(Reader* in) :
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    15
	Tokenizer(in),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    16
	offset(0),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    17
	bufferIndex(0),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    18
	dataLen(0),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    19
	ioBuffer(NULL)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    20
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    21
	buffer[0]=0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    22
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    23
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    24
TCHAR CharTokenizer::normalize(const TCHAR c) const 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    25
{ 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    26
	return c; 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    27
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    28
bool CharTokenizer::next(Token* token){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    29
	int32_t length = 0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    30
	int32_t start = offset;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    31
	while (true) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    32
		TCHAR c;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    33
		offset++;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    34
		if (bufferIndex >= dataLen) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    35
			dataLen = input->read(ioBuffer, LUCENE_IO_BUFFER_SIZE);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    36
			if (dataLen == -1)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    37
				dataLen = 0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    38
			bufferIndex = 0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    39
		}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    40
		if (dataLen <= 0 ) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    41
			if (length > 0)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    42
				break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    43
			else
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    44
				return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    45
		}else
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    46
			c = ioBuffer[bufferIndex++];
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    47
		if (isTokenChar(c)) {                       // if it's a token TCHAR
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    48
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    49
			if (length == 0)			  // start of token
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    50
				start = offset-1;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    51
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    52
			buffer[length++] = normalize(c);          // buffer it, normalized
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    53
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    54
			if (length == LUCENE_MAX_WORD_LEN)		  // buffer overflow!
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    55
				break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    56
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    57
		} else if (length > 0)			  // at non-Letter w/ chars
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    58
			break;					  // return 'em
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    59
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    60
	}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    61
	buffer[length]=0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    62
	token->set( buffer, start, start+length);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    63
	return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    64
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    65
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    66
bool LetterTokenizer::isTokenChar(const TCHAR c) const {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    67
	return _istalpha(c)!=0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    68
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    69
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    70
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    71
TCHAR LowerCaseTokenizer::normalize(const TCHAR chr) const {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    72
	return _totlower(chr);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    73
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    74
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    75
bool WhitespaceTokenizer::isTokenChar(const TCHAR c)  const{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    76
	return _istspace(c)==0; //(return true if NOT a space)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    77
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    78
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    79
TokenStream* WhitespaceAnalyzer::tokenStream(const TCHAR* , Reader* reader) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    80
	return _CLNEW WhitespaceTokenizer(reader);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    81
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    82
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    83
TokenStream* SimpleAnalyzer::tokenStream(const TCHAR* , Reader* reader) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    84
	return _CLNEW LowerCaseTokenizer(reader);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    85
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    86
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    87
bool LowerCaseFilter::next(Token* t){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    88
	if (!input->next(t))
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    89
		return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    90
 	stringCaseFold( t->_termText );
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    91
	return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    92
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    93
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    94
StopFilter::StopFilter(TokenStream* in, bool deleteTokenStream, const TCHAR** stopWords):
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    95
	TokenFilter(in, deleteTokenStream),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    96
	table(_CLNEW CLSetList<const TCHAR*>(false)),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    97
	ownTable(true)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    98
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
    99
	fillStopTable( table,stopWords );
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   100
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   101
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   102
StopFilter::~StopFilter()
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   103
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   104
	if (ownTable) { 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   105
		_CLDELETE( table ); 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   106
	}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   107
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   108
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   109
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   110
void StopFilter::fillStopTable(CLSetList<const TCHAR*>* stopTable,
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   111
								  const TCHAR** stopWords) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   112
	for (int32_t i = 0; stopWords[i]!=NULL; i++)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   113
		stopTable->insert(stopWords[i]);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   114
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   115
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   116
bool StopFilter::next(Token* token) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   117
	// return the first non-stop word found
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   118
	while (input->next(token)){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   119
		if (table->find(token->_termText)==table->end()){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   120
			return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   121
		}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   122
	}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   123
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   124
	// reached EOS -- return nothing
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   125
	return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   126
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   127
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   128
StopAnalyzer::StopAnalyzer():stopTable(false)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   129
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   130
	StopFilter::fillStopTable(&stopTable,ENGLISH_STOP_WORDS);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   131
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   132
StopAnalyzer::~StopAnalyzer()
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   133
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   134
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   135
StopAnalyzer::StopAnalyzer( const TCHAR** stopWords) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   136
	StopFilter::fillStopTable(&stopTable,stopWords);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   137
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   138
TokenStream* StopAnalyzer::tokenStream(const TCHAR* , Reader* reader) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   139
	return _CLNEW StopFilter(_CLNEW LowerCaseTokenizer(reader),true, &stopTable);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   140
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   141
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   142
const TCHAR* StopAnalyzer::ENGLISH_STOP_WORDS[]  = 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   143
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   144
	_T("a"), _T("an"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), _T("but"), _T("by"),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   145
	_T("for"), _T("if"), _T("in"), _T("into"), _T("is"), _T("it"),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   146
	_T("no"), _T("not"), _T("of"), _T("on"), _T("or"), _T("s"), _T("such"),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   147
	_T("t"), _T("that"), _T("the"), _T("their"), _T("then"), _T("there"), _T("these"),
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   148
	_T("they"), _T("this"), _T("to"), _T("was"), _T("will"), _T("with"), NULL
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   149
};
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   150
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   151
PerFieldAnalyzerWrapper::PerFieldAnalyzerWrapper(Analyzer* defaultAnalyzer):
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   152
    analyzerMap(true,true)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   153
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   154
    this->defaultAnalyzer = defaultAnalyzer;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   155
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   156
PerFieldAnalyzerWrapper::~PerFieldAnalyzerWrapper(){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   157
    analyzerMap.clear();
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   158
    _CLDELETE(defaultAnalyzer);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   159
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   160
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   161
void PerFieldAnalyzerWrapper::addAnalyzer(const TCHAR* fieldName, Analyzer* analyzer) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   162
    analyzerMap.put(STRDUP_TtoT(fieldName), analyzer);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   163
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   164
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   165
TokenStream* PerFieldAnalyzerWrapper::tokenStream(const TCHAR* fieldName, Reader* reader) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   166
    Analyzer* analyzer = (fieldName==NULL?defaultAnalyzer:analyzerMap.get(fieldName));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   167
    if (analyzer == NULL) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   168
      analyzer = defaultAnalyzer;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   169
    }
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   170
    
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   171
    return analyzer->tokenStream(fieldName, reader);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   172
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   173
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   174
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   175
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   176
bool ISOLatin1AccentFilter::next(Token* token){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   177
	if ( input->next(token) ){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   178
		int32_t l = token->termTextLength();
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   179
		const TCHAR* chars = token->termText();
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   180
		bool doProcess = false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   181
		for (int32_t i = 0; i < l; ++i) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   182
			#ifdef _UCS2
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   183
			if ( chars[i] >= 0xC0 && chars[i] <= 0x178 ) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   184
			#else
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   185
			if ( (chars[i] >= 0xC0 && chars[i] <= 0xFF) || chars[i] < 0 ) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   186
			#endif
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   187
				doProcess = true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   188
				break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   189
			}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   190
			
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   191
		}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   192
		if ( !doProcess ) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   193
			return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   194
		}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   195
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   196
		StringBuffer output(l*2);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   197
		for (int32_t j = 0; j < l; j++) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   198
			#ifdef _UCS2
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   199
			TCHAR c = chars[j];
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   200
			#else
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   201
			unsigned char c = chars[j];
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   202
			#endif
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   203
			switch (c) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   204
				case 0xC0 : // À
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   205
				case 0xC1 : // Á
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   206
				case 0xC2 : // Â
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   207
				case 0xC3 : // Ã
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   208
				case 0xC4 : // Ä
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   209
				case 0xC5 : // Å
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   210
					output.appendChar('A');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   211
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   212
				case 0xC6 : // Æ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   213
					output.append(_T("AE"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   214
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   215
				case 0xC7 : // Ç
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   216
					output.appendChar('C');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   217
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   218
				case 0xC8 : // È
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   219
				case 0xC9 : // É
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   220
				case 0xCA : // Ê
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   221
				case 0xCB : // Ë
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   222
					output.appendChar('E');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   223
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   224
				case 0xCC : // Ì
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   225
				case 0xCD : // Í
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   226
				case 0xCE : // Î
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   227
				case 0xCF : // Ï
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   228
					output.appendChar('I');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   229
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   230
				case 0xD0 : // Ð
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   231
					output.appendChar('D');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   232
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   233
				case 0xD1 : // Ñ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   234
					output.appendChar('N');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   235
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   236
				case 0xD2 : // Ò
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   237
				case 0xD3 : // Ó
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   238
				case 0xD4 : // Ô
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   239
				case 0xD5 : // Õ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   240
				case 0xD6 : // Ö
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   241
				case 0xD8 : // Ø
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   242
					output.appendChar('O');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   243
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   244
				case 0xDE : // Þ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   245
					output.append(_T("TH"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   246
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   247
				case 0xD9 : // Ù
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   248
				case 0xDA : // Ú
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   249
				case 0xDB : // Û
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   250
				case 0xDC : // Ü
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   251
					output.appendChar('U');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   252
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   253
				case 0xDD : // Ý
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   254
					output.appendChar('Y');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   255
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   256
				case 0xE0 : // à
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   257
				case 0xE1 : // á
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   258
				case 0xE2 : // â
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   259
				case 0xE3 : // ã
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   260
				case 0xE4 : // ä
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   261
				case 0xE5 : // å
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   262
					output.appendChar('a');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   263
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   264
				case 0xE6 : // æ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   265
					output.append(_T("ae"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   266
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   267
				case 0xE7 : // ç
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   268
					output.appendChar('c');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   269
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   270
				case 0xE8 : // è
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   271
				case 0xE9 : // é
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   272
				case 0xEA : // ê
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   273
				case 0xEB : // ë
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   274
					output.appendChar('e');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   275
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   276
				case 0xEC : // ì
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   277
				case 0xED : // í
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   278
				case 0xEE : // î
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   279
				case 0xEF : // ï
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   280
					output.appendChar('i');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   281
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   282
				case 0xF0 : // ð
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   283
					output.appendChar('d');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   284
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   285
				case 0xF1 : // ñ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   286
					output.appendChar('n');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   287
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   288
				case 0xF2 : // ò
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   289
				case 0xF3 : // ó
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   290
				case 0xF4 : // ô
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   291
				case 0xF5 : // õ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   292
				case 0xF6 : // ö
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   293
				case 0xF8 : // ø
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   294
					output.appendChar('o');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   295
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   296
				case 0xDF : // ß
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   297
					output.append(_T("ss"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   298
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   299
				case 0xFE : // þ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   300
					output.append(_T("th"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   301
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   302
				case 0xF9 : // ù
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   303
				case 0xFA : // ú
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   304
				case 0xFB : // û
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   305
				case 0xFC : // ü
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   306
					output.appendChar('u');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   307
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   308
				case 0xFD : // ý
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   309
				case 0xFF : // ÿ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   310
					output.appendChar('y');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   311
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   312
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   313
				#ifdef _UCS2
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   314
				case 0x152 : // Œ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   315
					output.append(_T("OE"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   316
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   317
				case 0x153 : // œ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   318
					output.append(_T("oe"));
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   319
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   320
				case 0x178 : // Ÿ
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   321
					output.appendChar('Y');
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   322
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   323
				#endif
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   324
				default :
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   325
					output.appendChar(c);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   326
					break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   327
			}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   328
		}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   329
		token->setText(output.getBuffer());
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   330
		return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   331
	}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   332
	return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   333
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   334
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   335
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   336
TokenStream* KeywordAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   337
    return _CLNEW KeywordTokenizer(reader);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   338
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   339
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   340
KeywordTokenizer::KeywordTokenizer(CL_NS(util)::Reader* input, int bufferSize):
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   341
	Tokenizer(input)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   342
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   343
    this->done = false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   344
	if ( bufferSize < 0 )
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   345
		this->bufferSize = DEFAULT_BUFFER_SIZE;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   346
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   347
KeywordTokenizer::~KeywordTokenizer(){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   348
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   349
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   350
bool KeywordTokenizer::next(Token* token){
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   351
    if (!done) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   352
      done = true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   353
	  int32_t rd;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   354
	  const TCHAR* buffer=0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   355
      while (true) {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   356
        rd = input->read(buffer, bufferSize);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   357
        if (rd == -1) 
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   358
			break;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   359
		token->growBuffer(token->_termTextLen +rd+1);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   360
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   361
		int32_t cp = rd;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   362
		if ( token->_termTextLen + cp > token->bufferLength() )
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   363
			cp = token->bufferLength() -  token->_termTextLen;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   364
		_tcsncpy(token->_termText+token->_termTextLen,buffer,cp);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   365
		token->_termTextLen+=rd;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   366
      }
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   367
	  token->_termText[token->_termTextLen]=0;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   368
	  token->set(token->_termText,0,token->_termTextLen);
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   369
	  return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   370
    }
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   371
    return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   372
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   373
24
hgs
parents: 0
diff changeset
   374
TokenStream* PhoneNumberAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader){
hgs
parents: 0
diff changeset
   375
    TokenStream* ret = _CLNEW PhoneNumberTokenizer(reader);
hgs
parents: 0
diff changeset
   376
    return ret;
hgs
parents: 0
diff changeset
   377
    
hgs
parents: 0
diff changeset
   378
}
hgs
parents: 0
diff changeset
   379
hgs
parents: 0
diff changeset
   380
PhoneNumberTokenizer::PhoneNumberTokenizer(CL_NS(util)::Reader* input):
hgs
parents: 0
diff changeset
   381
    Tokenizer(input)
hgs
parents: 0
diff changeset
   382
{
hgs
parents: 0
diff changeset
   383
      termLen = input->read(termText);
hgs
parents: 0
diff changeset
   384
      
hgs
parents: 0
diff changeset
   385
      tokenLen = 0;
hgs
parents: 0
diff changeset
   386
}
hgs
parents: 0
diff changeset
   387
  bool PhoneNumberTokenizer::next(Token* token) {
hgs
parents: 0
diff changeset
   388
      
hgs
parents: 0
diff changeset
   389
         if(tokenLen < termLen)     
hgs
parents: 0
diff changeset
   390
         {
hgs
parents: 0
diff changeset
   391
            token->set(termText+tokenLen,0,termLen-tokenLen);
hgs
parents: 0
diff changeset
   392
            tokenLen ++;
hgs
parents: 0
diff changeset
   393
            return true;
hgs
parents: 0
diff changeset
   394
         }
hgs
parents: 0
diff changeset
   395
hgs
parents: 0
diff changeset
   396
      return false;
hgs
parents: 0
diff changeset
   397
      
hgs
parents: 0
diff changeset
   398
 }
hgs
parents: 0
diff changeset
   399
0
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   400
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   401
LengthFilter::LengthFilter(TokenStream* in, int _min, int _max):
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   402
    TokenFilter(in)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   403
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   404
    this->_min = _min;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   405
    this->_max = _max;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   406
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   407
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   408
LengthFilter::LengthFilter(TokenStream* in, bool deleteTs, int _min, int _max):
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   409
    TokenFilter(in, deleteTs)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   410
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   411
    this->_min = _min;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   412
    this->_max = _max;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   413
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   414
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   415
bool LengthFilter::next(Token* token)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   416
{
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   417
    // return the first non-stop word found
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   418
    while ( input->next(token) )
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   419
    {
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   420
        size_t len = token->termTextLength();
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   421
        if (len >= _min && len <= _max)
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   422
            return true;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   423
        // note: else we ignore it but should we index each part of it?
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   424
    }
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   425
    // reached EOS -- return null
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   426
    return false;
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   427
}
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   428
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   429
671dee74050a Revision: 201011
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff changeset
   430
CL_NS_END