searchengine/oss/loc/analysis/src/prefixfilter.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/

#include "prefixfilter.h"

#include "Clucene.h"

namespace analysis {

    using namespace lucene::analysis;
    using namespace lucene::util;

    const wchar_t* HebrewPrefixes[] = {
       L"\x05d0",  // aleph
       L"\x05d1",  // bet
       L"\x05d4",  // he
       L"\x05d5",  // vav
       L"\x05d9",  // yud
       L"\x05db",  // kaf
       L"\x05dc",  // lamed
       L"\x05de",  // mem
       L"\x05e0",  // nun
       L"\x05e9",  // shin
       L"\x05ea",  // tav
       NULL
    };

    PrefixFilter::PrefixFilter(TokenStream* input, bool deleteTs, const wchar_t** prefixes)
    : TokenFilter(input, deleteTs), prefixes_(prefixes), token_(), prefixFound_( false ) {}

    bool PrefixFilter::next(Token* token) {
		// 1. Either use reduced form of last token or fetch a new token
		if ( prefixFound_ ) {
			token->set(token_.termText(),
					   token_.startOffset(),
					   token_.endOffset(),
					   token_.type());
			token->setPositionIncrement(0);
			prefixFound_ = false;
		} else if (!input->next( token )) {
			// no token found
			return false;
		}

		// 2. Try to match prefixes with the token
		const wchar_t* text = token->_termText;
		int sz = 0;
		for (int i = 0; prefixes_[i] && !sz; i++) {
			for (; prefixes_[i][sz]; sz++) {
				if (text[sz] != prefixes_[i][sz]) {
					sz = 0;
					break;
				}
			}
		}
		// 3. If prefix found, cut prefix and store cut form of token,
		//    if cut form is non-empty
		if ( sz && token->termTextLength() > sz) {
			token_.set(token->termText() + sz,
					   token->startOffset(),
					   token->endOffset(),
					   token->type());
			prefixFound_ = true;
		}
		return true;
    }
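
    // Illustrative sketch (not part of the original file): driving PrefixFilter
    // directly over a one-word stream. StringReader, StandardTokenizer and Token
    // are the CLucene types already used in this file; the guard macro and the
    // function name are invented for this example.
#ifdef ANALYSIS_EXAMPLES
    inline void PrefixFilterExample() {
        StringReader reader( L"\x05d4\x05d1\x05d9\x05ea" ); // "habayit", "the house"
        PrefixFilter filter(
            new lucene::analysis::standard::StandardTokenizer( &reader ),
            true, HebrewPrefixes );
        Token t;
        while ( filter.next( &t ) ) {
            // The first call returns the full token; the second returns the form
            // with the leading he stripped, at position increment 0, so both
            // spellings are indexed at the same text position.
        }
    }
#endif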

#define APOSTROPHE1 L'\''
#define APOSTROPHE2 L'\x2019'

    const wchar_t* FrenchArticles[] = {
    	L"l",
    	L"m",
    	L"t",
    	L"qu",
    	L"n",
    	L"s",
    	L"j",
    	NULL
    };

    // This list should not be relied on.
    // Consult language experts!
    const wchar_t* ItalianArticles[] = {
    	L"l", 		// the
    	L"d", 		// of, from
    	L"un", 		// one, a
    	L"dell", 	// of the
    	L"all",
    	L"e",
    	L"quest",
    	L"quell",
    	L"buon", 	// means good, should it be here?
    	NULL
    };

    ElisionFilter::ElisionFilter(TokenStream* input, bool deleteTs, const wchar_t** articles)
    : TokenFilter(input, deleteTs), articles_(articles) {}

    bool ElisionFilter::next(Token* token) {
    	if ( input->next(token) ) {
			wchar_t* text = token->_termText;
			// Find the first apostrophe in the term, if any.
			int i = 0;
			for (; text[i]; i++) {
				if ( text[i] == APOSTROPHE1
				  || text[i] == APOSTROPHE2 ) {
					break;
				}
			}
			if ( text[i] ) {
				for (int j = 0; articles_[j]; j++) {
					// The article must match the characters before the
					// apostrophe exactly (same characters, same length).
					int m = 0;
					while ( m < i && articles_[j][m] == text[m] ) m++;
					if ( m == i && !articles_[j][m] ) {
						// Shift the part after the apostrophe to the front.
						for (int k = 0; ; k++) {
							text[k] = text[k+i+1];
							if (!text[k+i+1]) break; // end of string
						}
						token->resetTermTextLen();
						return true;
					}
				}
			}
			return true;
    	}
    	return false;
    }
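
    // Illustrative sketch (not part of the original file): ElisionFilter applied
    // to a single elided French word. As above, the guard macro and the function
    // name are invented; the types are the ones this file already uses.
#ifdef ANALYSIS_EXAMPLES
    inline void ElisionFilterExample() {
        StringReader reader( L"l'avion" );
        ElisionFilter filter(
            new lucene::analysis::standard::StandardTokenizer( &reader ),
            true, FrenchArticles );
        Token t;
        while ( filter.next( &t ) ) {
            // The tokenizer should keep "l'avion" together as one token; the
            // filter then removes the elided article and the apostrophe,
            // leaving "avion".
        }
    }
#endif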

	using namespace lucene::analysis::standard;

	TokenStream* HebrewAnalyzer::tokenStream(const wchar_t* fieldName, Reader* reader) {
		auto_ptr<TokenStream> ret(  new StandardTokenizer(reader) );

		ret.reset( new LowerCaseFilter( ret.release(), true ) );
		ret.reset( new StandardFilter( ret.release(), true ) );
		ret.reset( new PrefixFilter( ret.release(), true, HebrewPrefixes ) );

		return ret.release();
	}

	TokenStream* HebrewQueryAnalyzer::tokenStream(const wchar_t* fieldName, Reader* reader) {
		auto_ptr<TokenStream> ret(  new StandardTokenizer(reader) );

		ret.reset( new LowerCaseFilter( ret.release(), true ) );
		ret.reset( new StandardFilter( ret.release(), true ) );

		return ret.release();
	}

	TokenStream* FrenchAnalyzer::tokenStream(const wchar_t* fieldName, Reader* reader) {
		auto_ptr<TokenStream> ret(  new StandardTokenizer(reader) );

		ret.reset( new LowerCaseFilter( ret.release(), true ) );
		ret.reset( new StandardFilter( ret.release(), true ) );
		ret.reset( new ElisionFilter( ret.release(), true, FrenchArticles ) );

		return ret.release();
	}
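
	// Illustrative sketch (not part of the original file): consuming one of the
	// analyzers above. The field name is arbitrary, and FrenchAnalyzer is assumed
	// to be default-constructible as declared in prefixfilter.h.
#ifdef ANALYSIS_EXAMPLES
	inline void FrenchAnalyzerExample() {
		FrenchAnalyzer analyzer;
		StringReader reader( L"l'avion" );
		auto_ptr<TokenStream> stream( analyzer.tokenStream( L"contents", &reader ) );
		Token t;
		while ( stream->next( &t ) ) {
			// With the tokenizer, lower-casing, standard and elision filters
			// chained as above, the single term produced should be "avion".
		}
	}
#endif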

	const TCHAR* NonEnglishStopWords::FRENCH_STOP_WORDS[] = {
	  _T("a"), _T("afin"), _T("ai"), _T("ainsi"), _T("après"), _T("attendu"), _T("au"), _T("aujourd"),
	  _T("auquel"), _T("aussi"), _T("autre"), _T("autres"), _T("aux"), _T("auxquelles"), _T("auxquels"),
	  _T("avait"), _T("avant"), _T("avec"), _T("avoir"), _T("c"), _T("car"), _T("ce"), _T("ceci"), _T("cela"), _T("celle"), _T("celles"), _T("celui"), _T("cependant"), _T("certain"),
	  _T("certaine"), _T("certaines"), _T("certains"), _T("ces"), _T("cet"), _T("cette"), _T("ceux"), _T("chez"), _T("ci"),
	  _T("combien"), _T("comme"), _T("comment"), _T("concernant"), _T("contre"), _T("d"), _T("dans"), _T("de"), _T("debout"),
	  _T("dedans"), _T("dehors"), _T("delà"), _T("depuis"), _T("derrière"), _T("des"), _T("désormais"), _T("desquelles"),
	  _T("desquels"), _T("dessous"), _T("dessus"), _T("devant"), _T("devers"), _T("devra"), _T("divers"), _T("diverse"),
	  _T("diverses"), _T("doit"), _T("donc"), _T("dont"), _T("du"), _T("duquel"), _T("durant"), _T("dès"), _T("elle"), _T("elles"),
	  _T("en"), _T("entre"), _T("environ"), _T("est"), _T("et"), _T("etc"), _T("etre"), _T("eu"), _T("eux"), _T("excepté"), _T("hormis"),
	  _T("hors"), _T("hélas"), _T("hui"), _T("il"), _T("ils"), _T("j"), _T("je"), _T("jusqu"), _T("jusque"), _T("l"), _T("la"), _T("laquelle"),
	  _T("le"), _T("lequel"), _T("les"), _T("lesquelles"), _T("lesquels"), _T("leur"), _T("leurs"), _T("lorsque"), _T("lui"), _T("là"),
	  _T("ma"), _T("mais"), _T("malgré"), _T("me"), _T("merci"), _T("mes"), _T("mien"), _T("mienne"), _T("miennes"), _T("miens"), _T("moi"),
	  _T("moins"), _T("mon"), _T("moyennant"), _T("même"), _T("mêmes"), _T("n"), _T("ne"), _T("ni"), _T("non"), _T("nos"), _T("notre"),
	  _T("nous"), _T("néanmoins"), _T("nôtre"), _T("nôtres"), _T("on"), _T("ont"), _T("ou"), _T("outre"), _T("où"), _T("par"), _T("parmi"),
	  _T("partant"), _T("pas"), _T("passé"), _T("pendant"), _T("plein"), _T("plus"), _T("plusieurs"), _T("pour"), _T("pourquoi"),
	  _T("proche"), _T("près"), _T("puisque"), _T("qu"), _T("quand"), _T("que"), _T("quel"), _T("quelle"), _T("quelles"), _T("quels"),
	  _T("qui"), _T("quoi"), _T("quoique"), _T("revoici"), _T("revoilà"), _T("s"), _T("sa"), _T("sans"), _T("sauf"), _T("se"), _T("selon"),
	  _T("seront"), _T("ses"), _T("si"), _T("sien"), _T("sienne"), _T("siennes"), _T("siens"), _T("sinon"), _T("soi"), _T("soit"),
	  _T("son"), _T("sont"), _T("sous"), _T("suivant"), _T("sur"), _T("ta"), _T("te"), _T("tes"), _T("tien"), _T("tienne"), _T("tiennes"),
	  _T("tiens"), _T("toi"), _T("ton"), _T("tous"), _T("tout"), _T("toute"), _T("toutes"), _T("tu"), _T("un"), _T("une"), _T("va"), _T("vers"),
	  _T("voici"), _T("voilà"), _T("vos"), _T("votre"), _T("vous"), _T("vu"), _T("vôtre"), _T("vôtres"), _T("y"), _T("à"), _T("ça"), _T("ès"),
	  _T("été"), _T("être"), _T("ô"), NULL
	};

	const TCHAR* NonEnglishStopWords::BRAZILIAN_STOP_WORDS[] = {
   _T("a"),_T("ainda"),_T("alem"), _T("ambas"), _T("ambos"), _T("antes"),
   _T("ao"), _T("aonde"), _T("aos"), _T("apos"), _T("aquele"), _T("aqueles"),
   _T("as"), _T("assim"), _T("com"), _T("como"), _T("contra"), _T("contudo"),
   _T("cuja"), _T("cujas"), _T("cujo"), _T("cujos"), _T("da"), _T("das"), _T("de"),
   _T("dela"), _T("dele"), _T("deles"), _T("demais"), _T("depois"), _T("desde"),
   _T("desta"), _T("deste"), _T("dispoe"), _T("dispoem"), _T("diversa"),
   _T("diversas"), _T("diversos"), _T("do"), _T("dos"), _T("durante"), _T("e"),
   _T("ela"), _T("elas"), _T("ele"), _T("eles"), _T("em"), _T("entao"), _T("entre"),
   _T("essa"), _T("essas"), _T("esse"), _T("esses"), _T("esta"), _T("estas"),
   _T("este"), _T("estes"), _T("ha"), _T("isso"), _T("isto"), _T("logo"), _T("mais"),
   _T("mas"), _T("mediante"), _T("menos"), _T("mesma"), _T("mesmas"), _T("mesmo"),
   _T("mesmos"), _T("na"), _T("nas"), _T("nao"), _T("nas"), _T("nem"), _T("nesse"), _T("neste"),
   _T("nos"), _T("o"), _T("os"), _T("ou"), _T("outra"), _T("outras"), _T("outro"), _T("outros"),
   _T("pelas"), _T("pelas"), _T("pelo"), _T("pelos"), _T("perante"), _T("pois"), _T("por"),
   _T("porque"), _T("portanto"), _T("proprio"), _T("propios"), _T("quais"), _T("qual"),
   _T("qualquer"), _T("quando"), _T("quanto"), _T("que"), _T("quem"), _T("quer"),_T("se"),
   _T("seja"), _T("sem"), _T("sendo"), _T("seu"), _T("seus"), _T("sob"), _T("sobre"), _T("sua"),
   _T("suas"), _T("tal"), _T("tambem"), _T("teu"), _T("teus"), _T("toda"), _T("todas"), _T("todo"),
   _T("todos"), _T("tua"), _T("tuas"), _T("tudo"), _T("um"), _T("uma"), _T("umas"), _T("uns"),
   NULL
	};

   const TCHAR* NonEnglishStopWords::CZECH_STOP_WORDS[] = {
    _T("a"),_T("s"),_T("k"),_T("o"),_T("i"),_T("u"),_T("v"), _T("z"),_T("dnes"),
    _T("cz"),_T("t\u00edmto"),_T("bude\u0161"),_T("budem"), _T("byli"), _T("jse\u0161"), _T("m\u016fj"), _T("sv\u00fdm"), _T("ta"), _T("tomto"), _T("tohle"), _T("tuto"), _T("tyto"),
    _T("jej"), _T("zda"), _T("pro\u010d"), _T("m\u00e1te"), _T("tato"), _T("kam"), _T("tohoto"), _T("kdo"), _T("kte\u0159\u00ed"),
    _T("mi"), _T("n\u00e1m"), _T("tom"), _T("tomuto"), _T("m\u00edt"), _T("nic"), _T("proto"), _T("kterou"), _T("byla"),
    _T("toho"), _T("proto\u017ee"), _T("asi"), _T("ho"), _T("na\u0161i"), _T("napi\u0161te"), _T("re"), _T("co\u017e"), _T("t\u00edm"),
    _T("tak\u017ee"), _T("sv\u00fdch"), _T("jej\u00ed"), _T("sv\u00fdmi"), _T("jste"), _T("aj"), _T("tu"), _T("tedy"), _T("teto"),
    _T("bylo"), _T("kde"), _T("ke"), _T("prav\u00e9"), _T("ji"), _T("nad"), _T("nejsou"), _T("\u010di"), _T("pod"), _T("t\u00e9ma"),
    _T("mezi"), _T("p\u0159es"), _T("ty"), _T("pak"), _T("v\u00e1m"), _T("ani"), _T("kdy\u017e"), _T("v\u0161ak"), _T("neg"), _T("jsem"),
    _T("tento"), _T("\u010dl\u00e1nku"), _T("\u010dl\u00e1nky"), _T("aby"), _T("jsme"), _T("p\u0159ed"), _T("pta"), _T("jejich"),
    _T("byl"), _T("je\u0161t\u011b"), _T("a\u017e"), _T("bez"), _T("tak\u00e9"), _T("pouze"), _T("prvn\u00ed"), _T("va\u0161e"), _T("kter\u00e1"),
    _T("n\u00e1s"), _T("nov\u00fd"), _T("tipy"), _T("pokud"), _T("m\u016f\u017ee"), _T("strana"), _T("jeho"), _T("sv\u00e9"), _T("jin\u00e9"),
    _T("zpr\u00e1vy"), _T("nov\u00e9"), _T("nen\u00ed"), _T("v\u00e1s"), _T("jen"), _T("podle"), _T("zde"), _T("u\u017e"), _T("b\u00fdt"), _T("v\u00edce"),
    _T("bude"), _T("ji\u017e"), _T("ne\u017e"), _T("kter\u00fd"), _T("by"), _T("kter\u00e9"), _T("co"), _T("nebo"), _T("ten"), _T("tak"),
    _T("m\u00e1"), _T("p\u0159i"), _T("od"), _T("po"), _T("jsou"), _T("jak"), _T("dal\u0161\u00ed"), _T("ale"), _T("si"), _T("se"), _T("ve"),
    _T("to"), _T("jako"), _T("za"), _T("zp\u011bt"), _T("ze"), _T("do"), _T("pro"), _T("je"), _T("na"), _T("atd"), _T("atp"),
    _T("jakmile"), _T("p\u0159i\u010dem\u017e"), _T("j\u00e1"), _T("on"), _T("ona"), _T("ono"), _T("oni"), _T("ony"), _T("my"), _T("vy"),
    _T( "j\u00ed"), _T("ji"), _T("m\u011b"), _T("mne"), _T("jemu"), _T("tomu"), _T("t\u011bm"), _T("t\u011bmu"), _T("n\u011bmu"), _T("n\u011bmu\u017e"),
    _T("jeho\u017e"), _T("j\u00ed\u017e"), _T("jeliko\u017e"), _T("je\u017e"), _T("jako\u017e"), _T("na\u010de\u017e"),
    NULL
    };

   const TCHAR* NonEnglishStopWords::GERMAN_STOP_WORDS[] = {
  _T("einer"), _T( "eine"), _T( "eines"), _T( "einem"), _T( "einen"),
  _T("der"), _T( "die"), _T( "das"), _T( "dass"), _T( "daß"),
  _T("du"), _T( "er"), _T( "sie"), _T( "es"),
  _T("was"), _T( "wer"), _T( "wie"), _T( "wir"),
  _T("und"), _T( "oder"), _T( "ohne"), _T( "mit"),
  _T("am"), _T( "im"), _T( "in"), _T( "aus"), _T( "auf"),
  _T("ist"), _T( "sein"), _T( "war"), _T( "wird"),
  _T("ihr"), _T( "ihre"), _T( "ihres"),
  _T("als"), _T( "für"), _T( "von"), _T( "mit"),
  _T("dich"), _T( "dir"), _T( "mich"), _T( "mir"),
  _T("mein"), _T( "sein"), _T( "kein"),
  _T("durch"), _T( "wegen"), _T( "wird"),
  NULL
};

   const TCHAR* NonEnglishStopWords::GREEK_STOP_WORDS[] = {
    _T( "ο"  ),  _T(  "η" ), _T( "το" ), _T( "οι" ), _T( "τα" ), _T( "του" ), _T( "τησ" ), _T( "των" ), _T( "τον" ), _T( "την" ), _T( "και"  ),
    _T( "κι" ), _T( "κ" ), _T( "ειμαι" ), _T( "εισαι" ), _T( "ειναι" ), _T( "ειμαστε" ), _T( "ειστε" ), _T( "στο" ), _T( "στον"  ),
    _T( "στη" ), _T( "στην" ), _T( "μα" ), _T( "αλλα" ), _T( "απο" ), _T( "για" ), _T( "προσ" ), _T( "με" ), _T( "σε" ), _T( "ωσ"  ),
    _T( "παρα" ), _T( "αντι" ), _T( "κατα" ), _T( "μετα" ), _T( "θα" ), _T( "να" ), _T( "δε" ), _T( "δεν" ), _T( "μη" ), _T( "μην"  ),
    _T( "επι" ), _T( "ενω" ), _T( "εαν" ), _T( "αν" ), _T( "τοτε" ), _T( "που" ), _T( "πωσ" ), _T( "ποιοσ" ), _T( "ποια" ), _T( "ποιο"  ),
    _T( "ποιοι" ), _T( "ποιεσ" ), _T( "ποιων" ), _T( "ποιουσ" ), _T( "αυτοσ" ), _T( "αυτη" ), _T( "αυτο" ), _T( "αυτοι"  ),
    _T( "αυτων" ), _T( "αυτουσ" ), _T( "αυτεσ" ), _T( "αυτα" ), _T( "εκεινοσ" ), _T( "εκεινη" ), _T( "εκεινο"  ),
    _T( "εκεινοι" ), _T( "εκεινεσ" ), _T( "εκεινα" ), _T( "εκεινων" ), _T( "εκεινουσ" ), _T( "οπωσ" ), _T( "ομωσ"  ),
    _T( "ισωσ" ), _T( "οσο" ), _T( "οτι" ),
    NULL
};

   const TCHAR* NonEnglishStopWords::DUTCH_STOP_WORDS[] = {
      _T("de"), _T("en"), _T("van"), _T("ik"), _T("te"), _T("dat"), _T("die"), _T("in"), _T("een"),
       _T("hij"), _T("het"), _T("niet"), _T("zijn"), _T("is"), _T("was"), _T("op"), _T("aan"), _T("met"), _T("als"), _T("voor"), _T("had"),
       _T("er"), _T("maar"), _T("om"), _T("hem"), _T("dan"), _T("zou"), _T("of"), _T("wat"), _T("mijn"), _T("men"), _T("dit"), _T("zo"),
       _T("door"), _T("over"), _T("ze"), _T("zich"), _T("bij"), _T("ook"), _T("tot"), _T("je"), _T("mij"), _T("uit"), _T("der"), _T("daar"),
       _T("haar"), _T("naar"), _T("heb"), _T("hoe"), _T("heeft"), _T("hebben"), _T("deze"), _T("u"), _T("want"), _T("nog"), _T("zal"),
       _T("me"), _T("zij"), _T("nu"), _T("ge"), _T("geen"), _T("omdat"), _T("iets"), _T("worden"), _T("toch"), _T("al"), _T("waren"),
       _T("veel"), _T("meer"), _T("doen"), _T("toen"), _T("moet"), _T("ben"), _T("zonder"), _T("kan"), _T("hun"), _T("dus"),
       _T("alles"), _T("onder"), _T("ja"), _T("eens"), _T("hier"), _T("wie"), _T("werd"), _T("altijd"), _T("doch"), _T("wordt"),
       _T("wezen"), _T("kunnen"), _T("ons"), _T("zelf"), _T("tegen"), _T("na"), _T("reeds"), _T("wil"), _T("kon"), _T("niets"),
       _T("uw"), _T("iemand"), _T("geweest"), _T("andere"),
      NULL
    };

   const TCHAR* NonEnglishStopWords::RUSSIAN_STOP_WORDS[] = {
  _T("а"), _T("без"), _T("более"), _T("бы"), _T("был"), _T("была"), _T("были"), _T("было"), _T("быть"), _T("в"),
   _T("вам"), _T("вас"), _T("весь"), _T("во"), _T("вот"), _T("все"), _T("всего"), _T("всех"), _T("вы"), _T("где"), _T(
   "да"), _T("даже"), _T("для"), _T("до"), _T("его"), _T("ее"), _T("ей"), _T("ею"), _T("если"), _T("есть"), _T(
   "еще"), _T("же"), _T("за"), _T("здесь"), _T("и"), _T("из"), _T("или"), _T("им"), _T("их"), _T("к"), _T("как"),
   _T("ко"), _T("когда"), _T("кто"), _T("ли"), _T("либо"), _T("мне"), _T("может"), _T("мы"), _T("на"), _T("надо"), _T(
   "наш"), _T("не"), _T("него"), _T("нее"), _T("нет"), _T("ни"), _T("них"), _T("но"), _T("ну"), _T("о"), _T("об"), _T(
   "однако"), _T("он"), _T("она"), _T("они"), _T("оно"), _T("от"), _T("очень"), _T("по"), _T("под"), _T("при"), _T(
   "с"), _T("со"), _T("так"), _T("также"), _T("такой"), _T("там"), _T("те"), _T("тем"), _T("то"), _T("того"), _T(
   "тоже"), _T("той"), _T("только"), _T("том"), _T("ты"), _T("у"), _T("уже"), _T("хотя"), _T("чего"), _T("чей"), _T(
   "чем"), _T("что"), _T("чтобы"), _T("чье"), _T("чья"), _T("эта"), _T("эти"), _T("это"), _T("я"),
  NULL
};

   const TCHAR* NonEnglishStopWords::EXTENDED_ENGLISH_STOP_WORDS[] = {
   _T("a"), _T("about"), _T("above"), _T("across"), _T("adj"), _T("after"), _T("afterwards"),
         _T("again"), _T("against"), _T("albeit"), _T("all"), _T("almost"), _T("alone"), _T("along"),
         _T("already"), _T("also"), _T("although"), _T("always"), _T("among"), _T("amongst"), _T("an"),
         _T("and"), _T("another"), _T("any"), _T("anyhow"), _T("anyone"), _T("anything"),
         _T("anywhere"), _T("are"), _T("around"), _T("as"), _T("at"), _T("be"), _T("became"), _T("because"),
         _T("become"), _T("becomes"), _T("becoming"), _T("been"), _T("before"), _T("beforehand"),
         _T("behind"), _T("being"), _T("below"), _T("beside"), _T("besides"), _T("between"),
         _T("beyond"), _T("both"), _T("but"), _T("by"), _T("can"), _T("cannot"), _T("co"), _T("could"),
         _T("down"), _T("during"), _T("each"), _T("eg"), _T("either"), _T("else"), _T("elsewhere"),
         _T("enough"), _T("etc"), _T("even"), _T("ever"), _T("every"), _T("everyone"), _T("everything"),
         _T("everywhere"), _T("except"), _T("few"), _T("first"), _T("for"), _T("former"),
         _T("formerly"), _T("from"), _T("further"), _T("had"), _T("has"), _T("have"), _T("he"), _T("hence"),
         _T("her"), _T("here"), _T("hereafter"), _T("hereby"), _T("herein"), _T("hereupon"), _T("hers"),
         _T("herself"), _T("him"), _T("himself"), _T("his"), _T("how"), _T("however"), _T("i"), _T("ie"), _T("if"),
         _T("in"), _T("inc"), _T("indeed"), _T("into"), _T("is"), _T("it"), _T("its"), _T("itself"), _T("last"),
         _T("latter"), _T("latterly"), _T("least"), _T("less"), _T("ltd"), _T("many"), _T("may"), _T("me"),
         _T("meanwhile"), _T("might"), _T("more"), _T("moreover"), _T("most"), _T("mostly"), _T("much"),
         _T("must"), _T("my"), _T("myself"), _T("namely"), _T("neither"), _T("never"),
         _T("nevertheless"), _T("next"), _T("no"), _T("nobody"), _T("none"), _T("noone"), _T("nor"),
         _T("not"), _T("nothing"), _T("now"), _T("nowhere"), _T("of"), _T("off"), _T("often"), _T("on"),
         _T("once"), _T("one"), _T("only"), _T("onto"), _T("or"), _T("other"), _T("others"), _T("otherwise"),
         _T("our"), _T("ours"), _T("ourselves"), _T("out"), _T("over"), _T("own"), _T("per"), _T("perhaps"),
         _T("rather"), _T("s"), _T("same"), _T("seem"), _T("seemed"), _T("seeming"), _T("seems"),
         _T("several"), _T("she"), _T("should"), _T("since"), _T("so"), _T("some"), _T("somehow"),
         _T("someone"), _T("something"), _T("sometime"), _T("sometimes"), _T("somewhere"),
         _T("still"), _T("such"), _T("t"), _T("than"), _T("that"), _T("the"), _T("their"), _T("them"),
         _T("themselves"), _T("then"), _T("thence"), _T("there"), _T("thereafter"), _T("thereby"),
         _T("therefor"), _T("therein"), _T("thereupon"), _T("these"), _T("they"), _T("this"),
         _T("those"), _T("though"), _T("through"), _T("throughout"), _T("thru"), _T("thus"), _T("to"),
         _T("together"), _T("too"), _T("toward"), _T("towards"), _T("under"), _T("until"), _T("up"),
         _T("upon"), _T("us"), _T("very"), _T("via"), _T("was"), _T("we"), _T("well"), _T("were"), _T("what"),
         _T("whatever"), _T("whatsoever"), _T("when"), _T("whence"), _T("whenever"),
         _T("whensoever"), _T("where"), _T("whereafter"), _T("whereas"), _T("whereat"),
         _T("whereby"), _T("wherefrom"), _T("wherein"), _T("whereinto"), _T("whereof"),
         _T("whereon"), _T("whereto"), _T("whereunto"), _T("whereupon"), _T("wherever"),
         _T("wherewith"), _T("whether"), _T("which"), _T("whichever"), _T("whichsoever"),
         _T("while"), _T("whilst"), _T("whither"), _T("who"), _T("whoever"), _T("whole"), _T("whom"),
         _T("whomever"), _T("whomsoever"), _T("whose"), _T("whosoever"), _T("why"), _T("will"),
         _T("with"), _T("within"), _T("without"), _T("would"), _T("xsubj"), _T("xcal"), _T("xauthor"),
         _T("xother"), _T("xnote"), _T("yet"), _T("you"), _T("your"), _T("yours"), _T("yourself"),
         _T("yourselves"),
         NULL
   };
}

   /*TODO:
   * The extended English stop word set can be merged into the main English stop word set.
   * Chinese, Thai, CJK and perhaps other analyzers that reuse the English stop word list
   * could be declared as
   * const TCHAR* NonEnglishStopWords::THAI_STOP_WORDS = NonEnglishStopWords::ENGLISH_STOP_WORDS
   * so that CustomAnalyzer(L"thai>stop(th)") can be used safely.
   */