8
|
1 |
/*
|
|
2 |
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
|
|
3 |
* All rights reserved.
|
|
4 |
* This component and the accompanying materials are made available
|
|
5 |
* under the terms of "Eclipse Public License v1.0"
|
|
6 |
* which accompanies this distribution, and is available
|
|
7 |
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
|
8 |
*
|
|
9 |
* Initial Contributors:
|
|
10 |
* Nokia Corporation - initial contribution.
|
|
11 |
*
|
|
12 |
* Contributors:
|
|
13 |
*
|
|
14 |
* Description:
|
|
15 |
*
|
|
16 |
*/
|
|
17 |
|
|
18 |
|
|
19 |
// system library
|
|
20 |
#include "wchar.h"
|
|
21 |
#include <string>
|
|
22 |
#include <vector>
|
|
23 |
#include <sstream>
|
|
24 |
#include <iostream>
|
|
25 |
#include <glib.h>
|
|
26 |
|
|
27 |
// clucene
|
|
28 |
#include "CLucene.h"
|
|
29 |
#include "CLucene/analysis/AnalysisHeader.h"
|
|
30 |
#include "CLucene/analysis/Analyzers.h"
|
|
31 |
|
|
32 |
// local libary
|
|
33 |
#include "thaianalysis.h"
|
|
34 |
#include "ngram.h"
|
|
35 |
#include "koreananalyzer.h"
|
|
36 |
#include "cjkanalyzer.h"
|
|
37 |
#include "cpixparsetools.h"
|
|
38 |
#include "prefixfilter.h"
|
|
39 |
|
|
40 |
// cpix internal
|
|
41 |
#include "customanalyzer.h"
|
|
42 |
#include "cpixanalyzer.h"
|
|
43 |
#include "analyzer.h"
|
|
44 |
#include "cluceneext.h"
|
|
45 |
#include "analyzerexp.h"
|
|
46 |
#include "indevicecfg.h"
|
|
47 |
#include "cpixexc.h"
|
|
48 |
#include "localization.h"
|
|
49 |
|
|
50 |
namespace Cpix {
|
|
51 |
|
|
52 |
//
|
|
53 |
// Following sections provide the glue code for connecting the
|
|
54 |
// analyzer definition syntax with analyzer, tokenizers and filter
|
|
55 |
// implementations.
|
|
56 |
//
|
|
57 |
// The glue code is template heavy with the indent of providing
|
|
58 |
// automation for associating specific keywords with specific
|
|
59 |
// analyzers, tokenizers and filters implementing corresponding
|
|
60 |
// CLucene abstractions. Additional classes are needed only if
|
|
61 |
// filters, tokenizers, etc. accept parameters.
|
|
62 |
//
|
|
63 |
// NOTE: To understand the analyzers, it is sufficient to understand
|
|
64 |
// that an analyzer transforms characters stream into specific token streams
|
|
65 |
// (e.g. character stream 'foobarmetawords' can be transformed into token
|
|
66 |
// stream 'foo', 'bar' 'meta' 'words'). Analysis consist of two main
|
|
67 |
// parts which are tokenization and filtering. Tokenization converts
|
|
68 |
// the character stream into token stream (e.g. 'FoO bAr' -> 'FoO' 'bAr')
|
|
69 |
// and filtering modifies the tokens (e.g. lowercase filtering 'FoO' ->
|
|
70 |
// 'foo', 'bAr' -> 'bar'). Analyzer as an object is responsible for
|
|
71 |
// constructing a tokenizer and a sequence of filters to perform
|
|
72 |
// these required tasks.
|
|
73 |
//
|
|
74 |
// See the documentation around TokenizerClassEntries and
|
|
75 |
// FilterClassEntries to see how implementations not taking parameters
|
|
76 |
// can be easily added.
|
|
77 |
//
|
|
78 |
|
|
79 |
using namespace Cpix::AnalyzerExp;
|
|
80 |
|
|
81 |
// Safe assumption
|
|
82 |
#define MAX_LANGCODE_LENGTH 256
|
|
83 |
|
|
84 |
/**
 * Stream factory that dispatches to a locale-specific CustomAnalyzer.
 * The analyzer is chosen from the languages reported by the Localization
 * settings; when no listed locale matches, the switch's default analyzer
 * is used instead.
 */
class LocaleSwitchStreamFactory : public TokenStreamFactory {
public:

    /**
     * Builds one CustomAnalyzer per locale case in 'sw'; 'config' is
     * forwarded to the per-locale analyzer definitions.
     */
    LocaleSwitchStreamFactory(const AnalyzerExp::LocaleSwitch& sw, const wchar_t* config);

    ~LocaleSwitchStreamFactory();

    /** Resolves the active languages and delegates to the overload below. */
    virtual lucene::analysis::TokenStream* tokenStream(const wchar_t * fieldName,
                                                       lucene::util::Reader * reader);

    /**
     * Returns the stream of the first language in 'languages' that has an
     * explicit case, or the default analyzer's stream when none matches.
     */
    lucene::analysis::TokenStream* tokenStream(std::vector<std::wstring>& languages,
                                               const wchar_t * fieldName,
                                               lucene::util::Reader * reader);

private:
    // Per-locale analyzers; values are owned and deleted in the destructor.
    std::map<std::wstring, CustomAnalyzer*> analyzers_;
    // Fallback analyzer for locales without an explicit case.
    std::auto_ptr<CustomAnalyzer> default_;
};
|
|
102 |
|
|
103 |
|
|
104 |
TokenStreamFactory::~TokenStreamFactory() {};
|
|
105 |
|
|
106 |
/**
 * Builds the locale -> analyzer dispatch map from the parsed locale
 * switch. Each case may list several locale names; all of them map to an
 * analyzer built from that case's piping definition.
 */
LocaleSwitchStreamFactory::LocaleSwitchStreamFactory(const LocaleSwitch& sw, const wchar_t* config) {
    for (int i = 0; i < sw.cases().size(); i++) {
        const Case& cs = *sw.cases()[i];
        for (int j = 0; j < cs.cases().size(); j++) {
            std::wstring c = cs.cases()[j];
            // A locale listed twice: drop the earlier analyzer to avoid leaking it.
            if (analyzers_.count(c)) delete analyzers_[c];
            analyzers_[c] = new CustomAnalyzer(cs.piping(), config);
        }
    }
    // FIX: the fallback analyzer previously ignored 'config' while every
    // locale case forwarded it; propagate it so config-dependent
    // sub-definitions behave identically on the default path.
    default_.reset(new CustomAnalyzer(sw.def(), config));
}
|
|
117 |
|
|
118 |
LocaleSwitchStreamFactory::~LocaleSwitchStreamFactory() {
    // Release the per-locale analyzers owned through raw pointers.
    std::map<std::wstring, CustomAnalyzer*>::iterator cur = analyzers_.begin();
    std::map<std::wstring, CustomAnalyzer*>::iterator last = analyzers_.end();
    while (cur != last) {
        delete cur->second;
        ++cur;
    }
}
|
|
124 |
|
|
125 |
lucene::analysis::TokenStream*
LocaleSwitchStreamFactory::tokenStream(const wchar_t * fieldName,
                                       lucene::util::Reader * reader) {
    // Ask the localization settings for the currently active languages,
    // then let the language-aware overload pick the matching analyzer.
    std::vector<std::wstring> activeLanguages(
        Localization::instance().getLanguageNames());

    return tokenStream(activeLanguages, fieldName, reader);
}
|
|
133 |
|
|
134 |
// Returns the token stream of the first language in 'languages' that has
// an explicit case analyzer; earlier entries take priority. Falls back to
// the switch's default analyzer when no language matches.
lucene::analysis::TokenStream*
LocaleSwitchStreamFactory::tokenStream(std::vector<std::wstring>& languages,
                                       const wchar_t * fieldName,
                                       lucene::util::Reader * reader) {
    for (int i = 0; i < languages.size(); i++) {
        if ( analyzers_.count(languages[i]) ) {
            return analyzers_[languages[i]]->tokenStream( fieldName, reader );
        }
    }
    // No listed locale is active: use the default analyzer.
    return default_->tokenStream( fieldName, reader );
}
|
|
145 |
|
|
146 |
/**
 * Factory for the built-in "default" analyzer keyword. An optional
 * identifier parameter selects which system analyzer backs the stream
 * (indexing, query or prefix); with no parameter the plain default
 * analyzer is used.
 */
class DefaultTokenStreamFactory : public TokenStreamFactory {
public:

    // Which system analyzer this factory delegates to.
    enum Target {
        NORMAL,
        INDEXING,
        QUERY,
        PREFIX
    };

    DefaultTokenStreamFactory(const Invokation& invokation) {
        if (invokation.params().size() == 1) {
            // Exactly one parameter: must be one of the known identifiers.
            const Identifier* id = dynamic_cast<const Identifier*>( invokation.params()[0] );
            if ( id ) {
                if ( id->id() == CPIX_ID_INDEXING ) {
                    target_ = INDEXING;
                } else if ( id->id() == CPIX_ID_QUERY ) {
                    target_ = QUERY;
                } else if ( id->id() == CPIX_ID_PREFIX ) {
                    target_ = PREFIX;
                } else {
                    THROW_CPIXEXC(L"Default analyzer does not accept %S for parameter", id->id().c_str());
                }
            } else {
                THROW_CPIXEXC(L"Default accepts only identifier as a parameter.");
            }
        } else if (invokation.params().size() > 1) {
            THROW_CPIXEXC(L"Default analyzer does not accept more than one parameter");
        } else {
            // No parameters: plain default analyzer.
            target_ = NORMAL;
        }
    }

    virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                       lucene::util::Reader * reader) {
        switch (target_) {
        case QUERY:
            return Analysis::getQueryAnalyzer().tokenStream( fieldName, reader );
        case PREFIX:
            return Analysis::getPrefixAnalyzer().tokenStream( fieldName, reader );
        }
        // NORMAL and INDEXING both use the default analyzer.
        return Analysis::getDefaultAnalyzer().tokenStream( fieldName, reader );
    }

private:

    Target target_;

};
|
|
195 |
|
|
196 |
/**
|
|
197 |
* Template class used to create CLucene tokenizers. Template
|
|
198 |
* parameter T must implement lucene::analysis::Tokenizer abstraction.
|
|
199 |
*/
|
|
200 |
template<class T>
|
|
201 |
class TokenizerFactory : public TokenStreamFactory
|
|
202 |
{
|
|
203 |
public:
|
|
204 |
TokenizerFactory(const Invokation& invokation) {
|
|
205 |
if (invokation.params().size() > 0) {
|
|
206 |
THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
|
|
207 |
invokation.id().c_str());
|
|
208 |
}
|
|
209 |
}
|
|
210 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/,
|
|
211 |
lucene::util::Reader * reader) {
|
|
212 |
return _CLNEW T(reader);
|
|
213 |
}
|
|
214 |
};
|
|
215 |
|
|
216 |
template<>
|
|
217 |
class TokenizerFactory<analysis::CjkNGramTokenizer> : public TokenStreamFactory
|
|
218 |
{
|
|
219 |
public:
|
|
220 |
static const int DefaultNgramSize = 1;
|
|
221 |
TokenizerFactory(const Invokation& invokation) {
|
|
222 |
using namespace Cpix::AnalyzerExp;
|
|
223 |
if (invokation.params().size() > 1) {
|
|
224 |
THROW_CPIXEXC(L"Cjk Ngram tokenizer does not accept more than one parameter",
|
|
225 |
invokation.id().c_str());
|
|
226 |
}
|
|
227 |
if (invokation.params().size() == DefaultNgramSize) {
|
|
228 |
IntegerLit* ngramSize = dynamic_cast<IntegerLit*>(invokation.params()[0]);
|
|
229 |
if ( ngramSize ) {
|
|
230 |
ngramSize_ = ngramSize->value();
|
|
231 |
} else {
|
|
232 |
THROW_CPIXEXC(L"Cjk Ngram tokenizer parameter must be an integer");
|
|
233 |
}
|
|
234 |
} else {
|
|
235 |
ngramSize_ = 1;
|
|
236 |
}
|
|
237 |
}
|
|
238 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * /*fieldName*/,
|
|
239 |
lucene::util::Reader * reader) {
|
|
240 |
return _CLNEW analysis::CjkNGramTokenizer(reader, ngramSize_);
|
|
241 |
}
|
|
242 |
|
|
243 |
private:
|
|
244 |
|
|
245 |
int ngramSize_;
|
|
246 |
};
|
|
247 |
|
|
248 |
|
|
249 |
/**
|
|
250 |
* Template class wrapping CLucene analyzers. Template parameter T must
|
|
251 |
* implement lucene::analysis::Analyzer abstraction.
|
|
252 |
*/
|
|
253 |
template<class T>
|
|
254 |
class AnalyzerWrap : public TokenStreamFactory
|
|
255 |
{
|
|
256 |
public:
|
|
257 |
AnalyzerWrap(const Invokation& invokation) : analyzer_() {
|
|
258 |
if (invokation.params().size() > 0) {
|
|
259 |
THROW_CPIXEXC(L"Tokenizer %S does not accept parameters",
|
|
260 |
invokation.id().c_str());
|
|
261 |
}
|
|
262 |
}
|
|
263 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
264 |
lucene::util::Reader * reader) {
|
|
265 |
return analyzer_.tokenStream(fieldName, reader);
|
|
266 |
}
|
|
267 |
private:
|
|
268 |
T analyzer_;
|
|
269 |
};
|
|
270 |
|
|
271 |
/**
|
|
272 |
* Template class associated with CLucene filter and a TokenStreamFactory.
|
|
273 |
* Uses TokenStreamFactory to transform given character stream into tokenstream
|
|
274 |
* and then applies the given Clucene filter to the token stream.
|
|
275 |
* The template parameter T must implement lucene::analysis::Filter abstraction.
|
|
276 |
*/
|
|
277 |
template<class T>
|
|
278 |
class FilterFactory : public TokenStreamFactory
|
|
279 |
{
|
|
280 |
public:
|
|
281 |
FilterFactory(const Invokation& invokation, auto_ptr<TokenStreamFactory> factory) : factory_(factory) {
|
|
282 |
if (invokation.params().size() > 0) {
|
|
283 |
THROW_CPIXEXC(L"Filter %S does not accept parameters",
|
|
284 |
invokation.id().c_str());
|
|
285 |
}
|
|
286 |
}
|
|
287 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
288 |
lucene::util::Reader * reader) {
|
|
289 |
return _CLNEW T(factory_->tokenStream(fieldName, reader), true);
|
|
290 |
}
|
|
291 |
private:
|
|
292 |
std::auto_ptr<TokenStreamFactory> factory_;
|
|
293 |
};
|
|
294 |
|
|
295 |
/**
 * Specialized Analyzer wrap for CLucene's PerFieldAnalyzer. A specialized
 * template is needed because the per-field analyzer accepts parameters
 * (specific analyzers for different fields plus a default analyzer),
 * given here as a parsed Switch expression.
 */
template<>
class AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper> : public TokenStreamFactory {
public:
    AnalyzerWrap(const Switch& sw, const wchar_t* config) : analyzer_(0) {
        using namespace Cpt::Parser;
        using namespace lucene::analysis;

        // The wrapper takes ownership of the default CustomAnalyzer.
        analyzer_ = _CLNEW PerFieldAnalyzerWrapper(_CLNEW CustomAnalyzer(sw.def()));

        // Register one CustomAnalyzer per field name listed in each case;
        // addAnalyzer takes ownership of the created analyzer.
        for (int i = 0; i < sw.cases().size(); i++) {
            const Case& cs = *sw.cases()[i];
            for (int j = 0; j < cs.cases().size(); j++) {
                analyzer_->addAnalyzer( cs.cases()[j].c_str(), _CLNEW CustomAnalyzer( cs.piping(), config ) );
            }
        }
    }
    virtual ~AnalyzerWrap() {
        // Deletes the wrapper and, through it, all registered analyzers.
        _CLDELETE(analyzer_);
    }
    virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                       lucene::util::Reader * reader) {
        // Per-field dispatch happens inside the CLucene wrapper.
        return analyzer_->tokenStream(fieldName, reader);
    }
private:
    // Owned; released in the destructor via _CLDELETE.
    lucene::analysis::PerFieldAnalyzerWrapper* analyzer_;
};
|
|
326 |
|
|
327 |
|
|
328 |
|
|
329 |
/**
 * Specialized StopFilter factory. A specialization is needed because
 * StopFilter takes parameters: either a single language identifier
 * selecting a prepared stop-word list, or an explicit list of stop words
 * as string literals.
 */
template<>
class FilterFactory<lucene::analysis::StopFilter> : public TokenStreamFactory
{
public:
    FilterFactory(const Invokation& invokation,
                  auto_ptr<TokenStreamFactory> factory)
    :words_(0), ownWords_(0), factory_(factory) {
        using namespace Cpt::Parser;
        if (invokation.params().size() == 1 && dynamic_cast<Identifier*>(invokation.params()[0])) {
            // Single identifier: pick a prepared, statically owned list.
            Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
            //cpix_LangCode lang;
            if (id->id() == CPIX_WLANG_EN) {
                words_ = lucene::analysis::StopAnalyzer::ENGLISH_STOP_WORDS;
            } else if (id->id() == CPIX_WLANG_FR) {
                words_ = analysis::NonEnglishStopWords::FRENCH_STOP_WORDS;
            } else {
                THROW_CPIXEXC(L"No prepared stopword list for language code '%S'",
                              id->id().c_str());
            }
        } else {
            // Otherwise: copy the string-literal parameters into an owned,
            // null-terminated array of C strings.
            ownWords_ = new wchar_t*[invokation.params().size()+1];
            memset(ownWords_, 0, sizeof(wchar_t*)*(invokation.params().size()+1));
            // FIXE: args may leak
            // NOTE(review): if a non-string parameter throws below, the
            // partially filled ownWords_ leaks because the destructor is
            // not run for a half-constructed object.
            for (int i = 0; i < invokation.params().size(); i++) {
                StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
                if (lit) {
                    const wstring& str = lit->text();
                    ownWords_[i] = new wchar_t[str.length()+1];
                    wcscpy(ownWords_[i], str.c_str());
                } else {
                    THROW_CPIXEXC(L"StopFilter accepts only language identifer or list of strings as a parameters.");
                }
            }
        }

    }
    virtual ~FilterFactory() {
        // Only the explicitly copied word list is owned; prepared lists
        // (words_) are static data and must not be freed.
        if (ownWords_) {
            for (int i = 0; ownWords_[i]; i++) {
                delete[] ownWords_[i];
            }
            delete[] ownWords_;
        }
    }
    virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                       lucene::util::Reader * reader) {
        // 'true' hands ownership of the upstream stream to the filter.
        return _CLNEW lucene::analysis::StopFilter(factory_->tokenStream(fieldName, reader), true, ownWords_ ? const_cast<const wchar_t**>(ownWords_) : words_);
    }
private:
    const wchar_t **words_;      // prepared (static) stop-word list, not owned
    wchar_t **ownWords_; // owned
    std::auto_ptr<TokenStreamFactory> factory_;
};
|
|
386 |
|
|
387 |
/**
|
|
388 |
* Specialized SnowballFilter factory is needed, because SnowballFilter
|
|
389 |
* accepts parameters (the language).
|
|
390 |
*/
|
|
391 |
template<>
|
|
392 |
class FilterFactory<lucene::analysis::SnowballFilter> : public TokenStreamFactory
|
|
393 |
{
|
|
394 |
public:
|
|
395 |
FilterFactory(const Invokation& invokation,
|
|
396 |
auto_ptr<TokenStreamFactory> factory)
|
|
397 |
: factory_(factory) {
|
|
398 |
using namespace Cpt::Parser;
|
|
399 |
if (invokation.params().size() != 1 || !dynamic_cast<Identifier*>(invokation.params()[0])) {
|
|
400 |
THROW_CPIXEXC(L"Snowball filter takes exactly one identifier as a parameter." );
|
|
401 |
}
|
|
402 |
Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
|
|
403 |
if (id->id() == CPIX_WLANG_EN) {
|
|
404 |
lang_ = cpix_LANG_EN;
|
|
405 |
} else {
|
|
406 |
THROW_CPIXEXC(L"Language identifier %S is not supported for stemming",
|
|
407 |
id->id().c_str());
|
|
408 |
}
|
|
409 |
}
|
|
410 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
411 |
lucene::util::Reader * reader) {
|
|
412 |
return _CLNEW lucene::analysis::SnowballFilter(factory_->tokenStream(fieldName, reader), true, lang_);
|
|
413 |
}
|
|
414 |
private:
|
|
415 |
cpix_LangCode lang_;
|
|
416 |
std::auto_ptr<TokenStreamFactory> factory_;
|
|
417 |
};
|
|
418 |
|
|
419 |
/**
|
|
420 |
* Specialized LengthFilter factory is needed, because length filter
|
|
421 |
* accepts parameters (minimum length and maximum length)
|
|
422 |
*/
|
|
423 |
template<>
|
|
424 |
class FilterFactory<lucene::analysis::LengthFilter> : public TokenStreamFactory
|
|
425 |
{
|
|
426 |
public:
|
|
427 |
FilterFactory(const Invokation& invokation,
|
|
428 |
auto_ptr<TokenStreamFactory> factory)
|
|
429 |
: factory_(factory) {
|
|
430 |
using namespace Cpt::Parser;
|
|
431 |
if (invokation.params().size() != 2 ||
|
|
432 |
!dynamic_cast<IntegerLit*>(invokation.params()[0]) ||
|
|
433 |
!dynamic_cast<IntegerLit*>(invokation.params()[1])) {
|
|
434 |
THROW_CPIXEXC("Length filter takes exactly two integer parameters");
|
|
435 |
}
|
|
436 |
min_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
|
|
437 |
max_ = dynamic_cast<IntegerLit*>(invokation.params()[1])->value();
|
|
438 |
}
|
|
439 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
440 |
lucene::util::Reader * reader) {
|
|
441 |
return _CLNEW lucene::analysis::LengthFilter(factory_->tokenStream(fieldName, reader), true, min_, max_ );
|
|
442 |
}
|
|
443 |
private:
|
|
444 |
int min_, max_;
|
|
445 |
std::auto_ptr<TokenStreamFactory> factory_;
|
|
446 |
};
|
|
447 |
|
|
448 |
/**
|
|
449 |
* Specialized PrefixGenerator factory is needed, because PrefixGenerator
|
|
450 |
* requires the max prefix size.
|
|
451 |
*/
|
|
452 |
template<>
|
|
453 |
class FilterFactory<PrefixGenerator> : public TokenStreamFactory
|
|
454 |
{
|
|
455 |
public:
|
|
456 |
FilterFactory(const Invokation& invokation,
|
|
457 |
auto_ptr<TokenStreamFactory> factory)
|
|
458 |
: factory_(factory) {
|
|
459 |
using namespace Cpt::Parser;
|
|
460 |
if (invokation.params().size() != 1 ||
|
|
461 |
!dynamic_cast<IntegerLit*>(invokation.params()[0])) {
|
|
462 |
THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
|
|
463 |
}
|
|
464 |
maxPrefixLength_ = dynamic_cast<IntegerLit*>(invokation.params()[0])->value();
|
|
465 |
}
|
|
466 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
467 |
lucene::util::Reader * reader) {
|
|
468 |
return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), true, maxPrefixLength_ );
|
|
469 |
}
|
|
470 |
private:
|
|
471 |
int maxPrefixLength_;
|
|
472 |
std::auto_ptr<TokenStreamFactory> factory_;
|
|
473 |
};
|
|
474 |
|
|
475 |
/**
 * Specialized PrefixFilter factory. A specialization is needed because
 * the prefix filter accepts parameters: either a single language
 * identifier selecting a prepared prefix list, or an explicit list of
 * prefixes as string literals.
 */
template<>
class FilterFactory<analysis::PrefixFilter> : public TokenStreamFactory
{
public:
    FilterFactory(const Invokation& invokation,
                  auto_ptr<TokenStreamFactory> factory)
        : prefixes_(0), ownPrefixes_(0), factory_(factory) {
        using namespace Cpt::Parser;
        if (invokation.params().size() == 1 &&
            dynamic_cast<Identifier*>(invokation.params()[0])) {
            // Single identifier: pick a prepared, statically owned list.
            Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
            //cpix_LangCode lang;
            if (id->id() == CPIX_WLANG_HE) {
                prefixes_ = analysis::HebrewPrefixes;
            } else {
                THROW_CPIXEXC(L"No prepared prefix list for language code '%S'",
                              id->id().c_str());
            }
        } else {
            // Otherwise: copy the string-literal parameters into an owned,
            // null-terminated array of C strings.
            ownPrefixes_ = new wchar_t*[invokation.params().size()+1];
            memset(ownPrefixes_, 0, sizeof(wchar_t*)*(invokation.params().size()+1));
            // FIXE: args may leak
            // NOTE(review): if a non-string parameter throws below, the
            // partially filled ownPrefixes_ leaks because the destructor
            // is not run for a half-constructed object.
            for (int i = 0; i < invokation.params().size(); i++) {
                StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
                if (lit) {
                    const wstring& str = lit->text();
                    ownPrefixes_[i] = new wchar_t[str.length()+1];
                    wcscpy(ownPrefixes_[i], str.c_str());
                } else {
                    THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters.");
                }
            }
        }
    }
    virtual ~FilterFactory() {
        // Only the explicitly copied list is owned; prepared lists
        // (prefixes_) are static data and must not be freed.
        if (ownPrefixes_) {
            for (int i = 0; ownPrefixes_[i]; i++) {
                delete[] ownPrefixes_[i];
            }
            delete[] ownPrefixes_;
        }
    }
    virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
                                                       lucene::util::Reader * reader) {
        // 'true' hands ownership of the upstream stream to the filter.
        return _CLNEW analysis::PrefixFilter(factory_->tokenStream(fieldName, reader), true, ownPrefixes_ ? const_cast<const wchar_t**>(ownPrefixes_) : prefixes_);
    }
private:
    const wchar_t **prefixes_;   // prepared (static) prefix list, not owned
    wchar_t **ownPrefixes_; // owned
    std::auto_ptr<TokenStreamFactory> factory_;
};
|
|
530 |
|
|
531 |
/**
|
|
532 |
* Specialized ElisionFilter factory is needed, because elision filter
|
|
533 |
* accepts parameters (language set or articles)
|
|
534 |
*/
|
|
535 |
template<>
|
|
536 |
class FilterFactory<analysis::ElisionFilter> : public TokenStreamFactory
|
|
537 |
{
|
|
538 |
public:
|
|
539 |
FilterFactory(const Invokation& invokation,
|
|
540 |
auto_ptr<TokenStreamFactory> factory)
|
|
541 |
: articles_(0), ownArticles_(0), factory_(factory) {
|
|
542 |
using namespace Cpt::Parser;
|
|
543 |
if (invokation.params().size() == 1 &&
|
|
544 |
dynamic_cast<Identifier*>(invokation.params()[0])) {
|
|
545 |
Identifier* id = dynamic_cast<Identifier*>(invokation.params()[0]);
|
|
546 |
//cpix_LangCode lang;
|
|
547 |
if (id->id() == CPIX_WLANG_FR) {
|
|
548 |
articles_ = analysis::FrenchArticles;
|
|
549 |
} else {
|
|
550 |
THROW_CPIXEXC(L"No prepared article list for language code '%S'",
|
|
551 |
id->id().c_str());
|
|
552 |
}
|
|
553 |
} else {
|
|
554 |
ownArticles_ = new wchar_t*[invokation.params().size()+1];
|
|
555 |
memset(ownArticles_, 0, sizeof(wchar_t*)*(invokation.params().size()+1));
|
|
556 |
// FIXE: args may leak
|
|
557 |
for (int i = 0; i < invokation.params().size(); i++) {
|
|
558 |
StringLit* lit = dynamic_cast<StringLit*>(invokation.params()[i]);
|
|
559 |
if (lit) {
|
|
560 |
const wstring& str = lit->text();
|
|
561 |
ownArticles_[i] = new wchar_t[str.length()+1];
|
|
562 |
wcscpy(ownArticles_[i], str.c_str());
|
|
563 |
} else {
|
|
564 |
THROW_CPIXEXC(L"PrefixFilter accepts only language identifer or list of strings as a parameters.");
|
|
565 |
}
|
|
566 |
}
|
|
567 |
}
|
|
568 |
}
|
|
569 |
virtual ~FilterFactory() {
|
|
570 |
if (ownArticles_) {
|
|
571 |
for (int i = 0; ownArticles_[i]; i++) {
|
|
572 |
delete[] ownArticles_[i];
|
|
573 |
}
|
|
574 |
delete[] ownArticles_;
|
|
575 |
}
|
|
576 |
}
|
|
577 |
virtual lucene::analysis::TokenStream* tokenStream(const TCHAR * fieldName,
|
|
578 |
lucene::util::Reader * reader) {
|
|
579 |
return _CLNEW analysis::ElisionFilter(factory_->tokenStream(fieldName, reader), true, ownArticles_ ? const_cast<const wchar_t**>(ownArticles_) : articles_);
|
|
580 |
}
|
|
581 |
private:
|
|
582 |
const wchar_t **articles_;
|
|
583 |
wchar_t **ownArticles_; // owned
|
|
584 |
std::auto_ptr<TokenStreamFactory> factory_;
|
|
585 |
};
|
|
586 |
|
|
587 |
// Factory-function signatures stored in the dispatch tables below: each
// builds a TokenStreamFactory from a parsed invokation; filter creators
// additionally take ownership of the upstream factory they wrap.
typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation,
                                                             auto_ptr<TokenStreamFactory> factory);
|
|
590 |
|
|
591 |
template<class T>
|
|
592 |
struct TokenStreamFactoryCtor
|
|
593 |
{
|
|
594 |
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
|
|
595 |
return auto_ptr<TokenStreamFactory>(new T(invokation));
|
|
596 |
}
|
|
597 |
};
|
|
598 |
|
|
599 |
/**
|
|
600 |
* Sets up a tokenizer factory with given invokation parameters
|
|
601 |
*/
|
|
602 |
template<class T>
|
|
603 |
struct TokenizerFactoryCtor
|
|
604 |
{
|
|
605 |
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
|
|
606 |
return auto_ptr<TokenStreamFactory>(new TokenizerFactory<T>(invokation));
|
|
607 |
}
|
|
608 |
};
|
|
609 |
|
|
610 |
/**
|
|
611 |
* Sets up an analyzer wrap with given invokation parameters
|
|
612 |
*/
|
|
613 |
template<class T>
|
|
614 |
struct AnalyzerWrapCtor
|
|
615 |
{
|
|
616 |
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation) {
|
|
617 |
return auto_ptr<TokenStreamFactory>(new AnalyzerWrap<T>(invokation));
|
|
618 |
}
|
|
619 |
};
|
|
620 |
|
|
621 |
/**
|
|
622 |
* Sets up a filter factory with given invokation parameters
|
|
623 |
*/
|
|
624 |
template<class T>
|
|
625 |
struct FilterFactoryCtor
|
|
626 |
{
|
|
627 |
static auto_ptr<TokenStreamFactory> create(const Invokation& invokation,
|
|
628 |
auto_ptr<TokenStreamFactory> factory) {
|
|
629 |
return auto_ptr<TokenStreamFactory>(new FilterFactory<T>(invokation, factory));
|
|
630 |
}
|
|
631 |
};
|
|
632 |
|
|
633 |
// One row of the tokenizer dispatch table: maps a definition-syntax
// keyword onto the factory function that instantiates its implementation.
struct TokenizerClassEntry {
    const wchar_t *id_;                     // keyword; 0 terminates the table
    TokenizerFactoryCreator createFactory_;
};
|
|
637 |
|
|
638 |
//
|
|
639 |
// Following TokenizerClassEntries and FilterClassEntries contain
|
|
640 |
// the mapping from tokenizer/analyzer/filter names into glue code
|
|
641 |
// templates providing the implementations.
|
|
642 |
//
|
|
643 |
|
|
644 |
TokenizerClassEntry TokenizerClassEntries[] = {
|
|
645 |
{CPIX_TOKENIZER_STANDARD, TokenizerFactoryCtor<lucene::analysis::standard::StandardTokenizer>::create},
|
|
646 |
{CPIX_TOKENIZER_WHITESPACE, TokenizerFactoryCtor<lucene::analysis::WhitespaceTokenizer>::create},
|
|
647 |
{CPIX_TOKENIZER_LETTER, TokenizerFactoryCtor<lucene::analysis::LetterTokenizer>::create},
|
|
648 |
{CPIX_TOKENIZER_KEYWORD, TokenizerFactoryCtor<lucene::analysis::KeywordTokenizer>::create},
|
|
649 |
{CPIX_TOKENIZER_CJK, TokenizerFactoryCtor<lucene::analysis::cjk::CJKTokenizer>::create},
|
|
650 |
{CPIX_TOKENIZER_NGRAM, TokenizerFactoryCtor<analysis::CjkNGramTokenizer>::create},
|
|
651 |
{CPIX_TOKENIZER_KOREAN, TokenizerFactoryCtor<analysis::KoreanTokenizer>::create},
|
|
652 |
{CPIX_TOKENIZER_KOREAN_QUERY,TokenizerFactoryCtor<analysis::KoreanQueryTokenizer>::create},
|
|
653 |
|
|
654 |
{CPIX_ANALYZER_STANDARD, AnalyzerWrapCtor<lucene::analysis::standard::StandardAnalyzer>::create},
|
19
|
655 |
{CPIX_ANALYZER_PHONENUMBER, AnalyzerWrapCtor<lucene::analysis::PhoneNumberAnalyzer>::create},
|
8
|
656 |
{CPIX_ANALYZER_DEFAULT, TokenStreamFactoryCtor<DefaultTokenStreamFactory>::create},
|
|
657 |
|
|
658 |
// TODO: Add more Tokenizers/Analyzers
|
|
659 |
|
|
660 |
// Example tokenizer (works as such if tokenizers don't take parameters)
|
|
661 |
// {CPIX_TOKENIZER_MYTOKENIZER,TokenizerFactoryCtor<MyTokenizer>::create},
|
|
662 |
|
|
663 |
// Example analyzer (works as such if analyzer don't take parameters)
|
|
664 |
// {CPIX_ANALYZER_MYANALYZER, AnalyzerWrapCtor<MyAnalyzer>::create},
|
|
665 |
|
|
666 |
{0, 0}
|
|
667 |
};
|
|
668 |
|
|
669 |
// One row of the filter dispatch table: maps a definition-syntax keyword
// onto the factory function that instantiates the filter implementation.
struct FilterClassEntry {
    const wchar_t *id_;                  // keyword; 0 terminates the table
    FilterFactoryCreator createFactory_;
};
|
|
673 |
|
|
674 |
FilterClassEntry FilterClassEntries[] = {
|
|
675 |
{CPIX_FILTER_STANDARD, FilterFactoryCtor<lucene::analysis::standard::StandardFilter>::create},
|
|
676 |
{CPIX_FILTER_LOWERCASE, FilterFactoryCtor<lucene::analysis::LowerCaseFilter>::create},
|
|
677 |
{CPIX_FILTER_ACCENT, FilterFactoryCtor<lucene::analysis::ISOLatin1AccentFilter>::create},
|
|
678 |
{CPIX_FILTER_STOP, FilterFactoryCtor<lucene::analysis::StopFilter>::create},
|
|
679 |
{CPIX_FILTER_STEM, FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
|
|
680 |
{CPIX_FILTER_LENGTH, FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
|
|
681 |
{CPIX_FILTER_PREFIXES, FilterFactoryCtor<PrefixGenerator>::create},
|
|
682 |
{CPIX_FILTER_THAI, FilterFactoryCtor<analysis::ThaiWordFilter>::create},
|
|
683 |
{CPIX_FILTER_PREFIX, FilterFactoryCtor<analysis::PrefixFilter>::create},
|
|
684 |
{CPIX_FILTER_ELISION, FilterFactoryCtor<analysis::ElisionFilter>::create},
|
|
685 |
|
|
686 |
// TODO: Add more Filters
|
|
687 |
|
|
688 |
// Example filter (works as such if filter don't take parameters)
|
|
689 |
// {CPIX_FILTER_MYFILTER, FilterFactoryCtor<MyFilter>::create},
|
|
690 |
|
|
691 |
{0, 0}
|
|
692 |
};
|
|
693 |
|
|
694 |
// Parses the textual analyzer definition and builds the factory chain it
// describes; 'config' selects config-switch branches inside the definition.
CustomAnalyzer::CustomAnalyzer(const wchar_t* definition, const wchar_t* config) {
    std::auto_ptr<Piping> parsed = AnalyzerExp::ParsePiping( definition );
    setup( *parsed, config );
}
|
|
698 |
|
|
699 |
// Builds the factory chain directly from an already-parsed definition.
CustomAnalyzer::CustomAnalyzer(const Piping& definition, const wchar_t* config) {
    setup(definition, config);
}
|
|
702 |
|
|
703 |
using namespace Cpt::Parser;
|
|
704 |
|
|
705 |
// Builds the token stream factory chain for a parsed analyzer definition:
// first the tokenizer (or switch) at the head of the piping, then one
// FilterFactory wrapper per filter, applied in definition order so the
// last listed filter ends up outermost.
void CustomAnalyzer::setup(const Piping& piping, const wchar_t* config) {

    // If the first item is invokation, create corresponding analyzer/tokenizer
    if (dynamic_cast<const Invokation*>(&piping.tokenizer())) {
        const Invokation& tokenizer = dynamic_cast<const Invokation&>(piping.tokenizer());
        TokenizerClassEntry& tokenizerEntry = getTokenizerEntry( tokenizer.id() );
        factory_ = tokenizerEntry.createFactory_( tokenizer );
    } else if (dynamic_cast<const Switch*>(&piping.tokenizer())) {
        // If the first item is switch statement, create per-field analyzer
        const Switch& tokenizer = dynamic_cast<const Switch&>(piping.tokenizer());
        factory_ = new AnalyzerWrap<lucene::analysis::PerFieldAnalyzerWrapper>( tokenizer, config );
    } else if (dynamic_cast<const LocaleSwitch*>(&piping.tokenizer())) {
        // Locale switch: dispatch by the active localization language.
        const LocaleSwitch& tokenizer = dynamic_cast<const LocaleSwitch&>(piping.tokenizer());
        factory_ = new LocaleSwitchStreamFactory( tokenizer, config );
    } else if (dynamic_cast<const ConfigSwitch*>(&piping.tokenizer())) {
        // Config switch: resolved once, here, against the 'config' string.
        const ConfigSwitch& tokenizer = dynamic_cast<const ConfigSwitch&>(piping.tokenizer());
        factory_ = resolveConfigSwitch( tokenizer, config );
    } else {
        THROW_CPIXEXC(L"Analyzer definition syntax did not begin with valid tokenizer");
    }

    // Add filters
    // Each iteration wraps the current factory_, transferring ownership
    // of the previous chain into the new FilterFactory.
    const std::vector<Invokation*>& filters = piping.filters();
    for (int i = 0; i < filters.size(); i++) {
        FilterClassEntry& filterEntry = getFilterEntry( filters[i]->id() );
        factory_ = filterEntry.createFactory_( *filters[i], factory_ );
    }
}
|
|
733 |
|
|
734 |
// Resolves a config switch at setup time: returns an analyzer built from
// the first case whose label string-matches 'config', or from the
// switch's default piping when 'config' is null or unmatched.
std::auto_ptr<TokenStreamFactory> CustomAnalyzer::resolveConfigSwitch(const ConfigSwitch& csw, const wchar_t* config) {
    if (config) {
        for (int i = 0; i < csw.cases().size(); i++) {
            const Case& cs = *csw.cases()[i];
            for (int j = 0; j < cs.cases().size(); j++) {
                if (wcscmp(config, cs.cases()[j].c_str()) == 0) {
                    return std::auto_ptr<TokenStreamFactory>(
                        new CustomAnalyzer(cs.piping(), config));
                }
            }
        }
    }
    // No config given or no case matched: fall back to the default piping.
    return std::auto_ptr<TokenStreamFactory>(new CustomAnalyzer(csw.def(), config));
}
|
|
748 |
|
|
749 |
// Looks the keyword up in the null-terminated tokenizer dispatch table
// and returns the matching entry; throws for an unknown keyword.
TokenizerClassEntry& CustomAnalyzer::getTokenizerEntry(std::wstring id) {

    for (TokenizerClassEntry* entry = TokenizerClassEntries; entry->id_; ++entry) {
        if (id == std::wstring(entry->id_)) {
            return *entry;
        }
    }

    THROW_CPIXEXC(L"Unknown tokenizer '%S'.",
                  id.c_str());
}
|
|
763 |
|
|
764 |
// Looks the keyword up in the null-terminated filter dispatch table and
// returns the matching entry; throws for an unknown keyword.
FilterClassEntry& CustomAnalyzer::getFilterEntry(std::wstring id) {

    for (FilterClassEntry* entry = FilterClassEntries; entry->id_; ++entry) {
        if (id == std::wstring(entry->id_)) {
            return *entry;
        }
    }

    THROW_CPIXEXC(L"Unknown filter '%S'.",
                  id.c_str());
}
|
|
779 |
|
|
780 |
CustomAnalyzer::~CustomAnalyzer() {}
|
|
781 |
|
|
782 |
// Produces the token stream for a field by delegating to the factory
// chain that was assembled from the definition string at construction.
lucene::analysis::TokenStream* CustomAnalyzer::tokenStream(const wchar_t * fieldName,
                                                           lucene::util::Reader * reader) {
    return factory_->tokenStream(fieldName, reader);
}
|
|
790 |
|
|
791 |
std::auto_ptr<lucene::analysis::Analyzer> CreateDefaultAnalyzer()
|
|
792 |
{
|
|
793 |
return
|
|
794 |
std::auto_ptr<lucene::analysis::Analyzer>(
|
|
795 |
new SystemAnalyzer(_CLNEW lucene::analysis::standard::StandardAnalyzer()));
|
|
796 |
}
|
|
797 |
|
|
798 |
}
|