1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #include <wchar.h> |
1 #include <wchar.h> |
19 #include <stddef.h> |
2 #include <stddef.h> |
20 |
3 |
21 |
|
22 #include <iostream> |
4 #include <iostream> |
23 |
5 |
24 #include "cpixidxdb.h" |
6 #include "cpixidxdb.h" |
25 |
7 |
26 #include "itk.h" |
8 #include "itk.h" |
27 |
9 |
28 #include "config.h" |
10 #include "config.h" |
29 #include "testutils.h" |
11 #include "testutils.h" |
30 |
|
31 #include "std_log_result.h" |
|
32 |
12 |
33 // For testing custom analyzer |
13 // For testing custom analyzer |
34 #include "CLucene.h" |
14 #include "CLucene.h" |
35 #include "CLucene\analysis\AnalysisHeader.h" |
15 #include "CLucene\analysis\AnalysisHeader.h" |
36 #include "CLucene\util\stringreader.h" |
16 #include "CLucene\util\stringreader.h" |
37 #include "analyzer.h" |
|
38 #include "analyzerexp.h" |
17 #include "analyzerexp.h" |
|
18 #include "customanalyzer.h" |
|
19 |
|
20 #include "localetestinfos.h" |
|
21 |
|
22 #include "spi/locale.h" |
|
23 #include "cpixstrtools.h" |
39 |
24 |
40 using namespace Cpt::Lex; |
25 using namespace Cpt::Lex; |
41 using namespace Cpt::Parser; |
26 using namespace Cpt::Parser; |
42 using namespace Cpix::AnalyzerExp; |
27 using namespace Cpix::AnalyzerExp; |
43 |
28 |
44 void PrintToken(Cpt::Lex::Token token) { |
29 void PrintToken(Cpt::Lex::Token token) { |
45 switch (token.type()) { |
30 printf("%S('%S')", token.type(), token.text()); |
46 case TOKEN_WS: printf("space"); break; |
31 } |
47 case TOKEN_ID: printf("id"); break; |
32 |
48 case TOKEN_LIT: printf("lit"); break; |
33 |
49 case TOKEN_STRLIT: printf("str-lit"); break; |
34 void TestTokenization6(Itk::TestMgr * testMgr) |
50 case TOKEN_REALLIT: printf("real-lit"); break; |
35 { |
51 case TOKEN_INTLIT: printf("int-lit"); break; |
36 Cpix::AnalyzerExp::Tokenizer tokenizer; |
52 case TOKEN_LEFT_BRACKET: printf("lbr"); break; |
|
53 case TOKEN_RIGHT_BRACKET: printf("rbr"); break; |
|
54 case TOKEN_COMMA: printf("comma"); break; |
|
55 case TOKEN_PIPE: printf("pipe"); break; |
|
56 case TOKEN_SWITCH : printf("sw"); break; |
|
57 case TOKEN_CASE : printf("case"); break; |
|
58 case TOKEN_DEFAULT : printf("default"); break; |
|
59 case TOKEN_LEFT_BRACE : printf("lbc"); break; |
|
60 case TOKEN_RIGHT_BRACE : printf("rbc"); break; |
|
61 case TOKEN_COLON : printf("cl"); break; |
|
62 case TOKEN_TERMINATOR : printf("tr"); break; |
|
63 |
|
64 default: printf("unknown"); break; |
|
65 } |
|
66 printf("('%S')", (token.text()).c_str()); |
|
67 } |
|
68 |
|
69 |
|
70 void TestTokenization6(Itk::TestMgr * ) |
|
71 { |
|
72 char *xml_file = (char*)__FUNCTION__; |
|
73 assert_failed = 0; |
|
74 Cpix::AnalyzerExp::Tokenizer tokenizer; |
|
75 Tokens source(tokenizer, |
37 Tokens source(tokenizer, |
76 L"switch { " |
38 L"switch { " |
77 L"case '_docuid', '_mimetype': keywords;" |
39 L"case '_docuid', '_mimetype': keywords;" |
78 L"case '_baseappclass': whitespace>lowercase;" |
40 L"case '_baseappclass': whitespace>lowercase;" |
79 L"default: natural(en); " |
41 L"default: natural(en); " |
80 L"}"); |
42 L"}"); |
81 WhiteSpaceFilter |
43 StdFilter |
82 tokens(source); |
44 tokens(source); |
83 |
45 |
84 while (tokens) PrintToken(tokens++); |
46 while (tokens) PrintToken(tokens++); |
85 testResultXml(xml_file); |
47 } |
86 } |
48 |
87 |
49 void TestParsing(Itk::TestMgr* mgr) |
88 void TestParsing(Itk::TestMgr* ) |
|
89 { |
50 { |
90 Cpix::AnalyzerExp::Tokenizer tokenizer; |
51 Cpix::AnalyzerExp::Tokenizer tokenizer; |
91 char *xml_file = (char*)__FUNCTION__; |
52 |
92 assert_failed = 0; |
|
93 Tokens source(tokenizer, L"foobar(zap, foo, 'bar', 'a', raboof)"); |
53 Tokens source(tokenizer, L"foobar(zap, foo, 'bar', 'a', raboof)"); |
94 WhiteSpaceFilter tokens(source); |
54 StdFilter tokens(source); |
95 Lexer lexer(tokens); |
55 Lexer lexer(tokens); |
96 |
56 |
97 Tokens source2(tokenizer, L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin) "); |
57 const wchar_t* text = L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin) "; |
98 WhiteSpaceFilter tokens2(source2); |
|
99 Lexer lexer2(tokens2); |
|
100 |
58 |
101 Tokens source3(tokenizer, L"foobar(zap, 0, 0.0045, 4, 'a', 9223.031)"); |
59 Tokens source3(tokenizer, L"foobar(zap, 0, 0.0045, 4, 'a', 9223.031)"); |
102 WhiteSpaceFilter tokens3(source3); |
60 StdFilter tokens3(source3); |
103 Lexer lexer3(tokens3); |
61 Lexer lexer3(tokens3); |
104 |
62 |
105 try { |
63 try { |
106 auto_ptr<Invokation> invoke = ParseInvokation(lexer); |
64 auto_ptr<Invokation> invoke = ParseInvokation(lexer); |
107 lexer.eatEof(); |
65 lexer.eatEof(); |
108 printf("Invoke identifier: %S\n", (invoke->id()).c_str()); |
66 printf("Invoke identifier: %S\n", invoke->id()); |
109 printf("%d parameters\n", invoke->params().size()); |
67 printf("%d parameters\n", invoke->params().size()); |
110 auto_ptr<Piping> piping = ParsePiping(lexer2); |
68 auto_ptr<Piping> piping = ParsePiping(text); |
111 lexer2.eatEof(); |
|
112 printf("piping done.\n"); |
69 printf("piping done.\n"); |
113 if (dynamic_cast<const Invokation*>(&piping->tokenizer())) { |
70 if (dynamic_cast<const Invokation*>(&piping->tokenizer())) { |
114 printf("Tokenizer: %S\n", dynamic_cast<const Invokation&>(piping->tokenizer()).id().c_str()); |
71 printf("Tokenizer: %S\n", dynamic_cast<const Invokation&>(piping->tokenizer()).id()); |
115 } |
72 } |
116 printf("%d filters\n", piping->filters().size()); |
73 printf("%d filters\n", piping->filters().size()); |
117 invoke = ParseInvokation(lexer3); |
74 invoke = ParseInvokation(lexer3); |
118 lexer3.eatEof(); |
75 lexer3.eatEof(); |
119 printf("Invoke identifier: %S\n", (invoke->id()).c_str()); |
76 printf("Invoke identifier: %S\n", invoke->id()); |
120 printf("%d parameters\n", invoke->params().size()); |
77 printf("%d parameters\n", invoke->params().size()); |
121 } catch (ParseException& e) { |
78 } catch (ParseException& e) { |
122 assert_failed = 1; |
|
123 printf("ParseException: %S\n", e.wWhat()); |
79 printf("ParseException: %S\n", e.wWhat()); |
124 } catch (LexException& e) { |
80 } catch (LexException& e) { |
125 assert_failed = 1; |
|
126 printf("LexException: %S\n", e.wWhat()); |
81 printf("LexException: %S\n", e.wWhat()); |
127 } |
82 } |
128 testResultXml(xml_file); |
83 } |
129 } |
84 |
130 |
85 void TestSwitch(Itk::TestMgr* mgr) |
131 void TestSwitch(Itk::TestMgr* ) |
|
132 { |
86 { |
133 Cpix::AnalyzerExp::Tokenizer tokenizer; |
87 Cpix::AnalyzerExp::Tokenizer tokenizer; |
134 char *xml_file = (char*)__FUNCTION__; |
88 |
135 assert_failed = 0; |
89 const wchar_t* text = |
136 const wchar_t* text; |
|
137 Tokens source(tokenizer, text = |
|
138 L"switch { " |
90 L"switch { " |
139 L"case '_docuid', '_mimetype': keywords;" |
91 L"case '_docuid', '_mimetype': keywords;" |
140 L"case '_baseappclass': whitespace>lowercase;" |
92 L"case '_baseappclass': whitespace>lowercase;" |
141 L"default: natural(en); " |
93 L"default: natural(en); " |
142 L"}"); |
94 L"}"; |
143 WhiteSpaceFilter tokens(source); |
95 |
144 Lexer lexer(tokens); |
96 try { |
145 |
97 auto_ptr<Piping> sw = ParsePiping(text); |
146 try { |
|
147 auto_ptr<Piping> sw = ParsePiping(lexer); |
|
148 lexer.eatEof(); |
|
149 if (dynamic_cast<const Switch*>(&sw->tokenizer())) { |
98 if (dynamic_cast<const Switch*>(&sw->tokenizer())) { |
150 const Switch* s = dynamic_cast<const Switch*>(&sw->tokenizer()); |
99 const Switch* s = dynamic_cast<const Switch*>(&sw->tokenizer()); |
151 for (int i = 0; i < s->cases().size(); i++) { |
100 for (int i = 0; i < s->cases().size(); i++) { |
152 const Case* c = s->cases()[i]; |
101 const Case* c = s->cases()[i]; |
153 printf("case "); |
102 printf("case "); |
154 for (int j = 0; j < c->fields().size(); j++) { |
103 for (int j = 0; j < c->cases().size(); j++) { |
155 printf("%S", (c->fields()[j]).c_str()); |
104 printf("%S", c->cases()[j]); |
156 } |
105 } |
157 printf(": ...\n"); |
106 printf(": ...\n"); |
158 // wcout<<L":"<<s->def().tokenizer().id(); |
107 // wcout<<L":"<<s->def().tokenizer().id(); |
159 } |
108 } |
160 printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;"; |
109 printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;"; |
161 } |
110 } |
162 } catch (ParseException& e) { |
111 } catch (ParseException& e) { |
163 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
112 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
164 assert_failed = 1; |
|
165 e.setContext(text); |
113 e.setContext(text); |
166 printf("ParseException: %S\n", e.wWhat()); |
114 printf("ParseException: %S\n", e.wWhat()); |
167 } catch (LexException& e) { |
115 } catch (LexException& e) { |
168 // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; |
116 // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; |
169 assert_failed = 1; |
|
170 e.setContext(text); |
117 e.setContext(text); |
171 printf("LexException: %S\n", e.wWhat()); |
118 printf("LexException: %S\n", e.wWhat()); |
172 } |
119 } |
173 testResultXml(xml_file); |
120 } |
174 } |
121 |
175 |
122 void TestConfigSwitch(Itk::TestMgr* mgr) |
176 void TestParsingErrors(Itk::TestMgr* ) |
123 { |
177 { |
|
178 char *xml_file = (char*)__FUNCTION__; |
|
179 assert_failed = 0; |
|
180 Cpix::AnalyzerExp::Tokenizer tokenizer; |
124 Cpix::AnalyzerExp::Tokenizer tokenizer; |
181 // eof |
125 |
182 const wchar_t* text; |
126 const wchar_t* text = |
183 StdLexer eof(tokenizer, text = L"foobar(zap, foo, 'bar', 'raf', do, "); |
127 L"config_switch { " |
184 try { |
128 L"case 'indexing': korean;" |
185 ParsePiping(eof); |
129 L"case 'query': koreanquery;" |
186 eof.eatEof(); |
130 L"case 'prefix': letter;" |
|
131 L"default: korean;" |
|
132 L"}"; |
|
133 |
|
134 try { |
|
135 auto_ptr<Piping> sw = ParsePiping(text); |
|
136 if (dynamic_cast<const ConfigSwitch*>(&sw->tokenizer())) { |
|
137 const ConfigSwitch* s = dynamic_cast<const ConfigSwitch*>(&sw->tokenizer()); |
|
138 for (int i = 0; i < s->cases().size(); i++) { |
|
139 const Case* c = s->cases()[i]; |
|
140 printf("case "); |
|
141 for (int j = 0; j < c->cases().size(); j++) { |
|
142 printf("%S", c->cases()[j]); |
|
143 } |
|
144 printf(": ...\n"); |
|
145 // wcout<<L":"<<s->def().tokenizer().id(); |
|
146 } |
|
147 printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;"; |
|
148 } |
187 } catch (ParseException& e) { |
149 } catch (ParseException& e) { |
188 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
150 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
189 e.setContext(text); |
151 e.setContext(text); |
190 printf("ParseException: %S\n", e.wWhat()); |
152 printf("ParseException: %S\n", e.wWhat()); |
191 } |
153 } catch (LexException& e) { |
192 |
|
193 |
|
194 // Unfinished literal |
|
195 StdLexer lit(tokenizer, text = L"foobar(zap, foo, 'bar', 'a, raboof)"); |
|
196 try { |
|
197 ParsePiping(lit); |
|
198 lit.eatEof(); |
|
199 } catch (LexException& e) { // syntax error |
|
200 // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; |
154 // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; |
201 e.setContext(text); |
155 e.setContext(text); |
202 printf("LexException: %S\n", e.wWhat()); |
156 printf("LexException: %S\n", e.wWhat()); |
|
157 } |
|
158 } |
|
159 |
|
160 |
|
161 void TestParsingErrors(Itk::TestMgr* mgr) |
|
162 { |
|
163 Cpix::AnalyzerExp::Tokenizer tokenizer; |
|
164 // eof |
|
165 const wchar_t* text; |
|
166 try { |
|
167 ParsePiping( text = L"foobar(zap, foo, 'bar', 'raf', do, " ); |
|
168 } catch (ParseException& e) { |
|
169 printf("ParseException: %S\n", e.wWhat()); |
|
170 } |
|
171 |
|
172 // Unfinished literal |
|
173 try { |
|
174 ParsePiping(text = L"foobar(zap, foo, 'bar', 'a, raboof)"); |
|
175 } catch (LexException& e) { // syntax error |
|
176 printf("LexException: %S\n", e.wWhat()); |
203 } catch (ParseException& e) { // syntax error |
177 } catch (ParseException& e) { // syntax error |
204 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
|
205 e.setContext(text); |
|
206 printf("ParseException: %S\n", e.wWhat()); |
178 printf("ParseException: %S\n", e.wWhat()); |
207 } |
179 } |
208 |
180 |
209 // Unknown token |
181 // Unknown token |
210 StdLexer unknown(tokenizer, text = L"foobar(!zap, foo, 'bar', 'a', raboof)"); |
182 try { |
211 try { |
183 ParsePiping(text = L"foobar(!zap, foo, 'bar', 'a', raboof)"); |
212 ParsePiping(unknown); |
|
213 unknown.eatEof(); |
|
214 } catch (LexException& e) { // syntax error |
184 } catch (LexException& e) { // syntax error |
215 // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; |
|
216 e.setContext(text); |
|
217 printf("LexException: %S\n", e.wWhat()); |
185 printf("LexException: %S\n", e.wWhat()); |
218 } |
186 } |
219 |
187 |
220 // Missing comma |
188 // Missing comma |
221 StdLexer comma(tokenizer, text = L"foobar(zap, foo, 'bar', 'a' raboof)"); |
189 try { |
222 try { |
190 ParsePiping(text = L"foobar(zap, foo, 'bar', 'a' raboof)"); |
223 ParsePiping(comma); |
|
224 comma.eatEof(); |
|
225 } catch (ParseException& e) { |
191 } catch (ParseException& e) { |
226 // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; |
|
227 e.setContext(text); |
|
228 printf("ParseException: %S\n", e.wWhat()); |
192 printf("ParseException: %S\n", e.wWhat()); |
229 } |
193 } |
230 testResultXml(xml_file); |
194 |
231 } |
195 } |
232 |
196 |
233 |
197 |
234 const char * CustomAnalyzerTestDocs[] = { |
198 const char * CustomAnalyzerTestDocs[] = { |
235 FILE_TEST_CORPUS_PATH "\\en\\1.txt", |
199 STEM_TEST_CORPUS_PATH "\\en\\1.txt", |
236 FILE_TEST_CORPUS_PATH "\\en\\2.txt", |
200 STEM_TEST_CORPUS_PATH "\\en\\2.txt", |
237 FILE_TEST_CORPUS_PATH "\\en\\3.txt", |
201 STEM_TEST_CORPUS_PATH "\\en\\3.txt", |
238 FILE_TEST_CORPUS_PATH "\\fi\\1.txt", |
202 STEM_TEST_CORPUS_PATH "\\en\\4.txt", |
239 FILE_TEST_CORPUS_PATH "\\fi\\2.txt", |
203 |
|
204 STEM_TEST_CORPUS_PATH "\\fi\\1.txt", |
|
205 STEM_TEST_CORPUS_PATH "\\fi\\2.txt", |
|
206 LOC_TEST_CORPUS_PATH "\\th\\1.txt", |
|
207 LOC_TEST_CORPUS_PATH "\\th\\2.txt", |
|
208 |
240 NULL |
209 NULL |
241 }; |
210 }; |
242 |
211 |
243 const char DEFAULT_ENCODING[] = "UTF-8"; |
212 const char DEFAULT_ENCODING[] = "UTF-8"; |
244 |
213 |
256 printf("'%S'", token.termText()); |
225 printf("'%S'", token.termText()); |
257 } |
226 } |
258 printf("\n"); |
227 printf("\n"); |
259 } |
228 } |
260 |
229 |
261 void TestCustomAnalyzer(Itk::TestMgr * , const wchar_t* definition) |
230 void TestCustomAnalyzer(Itk::TestMgr * testMgr, |
|
231 const char** files, |
|
232 const wchar_t* definition) |
262 { |
233 { |
263 using namespace lucene::analysis; |
234 using namespace lucene::analysis; |
264 using namespace lucene::util; |
235 using namespace lucene::util; |
265 using namespace Cpix; |
236 using namespace Cpix; |
266 using namespace std; |
237 using namespace std; |
267 CustomAnalyzer analyzer(definition); |
238 CustomAnalyzer analyzer(definition); |
268 |
239 |
269 printf("Analyzer \"%S\":\n", definition); |
240 printf("Analyzer \"%S\":\n", definition); |
270 for (int i = 0; CustomAnalyzerTestDocs[i]; i++) |
241 for (int i = 0; files[i]; i++) |
271 { |
242 { |
272 printf("File !%s tokenized:\n", (CustomAnalyzerTestDocs[i]+1)); |
243 printf("File !%s tokenized:\n", (files[i]+1)); |
273 FileReader file( CustomAnalyzerTestDocs[i], DEFAULT_ENCODING ); |
244 FileReader file( files[i], DEFAULT_ENCODING ); |
274 |
245 |
275 TokenStream* stream = analyzer.tokenStream( L"field", &file ); |
246 TokenStream* stream = analyzer.tokenStream( L"field", &file ); |
276 PrintTokenStream( stream ); |
247 PrintTokenStream( stream ); |
277 stream->close(); |
248 stream->close(); |
278 _CLDELETE( stream ); |
249 _CLDELETE( stream ); |
279 } |
250 } |
|
251 printf("\n"); |
|
252 } |
|
253 |
|
254 void TestCustomAnalyzer(Itk::TestMgr * testMgr, const wchar_t* definition) { |
|
255 TestCustomAnalyzer(testMgr, CustomAnalyzerTestDocs, definition); |
280 } |
256 } |
281 |
257 |
282 void TestCustomAnalyzers(Itk::TestMgr * testMgr) |
258 void TestCustomAnalyzers(Itk::TestMgr * testMgr) |
283 { |
259 { |
284 char *xml_file = (char*)__FUNCTION__; |
|
285 assert_failed = 0; |
|
286 TestCustomAnalyzer(testMgr, L"stdtokens"); |
260 TestCustomAnalyzer(testMgr, L"stdtokens"); |
287 TestCustomAnalyzer(testMgr, L"whitespace"); |
261 TestCustomAnalyzer(testMgr, L"whitespace"); |
288 TestCustomAnalyzer(testMgr, L"whitespace>lowercase"); |
262 TestCustomAnalyzer(testMgr, L"whitespace>lowercase"); |
289 TestCustomAnalyzer(testMgr, L"whitespace>accent"); |
263 TestCustomAnalyzer(testMgr, L"whitespace>accent"); |
290 TestCustomAnalyzer(testMgr, L"letter"); |
264 TestCustomAnalyzer(testMgr, L"letter"); |
291 TestCustomAnalyzer(testMgr, L"letter>lowercase"); |
265 TestCustomAnalyzer(testMgr, L"letter>lowercase"); |
292 TestCustomAnalyzer(testMgr, L"keyword"); |
266 TestCustomAnalyzer(testMgr, L"keyword"); |
293 TestCustomAnalyzer(testMgr, L"keyword>lowercase"); |
267 TestCustomAnalyzer(testMgr, L"keyword>lowercase"); |
294 TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>accent>stem(en)"); |
268 // TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>stem(en)"); // Does not work with NON-ASCII |
295 TestCustomAnalyzer(testMgr, L"letter>lowercase>accent>stop(en)"); |
269 TestCustomAnalyzer(testMgr, L"letter>lowercase>stop(en)"); |
296 TestCustomAnalyzer(testMgr, L"letter>lowercase>stop('i', 'oh', 'nyt', 'näin')"); |
270 TestCustomAnalyzer(testMgr, L"letter>lowercase>stop('i', 'oh', 'nyt', 'n�in')"); |
297 TestCustomAnalyzer(testMgr, L"letter>length(2, 4)"); |
271 TestCustomAnalyzer(testMgr, L"letter>length(2, 4)"); |
298 testResultXml(xml_file); |
272 TestCustomAnalyzer(testMgr, L"standard>prefixes(1)"); |
299 } |
273 TestCustomAnalyzer(testMgr, L"standard>prefixes(2)"); |
300 |
274 TestCustomAnalyzer(testMgr, L"standard>prefixes(3)"); |
301 void TestAnalyzerWithField(Itk::TestMgr * , const wchar_t* definition, const wchar_t* field) |
275 TestCustomAnalyzer(testMgr, L"stdtokens>stdfilter>lowercase>thai>stop(en)"); |
|
276 TestCustomAnalyzer(testMgr, L"cjk>stop(en)"); |
|
277 TestCustomAnalyzer(testMgr, L"ngram(1)>lowercase>stop(en)"); |
|
278 TestCustomAnalyzer(testMgr, L"ngram(2)>lowercase>stop(en)"); |
|
279 } |
|
280 |
|
281 void TestTokenizationWithLocales(Itk::TestMgr * testMgr) { |
|
282 printf("locale=en\n"); |
|
283 cpix_Result result; |
|
284 cpix_SetLocale( &result, "en" ); |
|
285 TestCustomAnalyzer(testMgr, L"natural"); |
|
286 |
|
287 printf("locale=th\n"); |
|
288 cpix_SetLocale( &result, "th" ); |
|
289 TestCustomAnalyzer(testMgr, L"natural"); |
|
290 |
|
291 printf("locale=ko\n"); |
|
292 cpix_SetLocale( &result, "ko" ); |
|
293 TestCustomAnalyzer(testMgr, L"natural"); |
|
294 |
|
295 printf("locale=zh\n"); |
|
296 cpix_SetLocale( &result, "zh" ); |
|
297 TestCustomAnalyzer(testMgr, L"natural"); |
|
298 |
|
299 printf("locale=jp\n"); |
|
300 cpix_SetLocale( &result, "jp" ); |
|
301 TestCustomAnalyzer(testMgr, L"natural"); |
|
302 |
|
303 cpix_SetLocale( &result, cpix_LOCALE_AUTO ); |
|
304 } |
|
305 |
|
306 template<typename T> |
|
307 void TestTokenizationWithLocale(Itk::TestMgr * testMgr) { |
|
308 cpix_Result result; |
|
309 cpix_SetLocale( &result, T::LOCALE ); |
|
310 TestCustomAnalyzer(testMgr, EnglishLocale::FILES, L"natural"); |
|
311 TestCustomAnalyzer(testMgr, T::FILES, L"natural"); |
|
312 cpix_SetLocale( &result, cpix_LOCALE_AUTO ); |
|
313 } |
|
314 |
|
315 |
|
316 template<typename T> |
|
317 void AddTokenizationWithLocaleTest(Itk::SuiteTester* suite) { |
|
318 suite->add(T::LOCALE, |
|
319 &TestTokenizationWithLocale<T>, |
|
320 T::LOCALE); |
|
321 } |
|
322 |
|
323 void TestTokenizationWithCurrentLocale(Itk::TestMgr * testMgr) { |
|
324 cpix_Result result; |
|
325 cpix_SetLocale( &result, cpix_LOCALE_AUTO ); |
|
326 TestCustomAnalyzer(testMgr, L"natural"); |
|
327 } |
|
328 |
|
329 void TestAnalyzerWithField(Itk::TestMgr * testMgr, const wchar_t* definition, const wchar_t* field) |
302 { |
330 { |
303 using namespace lucene::analysis; |
331 using namespace lucene::analysis; |
304 using namespace lucene::util; |
332 using namespace lucene::util; |
305 using namespace Cpix; |
333 using namespace Cpix; |
306 using namespace std; |
334 using namespace std; |
329 TestAnalyzerWithField(testMgr, sw, L"_docuid"); |
355 TestAnalyzerWithField(testMgr, sw, L"_docuid"); |
330 TestAnalyzerWithField(testMgr, sw, L"_appclass"); |
356 TestAnalyzerWithField(testMgr, sw, L"_appclass"); |
331 TestAnalyzerWithField(testMgr, sw, L"Title"); |
357 TestAnalyzerWithField(testMgr, sw, L"Title"); |
332 TestAnalyzerWithField(testMgr, sw, L"message"); |
358 TestAnalyzerWithField(testMgr, sw, L"message"); |
333 TestAnalyzerWithField(testMgr, sw, L"field"); |
359 TestAnalyzerWithField(testMgr, sw, L"field"); |
334 testResultXml(xml_file); |
360 } |
335 } |
361 |
336 |
362 void TestLocaleSwitchAnalyzers(Itk::TestMgr * testMgr) |
|
363 { |
|
364 const wchar_t* sw = L"\n" |
|
365 L"locale_switch {\n" |
|
366 L" case 'en': stdtokens>stdfilter>lowercase>stop(en);\n" |
|
367 L" case 'th': stdtokens>stdfilter>lowercase>thai>stop(en);\n" |
|
368 L" case 'ca': stdtokens>stdfilter>lowercase>accent;\n" |
|
369 L" default: stdtokens>stdfilter>lowercase;\n" |
|
370 L"}"; |
|
371 cpix_Result result; |
|
372 printf("locale=en:\n"); |
|
373 cpix_SetLocale( &result, "en" ); |
|
374 TestCustomAnalyzer(testMgr, sw); |
|
375 printf("\n"); |
|
376 printf("locale=th:\n"); |
|
377 cpix_SetLocale( &result, "th" ); |
|
378 TestCustomAnalyzer(testMgr, sw); |
|
379 printf("\n"); |
|
380 printf("locale=ca:\n"); |
|
381 cpix_SetLocale( &result, "ca" ); |
|
382 TestCustomAnalyzer(testMgr, sw); |
|
383 printf("\n"); |
|
384 printf("default locale:\n"); |
|
385 cpix_SetLocale( &result, "fail" ); |
|
386 TestCustomAnalyzer(testMgr, sw); |
|
387 cpix_SetLocale( &result, cpix_LOCALE_AUTO ); |
|
388 } |
|
389 |
|
390 |
|
391 Itk::TesterBase * CreateAnalysisWhiteBoxLocalizationTests() { |
|
392 using namespace Itk; |
|
393 |
|
394 SuiteTester |
|
395 * tests = new SuiteTester("loc"); |
|
396 |
|
397 std::string locale; |
|
398 locale = "currentlocale_"; |
|
399 |
|
400 Cpt::auto_array<char> name( Cpix::Spi::GetLanguageNames()[0].c_str() ); |
|
401 locale += name.get(); |
|
402 |
|
403 tests->add(locale.c_str(), |
|
404 &TestTokenizationWithCurrentLocale, |
|
405 locale.c_str()); |
|
406 |
|
407 AddTokenizationWithLocaleTest<EnglishLocale>(tests); |
|
408 AddTokenizationWithLocaleTest<FrenchLocale>(tests); |
|
409 AddTokenizationWithLocaleTest<HebrewLocale>(tests); |
|
410 AddTokenizationWithLocaleTest<ThaiLocale>(tests); |
|
411 AddTokenizationWithLocaleTest<KoreanLocale>(tests); |
|
412 AddTokenizationWithLocaleTest<ChineseLocale>(tests); |
|
413 AddTokenizationWithLocaleTest<JapaneseLocale>(tests); |
|
414 |
|
415 return tests; |
|
416 } |
337 |
417 |
338 Itk::TesterBase * CreateAnalysisWhiteBoxTests() |
418 Itk::TesterBase * CreateAnalysisWhiteBoxTests() |
339 { |
419 { |
340 using namespace Itk; |
420 using namespace Itk; |
341 |
421 |
342 SuiteTester |
422 SuiteTester |
343 * analysisTests = new SuiteTester("analysiswhitebox"); |
423 * analysisTests = new SuiteTester("whitebox"); |
344 |
424 |
345 analysisTests->add("analyzer", |
425 analysisTests->add("analyzer", |
346 &TestCustomAnalyzers, |
426 &TestCustomAnalyzers, |
347 "analyzer"); |
427 "analyzer"); |
348 analysisTests->add("switchanalyzer", |
428 analysisTests->add("switchAnalyzer", |
349 &TestSwitchAnalyzers, |
429 &TestSwitchAnalyzers, |
350 "switchanalyzer"); |
430 "switchAnalyzer"); |
|
431 analysisTests->add("localeSwitchAnalyzer", |
|
432 &TestLocaleSwitchAnalyzers, |
|
433 "localeSwitchAnalyzer"); |
351 analysisTests->add("tokenization", |
434 analysisTests->add("tokenization", |
352 TestTokenization6, |
435 TestTokenization6, |
353 "tokenization"); |
436 "tokenization"); |
354 analysisTests->add("parsing", |
437 analysisTests->add("parsing", |
355 TestParsing, |
438 TestParsing, |
356 "parsing"); |
439 "parsing"); |
357 analysisTests->add("parsing2", |
440 analysisTests->add("parsing2", |
358 TestSwitch, |
441 TestSwitch, |
359 "parsing2"); |
442 "parsing2"); |
|
443 analysisTests->add("parsing3", |
|
444 TestConfigSwitch, |
|
445 "parsing3"); |
360 analysisTests->add("parsingerrors", |
446 analysisTests->add("parsingerrors", |
361 TestParsingErrors, |
447 TestParsingErrors, |
362 "parsingerrors"); |
448 "parsingerrors"); |
363 |
449 |
|
450 analysisTests->add(CreateAnalysisWhiteBoxLocalizationTests()); |
364 return analysisTests; |
451 return analysisTests; |
365 } |
452 } |
366 |
453 |
367 |
454 |
368 |
455 |