searchengine/cpix/tsrc/cpixunittest/src/analysiswhitebox.cpp
changeset 10:afe194b6b1cd
parent    7:a5fbfefd615f
--- 9:d575fd691cf9
+++ 10:afe194b6b1cd
-/*
-* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
-* All rights reserved.
-* This component and the accompanying materials are made available
-* under the terms of "Eclipse Public License v1.0"
-* which accompanies this distribution, and is available
-* at the URL "http://www.eclipse.org/legal/epl-v10.html".
-*
-* Initial Contributors:
-* Nokia Corporation - initial contribution.
-*
-* Contributors:
-*
-* Description: 
-*
-*/
-
 #include <wchar.h>
 #include <stddef.h>
 
-
 #include <iostream>
 
 #include "cpixidxdb.h"
 
 #include "itk.h"
 
 #include "config.h"
 #include "testutils.h"
-
-#include "std_log_result.h"
 
 // For testing custom analyzer
 #include "CLucene.h"
 #include "CLucene\analysis\AnalysisHeader.h"
 #include "CLucene\util\stringreader.h"
-#include "analyzer.h"
 #include "analyzerexp.h"
+#include "customanalyzer.h"
+
+#include "localetestinfos.h"
+
+#include "spi/locale.h"
+#include "cpixstrtools.h"
 
 using namespace Cpt::Lex; 
 using namespace Cpt::Parser; 
 using namespace Cpix::AnalyzerExp; 
 
 void PrintToken(Cpt::Lex::Token token) {
-    switch (token.type()) {
-        case TOKEN_WS: printf("space"); break; 
-        case TOKEN_ID: printf("id"); break;
-        case TOKEN_LIT: printf("lit"); break;
-        case TOKEN_STRLIT: printf("str-lit"); break;
-        case TOKEN_REALLIT: printf("real-lit"); break;
-        case TOKEN_INTLIT: printf("int-lit"); break;
-        case TOKEN_LEFT_BRACKET: printf("lbr"); break;
-        case TOKEN_RIGHT_BRACKET: printf("rbr"); break;
-        case TOKEN_COMMA: printf("comma"); break;
-        case TOKEN_PIPE: printf("pipe"); break;
-        case TOKEN_SWITCH : printf("sw"); break;
-        case TOKEN_CASE : printf("case"); break;
-        case TOKEN_DEFAULT : printf("default"); break;
-        case TOKEN_LEFT_BRACE : printf("lbc"); break;
-        case TOKEN_RIGHT_BRACE : printf("rbc"); break;
-        case TOKEN_COLON : printf("cl"); break;
-        case TOKEN_TERMINATOR : printf("tr"); break;
-
-        default: printf("unknown"); break;
-    }
-    printf("('%S')", (token.text()).c_str());  
-}
-
-
-void TestTokenization6(Itk::TestMgr * )
-{
-    char *xml_file = (char*)__FUNCTION__;
-        assert_failed = 0;
-    Cpix::AnalyzerExp::Tokenizer tokenizer; 
+    printf("%S('%S')", token.type(), token.text());  
+}
+
+
+void TestTokenization6(Itk::TestMgr * testMgr)
+{
+    Cpix::AnalyzerExp::Tokenizer tokenizer; 
     Tokens source(tokenizer, 
         L"switch { "
           L"case '_docuid', '_mimetype': keywords;"
           L"case '_baseappclass':        whitespace>lowercase;"
           L"default:                     natural(en); "
         L"}");
-    WhiteSpaceFilter 
+    StdFilter 
         tokens(source); 
 
     while (tokens) PrintToken(tokens++); 
-    testResultXml(xml_file);
-}
-
-void TestParsing(Itk::TestMgr* )
+}
+
+void TestParsing(Itk::TestMgr* mgr)
 { 
     Cpix::AnalyzerExp::Tokenizer tokenizer; 
-    char *xml_file = (char*)__FUNCTION__;
-        assert_failed = 0;
+
     Tokens source(tokenizer, L"foobar(zap, foo, 'bar', 'a', raboof)");
-    WhiteSpaceFilter tokens(source);
+    StdFilter tokens(source);
     Lexer lexer(tokens);
 
-    Tokens source2(tokenizer, L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin)  ");
-    WhiteSpaceFilter tokens2(source2);
-    Lexer lexer2(tokens2);
+    const wchar_t* text = L" stdtokens >lowercase>stopwords(fin)>stopwords('a', 'an','the')>stem(fin)  ";
 
     Tokens source3(tokenizer, L"foobar(zap, 0, 0.0045, 4, 'a', 9223.031)");
-    WhiteSpaceFilter tokens3(source3);
+    StdFilter tokens3(source3);
     Lexer lexer3(tokens3);
 
     try {
         auto_ptr<Invokation> invoke = ParseInvokation(lexer); 
         lexer.eatEof(); 
-        printf("Invoke identifier: %S\n", (invoke->id()).c_str()); 
+        printf("Invoke identifier: %S\n", invoke->id()); 
         printf("%d parameters\n", invoke->params().size()); 
-        auto_ptr<Piping> piping = ParsePiping(lexer2); 
-        lexer2.eatEof(); 
+        auto_ptr<Piping> piping = ParsePiping(text); 
         printf("piping done.\n"); 
         if (dynamic_cast<const Invokation*>(&piping->tokenizer())) {
-            printf("Tokenizer: %S\n", dynamic_cast<const Invokation&>(piping->tokenizer()).id().c_str()); 
+            printf("Tokenizer: %S\n", dynamic_cast<const Invokation&>(piping->tokenizer()).id()); 
         }
         printf("%d filters\n", piping->filters().size()); 
         invoke = ParseInvokation(lexer3);
         lexer3.eatEof(); 
-        printf("Invoke identifier: %S\n", (invoke->id()).c_str()); 
+        printf("Invoke identifier: %S\n", invoke->id()); 
         printf("%d parameters\n", invoke->params().size()); 
     } catch (ParseException& e) {
-        assert_failed = 1;
         printf("ParseException: %S\n", e.wWhat()); 
     } catch (LexException& e) {
-        assert_failed = 1;
         printf("LexException: %S\n", e.wWhat()); 
     }
-    testResultXml(xml_file);
-}
-
-void TestSwitch(Itk::TestMgr* )
+}
+
+void TestSwitch(Itk::TestMgr* mgr)
 { 
     Cpix::AnalyzerExp::Tokenizer tokenizer; 
-    char *xml_file = (char*)__FUNCTION__;
-        assert_failed = 0;
-    const wchar_t* text; 
-    Tokens source(tokenizer, text = 
+
+    const wchar_t* text = 
         L"switch { "
           L"case '_docuid', '_mimetype': keywords;"
           L"case '_baseappclass':        whitespace>lowercase;"
           L"default:                     natural(en); "
-        L"}");
-    WhiteSpaceFilter tokens(source);
-    Lexer lexer(tokens);
-
-    try {
-        auto_ptr<Piping> sw = ParsePiping(lexer); 
-        lexer.eatEof(); 
+        L"}";
+
+    try {
+        auto_ptr<Piping> sw = ParsePiping(text); 
         if (dynamic_cast<const Switch*>(&sw->tokenizer())) {
             const Switch* s = dynamic_cast<const Switch*>(&sw->tokenizer());
             for (int i = 0; i < s->cases().size(); i++) {
                 const Case* c = s->cases()[i]; 
                 printf("case "); 
-                for (int j = 0; j < c->fields().size(); j++) {
-                    printf("%S", (c->fields()[j]).c_str());
+                for (int j = 0; j < c->cases().size(); j++) {
+                    printf("%S", c->cases()[j]);
                 }
                 printf(": ...\n"); 
 //                wcout<<L":"<<s->def().tokenizer().id();
             }
             printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;";
         }
     } catch (ParseException& e) {
         // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; 
-        assert_failed = 1;
         e.setContext(text);
         printf("ParseException: %S\n", e.wWhat()); 
     } catch (LexException& e) {
         // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; 
-        assert_failed = 1;
         e.setContext(text);
         printf("LexException: %S\n", e.wWhat()); 
     }
-    testResultXml(xml_file);
-}
-
-void TestParsingErrors(Itk::TestMgr* )
-{
-    char *xml_file = (char*)__FUNCTION__;
-            assert_failed = 0;
+}
+
+void TestConfigSwitch(Itk::TestMgr* mgr)
+{ 
     Cpix::AnalyzerExp::Tokenizer tokenizer; 
-    // eof
-    const wchar_t* text; 
-    StdLexer eof(tokenizer, text = L"foobar(zap, foo, 'bar', 'raf', do, ");
-    try {
-        ParsePiping(eof); 
-        eof.eatEof(); 
+
+    const wchar_t* text = 
+        L"config_switch { "
+          L"case 'indexing':  korean;"
+          L"case 'query':     koreanquery;"
+          L"case 'prefix':    letter;"
+          L"default:          korean;"
+        L"}";
+
+    try {
+        auto_ptr<Piping> sw = ParsePiping(text); 
+        if (dynamic_cast<const ConfigSwitch*>(&sw->tokenizer())) {
+            const ConfigSwitch* s = dynamic_cast<const ConfigSwitch*>(&sw->tokenizer());
+            for (int i = 0; i < s->cases().size(); i++) {
+                const Case* c = s->cases()[i]; 
+                printf("case "); 
+                for (int j = 0; j < c->cases().size(); j++) {
+                    printf("%S", c->cases()[j]);
+                }
+                printf(": ...\n"); 
+ //                wcout<<L":"<<s->def().tokenizer().id();
+            }
+            printf("default: ...\n");//<<s->def().tokenizer().id()<<"...;";
+        }
     } catch (ParseException& e) {
         // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; 
         e.setContext(text);
         printf("ParseException: %S\n", e.wWhat()); 
-    }
-
-    
-    // Unfinished literal
-    StdLexer lit(tokenizer, text = L"foobar(zap, foo, 'bar', 'a, raboof)");
-    try {
-        ParsePiping(lit); 
-        lit.eatEof(); 
-    } catch (LexException& e) { // syntax error
+    } catch (LexException& e) {
         // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; 
         e.setContext(text);
         printf("LexException: %S\n", e.wWhat()); 
+    }
+}
+
+
+void TestParsingErrors(Itk::TestMgr* mgr)
+{
+    Cpix::AnalyzerExp::Tokenizer tokenizer; 
+    // eof
+    const wchar_t* text;
+    try {
+        ParsePiping( text = L"foobar(zap, foo, 'bar', 'raf', do, " ); 
+    } catch (ParseException& e) {
+        printf("ParseException: %S\n", e.wWhat()); 
+    }
+    
+    // Unfinished literal
+    try {
+        ParsePiping(text = L"foobar(zap, foo, 'bar', 'a, raboof)"); 
+    } catch (LexException& e) { // syntax error
+        printf("LexException: %S\n", e.wWhat()); 
     } catch (ParseException& e) { // syntax error
-        // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; 
-        e.setContext(text);
         printf("ParseException: %S\n", e.wWhat()); 
     } 
 
     // Unknown token
-    StdLexer unknown(tokenizer, text = L"foobar(!zap, foo, 'bar', 'a', raboof)");
-    try {
-        ParsePiping(unknown); 
-        unknown.eatEof(); 
+    try {
+        ParsePiping(text = L"foobar(!zap, foo, 'bar', 'a', raboof)"); 
     } catch (LexException& e) { // syntax error
-        // OBS wcout<<L"LexException: "<<e.describe(text)<<endl; 
-        e.setContext(text);
         printf("LexException: %S\n", e.wWhat()); 
     } 
     
     // Missing comma
-    StdLexer comma(tokenizer, text = L"foobar(zap, foo, 'bar', 'a' raboof)");
-    try {
-        ParsePiping(comma); 
-        comma.eatEof(); 
+    try {
+        ParsePiping(text = L"foobar(zap, foo, 'bar', 'a' raboof)"); 
     } catch (ParseException& e) {
-        // OBS wcout<<L"ParseException: "<<e.describe(text)<<endl; 
-        e.setContext(text);
         printf("ParseException: %S\n", e.wWhat()); 
     } 
-    testResultXml(xml_file);
 }
 
 
 const char * CustomAnalyzerTestDocs[] = {
-    FILE_TEST_CORPUS_PATH "\\en\\1.txt",
-    FILE_TEST_CORPUS_PATH "\\en\\2.txt",
-    FILE_TEST_CORPUS_PATH "\\en\\3.txt",
-    FILE_TEST_CORPUS_PATH "\\fi\\1.txt",
-    FILE_TEST_CORPUS_PATH "\\fi\\2.txt",
+    STEM_TEST_CORPUS_PATH "\\en\\1.txt",
+    STEM_TEST_CORPUS_PATH "\\en\\2.txt",
+    STEM_TEST_CORPUS_PATH "\\en\\3.txt",
+    STEM_TEST_CORPUS_PATH "\\en\\4.txt",
+
+    STEM_TEST_CORPUS_PATH "\\fi\\1.txt",
+    STEM_TEST_CORPUS_PATH "\\fi\\2.txt",
+    LOC_TEST_CORPUS_PATH "\\th\\1.txt",
+    LOC_TEST_CORPUS_PATH "\\th\\2.txt",
+
     NULL
 };
 
 const char DEFAULT_ENCODING[] = "UTF-8";
 
         printf("'%S'", token.termText());
     }
     printf("\n");
 }
 
-void TestCustomAnalyzer(Itk::TestMgr * , const wchar_t* definition)
+void TestCustomAnalyzer(Itk::TestMgr * testMgr, 
+                        const char** files, 
+                        const wchar_t* definition)
 {
     using namespace lucene::analysis; 
     using namespace lucene::util; 
     using namespace Cpix; 
     using namespace std; 
     CustomAnalyzer analyzer(definition);
 
     printf("Analyzer \"%S\":\n", definition); 
-    for (int i = 0; CustomAnalyzerTestDocs[i]; i++) 
+    for (int i = 0; files[i]; i++) 
     {
-        printf("File !%s tokenized:\n", (CustomAnalyzerTestDocs[i]+1));
-        FileReader file( CustomAnalyzerTestDocs[i], DEFAULT_ENCODING ); 
+        printf("File !%s tokenized:\n", (files[i]+1));
+        FileReader file( files[i], DEFAULT_ENCODING ); 
 
         TokenStream* stream = analyzer.tokenStream( L"field", &file ); 
         PrintTokenStream( stream ); 
         stream->close(); 
         _CLDELETE( stream ); 
     }
+    printf("\n");
+}
+
+void TestCustomAnalyzer(Itk::TestMgr * testMgr, const wchar_t* definition) {
+    TestCustomAnalyzer(testMgr, CustomAnalyzerTestDocs, definition);
 }
 
 void TestCustomAnalyzers(Itk::TestMgr * testMgr)
 {
-    char *xml_file = (char*)__FUNCTION__;
-        assert_failed = 0;
     TestCustomAnalyzer(testMgr, L"stdtokens");
     TestCustomAnalyzer(testMgr, L"whitespace");
     TestCustomAnalyzer(testMgr, L"whitespace>lowercase");
     TestCustomAnalyzer(testMgr, L"whitespace>accent");
     TestCustomAnalyzer(testMgr, L"letter");
     TestCustomAnalyzer(testMgr, L"letter>lowercase");
     TestCustomAnalyzer(testMgr, L"keyword");
     TestCustomAnalyzer(testMgr, L"keyword>lowercase");
-    TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>accent>stem(en)"); 
-    TestCustomAnalyzer(testMgr, L"letter>lowercase>accent>stop(en)"); 
+//    TestCustomAnalyzer(testMgr, L"stdtokens>lowercase>stem(en)"); // Does not work with NON-ASCII
+    TestCustomAnalyzer(testMgr, L"letter>lowercase>stop(en)"); 
     TestCustomAnalyzer(testMgr, L"letter>lowercase>stop('i', 'oh', 'nyt', 'näin')"); 
     TestCustomAnalyzer(testMgr, L"letter>length(2, 4)");
-    testResultXml(xml_file);
-}
-
-void TestAnalyzerWithField(Itk::TestMgr * , const wchar_t* definition, const wchar_t* field)
+    TestCustomAnalyzer(testMgr, L"standard>prefixes(1)");
+    TestCustomAnalyzer(testMgr, L"standard>prefixes(2)");
+    TestCustomAnalyzer(testMgr, L"standard>prefixes(3)");
+    TestCustomAnalyzer(testMgr, L"stdtokens>stdfilter>lowercase>thai>stop(en)");
+    TestCustomAnalyzer(testMgr, L"cjk>stop(en)");
+    TestCustomAnalyzer(testMgr, L"ngram(1)>lowercase>stop(en)");
+    TestCustomAnalyzer(testMgr, L"ngram(2)>lowercase>stop(en)");
+}
+
+void TestTokenizationWithLocales(Itk::TestMgr * testMgr) {
+    printf("locale=en\n"); 
+    cpix_Result result; 
+    cpix_SetLocale( &result, "en" ); 
+    TestCustomAnalyzer(testMgr, L"natural");
+
+    printf("locale=th\n"); 
+    cpix_SetLocale( &result, "th" ); 
+    TestCustomAnalyzer(testMgr, L"natural");
+
+    printf("locale=ko\n");
+    cpix_SetLocale( &result, "ko" ); 
+    TestCustomAnalyzer(testMgr, L"natural");
+
+    printf("locale=zh\n");
+    cpix_SetLocale( &result, "zh" );
+    TestCustomAnalyzer(testMgr, L"natural");
+
+    printf("locale=jp\n");
+    cpix_SetLocale( &result, "jp" ); 
+    TestCustomAnalyzer(testMgr, L"natural");
+
+    cpix_SetLocale( &result, cpix_LOCALE_AUTO ); 
+}
+
+template<typename T> 
+void TestTokenizationWithLocale(Itk::TestMgr * testMgr) {
+    cpix_Result result; 
+    cpix_SetLocale( &result, T::LOCALE ); 
+    TestCustomAnalyzer(testMgr, EnglishLocale::FILES, L"natural");
+    TestCustomAnalyzer(testMgr, T::FILES, L"natural");
+    cpix_SetLocale( &result, cpix_LOCALE_AUTO ); 
+}
+
+
+template<typename T>
+void AddTokenizationWithLocaleTest(Itk::SuiteTester* suite) {
+    suite->add(T::LOCALE,
+               &TestTokenizationWithLocale<T>,
+               T::LOCALE);
+}
+
+void TestTokenizationWithCurrentLocale(Itk::TestMgr * testMgr) {
+    cpix_Result result; 
+    cpix_SetLocale( &result, cpix_LOCALE_AUTO ); 
+    TestCustomAnalyzer(testMgr, L"natural");
+}
+
+void TestAnalyzerWithField(Itk::TestMgr * testMgr, const wchar_t* definition, const wchar_t* field) 
 {
     using namespace lucene::analysis; 
     using namespace lucene::util; 
     using namespace Cpix; 
     using namespace std; 
     _CLDELETE( stream ); 
 }
 
 void TestSwitchAnalyzers(Itk::TestMgr * testMgr)
 {
-    char *xml_file = (char*)__FUNCTION__;
-        assert_failed = 0;
-    const wchar_t* sw = L"\n"
+    const wchar_t* sw = L"\n"
         L"switch {\n"
         L"    case '_docuid':          keyword;\n"
         L"    case '_appclass':        whitespace>lowercase;\n"
         L"    case 'title', 'message': stdtokens>accent>lowercase>stem(en)>stop(en);\n"
         L"    default:                 letter>lowercase>stop('i');\n"
     TestAnalyzerWithField(testMgr, sw, L"_docuid");
     TestAnalyzerWithField(testMgr, sw, L"_appclass");
     TestAnalyzerWithField(testMgr, sw, L"Title"); 
     TestAnalyzerWithField(testMgr, sw, L"message"); 
     TestAnalyzerWithField(testMgr, sw, L"field"); 
-    testResultXml(xml_file);
-}
-
+}
+
+void TestLocaleSwitchAnalyzers(Itk::TestMgr * testMgr)
+{
+    const wchar_t* sw = L"\n"
+        L"locale_switch {\n"
+        L"    case 'en':       stdtokens>stdfilter>lowercase>stop(en);\n"
+        L"    case 'th':       stdtokens>stdfilter>lowercase>thai>stop(en);\n"
+        L"    case 'ca':       stdtokens>stdfilter>lowercase>accent;\n"
+        L"    default:         stdtokens>stdfilter>lowercase;\n"
+        L"}";
+    cpix_Result result; 
+    printf("locale=en:\n");
+    cpix_SetLocale( &result, "en" ); 
+    TestCustomAnalyzer(testMgr, sw);
+    printf("\n");
+    printf("locale=th:\n");
+    cpix_SetLocale( &result, "th" ); 
+    TestCustomAnalyzer(testMgr, sw);
+    printf("\n");
+    printf("locale=ca:\n");
+    cpix_SetLocale( &result, "ca" ); 
+    TestCustomAnalyzer(testMgr, sw);
+    printf("\n");
+    printf("default locale:\n");
+    cpix_SetLocale( &result, "fail" ); 
+    TestCustomAnalyzer(testMgr, sw);
+    cpix_SetLocale( &result, cpix_LOCALE_AUTO ); 
+}
+
+
+Itk::TesterBase * CreateAnalysisWhiteBoxLocalizationTests() {
+    using namespace Itk;
+    
+    SuiteTester
+        * tests = new SuiteTester("loc");
+
+    std::string locale;
+    locale = "currentlocale_"; 
+    
+    Cpt::auto_array<char> name( Cpix::Spi::GetLanguageNames()[0].c_str() );
+    locale += name.get();
+    
+    tests->add(locale.c_str(),
+               &TestTokenizationWithCurrentLocale,
+               locale.c_str());
+
+    AddTokenizationWithLocaleTest<EnglishLocale>(tests);
+    AddTokenizationWithLocaleTest<FrenchLocale>(tests);
+    AddTokenizationWithLocaleTest<HebrewLocale>(tests);
+    AddTokenizationWithLocaleTest<ThaiLocale>(tests);
+    AddTokenizationWithLocaleTest<KoreanLocale>(tests);
+    AddTokenizationWithLocaleTest<ChineseLocale>(tests);
+    AddTokenizationWithLocaleTest<JapaneseLocale>(tests);
+    
+    return tests;
+}
 
 Itk::TesterBase * CreateAnalysisWhiteBoxTests()
 {
     using namespace Itk;
 
     SuiteTester
-        * analysisTests = new SuiteTester("analysiswhitebox");
+        * analysisTests = new SuiteTester("whitebox");
     
     analysisTests->add("analyzer",
                        &TestCustomAnalyzers,
                        "analyzer");
-    analysisTests->add("switchanalyzer",
+    analysisTests->add("switchAnalyzer",
                        &TestSwitchAnalyzers,
-                       "switchanalyzer");
+                       "switchAnalyzer");
+    analysisTests->add("localeSwitchAnalyzer",
+                       &TestLocaleSwitchAnalyzers,
+                       "localeSwitchAnalyzer");
     analysisTests->add("tokenization",
                        TestTokenization6,
                        "tokenization");
     analysisTests->add("parsing",
                        TestParsing,
                        "parsing");
     analysisTests->add("parsing2",
                        TestSwitch,
                        "parsing2");
+    analysisTests->add("parsing3",
+                       TestConfigSwitch,
+                       "parsing3");
     analysisTests->add("parsingerrors",
                        TestParsingErrors,
                        "parsingerrors");
 
+    analysisTests->add(CreateAnalysisWhiteBoxLocalizationTests());
     return analysisTests;
 }
 
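For reference, the analyzer-definition strings exercised above (piped filter lists such as letter>lowercase>stop(en), and per-field switch blocks) are consumed by Cpix::CustomAnalyzer. The following is a minimal sketch of that usage, assembled only from calls that appear in this test file (CustomAnalyzer, tokenStream, PrintTokenStream, _CLDELETE); it is an illustration, not code from either revision shown, and the use of CLucene's StringReader and the SketchCustomAnalyzerUsage wrapper are assumptions made for the example.

#include "CLucene.h"
#include "CLucene\util\stringreader.h"
#include "customanalyzer.h"

// Sketch only: mirrors the pattern used by TestCustomAnalyzer() above.
void SketchCustomAnalyzerUsage()
{
    using namespace lucene::analysis;
    using namespace lucene::util;
    using namespace Cpix;

    // A piped definition: letter tokenizer, then lowercase, then English stopwords.
    CustomAnalyzer analyzer(L"letter>lowercase>stop(en)");

    StringReader reader(L"Some text to tokenize");            // assumed CLucene reader
    TokenStream* stream = analyzer.tokenStream(L"field", &reader);
    PrintTokenStream(stream);                                  // helper defined in this test file
    stream->close();
    _CLDELETE(stream);

    // A per-field switch definition, as exercised by TestSwitchAnalyzers():
    // fields named in a 'case' get that pipeline, all other fields get 'default'.
    CustomAnalyzer fieldAware(
        L"switch {"
        L"    case '_docuid': keyword;"
        L"    default:        letter>lowercase;"
        L"}");
    StringReader idReader(L"123-456");
    TokenStream* idStream = fieldAware.tokenStream(L"_docuid", &idReader);
    PrintTokenStream(idStream);
    idStream->close();
    _CLDELETE(idStream);
}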