|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #include "itk.h" |
|
19 |
|
20 #include "thaianalysis.h" |
|
21 |
|
22 #include "CLucene.h" |
|
23 |
|
24 #include <iostream> |
|
25 |
|
26 #include "evaluationtool.h" |
|
27 #include "analysisunittest.h" |
|
28 #include "CJKAnalyzer.h" |
|
29 #include "koreananalyzer.h" |
|
30 #include "ngram.h" |
|
31 #include "prefixfilter.h" |
|
32 |
|
33 #include "testutils.h" |
|
34 |
|
35 using namespace std; |
|
36 using namespace analysis; |
|
37 using namespace lucene::analysis; |
|
38 using namespace evaluationtool; |
|
39 |
|
40 |
|
41 |
|
42 void doEvaluate(Itk::TestMgr* testMgr, Analyzer& analyzer, const char* testName, const char* corpusFile, const char* evalFile, Analyzer* queryAnalyzer = NULL, Analyzer* prefixAnalyzer = NULL ) |
|
43 { |
|
44 Corpus corpus(corpusFile); |
|
45 PreparedCorpus prepared(corpus, analyzer, queryAnalyzer, prefixAnalyzer); |
|
46 EvaluationRecord record(evalFile); |
|
47 |
|
48 int failed = 0; |
|
49 int improved = 0; |
|
50 |
|
51 int timeMs = 0; |
|
52 |
|
53 for (int i = 0; i < record.length(); i++) |
|
54 { |
|
55 const wchar_t* query = record.query(i); |
|
56 |
|
57 if (!*query) continue; // skip empty queries |
|
58 |
|
59 Results& ideal = record.ideal(i); |
|
60 Results& java = record.measured(i); // results for Java implementation |
|
61 |
|
62 Itk::Timestamp begin; |
|
63 Itk::getTimestamp(&begin); |
|
64 |
|
65 Results results(prepared, query); |
|
66 |
|
67 Itk::Timestamp end; |
|
68 Itk::getTimestamp(&end); |
|
69 |
|
70 Evaluation control( ideal, java ); |
|
71 Evaluation eval( ideal, results ); |
|
72 |
|
73 timeMs += Itk::getElapsedMs(&end, &begin); |
|
74 |
|
75 wprintf(L"Q '%S' - ", query); |
|
76 printTokens(queryAnalyzer?*queryAnalyzer:analyzer, query); |
|
77 |
|
78 wprintf(L"i:"); |
|
79 for (int i = 0; i < results.length(); i++) { |
|
80 if (ideal.hit(i)) { |
|
81 wprintf(L"X"); |
|
82 } else { |
|
83 wprintf(L"."); |
|
84 } |
|
85 } |
|
86 wprintf(L"\n"); |
|
87 |
|
88 wprintf(L"j:"); |
|
89 for (int i = 0; i < results.length(); i++) { |
|
90 if (java.hit(i)) { |
|
91 wprintf(L"X"); |
|
92 } else { |
|
93 wprintf(L"."); |
|
94 } |
|
95 } |
|
96 wprintf(L"\n"); |
|
97 |
|
98 wprintf(L"c:"); |
|
99 for (int i = 0; i < results.length(); i++) { |
|
100 if (eval.error(i) && !control.error(i)) { |
|
101 wprintf(L"!"); |
|
102 } else if (!eval.error(i) && control.error(i)){ |
|
103 wprintf(L"+"); |
|
104 } else { |
|
105 wprintf(L"."); |
|
106 } |
|
107 } |
|
108 wprintf(L"\n"); |
|
109 if ( eval.errors() == control.errors() ) { |
|
110 wprintf(L"ok\n"); |
|
111 } else if ( eval.errors() < control.errors() ) { |
|
112 wprintf(L"improved\n"); |
|
113 improved++; |
|
114 } else { |
|
115 wprintf(L"more errors!\n"); |
|
116 failed++; |
|
117 } |
|
118 wprintf(L"\n"); |
|
119 } |
|
120 |
|
121 wprintf(L"Index size was %d KB\n", prepared.indexSize() / 1000); |
|
122 wprintf(L"Improved in %d / %d\n", improved, record.length()); |
|
123 wprintf(L"Deteriorated in %d / %d\n", failed, record.length()); |
|
124 |
|
125 |
|
126 std::string title; |
|
127 title += testName; title += " search time"; |
|
128 ITK_REPORT( testMgr, title.c_str(), "%d ms / query", timeMs / record.length()); |
|
129 |
|
130 title = testName; title += " index size"; |
|
131 ITK_REPORT( testMgr, title.c_str(), "%d KB", prepared.indexSize() / 1000); |
|
132 } |
|
133 |
|
134 void ThaiEvaluation(Itk::TestMgr* testMgr) |
|
135 { |
|
136 ThaiAnalyzer analyzer; |
|
137 doEvaluate(testMgr, analyzer, "thai", CORPUS_DIR "thai/corpus.txt", CORPUS_DIR "thai/eval.txt"); |
|
138 } |
|
139 |
|
140 void GalicianEvaluation(Itk::TestMgr* testMgr) { |
|
141 // GalicianAnalyzer analyzer; |
|
142 standard::StandardAnalyzer analyzer; |
|
143 doEvaluate(testMgr, analyzer, "galician", CORPUS_DIR "galician/corpus.txt", CORPUS_DIR "galician/eval.txt"); |
|
144 } |
|
145 |
|
146 void KoreanCjkEvaluation(Itk::TestMgr* testMgr) { |
|
147 cjk::CJKAnalyzer analyzer; |
|
148 doEvaluate(testMgr, analyzer, "korean_cjk", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt"); |
|
149 } |
|
150 |
|
151 void KoreanBigramEvaluation(Itk::TestMgr* testMgr) { |
|
152 CjkNGramAnalyzer analyzer(2); |
|
153 doEvaluate(testMgr, analyzer, "korean_2gram", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt"); |
|
154 } |
|
155 |
|
156 void KoreanUnigramEvaluation(Itk::TestMgr* testMgr) { |
|
157 CjkNGramAnalyzer analyzer(1); |
|
158 doEvaluate(testMgr, analyzer, "korean_1gram", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt"); |
|
159 } |
|
160 |
|
161 void KoreanJamuUnigramEvaluation(Itk::TestMgr* testMgr) { |
|
162 JamuNGramAnalyzer analyzer(1); |
|
163 doEvaluate(testMgr, analyzer, "jamu_1gram", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt"); |
|
164 } |
|
165 |
|
166 void KoreanJamuBigramEvaluation(Itk::TestMgr* testMgr) { |
|
167 JamuNGramAnalyzer analyzer(2); |
|
168 doEvaluate(testMgr, analyzer, "jamu_2gram", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt"); |
|
169 } |
|
170 |
|
171 void KoreanEvaluation(Itk::TestMgr* testMgr) { |
|
172 KoreanAnalyzer analyzer; |
|
173 KoreanQueryAnalyzer queryAnalyzer; |
|
174 doEvaluate(testMgr, analyzer, "korean", CORPUS_DIR "korean/corpus.txt", CORPUS_DIR "korean/eval.txt", &queryAnalyzer); |
|
175 } |
|
176 |
|
177 void ChineseBigramEvaluation(Itk::TestMgr* testMgr) { |
|
178 CjkNGramAnalyzer analyzer(2); |
|
179 doEvaluate(testMgr, analyzer, "chinese_2gram", CORPUS_DIR "chinese_prc/corpus.txt", CORPUS_DIR "chinese_prc/eval.txt"); |
|
180 } |
|
181 |
|
182 void ChineseUnigramEvaluation(Itk::TestMgr* testMgr) { |
|
183 CjkNGramAnalyzer analyzer(1); |
|
184 doEvaluate(testMgr, analyzer, "chinese_1gram", CORPUS_DIR "chinese_prc/corpus.txt", CORPUS_DIR "chinese_prc/eval.txt"); |
|
185 } |
|
186 |
|
187 void HebrewEvaluation(Itk::TestMgr* testMgr) { |
|
188 HebrewAnalyzer analyzer; |
|
189 HebrewQueryAnalyzer queryAnalyzer; |
|
190 doEvaluate(testMgr, analyzer, "hebrew", CORPUS_DIR "hebrew/corpus.txt", CORPUS_DIR "hebrew/eval.txt", &queryAnalyzer, &queryAnalyzer); |
|
191 } |
|
192 |
|
193 void FrenchEvaluation(Itk::TestMgr* testMgr) { |
|
194 FrenchAnalyzer analyzer; |
|
195 doEvaluate(testMgr, analyzer, "french", CORPUS_DIR "french/corpus.txt", CORPUS_DIR "french/eval.txt", &analyzer, &analyzer); |
|
196 } |
|
197 |
|
198 Itk::TesterBase * CreateEvaluationTest() |
|
199 { |
|
200 using namespace Itk; |
|
201 |
|
202 SuiteTester |
|
203 * testSuite = |
|
204 new SuiteTester( "evaluation" ); |
|
205 |
|
206 testSuite->add( "thai", ThaiEvaluation, "thai" ); |
|
207 testSuite->add( "galician", GalicianEvaluation, "galician" ); |
|
208 testSuite->add( "korean_cjk", KoreanCjkEvaluation, "korean_cjk" ); |
|
209 testSuite->add( "korean", KoreanEvaluation, "korean" ); |
|
210 testSuite->add( "korean_1gram", KoreanUnigramEvaluation, "korean_1gram" ); |
|
211 testSuite->add( "korean_2gram", KoreanBigramEvaluation, "korean_2gram" ); |
|
212 testSuite->add( "chinese_1gram", ChineseUnigramEvaluation, "chinese_1gram" ); |
|
213 testSuite->add( "chinese_2gram", ChineseBigramEvaluation, "chinese_2gram" ); |
|
214 |
|
215 testSuite->add( "jamu_1gram", KoreanJamuUnigramEvaluation, "jamu_1gram" ); |
|
216 testSuite->add( "jamu_2gram", KoreanJamuBigramEvaluation, "jamu_2gram" ); |
|
217 |
|
218 testSuite->add( "hebrew", HebrewEvaluation, "hebrew" ); |
|
219 testSuite->add( "french", FrenchEvaluation, "french" ); |
|
220 |
|
221 return testSuite; |
|
222 } |
|
223 |