|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 #include "evaluationtool.h" |
|
20 #include "analysisunittest.h" |
|
21 |
|
22 #include "testutils.h" |
|
23 |
|
24 #include "cpixstrtools.h" |
|
25 |
|
26 #define MAX_LINE_LENGTH 512 |
|
27 |
|
28 namespace evaluationtool { |
|
29 |
|
30 using namespace lucene::analysis; |
|
31 using namespace lucene::util; |
|
32 using namespace lucene::index; |
|
33 using namespace lucene::store; |
|
34 using namespace lucene::search; |
|
35 using namespace lucene::document; |
|
36 using namespace lucene::queryParser; |
|
37 |
|
38 |
|
// Markers inspected on each result line of an evaluation record:
// the character at HIT_MARK_IDX flags a hit, the character at
// ERROR_MARK_IDX flags a line whose measured result was wrong.
static const wchar_t HIT_MARK_CHAR   = L'X';
static const int     HIT_MARK_IDX    = 2;
static const wchar_t ERROR_MARK_CHAR = L'!';
static const int     ERROR_MARK_IDX  = 0;

// Section delimiter lines recognised when parsing an evaluation record.
static const wchar_t* SUMMARY_STR        = L"--- Summary ---";
static const wchar_t* SECTION_END_STR    = L"--- Section End ---";
static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---";
static const wchar_t* FILE_END_STR       = L"--- File End ---";

// Names of the index fields: the stored line number and the tokenized text.
static const wchar_t* ID_FIELD      = L"id";
static const wchar_t* CONTENT_FIELD = L"content";
|
51 |
|
52 |
|
53 Corpus::Corpus(const char* file) |
|
54 : lines_() { |
|
55 FileReader reader(file, "UTF-8"); |
|
56 |
|
57 wchar_t line[MAX_LINE_LENGTH]; |
|
58 |
|
59 while (readLine(reader, line, MAX_LINE_LENGTH)) { |
|
60 if (wcslen(line)) lines_.push_back( std::wstring( line ) ); |
|
61 } |
|
62 } |
|
63 |
|
64 const wchar_t* Corpus::operator[](int i) { |
|
65 return lines_[i].c_str(); |
|
66 } |
|
67 |
|
68 int Corpus::size() { |
|
69 return lines_.size(); |
|
70 } |
|
71 |
|
72 |
|
73 #define MAX_ID_LENGTH 10 |
|
74 |
|
75 PreparedCorpus::PreparedCorpus(Corpus& corpus, |
|
76 Analyzer& analyzer, |
|
77 Analyzer* queryAnalyzer, |
|
78 Analyzer* prefixAnalyzer) |
|
79 : size_( corpus.size() ), |
|
80 prefixAnalyzer_( prefixAnalyzer ), |
|
81 dir_() { |
|
82 |
|
83 dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) ); |
|
84 |
|
85 IndexWriter writer(dir_.get(), &analyzer, true, false); |
|
86 |
|
87 wchar_t id[MAX_ID_LENGTH]; |
|
88 |
|
89 for (int i = 0; i < corpus.size(); i++) { |
|
90 Document doc; |
|
91 snwprintf(id, MAX_ID_LENGTH, L"%d", i); |
|
92 doc.add(*new Field( ID_FIELD, id, Field::INDEX_NO | Field::STORE_YES)); |
|
93 doc.add(*new Field( CONTENT_FIELD, corpus[i], Field::INDEX_TOKENIZED | Field::STORE_NO)); |
|
94 writer.addDocument(&doc); |
|
95 } |
|
96 |
|
97 writer.optimize(); |
|
98 writer.close(); |
|
99 |
|
100 queryParser_.reset(new QueryParser(CONTENT_FIELD, queryAnalyzer ? queryAnalyzer : &analyzer)); |
|
101 |
|
102 searcher_.reset(new IndexSearcher(dir_.get())); |
|
103 } |
|
104 |
|
105 int PreparedCorpus::size() { |
|
106 return size_; |
|
107 } |
|
108 |
|
109 int PreparedCorpus::indexSize() { |
|
110 std::vector<std::string> v; |
|
111 dir_->list(&v); |
|
112 int ret = 0; |
|
113 for (int i = 0; i < v.size(); i++) { |
|
114 ret += dir_->fileLength(v[i].c_str()); |
|
115 } |
|
116 return ret; |
|
117 } |
|
118 |
|
119 void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) { |
|
120 int qlen = wcslen( query ); |
|
121 while (qlen > 0 && iswspace(query[qlen-1])) qlen--; |
|
122 auto_ptr<Query> q; |
|
123 if ( query[qlen-1] == '*' && prefixAnalyzer_ ) { |
|
124 // Simplified prefix query parser |
|
125 wchar_t buf[512]; |
|
126 memcpy(buf, query, sizeof(wchar_t)*(qlen-1)); |
|
127 buf[qlen-1] = '\0'; |
|
128 // Assume, that prefix query contains only one word |
|
129 auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(buf)) ); |
|
130 Token token; |
|
131 t->next(&token); |
|
132 Term* term = new Term( CONTENT_FIELD, token.termText() ); |
|
133 q.reset( new PrefixQuery( term ) ); |
|
134 _CLDECDELETE( term ); |
|
135 } else { |
|
136 q.reset( queryParser_->parse(query) ); |
|
137 } |
|
138 if ( q.get() ) { |
|
139 auto_ptr<Hits> h( searcher_->search( q.get() ) ); |
|
140 for (int i = 0; i < h->length(); i++) { |
|
141 int id; |
|
142 Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD)); |
|
143 hits[id] = true; |
|
144 } |
|
145 } |
|
146 } |
|
147 |
|
148 |
|
149 Results::Results(std::bitset<MAXLINES>& hits, int lines) |
|
150 : hits_(hits), lines_(lines) {} |
|
151 |
|
152 Results::Results() |
|
153 : hits_(), lines_(0) {} |
|
154 |
|
155 Results::Results(PreparedCorpus& corpus, |
|
156 const wchar_t* query) |
|
157 : lines_(corpus.size()) { |
|
158 corpus.search(query, hits_); |
|
159 } |
|
160 |
|
161 bool Results::hit(int i) { |
|
162 return hits_[i]; |
|
163 } |
|
164 |
|
165 void Results::append(bool hit) { |
|
166 hits_[lines_++] = hit; |
|
167 } |
|
168 |
|
169 |
|
170 int Results::length() { |
|
171 return lines_; |
|
172 } |
|
173 |
|
174 EvaluationRecordEntry::EvaluationRecordEntry( |
|
175 const wchar_t* query, |
|
176 Results& ideal, |
|
177 Results& measured) |
|
178 : query_( query ), |
|
179 ideal_( ideal ), |
|
180 measured_( measured ) {} |
|
181 |
|
182 EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) { |
|
183 wchar_t line[MAX_LINE_LENGTH]; |
|
184 |
|
185 readLine(reader, line, MAX_LINE_LENGTH); // corpusName |
|
186 readLine(reader, line, MAX_LINE_LENGTH); // analyzerName |
|
187 readLine(reader, line, MAX_LINE_LENGTH); // query |
|
188 wchar_t* cut = line; while (*cut && *cut != ':') cut++; |
|
189 cut++; while (*cut == ' ') cut++; |
|
190 query_ = cut; |
|
191 readLine(reader, line, MAX_LINE_LENGTH); // status |
|
192 readLine(reader, line, MAX_LINE_LENGTH); // hits |
|
193 readLine(reader, line, MAX_LINE_LENGTH); // errors |
|
194 readLine(reader, line, MAX_LINE_LENGTH); // false positives |
|
195 readLine(reader, line, MAX_LINE_LENGTH); // false negatives |
|
196 |
|
197 while (readLine(reader, line, MAX_LINE_LENGTH)) { |
|
198 if (wcscmp(line, SECTION_END_STR) == 0) break; |
|
199 bool found = (line[HIT_MARK_IDX] == HIT_MARK_CHAR); |
|
200 bool error = (line[ERROR_MARK_IDX] == ERROR_MARK_CHAR); |
|
201 |
|
202 measured_.append(found); |
|
203 ideal_.append((!error)?found:!found); |
|
204 } |
|
205 } |
|
206 |
|
207 EvaluationRecordEntry::EvaluationRecordEntry() |
|
208 : query_(), ideal_(), measured_() {} |
|
209 |
|
210 EvaluationRecord::EvaluationRecord(const char* file) |
|
211 : entries_() { |
|
212 FileReader reader(file, "UTF-8"); |
|
213 |
|
214 wchar_t line[MAX_LINE_LENGTH]; |
|
215 |
|
216 while (readLine(reader, line, MAX_LINE_LENGTH)) { |
|
217 // Skip summary |
|
218 if (wcscmp(line, SUMMARY_STR) == 0) { |
|
219 while (readLine(reader, line, MAX_LINE_LENGTH) |
|
220 && wcscmp(line, SECTION_END_STR) != 0); |
|
221 } |
|
222 |
|
223 // Eof |
|
224 if (wcscmp(line, FILE_END_STR) == 0) break; |
|
225 |
|
226 // Search section |
|
227 if (wcscmp(line, SEARCH_SECTION_STR) == 0) { |
|
228 entries_.push_back( EvaluationRecordEntry( reader ) ); |
|
229 } |
|
230 } |
|
231 } |
|
232 |
|
233 int EvaluationRecord::length() { |
|
234 return entries_.size(); |
|
235 } |
|
236 |
|
237 const wchar_t* EvaluationRecord::query(int i) { |
|
238 return entries_[i].query_.c_str(); |
|
239 } |
|
240 |
|
241 Results& EvaluationRecord::ideal(int i) { |
|
242 return entries_[i].ideal_; |
|
243 } |
|
244 |
|
245 Results& EvaluationRecord::measured(int i) { |
|
246 return entries_[i].measured_; |
|
247 } |
|
248 |
|
249 Evaluation::Evaluation(Results& ideal, Results& measured) |
|
250 : ideal_( ideal ), |
|
251 measured_( measured ) { |
|
252 } |
|
253 |
|
254 bool Evaluation::falsePositive(int line) { |
|
255 return (!ideal_.hit(line))&&measured_.hit(line); |
|
256 } |
|
257 |
|
258 bool Evaluation::falseNegative(int line) { |
|
259 return ideal_.hit(line)&&(!measured_.hit(line)); |
|
260 } |
|
261 |
|
262 bool Evaluation::error(int line) |
|
263 { |
|
264 return (ideal_.hit(line)!=measured_.hit(line)?1:0); |
|
265 } |
|
266 |
|
267 int Evaluation::errors() |
|
268 { |
|
269 int ret = 0; |
|
270 for (int i = 0; i < ideal_.length(); i++) { |
|
271 if (error(i)) ret++; |
|
272 } |
|
273 return ret; |
|
274 } |
|
275 |
|
276 int Evaluation::falsePositives() |
|
277 { |
|
278 int ret = 0; |
|
279 for (int i = 0; i < ideal_.length(); i++) { |
|
280 if (falsePositive(i)) ret++; |
|
281 } |
|
282 return ret; |
|
283 } |
|
284 |
|
285 int Evaluation::falseNegatives() |
|
286 { |
|
287 int ret = 0; |
|
288 for (int i = 0; i < ideal_.length(); i++) { |
|
289 if (falseNegative(i)) ret++; |
|
290 } |
|
291 return ret; |
|
292 } |
|
293 |
|
294 } |