searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 #include "evaluationtool.h"
       
    20 #include "analysisunittest.h"
       
    21 
       
    22 #include "testutils.h"
       
    23 
       
    24 #include "cpixstrtools.h"
       
    25 
       
    26 #define MAX_LINE_LENGTH 512
       
    27 
       
    28 namespace evaluationtool {
       
    29 
       
    30 	using namespace lucene::analysis; 
       
    31 	using namespace lucene::util; 
       
    32 	using namespace lucene::index; 
       
    33 	using namespace lucene::store; 
       
    34 	using namespace lucene::search; 
       
    35 	using namespace lucene::document; 
       
    36 	using namespace lucene::queryParser; 
       
    37 
       
    38 
       
    // Markers inspected on each result row of an evaluation record: a row
    // whose character at HIT_MARK_IDX equals HIT_MARK_CHAR was found by the
    // search, and a row whose character at ERROR_MARK_IDX equals
    // ERROR_MARK_CHAR is one where the search disagreed with the ideal result.
    static const wchar_t HIT_MARK_CHAR = L'X';
    static const int HIT_MARK_IDX = 2;
    static const wchar_t ERROR_MARK_CHAR = L'!';
    static const int ERROR_MARK_IDX = 0;

    // Delimiter lines recognised while parsing an evaluation record file.
    static const wchar_t* SUMMARY_STR = L"--- Summary ---";
    static const wchar_t* SECTION_END_STR = L"--- Section End ---";
    static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---";
    static const wchar_t* FILE_END_STR = L"--- File End ---";

    // Field names of the documents written to the index.
    static const wchar_t* ID_FIELD = L"id";
    static const wchar_t* CONTENT_FIELD = L"content";
       
    51 
       
    52 	
       
    53 	Corpus::Corpus(const char* file)
       
    54 	:	lines_() {
       
    55 		FileReader reader(file, "UTF-8");
       
    56 			
       
    57 		wchar_t line[MAX_LINE_LENGTH]; 
       
    58 	
       
    59 		while (readLine(reader, line, MAX_LINE_LENGTH)) {
       
    60 			if (wcslen(line)) lines_.push_back( std::wstring( line ) ); 
       
    61 		}
       
    62 	}
       
    63 	
       
    64 	const wchar_t* Corpus::operator[](int i) {
       
    65 		return lines_[i].c_str();
       
    66 	}
       
    67 	
       
    68 	int Corpus::size() {
       
    69 		return lines_.size();
       
    70 	}
       
    71 
       
    72 	
       
    73 #define MAX_ID_LENGTH 10
       
    74 
       
    75 	PreparedCorpus::PreparedCorpus(Corpus& corpus, 	
       
    76 								   Analyzer& analyzer,
       
    77 								   Analyzer* queryAnalyzer,
       
    78 								   Analyzer* prefixAnalyzer)
       
    79 	:   size_( corpus.size() ),
       
    80 	    prefixAnalyzer_( prefixAnalyzer ),
       
    81 	    dir_() {
       
    82 		
       
    83 		dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) ); 
       
    84 		
       
    85 		IndexWriter writer(dir_.get(), &analyzer, true, false);
       
    86 
       
    87 		wchar_t id[MAX_ID_LENGTH];
       
    88 		
       
    89 		for (int i = 0; i < corpus.size(); i++) {
       
    90 			Document doc;
       
    91 			snwprintf(id, MAX_ID_LENGTH, L"%d", i);
       
    92 			doc.add(*new Field( ID_FIELD, id, Field::INDEX_NO | Field::STORE_YES));
       
    93 			doc.add(*new Field( CONTENT_FIELD, corpus[i], Field::INDEX_TOKENIZED | Field::STORE_NO));
       
    94 			writer.addDocument(&doc);
       
    95 		}
       
    96 
       
    97 		writer.optimize(); 
       
    98 		writer.close();
       
    99 		
       
   100 		queryParser_.reset(new QueryParser(CONTENT_FIELD, queryAnalyzer ? queryAnalyzer : &analyzer));
       
   101 		
       
   102 		searcher_.reset(new IndexSearcher(dir_.get()));
       
   103 	}
       
   104 	
       
   105 	int PreparedCorpus::size() {
       
   106 		return size_; 
       
   107 	}
       
   108 
       
   109 	int PreparedCorpus::indexSize() {
       
   110         std::vector<std::string> v;
       
   111         dir_->list(&v);
       
   112         int ret = 0;
       
   113         for (int i = 0; i < v.size(); i++) {
       
   114             ret += dir_->fileLength(v[i].c_str());
       
   115         }
       
   116         return ret;
       
   117     }
       
   118 
       
   119 	void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) {
       
   120 		int qlen = wcslen( query ); 
       
   121 		while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
       
   122 		auto_ptr<Query> q;
       
   123 		if ( query[qlen-1] == '*' && prefixAnalyzer_ ) {
       
   124 			// Simplified prefix query parser
       
   125 			wchar_t buf[512]; 
       
   126 			memcpy(buf, query, sizeof(wchar_t)*(qlen-1)); 
       
   127 			buf[qlen-1] = '\0'; 
       
   128 			// Assume, that prefix query contains only one word
       
   129 			auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(buf)) );
       
   130 			Token token; 
       
   131 			t->next(&token);
       
   132 			Term* term = new Term( CONTENT_FIELD, token.termText() );
       
   133 			q.reset( new PrefixQuery( term ) );
       
   134 			_CLDECDELETE( term ); 
       
   135 		} else {
       
   136 			q.reset( queryParser_->parse(query) );
       
   137 		}
       
   138 		if ( q.get() ) {
       
   139 			auto_ptr<Hits> h( searcher_->search( q.get() ) );
       
   140 			for (int i = 0; i < h->length(); i++) {
       
   141 				int id; 
       
   142 				Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
       
   143 				hits[id] = true; 
       
   144 			}
       
   145 		}
       
   146 	}
       
   147 
       
   148 	
       
   149 	Results::Results(std::bitset<MAXLINES>& hits, int lines)
       
   150 	:	hits_(hits), lines_(lines) {}
       
   151 
       
   152 	Results::Results() 
       
   153 	:	hits_(), lines_(0) {}
       
   154 
       
   155 	Results::Results(PreparedCorpus& corpus, 
       
   156 					 const wchar_t* query) 
       
   157 	: 	lines_(corpus.size()) {
       
   158 		corpus.search(query, hits_);
       
   159 	}
       
   160 	
       
   161 	bool Results::hit(int i) {
       
   162 		return hits_[i]; 
       
   163 	}
       
   164 	
       
   165 	void Results::append(bool hit) {
       
   166 		hits_[lines_++] = hit;
       
   167 	}
       
   168 
       
   169 	
       
   170 	int Results::length() {
       
   171 		return lines_;
       
   172 	}
       
   173 	
       
   174 	EvaluationRecordEntry::EvaluationRecordEntry(
       
   175 		const wchar_t* query, 
       
   176 		Results& ideal, 
       
   177 		Results& measured)
       
   178 	: query_( query ), 
       
   179 	  ideal_( ideal ), 
       
   180 	  measured_( measured ) {}
       
   181 	
       
   182 	EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
       
   183 		wchar_t line[MAX_LINE_LENGTH];
       
   184 
       
   185 		readLine(reader, line, MAX_LINE_LENGTH); // corpusName
       
   186 		readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
       
   187 		readLine(reader, line, MAX_LINE_LENGTH); // query
       
   188 		wchar_t* cut = line; while (*cut && *cut != ':') cut++;
       
   189 		cut++; while (*cut == ' ') cut++;
       
   190 		query_ = cut; 
       
   191 		readLine(reader, line, MAX_LINE_LENGTH); // status
       
   192 		readLine(reader, line, MAX_LINE_LENGTH); // hits
       
   193 		readLine(reader, line, MAX_LINE_LENGTH); // errors
       
   194 		readLine(reader, line, MAX_LINE_LENGTH); // false positives
       
   195 		readLine(reader, line, MAX_LINE_LENGTH); // false negatives
       
   196 	
       
   197 		while (readLine(reader, line, MAX_LINE_LENGTH)) {
       
   198 			if (wcscmp(line, SECTION_END_STR) == 0) break; 
       
   199 			bool found = (line[HIT_MARK_IDX] == HIT_MARK_CHAR);   
       
   200 			bool error = (line[ERROR_MARK_IDX] == ERROR_MARK_CHAR);   
       
   201 
       
   202 			measured_.append(found);
       
   203 			ideal_.append((!error)?found:!found);
       
   204 		}
       
   205 	}
       
   206 
       
   207 	EvaluationRecordEntry::EvaluationRecordEntry()
       
   208 	: query_(), ideal_(), measured_() {}
       
   209 		
       
   210 	EvaluationRecord::EvaluationRecord(const char* file) 
       
   211 	: 	entries_() {
       
   212 		FileReader reader(file, "UTF-8");
       
   213 	
       
   214 		wchar_t line[MAX_LINE_LENGTH];
       
   215 
       
   216 		while (readLine(reader, line, MAX_LINE_LENGTH)) {
       
   217 			// Skip summary
       
   218 			if (wcscmp(line, SUMMARY_STR) == 0) {
       
   219 				while (readLine(reader, line, MAX_LINE_LENGTH) 
       
   220 					&& wcscmp(line, SECTION_END_STR) != 0); 
       
   221 			}
       
   222 
       
   223 			// Eof
       
   224 			if (wcscmp(line, FILE_END_STR) == 0) break;
       
   225 			
       
   226 			// Search section
       
   227 			if (wcscmp(line, SEARCH_SECTION_STR) == 0) {
       
   228 				entries_.push_back( EvaluationRecordEntry( reader ) );
       
   229 			}
       
   230 		}
       
   231 	}
       
   232 		
       
   233 	int EvaluationRecord::length() {
       
   234 		return entries_.size(); 
       
   235 	}
       
   236 			
       
   237 	const wchar_t* EvaluationRecord::query(int i) {
       
   238 		return entries_[i].query_.c_str(); 
       
   239 	}
       
   240 			
       
   241 	Results& EvaluationRecord::ideal(int i) {
       
   242 		return entries_[i].ideal_; 
       
   243 	}
       
   244 			
       
   245 	Results& EvaluationRecord::measured(int i) {
       
   246 		return entries_[i].measured_; 
       
   247 	}
       
   248 		
       
   249 	Evaluation::Evaluation(Results& ideal, Results& measured) 
       
   250 	: ideal_( ideal ), 
       
   251 	  measured_( measured ) {
       
   252 	}
       
   253 
       
   254 	bool Evaluation::falsePositive(int line) {
       
   255 		return (!ideal_.hit(line))&&measured_.hit(line);
       
   256 	}
       
   257 
       
   258 	bool Evaluation::falseNegative(int line) {
       
   259 		return ideal_.hit(line)&&(!measured_.hit(line));
       
   260 	}
       
   261 
       
   262 	bool Evaluation::error(int line) 
       
   263 	{
       
   264 		return (ideal_.hit(line)!=measured_.hit(line)?1:0);
       
   265 	}
       
   266 		
       
   267 	int Evaluation::errors()
       
   268 	{
       
   269 		int ret = 0;
       
   270 		for (int i = 0; i < ideal_.length(); i++) {
       
   271 			if (error(i)) ret++; 
       
   272 		}
       
   273 		return ret;
       
   274 	}
       
   275 			
       
   276 	int Evaluation::falsePositives()
       
   277 	{
       
   278 		int ret = 0;
       
   279 		for (int i = 0; i < ideal_.length(); i++) {
       
   280 			if (falsePositive(i)) ret++; 
       
   281 		}
       
   282 		return ret;
       
   283 	}
       
   284 			
       
   285 	int Evaluation::falseNegatives()
       
   286 	{
       
   287 		int ret = 0;
       
   288 		for (int i = 0; i < ideal_.length(); i++) {
       
   289 			if (falseNegative(i)) ret++;
       
   290 		}
       
   291 		return ret;
       
   292 	}
       
   293 		
       
   294 }