searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp
changeset 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,294 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Corpus loading, indexing, search and result evaluation helpers for the analysis unit tests.
+*
+*/
+
+
+#include "evaluationtool.h"
+#include "analysisunittest.h"
+
+#include "testutils.h"
+
+#include "cpixstrtools.h"
+
+#define MAX_LINE_LENGTH 512
+
+namespace evaluationtool {
+
+	using namespace lucene::analysis; 
+	using namespace lucene::util; 
+	using namespace lucene::index; 
+	using namespace lucene::store; 
+	using namespace lucene::search; 
+	using namespace lucene::document; 
+	using namespace lucene::queryParser; 
+
+
+	// Result-line markers used when parsing an evaluation record:
+	// a line counts as a measured hit when the character at HIT_MARK_IDX
+	// equals HIT_MARK_CHAR, and as an error (measured differs from ideal)
+	// when the character at ERROR_MARK_IDX equals ERROR_MARK_CHAR.
+	static const wchar_t HIT_MARK_CHAR = 'X';
+	static const int HIT_MARK_IDX = 2;
+	static const wchar_t ERROR_MARK_CHAR = '!';
+	static const int ERROR_MARK_IDX = 0;
+	
+	// Delimiter lines compared (exact match) against lines of a record file.
+	static const wchar_t* SUMMARY_STR = L"--- Summary ---";
+	static const wchar_t* SECTION_END_STR = L"--- Section End ---";
+	static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---";
+	static const wchar_t* FILE_END_STR = L"--- File End ---";
+	
+	// Names of the index fields: ID_FIELD holds the corpus line number as
+	// text, CONTENT_FIELD holds the corpus line itself.
+	static const wchar_t* ID_FIELD = L"id";
+	static const wchar_t* CONTENT_FIELD = L"content";
+
+	
+	// Loads a corpus from the given file, opened with the "UTF-8" encoding
+	// name. Every line that readLine() delivers and that is not empty
+	// becomes one corpus entry, in file order.
+	Corpus::Corpus(const char* file)
+	:	lines_() {
+		FileReader input(file, "UTF-8");
+
+		wchar_t buffer[MAX_LINE_LENGTH];
+
+		while (readLine(input, buffer, MAX_LINE_LENGTH)) {
+			// Empty lines are not stored.
+			if (buffer[0] == L'\0') {
+				continue;
+			}
+			lines_.push_back(std::wstring(buffer));
+		}
+	}
+	
+	// Returns the text of corpus line i. The pointer refers to storage
+	// held by this Corpus; no range check is performed on i.
+	const wchar_t* Corpus::operator[](int i) {
+		const std::wstring& entry = lines_[i];
+		return entry.c_str();
+	}
+	
+	// Number of lines stored in the corpus.
+	int Corpus::size() {
+		return static_cast<int>(lines_.size());
+	}
+
+	
+#define MAX_ID_LENGTH 10
+
+	// Indexes every line of `corpus` into the directory at INDEX_DIRECTORY
+	// and prepares a query parser and a searcher on top of that index.
+	//
+	// corpus         - lines to index; line i is stored with id "i".
+	// analyzer       - analyzer handed to the index writer; also used for
+	//                  query parsing when queryAnalyzer is NULL.
+	// queryAnalyzer  - optional analyzer for query parsing.
+	// prefixAnalyzer - optional analyzer kept for prefix ('*') queries.
+	PreparedCorpus::PreparedCorpus(Corpus& corpus, 	
+								   Analyzer& analyzer,
+								   Analyzer* queryAnalyzer,
+								   Analyzer* prefixAnalyzer)
+	:   size_( corpus.size() ),
+	    prefixAnalyzer_( prefixAnalyzer ),
+	    dir_() {
+
+		dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) );
+
+		IndexWriter indexWriter(dir_.get(), &analyzer, true, false);
+
+		const int docCount = corpus.size();
+		wchar_t idText[MAX_ID_LENGTH];
+
+		for (int docId = 0; docId < docCount; ++docId) {
+			// The id is stored but not indexed; the content is tokenized
+			// and indexed but not stored.
+			snwprintf(idText, MAX_ID_LENGTH, L"%d", docId);
+
+			Document doc;
+			doc.add(*new Field( ID_FIELD, idText, Field::INDEX_NO | Field::STORE_YES));
+			doc.add(*new Field( CONTENT_FIELD, corpus[docId], Field::INDEX_TOKENIZED | Field::STORE_NO));
+			indexWriter.addDocument(&doc);
+		}
+
+		indexWriter.optimize();
+		indexWriter.close();
+
+		// Fall back to the indexing analyzer when no query analyzer is given.
+		Analyzer* parserAnalyzer = &analyzer;
+		if (queryAnalyzer) {
+			parserAnalyzer = queryAnalyzer;
+		}
+		queryParser_.reset(new QueryParser(CONTENT_FIELD, parserAnalyzer));
+
+		searcher_.reset(new IndexSearcher(dir_.get()));
+	}
+	
+	// Number of corpus lines, as captured from corpus.size() at construction.
+	int PreparedCorpus::size() {
+		return size_;
+	}
+
+	// Returns the sum of fileLength() over every file the index directory
+	// lists.
+	int PreparedCorpus::indexSize() {
+		std::vector<std::string> files;
+		dir_->list(&files);
+
+		int total = 0;
+		for (std::vector<std::string>::const_iterator it = files.begin();
+			 it != files.end(); ++it) {
+			total += dir_->fileLength(it->c_str());
+		}
+		return total;
+	}
+
+	// Runs `query` against the prepared index and sets hits[id] for the
+	// stored id of every matching document. Bits already set in `hits`
+	// are left untouched.
+	//
+	// A query whose last non-whitespace character is '*' is treated as a
+	// single-word prefix query when a prefix analyzer was supplied;
+	// everything else goes through the query parser.
+	void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) {
+		int qlen = wcslen( query ); 
+		while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
+		auto_ptr<Query> q;
+		// qlen > 0 guard: an empty or all-whitespace query must not read query[-1].
+		if ( qlen > 0 && query[qlen-1] == '*' && prefixAnalyzer_ ) {
+			// Simplified prefix query parser.
+			// Copy everything before the trailing '*' into a string sized
+			// to fit, so long queries cannot overrun a fixed buffer. The
+			// string outlives the token stream that reads from it.
+			std::wstring prefix( query, qlen - 1 );
+			// Assume, that prefix query contains only one word
+			auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(prefix.c_str())) );
+			Token token; 
+			// Only build the query when the analyzer actually produced a
+			// token; otherwise q stays empty and nothing is reported.
+			if ( t->next(&token) ) {
+				Term* term = new Term( CONTENT_FIELD, token.termText() );
+				q.reset( new PrefixQuery( term ) );
+				_CLDECDELETE( term ); 
+			}
+		} else {
+			q.reset( queryParser_->parse(query) );
+		}
+		if ( q.get() ) {
+			auto_ptr<Hits> h( searcher_->search( q.get() ) );
+			for (int i = 0; i < h->length(); i++) {
+				int id = -1; 
+				Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
+				// Ignore ids outside the bitset instead of indexing out of range.
+				if ( id >= 0 && static_cast<size_t>(id) < hits.size() ) {
+					hits[id] = true; 
+				}
+			}
+		}
+	}
+
+	
+	// Builds a result set from an existing hit bitset; `lines` is the
+	// value later reported by length().
+	Results::Results(std::bitset<MAXLINES>& hits, int lines)
+	:	hits_(hits),
+		lines_(lines) {
+	}
+
+	// Builds an empty result set: no hits, length zero.
+	Results::Results()
+	:	hits_(),
+		lines_(0) {
+	}
+
+	// Builds a result set by running `query` against `corpus`; the length
+	// is the corpus size and the hits are whatever the search marks.
+	Results::Results(PreparedCorpus& corpus, 
+					 const wchar_t* query) 
+	:	hits_(),
+		lines_(corpus.size()) {
+		corpus.search(query, hits_);
+	}
+	
+	// Tells whether line i is marked as a hit. No range check is made on i.
+	bool Results::hit(int i) {
+		const bool marked = hits_[i];
+		return marked;
+	}
+	
+	// Records the hit state of the next line and grows the length by one.
+	// Once the bitset is full, further entries are dropped rather than
+	// written out of range.
+	void Results::append(bool hit) {
+		if (lines_ < 0 || static_cast<size_t>(lines_) >= hits_.size()) {
+			return;
+		}
+		hits_[lines_++] = hit;
+	}
+
+	
+	// Number of lines covered by this result set.
+	int Results::length() {
+		return lines_;
+	}
+	
+	// Builds an entry from a query string and its ideal and measured results.
+	EvaluationRecordEntry::EvaluationRecordEntry(
+		const wchar_t* query, 
+		Results& ideal, 
+		Results& measured)
+	:	query_( query ),
+		ideal_( ideal ),
+		measured_( measured ) {
+	}
+	
+	// Parses one search section from `reader`. The section is read as
+	// eight header lines (of which only the third, the query line, is
+	// used) followed by one line per corpus line, up to the section-end
+	// marker or the end of input.
+	//
+	// The query is the text after the first ':' on the query line, with
+	// leading spaces removed. For each result line, the hit mark gives the
+	// measured result and the error mark says the ideal result is the
+	// opposite of the measured one.
+	EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
+		// Zero-initialised so a failed read never leaves garbage to parse.
+		wchar_t line[MAX_LINE_LENGTH] = { 0 };
+
+		readLine(reader, line, MAX_LINE_LENGTH); // corpusName
+		readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
+		readLine(reader, line, MAX_LINE_LENGTH); // query
+		wchar_t* cut = line; while (*cut && *cut != ':') cut++;
+		// Step over the ':' only if one was found; otherwise cut already
+		// sits on the terminator and must not move past it.
+		if (*cut == ':') cut++;
+		while (*cut == ' ') cut++;
+		query_ = cut; 
+		readLine(reader, line, MAX_LINE_LENGTH); // status
+		readLine(reader, line, MAX_LINE_LENGTH); // hits
+		readLine(reader, line, MAX_LINE_LENGTH); // errors
+		readLine(reader, line, MAX_LINE_LENGTH); // false positives
+		readLine(reader, line, MAX_LINE_LENGTH); // false negatives
+	
+		while (readLine(reader, line, MAX_LINE_LENGTH)) {
+			if (wcscmp(line, SECTION_END_STR) == 0) break; 
+			// Lines too short to hold a mark are treated as unmarked
+			// instead of reading past the terminator.
+			const size_t len = wcslen(line);
+			bool found = len > static_cast<size_t>(HIT_MARK_IDX)
+				&& line[HIT_MARK_IDX] == HIT_MARK_CHAR;
+			bool error = len > static_cast<size_t>(ERROR_MARK_IDX)
+				&& line[ERROR_MARK_IDX] == ERROR_MARK_CHAR;
+
+			measured_.append(found);
+			ideal_.append((!error)?found:!found);
+		}
+	}
+
+	// Builds an empty entry: empty query, empty ideal and measured results.
+	EvaluationRecordEntry::EvaluationRecordEntry()
+	:	query_(),
+		ideal_(),
+		measured_() {
+	}
+		
+	// Loads an evaluation record from the given file, opened with the
+	// "UTF-8" encoding name. Summary sections are skipped, each search
+	// section becomes one entry, and reading stops at the file-end marker
+	// or when no more lines can be read.
+	EvaluationRecord::EvaluationRecord(const char* file) 
+	: 	entries_() {
+		FileReader input(file, "UTF-8");
+
+		wchar_t buffer[MAX_LINE_LENGTH];
+
+		while (readLine(input, buffer, MAX_LINE_LENGTH)) {
+			// Summary: consume lines up to and including its section end.
+			if (0 == wcscmp(buffer, SUMMARY_STR)) {
+				for (;;) {
+					if (!readLine(input, buffer, MAX_LINE_LENGTH)) break;
+					if (0 == wcscmp(buffer, SECTION_END_STR)) break;
+				}
+			}
+
+			// End-of-file marker
+			if (0 == wcscmp(buffer, FILE_END_STR)) {
+				break;
+			}
+
+			// Search section: the entry constructor reads the section body.
+			if (0 == wcscmp(buffer, SEARCH_SECTION_STR)) {
+				entries_.push_back( EvaluationRecordEntry( input ) );
+			}
+		}
+	}
+		
+	// Number of search entries in the record.
+	int EvaluationRecord::length() {
+		return static_cast<int>(entries_.size());
+	}
+			
+	// Query text of entry i. The pointer refers to storage held by the
+	// entry; no range check is made on i.
+	const wchar_t* EvaluationRecord::query(int i) {
+		EvaluationRecordEntry& entry = entries_[i];
+		return entry.query_.c_str();
+	}
+			
+	// Ideal (expected) results of entry i. No range check is made on i.
+	Results& EvaluationRecord::ideal(int i) {
+		EvaluationRecordEntry& entry = entries_[i];
+		return entry.ideal_;
+	}
+			
+	// Measured results of entry i. No range check is made on i.
+	Results& EvaluationRecord::measured(int i) {
+		EvaluationRecordEntry& entry = entries_[i];
+		return entry.measured_;
+	}
+		
+	// Pairs an ideal result set with a measured one for comparison.
+	Evaluation::Evaluation(Results& ideal, Results& measured) 
+	:	ideal_( ideal ),
+		measured_( measured ) {}
+
+	// True when the line was measured as a hit although ideally it is not.
+	bool Evaluation::falsePositive(int line) {
+		const bool expected = ideal_.hit(line);
+		const bool actual = measured_.hit(line);
+		return actual && !expected;
+	}
+
+	// True when the line is ideally a hit but was not measured as one.
+	bool Evaluation::falseNegative(int line) {
+		const bool expected = ideal_.hit(line);
+		const bool actual = measured_.hit(line);
+		return expected && !actual;
+	}
+
+	// True when the ideal and measured results disagree for the line.
+	bool Evaluation::error(int line) 
+	{
+		const bool expected = ideal_.hit(line);
+		const bool actual = measured_.hit(line);
+		return expected != actual;
+	}
+		
+	// Counts the lines, over the length of the ideal results, on which
+	// ideal and measured results disagree.
+	int Evaluation::errors()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int row = 0; row < total; ++row) {
+			count += error(row) ? 1 : 0;
+		}
+		return count;
+	}
+			
+	// Counts the false positives over the length of the ideal results.
+	int Evaluation::falsePositives()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int row = 0; row < total; ++row) {
+			count += falsePositive(row) ? 1 : 0;
+		}
+		return count;
+	}
+			
+	// Counts the false negatives over the length of the ideal results.
+	int Evaluation::falseNegatives()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int row = 0; row < total; ++row) {
+			count += falseNegative(row) ? 1 : 0;
+		}
+		return count;
+	}
+		
+}