searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/


#include "evaluationtool.h"
#include "analysisunittest.h"

#include "testutils.h"

#include "cpixstrtools.h"

#define MAX_LINE_LENGTH 512

namespace evaluationtool {

	using namespace lucene::analysis; 
	using namespace lucene::util; 
	using namespace lucene::index; 
	using namespace lucene::store; 
	using namespace lucene::search; 
	using namespace lucene::document; 
	using namespace lucene::queryParser; 


	static const wchar_t HIT_MARK_CHAR = 'X';
	static const int HIT_MARK_IDX = 2;
	static const wchar_t ERROR_MARK_CHAR = '!';
	static const int ERROR_MARK_IDX = 0;
	
	static const wchar_t* SUMMARY_STR = L"--- Summary ---";
	static const wchar_t* SECTION_END_STR = L"--- Section End ---";
	static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---";
	static const wchar_t* FILE_END_STR = L"--- File End ---";
	
	static const wchar_t* ID_FIELD = L"id";
	static const wchar_t* CONTENT_FIELD = L"content";

	
	Corpus::Corpus(const char* file)
	:	lines_() {
		FileReader reader(file, "UTF-8");
			
		wchar_t line[MAX_LINE_LENGTH]; 
	
		while (readLine(reader, line, MAX_LINE_LENGTH)) {
			if (wcslen(line)) lines_.push_back( std::wstring( line ) ); 
		}
	}
	
	const wchar_t* Corpus::operator[](int i) {
		return lines_[i].c_str();
	}
	
	int Corpus::size() {
		return lines_.size();
	}

	
#define MAX_ID_LENGTH 10

	PreparedCorpus::PreparedCorpus(Corpus& corpus, 	
								   Analyzer& analyzer,
								   Analyzer* queryAnalyzer,
								   Analyzer* prefixAnalyzer)
	:   size_( corpus.size() ),
	    prefixAnalyzer_( prefixAnalyzer ),
	    dir_() {
		
		dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) ); 
		
		IndexWriter writer(dir_.get(), &analyzer, true, false);

		wchar_t id[MAX_ID_LENGTH];
		
		for (int i = 0; i < corpus.size(); i++) {
			Document doc;
			snwprintf(id, MAX_ID_LENGTH, L"%d", i);
			doc.add(*new Field( ID_FIELD, id, Field::INDEX_NO | Field::STORE_YES));
			doc.add(*new Field( CONTENT_FIELD, corpus[i], Field::INDEX_TOKENIZED | Field::STORE_NO));
			writer.addDocument(&doc);
		}

		writer.optimize(); 
		writer.close();
		
		queryParser_.reset(new QueryParser(CONTENT_FIELD, queryAnalyzer ? queryAnalyzer : &analyzer));
		
		searcher_.reset(new IndexSearcher(dir_.get()));
	}
	
	int PreparedCorpus::size() {
		return size_; 
	}

	int PreparedCorpus::indexSize() {
        std::vector<std::string> v;
        dir_->list(&v);
        int ret = 0;
        for (int i = 0; i < v.size(); i++) {
            ret += dir_->fileLength(v[i].c_str());
        }
        return ret;
    }

	void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) {
		int qlen = wcslen( query ); 
		while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
		auto_ptr<Query> q;
		if ( query[qlen-1] == '*' && prefixAnalyzer_ ) {
			// Simplified prefix query parser
			wchar_t buf[512]; 
			memcpy(buf, query, sizeof(wchar_t)*(qlen-1)); 
			buf[qlen-1] = '\0'; 
			// Assume, that prefix query contains only one word
			auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(buf)) );
			Token token; 
			t->next(&token);
			Term* term = new Term( CONTENT_FIELD, token.termText() );
			q.reset( new PrefixQuery( term ) );
			_CLDECDELETE( term ); 
		} else {
			q.reset( queryParser_->parse(query) );
		}
		if ( q.get() ) {
			auto_ptr<Hits> h( searcher_->search( q.get() ) );
			for (int i = 0; i < h->length(); i++) {
				int id; 
				Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
				hits[id] = true; 
			}
		}
	}

	
	Results::Results(std::bitset<MAXLINES>& hits, int lines)
	:	hits_(hits), lines_(lines) {}

	Results::Results() 
	:	hits_(), lines_(0) {}

	Results::Results(PreparedCorpus& corpus, 
					 const wchar_t* query) 
	: 	lines_(corpus.size()) {
		corpus.search(query, hits_);
	}
	
	bool Results::hit(int i) {
		return hits_[i]; 
	}
	
	void Results::append(bool hit) {
		hits_[lines_++] = hit;
	}

	
	int Results::length() {
		return lines_;
	}
	
	EvaluationRecordEntry::EvaluationRecordEntry(
		const wchar_t* query, 
		Results& ideal, 
		Results& measured)
	: query_( query ), 
	  ideal_( ideal ), 
	  measured_( measured ) {}
	
	EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
		wchar_t line[MAX_LINE_LENGTH];

		readLine(reader, line, MAX_LINE_LENGTH); // corpusName
		readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
		readLine(reader, line, MAX_LINE_LENGTH); // query
		wchar_t* cut = line; while (*cut && *cut != ':') cut++;
		cut++; while (*cut == ' ') cut++;
		query_ = cut; 
		readLine(reader, line, MAX_LINE_LENGTH); // status
		readLine(reader, line, MAX_LINE_LENGTH); // hits
		readLine(reader, line, MAX_LINE_LENGTH); // errors
		readLine(reader, line, MAX_LINE_LENGTH); // false positives
		readLine(reader, line, MAX_LINE_LENGTH); // false negatives
	
		while (readLine(reader, line, MAX_LINE_LENGTH)) {
			if (wcscmp(line, SECTION_END_STR) == 0) break; 
			bool found = (line[HIT_MARK_IDX] == HIT_MARK_CHAR);   
			bool error = (line[ERROR_MARK_IDX] == ERROR_MARK_CHAR);   

			measured_.append(found);
			ideal_.append((!error)?found:!found);
		}
	}

	EvaluationRecordEntry::EvaluationRecordEntry()
	: query_(), ideal_(), measured_() {}
		
	EvaluationRecord::EvaluationRecord(const char* file) 
	: 	entries_() {
		FileReader reader(file, "UTF-8");
	
		wchar_t line[MAX_LINE_LENGTH];

		while (readLine(reader, line, MAX_LINE_LENGTH)) {
			// Skip summary
			if (wcscmp(line, SUMMARY_STR) == 0) {
				while (readLine(reader, line, MAX_LINE_LENGTH) 
					&& wcscmp(line, SECTION_END_STR) != 0); 
			}

			// Eof
			if (wcscmp(line, FILE_END_STR) == 0) break;
			
			// Search section
			if (wcscmp(line, SEARCH_SECTION_STR) == 0) {
				entries_.push_back( EvaluationRecordEntry( reader ) );
			}
		}
	}
		
	int EvaluationRecord::length() {
		return entries_.size(); 
	}
			
	const wchar_t* EvaluationRecord::query(int i) {
		return entries_[i].query_.c_str(); 
	}
			
	Results& EvaluationRecord::ideal(int i) {
		return entries_[i].ideal_; 
	}
			
	Results& EvaluationRecord::measured(int i) {
		return entries_[i].measured_; 
	}
		
	Evaluation::Evaluation(Results& ideal, Results& measured) 
	: ideal_( ideal ), 
	  measured_( measured ) {
	}

	bool Evaluation::falsePositive(int line) {
		return (!ideal_.hit(line))&&measured_.hit(line);
	}

	bool Evaluation::falseNegative(int line) {
		return ideal_.hit(line)&&(!measured_.hit(line));
	}

	bool Evaluation::error(int line) 
	{
		return (ideal_.hit(line)!=measured_.hit(line)?1:0);
	}
		
	int Evaluation::errors()
	{
		int ret = 0;
		for (int i = 0; i < ideal_.length(); i++) {
			if (error(i)) ret++; 
		}
		return ret;
	}
			
	int Evaluation::falsePositives()
	{
		int ret = 0;
		for (int i = 0; i < ideal_.length(); i++) {
			if (falsePositive(i)) ret++; 
		}
		return ret;
	}
			
	int Evaluation::falseNegatives()
	{
		int ret = 0;
		for (int i = 0; i < ideal_.length(); i++) {
			if (falseNegative(i)) ret++;
		}
		return ret;
	}
		
}