FCL/sf/mw/searchsrv: comparison searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp

equal deleted inserted replaced

-:d4d56f5e7c55
+:65456528cac2
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+#include "evaluationtool.h"
+#include "analysisunittest.h"
+#include "testutils.h"
+#include "cpixstrtools.h"
+#define MAX_LINE_LENGTH 512
+namespace evaluationtool {
+	using namespace lucene::analysis;
+	using namespace lucene::util;
+	using namespace lucene::index;
+	using namespace lucene::store;
+	using namespace lucene::search;
+	using namespace lucene::document;
+	using namespace lucene::queryParser;
+	static const wchar_t HIT_MARK_CHAR = 'X'; // a record line with this character at HIT_MARK_IDX was a measured hit
+	static const int HIT_MARK_IDX = 2; // column of the hit mark within a record line
+	static const wchar_t ERROR_MARK_CHAR = '!'; // a record line with this character at ERROR_MARK_IDX had measured != ideal
+	static const int ERROR_MARK_IDX = 0; // column of the error mark within a record line
+	static const wchar_t* SUMMARY_STR = L"--- Summary ---"; // first line of a summary section (skipped when a record is read)
+	static const wchar_t* SECTION_END_STR = L"--- Section End ---"; // last line of a summary or search section
+	static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---"; // first line of a search section (one EvaluationRecordEntry)
+	static const wchar_t* FILE_END_STR = L"--- File End ---"; // reading of a record file stops at this line
+	static const wchar_t* ID_FIELD = L"id"; // stored, non-indexed document field holding the corpus line number
+	static const wchar_t* CONTENT_FIELD = L"content"; // tokenized, non-stored document field holding the line text; default query field
+	// Loads a corpus from 'file', which is opened as UTF-8 text.
+	// Every non-empty line read by readLine() becomes one corpus entry;
+	// empty lines are dropped.
+	Corpus::Corpus(const char* file)
+	:	lines_() {
+		FileReader reader(file, "UTF-8");
+		wchar_t buffer[MAX_LINE_LENGTH];
+		while (readLine(reader, buffer, MAX_LINE_LENGTH)) {
+			// A leading terminator means the line is empty
+			if (buffer[0] == L'\0') continue;
+			lines_.push_back( std::wstring( buffer ) );
+		}
+	}
+	// Returns the text of corpus line 'i'. The index is not range checked.
+	// The pointer refers to storage owned by this corpus.
+	const wchar_t* Corpus::operator[](int i) {
+		const std::wstring& entry = lines_[i];
+		return entry.c_str();
+	}
+	// Number of (non-empty) lines held by the corpus.
+	int Corpus::size() {
+		return static_cast<int>( lines_.size() );
+	}
+#define MAX_ID_LENGTH 10
+	// Indexes 'corpus' into INDEX_DIRECTORY and prepares a searcher for it.
+	//
+	// Each corpus line becomes one document carrying its line number in
+	// ID_FIELD (stored only) and its text in CONTENT_FIELD (tokenized with
+	// 'analyzer', not stored).
+	//
+	// queryAnalyzer  - analyzer used for parsing queries; when NULL,
+	//                  'analyzer' is used for queries as well.
+	// prefixAnalyzer - kept for use by search() on prefix ("word*")
+	//                  queries; may be NULL.
+	PreparedCorpus::PreparedCorpus(Corpus& corpus,
+								   Analyzer& analyzer,
+								   Analyzer* queryAnalyzer,
+								   Analyzer* prefixAnalyzer)
+	:   size_( corpus.size() ),
+	    prefixAnalyzer_( prefixAnalyzer ),
+	    dir_() {
+		dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) );
+
+		// Write one document per corpus line
+		IndexWriter writer(dir_.get(), &analyzer, true, false);
+		const int lineCount = corpus.size();
+		for (int lineNo = 0; lineNo < lineCount; ++lineNo) {
+			wchar_t idText[MAX_ID_LENGTH];
+			snwprintf(idText, MAX_ID_LENGTH, L"%d", lineNo);
+
+			Document doc;
+			doc.add(*new Field( ID_FIELD, idText, Field::INDEX_NO | Field::STORE_YES));
+			doc.add(*new Field( CONTENT_FIELD, corpus[lineNo], Field::INDEX_TOKENIZED | Field::STORE_NO));
+			writer.addDocument(&doc);
+		}
+		writer.optimize();
+		writer.close();
+
+		// Queries fall back to the indexing analyzer if no dedicated one is given
+		Analyzer* parserAnalyzer = queryAnalyzer ? queryAnalyzer : &analyzer;
+		queryParser_.reset(new QueryParser(CONTENT_FIELD, parserAnalyzer));
+		searcher_.reset(new IndexSearcher(dir_.get()));
+	}
+	// Number of corpus lines that were indexed (captured at construction).
+	int PreparedCorpus::size() {
+		return this->size_;
+	}
+	// Returns the size of the index: the sum of fileLength() over every
+	// file that the index directory lists.
+	int PreparedCorpus::indexSize() {
+		std::vector<std::string> files;
+		dir_->list(&files);
+
+		int total = 0;
+		for (std::vector<std::string>::const_iterator it = files.begin();
+			 it != files.end();
+			 ++it) {
+			total += dir_->fileLength(it->c_str());
+		}
+		return total;
+	}
+	// Runs 'query' against the indexed corpus and sets, in 'hits', the bit
+	// of every corpus line that matched. Bits that are already set are
+	// left untouched.
+	//
+	// A query whose last non-whitespace character is '*' is treated as a
+	// prefix query when a prefix analyzer was supplied: the text before
+	// the '*' is run through the prefix analyzer and its first token is
+	// used as the prefix. If the analyzer produces no token, nothing is
+	// searched. All other queries go through the query parser.
+	//
+	// Throws std::out_of_range if a document id read from the index does
+	// not fit into 'hits'.
+	void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) {
+		// Length of the query without trailing whitespace
+		int qlen = wcslen( query );
+		while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
+
+		auto_ptr<Query> q;
+		// qlen > 0: an empty or all-whitespace query has no last character
+		if ( qlen > 0 && query[qlen-1] == L'*' && prefixAnalyzer_ ) {
+			// Simplified prefix query parser.
+			// The prefix is held in a string so that queries of any length are safe.
+			const std::wstring prefix( query, qlen-1 );
+			// Assume, that prefix query contains only one word
+			auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(prefix.c_str())) );
+			Token token;
+			// Only build the query if the analyzer actually delivered a token
+			if ( t->next(&token) ) {
+				Term* term = new Term( CONTENT_FIELD, token.termText() );
+				q.reset( new PrefixQuery( term ) );
+				_CLDECDELETE( term );
+			}
+		} else {
+			q.reset( queryParser_->parse(query) );
+		}
+		if ( q.get() ) {
+			auto_ptr<Hits> h( searcher_->search( q.get() ) );
+			for (int i = 0; i < h->length(); i++) {
+				// -1 is out of range, so an id that is never filled in is rejected below
+				int id = -1;
+				Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
+				// set() is range checked, unlike operator[]
+				hits.set( static_cast<std::size_t>( id ) );
+			}
+		}
+	}
+	// Creates results from an existing hit set covering 'lines' lines.
+	// The hit set is copied.
+	Results::Results(std::bitset<MAXLINES>& hits, int lines)
+	:	hits_( hits ),
+		lines_( lines ) {
+	}
+	// Creates empty results: no lines, no hits.
+	Results::Results()
+	:	hits_(),
+		lines_( 0 ) {
+	}
+	// Creates results by running 'query' against 'corpus'. The results
+	// cover as many lines as the corpus has.
+	Results::Results(PreparedCorpus& corpus,
+					 const wchar_t* query)
+	: 	hits_(),
+		lines_( corpus.size() ) {
+		corpus.search( query, hits_ );
+	}
+	// Tells whether line 'i' is marked as a hit. The index is not range checked.
+	bool Results::hit(int i) {
+		const bool marked = hits_[i];
+		return marked;
+	}
+	// Appends the hit state of one more line and grows length() by one.
+	//
+	// Throws std::out_of_range when the hit set is already full. The
+	// object is left unchanged in that case. (Writing through operator[]
+	// at that point would be undefined behaviour.)
+	void Results::append(bool hit) {
+		// set() is range checked; only count the line once it is stored
+		hits_.set( static_cast<std::size_t>( lines_ ), hit );
+		++lines_;
+	}
+	// Number of lines these results cover.
+	int Results::length() {
+		return this->lines_;
+	}
+	// Creates an entry from a query and its ideal and measured results.
+	EvaluationRecordEntry::EvaluationRecordEntry(
+		const wchar_t* query,
+		Results& ideal,
+		Results& measured)
+	:	query_( query ),
+		ideal_( ideal ),
+		measured_( measured ) {
+	}
+	// Reads one search section from 'reader'. The reader must be
+	// positioned just after the section's header line.
+	//
+	// The section starts with eight header lines. Only the third one is
+	// used: the query is the text after its first ':' with leading spaces
+	// removed (empty if the line has no ':'). Every following line, up to
+	// SECTION_END_STR or the end of input, describes one corpus line:
+	//  - HIT_MARK_CHAR at HIT_MARK_IDX means the line was a measured hit
+	//  - ERROR_MARK_CHAR at ERROR_MARK_IDX means the measured state was
+	//    wrong, i.e. the ideal state is the opposite of the measured one
+	// Lines too short to contain a mark are treated as not carrying it.
+	EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
+		wchar_t line[MAX_LINE_LENGTH];
+		line[0] = L'\0'; // defined content even if nothing can be read
+		readLine(reader, line, MAX_LINE_LENGTH); // corpusName
+		readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
+		readLine(reader, line, MAX_LINE_LENGTH); // query
+		wchar_t* cut = line;
+		while (*cut && *cut != ':') cut++;
+		// Step over the ':' only if there is one; never over the terminator
+		if (*cut == ':') cut++;
+		while (*cut == ' ') cut++;
+		query_ = cut;
+		readLine(reader, line, MAX_LINE_LENGTH); // status
+		readLine(reader, line, MAX_LINE_LENGTH); // hits
+		readLine(reader, line, MAX_LINE_LENGTH); // errors
+		readLine(reader, line, MAX_LINE_LENGTH); // false positives
+		readLine(reader, line, MAX_LINE_LENGTH); // false negatives
+		while (readLine(reader, line, MAX_LINE_LENGTH)) {
+			if (wcscmp(line, SECTION_END_STR) == 0) break;
+			// Characters behind the terminator are leftovers of earlier
+			// lines, so only look at columns the current line really has
+			const int len = static_cast<int>( wcslen(line) );
+			bool found = (len > HIT_MARK_IDX) && (line[HIT_MARK_IDX] == HIT_MARK_CHAR);
+			bool error = (len > ERROR_MARK_IDX) && (line[ERROR_MARK_IDX] == ERROR_MARK_CHAR);
+			measured_.append(found);
+			ideal_.append(error ? !found : found);
+		}
+	}
+	// Creates an entry with an empty query and empty results.
+	EvaluationRecordEntry::EvaluationRecordEntry()
+	:	query_(),
+		ideal_(),
+		measured_() {
+	}
+	// Reads an evaluation record from 'file', which is opened as UTF-8 text.
+	//
+	// Summary sections are skipped, every search section is parsed into
+	// one entry, and reading stops at FILE_END_STR or at the end of input.
+	// Any other line is ignored.
+	EvaluationRecord::EvaluationRecord(const char* file)
+	: 	entries_() {
+		FileReader reader(file, "UTF-8");
+		wchar_t line[MAX_LINE_LENGTH];
+		while (readLine(reader, line, MAX_LINE_LENGTH)) {
+			if (wcscmp(line, SUMMARY_STR) == 0) {
+				// Consume the summary up to and including its end marker
+				for (;;) {
+					if (!readLine(reader, line, MAX_LINE_LENGTH)) break;
+					if (wcscmp(line, SECTION_END_STR) == 0) break;
+				}
+			}
+			if (wcscmp(line, FILE_END_STR) == 0) {
+				// End of the record
+				break;
+			}
+			if (wcscmp(line, SEARCH_SECTION_STR) == 0) {
+				// The entry reads the rest of its section from the reader
+				entries_.push_back( EvaluationRecordEntry( reader ) );
+			}
+		}
+	}
+	// Number of search entries in the record.
+	int EvaluationRecord::length() {
+		return static_cast<int>( entries_.size() );
+	}
+	// Query text of entry 'i'. The index is not range checked.
+	// The pointer refers to storage owned by the record.
+	const wchar_t* EvaluationRecord::query(int i) {
+		const EvaluationRecordEntry& entry = entries_[i];
+		return entry.query_.c_str();
+	}
+	// Ideal results of entry 'i'. The index is not range checked.
+	Results& EvaluationRecord::ideal(int i) {
+		EvaluationRecordEntry& entry = entries_[i];
+		return entry.ideal_;
+	}
+	// Measured results of entry 'i'. The index is not range checked.
+	Results& EvaluationRecord::measured(int i) {
+		EvaluationRecordEntry& entry = entries_[i];
+		return entry.measured_;
+	}
+	// Sets up a comparison of 'measured' results against 'ideal' ones.
+	Evaluation::Evaluation(Results& ideal, Results& measured)
+	:	ideal_( ideal ),
+		measured_( measured ) {}
+	// True if 'line' was measured as a hit although ideally it is not one.
+	bool Evaluation::falsePositive(int line) {
+		if (ideal_.hit(line)) return false;
+		return measured_.hit(line);
+	}
+	// True if 'line' is ideally a hit but was not measured as one.
+	bool Evaluation::falseNegative(int line) {
+		if (!ideal_.hit(line)) return false;
+		return !measured_.hit(line);
+	}
+	// True if the measured hit state of 'line' differs from the ideal one.
+	bool Evaluation::error(int line)
+	{
+		const bool expected = ideal_.hit(line);
+		const bool actual = measured_.hit(line);
+		return expected != actual;
+	}
+	// Counts the lines whose measured state differs from the ideal state.
+	// The number of lines examined is the length of the ideal results.
+	int Evaluation::errors()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int line = 0; line < total; ++line) {
+			count += error(line) ? 1 : 0;
+		}
+		return count;
+	}
+	// Counts the lines that were measured as hits but ideally are not.
+	// The number of lines examined is the length of the ideal results.
+	int Evaluation::falsePositives()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int line = 0; line < total; ++line) {
+			count += falsePositive(line) ? 1 : 0;
+		}
+		return count;
+	}
+	// Counts the lines that ideally are hits but were not measured as such.
+	// The number of lines examined is the length of the ideal results.
+	int Evaluation::falseNegatives()
+	{
+		const int total = ideal_.length();
+		int count = 0;
+		for (int line = 0; line < total; ++line) {
+			count += falseNegative(line) ? 1 : 0;
+		}
+		return count;
+	}
+}