diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,422 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+
+#include "evaluationtool.h"
+#include "analysisunittest.h"
+
+#include "testutils.h"
+
+#include "cpixstrtools.h"
+
+// Capacity, in wide characters, of the buffers used to read text lines.
+#define MAX_LINE_LENGTH 512
+
+namespace evaluationtool {
+
+    using namespace lucene::analysis;
+    using namespace lucene::util;
+    using namespace lucene::index;
+    using namespace lucene::store;
+    using namespace lucene::search;
+    using namespace lucene::document;
+    using namespace lucene::queryParser;
+
+
+    // Marks on a result line of an evaluation record: HIT_MARK_CHAR at
+    // HIT_MARK_IDX flags a measured hit, ERROR_MARK_CHAR at ERROR_MARK_IDX
+    // flags a line whose measured result differs from the ideal one.
+    static const wchar_t HIT_MARK_CHAR = L'X';
+    static const int HIT_MARK_IDX = 2;
+    static const wchar_t ERROR_MARK_CHAR = L'!';
+    static const int ERROR_MARK_IDX = 0;
+
+    // Marker lines delimiting the sections of an evaluation record file.
+    static const wchar_t* SUMMARY_STR = L"--- Summary ---";
+    static const wchar_t* SECTION_END_STR = L"--- Section End ---";
+    static const wchar_t* SEARCH_SECTION_STR = L"--- Search ---";
+    static const wchar_t* FILE_END_STR = L"--- File End ---";
+
+    // Index fields: the stored entry number and the tokenized entry text.
+    static const wchar_t* ID_FIELD = L"id";
+    static const wchar_t* CONTENT_FIELD = L"content";
+
+
+    /**
+     * Loads a corpus from a text file, one entry per non-empty line.
+     *
+     * @param file path of the file, read through a "UTF-8" FileReader
+     */
+    Corpus::Corpus(const char* file)
+    :   lines_() {
+        FileReader reader(file, "UTF-8");
+
+        wchar_t line[MAX_LINE_LENGTH];
+
+        while (readLine(reader, line, MAX_LINE_LENGTH)) {
+            // Empty lines are dropped, so an entry index need not equal
+            // the number of the line in the file.
+            if (line[0] != L'\0') {
+                lines_.push_back( std::wstring( line ) );
+            }
+        }
+    }
+
+    /**
+     * Returns entry i as a C string owned by the corpus.
+     * The index is not range-checked.
+     */
+    const wchar_t* Corpus::operator[](int i) {
+        return lines_[i].c_str();
+    }
+
+    /** Returns the number of entries in the corpus. */
+    int Corpus::size() {
+        return static_cast<int>( lines_.size() );
+    }
+
+
+// Capacity, in wide characters, of the buffer a document id is printed to.
+#define MAX_ID_LENGTH 10
+
+    /**
+     * Builds a search index over the corpus and opens it for searching.
+     *
+     * Every corpus entry becomes one document: its position in the corpus
+     * is stored, unindexed, in ID_FIELD and its text is tokenized into
+     * CONTENT_FIELD.
+     *
+     * @param corpus         entries to index
+     * @param analyzer       analyzer handed to the index writer
+     * @param queryAnalyzer  analyzer for the query parser; when NULL,
+     *                       analyzer is used there as well
+     * @param prefixAnalyzer analyzer for prefix queries; when NULL, search()
+     *                       hands such queries to the query parser
+     */
+    PreparedCorpus::PreparedCorpus(Corpus& corpus,
+                                   Analyzer& analyzer,
+                                   Analyzer* queryAnalyzer,
+                                   Analyzer* prefixAnalyzer)
+    :   size_( corpus.size() ),
+        prefixAnalyzer_( prefixAnalyzer ),
+        dir_() {
+
+        // NOTE(review): the "true" argument presumably asks for a fresh
+        // index directory - confirm against FSDirectory::getDirectory.
+        dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) );
+
+        IndexWriter writer(dir_.get(), &analyzer, true, false);
+
+        wchar_t id[MAX_ID_LENGTH];
+
+        const int count = corpus.size();
+        for (int i = 0; i < count; i++) {
+            Document doc;
+            snwprintf(id, MAX_ID_LENGTH, L"%d", i);
+            doc.add(*new Field( ID_FIELD, id, Field::INDEX_NO | Field::STORE_YES));
+            doc.add(*new Field( CONTENT_FIELD, corpus[i], Field::INDEX_TOKENIZED | Field::STORE_NO));
+            writer.addDocument(&doc);
+        }
+
+        writer.optimize();
+        writer.close();
+
+        queryParser_.reset(new QueryParser(CONTENT_FIELD, queryAnalyzer ? queryAnalyzer : &analyzer));
+
+        searcher_.reset(new IndexSearcher(dir_.get()));
+    }
+
+    /** Returns the number of corpus entries captured at construction. */
+    int PreparedCorpus::size() {
+        return size_;
+    }
+
+    /** Returns the summed length of all files listed in the index directory. */
+    int PreparedCorpus::indexSize() {
+        std::vector<std::string> files;
+        dir_->list(&files);
+        int total = 0;
+        for (size_t i = 0; i < files.size(); i++) {
+            total += dir_->fileLength(files[i].c_str());
+        }
+        return total;
+    }
+
+    /**
+     * Runs a query against the index and flags every matching corpus entry.
+     *
+     * Trailing whitespace is ignored when classifying the query. A query
+     * ending in '*' is handled as a one-word prefix query if a prefix
+     * analyzer was supplied; any other query goes to the query parser.
+     * Bits of entries that do not match are left as they were.
+     *
+     * NOTE(review): "std::bitset" carries no template argument list in this
+     * patch text, here and in Results::Results(std::bitset&, int); restore
+     * the size from the declarations in evaluationtool.h.
+     *
+     * @param query NUL-terminated query text
+     * @param hits  gets a set bit at the index of every matching entry
+     */
+    void PreparedCorpus::search(const wchar_t* query, std::bitset& hits ) {
+        size_t qlen = wcslen( query );
+        while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
+
+        auto_ptr<Query> q;
+        // qlen is tested first: an empty or all-blank query has no last
+        // character that could be inspected.
+        if ( qlen > 0 && query[qlen-1] == L'*' && prefixAnalyzer_ ) {
+            // Simplified prefix query parser. The text in front of the '*'
+            // is copied into a std::wstring so that any length fits.
+            const std::wstring prefix( query, qlen-1 );
+            // Assume, that prefix query contains only one word
+            auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(prefix.c_str())) );
+            Token token;
+            // Without a token (the query was just "*", for instance) there
+            // is no prefix to search for; q stays empty.
+            if ( t->next(&token) ) {
+                Term* term = new Term( CONTENT_FIELD, token.termText() );
+                q.reset( new PrefixQuery( term ) );
+                _CLDECDELETE( term );
+            }
+        } else {
+            q.reset( queryParser_->parse(query) );
+        }
+        if ( q.get() ) {
+            auto_ptr<Hits> h( searcher_->search( q.get() ) );
+            for (int i = 0; i < h->length(); i++) {
+                int id = -1;
+                Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
+                // Skip ids the bitset cannot hold rather than write out of
+                // bounds.
+                if ( id >= 0 && static_cast<size_t>(id) < hits.size() ) {
+                    hits[id] = true;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Wraps a copy of an existing hit set.
+     *
+     * @param hits  hit flags, one bit per line
+     * @param lines value reported by length()
+     */
+    Results::Results(std::bitset& hits, int lines)
+    :   hits_(hits), lines_(lines) {}
+
+    /** Creates a result of length zero. */
+    Results::Results()
+    :   hits_(), lines_(0) {}
+
+    /**
+     * Searches corpus for query and keeps the flags of the matching entries.
+     * The length of the result is the size of the corpus.
+     */
+    Results::Results(PreparedCorpus& corpus,
+                     const wchar_t* query)
+    :   lines_(corpus.size()) {
+        corpus.search(query, hits_);
+    }
+
+    /** Tells whether line i is flagged as a hit; i is not range-checked. */
+    bool Results::hit(int i) {
+        return hits_[i];
+    }
+
+    /**
+     * Stores the flag of one more line and increases the length by one.
+     * The capacity of the hit set is not checked.
+     */
+    void Results::append(bool hit) {
+        hits_[lines_++] = hit;
+    }
+
+
+    /** Returns the number of lines this result covers. */
+    int Results::length() {
+        return lines_;
+    }
+
+    /** Creates an entry from a query and its ideal and measured results. */
+    EvaluationRecordEntry::EvaluationRecordEntry(
+        const wchar_t* query,
+        Results& ideal,
+        Results& measured)
+    :   query_( query ),
+        ideal_( ideal ),
+        measured_( measured ) {}
+
+    /**
+     * Reads one search section of an evaluation record.
+     *
+     * Eight header lines are consumed (corpus name, analyzer name, query,
+     * status, hits, errors, false positives, false negatives), of which
+     * only the query is kept. Every following line, up to SECTION_END_STR
+     * or the end of input, describes one corpus line by its marks.
+     *
+     * @param reader source positioned at the first header line
+     */
+    EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
+        wchar_t line[MAX_LINE_LENGTH];
+        // Start from a valid empty string in case a read does not fill it.
+        line[0] = L'\0';
+
+        readLine(reader, line, MAX_LINE_LENGTH); // corpusName
+        readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
+        readLine(reader, line, MAX_LINE_LENGTH); // query
+
+        // The query text follows the first ':' and the spaces after it.
+        const wchar_t* cut = line;
+        while (*cut && *cut != L':') cut++;
+        // Step over the ':' only if there is one; otherwise cut already
+        // rests on the terminating NUL and must not move past it.
+        if (*cut == L':') cut++;
+        while (*cut == L' ') cut++;
+        query_ = cut;
+
+        readLine(reader, line, MAX_LINE_LENGTH); // status
+        readLine(reader, line, MAX_LINE_LENGTH); // hits
+        readLine(reader, line, MAX_LINE_LENGTH); // errors
+        readLine(reader, line, MAX_LINE_LENGTH); // false positives
+        readLine(reader, line, MAX_LINE_LENGTH); // false negatives
+
+        while (readLine(reader, line, MAX_LINE_LENGTH)) {
+            if (wcscmp(line, SECTION_END_STR) == 0) break;
+            // A line that ends before HIT_MARK_IDX has no hit mark; the
+            // length test keeps stale buffer content from being read.
+            bool found = wcslen(line) > static_cast<size_t>(HIT_MARK_IDX)
+                      && line[HIT_MARK_IDX] == HIT_MARK_CHAR;
+            bool error = (line[ERROR_MARK_IDX] == ERROR_MARK_CHAR);
+
+            measured_.append(found);
+            // An error mark means the ideal flag is the opposite of the
+            // measured one.
+            ideal_.append(error ? !found : found);
+        }
+    }
+
+    /** Creates an entry with an empty query and empty results. */
+    EvaluationRecordEntry::EvaluationRecordEntry()
+    :   query_(), ideal_(), measured_() {}
+
+    /**
+     * Loads the search sections of an evaluation record file.
+     *
+     * Summary sections are skipped, each SEARCH_SECTION_STR line starts a
+     * new entry, and reading stops at FILE_END_STR or the end of input.
+     *
+     * @param file path of the record, read through a "UTF-8" FileReader
+     */
+    EvaluationRecord::EvaluationRecord(const char* file)
+    :   entries_() {
+        FileReader reader(file, "UTF-8");
+
+        wchar_t line[MAX_LINE_LENGTH];
+
+        while (readLine(reader, line, MAX_LINE_LENGTH)) {
+            // Skip summary
+            if (wcscmp(line, SUMMARY_STR) == 0) {
+                while (readLine(reader, line, MAX_LINE_LENGTH)
+                       && wcscmp(line, SECTION_END_STR) != 0) {
+                    // nothing to do: summary lines are only consumed
+                }
+            }
+
+            // Eof
+            if (wcscmp(line, FILE_END_STR) == 0) break;
+
+            // Search section
+            if (wcscmp(line, SEARCH_SECTION_STR) == 0) {
+                entries_.push_back( EvaluationRecordEntry( reader ) );
+            }
+        }
+    }
+
+    /** Returns the number of entries that were loaded. */
+    int EvaluationRecord::length() {
+        return static_cast<int>( entries_.size() );
+    }
+
+    /** Returns the query text of entry i; i is not range-checked. */
+    const wchar_t* EvaluationRecord::query(int i) {
+        return entries_[i].query_.c_str();
+    }
+
+    /** Returns the ideal results of entry i; i is not range-checked. */
+    Results& EvaluationRecord::ideal(int i) {
+        return entries_[i].ideal_;
+    }
+
+    /** Returns the measured results of entry i; i is not range-checked. */
+    Results& EvaluationRecord::measured(int i) {
+        return entries_[i].measured_;
+    }
+
+    /** Pairs an ideal result with a measured one for comparison. */
+    Evaluation::Evaluation(Results& ideal, Results& measured)
+    :   ideal_( ideal ),
+        measured_( measured ) {
+    }
+
+    /** True if the line was measured as a hit although ideally it is none. */
+    bool Evaluation::falsePositive(int line) {
+        return (!ideal_.hit(line))&&measured_.hit(line);
+    }
+
+    /** True if the line is ideally a hit but was not measured as one. */
+    bool Evaluation::falseNegative(int line) {
+        return ideal_.hit(line)&&(!measured_.hit(line));
+    }
+
+    /** True if the ideal and the measured result disagree on the line. */
+    bool Evaluation::error(int line)
+    {
+        return ideal_.hit(line) != measured_.hit(line);
+    }
+
+    /** Counts the lines, up to the ideal length, on which results disagree. */
+    int Evaluation::errors()
+    {
+        int ret = 0;
+        for (int i = 0; i < ideal_.length(); i++) {
+            if (error(i)) ret++;
+        }
+        return ret;
+    }
+
+    /** Counts the false positives among the lines of the ideal result. */
+    int Evaluation::falsePositives()
+    {
+        int ret = 0;
+        for (int i = 0; i < ideal_.length(); i++) {
+            if (falsePositive(i)) ret++;
+        }
+        return ret;
+    }
+
+    /** Counts the false negatives among the lines of the ideal result. */
+    int Evaluation::falseNegatives()
+    {
+        int ret = 0;
+        for (int i = 0; i < ideal_.length(); i++) {
+            if (falseNegative(i)) ret++;
+        }
+        return ret;
+    }
+
+}