--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysisunittest/src/evaluationtool.cpp Tue Jul 06 15:30:04 2010 +0300
@@ -0,0 +1,294 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+
+#include "evaluationtool.h"
+#include "analysisunittest.h"
+
+#include "testutils.h"
+
+#include "cpixstrtools.h"
+
+// Capacity, in wide characters, of the line buffers handed to readLine()
+// throughout this file.
+#define MAX_LINE_LENGTH 512
+
+namespace evaluationtool {
+
+ using namespace lucene::analysis;
+ using namespace lucene::util;
+ using namespace lucene::index;
+ using namespace lucene::store;
+ using namespace lucene::search;
+ using namespace lucene::document;
+ using namespace lucene::queryParser;
+
+
+ // Mark columns of a result line in a stored evaluation record. They are
+ // compared against line[ERROR_MARK_IDX] and line[HIT_MARK_IDX] when an
+ // EvaluationRecordEntry is read back from a Reader.
+ static const int ERROR_MARK_IDX = 0;
+ static const wchar_t ERROR_MARK_CHAR = L'!';
+ static const int HIT_MARK_IDX = 2;
+ static const wchar_t HIT_MARK_CHAR = L'X';
+
+ // Marker lines that delimit the sections of an evaluation record file.
+ static const wchar_t* const SEARCH_SECTION_STR = L"--- Search ---";
+ static const wchar_t* const SUMMARY_STR = L"--- Summary ---";
+ static const wchar_t* const SECTION_END_STR = L"--- Section End ---";
+ static const wchar_t* const FILE_END_STR = L"--- File End ---";
+
+ // Names of the two fields every indexed document gets: the stored,
+ // non-indexed line number and the tokenized, non-stored line text.
+ static const wchar_t* const ID_FIELD = L"id";
+ static const wchar_t* const CONTENT_FIELD = L"content";
+
+
+ // Builds a corpus from the given file.
+ //
+ // The file is opened through FileReader with the "UTF-8" encoding name and
+ // consumed with readLine() into a buffer of MAX_LINE_LENGTH wide characters.
+ // Every line that comes back non-empty is stored in lines_; empty ones are
+ // dropped.
+ //
+ // file - path of the corpus file
+ Corpus::Corpus(const char* file)
+     : lines_() {
+     wchar_t buffer[MAX_LINE_LENGTH];
+     FileReader source(file, "UTF-8");
+
+     while (readLine(source, buffer, MAX_LINE_LENGTH)) {
+         const bool blank = (buffer[0] == L'\0');
+         if (blank) continue;
+         lines_.push_back( std::wstring( buffer ) );
+     }
+ }
+
+ // Returns the i-th stored line as a C string. The index is passed to
+ // lines_ unchecked, and the returned pointer refers to storage held by
+ // that entry of lines_.
+ const wchar_t* Corpus::operator[](int i) {
+     const std::wstring& entry = lines_[i];
+     return entry.c_str();
+ }
+
+ // Number of lines held in lines_, narrowed to int.
+ int Corpus::size() {
+     return static_cast<int>(lines_.size());
+ }
+
+
+// Capacity, in wide characters, of the buffer that receives a document id
+// formatted as decimal text.
+#define MAX_ID_LENGTH 10
+
+ // Indexes every line of `corpus` and prepares the objects used by search().
+ //
+ // Each line becomes one document with two fields: ID_FIELD holds the line
+ // number as text (stored, not indexed) and CONTENT_FIELD holds the line
+ // itself (tokenized, not stored). The index lives in the directory obtained
+ // from FSDirectory::getDirectory(INDEX_DIRECTORY, true).
+ //
+ // corpus         - lines to index
+ // analyzer       - analyzer given to the IndexWriter; also used for query
+ //                  parsing when queryAnalyzer is NULL
+ // queryAnalyzer  - optional analyzer for the QueryParser
+ // prefixAnalyzer - optional analyzer kept for the prefix queries in search()
+ PreparedCorpus::PreparedCorpus(Corpus& corpus,
+                                Analyzer& analyzer,
+                                Analyzer* queryAnalyzer,
+                                Analyzer* prefixAnalyzer)
+     : size_( corpus.size() ),
+       prefixAnalyzer_( prefixAnalyzer ),
+       dir_() {
+
+     dir_.reset( FSDirectory::getDirectory( INDEX_DIRECTORY, true ) );
+
+     IndexWriter writer(dir_.get(), &analyzer, true, false);
+
+     wchar_t idText[MAX_ID_LENGTH];
+
+     for (int lineNo = 0; lineNo < corpus.size(); lineNo++) {
+         Document doc;
+
+         // The line number is the id that search() later reads back.
+         snwprintf(idText, MAX_ID_LENGTH, L"%d", lineNo);
+         Field* idField = new Field( ID_FIELD, idText, Field::INDEX_NO | Field::STORE_YES);
+         doc.add(*idField);
+
+         Field* contentField = new Field( CONTENT_FIELD, corpus[lineNo], Field::INDEX_TOKENIZED | Field::STORE_NO);
+         doc.add(*contentField);
+
+         writer.addDocument(&doc);
+     }
+
+     writer.optimize();
+     writer.close();
+
+     // Queries are parsed with the dedicated analyzer when one was supplied.
+     Analyzer* parseWith = &analyzer;
+     if (queryAnalyzer) {
+         parseWith = queryAnalyzer;
+     }
+     queryParser_.reset(new QueryParser(CONTENT_FIELD, parseWith));
+
+     searcher_.reset(new IndexSearcher(dir_.get()));
+ }
+
+ // Number of corpus lines, as recorded in size_ by the constructor.
+ int PreparedCorpus::size() {
+     return this->size_;
+ }
+
+ // Adds up fileLength() of every file name that dir_->list() reports and
+ // returns the sum as an int.
+ int PreparedCorpus::indexSize() {
+     std::vector<std::string> files;
+     dir_->list(&files);
+
+     int total = 0;
+     std::vector<std::string>::const_iterator file = files.begin();
+     while (file != files.end()) {
+         total += dir_->fileLength(file->c_str());
+         ++file;
+     }
+     return total;
+ }
+
+ // Runs `query` against the index and sets, in `hits`, the bit of every
+ // matching document id. Bits that are already set are left alone.
+ //
+ // A query whose last non-blank character is '*' is treated as a one-word
+ // prefix query when a prefix analyzer was supplied: the text before the '*'
+ // is run through that analyzer and its first token becomes the prefix. If
+ // the analyzer yields no token, nothing is searched. Every other query goes
+ // through queryParser_.
+ //
+ // query - NUL-terminated query string
+ // hits  - receives the hits; ids outside [0, hits.size()) are ignored
+ void PreparedCorpus::search(const wchar_t* query, std::bitset<MAXLINES>& hits ) {
+     // Length of the query without trailing white space.
+     size_t qlen = wcslen( query );
+     while (qlen > 0 && iswspace(query[qlen-1])) qlen--;
+
+     auto_ptr<Query> q;
+     // The qlen > 0 test keeps query[qlen-1] inside the string when the query
+     // is empty or blank.
+     if ( qlen > 0 && query[qlen-1] == L'*' && prefixAnalyzer_ ) {
+         // Simplified prefix query parser. The prefix is copied into a string
+         // sized to fit, so long queries cannot overrun a fixed buffer. It is
+         // declared before the token stream and therefore outlives it.
+         std::wstring prefix( query, qlen-1 );
+         // Assume, that prefix query contains only one word
+         auto_ptr<TokenStream> t( prefixAnalyzer_->tokenStream(NULL, new StringReader(prefix.c_str())) );
+         Token token;
+         // Build the query only if the analyzer really produced a token.
+         if ( t->next(&token) ) {
+             Term* term = new Term( CONTENT_FIELD, token.termText() );
+             q.reset( new PrefixQuery( term ) );
+             _CLDECDELETE( term );
+         }
+     } else {
+         q.reset( queryParser_->parse(query) );
+     }
+
+     if ( q.get() ) {
+         auto_ptr<Hits> h( searcher_->search( q.get() ) );
+         for (int i = 0; i < h->length(); i++) {
+             // Starts out invalid, so an id that the conversion leaves
+             // untouched is rejected by the range check below.
+             int id = -1;
+             Cpt::wconvertInteger(&id, h->doc(i).get(ID_FIELD));
+             // bitset::operator[] is unchecked; skip ids it cannot hold.
+             if ( id >= 0 && static_cast<size_t>(id) < hits.size() ) {
+                 hits[id] = true;
+             }
+         }
+     }
+ }
+
+
+ // Creates a result set from a ready-made hit bitset and a line count.
+ //
+ // hits  - copied into hits_
+ // lines - stored in lines_
+ Results::Results(std::bitset<MAXLINES>& hits, int lines)
+     : hits_( hits ),
+       lines_( lines ) {
+ }
+
+ // Creates an empty result set: default-constructed hits_ and zero lines.
+ Results::Results()
+     : hits_(),
+       lines_( 0 ) {
+ }
+
+ // Creates the result set of one query: lines_ takes corpus.size() and
+ // hits_ is filled by corpus.search().
+ //
+ // corpus - prepared corpus to search
+ // query  - query string handed to corpus.search()
+ Results::Results(PreparedCorpus& corpus,
+                  const wchar_t* query)
+     : hits_(),
+       lines_( corpus.size() ) {
+     corpus.search( query, hits_ );
+ }
+
+ // Tells whether line i is marked as a hit. The index is passed to hits_
+ // unchecked.
+ bool Results::hit(int i) {
+     const bool marked = hits_[i];
+     return marked;
+ }
+
+ // Appends the result of one more line: stores `hit` at position lines_ and
+ // then increments lines_.
+ //
+ // hit - whether the appended line is a hit
+ //
+ // Throws std::out_of_range (from std::bitset::set) when the bitset is full,
+ // i.e. when lines_ is not a valid position in hits_. lines_ is left
+ // unchanged in that case.
+ void Results::append(bool hit) {
+     // set() range-checks the position; operator[] would silently write
+     // outside the bitset once every position has been used.
+     hits_.set(lines_, hit);
+     // Advance only after the bit has been stored.
+     ++lines_;
+ }
+
+
+ // Number of lines this result set covers (the value of lines_).
+ int Results::length() {
+     return this->lines_;
+ }
+
+ // Creates an entry from its three parts.
+ //
+ // query    - query text, initialises query_
+ // ideal    - initialises ideal_
+ // measured - initialises measured_
+ EvaluationRecordEntry::EvaluationRecordEntry(const wchar_t* query,
+                                              Results& ideal,
+                                              Results& measured)
+     : query_( query ),
+       ideal_( ideal ),
+       measured_( measured ) {
+ }
+
+ // Reads one search section of an evaluation record.
+ //
+ // The section is read line by line with readLine(): eight header lines, of
+ // which only the third (the query) is used, followed by one result line per
+ // corpus line up to SECTION_END_STR or the end of input. The query text is
+ // whatever follows the first ':' of the query line, leading blanks removed;
+ // it stays empty if the line cannot be read or has no ':'.
+ //
+ // For every result line, HIT_MARK_CHAR in column HIT_MARK_IDX means the
+ // line was found, and ERROR_MARK_CHAR in column ERROR_MARK_IDX means that
+ // outcome was wrong. measured_ gets the found flag as is; ideal_ gets it
+ // inverted on lines marked as errors.
+ //
+ // reader - positioned just after the section's heading line
+ EvaluationRecordEntry::EvaluationRecordEntry(Reader& reader) {
+     wchar_t line[MAX_LINE_LENGTH];
+
+     readLine(reader, line, MAX_LINE_LENGTH); // corpusName
+     readLine(reader, line, MAX_LINE_LENGTH); // analyzerName
+     // Parse the query only if its line was really read; otherwise `line`
+     // may hold stale or unset content.
+     if (readLine(reader, line, MAX_LINE_LENGTH)) { // query
+         const wchar_t* cut = line;
+         while (*cut && *cut != L':') cut++;
+         // Step over the ':' only if there is one, never over the terminator.
+         if (*cut == L':') cut++;
+         while (*cut == L' ') cut++;
+         query_ = cut;
+     }
+     readLine(reader, line, MAX_LINE_LENGTH); // status
+     readLine(reader, line, MAX_LINE_LENGTH); // hits
+     readLine(reader, line, MAX_LINE_LENGTH); // errors
+     readLine(reader, line, MAX_LINE_LENGTH); // false positives
+     readLine(reader, line, MAX_LINE_LENGTH); // false negatives
+
+     while (readLine(reader, line, MAX_LINE_LENGTH)) {
+         if (wcscmp(line, SECTION_END_STR) == 0) break;
+
+         // Look only at mark columns the line really has: past its
+         // terminator the buffer still holds leftovers of earlier lines.
+         const size_t len = wcslen(line);
+         const bool found = len > static_cast<size_t>(HIT_MARK_IDX)
+                            && line[HIT_MARK_IDX] == HIT_MARK_CHAR;
+         const bool error = len > static_cast<size_t>(ERROR_MARK_IDX)
+                            && line[ERROR_MARK_IDX] == ERROR_MARK_CHAR;
+
+         measured_.append(found);
+         // An error mark means the ideal outcome is the opposite one.
+         ideal_.append(error ? !found : found);
+     }
+ }
+
+ // Creates an empty entry: all three members are default-constructed.
+ EvaluationRecordEntry::EvaluationRecordEntry()
+     : query_(),
+       ideal_(),
+       measured_() {
+ }
+
+ // Loads an evaluation record file.
+ //
+ // The file is opened through FileReader with the "UTF-8" encoding name and
+ // scanned line by line. A summary section is skipped up to its
+ // SECTION_END_STR line, FILE_END_STR stops the scan, and each
+ // SEARCH_SECTION_STR line starts an EvaluationRecordEntry that is read from
+ // the same reader and appended to entries_. Other lines are ignored.
+ //
+ // file - path of the evaluation record file
+ EvaluationRecord::EvaluationRecord(const char* file)
+     : entries_() {
+     wchar_t buffer[MAX_LINE_LENGTH];
+     FileReader source(file, "UTF-8");
+
+     for (;;) {
+         if (!readLine(source, buffer, MAX_LINE_LENGTH)) break;
+
+         // Summary section: consume lines up to and including its end marker.
+         const bool summaryStart = (wcscmp(buffer, SUMMARY_STR) == 0);
+         if (summaryStart) {
+             bool gotLine;
+             do {
+                 gotLine = readLine(source, buffer, MAX_LINE_LENGTH);
+             } while (gotLine && wcscmp(buffer, SECTION_END_STR) != 0);
+         }
+
+         // The buffer is examined again here, also after a summary was
+         // skipped.
+         const bool fileEnd = (wcscmp(buffer, FILE_END_STR) == 0);
+         if (fileEnd) break;
+
+         const bool searchStart = (wcscmp(buffer, SEARCH_SECTION_STR) == 0);
+         if (searchStart) {
+             entries_.push_back( EvaluationRecordEntry( source ) );
+         }
+     }
+ }
+
+ // Number of entries held in entries_, narrowed to int.
+ int EvaluationRecord::length() {
+     return static_cast<int>(entries_.size());
+ }
+
+ // Query text of entry i as a C string. The index is passed to entries_
+ // unchecked, and the pointer refers to storage held by that entry.
+ const wchar_t* EvaluationRecord::query(int i) {
+     const EvaluationRecordEntry& entry = entries_[i];
+     return entry.query_.c_str();
+ }
+
+ // Ideal results of entry i. The index is passed to entries_ unchecked.
+ Results& EvaluationRecord::ideal(int i) {
+     EvaluationRecordEntry& entry = entries_[i];
+     return entry.ideal_;
+ }
+
+ // Measured results of entry i. The index is passed to entries_ unchecked.
+ Results& EvaluationRecord::measured(int i) {
+     EvaluationRecordEntry& entry = entries_[i];
+     return entry.measured_;
+ }
+
+ // Sets up the comparison of a measured result set with the ideal one.
+ //
+ // ideal    - initialises ideal_
+ // measured - initialises measured_
+ Evaluation::Evaluation(Results& ideal, Results& measured)
+     : ideal_( ideal ), measured_( measured ) {}
+
+ // True when `line` is a hit in the measured results but not in the ideal
+ // ones.
+ bool Evaluation::falsePositive(int line) {
+     if (ideal_.hit(line)) {
+         return false;
+     }
+     return measured_.hit(line);
+ }
+
+ // True when `line` is a hit in the ideal results but not in the measured
+ // ones.
+ bool Evaluation::falseNegative(int line) {
+     if (!ideal_.hit(line)) {
+         return false;
+     }
+     return !measured_.hit(line);
+ }
+
+ // True when the ideal and the measured results disagree on `line`.
+ bool Evaluation::error(int line)
+ {
+     const bool expected = ideal_.hit(line);
+     const bool actual = measured_.hit(line);
+     return expected != actual;
+ }
+
+ int Evaluation::errors()
+ {
+ int ret = 0;
+ for (int i = 0; i < ideal_.length(); i++) {
+ if (error(i)) ret++;
+ }
+ return ret;
+ }
+
+ // Counts the lines, out of the first ideal_.length(), for which
+ // falsePositive() is true.
+ int Evaluation::falsePositives()
+ {
+     const int lineCount = ideal_.length();
+     int count = 0;
+     for (int line = 0; line < lineCount; ++line) {
+         count += falsePositive(line) ? 1 : 0;
+     }
+     return count;
+ }
+
+ // Counts the lines, out of the first ideal_.length(), for which
+ // falseNegative() is true.
+ int Evaluation::falseNegatives()
+ {
+     const int lineCount = ideal_.length();
+     int count = 0;
+     for (int line = 0; line < lineCount; ++line) {
+         count += falseNegative(line) ? 1 : 0;
+     }
+     return count;
+ }
+
+}