searchengine/oss/loc/analysisunittest/src/evaluationtest.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
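* Description: Search-quality evaluation tests that run language analyzers against recorded corpora and evaluation files.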
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
#include "itk.h"
hgs
parents:
diff changeset
    19
hgs
parents:
diff changeset
    20
#include "thaianalysis.h"
hgs
parents:
diff changeset
    21
hgs
parents:
diff changeset
    22
#include "CLucene.h"
hgs
parents:
diff changeset
    23
hgs
parents:
diff changeset
    24
#include <iostream>
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
#include "evaluationtool.h"
hgs
parents:
diff changeset
    27
#include "analysisunittest.h"
hgs
parents:
diff changeset
    28
#include "CJKAnalyzer.h"
hgs
parents:
diff changeset
    29
#include "koreananalyzer.h"
hgs
parents:
diff changeset
    30
#include "ngram.h"
hgs
parents:
diff changeset
    31
#include "prefixfilter.h"
hgs
parents:
diff changeset
    32
hgs
parents:
diff changeset
    33
#include "testutils.h"
hgs
parents:
diff changeset
    34
hgs
parents:
diff changeset
    35
using namespace std; 
hgs
parents:
diff changeset
    36
using namespace analysis; 
hgs
parents:
diff changeset
    37
using namespace lucene::analysis; 
hgs
parents:
diff changeset
    38
using namespace evaluationtool; 
hgs
parents:
diff changeset
    39
hgs
parents:
diff changeset
    40
hgs
parents:
diff changeset
    41
hgs
parents:
diff changeset
    42
/**
 * Evaluates one analyzer configuration against a recorded evaluation file
 * and prints a per-query comparison to standard output.
 *
 * A Corpus is built from corpusFile, wrapped in a PreparedCorpus together
 * with the supplied analyzers, and an EvaluationRecord is loaded from
 * evalFile. For each non-empty query in the record, a Results object is
 * constructed from the prepared corpus (this construction is what gets
 * timed) and compared against the record's ideal results. The record's
 * measured results (the Java implementation's results) are compared against
 * the same ideal results and serve as the control.
 *
 * Per query, three rows are printed:
 *   i:  'X' where the ideal results report a hit, '.' otherwise
 *   j:  'X' where the recorded (Java) results report a hit, '.' otherwise
 *   c:  '!' where this run has an error the control does not,
 *       '+' where the control has an error this run does not, '.' otherwise
 * followed by "ok", "improved" or "more errors!" depending on how the total
 * error counts compare.
 *
 * After the loop, summary lines are printed and two ITK_REPORT entries are
 * emitted: "<testName> search time" and "<testName> index size".
 *
 * @param testMgr        test manager passed to ITK_REPORT
 * @param analyzer       analyzer handed to PreparedCorpus; also used to
 *                       print the query tokens when queryAnalyzer is NULL
 * @param testName       prefix used for the two report titles
 * @param corpusFile     path used to construct the Corpus
 * @param evalFile       path used to construct the EvaluationRecord
 * @param queryAnalyzer  optional analyzer forwarded to PreparedCorpus and,
 *                       when non-NULL, used to print the query tokens
 * @param prefixAnalyzer optional analyzer forwarded to PreparedCorpus
 */
void doEvaluate(Itk::TestMgr* testMgr, Analyzer& analyzer, const char* testName, const char* corpusFile, const char* evalFile, Analyzer* queryAnalyzer = NULL, Analyzer* prefixAnalyzer = NULL )
{
    Corpus corpus(corpusFile);
    PreparedCorpus prepared(corpus, analyzer, queryAnalyzer, prefixAnalyzer);
    EvaluationRecord record(evalFile);

    const int queryCount = record.length();

    int failed = 0;
    int improved = 0;
    int timeMs = 0;

    for (int i = 0; i < queryCount; i++)
    {
        const wchar_t* query = record.query(i);

        if (!*query) continue; // skip empty queries

        Results& ideal = record.ideal(i);
        Results& java = record.measured(i); // results for Java implementation

        // Only the construction of Results is timed.
        Itk::Timestamp begin;
        Itk::getTimestamp(&begin);

        Results results(prepared, query);

        Itk::Timestamp end;
        Itk::getTimestamp(&end);

        Evaluation control( ideal, java );
        Evaluation eval( ideal, results );

        timeMs += Itk::getElapsedMs(&end, &begin);

        // %ls is the standard conversion for a wchar_t string in wprintf;
        // %S is an extension whose meaning differs between C runtimes.
        wprintf(L"Q '%ls' - ", query);
        printTokens(queryAnalyzer?*queryAnalyzer:analyzer, query);

        // The column index is deliberately not named 'i' so that it does
        // not shadow the query index of the enclosing loop.
        wprintf(L"i:");
        for (int col = 0; col < results.length(); col++) {
            if (ideal.hit(col)) {
                wprintf(L"X");
            } else {
                wprintf(L".");
            }
        }
        wprintf(L"\n");

        wprintf(L"j:");
        for (int col = 0; col < results.length(); col++) {
            if (java.hit(col)) {
                wprintf(L"X");
            } else {
                wprintf(L".");
            }
        }
        wprintf(L"\n");

        wprintf(L"c:");
        for (int col = 0; col < results.length(); col++) {
            if (eval.error(col) && !control.error(col)) {
                wprintf(L"!");
            } else if (!eval.error(col) && control.error(col)){
                wprintf(L"+");
            } else {
                wprintf(L".");
            }
        }
        wprintf(L"\n");

        if ( eval.errors() == control.errors() ) {
            wprintf(L"ok\n");
        } else if ( eval.errors() < control.errors() ) {
            wprintf(L"improved\n");
            improved++;
        } else {
            wprintf(L"more errors!\n");
            failed++;
        }
        wprintf(L"\n");
    }

    wprintf(L"Index size was %d KB\n", prepared.indexSize() / 1000);
    wprintf(L"Improved in %d / %d\n", improved, queryCount);
    wprintf(L"Deteriorated in %d / %d\n", failed, queryCount);

    // An empty evaluation record would otherwise divide by zero below.
    const int msPerQuery = (queryCount > 0) ? (timeMs / queryCount) : 0;

    std::string title;
    title += testName; title += " search time";
    ITK_REPORT( testMgr, title.c_str(), "%d ms / query", msPerQuery);

    title = testName; title += " index size";
    ITK_REPORT( testMgr, title.c_str(), "%d KB", prepared.indexSize() / 1000);
}
hgs
parents:
diff changeset
   133
hgs
parents:
diff changeset
   134
/**
 * Evaluation test "thai": runs doEvaluate with a ThaiAnalyzer over the
 * thai corpus and evaluation files.
 */
void ThaiEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "thai/corpus.txt";
    const char* const evalPath = CORPUS_DIR "thai/eval.txt";

    ThaiAnalyzer thaiAnalyzer;
    doEvaluate(testMgr, thaiAnalyzer, "thai", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   139
hgs
parents:
diff changeset
   140
/**
 * Evaluation test "galician": runs doEvaluate over the galician corpus and
 * evaluation files.
 */
void GalicianEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "galician/corpus.txt";
    const char* const evalPath = CORPUS_DIR "galician/eval.txt";

    // A GalicianAnalyzer was used here at some point but is disabled;
    // the standard analyzer is used in its place.
    //	GalicianAnalyzer analyzer;
    standard::StandardAnalyzer standardAnalyzer;
    doEvaluate(testMgr, standardAnalyzer, "galician", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   145
hgs
parents:
diff changeset
   146
/**
 * Evaluation test "korean_cjk": runs doEvaluate with a cjk::CJKAnalyzer over
 * the korean corpus and evaluation files.
 */
void KoreanCjkEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    cjk::CJKAnalyzer cjkAnalyzer;
    doEvaluate(testMgr, cjkAnalyzer, "korean_cjk", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   150
hgs
parents:
diff changeset
   151
/**
 * Evaluation test "korean_2gram": runs doEvaluate with a CjkNGramAnalyzer
 * constructed with 2 over the korean corpus and evaluation files.
 */
void KoreanBigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    CjkNGramAnalyzer bigramAnalyzer(2);
    doEvaluate(testMgr, bigramAnalyzer, "korean_2gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   155
hgs
parents:
diff changeset
   156
/**
 * Evaluation test "korean_1gram": runs doEvaluate with a CjkNGramAnalyzer
 * constructed with 1 over the korean corpus and evaluation files.
 */
void KoreanUnigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    CjkNGramAnalyzer unigramAnalyzer(1);
    doEvaluate(testMgr, unigramAnalyzer, "korean_1gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   160
hgs
parents:
diff changeset
   161
/**
 * Evaluation test "jamu_1gram": runs doEvaluate with a JamuNGramAnalyzer
 * constructed with 1 over the korean corpus and evaluation files.
 */
void KoreanJamuUnigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    JamuNGramAnalyzer jamuUnigramAnalyzer(1);
    doEvaluate(testMgr, jamuUnigramAnalyzer, "jamu_1gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   165
hgs
parents:
diff changeset
   166
/**
 * Evaluation test "jamu_2gram": runs doEvaluate with a JamuNGramAnalyzer
 * constructed with 2 over the korean corpus and evaluation files.
 */
void KoreanJamuBigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    JamuNGramAnalyzer jamuBigramAnalyzer(2);
    doEvaluate(testMgr, jamuBigramAnalyzer, "jamu_2gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   170
hgs
parents:
diff changeset
   171
/**
 * Evaluation test "korean": runs doEvaluate with a KoreanAnalyzer as the
 * main analyzer and a KoreanQueryAnalyzer as the query analyzer. No prefix
 * analyzer is supplied.
 */
void KoreanEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "korean/corpus.txt";
    const char* const evalPath = CORPUS_DIR "korean/eval.txt";

    KoreanAnalyzer indexAnalyzer;
    KoreanQueryAnalyzer koreanQueryAnalyzer;
    doEvaluate(testMgr, indexAnalyzer, "korean", corpusPath, evalPath, &koreanQueryAnalyzer);
}
hgs
parents:
diff changeset
   176
hgs
parents:
diff changeset
   177
/**
 * Evaluation test "chinese_2gram": runs doEvaluate with a CjkNGramAnalyzer
 * constructed with 2 over the chinese_prc corpus and evaluation files.
 */
void ChineseBigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "chinese_prc/corpus.txt";
    const char* const evalPath = CORPUS_DIR "chinese_prc/eval.txt";

    CjkNGramAnalyzer bigramAnalyzer(2);
    doEvaluate(testMgr, bigramAnalyzer, "chinese_2gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   181
hgs
parents:
diff changeset
   182
/**
 * Evaluation test "chinese_1gram": runs doEvaluate with a CjkNGramAnalyzer
 * constructed with 1 over the chinese_prc corpus and evaluation files.
 */
void ChineseUnigramEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "chinese_prc/corpus.txt";
    const char* const evalPath = CORPUS_DIR "chinese_prc/eval.txt";

    CjkNGramAnalyzer unigramAnalyzer(1);
    doEvaluate(testMgr, unigramAnalyzer, "chinese_1gram", corpusPath, evalPath);
}
hgs
parents:
diff changeset
   186
hgs
parents:
diff changeset
   187
/**
 * Evaluation test "hebrew": runs doEvaluate with a HebrewAnalyzer as the
 * main analyzer; the same HebrewQueryAnalyzer instance is passed as both
 * the query analyzer and the prefix analyzer.
 */
void HebrewEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "hebrew/corpus.txt";
    const char* const evalPath = CORPUS_DIR "hebrew/eval.txt";

    HebrewAnalyzer indexAnalyzer;
    HebrewQueryAnalyzer hebrewQueryAnalyzer;
    doEvaluate(testMgr, indexAnalyzer, "hebrew", corpusPath, evalPath, &hebrewQueryAnalyzer, &hebrewQueryAnalyzer);
}
hgs
parents:
diff changeset
   192
hgs
parents:
diff changeset
   193
/**
 * Evaluation test "french": runs doEvaluate with a single FrenchAnalyzer
 * instance that is passed as the main, query and prefix analyzer alike.
 */
void FrenchEvaluation(Itk::TestMgr* testMgr)
{
    const char* const corpusPath = CORPUS_DIR "french/corpus.txt";
    const char* const evalPath = CORPUS_DIR "french/eval.txt";

    FrenchAnalyzer frenchAnalyzer;
    doEvaluate(testMgr, frenchAnalyzer, "french", corpusPath, evalPath, &frenchAnalyzer, &frenchAnalyzer);
}
hgs
parents:
diff changeset
   197
hgs
parents:
diff changeset
   198
/**
 * Builds the "evaluation" test suite and registers every evaluation test in
 * this file with it, each under a name equal to its third add() argument.
 *
 * @return the newly allocated suite; it is created with new and not freed
 *         here, so releasing it is left to the caller.
 */
Itk::TesterBase * CreateEvaluationTest()
{
    using namespace Itk;

    SuiteTester* suite = new SuiteTester( "evaluation" );

    // Thai and Galician
    suite->add( "thai", ThaiEvaluation, "thai" );
    suite->add( "galician", GalicianEvaluation, "galician" );

    // Korean and Chinese (CJK / n-gram analyzers)
    suite->add( "korean_cjk", KoreanCjkEvaluation, "korean_cjk" );
    suite->add( "korean", KoreanEvaluation, "korean" );
    suite->add( "korean_1gram", KoreanUnigramEvaluation, "korean_1gram" );
    suite->add( "korean_2gram", KoreanBigramEvaluation, "korean_2gram" );
    suite->add( "chinese_1gram", ChineseUnigramEvaluation, "chinese_1gram" );
    suite->add( "chinese_2gram", ChineseBigramEvaluation, "chinese_2gram" );

    // Korean with the Jamu n-gram analyzer
    suite->add( "jamu_1gram", KoreanJamuUnigramEvaluation, "jamu_1gram" );
    suite->add( "jamu_2gram", KoreanJamuBigramEvaluation, "jamu_2gram" );

    // Hebrew and French
    suite->add( "hebrew", HebrewEvaluation, "hebrew" );
    suite->add( "french", FrenchEvaluation, "french" );

    return suite;
}
hgs
parents:
diff changeset
   223