searchengine/oss/loc/analysis/src/thaianalysis.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
#include "thaianalysis.h"
hgs
parents:
diff changeset
    18
hgs
parents:
diff changeset
    19
#include "cpixfstools.h"
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
#include "CLucene/analysis/standard/StandardTokenizer.h"
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
#include <iostream>
hgs
parents:
diff changeset
    24
#include <fstream>
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
#include "tinyunicode.h"
hgs
parents:
diff changeset
    27
hgs
parents:
diff changeset
    28
#include "thaistatemachine.h"
hgs
parents:
diff changeset
    29
hgs
parents:
diff changeset
    30
namespace analysis {
hgs
parents:
diff changeset
    31
hgs
parents:
diff changeset
    32
	void InitThaiAnalysis(const char* thaiDataFile) {
hgs
parents:
diff changeset
    33
		ThaiAnalysisInfra::init(thaiDataFile);
hgs
parents:
diff changeset
    34
	}
hgs
parents:
diff changeset
    35
	void ShutdownThaiAnalysis() {
hgs
parents:
diff changeset
    36
		ThaiAnalysisInfra::shutdown();
hgs
parents:
diff changeset
    37
	}
hgs
parents:
diff changeset
    38
hgs
parents:
diff changeset
    39
	// The single shared infra instance; null until init() has created one
	// and again after shutdown() has destroyed it.
	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance_ = 0;
hgs
parents:
diff changeset
    40
hgs
parents:
diff changeset
    41
	/**
	 * @return a fixed, human readable description of this error
	 */
	const char* ThaiAnalysisInfraNotInitialized::what() const throw()
	{
		static const char description[] =
			"Thai analyzer infra was not initialized.";
		return description;
	}
hgs
parents:
diff changeset
    44
hgs
parents:
diff changeset
    45
	/**
	 * @return a fixed, human readable description of this error
	 */
	const char* StateMachineFileNotFound::what() const throw()
	{
		static const char description[] =
			"Thai analyzer infra could not find specified StateMachine file.";
		return description;
	}
hgs
parents:
diff changeset
    48
hgs
parents:
diff changeset
    49
	/**
	 * @return a fixed, human readable description of this error
	 */
	const char* StateMachineLoadingFailed::what() const throw()
	{
		static const char description[] =
			"Thai analyzer infra failed reading the specified StateMachine file.";
		return description;
	}
hgs
parents:
diff changeset
    52
hgs
parents:
diff changeset
    53
	
hgs
parents:
diff changeset
    54
	void ThaiAnalysisInfra::init(const char* dataFile) 
hgs
parents:
diff changeset
    55
	{
hgs
parents:
diff changeset
    56
		shutdown(); 
hgs
parents:
diff changeset
    57
		theInstance_ = new ThaiAnalysisInfra(dataFile); 
hgs
parents:
diff changeset
    58
	}
hgs
parents:
diff changeset
    59
	
hgs
parents:
diff changeset
    60
	/**
	 * Gives access to the shared infra instance.
	 *
	 * @return the instance created by init(); never null
	 * @throws ThaiAnalysisInfraNotInitialized if no instance currently exists
	 */
	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance()
	{
		if ( theInstance_ == 0 )
		{
			throw ThaiAnalysisInfraNotInitialized();
		}

		return theInstance_;
	}
hgs
parents:
diff changeset
    65
			
hgs
parents:
diff changeset
    66
	void ThaiAnalysisInfra::shutdown()
hgs
parents:
diff changeset
    67
	{
hgs
parents:
diff changeset
    68
		delete theInstance_; 
hgs
parents:
diff changeset
    69
		theInstance_ = 0; 
hgs
parents:
diff changeset
    70
	}
hgs
parents:
diff changeset
    71
			
hgs
parents:
diff changeset
    72
	std::auto_ptr<BreakIterator> ThaiAnalysisInfra::createBreakIterator()
hgs
parents:
diff changeset
    73
	{
hgs
parents:
diff changeset
    74
		if ( !blob_.get() )
hgs
parents:
diff changeset
    75
		{	// load lazily
hgs
parents:
diff changeset
    76
			off_t size = Cpt::filesize(dataFile_.c_str());
hgs
parents:
diff changeset
    77
			
hgs
parents:
diff changeset
    78
			if ( !size ) throw StateMachineFileNotFound();
hgs
parents:
diff changeset
    79
			
hgs
parents:
diff changeset
    80
			blob_.reset( new byte_t[size] );  
hgs
parents:
diff changeset
    81
			
hgs
parents:
diff changeset
    82
			std::ifstream in( dataFile_.c_str(), std::ifstream::in | std::ifstream::binary );
hgs
parents:
diff changeset
    83
			
hgs
parents:
diff changeset
    84
			if ( !in ) throw StateMachineFileNotFound();
hgs
parents:
diff changeset
    85
hgs
parents:
diff changeset
    86
			in.read( reinterpret_cast<char*>( blob_.get() ), size );
hgs
parents:
diff changeset
    87
			
hgs
parents:
diff changeset
    88
			if ( in.fail() ) throw StateMachineLoadingFailed();  
hgs
parents:
diff changeset
    89
			
hgs
parents:
diff changeset
    90
			in.close(); 
hgs
parents:
diff changeset
    91
			
hgs
parents:
diff changeset
    92
			stateMachine_.reset(blob_.get());
hgs
parents:
diff changeset
    93
		}
hgs
parents:
diff changeset
    94
hgs
parents:
diff changeset
    95
		return std::auto_ptr<BreakIterator>( new StateMachineBreakIterator<ThaiSmEncoding>( stateMachine_ ) );
hgs
parents:
diff changeset
    96
	}
hgs
parents:
diff changeset
    97
			
hgs
parents:
diff changeset
    98
	/**
	 * Remembers the data file path; the file contents are not read here
	 * (see createBreakIterator(), which loads them on first use).
	 *
	 * @param dataFile path stored in dataFile_
	 * @throws StateMachineFileNotFound if Cpt::filesize() reports zero for
	 *                                  the given path
	 */
	ThaiAnalysisInfra::ThaiAnalysisInfra(const char* dataFile)
	:	blob_( 0 ),
		stateMachine_(),
		dataFile_( dataFile )
	{
		// Sanity check: fail early rather than at first use.
		if ( Cpt::filesize( dataFile ) == 0 )
		{
			throw StateMachineFileNotFound();
		}
	}
hgs
parents:
diff changeset
   106
			
hgs
parents:
diff changeset
   107
	/**
	 * Performs no explicit cleanup of its own.
	 */
	ThaiAnalysisInfra::~ThaiAnalysisInfra()
	{
	}
hgs
parents:
diff changeset
   109
	
hgs
parents:
diff changeset
   110
	/**
	 * Wraps the given token stream and obtains a break iterator from the
	 * shared Thai analysis infra.
	 *
	 * @param input    stream to read tokens from; passed to TokenFilter
	 * @param deleteTs passed to TokenFilter together with input
	 * @throws ThaiAnalysisInfraNotInitialized if the infra has no instance
	 * @throws StateMachineFileNotFound, StateMachineLoadingFailed if the
	 *         break iterator cannot be created
	 */
	ThaiWordFilter::ThaiWordFilter( lucene::analysis::TokenStream* input,
									bool deleteTs )
	:	TokenFilter( input, deleteTs ),
		breaks_(),
		thaiToken_()
	{
		ThaiAnalysisInfra* infra = ThaiAnalysisInfra::theInstance();
		breaks_ = infra->createBreakIterator();
	}
hgs
parents:
diff changeset
   118
	
hgs
parents:
diff changeset
   119
	// Lets the definitions that follow refer to lucene::analysis names
	// (such as Token and TokenStream) without qualification.
	using namespace lucene::analysis; 
hgs
parents:
diff changeset
   120
		
hgs
parents:
diff changeset
   121
	/**
	 * Performs no explicit cleanup of its own.
	 */
	ThaiWordFilter::~ThaiWordFilter()
	{
	}
hgs
parents:
diff changeset
   123
	
hgs
parents:
diff changeset
   124
// Capacity, in wchar_t elements (terminator included), of the stack buffer
// into which ThaiWordFilter::next() copies each word.
#define MAX_BUFSIZE 256
hgs
parents:
diff changeset
   125
		
hgs
parents:
diff changeset
   126
	/**
	 * Produces the next token of the filtered stream.
	 *
	 * While the break iterator still has positions left for the Thai token
	 * last read from the input, each call emits one word of it. The word's
	 * offsets are the Thai token's start offset shifted by the word's
	 * position inside that token.
	 *
	 * Otherwise the next token is read from the input. If its first
	 * character is Thai according to unicode::IsThai(), the token is kept in
	 * thaiToken_, handed to the break iterator and split as described above;
	 * any other token is passed through unchanged.
	 *
	 * @param token receives the produced token
	 * @return true if a token was produced, false once the input is
	 *         exhausted
	 */
	bool ThaiWordFilter::next(Token* token)
	{
		if ( breaks_->hasNext() )
		{
			const size_t wordBegin = breaks_->current();
			const size_t wordLength = breaks_->next() - wordBegin;

			// buf is a fixed-size stack buffer: never copy more characters
			// than it can hold together with the terminator. Overlong
			// words are truncated instead of overflowing the stack.
			const size_t maxChars = MAX_BUFSIZE - 1;
			const size_t copyLength = wordLength < maxChars ? wordLength : maxChars;

			wchar_t buf[MAX_BUFSIZE];
			memcpy( buf,
					thaiToken_.termText()+wordBegin,
					copyLength * sizeof(wchar_t) );
			buf[copyLength] = L'\0';

			// Both offsets are relative to the START of the Thai token:
			// the word covers [wordBegin, wordBegin + wordLength) inside it.
			const size_t wordStart = thaiToken_.startOffset() + wordBegin;

			token->set( buf,
						wordStart,
						wordStart + wordLength );
			return true;
		}

		if ( input->next( token ) )
		{
			if ( unicode::IsThai( token->termText()[0] ) )
			{
				// Keep a copy: token is overwritten with the individual
				// words on this and the following calls.
				thaiToken_.set( token->termText(), token->startOffset(), token->endOffset() );
				breaks_->setText( thaiToken_.termText()); // reset
				return next( token );
			}

			// Not Thai: pass the token through as it is.
			return true;
		}

		return false;
	}
hgs
parents:
diff changeset
   159
	
hgs
parents:
diff changeset
   160
hgs
parents:
diff changeset
   161
	// Lets the definitions that follow refer to lucene::analysis::standard
	// names without qualification.
	using namespace lucene::analysis::standard; 
hgs
parents:
diff changeset
   162
	
hgs
parents:
diff changeset
   163
	/**
	 * Sets up the analyzer's stop word table from CLucene's
	 * StopAnalyzer::ENGLISH_STOP_WORDS.
	 */
	ThaiAnalyzer::ThaiAnalyzer()
	:	stopWords_( false )
	{
		StopFilter::fillStopTable( &stopWords_,
								   CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS );
	}
hgs
parents:
diff changeset
   168
		
hgs
parents:
diff changeset
   169
	/**
	 * Builds the token stream used for analysis.
	 *
	 * The reader is tokenized with StandardTokenizer and the result is
	 * passed, in this order, through LowerCaseFilter, StandardFilter,
	 * ThaiWordFilter and StopFilter (using this analyzer's stop words).
	 * Each filter is created with 'true' as its second argument and is
	 * handed the stream built so far.
	 *
	 * @param fieldName not used
	 * @param reader    source of the text to tokenize
	 * @return the outermost stream of the chain, released to the caller
	 */
	lucene::analysis::TokenStream* ThaiAnalyzer::tokenStream(const wchar_t* fieldName,
														    CL_NS(util)::Reader* reader)
	{
		// Source of the chain.
		auto_ptr<TokenStream> chain( new StandardTokenizer( reader ) );

		// Every stage takes over the stream built so far.
		chain.reset( new LowerCaseFilter( chain.release(), true ) );
		chain.reset( new StandardFilter( chain.release(), true ) );
		chain.reset( new ThaiWordFilter( chain.release(), true ) );
		chain.reset( new StopFilter( chain.release(), true, &stopWords_ ) );

		return chain.release();
	}
hgs
parents:
diff changeset
   181
	
hgs
parents:
diff changeset
   182
}