searchengine/oss/loc/analysis/src/thaianalysis.cpp
changeset 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/src/thaianalysis.cpp	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,182 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+#include "thaianalysis.h"
+
+#include "cpixfstools.h"
+
+#include "CLucene/analysis/standard/StandardTokenizer.h"
+
+#include <iostream>
+#include <fstream>
+
+#include "tinyunicode.h"
+
+#include "thaistatemachine.h"
+
+namespace analysis {
+
+	void InitThaiAnalysis(const char* thaiDataFile) {
+		ThaiAnalysisInfra::init(thaiDataFile);
+	}
+	void ShutdownThaiAnalysis() {
+		ThaiAnalysisInfra::shutdown();
+	}
+
+	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance_ = NULL; 
+
+	const char* ThaiAnalysisInfraNotInitialized::what() const throw() {
+		return "Thai analyzer infra was not initialized.";
+	}
+
+	const char* StateMachineFileNotFound::what() const throw() {
+		return "Thai analyzer infra could not find specified StateMachine file.";
+	}
+
+	const char* StateMachineLoadingFailed::what() const throw() {
+		return "Thai analyzer infra failed reading the specified StateMachine file.";
+	}
+
+	
+	void ThaiAnalysisInfra::init(const char* dataFile) 
+	{
+		shutdown(); 
+		theInstance_ = new ThaiAnalysisInfra(dataFile); 
+	}
+	
+	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance()
+	{
+		if ( !theInstance_ ) throw ThaiAnalysisInfraNotInitialized();
+		return theInstance_; 
+	}
+			
+	void ThaiAnalysisInfra::shutdown()
+	{
+		delete theInstance_; 
+		theInstance_ = 0; 
+	}
+			
+	std::auto_ptr<BreakIterator> ThaiAnalysisInfra::createBreakIterator()
+	{
+		if ( !blob_.get() )
+		{	// load lazily
+			off_t size = Cpt::filesize(dataFile_.c_str());
+			
+			if ( !size ) throw StateMachineFileNotFound();
+			
+			blob_.reset( new byte_t[size] );  
+			
+			std::ifstream in( dataFile_.c_str(), std::ifstream::in | std::ifstream::binary );
+			
+			if ( !in ) throw StateMachineFileNotFound();
+
+			in.read( reinterpret_cast<char*>( blob_.get() ), size );
+			
+			if ( in.fail() ) throw StateMachineLoadingFailed();  
+			
+			in.close(); 
+			
+			stateMachine_.reset(blob_.get());
+		}
+
+		return std::auto_ptr<BreakIterator>( new StateMachineBreakIterator<ThaiSmEncoding>( stateMachine_ ) );
+	}
+			
+	ThaiAnalysisInfra::ThaiAnalysisInfra(const char* dataFile)
+	: 	blob_(0), 
+	    stateMachine_(),
+	  	dataFile_(dataFile) 
+	{
+		// sanity check
+		if ( !Cpt::filesize(dataFile) ) throw StateMachineFileNotFound(); 
+	}
+			
+	ThaiAnalysisInfra::~ThaiAnalysisInfra()
+	{}
+	
+	ThaiWordFilter::ThaiWordFilter( lucene::analysis::TokenStream* input, 
+									bool deleteTs )
+	:	TokenFilter(input, deleteTs),
+		breaks_(),
+		thaiToken_()
+	{
+		breaks_ = ThaiAnalysisInfra::theInstance()->createBreakIterator(); 
+	}
+	
+	using namespace lucene::analysis; 
+		
+	ThaiWordFilter::~ThaiWordFilter()
+	{}
+	
+#define MAX_BUFSIZE 256
+		
+	bool ThaiWordFilter::next(Token* token)
+	{
+		if ( breaks_->hasNext() ) 
+		{
+			size_t wordBegin = breaks_->current(); 
+			size_t wordLength = breaks_->next() - wordBegin;
+			
+			wchar_t buf[MAX_BUFSIZE];
+			memcpy( buf, 
+					thaiToken_.termText()+wordBegin, 
+					wordLength * sizeof(wchar_t) );
+			buf[wordLength] = '\0';
+			
+			token->set( buf, 
+						thaiToken_.startOffset() + wordBegin,  
+						thaiToken_.endOffset() + wordBegin + wordLength);
+			return true; 
+		}
+		
+		if ( input->next( token ) )
+		{
+			if ( unicode::IsThai( token->termText()[0] ) )
+			{
+				thaiToken_.set( token->termText(), token->startOffset(), token->endOffset() );
+				breaks_->setText( thaiToken_.termText()); // reset
+				return next( token );  
+			} else {
+				return true;
+			}
+		}
+	
+		return false;
+	}
+	
+
+	using namespace lucene::analysis::standard; 
+	
+	ThaiAnalyzer::ThaiAnalyzer()
+	:	stopWords_(false)
+	{
+		StopFilter::fillStopTable( &stopWords_,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS);
+	}
+		
+	lucene::analysis::TokenStream* ThaiAnalyzer::tokenStream(const wchar_t* fieldName, 
+														    CL_NS(util)::Reader* reader)
+	{
+		auto_ptr<TokenStream> ret(  new StandardTokenizer(reader) ); 
+		
+		ret.reset( new LowerCaseFilter( ret.release(), true ) ); 
+		ret.reset( new StandardFilter( ret.release(), true ) ); 
+		ret.reset( new ThaiWordFilter( ret.release(), true ) ); 
+		ret.reset( new StopFilter( ret.release(), true, &stopWords_ ) ); 
+		
+		return ret.release();  
+	}
+	
+}