diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/src/thaianalysis.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/searchengine/oss/loc/analysis/src/thaianalysis.cpp Fri Oct 15 12:09:28 2010 +0530 @@ -0,0 +1,182 @@ +/* +* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ +#include "thaianalysis.h" + +#include "cpixfstools.h" + +#include "CLucene/analysis/standard/StandardTokenizer.h" + +#include +#include + +#include "tinyunicode.h" + +#include "thaistatemachine.h" + +namespace analysis { + + void InitThaiAnalysis(const char* thaiDataFile) { + ThaiAnalysisInfra::init(thaiDataFile); + } + void ShutdownThaiAnalysis() { + ThaiAnalysisInfra::shutdown(); + } + + ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance_ = NULL; + + const char* ThaiAnalysisInfraNotInitialized::what() const throw() { + return "Thai analyzer infra was not initialized."; + } + + const char* StateMachineFileNotFound::what() const throw() { + return "Thai analyzer infra could not find specified StateMachine file."; + } + + const char* StateMachineLoadingFailed::what() const throw() { + return "Thai analyzer infra failed reading the specified StateMachine file."; + } + + + void ThaiAnalysisInfra::init(const char* dataFile) + { + shutdown(); + theInstance_ = new ThaiAnalysisInfra(dataFile); + } + + ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance() + { + if ( !theInstance_ ) throw ThaiAnalysisInfraNotInitialized(); + return theInstance_; + } + + void ThaiAnalysisInfra::shutdown() + { + delete theInstance_; + theInstance_ = 0; + } + + std::auto_ptr ThaiAnalysisInfra::createBreakIterator() + { + if ( !blob_.get() ) + { // load lazily + off_t size = Cpt::filesize(dataFile_.c_str()); + + if ( !size ) throw StateMachineFileNotFound(); + + blob_.reset( new byte_t[size] ); + + std::ifstream in( dataFile_.c_str(), std::ifstream::in | std::ifstream::binary ); + + if ( !in ) throw StateMachineFileNotFound(); + + in.read( reinterpret_cast( blob_.get() ), size ); + + if ( in.fail() ) throw StateMachineLoadingFailed(); + + in.close(); + + stateMachine_.reset(blob_.get()); + } + + return std::auto_ptr( new StateMachineBreakIterator( stateMachine_ ) ); + } + + ThaiAnalysisInfra::ThaiAnalysisInfra(const char* dataFile) + : blob_(0), + stateMachine_(), + dataFile_(dataFile) + { + // sanity check + if ( !Cpt::filesize(dataFile) ) throw StateMachineFileNotFound(); + } + + ThaiAnalysisInfra::~ThaiAnalysisInfra() + {} + + ThaiWordFilter::ThaiWordFilter( lucene::analysis::TokenStream* input, + bool deleteTs ) + : TokenFilter(input, deleteTs), + breaks_(), + thaiToken_() + { + breaks_ = ThaiAnalysisInfra::theInstance()->createBreakIterator(); + } + + using namespace lucene::analysis; + + ThaiWordFilter::~ThaiWordFilter() + {} + +#define MAX_BUFSIZE 256 + + bool ThaiWordFilter::next(Token* token) + { + if ( breaks_->hasNext() ) + { + size_t wordBegin = breaks_->current(); + size_t wordLength = breaks_->next() - wordBegin; + + wchar_t buf[MAX_BUFSIZE]; + memcpy( buf, + thaiToken_.termText()+wordBegin, + wordLength * sizeof(wchar_t) ); + buf[wordLength] = '\0'; + + token->set( buf, + thaiToken_.startOffset() + wordBegin, + thaiToken_.endOffset() + wordBegin + wordLength); + return true; + } + + if ( input->next( token ) ) + { + if ( unicode::IsThai( token->termText()[0] ) ) + { + thaiToken_.set( token->termText(), token->startOffset(), token->endOffset() ); + breaks_->setText( thaiToken_.termText()); // reset + return next( token ); + } else { + return true; + } + } + + return false; + } + + + using namespace lucene::analysis::standard; + + ThaiAnalyzer::ThaiAnalyzer() + : stopWords_(false) + { + StopFilter::fillStopTable( &stopWords_,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS); + } + + lucene::analysis::TokenStream* ThaiAnalyzer::tokenStream(const wchar_t* fieldName, + CL_NS(util)::Reader* reader) + { + auto_ptr ret( new StandardTokenizer(reader) ); + + ret.reset( new LowerCaseFilter( ret.release(), true ) ); + ret.reset( new StandardFilter( ret.release(), true ) ); + ret.reset( new ThaiWordFilter( ret.release(), true ) ); + ret.reset( new StopFilter( ret.release(), true, &stopWords_ ) ); + + return ret.release(); + } + +}