searchengine/oss/loc/analysis/src/thaianalysis.cpp
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
#include "thaianalysis.h"

#include "cpixfstools.h"

#include "CLucene/analysis/standard/StandardTokenizer.h"

#include <iostream>
#include <fstream>
#include <string>

#include "tinyunicode.h"

#include "thaistatemachine.h"
       
    29 
       
    30 namespace analysis {
       
    31 
       
    32 	void InitThaiAnalysis(const char* thaiDataFile) {
       
    33 		ThaiAnalysisInfra::init(thaiDataFile);
       
    34 	}
       
    35 	void ShutdownThaiAnalysis() {
       
    36 		ThaiAnalysisInfra::shutdown();
       
    37 	}
       
    38 
       
    39 	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance_ = NULL; 
       
    40 
       
    41 	const char* ThaiAnalysisInfraNotInitialized::what() const throw() {
       
    42 		return "Thai analyzer infra was not initialized.";
       
    43 	}
       
    44 
       
    45 	const char* StateMachineFileNotFound::what() const throw() {
       
    46 		return "Thai analyzer infra could not find specified StateMachine file.";
       
    47 	}
       
    48 
       
    49 	const char* StateMachineLoadingFailed::what() const throw() {
       
    50 		return "Thai analyzer infra failed reading the specified StateMachine file.";
       
    51 	}
       
    52 
       
    53 	
       
    54 	void ThaiAnalysisInfra::init(const char* dataFile) 
       
    55 	{
       
    56 		shutdown(); 
       
    57 		theInstance_ = new ThaiAnalysisInfra(dataFile); 
       
    58 	}
       
    59 	
       
    60 	ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance()
       
    61 	{
       
    62 		if ( !theInstance_ ) throw ThaiAnalysisInfraNotInitialized();
       
    63 		return theInstance_; 
       
    64 	}
       
    65 			
       
    66 	void ThaiAnalysisInfra::shutdown()
       
    67 	{
       
    68 		delete theInstance_; 
       
    69 		theInstance_ = 0; 
       
    70 	}
       
    71 			
       
    72 	std::auto_ptr<BreakIterator> ThaiAnalysisInfra::createBreakIterator()
       
    73 	{
       
    74 		if ( !blob_.get() )
       
    75 		{	// load lazily
       
    76 			off_t size = Cpt::filesize(dataFile_.c_str());
       
    77 			
       
    78 			if ( !size ) throw StateMachineFileNotFound();
       
    79 			
       
    80 			blob_.reset( new byte_t[size] );  
       
    81 			
       
    82 			std::ifstream in( dataFile_.c_str(), std::ifstream::in | std::ifstream::binary );
       
    83 			
       
    84 			if ( !in ) throw StateMachineFileNotFound();
       
    85 
       
    86 			in.read( reinterpret_cast<char*>( blob_.get() ), size );
       
    87 			
       
    88 			if ( in.fail() ) throw StateMachineLoadingFailed();  
       
    89 			
       
    90 			in.close(); 
       
    91 			
       
    92 			stateMachine_.reset(blob_.get());
       
    93 		}
       
    94 
       
    95 		return std::auto_ptr<BreakIterator>( new StateMachineBreakIterator<ThaiSmEncoding>( stateMachine_ ) );
       
    96 	}
       
    97 			
       
    98 	ThaiAnalysisInfra::ThaiAnalysisInfra(const char* dataFile)
       
    99 	: 	blob_(0), 
       
   100 	    stateMachine_(),
       
   101 	  	dataFile_(dataFile) 
       
   102 	{
       
   103 		// sanity check
       
   104 		if ( !Cpt::filesize(dataFile) ) throw StateMachineFileNotFound(); 
       
   105 	}
       
   106 			
       
   107 	ThaiAnalysisInfra::~ThaiAnalysisInfra()
       
   108 	{}
       
   109 	
       
   110 	ThaiWordFilter::ThaiWordFilter( lucene::analysis::TokenStream* input, 
       
   111 									bool deleteTs )
       
   112 	:	TokenFilter(input, deleteTs),
       
   113 		breaks_(),
       
   114 		thaiToken_()
       
   115 	{
       
   116 		breaks_ = ThaiAnalysisInfra::theInstance()->createBreakIterator(); 
       
   117 	}
       
   118 	
       
   119 	using namespace lucene::analysis; 
       
   120 		
       
   121 	ThaiWordFilter::~ThaiWordFilter()
       
   122 	{}
       
   123 	
       
   124 #define MAX_BUFSIZE 256
       
   125 		
       
   126 	bool ThaiWordFilter::next(Token* token)
       
   127 	{
       
   128 		if ( breaks_->hasNext() ) 
       
   129 		{
       
   130 			size_t wordBegin = breaks_->current(); 
       
   131 			size_t wordLength = breaks_->next() - wordBegin;
       
   132 			
       
   133 			wchar_t buf[MAX_BUFSIZE];
       
   134 			memcpy( buf, 
       
   135 					thaiToken_.termText()+wordBegin, 
       
   136 					wordLength * sizeof(wchar_t) );
       
   137 			buf[wordLength] = '\0';
       
   138 			
       
   139 			token->set( buf, 
       
   140 						thaiToken_.startOffset() + wordBegin,  
       
   141 						thaiToken_.endOffset() + wordBegin + wordLength);
       
   142 			return true; 
       
   143 		}
       
   144 		
       
   145 		if ( input->next( token ) )
       
   146 		{
       
   147 			if ( unicode::IsThai( token->termText()[0] ) )
       
   148 			{
       
   149 				thaiToken_.set( token->termText(), token->startOffset(), token->endOffset() );
       
   150 				breaks_->setText( thaiToken_.termText()); // reset
       
   151 				return next( token );  
       
   152 			} else {
       
   153 				return true;
       
   154 			}
       
   155 		}
       
   156 	
       
   157 		return false;
       
   158 	}
       
   159 	
       
   160 
       
   161 	using namespace lucene::analysis::standard; 
       
   162 	
       
   163 	ThaiAnalyzer::ThaiAnalyzer()
       
   164 	:	stopWords_(false)
       
   165 	{
       
   166 		StopFilter::fillStopTable( &stopWords_,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS);
       
   167 	}
       
   168 		
       
   169 	lucene::analysis::TokenStream* ThaiAnalyzer::tokenStream(const wchar_t* fieldName, 
       
   170 														    CL_NS(util)::Reader* reader)
       
   171 	{
       
   172 		auto_ptr<TokenStream> ret(  new StandardTokenizer(reader) ); 
       
   173 		
       
   174 		ret.reset( new LowerCaseFilter( ret.release(), true ) ); 
       
   175 		ret.reset( new StandardFilter( ret.release(), true ) ); 
       
   176 		ret.reset( new ThaiWordFilter( ret.release(), true ) ); 
       
   177 		ret.reset( new StopFilter( ret.release(), true, &stopWords_ ) ); 
       
   178 		
       
   179 		return ret.release();  
       
   180 	}
       
   181 	
       
   182 }