searchengine/oss/loc/analysis/inc/public/thaianalysis.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
     14 * Description: Dictionary based Thai word filter and Thai analyzer.
       
    15 *
       
    16 */
       
    17 
       
    18 #ifndef THAIWORDFILTER_H_
       
    19 #define THAIWORDFILTER_H_
       
    20 
       
    21 #include <memory>
       
    22 
       
    23 #include "cpixstrtools.h"
       
    24 
       
    25 #include "Clucene.h"
       
    26 
       
    27 namespace analysis 
       
    28 {
       
    29 	// Forward declarations
       
    30 	class BreakIterator;
       
    31 
       
    32 	
       
    33 	/**
       
    34 	 * Thai analysis uses compiled dictionary and it needs to 
       
    35 	 * know the dictionary's location in the file system. This dictionary 
       
    36 	 * is then loaded when thai analysis is used for first time.
       
    37 	 * The dictionary is kept in memory until shutdown.
       
    38 	 */ 
       
    39 	void InitThaiAnalysis(const char* thaiDataFile);
       
    40 	
       
    41 	/**
       
    42 	 * Releases the thai dictionary, if loaded to memory 
       
    43 	 */
       
    44 	void ShutdownThaiAnalysis(); 
       
    45 	
       
    46 	/**
       
    47 	 * Dictionary based token filtering. Reads tokens from the token stream. 
       
    48 	 * If token beginning with Thai character is encountered, it breaks
       
    49 	 * the token into a number of thai word tokens based on the dictionary
       
    50 	 * with the longest matching algorithm.
       
    51 	 */
       
    52 	class ThaiWordFilter : public lucene::analysis::TokenFilter 
       
    53 	{
       
    54 	public:
       
    55 	
       
    56 		ThaiWordFilter( lucene::analysis::TokenStream* input, bool deleteTs );
       
    57 		
       
    58 		~ThaiWordFilter(); 
       
    59 		
       
    60 		bool next(lucene::analysis::Token* token); 
       
    61 	
       
    62 	private: 
       
    63 
       
    64 		std::auto_ptr<BreakIterator> breaks_;
       
    65 		
       
    66 		lucene::analysis::Token thaiToken_;
       
    67 		
       
    68 	};
       
    69 	
       
    70 	/**
       
    71 	 * Analyzer for thai language. Uses StandardAnalyzer to make preliminary
       
    72 	 * tokenization and ThaiWordFilter to split Thai sentence tokens 
       
    73 	 * into a number of Thai word tokens.  
       
    74 	 */
       
    75 	class ThaiAnalyzer : public lucene::analysis::Analyzer 
       
    76 	{
       
    77 	public:
       
    78 	
       
    79 		ThaiAnalyzer(); 
       
    80 		
       
    81 	public: 
       
    82 		
       
    83 		lucene::analysis::TokenStream* tokenStream(const wchar_t* fieldName, 
       
    84 												   lucene::util::Reader* reader);
       
    85 		
       
    86 	private: 
       
    87 		
       
    88 		CL_NS(util)::CLSetList<const TCHAR*> stopWords_;
       
    89 
       
    90 	};
       
    91 
       
    92 }
       
    93 
       
    94 #endif /* THAIWORDFILTER_H_ */