searchengine/oss/loc/analysis/inc/public/thaianalysis.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
#ifndef THAIWORDFILTER_H_
hgs
parents:
diff changeset
    19
#define THAIWORDFILTER_H_
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
#include <memory>
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
#include "cpixstrtools.h"
hgs
parents:
diff changeset
    24
hgs
parents:
diff changeset
    25
#include "Clucene.h"
hgs
parents:
diff changeset
    26
hgs
parents:
diff changeset
    27
namespace analysis 
hgs
parents:
diff changeset
    28
{
hgs
parents:
diff changeset
    29
	// Forward declarations
hgs
parents:
diff changeset
    30
	class BreakIterator;
hgs
parents:
diff changeset
    31
hgs
parents:
diff changeset
    32
	
hgs
parents:
diff changeset
    33
	/**
	 * Tells Thai analysis where the compiled dictionary is located.
	 *
	 * Thai analysis is dictionary based, so it needs to know the
	 * dictionary's location in the file system. The dictionary is loaded
	 * when Thai analysis is used for the first time and is then kept in
	 * memory until shutdown.
	 *
	 * @param thaiDataFile location of the compiled Thai dictionary in the
	 *                     file system
	 */
	void InitThaiAnalysis(const char* thaiDataFile);
hgs
parents:
diff changeset
    40
	
hgs
parents:
diff changeset
    41
	/**
	 * Releases the Thai dictionary, if it has been loaded into memory.
	 */
	void ShutdownThaiAnalysis();
hgs
parents:
diff changeset
    45
	
hgs
parents:
diff changeset
    46
	/**
hgs
parents:
diff changeset
    47
	 * Dictionary based token filtering. Reads tokens from the token stream. 
hgs
parents:
diff changeset
    48
	 * If token beginning with Thai character is encountered, it breaks
hgs
parents:
diff changeset
    49
	 * the token into a number of thai word tokens based on the dictionary
hgs
parents:
diff changeset
    50
	 * with the longest matching algorithm.
hgs
parents:
diff changeset
    51
	 */
hgs
parents:
diff changeset
    52
	class ThaiWordFilter : public lucene::analysis::TokenFilter 
hgs
parents:
diff changeset
    53
	{
hgs
parents:
diff changeset
    54
	public:
hgs
parents:
diff changeset
    55
	
hgs
parents:
diff changeset
    56
		ThaiWordFilter( lucene::analysis::TokenStream* input, bool deleteTs );
hgs
parents:
diff changeset
    57
		
hgs
parents:
diff changeset
    58
		~ThaiWordFilter(); 
hgs
parents:
diff changeset
    59
		
hgs
parents:
diff changeset
    60
		bool next(lucene::analysis::Token* token); 
hgs
parents:
diff changeset
    61
	
hgs
parents:
diff changeset
    62
	private: 
hgs
parents:
diff changeset
    63
hgs
parents:
diff changeset
    64
		std::auto_ptr<BreakIterator> breaks_;
hgs
parents:
diff changeset
    65
		
hgs
parents:
diff changeset
    66
		lucene::analysis::Token thaiToken_;
hgs
parents:
diff changeset
    67
		
hgs
parents:
diff changeset
    68
	};
hgs
parents:
diff changeset
    69
	
hgs
parents:
diff changeset
    70
	/**
hgs
parents:
diff changeset
    71
	 * Analyzer for thai language. Uses StandardAnalyzer to make preliminary
hgs
parents:
diff changeset
    72
	 * tokenization and ThaiWordFilter to split Thai sentence tokens 
hgs
parents:
diff changeset
    73
	 * into a number of Thai word tokens.  
hgs
parents:
diff changeset
    74
	 */
hgs
parents:
diff changeset
    75
	class ThaiAnalyzer : public lucene::analysis::Analyzer 
hgs
parents:
diff changeset
    76
	{
hgs
parents:
diff changeset
    77
	public:
hgs
parents:
diff changeset
    78
	
hgs
parents:
diff changeset
    79
		ThaiAnalyzer(); 
hgs
parents:
diff changeset
    80
		
hgs
parents:
diff changeset
    81
	public: 
hgs
parents:
diff changeset
    82
		
hgs
parents:
diff changeset
    83
		lucene::analysis::TokenStream* tokenStream(const wchar_t* fieldName, 
hgs
parents:
diff changeset
    84
												   lucene::util::Reader* reader);
hgs
parents:
diff changeset
    85
		
hgs
parents:
diff changeset
    86
	private: 
hgs
parents:
diff changeset
    87
		
hgs
parents:
diff changeset
    88
		CL_NS(util)::CLSetList<const TCHAR*> stopWords_;
hgs
parents:
diff changeset
    89
hgs
parents:
diff changeset
    90
	};
hgs
parents:
diff changeset
    91
hgs
parents:
diff changeset
    92
}
hgs
parents:
diff changeset
    93
hgs
parents:
diff changeset
    94
#endif /* THAIWORDFILTER_H_ */