/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/

#ifndef THAIWORDFILTER_H_
#define THAIWORDFILTER_H_

#include <memory> // std::auto_ptr

#include "cpixstrtools.h"

#include "Clucene.h"

namespace analysis
{
// Forward declarations
class BreakIterator;

/**
 * Tells Thai analysis where its compiled dictionary is stored.
 *
 * Thai analysis works on top of a compiled dictionary and therefore
 * needs to know the dictionary's location in the file system. The
 * dictionary is loaded the first time Thai analysis is used and is
 * then kept in memory until shutdown.
 *
 * @param thaiDataFile location of the compiled Thai dictionary in the
 *                     file system
 *
 * NOTE(review): this header does not show whether the path string is
 * copied; confirm how long thaiDataFile must remain valid.
 */
void InitThaiAnalysis(const char* thaiDataFile);

/**
 * Releases the Thai dictionary, if it has been loaded into memory.
 */
void ShutdownThaiAnalysis();

/**
|
|
47 |
* Dictionary based token filtering. Reads tokens from the token stream.
|
|
48 |
* If token beginning with Thai character is encountered, it breaks
|
|
49 |
* the token into a number of thai word tokens based on the dictionary
|
|
50 |
* with the longest matching algorithm.
|
|
51 |
*/
|
|
52 |
class ThaiWordFilter : public lucene::analysis::TokenFilter
|
|
53 |
{
|
|
54 |
public:
|
|
55 |
|
|
56 |
ThaiWordFilter( lucene::analysis::TokenStream* input, bool deleteTs );
|
|
57 |
|
|
58 |
~ThaiWordFilter();
|
|
59 |
|
|
60 |
bool next(lucene::analysis::Token* token);
|
|
61 |
|
|
62 |
private:
|
|
63 |
|
|
64 |
std::auto_ptr<BreakIterator> breaks_;
|
|
65 |
|
|
66 |
lucene::analysis::Token thaiToken_;
|
|
67 |
|
|
68 |
};
|
|
69 |
|
|
70 |
/**
|
|
71 |
* Analyzer for thai language. Uses StandardAnalyzer to make preliminary
|
|
72 |
* tokenization and ThaiWordFilter to split Thai sentence tokens
|
|
73 |
* into a number of Thai word tokens.
|
|
74 |
*/
|
|
75 |
class ThaiAnalyzer : public lucene::analysis::Analyzer
|
|
76 |
{
|
|
77 |
public:
|
|
78 |
|
|
79 |
ThaiAnalyzer();
|
|
80 |
|
|
81 |
public:
|
|
82 |
|
|
83 |
lucene::analysis::TokenStream* tokenStream(const wchar_t* fieldName,
|
|
84 |
lucene::util::Reader* reader);
|
|
85 |
|
|
86 |
private:
|
|
87 |
|
|
88 |
CL_NS(util)::CLSetList<const TCHAR*> stopWords_;
|
|
89 |
|
|
90 |
};
|
|
91 |
|
|
92 |
} // namespace analysis

#endif /* THAIWORDFILTER_H_ */