|
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
|
17 |
|
18 #ifndef THAIWORDFILTER_H_ |
|
19 #define THAIWORDFILTER_H_ |
|
20 |
|
21 #include <memory> |
|
22 |
|
23 #include "cpixstrtools.h" |
|
24 |
|
25 #include "Clucene.h" |
|
26 |
|
27 namespace analysis |
|
28 { |
|
// Forward declaration: this header only names BreakIterator as the pointee
// of a smart-pointer member, so the full definition is not needed here.
class BreakIterator;
|
31 |
|
32 |
|
/**
 * Tells the Thai analysis where its compiled dictionary lives.
 *
 * Thai analysis is dictionary based and has to know the dictionary's
 * location in the file system. The dictionary itself is loaded the
 * first time Thai analysis is used and is then kept in memory until
 * shutdown.
 *
 * @param thaiDataFile location of the compiled Thai dictionary in the
 *                     file system.
 *                     NOTE(review): it is not visible from this header
 *                     whether the string is copied or only the pointer
 *                     is stored - confirm before releasing the buffer.
 */
void InitThaiAnalysis(const char* thaiDataFile);
|
40 |
|
/**
 * Releases the Thai dictionary if it has been loaded into memory.
 * Takes no arguments and returns nothing.
 */
void ShutdownThaiAnalysis();
|
45 |
|
46 /** |
|
47 * Dictionary based token filtering. Reads tokens from the token stream. |
|
48 * If token beginning with Thai character is encountered, it breaks |
|
49 * the token into a number of thai word tokens based on the dictionary |
|
50 * with the longest matching algorithm. |
|
51 */ |
|
52 class ThaiWordFilter : public lucene::analysis::TokenFilter |
|
53 { |
|
54 public: |
|
55 |
|
56 ThaiWordFilter( lucene::analysis::TokenStream* input, bool deleteTs ); |
|
57 |
|
58 ~ThaiWordFilter(); |
|
59 |
|
60 bool next(lucene::analysis::Token* token); |
|
61 |
|
62 private: |
|
63 |
|
64 std::auto_ptr<BreakIterator> breaks_; |
|
65 |
|
66 lucene::analysis::Token thaiToken_; |
|
67 |
|
68 }; |
|
69 |
|
70 /** |
|
71 * Analyzer for thai language. Uses StandardAnalyzer to make preliminary |
|
72 * tokenization and ThaiWordFilter to split Thai sentence tokens |
|
73 * into a number of Thai word tokens. |
|
74 */ |
|
75 class ThaiAnalyzer : public lucene::analysis::Analyzer |
|
76 { |
|
77 public: |
|
78 |
|
79 ThaiAnalyzer(); |
|
80 |
|
81 public: |
|
82 |
|
83 lucene::analysis::TokenStream* tokenStream(const wchar_t* fieldName, |
|
84 lucene::util::Reader* reader); |
|
85 |
|
86 private: |
|
87 |
|
88 CL_NS(util)::CLSetList<const TCHAR*> stopWords_; |
|
89 |
|
90 }; |
|
91 |
|
92 } |
|
93 |
|
94 #endif /* THAIWORDFILTER_H_ */ |