|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
#include "thaianalysis.h"

#include "cpixfstools.h"

#include "CLucene/analysis/standard/StandardTokenizer.h"

#include <iostream>
#include <fstream>
#include <string>

#include "tinyunicode.h"

#include "thaistatemachine.h"
|
29 |
|
30 namespace analysis { |
|
31 |
|
32 void InitThaiAnalysis(const char* thaiDataFile) { |
|
33 ThaiAnalysisInfra::init(thaiDataFile); |
|
34 } |
|
35 void ShutdownThaiAnalysis() { |
|
36 ThaiAnalysisInfra::shutdown(); |
|
37 } |
|
38 |
|
39 ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance_ = NULL; |
|
40 |
|
41 const char* ThaiAnalysisInfraNotInitialized::what() const throw() { |
|
42 return "Thai analyzer infra was not initialized."; |
|
43 } |
|
44 |
|
45 const char* StateMachineFileNotFound::what() const throw() { |
|
46 return "Thai analyzer infra could not find specified StateMachine file."; |
|
47 } |
|
48 |
|
49 const char* StateMachineLoadingFailed::what() const throw() { |
|
50 return "Thai analyzer infra failed reading the specified StateMachine file."; |
|
51 } |
|
52 |
|
53 |
|
54 void ThaiAnalysisInfra::init(const char* dataFile) |
|
55 { |
|
56 shutdown(); |
|
57 theInstance_ = new ThaiAnalysisInfra(dataFile); |
|
58 } |
|
59 |
|
60 ThaiAnalysisInfra* ThaiAnalysisInfra::theInstance() |
|
61 { |
|
62 if ( !theInstance_ ) throw ThaiAnalysisInfraNotInitialized(); |
|
63 return theInstance_; |
|
64 } |
|
65 |
|
66 void ThaiAnalysisInfra::shutdown() |
|
67 { |
|
68 delete theInstance_; |
|
69 theInstance_ = 0; |
|
70 } |
|
71 |
|
72 std::auto_ptr<BreakIterator> ThaiAnalysisInfra::createBreakIterator() |
|
73 { |
|
74 if ( !blob_.get() ) |
|
75 { // load lazily |
|
76 off_t size = Cpt::filesize(dataFile_.c_str()); |
|
77 |
|
78 if ( !size ) throw StateMachineFileNotFound(); |
|
79 |
|
80 blob_.reset( new byte_t[size] ); |
|
81 |
|
82 std::ifstream in( dataFile_.c_str(), std::ifstream::in | std::ifstream::binary ); |
|
83 |
|
84 if ( !in ) throw StateMachineFileNotFound(); |
|
85 |
|
86 in.read( reinterpret_cast<char*>( blob_.get() ), size ); |
|
87 |
|
88 if ( in.fail() ) throw StateMachineLoadingFailed(); |
|
89 |
|
90 in.close(); |
|
91 |
|
92 stateMachine_.reset(blob_.get()); |
|
93 } |
|
94 |
|
95 return std::auto_ptr<BreakIterator>( new StateMachineBreakIterator<ThaiSmEncoding>( stateMachine_ ) ); |
|
96 } |
|
97 |
|
98 ThaiAnalysisInfra::ThaiAnalysisInfra(const char* dataFile) |
|
99 : blob_(0), |
|
100 stateMachine_(), |
|
101 dataFile_(dataFile) |
|
102 { |
|
103 // sanity check |
|
104 if ( !Cpt::filesize(dataFile) ) throw StateMachineFileNotFound(); |
|
105 } |
|
106 |
|
107 ThaiAnalysisInfra::~ThaiAnalysisInfra() |
|
108 {} |
|
109 |
|
110 ThaiWordFilter::ThaiWordFilter( lucene::analysis::TokenStream* input, |
|
111 bool deleteTs ) |
|
112 : TokenFilter(input, deleteTs), |
|
113 breaks_(), |
|
114 thaiToken_() |
|
115 { |
|
116 breaks_ = ThaiAnalysisInfra::theInstance()->createBreakIterator(); |
|
117 } |
|
118 |
|
119 using namespace lucene::analysis; |
|
120 |
|
121 ThaiWordFilter::~ThaiWordFilter() |
|
122 {} |
|
123 |
|
124 #define MAX_BUFSIZE 256 |
|
125 |
|
126 bool ThaiWordFilter::next(Token* token) |
|
127 { |
|
128 if ( breaks_->hasNext() ) |
|
129 { |
|
130 size_t wordBegin = breaks_->current(); |
|
131 size_t wordLength = breaks_->next() - wordBegin; |
|
132 |
|
133 wchar_t buf[MAX_BUFSIZE]; |
|
134 memcpy( buf, |
|
135 thaiToken_.termText()+wordBegin, |
|
136 wordLength * sizeof(wchar_t) ); |
|
137 buf[wordLength] = '\0'; |
|
138 |
|
139 token->set( buf, |
|
140 thaiToken_.startOffset() + wordBegin, |
|
141 thaiToken_.endOffset() + wordBegin + wordLength); |
|
142 return true; |
|
143 } |
|
144 |
|
145 if ( input->next( token ) ) |
|
146 { |
|
147 if ( unicode::IsThai( token->termText()[0] ) ) |
|
148 { |
|
149 thaiToken_.set( token->termText(), token->startOffset(), token->endOffset() ); |
|
150 breaks_->setText( thaiToken_.termText()); // reset |
|
151 return next( token ); |
|
152 } else { |
|
153 return true; |
|
154 } |
|
155 } |
|
156 |
|
157 return false; |
|
158 } |
|
159 |
|
160 |
|
161 using namespace lucene::analysis::standard; |
|
162 |
|
163 ThaiAnalyzer::ThaiAnalyzer() |
|
164 : stopWords_(false) |
|
165 { |
|
166 StopFilter::fillStopTable( &stopWords_,CL_NS(analysis)::StopAnalyzer::ENGLISH_STOP_WORDS); |
|
167 } |
|
168 |
|
169 lucene::analysis::TokenStream* ThaiAnalyzer::tokenStream(const wchar_t* fieldName, |
|
170 CL_NS(util)::Reader* reader) |
|
171 { |
|
172 auto_ptr<TokenStream> ret( new StandardTokenizer(reader) ); |
|
173 |
|
174 ret.reset( new LowerCaseFilter( ret.release(), true ) ); |
|
175 ret.reset( new StandardFilter( ret.release(), true ) ); |
|
176 ret.reset( new ThaiWordFilter( ret.release(), true ) ); |
|
177 ret.reset( new StopFilter( ret.release(), true, &stopWords_ ) ); |
|
178 |
|
179 return ret.release(); |
|
180 } |
|
181 |
|
182 } |