|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #ifndef TINYANALYSIS_H_ |
|
19 #define TINYANALYSIS_H_ |
|
20 |
|
21 #include <string> |
|
22 #include <sstream> |
|
23 |
|
24 #include "tinyutf16.h" |
|
25 #include "wctype.h" |
|
26 |
|
27 /* |
|
28 * This file contains template based tokenization utilities. There |
|
29 * are following rationales for this package: |
|
30 * |
|
31 * * More flexibility was needed for various CJK analyzers. |
|
32 * |
|
33 * -> CLucene tokenizers are difficult to make work |
|
34 * together well. For example in practice you cannot using |
|
35 * generic n-gram tokenizer for cjk and standard tokenizer |
|
36 * for non-cjk. This cannot be done in CLucene without |
|
37 * making it very, very heavy operation. |
|
38 * |
|
39 * * More flexibility was needed on the character reading level. |
|
40 * |
|
41 * * It is possible to encounter over unicodes that don't fit in |
|
42 * 16 bit characters, when dealing with Chinese and Japanese. |
|
43 * For this reason, reading CJK should be done in unicode mode |
|
44 * instead of reading individual 16 bit codepoints. |
|
45 * |
|
46 * * Also with Korean, there is alphabetic (Hangul Jamu) and |
|
47 * syllabic writing form (Hangul Syllables). Same text can be |
|
48 * expressed in either of these forms. For good behavior (and |
|
49 * some UX reasons), it was necessary to convert all encountered |
|
50 * text into one of these forms, so that text written in Jamu |
|
51 * could be found with Hangul Syllables and visa versa. |
|
52 * |
|
53 * This package fulfills both of these requirements in a very speed |
|
54 * efficient way. Tokenizers can be easily combined to form a sort of |
|
55 * 'aggregated tokenizer'. This kind of combination is supported by design |
|
56 * and done with PairTokenizer class. |
|
57 * |
|
58 * The ability to switch the way text is read on fly is supported by |
|
59 * having the reading done by rather abstract iterators. |
|
60 * |
|
61 * Performance is taken into account by having heavily used iterators |
|
62 * resolved run-time by making it a template parameter. Lot of inlines |
|
63 * are used, but perhaps biggest optimization of it all is that instead |
|
64 * extracted tokens holding the string inside, tokenizers simply hold |
|
65 * references (in a form of an iterator) into the original character |
|
66 * buffer. So there is no heap usage, look-ups or string copying. |
|
67 * |
|
68 * NOTE: Iterators may be surprisingly big objects. While wchar_t* |
|
69 * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>> |
|
70 * is already 24 bytes. This size could be reduced to 8 bytes, but |
|
71 * it would bring performance implications. So copying of iterators |
|
72 * may be expensive. |
|
73 * |
|
74 * The design shown in here is actually very nice, flexible, simplistic, |
|
75 * fast and uses very little memory. The same design could be used |
|
76 * e.g. for lexical analysis code. |
|
77 */ |
|
78 |
|
// Forward declaration of the CLucene token type, used only as a pointer
// parameter in Token::copyTo() below (no CLucene header needed here).
namespace lucene {
    namespace analysis {
        class Token;
    }
}
|
84 |
|
85 namespace analysis { |
|
86 |
|
87 |
|
88 namespace tiny { |
|
89 |
|
/**
 * Token is an object which identifies some sequence of characters in
 * the original text stream. Holds an iterator to the beginning of the
 * token and holds information of the token's length. The length
 * is always the amount of unicode characters in the token.
 *
 * NOTE: a Token does not own or copy text — it only references the
 * original character buffer through its begin iterator, so it must not
 * outlive that buffer.
 */
template <typename Iterator>
struct Token {

    // Iterator type that walks exactly this token's characters
    // (RangeIterator is declared in tinyutf16.h).
    typedef RangeIterator<Iterator> iter;

    // Default token is empty: value-initialized iterator, length 0.
    Token() : begin_(), length_() {}
    // Token of 'length' unicode characters starting at 'begin'.
    Token(Iterator& begin, int length) : begin_(begin), length_(length) {}

    /** Length in unicode characters */
    inline int length() { return length_; };

    /** Gives iterator, that iterates over this token's characters */
    iter iterator() {
        return iter(begin_, length_);
    }
    /** Informs, whether this token is non-empty (length > 0) */
    operator bool() {
        return length_;
    }
    /** Text size in 16 bit codewords (may exceed length() for
     *  characters outside the Basic Multilingual Plane) */
    int utf16size() {
        return analysis::tiny::utf16size(iterator());
    }
    /** Copy text as 16 bit codewords into buf; buf must have room for
     *  utf16size() + 1 codewords, since a L'\0' terminator is appended. */
    void utf16(wchar_t* buf) {
        Utf16Writer<wchar_t*>(buf)<<iterator()<<L'\0';
    }
    /** Copy text as 16 bit codewords into a freshly built wstring. */
    std::wstring utf16() {
        return utf16str(iterator());
    }
    /** Copy this token content to the Clucene token.
     *  (Declaration only — implemented out of line elsewhere.) */
    void copyTo(lucene::analysis::Token* token);
private:
    Iterator begin_; // start of the token in the original buffer
    int length_;     // token length in unicode characters
};
|
133 |
|
/** Character classifier in the iswalpha() style: returns non-zero
 *  for characters that are accepted (belong to a token). */
typedef int (*Acceptor)(int c);
|
135 |
|
/** Skips all characters that are accepted by the acceptor.
 *  Advances 'it' past every accepted character, stopping at end of
 *  stream (a zero character) or the first rejected character.
 *  Returns the number of characters skipped. */
template <class Iterator, typename Acceptor>
inline int skip(Iterator& it, Acceptor accept) {
    int consumed = 0;
    for (; *it && accept(*it); ++it) {
        ++consumed;
    }
    return consumed;
}
|
143 |
|
/** Skips all characters that are NOT accepted by the acceptor.
 *  Advances 'it' past every rejected character, stopping at end of
 *  stream (a zero character) or the first accepted character.
 *  Returns the number of characters skipped. */
template <class Iterator, typename Acceptor>
inline int skipbut(Iterator& it, Acceptor accept) {
    int consumed = 0;
    for (; *it && !accept(*it); ++it) {
        ++consumed;
    }
    return consumed;
}
|
151 |
|
152 /** Consumes a token consisting of all characters accepted by the acceptor */ |
|
153 template <class Iterator, typename Acceptor> |
|
154 Token<Iterator> consume(Iterator& i, Acceptor accept) { |
|
155 Iterator begin = i; |
|
156 return Token<Iterator>( begin, skip(i, accept) ); |
|
157 } |
|
158 |
|
159 /** Abstract base class for tokenizers */ |
|
160 template <class Iterator> |
|
161 class Tokenizer { |
|
162 public: |
|
163 virtual void reset() {}; |
|
164 virtual Token<Iterator> consume(Iterator& i) = 0; |
|
165 }; |
|
166 |
|
167 /** Consumes as accepted by the acceptor */ |
|
168 template <class Iterator> |
|
169 class CustomTokenizer : public Tokenizer<Iterator> { |
|
170 public: |
|
171 CustomTokenizer(Acceptor accept) : accept_(accept) {} |
|
172 Token<Iterator> consume(Iterator& i) { |
|
173 return ::analysis::tiny::consume(i, accept_); |
|
174 } |
|
175 private: |
|
176 Acceptor accept_; |
|
177 }; |
|
178 |
|
179 /** |
|
180 * NGram tokenizer. Tokenizers NGram from any character sequence accepted |
|
181 * by acceptor. This class maintains internal state. It consumes either |
|
182 * fully sized ngrams or entire word, if the word is smaller than defined |
|
183 * ngram size. |
|
184 */ |
|
185 template <class Iterator> |
|
186 class NGramTokenizer : public Tokenizer<Iterator> { |
|
187 public: |
|
188 NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {} |
|
189 NGramTokenizer(int size) : size_(size), accept_(&iswalpha) {} |
|
190 void reset() { continue_ = false; } |
|
191 Token<Iterator> consume(Iterator& i) { |
|
192 if ( *i ) { |
|
193 Iterator end = i; |
|
194 int l = 0; |
|
195 while (l < size_ && *end && accept_( *end )) { l++; ++end; } |
|
196 if (l == size_ || (!continue_ && l)) { |
|
197 // properly sized token or whole word |
|
198 Token<Iterator> t(i, l); |
|
199 continue_ = true; |
|
200 ++i; |
|
201 return t; |
|
202 } |
|
203 } |
|
204 continue_ = false; |
|
205 return Token<Iterator>(i, 0); |
|
206 } |
|
207 private: |
|
208 int size_; |
|
209 Acceptor accept_; |
|
210 bool continue_; |
|
211 }; |
|
212 |
|
213 /** |
|
214 * Tokenizer, that returns ALWAYS a token, unless EOS is |
|
215 * reached. If the tokenizer given to this tokenizer fails, |
|
216 * relaxed tokenizer just moves one position further and |
|
217 * tries again. |
|
218 */ |
|
219 template <typename I> |
|
220 class RelaxedTokenizer : public Tokenizer<I> { |
|
221 public: |
|
222 /** Uses given tokenizer to extract tokens. */ |
|
223 RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {} |
|
224 void reset() {t_.reset();} |
|
225 /** |
|
226 * Always returns a token. If tokenization fails, |
|
227 * moves forward a character and tries again. |
|
228 */ |
|
229 Token<I> consume(I& i) { |
|
230 Token<I> t; |
|
231 while (*i && !t) { |
|
232 t = t_.consume(i); |
|
233 if (!t) { |
|
234 ++i; t_.reset(); |
|
235 } |
|
236 } |
|
237 return t; |
|
238 } |
|
239 private: |
|
240 Tokenizer<I>& t_; |
|
241 }; |
|
242 |
|
243 /** |
|
244 * Tries to first tokenize with the first tokenizer, but if it |
|
245 * fails, the second tokenizer is tried. If first tokenizer fails, |
|
246 * it is reset. |
|
247 */ |
|
248 template <typename I> |
|
249 class PairTokenizer : public Tokenizer<I>{ |
|
250 public: |
|
251 PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {} |
|
252 void reset() { |
|
253 t1_.reset(); |
|
254 t2_.reset(); |
|
255 } |
|
256 /** |
|
257 * Attempts to tokenizer with first tokenizer, then |
|
258 * with second. If both tokenizers fail, empty |
|
259 * token is returned. |
|
260 */ |
|
261 Token<I> consume(I& i) { |
|
262 Token<I> t( t1_.consume( i ) ); |
|
263 if ( !t ) { |
|
264 t1_.reset(); |
|
265 t = t2_.consume( i ); |
|
266 } |
|
267 return t; |
|
268 } |
|
269 private: |
|
270 Tokenizer<I>& t1_; |
|
271 Tokenizer<I>& t2_; |
|
272 }; |
|
273 |
|
274 } |
|
275 |
|
276 } |
|
277 |
|
#endif /* TINYANALYSIS_H_ */