/*
 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
 * All rights reserved.
 * This component and the accompanying materials are made available
 * under the terms of "Eclipse Public License v1.0"
 * which accompanies this distribution, and is available
 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
 *
 * Initial Contributors:
 * Nokia Corporation - initial contribution.
 *
 * Contributors:
 *
 * Description:
 *
 */

#ifndef TINYANALYSIS_H_
#define TINYANALYSIS_H_

#include <string>
#include <sstream>

#include "tinyutf16.h"
#include "wctype.h"

/*
 * This file contains template based tokenization utilities. The
 * rationales for this package are the following:
 *
 * * More flexibility was needed for various CJK analyzers.
 *
 *   -> CLucene tokenizers are difficult to make work well
 *      together. For example, in practice you cannot use a
 *      generic n-gram tokenizer for CJK and the standard tokenizer
 *      for non-CJK text. This cannot be done in CLucene without
 *      making it a very, very heavy operation.
 *
 * * More flexibility was needed on the character reading level.
 *
 * * It is possible to encounter Unicode code points that don't fit
 *   in 16 bit characters when dealing with Chinese and Japanese.
 *   For this reason, CJK text should be read in Unicode mode
 *   instead of as individual 16 bit code units.
 *
 * * Also, Korean has an alphabetic writing form (Hangul Jamo) and
 *   a syllabic writing form (Hangul Syllables). The same text can be
 *   expressed in either of these forms. For good behavior (and
 *   some UX reasons), it was necessary to convert all encountered
 *   text into one of these forms, so that text written in Jamo
 *   could be found with Hangul Syllables and vice versa.
 *
 * This package fulfills these requirements in a very speed
 * efficient way. Tokenizers can be easily combined to form a sort of
 * 'aggregated tokenizer'. This kind of combination is supported by design
 * and is done with the PairTokenizer class.
 *
 * The ability to switch the way text is read on the fly is supported by
 * having the reading done by rather abstract iterators.
 *
 * Performance is taken into account by having heavily used iterators
 * resolved at compile time, by making the iterator a template parameter.
 * Lots of inlining is used, but perhaps the biggest optimization of all
 * is that instead of extracted tokens holding the string inside, tokens
 * simply hold references (in the form of an iterator) into the original
 * character buffer. So there is no heap usage, look-up or string copying.
 *
 * NOTE: Iterators may be surprisingly big objects. While a wchar_t*
 * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>>
 * is already 24 bytes. This size could be reduced to 8 bytes, but
 * that would have performance implications. So copying iterators
 * may be expensive.
 *
 * The design shown here is very nice: flexible, simple, fast,
 * and it uses very little memory. The same design could be used
 * e.g. for lexical analysis code.
 */
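
/*
 * Illustrative sketch of the intended composition (not part of this
 * header's API; it assumes a NUL-terminated wchar_t pointer models the
 * Iterator concept, and the CJK-only acceptor iscjk() is hypothetical):
 *
 *   typedef const wchar_t* I;
 *   NGramTokenizer<I>   cjk(2, &iscjk);     // bigrams over CJK runs
 *   CustomTokenizer<I>  words(&iswalpha);   // whole words elsewhere
 *   PairTokenizer<I>    pair(cjk, words);   // try CJK n-grams first
 *   RelaxedTokenizer<I> tokenizer(pair);    // skip anything unaccepted
 *
 *   I i = text;
 *   while (Token<I> t = tokenizer.consume(i)) {
 *       std::wstring word = t.utf16();      // copy out only when needed
 *   }
 */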

namespace lucene {
namespace analysis {
class Token;
}
}

namespace analysis {

namespace tiny {

/**
 * A Token is an object that identifies a sequence of characters in
 * the original text stream. It holds an iterator to the beginning of
 * the token together with the token's length. The length is always
 * the number of Unicode characters in the token.
 */
template <typename Iterator>
struct Token {

    typedef RangeIterator<Iterator> iter;

    Token() : begin_(), length_() {}
    Token(Iterator& begin, int length) : begin_(begin), length_(length) {}

    /** Length in Unicode characters */
    inline int length() { return length_; }

    /** Gives an iterator that iterates over this token's characters */
    iter iterator() {
        return iter(begin_, length_);
    }
    /** Informs whether this token is non-empty */
    operator bool() {
        return length_ != 0;
    }
    /** Text size in 16 bit code units */
    int utf16size() {
        return analysis::tiny::utf16size(iterator());
    }
    /** Copy the text as 16 bit code units into buf */
    void utf16(wchar_t* buf) {
        Utf16Writer<wchar_t*>(buf) << iterator() << L'\0';
    }
    /** Copy the text as a 16 bit code unit string */
    std::wstring utf16() {
        return utf16str(iterator());
    }
    /** Copy this token's content to the given CLucene token. */
    void copyTo(lucene::analysis::Token* token);
private:
    Iterator begin_;
    int length_;
};
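
/*
 * A minimal sketch of Token's reference semantics (assuming a plain
 * NUL-terminated wchar_t pointer models the Iterator concept):
 *
 *   const wchar_t* text = L"hello";
 *   const wchar_t* i = text;
 *   Token<const wchar_t*> t(i, 5);    // refers into 'text', no copy
 *   int n = t.length();               // 5 Unicode characters
 *   std::wstring s = t.utf16();       // copying happens only on request
 */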

typedef int (*Acceptor)(int c);

/** Skips all characters that are accepted by the acceptor */
template <class Iterator, typename Acceptor>
inline int skip(Iterator& i, Acceptor accept) {
    int ret = 0;
    while ( *i && accept( *i ) ) { ++i; ret++; }
    return ret;
}

/** Skips all characters that are not accepted by the acceptor */
template <class Iterator, typename Acceptor>
inline int skipbut(Iterator& i, Acceptor accept) {
    int ret = 0;
    while ( *i && !accept( *i ) ) { ++i; ret++; }
    return ret;
}

/** Consumes a token consisting of all characters accepted by the acceptor */
template <class Iterator, typename Acceptor>
Token<Iterator> consume(Iterator& i, Acceptor accept) {
    Iterator begin = i;
    return Token<Iterator>( begin, skip(i, accept) );
}
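
/*
 * For example, consuming one alphabetic word in place (a sketch; the
 * iterator is advanced past the consumed characters):
 *
 *   const wchar_t* i = L"foo bar";
 *   Token<const wchar_t*> word = consume(i, &iswalpha);
 *   // word.length() == 3 and i now points at the space before "bar"
 */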

/** Abstract base class for tokenizers */
template <class Iterator>
class Tokenizer {
public:
    virtual ~Tokenizer() {}
    virtual void reset() {}
    virtual Token<Iterator> consume(Iterator& i) = 0;
};

/** Consumes characters as accepted by the acceptor */
template <class Iterator>
class CustomTokenizer : public Tokenizer<Iterator> {
public:
    CustomTokenizer(Acceptor accept) : accept_(accept) {}
    Token<Iterator> consume(Iterator& i) {
        return ::analysis::tiny::consume(i, accept_);
    }
private:
    Acceptor accept_;
};

/**
 * N-gram tokenizer. Tokenizes n-grams from any character sequence accepted
 * by the acceptor. This class maintains internal state. It consumes either
 * fully sized n-grams or an entire word, if the word is smaller than the
 * defined n-gram size.
 */
template <class Iterator>
class NGramTokenizer : public Tokenizer<Iterator> {
public:
    NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
    NGramTokenizer(int size) : size_(size), accept_(&iswalpha), continue_(false) {}
    void reset() { continue_ = false; }
    Token<Iterator> consume(Iterator& i) {
        if ( *i ) {
            Iterator end = i;
            int l = 0;
            while (l < size_ && *end && accept_( *end )) { l++; ++end; }
            if (l == size_ || (!continue_ && l)) {
                // properly sized token or whole word
                Token<Iterator> t(i, l);
                continue_ = true;
                ++i;
                return t;
            }
        }
        continue_ = false;
        return Token<Iterator>(i, 0);
    }
private:
    int size_;
    Acceptor accept_;
    bool continue_;
};
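
/*
 * For example, with n-gram size 2 over "abcd", successive consume()
 * calls yield "ab", "bc", "cd" and then an empty token; when the
 * internal state is fresh (or after reset()), a word shorter than the
 * n-gram size is returned whole (a sketch):
 *
 *   NGramTokenizer<const wchar_t*> bigrams(2);
 *   const wchar_t* i = L"abcd";
 *   Token<const wchar_t*> t = bigrams.consume(i);   // "ab"
 *   t = bigrams.consume(i);                         // "bc"
 */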

/**
 * A tokenizer that ALWAYS returns a token, unless EOS is
 * reached. If the tokenizer given to this tokenizer fails,
 * the relaxed tokenizer just moves one position further and
 * tries again.
 */
template <typename I>
class RelaxedTokenizer : public Tokenizer<I> {
public:
    /** Uses the given tokenizer to extract tokens. */
    RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
    void reset() { t_.reset(); }
    /**
     * Always returns a token. If tokenization fails,
     * moves forward a character and tries again.
     */
    Token<I> consume(I& i) {
        Token<I> t;
        while (*i && !t) {
            t = t_.consume(i);
            if (!t) {
                ++i; t_.reset();
            }
        }
        return t;
    }
private:
    Tokenizer<I>& t_;
};
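
/*
 * For example, wrapping a word tokenizer so that characters between
 * words are skipped rather than ending tokenization (a sketch):
 *
 *   CustomTokenizer<const wchar_t*> words(&iswalpha);
 *   RelaxedTokenizer<const wchar_t*> relaxed(words);
 *   const wchar_t* i = L"--foo";
 *   Token<const wchar_t*> t = relaxed.consume(i);   // "foo": dashes skipped
 */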

/**
 * Tries to tokenize with the first tokenizer; if that
 * fails, the second tokenizer is tried. If the first tokenizer
 * fails, it is reset.
 */
template <typename I>
class PairTokenizer : public Tokenizer<I> {
public:
    PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
    void reset() {
        t1_.reset();
        t2_.reset();
    }
    /**
     * Attempts to tokenize with the first tokenizer, then
     * with the second. If both tokenizers fail, an empty
     * token is returned.
     */
    Token<I> consume(I& i) {
        Token<I> t( t1_.consume( i ) );
        if ( !t ) {
            t1_.reset();
            t = t2_.consume( i );
        }
        return t;
    }
private:
    Tokenizer<I>& t1_;
    Tokenizer<I>& t2_;
};
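
/*
 * For example, combining an n-gram tokenizer with a word tokenizer
 * (a sketch; a real CJK setup would give NGramTokenizer a CJK-only
 * acceptor, which this header does not provide):
 *
 *   NGramTokenizer<const wchar_t*>  ngrams(2, &iswalpha);
 *   CustomTokenizer<const wchar_t*> words(&iswalpha);
 *   PairTokenizer<const wchar_t*>   both(ngrams, words);
 *   // both.consume(i) tries 'ngrams' first, resets it on failure,
 *   // and falls back to 'words'.
 */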

}

}

#endif /* TINYANALYSIS_H_ */