|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 #ifndef NGRAM_H_ |
|
18 #define NGRAM_H_ |
|
19 |
|
20 #include "tinyanalysis.h" |
|
21 #include "tinyunicode.h" |
|
22 #include "clutil.h" |
|
23 |
|
24 namespace analysis { |
|
25 |
|
/**
 * Character predicate for letters that do not belong to the CJK scripts.
 *
 * The result is an int used as a truth value rather than a bool.
 *
 * @param c the character to classify
 * @return a true (non-zero) value if c is a non-cjk letter
 */
int IsNonCjk(int c);
|
30 |
|
31 /** |
|
32 * TinyCjkTokenizer. Contains tiny analysis classes, that are |
|
33 * used to turn Chinese, Korean and Japanese into 1-grams, while |
|
34 * using letter analyzer for other kinds of text (western, cyrillic, |
|
35 * etc.) |
|
36 * |
|
37 * @tparam I the iterator, that is used to read characters |
|
38 */ |
|
39 template<typename I> |
|
40 struct TinyCjkTokenizer { |
|
41 |
|
42 /** Deals with cjk */ |
|
43 tiny::NGramTokenizer<I> cjk_; |
|
44 /** Letter tokenizer for space separated language */ |
|
45 tiny::CustomTokenizer<I> noncjk_; |
|
46 /** Combines cjk with noncjk */ |
|
47 tiny::PairTokenizer<I> pair_; |
|
48 /** Moves forward, if tokenization fails */ |
|
49 tiny::RelaxedTokenizer<I> t_; |
|
50 |
|
51 /** |
|
52 * Constructs the tiny cjk tokenizer with given ngram size |
|
53 * |
|
54 * @param ngramsize cjk text is treated with n-gram analyzer of this size |
|
55 */ |
|
56 TinyCjkTokenizer(int ngramsize) |
|
57 : cjk_(ngramsize, &unicode::IsCjk), |
|
58 noncjk_(&IsNonCjk), |
|
59 pair_(cjk_, noncjk_), |
|
60 t_(pair_) {} |
|
61 |
|
62 /** |
|
63 * Consumes a token from given iterator. Returns n-grams |
|
64 * for cjk text, letter tokenized words for non-cjk text. |
|
65 * Always returns something unless EOS has been reached. |
|
66 */ |
|
67 inline tiny::Token<I> consume(I& i) { |
|
68 return t_.consume(i); |
|
69 } |
|
70 }; |
|
71 |
|
72 |
|
73 /** |
|
74 * Constructs n-grams of Chinese, Korean and Japanese text. Uses |
|
75 * letter tokenization for other kinds of texts. |
|
76 */ |
|
77 class CjkNGramTokenizer : public lucene::analysis::Tokenizer { |
|
78 |
|
79 public: |
|
80 |
|
81 /** Reads from buffer */ |
|
82 typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
|
83 |
|
84 /** Turns utf16 to unicode */ |
|
85 typedef tiny::Utf16Iterator<buffer_iterator> iterator; |
|
86 |
|
87 public: |
|
88 |
|
89 CjkNGramTokenizer( lucene::util::Reader* reader, int gramSize ); |
|
90 |
|
91 virtual bool next( lucene::analysis::Token* token ); |
|
92 |
|
93 private: |
|
94 |
|
95 /** The tokenizer */ |
|
96 TinyCjkTokenizer<iterator> t_; |
|
97 |
|
98 /** Buffer */ |
|
99 tiny::cl::ReaderBuffer<512> in_; |
|
100 |
|
101 /** Reads utf16 from buffer and transforms it to unicode*/ |
|
102 iterator i_; |
|
103 |
|
104 }; |
|
105 |
|
106 /** |
|
107 * The great difference of this class compared to CJK ngram, |
|
108 * that it decomposes Hangul syllables into Hangul Jamu letters. |
|
109 * |
|
110 * This analyzer appeared to have bad performance in testing. |
|
111 */ |
|
112 class JamuNGramTokenizer : public lucene::analysis::Tokenizer { |
|
113 |
|
114 public: |
|
115 |
|
116 typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
|
117 |
|
118 typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator; |
|
119 |
|
120 typedef tiny::JamuIterator<utf16_iterator> iterator; |
|
121 |
|
122 public: |
|
123 |
|
124 JamuNGramTokenizer( lucene::util::Reader* reader, int gramSize ); |
|
125 |
|
126 virtual bool next( lucene::analysis::Token* token ); |
|
127 |
|
128 private: |
|
129 |
|
130 TinyCjkTokenizer<iterator> t_; |
|
131 |
|
132 tiny::cl::ReaderBuffer<512> in_; |
|
133 |
|
134 iterator i_; |
|
135 |
|
136 }; |
|
137 |
|
138 // Analyzers using the tokenizers |
|
139 // * Provided mainly for testing |
|
140 // |
|
141 |
|
142 /** CjkNGramTokenizer plus lowercase filter */ |
|
143 typedef TemplateAnalyzer1A1F<CjkNGramTokenizer, int, lucene::analysis::LowerCaseFilter> |
|
144 CjkNGramAnalyzer; |
|
145 |
|
146 /** JamuNGramTokenizer plus lowercase filter */ |
|
147 typedef TemplateAnalyzer1A1F<JamuNGramTokenizer, int, lucene::analysis::LowerCaseFilter> |
|
148 JamuNGramAnalyzer; |
|
149 |
|
150 } |
|
151 |
|
152 |
|
153 #endif /* NGRAM_H_ */ |