|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #ifndef KOREANANALYZER_H_ |
|
19 #define KOREANANALYZER_H_ |
|
20 |
|
21 #include "Clucene.h" |
|
22 |
|
23 #include "ngram.h" |
|
24 |
|
25 #include "tinyanalysis.h" |
|
26 #include "tinyutf16.h" |
|
27 #include "tinyunicode.h" |
|
28 |
|
29 namespace analysis |
|
30 { |
|
31 // Forward declarations |
|
32 |
|
33 /** |
|
34 * Special Korean analyzer that is designed so, that Cpix can |
|
35 * update the result list, when each individual Jamu character is |
|
36 * entered. |
|
37 * |
|
38 * The analyzer tries to first convert given character stream into |
|
39 * a form, where all Jamu characters are composed into Hangul form. |
|
40 * This means, that character sequences of form LV and LVT are |
|
41 * eliminated and replaced with hangul syllables (L is leading Jamu |
|
42 * consonant, V is for vocal and T is for trailing consonant). |
|
43 * |
|
44 * The idea behind the analyzer is that it produces up to 3 alternative |
|
45 * tokens for each hangul syllabic. All of these alternatives are returned |
|
46 * to be located at the same position. Let's have some Hangul syllabic H1 |
|
47 * consisting of Jamu characters so that H1=J1J2J3. If H2=J1J2, then first |
|
48 * returned token is H1, second token is H2 and third token is J1. This |
|
49 * means, that when user enters H1, H2 or J1, the term H1 will be found. |
|
50 * Also, if user enters J1J2J3 or J1J2, term will be found, because |
|
51 * J1J2J3 is automatically turned to H1 and J1J2 is turned to H2. |
|
52 * |
|
53 * NOTE: This analyzer MUST NOT be used, when searching, because |
|
54 * CLuceneQueryParser will break, when it faces tokens with zero |
|
55 * increment. Use KoreanQueryAnalyzer for searching material indexed |
|
56 * with this analyzer. |
|
57 */ |
|
58 class KoreanTokenizer : public lucene::analysis::Tokenizer { |
|
59 |
|
60 public: |
|
61 |
|
62 /** Used to read from buffer */ |
|
63 typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
|
64 |
|
65 /** Turns utf16 code points into unicode */ |
|
66 typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator; |
|
67 |
|
68 /** Turns Jamu alphabets into Hangul syllables */ |
|
69 typedef tiny::HangulIterator<utf16_iterator> iterator; |
|
70 |
|
71 KoreanTokenizer(lucene::util::Reader* reader); |
|
72 |
|
73 virtual bool next(lucene::analysis::Token* token); |
|
74 |
|
75 private: |
|
76 |
|
77 /** Jamu form of last consumed hangul syllable */ |
|
78 wchar_t jamu_[4]; |
|
79 |
|
80 /** offsets of last consumed hangul syllable */ |
|
81 int begin_, end_; |
|
82 |
|
83 /** |
|
84 * The amount of jamu characters left in buffer. |
|
85 * If this is non-zero, hangul syllable is being processed. |
|
86 */ |
|
87 int state_; |
|
88 |
|
89 /** |
|
90 * Tiny CJK tokenizer is used to construct 1-grams out of |
|
91 * chinese and japanese characters and to turn latin script |
|
92 * into terms. |
|
93 */ |
|
94 TinyCjkTokenizer<iterator> t_; |
|
95 |
|
96 /** 512 byte buffer for storing characters read with reader */ |
|
97 tiny::cl::ReaderBuffer<512> in_; |
|
98 |
|
99 /** |
|
100 * Reads utf16 from in_ buffer, turns it into unicode and |
|
101 * then composes jamu alphabets into hangul syllables. |
|
102 */ |
|
103 iterator i_; |
|
104 |
|
105 |
|
106 }; |
|
107 |
|
108 /** Korean tokenizer plus lowercase filter */ |
|
109 typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> |
|
110 KoreanAnalyzer; |
|
111 |
|
112 /** |
|
113 * Turns Jamu characters into Hangul syllables and generates 1-grams for |
|
114 * all Chinese, Korean and Japanese text. |
|
115 */ |
|
116 class KoreanQueryTokenizer : public lucene::analysis::Tokenizer { |
|
117 |
|
118 public: |
|
119 |
|
120 /** Used to read from buffer */ |
|
121 typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
|
122 |
|
123 /** Turns utf16 code points into unicode */ |
|
124 typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator; |
|
125 |
|
126 /** Turns Hangul syllables into Jamu alphabets */ |
|
127 typedef tiny::HangulIterator<utf16_iterator> iterator; |
|
128 |
|
129 public: |
|
130 |
|
131 KoreanQueryTokenizer( lucene::util::Reader* reader ); |
|
132 |
|
133 virtual bool next( lucene::analysis::Token* token ); |
|
134 |
|
135 private: |
|
136 |
|
137 /** Buffer for storing characters read with reader */ |
|
138 TinyCjkTokenizer<iterator> t_; |
|
139 |
|
140 /** Buffer for storing characters read with reader */ |
|
141 tiny::cl::ReaderBuffer<512> in_; |
|
142 |
|
143 /** |
|
144 * Reads utf16 from in_ buffer, turns it into unicode and |
|
145 * then composes jamu alphabets into hangul syllables. |
|
146 */ |
|
147 iterator i_; |
|
148 |
|
149 }; |
|
150 |
|
151 /** Korean query analyzer plus lowercase filter */ |
|
152 typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> |
|
153 KoreanQueryAnalyzer; |
|
154 |
|
155 } |
|
156 |
|
157 #endif /* KOREANANALYZER_H_ */ |