|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 #ifndef UNICODEINFO_H_ |
|
18 #define UNICODEINFO_H_ |
|
19 |
|
20 #include "tinyiterator.h" |
|
21 |
|
22 namespace analysis { |
|
23 |
|
24 /** |
|
25 * The package contains various unicode related functionality as |
|
26 * needed by the remaining analysis package |
|
27 */ |
|
28 namespace unicode { |
|
29 |
|
/**
 * Tells whether character c is either a Hangul Jamo or a
 * Hangul Syllable. Returns true if it is.
 */
int IsHangul(int c);

/** Tells whether character c is a Hangul Syllable. Returns true if it is. */
int IsHangulSyllable(int c);

/** Tells whether character c is a Hangul Jamo. Returns true if it is. */
int IsHangulJamo(int c);

/**
 * Tells whether character c belongs to one of the Chinese,
 * Japanese or Korean writing systems. Returns true if it does.
 */
int IsCjk(int c);

/**
 * Tells whether character c lies in the Thai Unicode block.
 * Returns true if it does.
 */
int IsThai(int c);
|
49 |
|
namespace hangul {

    /** Code point of the first precomposed Hangul syllable */
    static const int SyllableBase = 0xAC00;

    // Conjoining Jamo alphabet

    /** Code point of the first leading Jamo consonant */
    static const int LeadingBase = 0x1100;

    /** Code point of the first Jamo vowel */
    static const int VowelBase = 0x1161;

    /**
     * Base for trailing Jamo consonants. Trailing offset zero means
     * "no trailing consonant", so real trailing consonants start at
     * TrailingBase + 1.
     */
    static const int TrailingBase = 0x11A7;

    /** Number of leading consonants */
    static const int LeadingCount = 19;

    /** Number of vowels */
    static const int VowelCount = 21;

    /**
     * Number of trailing positions; this includes the
     * "no trailing consonant" case at offset zero
     */
    static const int TrailingCount = 28;

    /**
     * Number of precomposed syllables that share one leading
     * consonant, i.e. every vowel combined with every trailing
     * position
     */
    static const int LvSyllableCount = VowelCount * TrailingCount; // 588

    /**
     * Total number of precomposed Hangul syllables: one group of
     * LvSyllableCount syllables for each leading consonant
     */
    static const int SyllableCount = LeadingCount * LvSyllableCount; // 11172
}
|
86 |
|
87 /** |
|
88 * Decomposes hangul syllable into jamu alphabets |
|
89 */ |
|
90 template<typename Output> |
|
91 void DecomposeHangul(Output out, int c) { |
|
92 using namespace hangul; |
|
93 int sindex = c - SyllableBase; |
|
94 if (sindex < 0 || sindex >= SyllableCount) { |
|
95 out<<c<<'\0'; |
|
96 } else { |
|
97 // Leading |
|
98 out<<(LeadingBase + sindex / LvSyllableCount); |
|
99 // Vocal |
|
100 out<<(VowelBase + (sindex % LvSyllableCount) / TrailingCount); |
|
101 // Trailing (voluntary) |
|
102 int toffset = sindex % TrailingCount; |
|
103 if (toffset) out<<(TrailingBase + toffset); |
|
104 // Finish |
|
105 out<<'\0'; |
|
106 } |
|
107 } |
|
108 |
|
109 /** |
|
110 * Composes encountered jamu alphabets into hangul syllable. |
|
111 * Moves given iterator over the consumed unicode character. |
|
112 */ |
|
113 template<typename Iterator> |
|
114 int ConsumeComposedJamu(Iterator& i) { |
|
115 using namespace hangul; |
|
116 int c = *i; ++i; |
|
117 int lindex = c - LeadingBase; |
|
118 if (0 <= lindex && lindex < LeadingCount) { |
|
119 int vindex = *i - VowelBase; |
|
120 if (0 <= vindex && vindex < VowelCount) { |
|
121 ++i; |
|
122 int tindex = *i - TrailingBase; |
|
123 c = (SyllableBase + (lindex * VowelCount + vindex) * TrailingCount); |
|
124 if (0 <= tindex && tindex < TrailingCount) { |
|
125 ++i; |
|
126 c += tindex; |
|
127 } |
|
128 } |
|
129 } |
|
130 return c; |
|
131 } |
|
132 |
|
/**
 * Composes the Jamo found at i into a Hangul syllable without
 * advancing the caller's iterator: i is taken by value, so only the
 * local copy is moved by ConsumeComposedJamu.
 *
 * @param i iterator at the character to read
 * @return whatever ConsumeComposedJamu returns for that position
 */
template<typename Iterator>
inline int ComposeJamu(Iterator i) {
    const int composed = ConsumeComposedJamu(i);
    return composed;
}
|
140 } |
|
141 |
|
142 namespace tiny { // tiny analysis |
|
143 |
|
144 using namespace analysis::unicode::hangul; |
|
145 using namespace analysis::unicode; |
|
146 |
|
/**
 * Iterator adapter that hands out the characters of a wrapped
 * iterator, with Hangul Jamo sequences composed into Hangul
 * syllables by ConsumeComposedJamu.
 *
 * Dereferencing yields the current (possibly composed) character.
 * Converting to int yields the value the wrapped iterator converted
 * to just before the current character was read - presumably its
 * offset in the source text.
 */
template <typename Iterator>
struct HangulIterator {
public:
    /** Creates an iterator with value-initialized state */
    HangulIterator() : source_(), current_(), start_(0) {}

    /** Wraps i and immediately reads the first character */
    HangulIterator(Iterator i) : source_(i) {
        operator++(); // fills current_ and start_
    }

    /** Current character */
    int operator*() {
        return current_;
    }

    /** Position recorded when the current character was read */
    operator int() {
        return start_;
    }

    /** Records the position, then reads and composes the next character */
    HangulIterator& operator++() {
        start_ = source_;
        current_ = ConsumeComposedJamu(source_);
        return *this;
    }

private:
    Iterator source_; // wrapped iterator, already past the current character
    int current_;     // character returned by operator*
    int start_;       // int value of source_ before the current character was read
};
|
174 |
|
175 /** |
|
176 * Decomposes encountered Hangul syllables into |
|
177 * Hangul Jamu characters |
|
178 */ |
|
179 template <typename Iterator> |
|
180 struct JamuIterator { |
|
181 public: |
|
182 JamuIterator() : i_(), b_(0), offset_(0) { buf_[0];} |
|
183 JamuIterator(Iterator i) : i_(i), b_(0) { |
|
184 buf_[1] = '\0'; |
|
185 ++(*this); // populate buffer |
|
186 } |
|
187 int operator*() { |
|
188 return buf_[b_]; |
|
189 } |
|
190 JamuIterator& operator++() { |
|
191 offset_ = i_; |
|
192 if (!buf_[++b_]) { |
|
193 b_ = 0; // reset buf |
|
194 tiny::IteratorOutput<int*> out(buf_); |
|
195 DecomposeHangul(out, *i_); ++i_; |
|
196 } |
|
197 return *this; |
|
198 } |
|
199 operator int() { |
|
200 return offset_; |
|
201 } |
|
202 private: |
|
203 Iterator i_; |
|
204 int buf_[4]; |
|
205 int b_; |
|
206 int offset_; |
|
207 |
|
208 }; |
|
209 } |
|
210 } |
|
211 |
|
212 |
|
213 #endif /* UNICODEINFO_H_ */ |