|
1 /* |
|
2 * |
|
3 * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved |
|
4 * |
|
5 * This file is a modification of the ICU file IndicReordering.cpp |
|
6 * by Jens Herden and Javier Sola for Khmer language |
|
7 * |
|
8 */ |
|
9 |
|
10 #include "LETypes.h" |
|
11 #include "KhmerReordering.h" |
|
12 #include "LEGlyphStorage.h" |
|
13 |
|
14 |
|
15 U_NAMESPACE_BEGIN |
|
16 |
|
// Characters that get referred to by name...
|
// Code points that the Khmer reordering code mentions explicitly.
// Listed in ascending code point order.
enum
{
    C_RO            = 0x179A, // Khmer letter RO
    C_VOWEL_AA      = 0x17B6, // Khmer vowel sign AA
    C_VOWEL_E       = 0x17C1, // Khmer vowel sign E
    C_SIGN_NIKAHIT  = 0x17C6, // Khmer sign NIKAHIT
    C_COENG         = 0x17D2, // Khmer sign COENG
    C_SIGN_ZWNJ     = 0x200C, // zero width non-joiner
    C_SIGN_ZWJ      = 0x200D, // zero width joiner
    C_DOTTED_CIRCLE = 0x25CC  // dotted circle
};
|
29 |
|
30 |
|
31 enum |
|
32 { |
|
33 // simple classes, they are used in the statetable (in this file) to control the length of a syllable |
|
34 // they are also used to know where a character should be placed (location in reference to the base character) |
|
35 // and also to know if a character, when independtly displayed, should be displayed with a dotted-circle to |
|
36 // indicate error in syllable construction |
|
37 _xx = KhmerClassTable::CC_RESERVED, |
|
38 _sa = KhmerClassTable::CC_SIGN_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_POS_ABOVE, |
|
39 _sp = KhmerClassTable::CC_SIGN_AFTER | KhmerClassTable::CF_DOTTED_CIRCLE| KhmerClassTable::CF_POS_AFTER, |
|
40 _c1 = KhmerClassTable::CC_CONSONANT | KhmerClassTable::CF_CONSONANT, |
|
41 _c2 = KhmerClassTable::CC_CONSONANT2 | KhmerClassTable::CF_CONSONANT, |
|
42 _c3 = KhmerClassTable::CC_CONSONANT3 | KhmerClassTable::CF_CONSONANT, |
|
43 _rb = KhmerClassTable::CC_ROBAT | KhmerClassTable::CF_POS_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE, |
|
44 _cs = KhmerClassTable::CC_CONSONANT_SHIFTER | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_SHIFTER, |
|
45 _dl = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_BEFORE | KhmerClassTable::CF_DOTTED_CIRCLE, |
|
46 _db = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_BELOW | KhmerClassTable::CF_DOTTED_CIRCLE, |
|
47 _da = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_ABOVE | KhmerClassTable::CF_DOTTED_CIRCLE | KhmerClassTable::CF_ABOVE_VOWEL, |
|
48 _dr = KhmerClassTable::CC_DEPENDENT_VOWEL | KhmerClassTable::CF_POS_AFTER | KhmerClassTable::CF_DOTTED_CIRCLE, |
|
49 _co = KhmerClassTable::CC_COENG | KhmerClassTable::CF_COENG | KhmerClassTable::CF_DOTTED_CIRCLE, |
|
50 |
|
51 // split vowel |
|
52 _va = _da | KhmerClassTable::CF_SPLIT_VOWEL, |
|
53 _vr = _dr | KhmerClassTable::CF_SPLIT_VOWEL |
|
54 }; |
|
55 |
|
56 |
|
57 // Character class tables |
|
// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
|
59 // _sa Sign placed above the base |
|
60 // _sp Sign placed after the base |
|
61 // _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants) |
|
62 // _c2 Consonant of type 2 (only RO) |
|
63 // _c3 Consonant of type 3 |
|
64 // _rb Khmer sign robat u17CC. combining mark for subscript consonants |
|
// _cs Consonant-shifter
|
66 // _dl Dependent vowel placed before the base (left of the base) |
|
67 // _db Dependent vowel placed below the base |
|
68 // _da Dependent vowel placed above the base |
|
69 // _dr Dependent vowel placed behind the base (right of the base) |
|
70 // _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following |
|
71 // it to create a subscript consonant or independent vowel |
|
// _va Khmer split vowel in which the first part is before the base and the second one above the base
// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
|
74 |
|
75 static const KhmerClassTable::CharClass khmerCharClasses[] = |
|
76 { |
|
77 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, // 1780 - 178F |
|
78 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, // 1790 - 179F |
|
79 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, // 17A0 - 17AF |
|
80 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, // 17B0 - 17BF |
|
81 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, // 17C0 - 17CF |
|
82 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx, // 17D0 - 17DF |
|
83 }; |
|
84 |
|
85 |
|
86 // |
|
87 // Khmer Class Tables |
|
88 // |
|
89 |
|
90 // |
|
// The range of characters defined in the above table is defined here. For Khmer 1780 to 17DF
|
92 // Even if the Khmer range is bigger, all other characters are not combinable, and therefore treated |
|
93 // as _xx |
|
94 static const KhmerClassTable khmerClassTable = {0x1780, 0x17df, khmerCharClasses}; |
|
95 |
|
96 |
|
97 // Below we define how a character in the input string is either in the khmerCharClasses table |
|
98 // (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear |
|
99 // within the syllable, but are not in the table) we also get their type back, or an unknown object |
|
100 // in which case we get _xx (CC_RESERVED) back |
|
101 KhmerClassTable::CharClass KhmerClassTable::getCharClass(LEUnicode ch) const |
|
102 { |
|
103 |
|
104 if (ch == C_SIGN_ZWJ) { |
|
105 return CC_ZERO_WIDTH_J_MARK; |
|
106 } |
|
107 |
|
108 if (ch == C_SIGN_ZWNJ) { |
|
109 return CC_ZERO_WIDTH_NJ_MARK; |
|
110 } |
|
111 |
|
112 if (ch < firstChar || ch > lastChar) { |
|
113 return CC_RESERVED; |
|
114 } |
|
115 |
|
116 return classTable[ch - firstChar]; |
|
117 } |
|
118 |
|
119 const KhmerClassTable *KhmerClassTable::getKhmerClassTable() |
|
120 { |
|
121 return &khmerClassTable; |
|
122 } |
|
123 |
|
124 |
|
125 |
|
126 class ReorderingOutput : public UMemory { |
|
127 private: |
|
128 le_int32 fOutIndex; |
|
129 LEUnicode *fOutChars; |
|
130 |
|
131 LEGlyphStorage &fGlyphStorage; |
|
132 |
|
133 |
|
134 public: |
|
135 ReorderingOutput(LEUnicode *outChars, LEGlyphStorage &glyphStorage) |
|
136 : fOutIndex(0), fOutChars(outChars), fGlyphStorage(glyphStorage) |
|
137 { |
|
138 // nothing else to do... |
|
139 } |
|
140 |
|
141 ~ReorderingOutput() |
|
142 { |
|
143 // nothing to do here... |
|
144 } |
|
145 |
|
146 void writeChar(LEUnicode ch, le_uint32 charIndex, const LETag *charTags) |
|
147 { |
|
148 LEErrorCode success = LE_NO_ERROR; |
|
149 |
|
150 fOutChars[fOutIndex] = ch; |
|
151 |
|
152 fGlyphStorage.setCharIndex(fOutIndex, charIndex, success); |
|
153 fGlyphStorage.setAuxData(fOutIndex, (void *) charTags, success); |
|
154 |
|
155 fOutIndex += 1; |
|
156 } |
|
157 |
|
158 le_int32 getOutputIndex() |
|
159 { |
|
160 return fOutIndex; |
|
161 } |
|
162 }; |
|
163 |
|
164 |
|
165 static const LETag emptyTag = 0x00000000; // '' |
|
166 //TODO remove unused flags |
|
167 //static const LETag nuktFeatureTag = LE_NUKT_FEATURE_TAG; |
|
168 //static const LETag akhnFeatureTag = LE_AKHN_FEATURE_TAG; |
|
169 //static const LETag rphfFeatureTag = LE_RPHF_FEATURE_TAG; |
|
170 static const LETag blwfFeatureTag = LE_BLWF_FEATURE_TAG; |
|
171 //static const LETag halfFeatureTag = LE_HALF_FEATURE_TAG; |
|
172 static const LETag pstfFeatureTag = LE_PSTF_FEATURE_TAG; |
|
173 //static const LETag vatuFeatureTag = LE_VATU_FEATURE_TAG; |
|
174 static const LETag presFeatureTag = LE_PRES_FEATURE_TAG; |
|
175 static const LETag blwsFeatureTag = LE_BLWS_FEATURE_TAG; |
|
176 static const LETag abvsFeatureTag = LE_ABVS_FEATURE_TAG; |
|
177 static const LETag pstsFeatureTag = LE_PSTS_FEATURE_TAG; |
|
178 //static const LETag halnFeatureTag = LE_HALN_FEATURE_TAG; |
|
179 |
|
180 static const LETag blwmFeatureTag = LE_BLWM_FEATURE_TAG; |
|
181 static const LETag abvmFeatureTag = LE_ABVM_FEATURE_TAG; |
|
182 static const LETag distFeatureTag = LE_DIST_FEATURE_TAG; |
|
183 |
|
184 static const LETag prefFeatureTag = LE_PREF_FEATURE_TAG; |
|
185 static const LETag abvfFeatureTag = LE_ABVF_FEATURE_TAG; |
|
186 static const LETag cligFeatureTag = LE_CLIG_FEATURE_TAG; |
|
187 static const LETag mkmkFeatureTag = LE_MKMK_FEATURE_TAG; |
|
188 |
|
189 // These are in the order in which the features need to be applied |
|
190 // for correct processing |
|
191 static const LETag featureOrder[] = |
|
192 { |
|
193 // Shaping features |
|
194 prefFeatureTag, blwfFeatureTag, abvfFeatureTag, pstfFeatureTag, |
|
195 presFeatureTag, blwsFeatureTag, abvsFeatureTag, pstsFeatureTag, |
|
196 cligFeatureTag, |
|
197 |
|
198 // Positioning features |
|
199 distFeatureTag, blwmFeatureTag, abvmFeatureTag, mkmkFeatureTag, |
|
200 emptyTag |
|
201 }; |
|
202 |
|
203 static const LETag tagPref[] = |
|
204 { |
|
205 prefFeatureTag, presFeatureTag, |
|
206 cligFeatureTag, |
|
207 |
|
208 // Positioning features |
|
209 distFeatureTag, |
|
210 emptyTag |
|
211 }; |
|
212 |
|
213 static const LETag tagAbvf[] = |
|
214 { |
|
215 abvfFeatureTag, abvsFeatureTag, |
|
216 cligFeatureTag, |
|
217 |
|
218 // Positioning features |
|
219 distFeatureTag, abvmFeatureTag, mkmkFeatureTag, |
|
220 emptyTag |
|
221 }; |
|
222 |
|
223 static const LETag tagPstf[] = |
|
224 { |
|
225 blwfFeatureTag, blwsFeatureTag, |
|
226 prefFeatureTag, presFeatureTag, |
|
227 |
|
228 pstfFeatureTag, pstsFeatureTag, |
|
229 cligFeatureTag, |
|
230 |
|
231 // Positioning features |
|
232 distFeatureTag, blwmFeatureTag, |
|
233 emptyTag |
|
234 }; |
|
235 |
|
236 static const LETag tagBlwf[] = |
|
237 { |
|
238 blwfFeatureTag, blwsFeatureTag, |
|
239 cligFeatureTag, |
|
240 |
|
241 // Positioning features |
|
242 distFeatureTag, blwmFeatureTag, mkmkFeatureTag, |
|
243 emptyTag |
|
244 }; |
|
245 |
|
246 |
|
247 // TODO do we need all of them? |
|
248 static const LETag tagDefault[] = |
|
249 { |
|
250 // Shaping feature |
|
251 prefFeatureTag, blwfFeatureTag, /*abvfFeatureTag,*/ /*pstfFeatureTag, */ |
|
252 presFeatureTag, blwsFeatureTag, /*abvsFeatureTag,*/ /*pstsFeatureTag,*/ |
|
253 cligFeatureTag, |
|
254 |
|
255 // Positioning features |
|
256 distFeatureTag, abvmFeatureTag, blwmFeatureTag, mkmkFeatureTag, |
|
257 emptyTag |
|
258 }; |
|
259 |
|
260 |
|
261 |
|
262 // The stateTable is used to calculate the end (the length) of a well |
|
263 // formed Khmer Syllable. |
|
264 // |
|
265 // Each horizontal line is ordered exactly the same way as the values in KhmerClassTable |
|
266 // CharClassValues in KhmerReordering.h This coincidence of values allows the |
|
267 // follow up of the table. |
|
268 // |
|
269 // Each line corresponds to a state, which does not necessarily need to be a type |
|
// of component... for example, state 2 is a base, which is always the first character
// in the syllable, but the state could be produced by a consonant of any type when
// it is the first character that is analysed (in ground state).
|
273 // |
|
274 // Differentiating 3 types of consonants is necessary in order to |
|
275 // forbid the use of certain combinations, such as having a second |
|
276 // coeng after a coeng RO, |
|
277 // The inexistent possibility of having a type 3 after another type 3 is permitted, |
|
278 // eliminating it would very much complicate the table, and it does not create typing |
|
279 // problems, as the case above. |
|
280 // |
|
281 // The table is quite complex, in order to limit the number of coeng consonants |
|
282 // to 2 (by means of the table). |
|
283 // |
|
// There is a peculiarity, as far as Unicode is concerned:
|
285 // - The consonant-shifter is considered in two possible different |
|
286 // locations, the one considered in Unicode 3.0 and the one considered in |
|
287 // Unicode 4.0. (there is a backwards compatibility problem in this standard). |
|
288 |
|
289 |
|
290 // xx independent character, such as a number, punctuation sign or non-khmer char |
|
291 // |
|
292 // c1 Khmer consonant of type 1 or an independent vowel |
|
// that is, a letter in which the subscript form is only under the
|
294 // base, not taking any space to the right or to the left |
|
295 // |
|
296 // c2 Khmer consonant of type 2, the coeng form takes space under |
|
297 // and to the left of the base (only RO is of this type) |
|
298 // |
|
299 // c3 Khmer consonant of type 3. Its subscript form takes space under |
|
300 // and to the right of the base. |
|
301 // |
|
302 // cs Khmer consonant shifter |
|
303 // |
|
304 // rb Khmer robat |
|
305 // |
|
306 // co coeng character (u17D2) |
|
307 // |
|
308 // dv dependent vowel (including split vowels, they are treated in the same way). |
|
309 // even if dv is not defined above, the component that is really tested for is |
|
310 // KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels |
|
311 // |
|
312 // zwj Zero Width joiner |
|
313 // |
|
314 // zwnj Zero width non joiner |
|
315 // |
|
316 // sa above sign |
|
317 // |
|
318 // sp post sign |
|
319 // |
|
320 // there are lines with equal content but for an easier understanding |
|
321 // (and maybe change in the future) we did not join them |
|
322 // |
|
323 static const le_int8 khmerStateTable[][KhmerClassTable::CC_COUNT] = |
|
324 { |
|
325 |
|
326 // xx c1 c2 c3 zwnj cs rb co dv sa sp zwj |
|
327 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, // 0 - ground state |
|
328 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // 1 - exit state (or sign to the right of the syllable) |
|
329 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, // 2 - Base consonant |
|
330 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, // 3 - First ZWNJ before a register shifter |
|
331 // It can only be followed by a shifter or a vowel |
|
332 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, // 4 - First register shifter |
|
333 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, // 5 - Robat |
|
334 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, // 6 - First Coeng |
|
335 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 7 - First consonant of type 1 after coeng |
|
336 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, // 8 - First consonant of type 2 after coeng |
|
337 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, // 9 - First consonant or type 3 after ceong |
|
338 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, // 10 - Second Coeng (no register shifter before) |
|
339 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 11 - Second coeng consonant (or ind. vowel) no register shifter before |
|
340 {-1, -1, 1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, // 12 - Second ZWNJ before a register shifter |
|
341 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, // 13 - Second register shifter |
|
342 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 14 - ZWJ before vowel |
|
343 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, // 15 - ZWNJ before vowel |
|
344 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, // 16 - dependent vowel |
|
345 {-1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, // 17 - sign above |
|
346 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, // 18 - ZWJ after vowel |
|
347 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, // 19 - Third coeng |
|
348 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, // 20 - dependent vowel after a Robat |
|
349 |
|
350 }; |
|
351 |
|
352 |
|
353 const LETag *KhmerReordering::getFeatureOrder() |
|
354 { |
|
355 return featureOrder; |
|
356 } |
|
357 |
|
358 |
|
359 // Given an input string of characters and a location in which to start looking |
|
360 // calculate, using the state table, which one is the last character of the syllable |
|
361 // that starts in the starting position. |
|
362 le_int32 KhmerReordering::findSyllable(const KhmerClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount) |
|
363 { |
|
364 le_int32 cursor = prev; |
|
365 le_int8 state = 0; |
|
366 |
|
367 while (cursor < charCount) { |
|
368 KhmerClassTable::CharClass charClass = (classTable->getCharClass(chars[cursor]) & KhmerClassTable::CF_CLASS_MASK); |
|
369 |
|
370 state = khmerStateTable[state][charClass]; |
|
371 |
|
372 if (state < 0) { |
|
373 break; |
|
374 } |
|
375 |
|
376 cursor += 1; |
|
377 } |
|
378 |
|
379 return cursor; |
|
380 } |
|
381 |
|
382 |
|
383 // This is the real reordering function as applied to the Khmer language |
|
384 |
|
385 le_int32 KhmerReordering::reorder(const LEUnicode *chars, le_int32 charCount, le_int32 /*scriptCode*/, |
|
386 LEUnicode *outChars, LEGlyphStorage &glyphStorage) |
|
387 { |
|
388 const KhmerClassTable *classTable = KhmerClassTable::getKhmerClassTable(); |
|
389 |
|
390 ReorderingOutput output(outChars, glyphStorage); |
|
391 KhmerClassTable::CharClass charClass; |
|
392 le_int32 i, prev = 0, coengRo; |
|
393 |
|
394 |
|
395 // This loop only exits when we reach the end of a run, which may contain |
|
396 // several syllables. |
|
397 while (prev < charCount) { |
|
398 le_int32 syllable = findSyllable(classTable, chars, prev, charCount); |
|
399 |
|
400 // write a pre vowel or the pre part of a split vowel first |
|
401 // and look out for coeng + ro. RO is the only vowel of type 2, and |
|
402 // therefore the only one that requires saving space before the base. |
|
403 coengRo = -1; // There is no Coeng Ro, if found this value will change |
|
404 for (i = prev; i < syllable; i += 1) { |
|
405 charClass = classTable->getCharClass(chars[i]); |
|
406 |
|
407 // if a split vowel, write the pre part. In Khmer the pre part |
|
408 // is the same for all split vowels, same glyph as pre vowel C_VOWEL_E |
|
409 if (charClass & KhmerClassTable::CF_SPLIT_VOWEL) { |
|
410 output.writeChar(C_VOWEL_E, i, &tagPref[0]); |
|
411 break; // there can be only one vowel |
|
412 } |
|
413 |
|
414 // if a vowel with pos before write it out |
|
415 if (charClass & KhmerClassTable::CF_POS_BEFORE) { |
|
416 output.writeChar(chars[i], i, &tagPref[0]); |
|
417 break; // there can be only one vowel |
|
418 } |
|
419 |
|
420 // look for coeng + ro and remember position |
|
421 // works because coeng + ro is always in front of a vowel (if there is a vowel) |
|
422 // and because CC_CONSONANT2 is enough to identify it, as it is the only consonant |
|
423 // with this flag |
|
424 if ( (charClass & KhmerClassTable::CF_COENG) && (i + 1 < syllable) && |
|
425 ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) == KhmerClassTable::CC_CONSONANT2) ) |
|
426 { |
|
427 coengRo = i; |
|
428 } |
|
429 } |
|
430 |
|
431 // write coeng + ro if found |
|
432 if (coengRo > -1) { |
|
433 output.writeChar(C_COENG, coengRo, &tagPref[0]); |
|
434 output.writeChar(C_RO, coengRo + 1, &tagPref[0]); |
|
435 } |
|
436 |
|
437 // shall we add a dotted circle? |
|
438 // If in the position in which the base should be (first char in the string) there is |
|
439 // a character that has the Dotted circle flag (a character that cannot be a base) |
|
440 // then write a dotted circle |
|
441 if (classTable->getCharClass(chars[prev]) & KhmerClassTable::CF_DOTTED_CIRCLE) { |
|
442 output.writeChar(C_DOTTED_CIRCLE, prev, &tagDefault[0]); |
|
443 } |
|
444 |
|
445 // copy what is left to the output, skipping before vowels and coeng Ro if they are present |
|
446 for (i = prev; i < syllable; i += 1) { |
|
447 charClass = classTable->getCharClass(chars[i]); |
|
448 |
|
449 // skip a before vowel, it was already processed |
|
450 if (charClass & KhmerClassTable::CF_POS_BEFORE) { |
|
451 continue; |
|
452 } |
|
453 |
|
454 // skip coeng + ro, it was already processed |
|
455 if (i == coengRo) { |
|
456 i += 1; |
|
457 continue; |
|
458 } |
|
459 |
|
460 switch (charClass & KhmerClassTable::CF_POS_MASK) { |
|
461 case KhmerClassTable::CF_POS_ABOVE : |
|
462 output.writeChar(chars[i], i, &tagAbvf[0]); |
|
463 break; |
|
464 |
|
465 case KhmerClassTable::CF_POS_AFTER : |
|
466 output.writeChar(chars[i], i, &tagPstf[0]); |
|
467 break; |
|
468 |
|
469 case KhmerClassTable::CF_POS_BELOW : |
|
470 output.writeChar(chars[i], i, &tagBlwf[0]); |
|
471 break; |
|
472 |
|
473 default: |
|
474 // assign the correct flags to a coeng consonant |
|
475 // Consonants of type 3 are taged as Post forms and those type 1 as below forms |
|
476 if ( (charClass & KhmerClassTable::CF_COENG) && i + 1 < syllable ) { |
|
477 if ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) |
|
478 == KhmerClassTable::CC_CONSONANT3) { |
|
479 output.writeChar(chars[i], i, &tagPstf[0]); |
|
480 i += 1; |
|
481 output.writeChar(chars[i], i, &tagPstf[0]); |
|
482 } |
|
483 else { |
|
484 output.writeChar(chars[i], i, &tagBlwf[0]); |
|
485 i += 1; |
|
486 output.writeChar(chars[i], i, &tagBlwf[0]); |
|
487 } |
|
488 break; |
|
489 } |
|
490 // if a shifter is followed by an above vowel change the shifter to below form, |
|
491 // an above vowel can have two possible positions i + 1 or i + 3 |
|
492 // (position i+1 corresponds to unicode 3, position i+3 to Unicode 4) |
|
493 // and there is an extra rule for C_VOWEL_AA + C_SIGN_NIKAHIT also for two |
|
494 // different positions, right after the shifter or after a vowel (Unicode 4) |
|
495 if ( (charClass & KhmerClassTable::CF_SHIFTER) && (i + 1 < syllable) ) { |
|
496 if ((classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_ABOVE_VOWEL) |
|
497 || (i + 2 < syllable |
|
498 && ( (classTable->getCharClass(chars[i + 1]) & KhmerClassTable::CF_CLASS_MASK) == C_VOWEL_AA) |
|
499 && ( (classTable->getCharClass(chars[i + 2]) & KhmerClassTable::CF_CLASS_MASK) == C_SIGN_NIKAHIT)) |
|
500 || (i + 3 < syllable && (classTable->getCharClass(chars[i + 3]) & KhmerClassTable::CF_ABOVE_VOWEL)) |
|
501 || (i + 4 < syllable |
|
502 && ( (classTable->getCharClass(chars[i + 3]) & KhmerClassTable::CF_CLASS_MASK) == C_VOWEL_AA) |
|
503 && ( (classTable->getCharClass(chars[i + 4]) & KhmerClassTable::CF_CLASS_MASK) == C_SIGN_NIKAHIT) ) ) |
|
504 { |
|
505 output.writeChar(chars[i], i, &tagBlwf[0]); |
|
506 break; |
|
507 } |
|
508 |
|
509 } |
|
510 // default - any other characters |
|
511 output.writeChar(chars[i], i, &tagDefault[0]); |
|
512 break; |
|
513 } // switch |
|
514 } // for |
|
515 |
|
516 prev = syllable; // move the pointer to the start of next syllable |
|
517 } |
|
518 |
|
519 return output.getOutputIndex(); |
|
520 } |
|
521 |
|
522 |
|
523 U_NAMESPACE_END |