|
1 /* |
|
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings |
|
3 * |
|
4 * Written by Hye-Shik Chang <perky@FreeBSD.org> |
|
5 */ |
|
6 |
|
7 #include "cjkcodecs.h" |
|
8 #include "mappings_cn.h" |
|
9 |
|
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a conflict
 * with the identifiers of the hz codec defined below.
 */
|
14 #ifdef _AIX |
|
15 #undef hz |
|
16 #endif |
|
17 |
|
/* GBK and GB2312 map a few codepoints differently; they are listed below:
 *
 *          gb2312                          gbk
 * A1A4     U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA     U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844     undefined                       U+2015 HORIZONTAL BAR
 */
|
25 |
|
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte GBK sequence (dc1, dc2)
 * and store the resulting code point in `assi`.
 *
 * The three byte pairs on which GBK and GB2312 disagree are handled first.
 * Anything else is tried against the gb2312 decode map (with the high bit
 * of both bytes cleared) and then against the gbkext decode map.
 *
 * The expansion is an if / else-if chain whose last link is a TRYMAP_DEC
 * followed by an empty statement, so every use in this file is followed
 * directly by an `else` that handles an unmappable sequence.
 *
 * All parameters are parenthesized wherever they appear inside an
 * expression, so compound arguments (e.g. `a | b`) are evaluated as a unit.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
|
32 |
|
/*
 * GBK_ENCODE(code, assi): map the Unicode code point `code` to its GBK
 * double-byte value and store it in `assi`.
 *
 * U+2014, U+2015 and U+00B7 are assigned their GBK-specific codes directly.
 * U+30FB is never looked up, so it falls through as unmapped.  Every other
 * code point is looked up in the gbcommon encode map.
 *
 * The expansion ends in an `if` with an empty body; each use in this file
 * is followed directly by an `else` that handles an unmappable character.
 */
#define GBK_ENCODE(code, assi)                                          \
    if ((code) == 0x2014) (assi) = 0xa1aa;                              \
    else if ((code) == 0x2015) (assi) = 0xa844;                         \
    else if ((code) == 0x00b7) (assi) = 0xa1a4;                         \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
|
38 |
|
39 /* |
|
40 * GB2312 codec |
|
41 */ |
|
42 |
|
43 ENCODER(gb2312) |
|
44 { |
|
45 while (inleft > 0) { |
|
46 Py_UNICODE c = IN1; |
|
47 DBCHAR code; |
|
48 |
|
49 if (c < 0x80) { |
|
50 WRITE1((unsigned char)c) |
|
51 NEXT(1, 1) |
|
52 continue; |
|
53 } |
|
54 UCS4INVALID(c) |
|
55 |
|
56 REQUIRE_OUTBUF(2) |
|
57 TRYMAP_ENC(gbcommon, code, c); |
|
58 else return 1; |
|
59 |
|
60 if (code & 0x8000) /* MSB set: GBK */ |
|
61 return 1; |
|
62 |
|
63 OUT1((code >> 8) | 0x80) |
|
64 OUT2((code & 0xFF) | 0x80) |
|
65 NEXT(1, 2) |
|
66 } |
|
67 |
|
68 return 0; |
|
69 } |
|
70 |
|
71 DECODER(gb2312) |
|
72 { |
|
73 while (inleft > 0) { |
|
74 unsigned char c = **inbuf; |
|
75 |
|
76 REQUIRE_OUTBUF(1) |
|
77 |
|
78 if (c < 0x80) { |
|
79 OUT1(c) |
|
80 NEXT(1, 1) |
|
81 continue; |
|
82 } |
|
83 |
|
84 REQUIRE_INBUF(2) |
|
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { |
|
86 NEXT(2, 1) |
|
87 } |
|
88 else return 2; |
|
89 } |
|
90 |
|
91 return 0; |
|
92 } |
|
93 |
|
94 |
|
95 /* |
|
96 * GBK codec |
|
97 */ |
|
98 |
|
99 ENCODER(gbk) |
|
100 { |
|
101 while (inleft > 0) { |
|
102 Py_UNICODE c = IN1; |
|
103 DBCHAR code; |
|
104 |
|
105 if (c < 0x80) { |
|
106 WRITE1((unsigned char)c) |
|
107 NEXT(1, 1) |
|
108 continue; |
|
109 } |
|
110 UCS4INVALID(c) |
|
111 |
|
112 REQUIRE_OUTBUF(2) |
|
113 |
|
114 GBK_ENCODE(c, code) |
|
115 else return 1; |
|
116 |
|
117 OUT1((code >> 8) | 0x80) |
|
118 if (code & 0x8000) |
|
119 OUT2((code & 0xFF)) /* MSB set: GBK */ |
|
120 else |
|
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ |
|
122 NEXT(1, 2) |
|
123 } |
|
124 |
|
125 return 0; |
|
126 } |
|
127 |
|
128 DECODER(gbk) |
|
129 { |
|
130 while (inleft > 0) { |
|
131 unsigned char c = IN1; |
|
132 |
|
133 REQUIRE_OUTBUF(1) |
|
134 |
|
135 if (c < 0x80) { |
|
136 OUT1(c) |
|
137 NEXT(1, 1) |
|
138 continue; |
|
139 } |
|
140 |
|
141 REQUIRE_INBUF(2) |
|
142 |
|
143 GBK_DECODE(c, IN2, **outbuf) |
|
144 else return 2; |
|
145 |
|
146 NEXT(2, 1) |
|
147 } |
|
148 |
|
149 return 0; |
|
150 } |
|
151 |
|
152 |
|
153 /* |
|
154 * GB18030 codec |
|
155 */ |
|
156 |
|
157 ENCODER(gb18030) |
|
158 { |
|
159 while (inleft > 0) { |
|
160 ucs4_t c = IN1; |
|
161 DBCHAR code; |
|
162 |
|
163 if (c < 0x80) { |
|
164 WRITE1(c) |
|
165 NEXT(1, 1) |
|
166 continue; |
|
167 } |
|
168 |
|
169 DECODE_SURROGATE(c) |
|
170 if (c > 0x10FFFF) |
|
171 #if Py_UNICODE_SIZE == 2 |
|
172 return 2; /* surrogates pair */ |
|
173 #else |
|
174 return 1; |
|
175 #endif |
|
176 else if (c >= 0x10000) { |
|
177 ucs4_t tc = c - 0x10000; |
|
178 |
|
179 REQUIRE_OUTBUF(4) |
|
180 |
|
181 OUT4((unsigned char)(tc % 10) + 0x30) |
|
182 tc /= 10; |
|
183 OUT3((unsigned char)(tc % 126) + 0x81) |
|
184 tc /= 126; |
|
185 OUT2((unsigned char)(tc % 10) + 0x30) |
|
186 tc /= 10; |
|
187 OUT1((unsigned char)(tc + 0x90)) |
|
188 |
|
189 #if Py_UNICODE_SIZE == 2 |
|
190 NEXT(2, 4) /* surrogates pair */ |
|
191 #else |
|
192 NEXT(1, 4) |
|
193 #endif |
|
194 continue; |
|
195 } |
|
196 |
|
197 REQUIRE_OUTBUF(2) |
|
198 |
|
199 GBK_ENCODE(c, code) |
|
200 else TRYMAP_ENC(gb18030ext, code, c); |
|
201 else { |
|
202 const struct _gb18030_to_unibmp_ranges *utrrange; |
|
203 |
|
204 REQUIRE_OUTBUF(4) |
|
205 |
|
206 for (utrrange = gb18030_to_unibmp_ranges; |
|
207 utrrange->first != 0; |
|
208 utrrange++) |
|
209 if (utrrange->first <= c && |
|
210 c <= utrrange->last) { |
|
211 Py_UNICODE tc; |
|
212 |
|
213 tc = c - utrrange->first + |
|
214 utrrange->base; |
|
215 |
|
216 OUT4((unsigned char)(tc % 10) + 0x30) |
|
217 tc /= 10; |
|
218 OUT3((unsigned char)(tc % 126) + 0x81) |
|
219 tc /= 126; |
|
220 OUT2((unsigned char)(tc % 10) + 0x30) |
|
221 tc /= 10; |
|
222 OUT1((unsigned char)tc + 0x81) |
|
223 |
|
224 NEXT(1, 4) |
|
225 break; |
|
226 } |
|
227 |
|
228 if (utrrange->first == 0) |
|
229 return 1; |
|
230 continue; |
|
231 } |
|
232 |
|
233 OUT1((code >> 8) | 0x80) |
|
234 if (code & 0x8000) |
|
235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */ |
|
236 else |
|
237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */ |
|
238 |
|
239 NEXT(1, 2) |
|
240 } |
|
241 |
|
242 return 0; |
|
243 } |
|
244 |
|
245 DECODER(gb18030) |
|
246 { |
|
247 while (inleft > 0) { |
|
248 unsigned char c = IN1, c2; |
|
249 |
|
250 REQUIRE_OUTBUF(1) |
|
251 |
|
252 if (c < 0x80) { |
|
253 OUT1(c) |
|
254 NEXT(1, 1) |
|
255 continue; |
|
256 } |
|
257 |
|
258 REQUIRE_INBUF(2) |
|
259 |
|
260 c2 = IN2; |
|
261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ |
|
262 const struct _gb18030_to_unibmp_ranges *utr; |
|
263 unsigned char c3, c4; |
|
264 ucs4_t lseq; |
|
265 |
|
266 REQUIRE_INBUF(4) |
|
267 c3 = IN3; |
|
268 c4 = IN4; |
|
269 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) |
|
270 return 4; |
|
271 c -= 0x81; c2 -= 0x30; |
|
272 c3 -= 0x81; c4 -= 0x30; |
|
273 |
|
274 if (c < 4) { /* U+0080 - U+FFFF */ |
|
275 lseq = ((ucs4_t)c * 10 + c2) * 1260 + |
|
276 (ucs4_t)c3 * 10 + c4; |
|
277 if (lseq < 39420) { |
|
278 for (utr = gb18030_to_unibmp_ranges; |
|
279 lseq >= (utr + 1)->base; |
|
280 utr++) ; |
|
281 OUT1(utr->first - utr->base + lseq) |
|
282 NEXT(4, 1) |
|
283 continue; |
|
284 } |
|
285 } |
|
286 else if (c >= 15) { /* U+10000 - U+10FFFF */ |
|
287 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2) |
|
288 * 1260 + (ucs4_t)c3 * 10 + c4; |
|
289 if (lseq <= 0x10FFFF) { |
|
290 WRITEUCS4(lseq); |
|
291 NEXT_IN(4) |
|
292 continue; |
|
293 } |
|
294 } |
|
295 return 4; |
|
296 } |
|
297 |
|
298 GBK_DECODE(c, c2, **outbuf) |
|
299 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); |
|
300 else return 2; |
|
301 |
|
302 NEXT(2, 1) |
|
303 } |
|
304 |
|
305 return 0; |
|
306 } |
|
307 |
|
308 |
|
309 /* |
|
310 * HZ codec |
|
311 */ |
|
312 |
|
313 ENCODER_INIT(hz) |
|
314 { |
|
315 state->i = 0; |
|
316 return 0; |
|
317 } |
|
318 |
|
319 ENCODER_RESET(hz) |
|
320 { |
|
321 if (state->i != 0) { |
|
322 WRITE2('~', '}') |
|
323 state->i = 0; |
|
324 NEXT_OUT(2) |
|
325 } |
|
326 return 0; |
|
327 } |
|
328 |
|
329 ENCODER(hz) |
|
330 { |
|
331 while (inleft > 0) { |
|
332 Py_UNICODE c = IN1; |
|
333 DBCHAR code; |
|
334 |
|
335 if (c < 0x80) { |
|
336 if (state->i == 0) { |
|
337 WRITE1((unsigned char)c) |
|
338 NEXT(1, 1) |
|
339 } |
|
340 else { |
|
341 WRITE3('~', '}', (unsigned char)c) |
|
342 NEXT(1, 3) |
|
343 state->i = 0; |
|
344 } |
|
345 continue; |
|
346 } |
|
347 |
|
348 UCS4INVALID(c) |
|
349 |
|
350 TRYMAP_ENC(gbcommon, code, c); |
|
351 else return 1; |
|
352 |
|
353 if (code & 0x8000) /* MSB set: GBK */ |
|
354 return 1; |
|
355 |
|
356 if (state->i == 0) { |
|
357 WRITE4('~', '{', code >> 8, code & 0xff) |
|
358 NEXT(1, 4) |
|
359 state->i = 1; |
|
360 } |
|
361 else { |
|
362 WRITE2(code >> 8, code & 0xff) |
|
363 NEXT(1, 2) |
|
364 } |
|
365 } |
|
366 |
|
367 return 0; |
|
368 } |
|
369 |
|
370 DECODER_INIT(hz) |
|
371 { |
|
372 state->i = 0; |
|
373 return 0; |
|
374 } |
|
375 |
|
376 DECODER_RESET(hz) |
|
377 { |
|
378 state->i = 0; |
|
379 return 0; |
|
380 } |
|
381 |
|
382 DECODER(hz) |
|
383 { |
|
384 while (inleft > 0) { |
|
385 unsigned char c = IN1; |
|
386 |
|
387 if (c == '~') { |
|
388 unsigned char c2 = IN2; |
|
389 |
|
390 REQUIRE_INBUF(2) |
|
391 if (c2 == '~') { |
|
392 WRITE1('~') |
|
393 NEXT(2, 1) |
|
394 continue; |
|
395 } |
|
396 else if (c2 == '{' && state->i == 0) |
|
397 state->i = 1; /* set GB */ |
|
398 else if (c2 == '}' && state->i == 1) |
|
399 state->i = 0; /* set ASCII */ |
|
400 else if (c2 == '\n') |
|
401 ; /* line-continuation */ |
|
402 else |
|
403 return 2; |
|
404 NEXT(2, 0); |
|
405 continue; |
|
406 } |
|
407 |
|
408 if (c & 0x80) |
|
409 return 1; |
|
410 |
|
411 if (state->i == 0) { /* ASCII mode */ |
|
412 WRITE1(c) |
|
413 NEXT(1, 1) |
|
414 } |
|
415 else { /* GB mode */ |
|
416 REQUIRE_INBUF(2) |
|
417 REQUIRE_OUTBUF(1) |
|
418 TRYMAP_DEC(gb2312, **outbuf, c, IN2) { |
|
419 NEXT(2, 1) |
|
420 } |
|
421 else |
|
422 return 2; |
|
423 } |
|
424 } |
|
425 |
|
426 return 0; |
|
427 } |
|
428 |
|
429 |
|
/*
 * Mapping tables listed for this module.  Within this file gb2312 and
 * gbkext are only used for decoding (TRYMAP_DEC), gbcommon only for
 * encoding (TRYMAP_ENC / TRYMAP_ENC_COND), and gb18030ext for both.
 * NOTE(review): the list macros are defined outside this file (presumably
 * in cjkcodecs.h) -- confirm what they expand to.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
|
436 |
|
/*
 * Codecs listed for this module.  gb2312, gbk and gb18030 define only an
 * ENCODER and a DECODER in this file; hz additionally defines init and
 * reset functions for both directions because it tracks a shift state.
 * NOTE(review): the list macros are defined outside this file (presumably
 * in cjkcodecs.h) -- confirm what they expand to.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
|
443 |
|
/* NOTE(review): presumably expands to the module boilerplate for the "cn"
 * codec collection; the macro is defined outside this file -- confirm. */
I_AM_A_MODULE_FOR(cn)