symbian-qemu-0.9.1-12/python-2.6.1/Modules/cjkcodecs/_codecs_cn.c
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 /*
       
     2  * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
       
     3  *
       
     4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
       
     5  */
       
     6 
       
     7 #include "cjkcodecs.h"
       
     8 #include "mappings_cn.h"
       
     9 
       
/*
 * AIX predefines `hz' as 100.  Undefine it here so that it does not
 * clash with the identifiers of the hz codec defined in this file.
 */
       
    14 #ifdef _AIX
       
    15 #undef hz
       
    16 #endif
       
    17 
       
/* GBK and GB2312 map a few code points differently, as listed below:
 *
 *		gb2312				gbk
 * A1A4		U+30FB KATAKANA MIDDLE DOT	U+00B7 MIDDLE DOT
 * A1AA		U+2015 HORIZONTAL BAR		U+2014 EM DASH
 * A844		undefined			U+2015 HORIZONTAL BAR
 */
       
    25 
       
    26 #define GBK_DECODE(dc1, dc2, assi) \
       
    27 	if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
       
    28 	else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
       
    29 	else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
       
    30 	else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
       
    31 	else TRYMAP_DEC(gbkext, assi, dc1, dc2);
       
    32 
       
    33 #define GBK_ENCODE(code, assi) \
       
    34 	if ((code) == 0x2014) (assi) = 0xa1aa; \
       
    35 	else if ((code) == 0x2015) (assi) = 0xa844; \
       
    36 	else if ((code) == 0x00b7) (assi) = 0xa1a4; \
       
    37 	else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
       
    38 
       
    39 /*
       
    40  * GB2312 codec
       
    41  */
       
    42 
       
    43 ENCODER(gb2312)
       
    44 {
       
    45 	while (inleft > 0) {
       
    46 		Py_UNICODE c = IN1;
       
    47 		DBCHAR code;
       
    48 
       
    49 		if (c < 0x80) {
       
    50 			WRITE1((unsigned char)c)
       
    51 			NEXT(1, 1)
       
    52 			continue;
       
    53 		}
       
    54 		UCS4INVALID(c)
       
    55 
       
    56 		REQUIRE_OUTBUF(2)
       
    57 		TRYMAP_ENC(gbcommon, code, c);
       
    58 		else return 1;
       
    59 
       
    60 		if (code & 0x8000) /* MSB set: GBK */
       
    61 			return 1;
       
    62 
       
    63 		OUT1((code >> 8) | 0x80)
       
    64 		OUT2((code & 0xFF) | 0x80)
       
    65 		NEXT(1, 2)
       
    66 	}
       
    67 
       
    68 	return 0;
       
    69 }
       
    70 
       
    71 DECODER(gb2312)
       
    72 {
       
    73 	while (inleft > 0) {
       
    74 		unsigned char c = **inbuf;
       
    75 
       
    76 		REQUIRE_OUTBUF(1)
       
    77 
       
    78 		if (c < 0x80) {
       
    79 			OUT1(c)
       
    80 			NEXT(1, 1)
       
    81 			continue;
       
    82 		}
       
    83 
       
    84 		REQUIRE_INBUF(2)
       
    85 		TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
       
    86 			NEXT(2, 1)
       
    87 		}
       
    88 		else return 2;
       
    89 	}
       
    90 
       
    91 	return 0;
       
    92 }
       
    93 
       
    94 
       
    95 /*
       
    96  * GBK codec
       
    97  */
       
    98 
       
    99 ENCODER(gbk)
       
   100 {
       
   101 	while (inleft > 0) {
       
   102 		Py_UNICODE c = IN1;
       
   103 		DBCHAR code;
       
   104 
       
   105 		if (c < 0x80) {
       
   106 			WRITE1((unsigned char)c)
       
   107 			NEXT(1, 1)
       
   108 			continue;
       
   109 		}
       
   110 		UCS4INVALID(c)
       
   111 
       
   112 		REQUIRE_OUTBUF(2)
       
   113 
       
   114 		GBK_ENCODE(c, code)
       
   115 		else return 1;
       
   116 
       
   117 		OUT1((code >> 8) | 0x80)
       
   118 		if (code & 0x8000)
       
   119 			OUT2((code & 0xFF)) /* MSB set: GBK */
       
   120 		else
       
   121 			OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
       
   122 		NEXT(1, 2)
       
   123 	}
       
   124 
       
   125 	return 0;
       
   126 }
       
   127 
       
   128 DECODER(gbk)
       
   129 {
       
   130 	while (inleft > 0) {
       
   131 		unsigned char c = IN1;
       
   132 
       
   133 		REQUIRE_OUTBUF(1)
       
   134 
       
   135 		if (c < 0x80) {
       
   136 			OUT1(c)
       
   137 			NEXT(1, 1)
       
   138 			continue;
       
   139 		}
       
   140 
       
   141 		REQUIRE_INBUF(2)
       
   142 
       
   143 		GBK_DECODE(c, IN2, **outbuf)
       
   144 		else return 2;
       
   145 
       
   146 		NEXT(2, 1)
       
   147 	}
       
   148 
       
   149 	return 0;
       
   150 }
       
   151 
       
   152 
       
   153 /*
       
   154  * GB18030 codec
       
   155  */
       
   156 
       
   157 ENCODER(gb18030)
       
   158 {
       
   159 	while (inleft > 0) {
       
   160 		ucs4_t c = IN1;
       
   161 		DBCHAR code;
       
   162 
       
   163 		if (c < 0x80) {
       
   164 			WRITE1(c)
       
   165 			NEXT(1, 1)
       
   166 			continue;
       
   167 		}
       
   168 
       
   169 		DECODE_SURROGATE(c)
       
   170 		if (c > 0x10FFFF)
       
   171 #if Py_UNICODE_SIZE == 2
       
   172 			return 2; /* surrogates pair */
       
   173 #else
       
   174 			return 1;
       
   175 #endif
       
   176 		else if (c >= 0x10000) {
       
   177 			ucs4_t tc = c - 0x10000;
       
   178 
       
   179 			REQUIRE_OUTBUF(4)
       
   180 
       
   181 			OUT4((unsigned char)(tc % 10) + 0x30)
       
   182 			tc /= 10;
       
   183 			OUT3((unsigned char)(tc % 126) + 0x81)
       
   184 			tc /= 126;
       
   185 			OUT2((unsigned char)(tc % 10) + 0x30)
       
   186 			tc /= 10;
       
   187 			OUT1((unsigned char)(tc + 0x90))
       
   188 
       
   189 #if Py_UNICODE_SIZE == 2
       
   190 			NEXT(2, 4) /* surrogates pair */
       
   191 #else
       
   192 			NEXT(1, 4)
       
   193 #endif
       
   194 			continue;
       
   195 		}
       
   196 
       
   197 		REQUIRE_OUTBUF(2)
       
   198 
       
   199 		GBK_ENCODE(c, code)
       
   200 		else TRYMAP_ENC(gb18030ext, code, c);
       
   201 		else {
       
   202 			const struct _gb18030_to_unibmp_ranges *utrrange;
       
   203 
       
   204 			REQUIRE_OUTBUF(4)
       
   205 
       
   206 			for (utrrange = gb18030_to_unibmp_ranges;
       
   207 			     utrrange->first != 0;
       
   208 			     utrrange++)
       
   209 				if (utrrange->first <= c &&
       
   210 				    c <= utrrange->last) {
       
   211 					Py_UNICODE tc;
       
   212 
       
   213 					tc = c - utrrange->first +
       
   214 					     utrrange->base;
       
   215 
       
   216 					OUT4((unsigned char)(tc % 10) + 0x30)
       
   217 					tc /= 10;
       
   218 					OUT3((unsigned char)(tc % 126) + 0x81)
       
   219 					tc /= 126;
       
   220 					OUT2((unsigned char)(tc % 10) + 0x30)
       
   221 					tc /= 10;
       
   222 					OUT1((unsigned char)tc + 0x81)
       
   223 
       
   224 					NEXT(1, 4)
       
   225 					break;
       
   226 				}
       
   227 
       
   228 			if (utrrange->first == 0)
       
   229 				return 1;
       
   230 			continue;
       
   231 		}
       
   232 
       
   233 		OUT1((code >> 8) | 0x80)
       
   234 		if (code & 0x8000)
       
   235 			OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
       
   236 		else
       
   237 			OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
       
   238 
       
   239 		NEXT(1, 2)
       
   240 	}
       
   241 
       
   242 	return 0;
       
   243 }
       
   244 
       
   245 DECODER(gb18030)
       
   246 {
       
   247 	while (inleft > 0) {
       
   248 		unsigned char c = IN1, c2;
       
   249 
       
   250 		REQUIRE_OUTBUF(1)
       
   251 
       
   252 		if (c < 0x80) {
       
   253 			OUT1(c)
       
   254 			NEXT(1, 1)
       
   255 			continue;
       
   256 		}
       
   257 
       
   258 		REQUIRE_INBUF(2)
       
   259 
       
   260 		c2 = IN2;
       
   261 		if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
       
   262 			const struct _gb18030_to_unibmp_ranges *utr;
       
   263 			unsigned char c3, c4;
       
   264 			ucs4_t lseq;
       
   265 
       
   266 			REQUIRE_INBUF(4)
       
   267 			c3 = IN3;
       
   268 			c4 = IN4;
       
   269 			if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
       
   270 				return 4;
       
   271 			c -= 0x81;  c2 -= 0x30;
       
   272 			c3 -= 0x81; c4 -= 0x30;
       
   273 
       
   274 			if (c < 4) { /* U+0080 - U+FFFF */
       
   275 				lseq = ((ucs4_t)c * 10 + c2) * 1260 +
       
   276 					(ucs4_t)c3 * 10 + c4;
       
   277 				if (lseq < 39420) {
       
   278 					for (utr = gb18030_to_unibmp_ranges;
       
   279 					     lseq >= (utr + 1)->base;
       
   280 					     utr++) ;
       
   281 					OUT1(utr->first - utr->base + lseq)
       
   282 					NEXT(4, 1)
       
   283 					continue;
       
   284 				}
       
   285 			}
       
   286 			else if (c >= 15) { /* U+10000 - U+10FFFF */
       
   287 				lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
       
   288 					* 1260 + (ucs4_t)c3 * 10 + c4;
       
   289 				if (lseq <= 0x10FFFF) {
       
   290 					WRITEUCS4(lseq);
       
   291 					NEXT_IN(4)
       
   292 					continue;
       
   293 				}
       
   294 			}
       
   295 			return 4;
       
   296 		}
       
   297 
       
   298 		GBK_DECODE(c, c2, **outbuf)
       
   299 		else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
       
   300 		else return 2;
       
   301 
       
   302 		NEXT(2, 1)
       
   303 	}
       
   304 
       
   305 	return 0;
       
   306 }
       
   307 
       
   308 
       
   309 /*
       
   310  * HZ codec
       
   311  */
       
   312 
       
   313 ENCODER_INIT(hz)
       
   314 {
       
   315 	state->i = 0;
       
   316 	return 0;
       
   317 }
       
   318 
       
   319 ENCODER_RESET(hz)
       
   320 {
       
   321 	if (state->i != 0) {
       
   322 		WRITE2('~', '}')
       
   323 		state->i = 0;
       
   324 		NEXT_OUT(2)
       
   325 	}
       
   326 	return 0;
       
   327 }
       
   328 
       
   329 ENCODER(hz)
       
   330 {
       
   331 	while (inleft > 0) {
       
   332 		Py_UNICODE c = IN1;
       
   333 		DBCHAR code;
       
   334 
       
   335 		if (c < 0x80) {
       
   336 			if (state->i == 0) {
       
   337 				WRITE1((unsigned char)c)
       
   338 				NEXT(1, 1)
       
   339 			}
       
   340 			else {
       
   341 				WRITE3('~', '}', (unsigned char)c)
       
   342 				NEXT(1, 3)
       
   343 				state->i = 0;
       
   344 			}
       
   345 			continue;
       
   346 		}
       
   347 
       
   348 		UCS4INVALID(c)
       
   349 
       
   350 		TRYMAP_ENC(gbcommon, code, c);
       
   351 		else return 1;
       
   352 
       
   353 		if (code & 0x8000) /* MSB set: GBK */
       
   354 			return 1;
       
   355 
       
   356 		if (state->i == 0) {
       
   357 			WRITE4('~', '{', code >> 8, code & 0xff)
       
   358 			NEXT(1, 4)
       
   359 			state->i = 1;
       
   360 		}
       
   361 		else {
       
   362 			WRITE2(code >> 8, code & 0xff)
       
   363 			NEXT(1, 2)
       
   364 		}
       
   365 	}
       
   366 
       
   367 	return 0;
       
   368 }
       
   369 
       
   370 DECODER_INIT(hz)
       
   371 {
       
   372 	state->i = 0;
       
   373 	return 0;
       
   374 }
       
   375 
       
   376 DECODER_RESET(hz)
       
   377 {
       
   378 	state->i = 0;
       
   379 	return 0;
       
   380 }
       
   381 
       
   382 DECODER(hz)
       
   383 {
       
   384 	while (inleft > 0) {
       
   385 		unsigned char c = IN1;
       
   386 
       
   387 		if (c == '~') {
       
   388 			unsigned char c2 = IN2;
       
   389 
       
   390 			REQUIRE_INBUF(2)
       
   391 			if (c2 == '~') {
       
   392 				WRITE1('~')
       
   393 				NEXT(2, 1)
       
   394 				continue;
       
   395 			}
       
   396 			else if (c2 == '{' && state->i == 0)
       
   397 				state->i = 1; /* set GB */
       
   398 			else if (c2 == '}' && state->i == 1)
       
   399 				state->i = 0; /* set ASCII */
       
   400 			else if (c2 == '\n')
       
   401 				; /* line-continuation */
       
   402 			else
       
   403 				return 2;
       
   404 			NEXT(2, 0);
       
   405 			continue;
       
   406 		}
       
   407 
       
   408 		if (c & 0x80)
       
   409 			return 1;
       
   410 
       
   411 		if (state->i == 0) { /* ASCII mode */
       
   412 			WRITE1(c)
       
   413 			NEXT(1, 1)
       
   414 		}
       
   415 		else { /* GB mode */
       
   416 			REQUIRE_INBUF(2)
       
   417 			REQUIRE_OUTBUF(1)
       
   418 			TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
       
   419 				NEXT(2, 1)
       
   420 			}
       
   421 			else
       
   422 				return 2;
       
   423 		}
       
   424 	}
       
   425 
       
   426 	return 0;
       
   427 }
       
   428 
       
   429 
       
   430 BEGIN_MAPPINGS_LIST
       
   431   MAPPING_DECONLY(gb2312)
       
   432   MAPPING_DECONLY(gbkext)
       
   433   MAPPING_ENCONLY(gbcommon)
       
   434   MAPPING_ENCDEC(gb18030ext)
       
   435 END_MAPPINGS_LIST
       
   436 
       
   437 BEGIN_CODECS_LIST
       
   438   CODEC_STATELESS(gb2312)
       
   439   CODEC_STATELESS(gbk)
       
   440   CODEC_STATELESS(gb18030)
       
   441   CODEC_STATEFUL(hz)
       
   442 END_CODECS_LIST
       
   443 
       
   444 I_AM_A_MODULE_FOR(cn)