|
1 """ Unicode Mapping Parser and Codec Generator. |
|
2 |
|
3 This script parses Unicode mapping files as available from the Unicode |
|
4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec |
|
5 modules from them. The codecs use the standard character mapping codec |
|
6 to actually apply the mapping. |
|
7 |
|
8 Synopsis: gencodec.py dir codec_prefix |
|
9 |
|
10 All files in dir are scanned and those producing non-empty mappings |
|
11 will be written to <codec_prefix><mapname>.py with <mapname> being the |
|
12 first part of the map's filename ('a' in a.b.c.txt) converted to |
|
13 lowercase with hyphens replaced by underscores. |
|
14 |
|
15 The tool also writes marshalled versions of the mapping tables to the |
|
16 same location (with .mapping extension). |
|
17 |
|
18 Written by Marc-Andre Lemburg (mal@lemburg.com). |
|
19 |
|
20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
|
21 (c) Copyright Guido van Rossum, 2000. |
|
22 |
|
23 Table generation: |
|
24 (c) Copyright Marc-Andre Lemburg, 2005. |
|
25 Licensed to PSF under a Contributor Agreement. |
|
26 |
|
27 """#" |
|
28 |
|
29 import re, os, marshal, codecs |
|
30 |
|
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (U+FFFE), used as the placeholder
# character for undefined entries in generated decoding tables.
UNI_UNDEFINED = u'\ufffe'

# Matches one mapping line and yields three groups:
#   1. the encoded code(s): one or more 0x... hex codes joined by '+'
#   2. the Unicode code(s): zero or more 0x... codes or <meta> markers,
#      joined by '+' (may be empty)
#   3. an optional trailing comment, including its leading '#'
# Raw strings are used so that '\s' and '\+' reach the regex engine as
# written instead of relying on unknown string escapes being passed through.
# NOTE(review): the hex class in group 2 is spelled [0-9a-fA-Z], which also
# accepts the letters G-Z -- this looks like a typo for A-F; it is left
# unchanged here so that the set of matched lines stays the same.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
|
42 |
|
def parsecodes(codes,
               len=len, filter=filter,range=range):

    """ Converts a '+'-separated string of hex codes to either a
        single integer or a tuple of integers.

        An empty (or None) codes argument returns None.

        A string without any '+' is converted directly with
        int(..., 16); a ValueError is raised if it is not a hex
        number.

        In a '+'-separated sequence, pieces that are not hex numbers
        (meta-codes in angular brackets such as <LR> and <RL>, or the
        empty piece left by a trailing '+') are dropped.  If exactly
        one integer remains it is returned as such, otherwise the
        remaining integers are returned as a tuple (which may be
        empty).

        The len, filter and range parameters only exist as local
        aliases of the builtins and are kept for interface
        compatibility.

    """
    if not codes:
        return None
    parts = codes.split('+')
    if len(parts) == 1:
        # A lone code is not filtered: let int() complain if needed
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            values.append(int(part, 16))
        except ValueError:
            # Not a hex number (meta-code or empty piece): skip it
            pass
    if len(values) == 1:
        return values[0]
    return tuple(values)
|
70 |
|
def readmap(filename):

    """ Reads the mapping file filename and returns it as dictionary.

        The result maps each encoded code (an integer, or a tuple of
        integers for '+'-joined sequences) to a tuple
        (unicode_code, comment), where unicode_code is whatever
        parsecodes() returned for the second column and comment is
        the text after the '#' on that line ('' if there was none).

        Code points 0x00-0x1F and 0x7F are pre-seeded as identity
        mappings with the comment 'CONTROL CHARACTER'; explicit
        entries in the file override them.

        If at least as many identity mappings as unmapped single-byte
        codes were seen, the unmapped codes (below 256) are added with
        the value (None, "") and the special key 'IDENTITY' is set
        to 256.

        Blank lines, lines starting with '#' and lines not matched by
        mapRE are skipped.

    """
    # Make sure the file is closed even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()

    enc2uni = {}
    identity = []
    # Real lists are needed here: entries are removed one by one
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'
            comment = comment[1:].strip()
        # Only single integer codes take part in the identity/unmapped
        # bookkeeping; code sequences (tuples) are stored as they are.
        # The explicit type check avoids ordering a tuple against an int.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
|
121 |
|
def hexrepr(t, precision=4):

    """ Returns a hex source code representation of t.

        None is rendered as 'None'.  An object without a length (a
        plain integer) is rendered as '0x...' using at least
        precision upper-case hex digits.  Anything with a length is
        treated as a sequence of integers and rendered as a
        parenthesized, comma separated list of such hex numbers.

        If an item of the sequence cannot be formatted, a diagnostic
        line is printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence.  Only the
        # TypeError raised by len() is expected here; anything else
        # should not be silently swallowed.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
|
136 |
|
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of source code lines defining varname as
        dictionary built from map.

        If map contains the key "IDENTITY", the generated code starts
        from codecs.make_identity_dict(range(<that value>)) and only
        adds the differing entries via .update(); the "IDENTITY" key
        is removed from map in place.

        Keys and values given as tuples are unpacked as
        (code, comment) pairs.  Entries whose key is None are skipped.
        Comments are appended to the entries if comments is true.
        precisions is the pair (key_precision, value_precision)
        passed to hexrepr().

    """
    lines = []
    emit = lines.append

    identity = "IDENTITY" in map
    if identity:
        emit("%s = codecs.make_identity_dict(range(%d))" %
             (varname, map["IDENTITY"]))
        emit("%s.update({" % varname)
        # The marker is not a mapping entry: take it out of the map
        del map["IDENTITY"]
        splits = 1
    else:
        emit("%s = {" % varname)
        splits = 0

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if identity and mapkey == mapvalue and mapkey < 256:
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            emit(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            emit(' %s: %s,' % (key, value))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            emit('}' if splits == 0 else '})')
            emit('%s.update({' % varname)
            count = 0
            splits += 1

    # Close whichever literal is currently open
    emit('}' if splits == 0 else '})')

    return lines
|
194 |
|
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of source code lines defining varname as a
        parenthesized sequence of character literals, one per key
        from 0 up to the largest key in map.

        If map contains the key 'IDENTITY', the first 256 entries are
        pre-filled with identity mappings and the key is removed from
        map in place.

        Keys and values given as tuples are unpacked as
        (code, comment) pairs.  Entries whose key is None are
        skipped.  Keys without a mapping and keys mapped to None are
        emitted as UNI_UNDEFINED.

        None is returned instead of a list if the largest key exceeds
        MAX_TABLE_SIZE or if a value is a sequence of codes (1-n
        mappings are not supported).

    """
    lines = []
    append = lines.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']

    # The entries are collected only after the 'IDENTITY' marker has
    # been removed; otherwise the marker itself would be processed as
    # a mapping, end up as maxkey and make the size check below fail.
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t# %s -> %s' % (mapchar,
                                       hexrepr(key, key_precision),
                                       mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return lines
|
250 |
|
def codegen(name, map, encodingname, comments=1):

    """ Returns the Python source code of a codec module for map.

        name is mentioned in the module docstring as the source the
        codec was generated from; encodingname is used in the
        docstring and, with underscores replaced by hyphens, as the
        name passed to codecs.CodecInfo in the generated module.

        The module uses a decoding table if python_tabledef_code()
        can produce one for map, and falls back to decoding and
        encoding dictionaries otherwise.

        Comments are included in the source, if comments is true
        (default).

    """
    # Generate code.  The order matters: the first call strips the
    # 'IDENTITY' marker from map before it is handed on to the table
    # generator and to codecs.make_encoding_map().
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    suffix = 'table' if decoding_table_code else 'map'

    header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [header, incremental, registry]

    if decoding_table_code:
        # Table based codec: the encoding table is derived from the
        # decoding table when the generated module is imported
        parts.append('''
### Decoding Table
''')
        parts.extend(decoding_table_code)
        parts.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # Dictionary based codec
        parts.append('''
### Decoding Map
''')
        parts.extend(decoding_map_code)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_map_code)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
|
355 |
|
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Generates the codec source for map with codegen() and writes
        it to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen() unchanged.

    """
    code = codegen(name,map,encodingname,comments)
    # The with block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
|
362 |
|
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair; the pairs are copied
        into a fresh dictionary before dumping.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with block closes the file even if dumping fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
|
371 |
|
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir.

        For every regular file in dir the codec name is built from
        the part of the filename before the first '.', with hyphens
        replaced by underscores, lowercased and prefixed with
        nameprefix.  The generated module is written to
        dirprefix + name + '.py' and the marshalled mapping to
        dirprefix + name + '.mapping'.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        basename = os.path.split(mapname)[1]
        name = nameprefix + basename.replace('-','_').split('.')[0].lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            mapping = readmap(mappathname)
            if mapping:
                pymap(mappathname, mapping, dirprefix + codefile,name,comments)
                marshalmap(mappathname, mapping, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
|
399 |
|
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the marshalled mappings
        found in dir.

        Every file in dir ending in '.mapping' is loaded with marshal
        and, unless the loaded map is empty, turned into the module
        dirprefix + <name> + '.py', <name> being the filename
        without the '.mapping' extension.

        A ValueError raised while processing a file is reported and
        processing continues with the next file.

    """
    for mapname in os.listdir(dir):
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # Close the mapping file right after loading instead of
            # leaving the handle to the garbage collector
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
|
419 |
|
if __name__ == '__main__':

    import sys
    # Hard-wired switch between the two modes of operation: convert
    # mapping files (default) or regenerate the modules from the
    # marshalled mappings.  The command line arguments are passed on
    # as positional arguments.
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])