symbian-qemu-0.9.1-12/python-2.6.1/Tools/unicode/gencodec.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """ Unicode Mapping Parser and Codec Generator.
       
     2 
       
     3 This script parses Unicode mapping files as available from the Unicode
       
     4 site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
       
     5 modules from them. The codecs use the standard character mapping codec
       
     6 to actually apply the mapping.
       
     7 
       
     8 Synopsis: gencodec.py dir codec_prefix
       
     9 
       
    10 All files in dir are scanned and those producing non-empty mappings
       
    11 will be written to <codec_prefix><mapname>.py with <mapname> being the
       
    12 first part of the map's filename ('a' in a.b.c.txt) converted to
       
    13 lowercase with hyphens replaced by underscores.
       
    14 
       
    15 The tool also writes marshalled versions of the mapping tables to the
       
    16 same location (with .mapping extension).
       
    17 
       
    18 Written by Marc-Andre Lemburg (mal@lemburg.com).
       
    19 
       
    20 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       
    21 (c) Copyright Guido van Rossum, 2000.
       
    22 
       
    23 Table generation:
       
    24 (c) Copyright Marc-Andre Lemburg, 2005.
       
    25     Licensed to PSF under a Contributor Agreement.
       
    26 
       
    27 """#"
       
    28 
       
    29 import re, os, marshal, codecs
       
    30 
       
    31 # Maximum allowed size of charmap tables
       
    32 MAX_TABLE_SIZE = 8192
       
    33 
       
    34 # Standard undefined Unicode code point
       
    35 UNI_UNDEFINED = unichr(0xFFFE)
       
    36 
       
    37 mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
       
    38                    '\s+'
       
    39                    '((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
       
    40                    '\s*'
       
    41                    '(#.+)?')
       
    42 
       
    43 def parsecodes(codes,
       
    44                len=len, filter=filter,range=range):
       
    45 
       
    46     """ Converts code combinations to either a single code integer
       
    47         or a tuple of integers.
       
    48 
       
    49         meta-codes (in angular brackets, e.g. <LR> and <RL>) are
       
    50         ignored.
       
    51 
       
    52         Empty codes or illegal ones are returned as None.
       
    53 
       
    54     """
       
    55     if not codes:
       
    56         return None
       
    57     l = codes.split('+')
       
    58     if len(l) == 1:
       
    59         return int(l[0],16)
       
    60     for i in range(len(l)):
       
    61         try:
       
    62             l[i] = int(l[i],16)
       
    63         except ValueError:
       
    64             l[i] = None
       
    65     l = filter(lambda x: x is not None, l)
       
    66     if len(l) == 1:
       
    67         return l[0]
       
    68     else:
       
    69         return tuple(l)
       
    70 
       
    71 def readmap(filename):
       
    72 
       
    73     f = open(filename,'r')
       
    74     lines = f.readlines()
       
    75     f.close()
       
    76     enc2uni = {}
       
    77     identity = []
       
    78     unmapped = range(256)
       
    79 
       
    80     # UTC mapping tables per convention don't include the identity
       
    81     # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
       
    82     # explicitly mapped to different characters or undefined
       
    83     for i in range(32) + [127]:
       
    84         identity.append(i)
       
    85         unmapped.remove(i)
       
    86         enc2uni[i] = (i, 'CONTROL CHARACTER')
       
    87 
       
    88     for line in lines:
       
    89         line = line.strip()
       
    90         if not line or line[0] == '#':
       
    91             continue
       
    92         m = mapRE.match(line)
       
    93         if not m:
       
    94             #print '* not matched: %s' % repr(line)
       
    95             continue
       
    96         enc,uni,comment = m.groups()
       
    97         enc = parsecodes(enc)
       
    98         uni = parsecodes(uni)
       
    99         if comment is None:
       
   100             comment = ''
       
   101         else:
       
   102             comment = comment[1:].strip()
       
   103         if enc < 256:
       
   104             if enc in unmapped:
       
   105                 unmapped.remove(enc)
       
   106             if enc == uni:
       
   107                 identity.append(enc)
       
   108             enc2uni[enc] = (uni,comment)
       
   109         else:
       
   110             enc2uni[enc] = (uni,comment)
       
   111 
       
   112     # If there are more identity-mapped entries than unmapped entries,
       
   113     # it pays to generate an identity dictionary first, and add explicit
       
   114     # mappings to None for the rest
       
   115     if len(identity) >= len(unmapped):
       
   116         for enc in unmapped:
       
   117             enc2uni[enc] = (None, "")
       
   118         enc2uni['IDENTITY'] = 256
       
   119 
       
   120     return enc2uni
       
   121 
       
   122 def hexrepr(t, precision=4):
       
   123 
       
   124     if t is None:
       
   125         return 'None'
       
   126     try:
       
   127         len(t)
       
   128     except:
       
   129         return '0x%0*X' % (precision, t)
       
   130     try:
       
   131         return '(' + ', '.join(['0x%0*X' % (precision, item)
       
   132                                 for item in t]) + ')'
       
   133     except TypeError, why:
       
   134         print '* failed to convert %r: %s' % (t, why)
       
   135         raise
       
   136 
       
   137 def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
       
   138 
       
   139     l = []
       
   140     append = l.append
       
   141     if map.has_key("IDENTITY"):
       
   142         append("%s = codecs.make_identity_dict(range(%d))" %
       
   143                (varname, map["IDENTITY"]))
       
   144         append("%s.update({" % varname)
       
   145         splits = 1
       
   146         del map["IDENTITY"]
       
   147         identity = 1
       
   148     else:
       
   149         append("%s = {" % varname)
       
   150         splits = 0
       
   151         identity = 0
       
   152 
       
   153     mappings = map.items()
       
   154     mappings.sort()
       
   155     i = 0
       
   156     key_precision, value_precision = precisions
       
   157     for mapkey, mapvalue in mappings:
       
   158         mapcomment = ''
       
   159         if isinstance(mapkey, tuple):
       
   160             (mapkey, mapcomment) = mapkey
       
   161         if isinstance(mapvalue, tuple):
       
   162             (mapvalue, mapcomment) = mapvalue
       
   163         if mapkey is None:
       
   164             continue
       
   165         if (identity and
       
   166             mapkey == mapvalue and
       
   167             mapkey < 256):
       
   168             # No need to include identity mappings, since these
       
   169             # are already set for the first 256 code points.
       
   170             continue
       
   171         key = hexrepr(mapkey, key_precision)
       
   172         value = hexrepr(mapvalue, value_precision)
       
   173         if mapcomment and comments:
       
   174             append('    %s: %s,\t#  %s' % (key, value, mapcomment))
       
   175         else:
       
   176             append('    %s: %s,' % (key, value))
       
   177         i += 1
       
   178         if i == 4096:
       
   179             # Split the definition into parts to that the Python
       
   180             # parser doesn't dump core
       
   181             if splits == 0:
       
   182                 append('}')
       
   183             else:
       
   184                 append('})')
       
   185             append('%s.update({' % varname)
       
   186             i = 0
       
   187             splits = splits + 1
       
   188     if splits == 0:
       
   189         append('}')
       
   190     else:
       
   191         append('})')
       
   192 
       
   193     return l
       
   194 
       
   195 def python_tabledef_code(varname, map, comments=1, key_precision=2):
       
   196 
       
   197     l = []
       
   198     append = l.append
       
   199     append('%s = (' % varname)
       
   200 
       
   201     # Analyze map and create table dict
       
   202     mappings = map.items()
       
   203     mappings.sort()
       
   204     table = {}
       
   205     maxkey = 0
       
   206     if map.has_key('IDENTITY'):
       
   207         for key in range(256):
       
   208             table[key] = (key, '')
       
   209         maxkey = 255
       
   210         del map['IDENTITY']
       
   211     for mapkey, mapvalue in mappings:
       
   212         mapcomment = ''
       
   213         if isinstance(mapkey, tuple):
       
   214             (mapkey, mapcomment) = mapkey
       
   215         if isinstance(mapvalue, tuple):
       
   216             (mapvalue, mapcomment) = mapvalue
       
   217         if mapkey is None:
       
   218             continue
       
   219         table[mapkey] = (mapvalue, mapcomment)
       
   220         if mapkey > maxkey:
       
   221             maxkey = mapkey
       
   222     if maxkey > MAX_TABLE_SIZE:
       
   223         # Table too large
       
   224         return None
       
   225 
       
   226     # Create table code
       
   227     for key in range(maxkey + 1):
       
   228         if key not in table:
       
   229             mapvalue = None
       
   230             mapcomment = 'UNDEFINED'
       
   231         else:
       
   232             mapvalue, mapcomment = table[key]
       
   233         if mapvalue is None:
       
   234             mapchar = UNI_UNDEFINED
       
   235         else:
       
   236             if isinstance(mapvalue, tuple):
       
   237                 # 1-n mappings not supported
       
   238                 return None
       
   239             else:
       
   240                 mapchar = unichr(mapvalue)
       
   241         if mapcomment and comments:
       
   242             append('    %r\t#  %s -> %s' % (mapchar,
       
   243                                             hexrepr(key, key_precision),
       
   244                                             mapcomment))
       
   245         else:
       
   246             append('    %r' % mapchar)
       
   247 
       
   248     append(')')
       
   249     return l
       
   250 
       
   251 def codegen(name, map, encodingname, comments=1):
       
   252 
       
   253     """ Returns Python source for the given map.
       
   254 
       
   255         Comments are included in the source, if comments is true (default).
       
   256 
       
   257     """
       
   258     # Generate code
       
   259     decoding_map_code = python_mapdef_code(
       
   260         'decoding_map',
       
   261         map,
       
   262         comments=comments)
       
   263     decoding_table_code = python_tabledef_code(
       
   264         'decoding_table',
       
   265         map,
       
   266         comments=comments)
       
   267     encoding_map_code = python_mapdef_code(
       
   268         'encoding_map',
       
   269         codecs.make_encoding_map(map),
       
   270         comments=comments,
       
   271         precisions=(4, 2))
       
   272 
       
   273     if decoding_table_code:
       
   274         suffix = 'table'
       
   275     else:
       
   276         suffix = 'map'
       
   277 
       
   278     l = [
       
   279         '''\
       
   280 """ Python Character Mapping Codec %s generated from '%s' with gencodec.py.
       
   281 
       
   282 """#"
       
   283 
       
   284 import codecs
       
   285 
       
   286 ### Codec APIs
       
   287 
       
   288 class Codec(codecs.Codec):
       
   289 
       
   290     def encode(self,input,errors='strict'):
       
   291         return codecs.charmap_encode(input,errors,encoding_%s)
       
   292 
       
   293     def decode(self,input,errors='strict'):
       
   294         return codecs.charmap_decode(input,errors,decoding_%s)
       
   295 ''' % (encodingname, name, suffix, suffix)]
       
   296     l.append('''\
       
   297 class IncrementalEncoder(codecs.IncrementalEncoder):
       
   298     def encode(self, input, final=False):
       
   299         return codecs.charmap_encode(input,self.errors,encoding_%s)[0]
       
   300 
       
   301 class IncrementalDecoder(codecs.IncrementalDecoder):
       
   302     def decode(self, input, final=False):
       
   303         return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' %
       
   304         (suffix, suffix))
       
   305 
       
   306     l.append('''
       
   307 class StreamWriter(Codec,codecs.StreamWriter):
       
   308     pass
       
   309 
       
   310 class StreamReader(Codec,codecs.StreamReader):
       
   311     pass
       
   312 
       
   313 ### encodings module API
       
   314 
       
   315 def getregentry():
       
   316     return codecs.CodecInfo(
       
   317         name=%r,
       
   318         encode=Codec().encode,
       
   319         decode=Codec().decode,
       
   320         incrementalencoder=IncrementalEncoder,
       
   321         incrementaldecoder=IncrementalDecoder,
       
   322         streamreader=StreamReader,
       
   323         streamwriter=StreamWriter,
       
   324     )
       
   325 ''' % encodingname.replace('_', '-'))
       
   326 
       
   327     # Add decoding table or map (with preference to the table)
       
   328     if not decoding_table_code:
       
   329         l.append('''
       
   330 ### Decoding Map
       
   331 ''')
       
   332         l.extend(decoding_map_code)
       
   333     else:
       
   334         l.append('''
       
   335 ### Decoding Table
       
   336 ''')
       
   337         l.extend(decoding_table_code)
       
   338 
       
   339     # Add encoding map
       
   340     if decoding_table_code:
       
   341         l.append('''
       
   342 ### Encoding table
       
   343 encoding_table=codecs.charmap_build(decoding_table)
       
   344 ''')
       
   345     else:
       
   346         l.append('''
       
   347 ### Encoding Map
       
   348 ''')
       
   349         l.extend(encoding_map_code)
       
   350 
       
   351     # Final new-line
       
   352     l.append('')
       
   353 
       
   354     return '\n'.join(l).expandtabs()
       
   355 
       
   356 def pymap(name,map,pyfile,encodingname,comments=1):
       
   357 
       
   358     code = codegen(name,map,encodingname,comments)
       
   359     f = open(pyfile,'w')
       
   360     f.write(code)
       
   361     f.close()
       
   362 
       
   363 def marshalmap(name,map,marshalfile):
       
   364 
       
   365     d = {}
       
   366     for e,(u,c) in map.items():
       
   367         d[e] = (u,c)
       
   368     f = open(marshalfile,'wb')
       
   369     marshal.dump(d,f)
       
   370     f.close()
       
   371 
       
   372 def convertdir(dir, dirprefix='', nameprefix='', comments=1):
       
   373 
       
   374     mapnames = os.listdir(dir)
       
   375     for mapname in mapnames:
       
   376         mappathname = os.path.join(dir, mapname)
       
   377         if not os.path.isfile(mappathname):
       
   378             continue
       
   379         name = os.path.split(mapname)[1]
       
   380         name = name.replace('-','_')
       
   381         name = name.split('.')[0]
       
   382         name = name.lower()
       
   383         name = nameprefix + name
       
   384         codefile = name + '.py'
       
   385         marshalfile = name + '.mapping'
       
   386         print 'converting %s to %s and %s' % (mapname,
       
   387                                               dirprefix + codefile,
       
   388                                               dirprefix + marshalfile)
       
   389         try:
       
   390             map = readmap(os.path.join(dir,mapname))
       
   391             if not map:
       
   392                 print '* map is empty; skipping'
       
   393             else:
       
   394                 pymap(mappathname, map, dirprefix + codefile,name,comments)
       
   395                 marshalmap(mappathname, map, dirprefix + marshalfile)
       
   396         except ValueError, why:
       
   397             print '* conversion failed: %s' % why
       
   398             raise
       
   399 
       
   400 def rewritepythondir(dir, dirprefix='', comments=1):
       
   401 
       
   402     mapnames = os.listdir(dir)
       
   403     for mapname in mapnames:
       
   404         if not mapname.endswith('.mapping'):
       
   405             continue
       
   406         name = mapname[:-len('.mapping')]
       
   407         codefile = name + '.py'
       
   408         print 'converting %s to %s' % (mapname,
       
   409                                        dirprefix + codefile)
       
   410         try:
       
   411             map = marshal.load(open(os.path.join(dir,mapname),
       
   412                                'rb'))
       
   413             if not map:
       
   414                 print '* map is empty; skipping'
       
   415             else:
       
   416                 pymap(mapname, map, dirprefix + codefile,name,comments)
       
   417         except ValueError, why:
       
   418             print '* conversion failed: %s' % why
       
   419 
       
   420 if __name__ == '__main__':
       
   421 
       
   422     import sys
       
   423     if 1:
       
   424         apply(convertdir,tuple(sys.argv[1:]))
       
   425     else:
       
   426         apply(rewritepythondir,tuple(sys.argv[1:]))