|
1 """ Standard "encodings" Package |
|
2 |
|
3 Standard Python encoding modules are stored in this package |
|
4 directory. |
|
5 |
|
6 Codec modules must have names corresponding to normalized encoding |
|
7 names as defined in the normalize_encoding() function below, e.g. |
|
8 'utf-8' must be implemented by the module 'utf_8.py'. |
|
9 |
|
10 Each codec module must export the following interface: |
|
11 |
|
12 * getregentry() -> codecs.CodecInfo object |
|
13 The getregentry() API must return a CodecInfo object with encoder, decoder, |
|
14 incrementalencoder, incrementaldecoder, streamwriter and streamreader |
|
15 attributes which adhere to the Python Codec Interface Standard. |
|
16 |
|
17 In addition, a module may optionally also define the following |
|
18 APIs which are then used by the package's codec search function: |
|
19 |
|
20 * getaliases() -> sequence of encoding name strings to use as aliases |
|
21 |
|
22 Alias names returned by getaliases() must be normalized encoding |
|
23 names as defined by normalize_encoding(). |
|
24 |
|
25 Written by Marc-Andre Lemburg (mal@lemburg.com). |
|
26 |
|
27 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
|
28 |
|
29 """#" |
|
30 |
|
31 import codecs |
|
32 from encodings import aliases |
|
33 import __builtin__ |
|
34 |
|
# Maps an encoding name (exactly as passed to search_function) to its
# registry entry; None is stored for names that could not be resolved.
_cache = {}

# Sentinel for "not in _cache".  None cannot be used for this because None
# is itself a legitimate cached value (a remembered failed lookup).
_unknown = '--unknown--'

# Non-empty fromlist for __import__() so that the submodule itself is
# returned instead of the top-level 'encodings' package.
_import_tail = ['*']

# Translation table for normalize_encoding(): ASCII letters, digits and the
# dot map to themselves, every other character code maps to a space.
# str.translate() requires the table to be exactly 256 characters long, so
# it is generated instead of being spelled out as a whitespace-sensitive
# literal whose length can silently drift.
_norm_encoding_map = ''.join(
    c if c in ('.0123456789'
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz') else ' '
    for c in map(chr, range(256)))

# Shared alias table (normalized alias name -> codec module name);
# search_function() adds to it when a codec module exports getaliases().
_aliases = aliases.aliases
|
45 |
|
class CodecRegistryError(LookupError, SystemError):
    """Raised when a codec module returns an unusable registry entry.

    Derives from both LookupError and SystemError, so handlers written
    for either base class will catch it.
    """
|
48 |
|
def normalize_encoding(encoding):
    """Return the normalized form of the encoding name *encoding*.

    Every run of characters that are neither alphanumeric nor a dot
    (dots are kept because they are used in Python package names) is
    replaced by a single underscore, e.g. ' -;#' becomes '_'.  Leading
    and trailing underscores are dropped.

    Encoding names should be ASCII only; if they do contain non-ASCII
    characters, these must be Latin-1 compatible.
    """
    # .translate() works differently for Unicode strings, so convert to
    # an 8-bit string first.  The unicode type is looked up on the
    # builtins module because it may not exist.
    unicode_type = getattr(__builtin__, "unicode", None)
    if unicode_type is not None and isinstance(encoding, unicode_type):
        # .encode('latin-1') does *not* go through the codec registry,
        # so this call cannot recurse into the search function.  (See
        # PyUnicode_AsEncodedString() in unicodeobject.c for details.)
        encoding = encoding.encode('latin-1')

    # The table turns every separator character into a space; split()
    # then collapses runs and strips both ends.
    words = encoding.translate(_norm_encoding_map).split()
    return '_'.join(words)
|
70 |
|
def _registry_entry_to_codecinfo(entry, mod):
    """Validate a legacy tuple registry entry and wrap it in a CodecInfo.

    *entry* is the sequence returned by a codec module's getregentry()
    when it is not already a codecs.CodecInfo: 4 to 7 items in CodecInfo
    argument order.  The first two items must be callable; items 3 to 6
    must be callable or None.  *mod* is the codec module; it is used for
    error messages and to derive a codec name when the entry has none.

    Raises CodecRegistryError if the entry has the wrong length or
    contains a non-callable where a callable is required.
    """
    if not 4 <= len(entry) <= 7:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register'
            % (mod.__name__, mod.__file__))

    required, optional = entry[:2], entry[2:6]
    if not all(callable(item) for item in required) or \
       any(item is not None and not callable(item) for item in optional):
        raise CodecRegistryError(
            'incompatible codecs in module "%s" (%s)'
            % (mod.__name__, mod.__file__))

    if len(entry) < 7 or entry[6] is None:
        # Pad the optional slots with None and use the module's name
        # (without the leading package component) as the codec name.
        # Slicing to six items first keeps the result at seven items even
        # when a full-length entry carries None in its name slot; simply
        # appending would produce eight items and make the CodecInfo()
        # call fail.  Building a new tuple also avoids mutating a list
        # owned by the codec module.
        entry = (tuple(entry[:6])
                 + (None,) * (6 - len(entry))
                 + (mod.__name__.split(".", 1)[1],))
    return codecs.CodecInfo(*entry)


def search_function(encoding):
    """Look up the codec for *encoding* among the 'encodings' submodules.

    Returns the codec's codecs.CodecInfo object, or None if no matching
    codec module was found.  Both outcomes are remembered in _cache under
    the name exactly as given.

    Raises CodecRegistryError if a codec module was found but its
    getregentry() result is not a usable registry entry.
    """
    # Cache lookup.  The _unknown sentinel is needed because None is a
    # valid cached value (a previous failed lookup).
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First try to find an alias for the normalized encoding name and
    # look up the module using the aliased name, then fall back to the
    # normalized name itself.
    #
    norm_encoding = normalize_encoding(encoding)
    aliased_encoding = _aliases.get(norm_encoding) or \
                       _aliases.get(norm_encoding.replace('.', '_'))
    if aliased_encoding is not None:
        modnames = [aliased_encoding, norm_encoding]
    else:
        modnames = [norm_encoding]

    for modname in modnames:
        # Empty names and names containing a dot are never imported.
        if not modname or '.' in modname:
            continue
        try:
            # Import is absolute to prevent the possibly malicious import
            # of a module with side-effects that is not in the 'encodings'
            # package.
            mod = __import__('encodings.' + modname, fromlist=_import_tail,
                             level=0)
        except ImportError:
            pass
        else:
            break
    else:
        mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module (this also covers mod being None)
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = getregentry()
    if not isinstance(entry, codecs.CodecInfo):
        entry = _registry_entry_to_codecinfo(entry, mod)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases).  modname is still bound to the name that was imported
    # successfully in the loop above.
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        for alias in codecaliases:
            if alias not in _aliases:
                _aliases[alias] = modname

    # Return the registry entry
    return entry
|
155 |
|
# Register search_function in the Python codec registry.  This statement
# sits at module level, so it runs when the module is executed.
codecs.register(search_function)