|
1 # -*- coding: iso-8859-1 -*- |
|
2 """ Codec for the Punycode encoding, as specified in RFC 3492 |
|
3 |
|
4 Written by Martin v. Löwis. |
|
5 """ |
|
6 |
|
7 import codecs |
|
8 |
|
9 ##################### Encoding ##################################### |
|
10 |
|
def segregate(str):
    """3.1 Basic code point segregation

    Split str into its basic (ordinal < 128) and extended characters.

    Returns a pair (base, extended): base is the basic characters in
    their original order, joined and encoded as ASCII; extended is a
    sorted list holding each non-basic character once.
    """
    base = []
    # A real set replaces the old "dict with dummy values" idiom.
    extended = set()
    for c in str:
        if ord(c) < 128:
            base.append(c)
        else:
            extended.add(c)
    # sorted() always yields a new list, whatever the container type.
    return "".join(base).encode("ascii"), sorted(extended)
|
23 |
|
def selective_len(str, max):
    """Count the characters of str whose ordinal is strictly below max."""
    return sum(1 for ch in str if ord(ch) < max)
|
31 |
|
def selective_find(str, char, index, pos):
    """Locate the next occurrence of char in str after position pos.

    Returns a pair (index, pos). pos is the offset of the occurrence in
    the full string. index is the running count that started from the
    index passed in: it is advanced once for every character smaller
    than char that is skipped, and once more for the match itself.
    Pass index == pos == -1 to start from the beginning of the string,
    or the previously returned pair to continue the search.

    Returns (-1, -1) when there is no further occurrence.
    """
    end = len(str)
    pos += 1
    while pos != end:
        current = str[pos]
        if current == char:
            return index + 1, pos
        if current < char:
            # Only characters ordered before char take part in the count.
            index += 1
        pos += 1
    return (-1, -1)
|
49 |
|
def insertion_unsort(str, extended):
    """3.2 Insertion unsort coding

    Walk the characters in extended (in the order given) and, for every
    occurrence of each one in str, emit one delta. Returns the list of
    deltas in emission order.
    """
    deltas = []
    prev_code = 0x80
    prev_index = -1
    for ch in extended:
        code = ord(ch)
        # Stepping from the previous code point up to this one is weighted
        # by the number of characters below it, plus one.
        pending = (selective_len(str, code) + 1) * (code - prev_code)
        index, pos = selective_find(str, ch, -1, -1)
        while index != -1:
            pending += index - prev_index
            deltas.append(pending - 1)
            prev_index = index
            pending = 0
            index, pos = selective_find(str, ch, index, pos)
        prev_code = code

    return deltas
|
71 |
|
def T(j, bias):
    """Return the threshold for digit position j under the given bias.

    The raw value 36 * (j + 1) - bias is clamped to the range 1..26.
    """
    # Punycode parameters: tmin = 1, tmax = 26, base = 36
    return max(1, min(26, 36 * (j + 1) - bias))
|
78 |
|
# Digit alphabet: index 0-25 map to a-z, 26-35 map to 0-9.
digits = "abcdefghijklmnopqrstuvwxyz0123456789"
def generate_generalized_integer(N, bias):
    """3.3 Generalized variable-length integers

    Encode the integer N as a list of characters taken from digits,
    using the per-position thresholds given by T(j, bias). The final
    digit is the first one that falls below its threshold.
    """
    encoded = []
    position = 0
    threshold = T(position, bias)
    while N >= threshold:
        # N - threshold is non-negative here, so divmod matches the
        # separate % and // operations.
        N, remainder = divmod(N - threshold, 36 - threshold)
        encoded.append(digits[threshold + remainder])
        position += 1
        threshold = T(position, bias)
    encoded.append(digits[N])
    return encoded
|
92 |
|
def adapt(delta, first, numchars):
    """Compute the bias to use for the next delta.

    delta is the delta just processed, first is true only for the very
    first delta, and numchars is the divisor for the proportional
    increase applied to the scaled delta.
    """
    # damp = 700 applies to the first delta only; later ones are halved.
    scaled = delta // (700 if first else 2)
    scaled += scaled // numchars
    bias = 0
    # ((base - tmin) * tmax) // 2 == 455
    while scaled > 455:
        scaled //= 35  # base - tmin
        bias += 36
    # 38 is the skew parameter.
    return bias + (36 * scaled) // (scaled + 38)
|
106 |
|
107 |
|
def generate_integers(baselen, deltas):
    """3.4 Bias adaptation

    Encode every delta as a generalized variable-length integer and
    return the digits concatenated into one string. The bias starts at
    72 and is re-adapted after each delta; baselen is the number of
    basic characters already present.
    """
    # Punycode parameters: initial bias = 72, damp = 700, skew = 38
    pieces = []
    bias = 72
    first = True
    numchars = baselen
    for delta in deltas:
        numchars += 1
        pieces += generate_generalized_integer(delta, bias)
        bias = adapt(delta, first, numchars)
        first = False
    return "".join(pieces)
|
118 |
|
def punycode_encode(text):
    """Encode text with Punycode.

    The basic characters are emitted first, followed by "-" and the
    encoded deltas for the extended characters. When text contains no
    basic characters, only the encoded deltas are returned.
    """
    # segregate() already returns the basic part ASCII-encoded, so it is
    # used as-is rather than being encoded a second time.
    base, extended = segregate(text)
    deltas = insertion_unsort(text, extended)
    extended = generate_integers(len(base), deltas)
    if base:
        return base + "-" + extended
    return extended
|
127 |
|
128 ##################### Decoding ##################################### |
|
129 |
|
def decode_generalized_number(extended, extpos, bias, errors):
    """3.3 Generalized variable-length integers

    Decode one integer from extended, starting at index extpos.

    Only the characters A-Z (digit values 0-25) and 0-9 (digit values
    26-35) are accepted; lower-case letters are not recognised here.

    Returns a pair (next_position, value). If the input ends early or
    contains an invalid character and errors is not "strict", the value
    is None. With errors == "strict" a UnicodeError is raised instead.
    """
    result = 0
    w = 1
    j = 0
    while 1:
        try:
            char = ord(extended[extpos])
        except IndexError:
            if errors == "strict":
                raise UnicodeError("incomplete punicode string")
            return extpos + 1, None
        extpos += 1
        if 0x41 <= char <= 0x5A: # A-Z
            digit = char - 0x41
        elif 0x30 <= char <= 0x39:
            digit = char - 22 # 0x30-26
        elif errors == "strict":
            # extpos has already been advanced past the offending
            # character, so it lives at extpos - 1. Indexing extpos would
            # name the wrong character or run off the end of the string.
            raise UnicodeError("Invalid extended code point '%s'"
                               % extended[extpos - 1])
        else:
            return extpos, None
        t = T(j, bias)
        result += digit * w
        if digit < t:
            # A digit below the threshold terminates the number.
            return extpos, result
        w = w * (36 - t)
        j += 1
|
158 |
|
159 |
|
def insertion_sort(base, extended, errors):
    """3.2 Insertion unsort coding

    Rebuild the decoded string by inserting characters into base, driven
    by the deltas decoded from extended.

    Returns the resulting string. If a delta cannot be decoded (only
    possible when errors is not "strict"), the string built so far is
    returned. Raises UnicodeError in strict mode when a decoded code
    point exceeds 0x10FFFF; otherwise '?' is substituted.
    """
    char = 0x80
    pos = -1
    bias = 72
    extpos = 0
    while extpos < len(extended):
        newpos, delta = decode_generalized_number(extended, extpos,
                                                  bias, errors)
        if delta is None:
            # There was an error in decoding. We can't continue because
            # synchronization is lost.
            return base
        pos += delta+1
        # Each full pass over the insertion slots moves on to the next
        # code point.
        char += pos // (len(base) + 1)
        if char > 0x10FFFF:
            if errors == "strict":
                raise UnicodeError("Invalid character U+%x" % char)
            char = ord('?')
        pos = pos % (len(base) + 1)
        base = base[:pos] + unichr(char) + base[pos:]
        bias = adapt(delta, (extpos == 0), len(base))
        extpos = newpos
    return base
|
184 |
|
def punycode_decode(text, errors):
    """Decode the Punycode string text.

    Everything before the last "-" is the basic part, decoded as ASCII
    under the given error policy. Everything after it (or the whole
    string when there is no "-") is upper-cased and handed to
    insertion_sort together with the decoded basic part.
    """
    cut = text.rfind("-")
    if cut == -1:
        base, extended = "", text
    else:
        base, extended = text[:cut], text[cut+1:]
    return insertion_sort(unicode(base, "ascii", errors),
                          extended.upper(), errors)
|
196 |
|
197 ### Codec APIs |
|
198 |
|
class Codec(codecs.Codec):
    """Stateless codec wrapping punycode_encode and punycode_decode."""

    def encode(self,input,errors='strict'):
        """Return (encoded, length of input); errors is not consulted."""
        res = punycode_encode(input)
        return res, len(input)

    def decode(self,input,errors='strict'):
        """Return (decoded, length of input).

        Raises UnicodeError when errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+errors)
        res = punycode_decode(input, errors)
        return res, len(input)
|
210 |
|
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that hands each input to punycode_encode."""

    def encode(self, input, final=False):
        """Return punycode_encode(input); final is accepted but unused."""
        encoded = punycode_encode(input)
        return encoded
|
214 |
|
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that hands each input to punycode_decode."""

    def decode(self, input, final=False):
        """Return punycode_decode(input, self.errors); final is unused.

        Raises UnicodeError when self.errors is not one of 'strict',
        'replace' or 'ignore'.
        """
        if self.errors not in ('strict', 'replace', 'ignore'):
            raise UnicodeError("Unsupported error handling "+self.errors)
        return punycode_decode(input, self.errors)
|
220 |
|
class StreamWriter(Codec, codecs.StreamWriter):
    """Combines Codec with codecs.StreamWriter; defines nothing itself."""
|
223 |
|
class StreamReader(Codec, codecs.StreamReader):
    """Combines Codec with codecs.StreamReader; defines nothing itself."""
|
226 |
|
227 ### encodings module API |
|
228 |
|
def getregentry():
    """Return the codecs.CodecInfo record for the 'punycode' codec."""
    info = codecs.CodecInfo(
        name='punycode',
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return info