|
1 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) |
|
2 |
|
3 import stringprep, re, codecs |
|
4 from unicodedata import ucd_3_2_0 as unicodedata |
|
5 |
|
# IDNA section 3.1
# Label separators: matches any one of U+002E (full stop), U+3002
# (ideographic full stop), U+FF0E (fullwidth full stop) or U+FF61
# (halfwidth ideographic full stop).  Used to split a domain name into labels.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# ACE prefix that marks a Punycode-encoded label, as a byte string.
ace_prefix = "xn--"
# The same prefix as a unicode object, for testing unicode labels.
uace_prefix = unicode(ace_prefix, "ascii")
|
12 |
|
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) to a single unicode label.

    The label is processed in four stages: mapping (stringprep tables B.1
    and B.2), NFKC normalization using the Unicode 3.2.0 database,
    a check for prohibited characters (tables C.1.2, C.2.2 and C.3 to C.9)
    and the bidirectional-text check.

    Returns the prepared unicode label.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map: characters in table B.1 are mapped to nothing, everything else
    # goes through the B.2 case-folding map.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  One flag per character, kept as a real list so that the
    # first and last entries can be inspected below.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests (once for the whole label, not once per RandAL char):
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
|
61 |
|
def ToASCII(label):
    """Convert one label to its ASCII form (the ToASCII operation of IDNA).

    A label that is already pure ASCII is returned as its ASCII byte
    string.  Otherwise the label is run through nameprep and, if it is
    still not ASCII, Punycode-encoded and prefixed with the ACE prefix.

    Raises UnicodeError if the resulting label is empty or 64 or more
    characters long, if nameprep rejects it, or if a non-ASCII label
    already starts with the ACE prefix.
    """
    # Step 1: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if not 0 < len(label) < 64:
            raise UnicodeError("label empty or too long")
        return label

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label = label.encode("ascii")
    except UnicodeError:
        pass
    else:
        # Skip to step 8.
        if not 0 < len(label) < 64:
            raise UnicodeError("label empty or too long")
        return label

    # Step 5: Check ACE prefix
    if label.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    label = ace_prefix + label.encode("punycode")

    # Step 8: Check size
    if not 0 < len(label) < 64:
        raise UnicodeError("label empty or too long")
    return label
|
104 |
|
def ToUnicode(label):
    """Convert one label to unicode (the ToUnicode operation of IDNA).

    A label without the ACE prefix is returned as a unicode object
    unchanged.  A label carrying the prefix is Punycode-decoded, and the
    decoded text is accepted only if ToASCII maps it back to the
    lower-cased input label.

    Raises UnicodeError if the label is still non-ASCII after nameprep or
    if the round-trip comparison fails.
    """
    # Step 1: Check for ASCII
    if isinstance(label, str):
        needs_nameprep = False
    else:
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_nameprep = True
        else:
            needs_nameprep = False

    if needs_nameprep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    # Step 5: Decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    reencoded = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # reencoded will already be in lower case.
    if label.lower() != reencoded:
        raise UnicodeError("IDNA does not round-trip", label, reencoded)

    # Step 8: return the result of step 5
    return decoded
|
143 |
|
144 ### Codec APIs |
|
145 |
|
class Codec(codecs.Codec):
    """IDNA codec operating on complete domain names."""

    def encode(self, input, errors='strict'):
        """Encode a domain name label by label with ToASCII.

        Returns a tuple of the ASCII name (labels joined with ".") and the
        length of the input.  A trailing dot in the input is kept.
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # The name ended in a dot: drop the empty final label and
            # re-attach the dot after encoding.
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a domain name label by label with ToUnicode.

        Returns a tuple of the unicode name (labels joined with ".") and
        the length of the input.  A trailing dot in the input is kept.
        Raises UnicodeError for any error handler other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for its
            # validating effect and the result is discarded.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + trailing_dot, len(input)
|
196 |
|
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode as many complete labels of *input* as possible.

        Returns a tuple (encoded, consumed): the ASCII byte string for the
        labels that were encoded and the number of input characters they
        cover.  Unless *final* is true, the last label is treated as
        possibly unfinished and is left unconsumed.

        Raises UnicodeError for any error handler other than 'strict', and
        whatever ToASCII raises for an invalid label.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Must be a byte string: a unicode '' here would silently promote
        # the joined result below to unicode.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator in front of this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
|
230 |
|
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode as many complete labels of *input* as possible.

        Returns a tuple (decoded, consumed): the unicode text for the
        labels that were decoded and the number of input characters they
        cover.  Unless *final* is true, the last label is treated as
        possibly unfinished and is left unconsumed.

        Raises UnicodeError for any error handler other than 'strict', and
        whatever ToUnicode raises for an invalid label.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for its
            # validating effect and the result is discarded.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = []
        size = 0
        for index, label in enumerate(labels):
            result.append(ToUnicode(label))
            # Count the separator by position, not by "size so far": an
            # empty leading label leaves size at 0 but its dot was still
            # consumed and is part of the joined result.
            if index:
                size += 1
            size += len(label)

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
|
270 |
|
class StreamWriter(Codec,codecs.StreamWriter):
    """Stream writer that encodes with the IDNA Codec."""
|
273 |
|
class StreamReader(Codec,codecs.StreamReader):
    """Stream reader that decodes with the IDNA Codec."""
|
276 |
|
277 ### encodings module API |
|
278 |
|
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    codec = Codec()
    return codecs.CodecInfo(
        name='idna',
        encode=codec.encode,
        decode=codec.decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )