|
1 """Parse (absolute and relative) URLs. |
|
2 |
|
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, |
|
4 UC Irvine, June 1995. |
|
5 """ |
|
6 |
|
# Public API of this module.
__all__ = [
    "urlparse", "urlunparse",
    "urljoin", "urldefrag",
    "urlsplit", "urlunsplit",
    "parse_qs", "parse_qsl",
]
|
9 |
|
# A classification of schemes ('' means apply by default).
# Schemes that may appear in relative URL references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file',
                 'https', 'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '',
                 'sftp']
# Schemes whose URLs carry a //<netloc> component.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais',
               'file', 'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp',
               'rtspu', 'rsync', '', 'svn', 'svn+ssh', 'sftp']
# Schemes with no hierarchical path structure.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'imap', 'snews', 'sip', 'sips']
# Schemes that support ;parameters after the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp',
               'rtsp', 'rtspu', 'sip', 'sips', 'mms', '', 'sftp']
# Schemes that support ?query strings.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher',
              'rtsp', 'rtspu', 'sip', 'sips', '']
# Schemes that support #fragment identifiers.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews', 'file', 'prospero', '']
|
28 |
|
# Characters valid in scheme names: ASCII letters, digits, and "+-."
# (per RFC 3986 section 3.1).
scheme_chars = ('abcdefghijklmnopqrstuvwxyz' +
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                '0123456789' +
                '+-.')
|
34 |
|
# Bound on the urlsplit() memoization cache; when reached, urlsplit()
# empties the whole cache (via clear_cache) rather than evicting entries.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) -> SplitResult.
_parse_cache = {}
|
37 |
|
def clear_cache():
    """Clear the parse cache."""
    # _parse_cache memoizes urlsplit() results; urlsplit() calls this
    # when the cache grows past MAX_CACHE_SIZE.
    _parse_cache.clear()
|
41 |
|
42 |
|
class ResultMixin(object):
    """Shared accessors for the parsed result objects.

    Each property decomposes self.netloc, which has the general form
    [<username>[:<password>]@]<host>[:<port>].
    """

    @property
    def username(self):
        """User name from the netloc's userinfo, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            # rsplit: the userinfo may itself contain '@'; the host cannot.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        """Password from the netloc's userinfo, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Host part of the netloc, lowercased, or None if empty."""
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            netloc = netloc.split(":", 1)[0]
        return netloc.lower() or None

    @property
    def port(self):
        """Port number as an int, or None if absent or empty.

        Raises ValueError if a port is present but not numeric.
        """
        netloc = self.netloc
        if "@" in netloc:
            netloc = netloc.rsplit("@", 1)[1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            # Bug fix: a bare trailing colon ("host:") used to raise
            # ValueError from int(''); treat it as no port instead.
            if port:
                return int(port, 10)
        return None
|
83 |
|
84 from collections import namedtuple |
|
85 |
|
class SplitResult(namedtuple('SplitResult',
                             ['scheme', 'netloc', 'path', 'query', 'fragment']),
                  ResultMixin):
    """5-tuple result of urlsplit(), with named fields plus the
    username/password/hostname/port accessors from ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this split result into a single URL string."""
        return urlunsplit(self)
|
92 |
|
93 |
|
class ParseResult(namedtuple('ParseResult',
                             ['scheme', 'netloc', 'path', 'params', 'query',
                              'fragment']),
                  ResultMixin):
    """6-tuple result of urlparse(), with named fields plus the
    username/password/hostname/port accessors from ResultMixin."""

    __slots__ = ()

    def geturl(self):
        """Reassemble this parse result into a single URL string."""
        return urlunparse(self)
|
100 |
|
101 |
|
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Renamed the local result (previously 'tuple', which shadowed the
    # builtin of the same name).
    split = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split
    # Only split ;params off the path for schemes that actually use them.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
|
115 |
|
116 def _splitparams(url): |
|
117 if '/' in url: |
|
118 i = url.find(';', url.rfind('/')) |
|
119 if i < 0: |
|
120 return url, '' |
|
121 else: |
|
122 i = url.find(';') |
|
123 return url[:i], url[i+1:] |
|
124 |
|
125 def _splitnetloc(url, start=0): |
|
126 delim = len(url) # position of end of domain part of url, default is end |
|
127 for c in '/?#': # look for delimiters; the order is NOT important |
|
128 wdelim = url.find(c, start) # find first of this delim |
|
129 if wdelim >= 0: # if found |
|
130 delim = min(delim, wdelim) # use earliest delim position |
|
131 return url[start:delim], url[delim:] # return (domain, rest) |
|
132 |
|
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Cache key includes the argument types so str/unicode inputs don't
    # collide on equal-comparing keys.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # Bug fix (matches later CPython): make sure "url" is not
            # actually a port number, as in "www.python.org:80" -- there
            # the scheme candidate is really part of the path.
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
|
175 |
|
def urlunparse(data):
    """Put a parsed URL back together again. This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Re-attach the ;params to the path before reassembly.
        url = url + ';' + params
    return urlunsplit((scheme, netloc, url, query, fragment))
|
185 |
|
def urlunsplit(data):
    """Combine a 5-tuple as returned by urlsplit() back into a URL."""
    scheme, netloc, url, query, fragment = data
    # Emit '//<netloc>' when there is one, or when the scheme expects a
    # netloc and the path doesn't already begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and not url.startswith('//')):
        if url and not url.startswith('/'):
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url += '?' + query
    if fragment:
        url += '#' + fragment
    return url
|
198 |
|
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or an empty url yields the other as-is.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that never uses relative references,
    # means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # An explicit netloc already makes url absolute.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # An absolute path replaces the base path entirely.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path in url: inherit the base path, and fall back to the
        # base params/query only when url supplies none of its own.
        path = bpath
        if not params:
            params = bparams
        else:
            # NOTE(review): drops the last character of the inherited base
            # path when url carries its own params -- looks odd, but this
            # is the long-standing behavior; confirm before changing.
            path = path[:-1]
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: resolve against the directory of the base path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse one "<segment>/.." pair per pass, restarting the scan after
    # each removal, until no collapsible pair remains.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # A ".." that could not be collapsed is turned into an empty segment
    # (producing a trailing '/'), rather than being resolved above root.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
|
255 |
|
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    # Fast path: nothing to strip.
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return defrag, frag
|
269 |
|
270 # unquote method for parse_qs and parse_qsl |
|
271 # Cannot use directly from urllib as it would create circular reference. |
|
272 # urllib uses urlparse methods ( urljoin) |
|
273 |
|
274 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) |
|
275 _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) |
|
276 |
|
def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    pieces = s.split('%')
    # The first piece precedes any '%'; each later piece starts with the
    # two hex digits of an escape (when well-formed).
    out = [pieces[0]]
    for piece in pieces[1:]:
        try:
            out.append(_hextochr[piece[:2]] + piece[2:])
        except KeyError:
            # Malformed escape: keep the '%' literally.
            out.append('%' + piece)
        except UnicodeDecodeError:
            # Unicode input whose decoded byte isn't representable: build
            # the character directly from the hex value.
            out.append(unichr(int(piece[:2], 16)) + piece[2:])
    return "".join(out)
|
289 |
|
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    Returns a dict mapping each field name to a list of its values.
    """
    # Renamed the accumulator (previously 'dict', which shadowed the
    # builtin of the same name).
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        result.setdefault(name, []).append(value)
    return result
|
315 |
|
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    # Pairs may be separated by either '&' or ';'.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Call form of raise (valid on Python 2 and 3), replacing
                # the legacy "raise ValueError, msg" statement form.
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space in query strings; decode it before
            # expanding %XX escapes.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))

    return r
|
355 |
|
356 |
|
# Self-test data: the first non-blank line is the base URL; every later
# non-blank line is "<relative-url> = <expected-absolute-url>" (the RFC 1808
# relative-resolution examples), consumed by test() below.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
|
390 |
|
def test():
    """Run the module self-test: parse each input URL and check urljoin()
    against the expected result.

    Reads test lines from the file named in sys.argv[1] ('-' for stdin)
    when given, otherwise from the built-in test_input data.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        # cStringIO is a faster drop-in for StringIO when available.
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            # The first URL in the data becomes the base for the rest.
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        # Lines of the form "<url> = <expected>" are checked explicitly.
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'
|
421 |
|
# Run the self-test when executed as a script.
if __name__ == '__main__':
    test()