|
1 """Parse (absolute and relative) URLs. |
|
2 |
|
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, |
|
4 UC Irvine, June 1995. |
|
5 """ |
|
6 |
|
# Names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]
|
9 |
|
# Scheme classification tables.  The empty string '' stands for "no
# scheme given", i.e. the behaviour applied by default.  These are plain
# lists (and their order is kept) so that callers can inspect or extend
# them.

# Schemes for which urljoin() resolves a reference against the base URL.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp',
]

# Schemes without a hierarchical path (not consulted by the functions
# visible in this file; kept for callers).
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]

# Schemes whose path may carry ';params' (checked by urlparse()).
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp',
]

# Schemes whose URLs may carry a '?query' (checked by urlsplit()).
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher', 'rtsp',
    'rtspu', 'sip', 'sips', '',
]

# Schemes whose URLs may carry a '#fragment' (checked by urlsplit()).
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]

# Every character that may appear in a scheme name: letters, digits and
# the three punctuation characters '+', '-' and '.'.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789' '+-.'
)
|
34 |
|
# Once the cache holds this many entries, urlsplit() empties it before
# storing another result.
MAX_CACHE_SIZE = 20

# Memo of urlsplit() results, keyed by
# (url, scheme, allow_fragments, type(url), type(scheme)).
_parse_cache = {}


def clear_cache():
    """Clear the parse cache.

    The dictionary is emptied in place rather than rebound to a fresh
    object, so any reference to ``_parse_cache`` that other code already
    holds keeps pointing at the live cache instead of a stale dict.
    """
    _parse_cache.clear()
|
42 |
|
43 |
|
class BaseResult(tuple):
    """Base class for the parsed result objects.

    This provides the attributes shared by the two derived result
    objects as read-only properties.  The derived classes are
    responsible for checking the right number of arguments were
    supplied to the constructor.

    The instance is a plain tuple: scheme, netloc and path sit at
    indexes 0-2, while query and fragment are always the last two items,
    which is why they are read with negative indexes.
    """

    __slots__ = ()

    # Attributes that access the basic components of the URL:

    @property
    def scheme(self):
        return self[0]

    @property
    def netloc(self):
        return self[1]

    @property
    def path(self):
        return self[2]

    @property
    def query(self):
        return self[-2]

    @property
    def fragment(self):
        return self[-1]

    # Helpers that split the netloc into its parts:

    def _userinfo(self):
        """Return the text before the last '@' of netloc, or None."""
        netloc = self.netloc
        if "@" not in netloc:
            return None
        # Split on the LAST '@': the host part cannot contain one, so
        # any earlier '@' belongs to the user name or password.
        return netloc.rsplit("@", 1)[0]

    def _hostport(self):
        """Return (host, port) as strings; port is None without ':port'."""
        hostinfo = self.netloc.rsplit("@", 1)[-1]
        if hostinfo[:1] == "[" and "]" in hostinfo:
            # Bracketed IPv6 literal: the colons inside the brackets
            # are part of the address, not a port separator.
            end = hostinfo.index("]")
            host, rest = hostinfo[1:end], hostinfo[end + 1:]
            if rest[:1] == ":":
                return host, rest[1:]
            return host, None
        if ":" in hostinfo:
            host, port = hostinfo.split(":", 1)
            return host, port
        return hostinfo, None

    # Additional attributes that provide access to parsed-out portions
    # of the netloc:

    @property
    def username(self):
        """User name from the netloc, or None when there is no '@'."""
        userinfo = self._userinfo()
        if userinfo is None:
            return None
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        """Password from the netloc, or None when none is given."""
        userinfo = self._userinfo()
        if userinfo is not None and ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        """Lower-cased host name (without brackets), or None if empty."""
        return self._hostport()[0].lower() or None

    @property
    def port(self):
        """Port number as an int, or None when absent or empty.

        Raises ValueError when the port text is not a decimal number.
        """
        port = self._hostport()[1]
        if port:
            return int(port, 10)
        # Either no ':' at all or an empty port as in 'host:'.
        return None
|
118 |
|
119 |
|
class SplitResult(BaseResult):
    """5-tuple result: (scheme, netloc, path, query, fragment)."""

    __slots__ = ()

    def __new__(cls, scheme, netloc, path, query, fragment):
        # Naming every component in the signature makes a wrong number
        # of arguments fail here, before the tuple is built.
        components = (scheme, netloc, path, query, fragment)
        return BaseResult.__new__(cls, components)

    def geturl(self):
        """Reassemble the components into a URL string."""
        return urlunsplit(self)
|
130 |
|
131 |
|
class ParseResult(BaseResult):
    """6-tuple result: (scheme, netloc, path, params, query, fragment)."""

    __slots__ = ()

    def __new__(cls, scheme, netloc, path, params, query, fragment):
        # Naming every component in the signature makes a wrong number
        # of arguments fail here, before the tuple is built.
        components = (scheme, netloc, path, params, query, fragment)
        return BaseResult.__new__(cls, components)

    @property
    def params(self):
        # Only the 6-tuple form has this slot, right after the path.
        return self[3]

    def geturl(self):
        """Reassemble the components into a URL string."""
        return urlunparse(self)
|
146 |
|
147 |
|
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # urlsplit() does the real work; only the ';params' part of the path
    # still has to be separated here.
    scheme, netloc, path, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)
    return ParseResult(scheme, netloc, path, params, query, fragment)
|
161 |
|
def _splitparams(url):
    """Split ';params' off a path and return (path, params).

    Only a ';' in the last path segment (after the final '/') starts the
    parameters.  When there is no such ';' the path is returned
    unchanged together with an empty params string.
    """
    if '/' in url:
        i = url.find(';', url.rfind('/'))
    else:
        i = url.find(';')
    if i < 0:
        # Without this guard a path with neither '/' nor ';' would be
        # sliced with i == -1 and lose its last character.
        return url, ''
    return url[:i], url[i+1:]
|
170 |
|
def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, the netloc beginning at *start*.

    The netloc runs up to the first '/', '?' or '#' found at or after
    *start*; if none is present it runs to the end of the string.
    """
    hits = [url.find(mark, start) for mark in '/?#']
    # Unmatched delimiters report -1; the string length is the fallback.
    end = min([len(url)] + [pos for pos in hits if pos >= 0])
    return url[start:end], url[end:]
|
178 |
|
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The argument types are part of the key so that equal-comparing
    # values of different types do not share a cache entry.
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: drop everything once the cache is full.
        clear_cache()

    netloc = query = fragment = ''
    # A literal 'http' prefix skips every table lookup below.
    is_http = False
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        is_http = prefix == 'http'
        # The prefix is a scheme only if all its characters are legal;
        # otherwise the default scheme and the whole url are kept.
        if is_http or not [ch for ch in prefix if ch not in scheme_chars]:
            scheme, url = prefix.lower(), url[colon + 1:]

    if (is_http or scheme in uses_netloc) and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if (allow_fragments and (is_http or scheme in uses_fragment)
            and '#' in url):
        url, fragment = url.split('#', 1)
    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
|
221 |
|
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *data* is any 6-item iterable: (scheme, netloc, url, params, query,
    fragment).  It is unpacked in the body instead of in the signature
    because tuple parameters are not accepted by newer Python versions;
    existing calls that pass a single 6-tuple work unchanged.
    """
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
|
230 |
|
def urlunsplit(data):
    """Combine the 5 components of a split URL into a URL string.

    *data* is any 5-item iterable: (scheme, netloc, url, query,
    fragment).  It is unpacked in the body instead of in the signature
    because tuple parameters are not accepted by newer Python versions;
    existing calls that pass a single 5-tuple work unchanged.

    Empty components are left out together with their delimiter, so the
    result can differ from the text that was originally parsed.
    """
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A path that follows a netloc has to start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
|
242 |
|
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *url* is also returned unchanged when its scheme differs from the
    base's or is not listed in uses_relative.  *allow_fragments* is
    passed through to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # The reference is parsed with the base scheme as its default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing is inherited.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Empty path: the whole base path is inherited, and the base
        # query too unless the reference has its own ('?y' resolves
        # against .../d as .../d?y, not .../?y).
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))
    # Relative path: replace the last segment of the base path with the
    # reference, then resolve the '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel one "<name>/.." pair, rescanning from the start
    # after every removal, until none is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # The loop above never inspects the final segment, so a trailing
    # '..' is handled separately here.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
|
290 |
|
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' not in url:
        # Nothing to strip: hand the input back untouched.
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
|
304 |
|
305 |
|
# Built-in data for test(), read one line at a time and split on
# whitespace.  The first non-blank line is the base URL.  Every other
# line has the form "<reference> = <URL:expected>": the reference is
# joined to the base and the result, wrapped as <URL:...>, is compared
# with the expected text.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
|
339 |
|
def test():
    """Run the urljoin() self-test and print the results.

    Input comes from the file named by the first command-line argument
    ('-' means standard input) or, without an argument, from the
    built-in test_input.  The first URL read becomes the base; every
    later one is joined to it.  Each URL is printed both parsed and
    joined, and a line starting with EXPECTED is printed when the input
    line carries an expected value that differs from the joined result.
    """
    import sys
    base = ''
    opened = None  # file object this function must close itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                # Neither module exists on newer Python versions.
                from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # A single parenthesised argument prints identically
            # whether print is a statement or a function.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # sys.stdin and the in-memory buffer are left alone.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()