|
1 """An extensible library for opening URLs using a variety of protocols |
|
2 |
|
3 The simplest way to use this module is to call the urlopen function, |
|
4 which accepts a string containing a URL or a Request object (described |
|
5 below). It opens the URL and returns the result as a file-like |
|
6 object; the returned object has some extra methods described below. |
|
7 |
|
8 The OpenerDirector manages a collection of Handler objects that do |
|
9 all the actual work. Each Handler implements a particular protocol or |
|
10 option. The OpenerDirector is a composite object that invokes the |
|
11 Handlers needed to open the requested URL. For example, the |
|
12 HTTPHandler performs HTTP GET and POST requests and deals with |
|
13 non-error returns. The HTTPRedirectHandler automatically deals with |
|
14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler |
|
15 deals with digest authentication. |
|
16 |
|
17 urlopen(url, data=None) -- Basic usage is the same as original |
|
18 urllib. Pass the url and optionally data to post to an HTTP URL, and |
|
19 get a file-like object back. One difference is that you can also pass |
|
20 a Request instance instead of a URL. Raises a URLError (a subclass of |
|
21 IOError); for HTTP errors, raises an HTTPError, which can also be |
|
22 treated as a valid response. |
|
23 |
|
24 build_opener -- Function that creates a new OpenerDirector instance. |
|
25 Will install the default handlers. Accepts one or more Handlers as |
|
26 arguments, either instances or Handler classes that it will |
|
27 instantiate. If one of the arguments is a subclass of a default |

28 handler, it will be installed instead of the default. |
|
29 |
|
30 install_opener -- Installs a new opener as the default opener. |
|
31 |
|
32 objects of interest: |
|
33 OpenerDirector -- Manages the collection of Handlers that do the actual work. |
|
34 |
|
35 Request -- An object that encapsulates the state of a request. The |
|
36 state can be as simple as the URL. It can also include extra HTTP |
|
37 headers, e.g. a User-Agent. |
|
38 |
|
39 BaseHandler -- Parent class of all Handlers; subclasses implement individual protocols or features. |
|
40 |
|
41 exceptions: |
|
42 URLError -- A subclass of IOError; individual protocols have their |

43 own specific subclasses. |
|
44 |
|
45 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error |
|
46 as an exceptional event or as a valid response. |
|
47 |
|
48 internals: |
|
49 BaseHandler and parent |
|
50 _call_chain conventions |
|
51 |
|
52 Example usage: |
|
53 |
|
54 import urllib2 |
|
55 |
|
56 # set up authentication info |
|
57 authinfo = urllib2.HTTPBasicAuthHandler() |
|
58 authinfo.add_password(realm='PDQ Application', |
|
59 uri='https://mahler:8092/site-updates.py', |
|
60 user='klem', |
|
61 passwd='geheim$parole') |
|
62 |
|
63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"}) |
|
64 |
|
65 # build a new opener that adds authentication and caching FTP handlers |
|
66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler) |
|
67 |
|
68 # install it |
|
69 urllib2.install_opener(opener) |
|
70 |
|
71 f = urllib2.urlopen('http://www.python.org/') |
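
# HTTPError doubles as a response: it can be caught like URLError or
# read like a normal result (an illustrative sketch; the URL below is
# just a placeholder)
try:
    f = urllib2.urlopen('http://www.python.org/this-does-not-exist')
except urllib2.HTTPError, e:
    print 'server replied with an error:', e.code
    print e.read()[:80]
except urllib2.URLError, e:
    print 'failed to reach the server:', e.reason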
|
72 |
|
73 |
|
74 """ |
|
75 |
|
76 # XXX issues: |
|
77 # If an authentication error handler tries to perform authentication |

78 # but fails, how should the error be signalled?  The client needs to |

79 # know the HTTP error code.  But if the handler knows what the problem |

80 # was, e.g., that it didn't know the hash algorithm requested in the |

81 # challenge, it would be good to pass that information along to the |

82 # client, too. |
|
83 # ftp errors aren't handled cleanly |
|
84 # check digest against correct (i.e. non-apache) implementation |
|
85 |
|
86 # Possible extensions: |
|
87 # complex proxies XXX not sure what exactly was meant by this |
|
88 # abstract factory for opener |
|
89 |
|
90 import base64 |
|
91 import hashlib |
|
92 import httplib |
|
93 import mimetools |
|
94 import os |
|
95 import posixpath |
|
96 import random |
|
97 import re |
|
98 import socket |
|
99 import sys |
|
100 import time |
|
101 import urlparse |
|
102 import bisect |
|
103 |
|
104 try: |
|
105 from cStringIO import StringIO |
|
106 except ImportError: |
|
107 from StringIO import StringIO |
|
108 |
|
109 from urllib import (unwrap, unquote, splittype, splithost, quote, |
|
110 addinfourl, splitport, |
|
111 splitattr, ftpwrapper, splituser, splitpasswd, splitvalue) |
|
112 |
|
113 # support for FileHandler, proxies via environment variables |
|
114 from urllib import localhost, url2pathname, getproxies |
|
115 |
|
116 # used in the User-Agent header sent to servers |
|
117 __version__ = sys.version[:3] |
|
118 |
|
119 _opener = None |
|
120 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): |
|
121 global _opener |
|
122 if _opener is None: |
|
123 _opener = build_opener() |
|
124 return _opener.open(url, data, timeout) |
|
125 |
|
126 def install_opener(opener): |
|
127 global _opener |
|
128 _opener = opener |
|
129 |
|
130 # do these error classes make sense? |
|
131 # make sure all of the IOError stuff is overridden.  We just want |

132 # these classes to be subtypes of IOError. |
|
133 |
|
134 class URLError(IOError): |
|
135 # URLError is a sub-type of IOError, but it doesn't share any of |
|
136 # the implementation. need to override __init__ and __str__. |
|
137 # It sets self.args for compatibility with other EnvironmentError |
|
138 # subclasses, but args doesn't have the typical format with errno in |
|
139 # slot 0 and strerror in slot 1. This may be better than nothing. |
|
140 def __init__(self, reason): |
|
141 self.args = reason, |
|
142 self.reason = reason |
|
143 |
|
144 def __str__(self): |
|
145 return '<urlopen error %s>' % self.reason |
|
146 |
|
147 class HTTPError(URLError, addinfourl): |
|
148 """Raised when HTTP error occurs, but also acts like non-error return""" |
|
149 __super_init = addinfourl.__init__ |
|
150 |
|
151 def __init__(self, url, code, msg, hdrs, fp): |
|
152 self.code = code |
|
153 self.msg = msg |
|
154 self.hdrs = hdrs |
|
155 self.fp = fp |
|
156 self.filename = url |
|
157 # The addinfourl classes depend on fp being a valid file |
|
158 # object. In some cases, the HTTPError may not have a valid |
|
159 # file object. If this happens, the simplest workaround is to |
|
160 # not initialize the base classes. |
|
161 if fp is not None: |
|
162 self.__super_init(fp, hdrs, url, code) |
|
163 |
|
164 def __str__(self): |
|
165 return 'HTTP Error %s: %s' % (self.code, self.msg) |
|
166 |
|
167 # copied from cookielib.py |
|
168 _cut_port_re = re.compile(r":\d+$") |
|
169 def request_host(request): |
|
170 """Return request-host, as defined by RFC 2965. |
|
171 |
|
172 Variation from RFC: returned value is lowercased, for convenient |
|
173 comparison. |
|
174 |
|
175 """ |
|
176 url = request.get_full_url() |
|
177 host = urlparse.urlparse(url)[1] |
|
178 if host == "": |
|
179 host = request.get_header("Host", "") |
|
180 |
|
181 # remove port, if present |
|
182 host = _cut_port_re.sub("", host, 1) |
|
183 return host.lower() |
|
184 |
|
185 class Request: |
|
186 |
|
187 def __init__(self, url, data=None, headers={}, |
|
188 origin_req_host=None, unverifiable=False): |
|
189 # unwrap('<URL:type://host/path>') --> 'type://host/path' |
|
190 self.__original = unwrap(url) |
|
191 self.type = None |
|
192 # self.__r_type is what's left after doing the splittype |
|
193 self.host = None |
|
194 self.port = None |
|
195 self.data = data |
|
196 self.headers = {} |
|
197 for key, value in headers.items(): |
|
198 self.add_header(key, value) |
|
199 self.unredirected_hdrs = {} |
|
200 if origin_req_host is None: |
|
201 origin_req_host = request_host(self) |
|
202 self.origin_req_host = origin_req_host |
|
203 self.unverifiable = unverifiable |
|
204 |
|
205 def __getattr__(self, attr): |
|
206 # XXX this is a fallback mechanism to guard against these |
|
207 # methods getting called in a non-standard order. this may be |
|
208 # too complicated and/or unnecessary. |
|
209 # XXX should the __r_XXX attributes be public? |
|
210 if attr[:12] == '_Request__r_': |
|
211 name = attr[12:] |
|
212 if hasattr(Request, 'get_' + name): |
|
213 getattr(self, 'get_' + name)() |
|
214 return getattr(self, attr) |
|
215 raise AttributeError, attr |
|
216 |
|
217 def get_method(self): |
|
218 if self.has_data(): |
|
219 return "POST" |
|
220 else: |
|
221 return "GET" |
|
222 |
|
223 # XXX these helper methods are lame |
|
224 |
|
225 def add_data(self, data): |
|
226 self.data = data |
|
227 |
|
228 def has_data(self): |
|
229 return self.data is not None |
|
230 |
|
231 def get_data(self): |
|
232 return self.data |
|
233 |
|
234 def get_full_url(self): |
|
235 return self.__original |
|
236 |
|
237 def get_type(self): |
|
238 if self.type is None: |
|
239 self.type, self.__r_type = splittype(self.__original) |
|
240 if self.type is None: |
|
241 raise ValueError, "unknown url type: %s" % self.__original |
|
242 return self.type |
|
243 |
|
244 def get_host(self): |
|
245 if self.host is None: |
|
246 self.host, self.__r_host = splithost(self.__r_type) |
|
247 if self.host: |
|
248 self.host = unquote(self.host) |
|
249 return self.host |
|
250 |
|
251 def get_selector(self): |
|
252 return self.__r_host |
|
253 |
|
254 def set_proxy(self, host, type): |
|
255 self.host, self.type = host, type |
|
256 self.__r_host = self.__original |
|
257 |
|
258 def has_proxy(self): |
|
259 return self.__r_host == self.__original |
|
260 |
|
261 def get_origin_req_host(self): |
|
262 return self.origin_req_host |
|
263 |
|
264 def is_unverifiable(self): |
|
265 return self.unverifiable |
|
266 |
|
267 def add_header(self, key, val): |
|
268 # useful for something like authentication |
|
269 self.headers[key.capitalize()] = val |
|
270 |
|
271 def add_unredirected_header(self, key, val): |
|
272 # will not be added to a redirected request |
|
273 self.unredirected_hdrs[key.capitalize()] = val |
|
274 |
|
275 def has_header(self, header_name): |
|
276 return (header_name in self.headers or |
|
277 header_name in self.unredirected_hdrs) |
|
278 |
|
279 def get_header(self, header_name, default=None): |
|
280 return self.headers.get( |
|
281 header_name, |
|
282 self.unredirected_hdrs.get(header_name, default)) |
|
283 |
|
284 def header_items(self): |
|
285 hdrs = self.unredirected_hdrs.copy() |
|
286 hdrs.update(self.headers) |
|
287 return hdrs.items() |
|
288 |
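# Illustrative sketch (not part of the module API): how a Request is
# typically constructed and inspected.  The URL, header values and data
# below are placeholders.
def _example_request():
    req = Request('http://www.example.com/cgi-bin/query',
                  data='q=python',               # data present, so get_method() -> 'POST'
                  headers={'User-Agent': 'example/0.1'})
    req.add_unredirected_header('X-Debug', '1')  # not copied onto redirected requests
    return req.get_method(), req.get_type(), req.get_host(), req.header_items()
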
|
289 class OpenerDirector: |
|
290 def __init__(self): |
|
291 client_version = "Python-urllib/%s" % __version__ |
|
292 self.addheaders = [('User-agent', client_version)] |
|
293 # manage the individual handlers |
|
294 self.handlers = [] |
|
295 self.handle_open = {} |
|
296 self.handle_error = {} |
|
297 self.process_response = {} |
|
298 self.process_request = {} |
|
299 |
|
300 def add_handler(self, handler): |
|
301 if not hasattr(handler, "add_parent"): |
|
302 raise TypeError("expected BaseHandler instance, got %r" % |
|
303 type(handler)) |
|
304 |
|
305 added = False |
|
306 for meth in dir(handler): |
|
307 if meth in ["redirect_request", "do_open", "proxy_open"]: |
|
308 # oops, coincidental match |
|
309 continue |
|
310 |
|
311 i = meth.find("_") |
|
312 protocol = meth[:i] |
|
313 condition = meth[i+1:] |
|
314 |
|
315 if condition.startswith("error"): |
|
316 j = condition.find("_") + i + 1 |
|
317 kind = meth[j+1:] |
|
318 try: |
|
319 kind = int(kind) |
|
320 except ValueError: |
|
321 pass |
|
322 lookup = self.handle_error.get(protocol, {}) |
|
323 self.handle_error[protocol] = lookup |
|
324 elif condition == "open": |
|
325 kind = protocol |
|
326 lookup = self.handle_open |
|
327 elif condition == "response": |
|
328 kind = protocol |
|
329 lookup = self.process_response |
|
330 elif condition == "request": |
|
331 kind = protocol |
|
332 lookup = self.process_request |
|
333 else: |
|
334 continue |
|
335 |
|
336 handlers = lookup.setdefault(kind, []) |
|
337 if handlers: |
|
338 bisect.insort(handlers, handler) |
|
339 else: |
|
340 handlers.append(handler) |
|
341 added = True |
|
342 |
|
343 if added: |
|
344 # the handlers must work in a specific order; the order |

345 # is specified in the handler_order attribute of each Handler |
|
346 bisect.insort(self.handlers, handler) |
|
347 handler.add_parent(self) |
|
348 |
|
349 def close(self): |
|
350 # Only exists for backwards compatibility. |
|
351 pass |
|
352 |
|
353 def _call_chain(self, chain, kind, meth_name, *args): |
|
354 # Handlers raise an exception if no one else should try to handle |
|
355 # the request, or return None if they can't but another handler |
|
356 # could. Otherwise, they return the response. |
|
357 handlers = chain.get(kind, ()) |
|
358 for handler in handlers: |
|
359 func = getattr(handler, meth_name) |
|
360 |
|
361 result = func(*args) |
|
362 if result is not None: |
|
363 return result |
|
364 |
|
365 def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT): |
|
366 # accept a URL or a Request object |
|
367 if isinstance(fullurl, basestring): |
|
368 req = Request(fullurl, data) |
|
369 else: |
|
370 req = fullurl |
|
371 if data is not None: |
|
372 req.add_data(data) |
|
373 |
|
374 req.timeout = timeout |
|
375 protocol = req.get_type() |
|
376 |
|
377 # pre-process request |
|
378 meth_name = protocol+"_request" |
|
379 for processor in self.process_request.get(protocol, []): |
|
380 meth = getattr(processor, meth_name) |
|
381 req = meth(req) |
|
382 |
|
383 response = self._open(req, data) |
|
384 |
|
385 # post-process response |
|
386 meth_name = protocol+"_response" |
|
387 for processor in self.process_response.get(protocol, []): |
|
388 meth = getattr(processor, meth_name) |
|
389 response = meth(req, response) |
|
390 |
|
391 return response |
|
392 |
|
393 def _open(self, req, data=None): |
|
394 result = self._call_chain(self.handle_open, 'default', |
|
395 'default_open', req) |
|
396 if result: |
|
397 return result |
|
398 |
|
399 protocol = req.get_type() |
|
400 result = self._call_chain(self.handle_open, protocol, protocol + |
|
401 '_open', req) |
|
402 if result: |
|
403 return result |
|
404 |
|
405 return self._call_chain(self.handle_open, 'unknown', |
|
406 'unknown_open', req) |
|
407 |
|
408 def error(self, proto, *args): |
|
409 if proto in ('http', 'https'): |
|
410 # XXX http[s] protocols are special-cased |
|
411 dict = self.handle_error['http'] # https is no different from http |
|
412 proto = args[2] # YUCK! |
|
413 meth_name = 'http_error_%s' % proto |
|
414 http_err = 1 |
|
415 orig_args = args |
|
416 else: |
|
417 dict = self.handle_error |
|
418 meth_name = proto + '_error' |
|
419 http_err = 0 |
|
420 args = (dict, proto, meth_name) + args |
|
421 result = self._call_chain(*args) |
|
422 if result: |
|
423 return result |
|
424 |
|
425 if http_err: |
|
426 args = (dict, 'default', 'http_error_default') + orig_args |
|
427 return self._call_chain(*args) |
|
428 |
|
429 # XXX probably also want an abstract factory that knows when it makes |
|
430 # sense to skip a superclass in favor of a subclass and when it might |
|
431 # make sense to include both |
|
432 |
|
433 def build_opener(*handlers): |
|
434 """Create an opener object from a list of handlers. |
|
435 |
|
436 The opener will use several default handlers, including support |
|
437 for HTTP and FTP. |
|
438 |
|
439 If any of the handlers passed as arguments are subclasses of the |
|
440 default handlers, the default handlers will not be used. |
|
441 """ |
|
442 import types |
|
443 def isclass(obj): |
|
444 return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__") |
|
445 |
|
446 opener = OpenerDirector() |
|
447 default_classes = [ProxyHandler, UnknownHandler, HTTPHandler, |
|
448 HTTPDefaultErrorHandler, HTTPRedirectHandler, |
|
449 FTPHandler, FileHandler, HTTPErrorProcessor] |
|
450 if hasattr(httplib, 'HTTPS'): |
|
451 default_classes.append(HTTPSHandler) |
|
452 skip = set() |
|
453 for klass in default_classes: |
|
454 for check in handlers: |
|
455 if isclass(check): |
|
456 if issubclass(check, klass): |
|
457 skip.add(klass) |
|
458 elif isinstance(check, klass): |
|
459 skip.add(klass) |
|
460 for klass in skip: |
|
461 default_classes.remove(klass) |
|
462 |
|
463 for klass in default_classes: |
|
464 opener.add_handler(klass()) |
|
465 |
|
466 for h in handlers: |
|
467 if isclass(h): |
|
468 h = h() |
|
469 opener.add_handler(h) |
|
470 return opener |
|
471 |
|
472 class BaseHandler: |
|
473 handler_order = 500 |
|
474 |
|
475 def add_parent(self, parent): |
|
476 self.parent = parent |
|
477 |
|
478 def close(self): |
|
479 # Only exists for backwards compatibility |
|
480 pass |
|
481 |
|
482 def __lt__(self, other): |
|
483 if not hasattr(other, "handler_order"): |
|
484 # Try to preserve the old behavior of having custom classes |
|
485 # inserted after default ones (works only for custom user |
|
486 # classes which are not aware of handler_order). |
|
487 return True |
|
488 return self.handler_order < other.handler_order |
|
489 |
|
490 |
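# Illustrative sketch (not part of the module): a minimal handler showing the
# method-naming convention that OpenerDirector.add_handler dispatches on --
# <protocol>_request, <protocol>_response, <protocol>_open, http_error_<nnn> --
# and the handler_order attribute that places it in the chain.  The header
# name and value are placeholders.
class _ExampleLoggingHandler(BaseHandler):
    handler_order = 450        # sort before the default BaseHandler order of 500

    def http_request(self, req):
        # pre-process every outgoing http request
        req.add_unredirected_header('X-Example-Trace', 'on')
        return req

    def http_response(self, req, response):
        # post-process every http response; must return a response object
        return response

# It could be used like: opener = build_opener(_ExampleLoggingHandler())
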
|
491 class HTTPErrorProcessor(BaseHandler): |
|
492 """Process HTTP error responses.""" |
|
493 handler_order = 1000 # after all other processing |
|
494 |
|
495 def http_response(self, request, response): |
|
496 code, msg, hdrs = response.code, response.msg, response.info() |
|
497 |
|
498 # According to RFC 2616, "2xx" code indicates that the client's |
|
499 # request was successfully received, understood, and accepted. |
|
500 if not (200 <= code < 300): |
|
501 response = self.parent.error( |
|
502 'http', request, response, code, msg, hdrs) |
|
503 |
|
504 return response |
|
505 |
|
506 https_response = http_response |
|
507 |
|
508 class HTTPDefaultErrorHandler(BaseHandler): |
|
509 def http_error_default(self, req, fp, code, msg, hdrs): |
|
510 raise HTTPError(req.get_full_url(), code, msg, hdrs, fp) |
|
511 |
|
512 class HTTPRedirectHandler(BaseHandler): |
|
513 # maximum number of redirections to any single URL |
|
514 # this is needed because of the state that cookies introduce |
|
515 max_repeats = 4 |
|
516 # maximum total number of redirections (regardless of URL) before |
|
517 # assuming we're in a loop |
|
518 max_redirections = 10 |
|
519 |
|
520 def redirect_request(self, req, fp, code, msg, headers, newurl): |
|
521 """Return a Request or None in response to a redirect. |
|
522 |
|
523 This is called by the http_error_30x methods when a |
|
524 redirection response is received. If a redirection should |
|
525 take place, return a new Request to allow http_error_30x to |
|
526 perform the redirect. Otherwise, raise HTTPError if no-one |
|
527 else should try to handle this url. Return None if you can't |
|
528 but another Handler might. |
|
529 """ |
|
530 m = req.get_method() |
|
531 if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") |
|
532 or code in (301, 302, 303) and m == "POST"): |
|
533 # Strictly (according to RFC 2616), 301 or 302 in response |
|
534 # to a POST MUST NOT cause a redirection without confirmation |
|
535 # from the user (of urllib2, in this case). In practice, |
|
536 # essentially all clients do redirect in this case, so we |
|
537 # do the same. |
|
538 # be lenient with URIs containing a space |
|
539 newurl = newurl.replace(' ', '%20') |
|
540 newheaders = dict((k,v) for k,v in req.headers.items() |
|
541 if k.lower() not in ("content-length", "content-type") |
|
542 ) |
|
543 return Request(newurl, |
|
544 headers=newheaders, |
|
545 origin_req_host=req.get_origin_req_host(), |
|
546 unverifiable=True) |
|
547 else: |
|
548 raise HTTPError(req.get_full_url(), code, msg, headers, fp) |
|
549 |
|
550 # Implementation note: To avoid the server sending us into an |
|
551 # infinite loop, the request object needs to track what URLs we |
|
552 # have already seen. Do this by adding a handler-specific |
|
553 # attribute to the Request object. |
|
554 def http_error_302(self, req, fp, code, msg, headers): |
|
555 # Some servers (incorrectly) return multiple Location headers |
|
556 # (the same probably goes for URI).  Use the first header. |
|
557 if 'location' in headers: |
|
558 newurl = headers.getheaders('location')[0] |
|
559 elif 'uri' in headers: |
|
560 newurl = headers.getheaders('uri')[0] |
|
561 else: |
|
562 return |
|
563 |
|
564 # fix a possible malformed URL |
|
565 urlparts = urlparse.urlparse(newurl) |
|
566 if not urlparts.path: |
|
567 urlparts = list(urlparts) |
|
568 urlparts[2] = "/" |
|
569 newurl = urlparse.urlunparse(urlparts) |
|
570 |
|
571 newurl = urlparse.urljoin(req.get_full_url(), newurl) |
|
572 |
|
573 # XXX Probably want to forget about the state of the current |
|
574 # request, although that might interact poorly with other |
|
575 # handlers that also use handler-specific request attributes |
|
576 new = self.redirect_request(req, fp, code, msg, headers, newurl) |
|
577 if new is None: |
|
578 return |
|
579 |
|
580 # loop detection |
|
581 # .redirect_dict has a key url if url was previously visited. |
|
582 if hasattr(req, 'redirect_dict'): |
|
583 visited = new.redirect_dict = req.redirect_dict |
|
584 if (visited.get(newurl, 0) >= self.max_repeats or |
|
585 len(visited) >= self.max_redirections): |
|
586 raise HTTPError(req.get_full_url(), code, |
|
587 self.inf_msg + msg, headers, fp) |
|
588 else: |
|
589 visited = new.redirect_dict = req.redirect_dict = {} |
|
590 visited[newurl] = visited.get(newurl, 0) + 1 |
|
591 |
|
592 # Don't close the fp until we are sure that we won't use it |
|
593 # with HTTPError. |
|
594 fp.read() |
|
595 fp.close() |
|
596 |
|
597 return self.parent.open(new) |
|
598 |
|
599 http_error_301 = http_error_303 = http_error_307 = http_error_302 |
|
600 |
|
601 inf_msg = "The HTTP server returned a redirect error that would " \ |
|
602 "lead to an infinite loop.\n" \ |
|
603 "The last 30x error message was:\n" |
|
604 |
|
605 |
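# Illustrative sketch (not part of the module): redirect_request() may return
# a new Request, return None, or raise HTTPError.  A subclass that refuses to
# follow any redirect simply raises, turning every 30x into an HTTPError.
class _NoRedirectHandler(HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)

# Passing an instance to build_opener() replaces the default
# HTTPRedirectHandler, since the subclass check in build_opener() skips
# defaults that are superclasses of supplied handlers.
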
|
606 def _parse_proxy(proxy): |
|
607 """Return (scheme, user, password, host/port) given a URL or an authority. |
|
608 |
|
609 If a URL is supplied, it must have an authority (host:port) component. |
|
610 According to RFC 3986, having an authority component means the URL must |
|
611 have two slashes after the scheme: |
|
612 |
|
613 >>> _parse_proxy('file:/ftp.example.com/') |
|
614 Traceback (most recent call last): |
|
615 ValueError: proxy URL with no authority: 'file:/ftp.example.com/' |
|
616 |
|
617 The first three items of the returned tuple may be None. |
|
618 |
|
619 Examples of authority parsing: |
|
620 |
|
621 >>> _parse_proxy('proxy.example.com') |
|
622 (None, None, None, 'proxy.example.com') |
|
623 >>> _parse_proxy('proxy.example.com:3128') |
|
624 (None, None, None, 'proxy.example.com:3128') |
|
625 |
|
626 The authority component may optionally include userinfo (assumed to be |
|
627 username:password): |
|
628 |
|
629 >>> _parse_proxy('joe:password@proxy.example.com') |
|
630 (None, 'joe', 'password', 'proxy.example.com') |
|
631 >>> _parse_proxy('joe:password@proxy.example.com:3128') |
|
632 (None, 'joe', 'password', 'proxy.example.com:3128') |
|
633 |
|
634 Same examples, but with URLs instead: |
|
635 |
|
636 >>> _parse_proxy('http://proxy.example.com/') |
|
637 ('http', None, None, 'proxy.example.com') |
|
638 >>> _parse_proxy('http://proxy.example.com:3128/') |
|
639 ('http', None, None, 'proxy.example.com:3128') |
|
640 >>> _parse_proxy('http://joe:password@proxy.example.com/') |
|
641 ('http', 'joe', 'password', 'proxy.example.com') |
|
642 >>> _parse_proxy('http://joe:password@proxy.example.com:3128') |
|
643 ('http', 'joe', 'password', 'proxy.example.com:3128') |
|
644 |
|
645 Everything after the authority is ignored: |
|
646 |
|
647 >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128') |
|
648 ('ftp', 'joe', 'password', 'proxy.example.com') |
|
649 |
|
650 Test for no trailing '/' case: |
|
651 |
|
652 >>> _parse_proxy('http://joe:password@proxy.example.com') |
|
653 ('http', 'joe', 'password', 'proxy.example.com') |
|
654 |
|
655 """ |
|
656 scheme, r_scheme = splittype(proxy) |
|
657 if not r_scheme.startswith("/"): |
|
658 # authority |
|
659 scheme = None |
|
660 authority = proxy |
|
661 else: |
|
662 # URL |
|
663 if not r_scheme.startswith("//"): |
|
664 raise ValueError("proxy URL with no authority: %r" % proxy) |
|
665 # We have an authority, so for RFC 3986-compliant URLs (by ss 3. |
|
666 # and 3.3.), path is empty or starts with '/' |
|
667 end = r_scheme.find("/", 2) |
|
668 if end == -1: |
|
669 end = None |
|
670 authority = r_scheme[2:end] |
|
671 userinfo, hostport = splituser(authority) |
|
672 if userinfo is not None: |
|
673 user, password = splitpasswd(userinfo) |
|
674 else: |
|
675 user = password = None |
|
676 return scheme, user, password, hostport |
|
677 |
|
678 class ProxyHandler(BaseHandler): |
|
679 # Proxies must be in front |
|
680 handler_order = 100 |
|
681 |
|
682 def __init__(self, proxies=None): |
|
683 if proxies is None: |
|
684 proxies = getproxies() |
|
685 assert hasattr(proxies, 'has_key'), "proxies must be a mapping" |
|
686 self.proxies = proxies |
|
687 for type, url in proxies.items(): |
|
688 setattr(self, '%s_open' % type, |
|
689 lambda r, proxy=url, type=type, meth=self.proxy_open: \ |
|
690 meth(r, proxy, type)) |
|
691 |
|
692 def proxy_open(self, req, proxy, type): |
|
693 orig_type = req.get_type() |
|
694 proxy_type, user, password, hostport = _parse_proxy(proxy) |
|
695 if proxy_type is None: |
|
696 proxy_type = orig_type |
|
697 if user and password: |
|
698 user_pass = '%s:%s' % (unquote(user), unquote(password)) |
|
699 creds = base64.b64encode(user_pass).strip() |
|
700 req.add_header('Proxy-authorization', 'Basic ' + creds) |
|
701 hostport = unquote(hostport) |
|
702 req.set_proxy(hostport, proxy_type) |
|
703 if orig_type == proxy_type: |
|
704 # let other handlers take care of it |
|
705 return None |
|
706 else: |
|
707 # need to start over, because the other handlers don't |
|
708 # grok the proxy's URL type |
|
709 # e.g. if we have a constructor arg proxies like so: |
|
710 # {'http': 'ftp://proxy.example.com'}, we may end up turning |
|
711 # a request for http://acme.example.com/a into one for |
|
712 # ftp://proxy.example.com/a |
|
713 return self.parent.open(req) |
|
714 |
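# Illustrative sketch (not part of the module): proxy URLs may carry userinfo;
# proxy_open() splits it out with _parse_proxy() and adds a Basic
# Proxy-authorization header.  The host name and credentials are placeholders.
def _example_proxy_opener():
    handler = ProxyHandler({'http': 'http://joe:password@proxy.example.com:3128/'})
    return build_opener(handler)
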
|
715 class HTTPPasswordMgr: |
|
716 |
|
717 def __init__(self): |
|
718 self.passwd = {} |
|
719 |
|
720 def add_password(self, realm, uri, user, passwd): |
|
721 # uri could be a single URI or a sequence |
|
722 if isinstance(uri, basestring): |
|
723 uri = [uri] |
|
724 if not realm in self.passwd: |
|
725 self.passwd[realm] = {} |
|
726 for default_port in True, False: |
|
727 reduced_uri = tuple( |
|
728 [self.reduce_uri(u, default_port) for u in uri]) |
|
729 self.passwd[realm][reduced_uri] = (user, passwd) |
|
730 |
|
731 def find_user_password(self, realm, authuri): |
|
732 domains = self.passwd.get(realm, {}) |
|
733 for default_port in True, False: |
|
734 reduced_authuri = self.reduce_uri(authuri, default_port) |
|
735 for uris, authinfo in domains.iteritems(): |
|
736 for uri in uris: |
|
737 if self.is_suburi(uri, reduced_authuri): |
|
738 return authinfo |
|
739 return None, None |
|
740 |
|
741 def reduce_uri(self, uri, default_port=True): |
|
742 """Accept authority or URI and extract only the authority and path.""" |
|
743 # note HTTP URLs do not have a userinfo component |
|
744 parts = urlparse.urlsplit(uri) |
|
745 if parts[1]: |
|
746 # URI |
|
747 scheme = parts[0] |
|
748 authority = parts[1] |
|
749 path = parts[2] or '/' |
|
750 else: |
|
751 # host or host:port |
|
752 scheme = None |
|
753 authority = uri |
|
754 path = '/' |
|
755 host, port = splitport(authority) |
|
756 if default_port and port is None and scheme is not None: |
|
757 dport = {"http": 80, |
|
758 "https": 443, |
|
759 }.get(scheme) |
|
760 if dport is not None: |
|
761 authority = "%s:%d" % (host, dport) |
|
762 return authority, path |
|
763 |
|
764 def is_suburi(self, base, test): |
|
765 """Check if test is below base in a URI tree |
|
766 |
|
767 Both args must be URIs in reduced form. |
|
768 """ |
|
769 if base == test: |
|
770 return True |
|
771 if base[0] != test[0]: |
|
772 return False |
|
773 common = posixpath.commonprefix((base[1], test[1])) |
|
774 if len(common) == len(base[1]): |
|
775 return True |
|
776 return False |
|
777 |
|
778 |
|
779 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr): |
|
780 |
|
781 def find_user_password(self, realm, authuri): |
|
782 user, password = HTTPPasswordMgr.find_user_password(self, realm, |
|
783 authuri) |
|
784 if user is not None: |
|
785 return user, password |
|
786 return HTTPPasswordMgr.find_user_password(self, None, authuri) |
|
787 |
|
788 |
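# Illustrative sketch (not part of the module): how the password managers are
# meant to be used.  The realm, URI and credentials are placeholders.
def _example_password_mgr():
    mgr = HTTPPasswordMgrWithDefaultRealm()
    # None as the realm makes this entry the fallback for any realm
    mgr.add_password(None, 'http://www.example.com/private/', 'klem', 'geheim')
    # matches because /private/report is below the stored /private/ path
    return mgr.find_user_password('Some Realm',
                                  'http://www.example.com/private/report')
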
|
789 class AbstractBasicAuthHandler: |
|
790 |
|
791 # XXX this allows for multiple auth-schemes, but will stupidly pick |
|
792 # the last one with a realm specified. |
|
793 |
|
794 # allow for double- and single-quoted realm values |
|
795 # (single quotes are a violation of the RFC, but appear in the wild) |
|
796 rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+' |
|
797 'realm=(["\'])(.*?)\\2', re.I) |
|
798 |
|
799 # XXX could pre-emptively send auth info already accepted (RFC 2617, |
|
800 # end of section 2, and section 1.2 immediately after "credentials" |
|
801 # production). |
|
802 |
|
803 def __init__(self, password_mgr=None): |
|
804 if password_mgr is None: |
|
805 password_mgr = HTTPPasswordMgr() |
|
806 self.passwd = password_mgr |
|
807 self.add_password = self.passwd.add_password |
|
808 |
|
809 def http_error_auth_reqed(self, authreq, host, req, headers): |
|
810 # host may be an authority (without userinfo) or a URL with an |
|
811 # authority |
|
812 # XXX could be multiple headers |
|
813 authreq = headers.get(authreq, None) |
|
814 if authreq: |
|
815 mo = AbstractBasicAuthHandler.rx.search(authreq) |
|
816 if mo: |
|
817 scheme, quote, realm = mo.groups() |
|
818 if scheme.lower() == 'basic': |
|
819 return self.retry_http_basic_auth(host, req, realm) |
|
820 |
|
821 def retry_http_basic_auth(self, host, req, realm): |
|
822 user, pw = self.passwd.find_user_password(realm, host) |
|
823 if pw is not None: |
|
824 raw = "%s:%s" % (user, pw) |
|
825 auth = 'Basic %s' % base64.b64encode(raw).strip() |
|
826 if req.headers.get(self.auth_header, None) == auth: |
|
827 return None |
|
828 req.add_header(self.auth_header, auth) |
|
829 return self.parent.open(req) |
|
830 else: |
|
831 return None |
|
832 |
|
833 |
|
834 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): |
|
835 |
|
836 auth_header = 'Authorization' |
|
837 |
|
838 def http_error_401(self, req, fp, code, msg, headers): |
|
839 url = req.get_full_url() |
|
840 return self.http_error_auth_reqed('www-authenticate', |
|
841 url, req, headers) |
|
842 |
|
843 |
|
844 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler): |
|
845 |
|
846 auth_header = 'Proxy-authorization' |
|
847 |
|
848 def http_error_407(self, req, fp, code, msg, headers): |
|
849 # http_error_auth_reqed requires that there is no userinfo component in |
|
850 # authority. Assume there isn't one, since urllib2 does not (and |
|
851 # should not, RFC 3986 s. 3.2.1) support requests for URLs containing |
|
852 # userinfo. |
|
853 authority = req.get_host() |
|
854 return self.http_error_auth_reqed('proxy-authenticate', |
|
855 authority, req, headers) |
|
856 |
|
857 |
|
858 def randombytes(n): |
|
859 """Return n random bytes.""" |
|
860 # Use /dev/urandom if it is available. Fall back to random module |
|
861 # if not. It might be worthwhile to extend this function to use |
|
862 # other platform-specific mechanisms for getting random bytes. |
|
863 if os.path.exists("/dev/urandom"): |
|
864 f = open("/dev/urandom") |
|
865 s = f.read(n) |
|
866 f.close() |
|
867 return s |
|
868 else: |
|
869 L = [chr(random.randrange(0, 256)) for i in range(n)] |
|
870 return "".join(L) |
|
871 |
|
872 class AbstractDigestAuthHandler: |
|
873 # Digest authentication is specified in RFC 2617. |
|
874 |
|
875 # XXX The client does not inspect the Authentication-Info header |
|
876 # in a successful response. |
|
877 |
|
878 # XXX It should be possible to test this implementation against |
|
879 # a mock server that just generates a static set of challenges. |
|
880 |
|
881 # XXX support for qop="auth-int" is shaky |
|
882 |
|
883 def __init__(self, passwd=None): |
|
884 if passwd is None: |
|
885 passwd = HTTPPasswordMgr() |
|
886 self.passwd = passwd |
|
887 self.add_password = self.passwd.add_password |
|
888 self.retried = 0 |
|
889 self.nonce_count = 0 |
|
890 |
|
891 def reset_retry_count(self): |
|
892 self.retried = 0 |
|
893 |
|
894 def http_error_auth_reqed(self, auth_header, host, req, headers): |
|
895 authreq = headers.get(auth_header, None) |
|
896 if self.retried > 5: |
|
897 # Don't fail endlessly - if we failed once, we'll probably |
|
898 # fail a second time. Hm. Unless the Password Manager is |
|
899 # prompting for the information. Crap. This isn't great |
|
900 # but it's better than the current 'repeat until recursion |
|
901 # depth exceeded' approach <wink> |
|
902 raise HTTPError(req.get_full_url(), 401, "digest auth failed", |
|
903 headers, None) |
|
904 else: |
|
905 self.retried += 1 |
|
906 if authreq: |
|
907 scheme = authreq.split()[0] |
|
908 if scheme.lower() == 'digest': |
|
909 return self.retry_http_digest_auth(req, authreq) |
|
910 |
|
911 def retry_http_digest_auth(self, req, auth): |
|
912 token, challenge = auth.split(' ', 1) |
|
913 chal = parse_keqv_list(parse_http_list(challenge)) |
|
914 auth = self.get_authorization(req, chal) |
|
915 if auth: |
|
916 auth_val = 'Digest %s' % auth |
|
917 if req.headers.get(self.auth_header, None) == auth_val: |
|
918 return None |
|
919 req.add_unredirected_header(self.auth_header, auth_val) |
|
920 resp = self.parent.open(req) |
|
921 return resp |
|
922 |
|
923 def get_cnonce(self, nonce): |
|
924 # The cnonce-value is an opaque |
|
925 # quoted string value provided by the client and used by both client |
|
926 # and server to avoid chosen plaintext attacks, to provide mutual |
|
927 # authentication, and to provide some message integrity protection. |
|
928 # This isn't a fabulous effort, but it's probably Good Enough. |
|
929 dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(), |
|
930 randombytes(8))).hexdigest() |
|
931 return dig[:16] |
|
932 |
|
933 def get_authorization(self, req, chal): |
|
934 try: |
|
935 realm = chal['realm'] |
|
936 nonce = chal['nonce'] |
|
937 qop = chal.get('qop') |
|
938 algorithm = chal.get('algorithm', 'MD5') |
|
939 # mod_digest doesn't send an opaque, even though it isn't |
|
940 # supposed to be optional |
|
941 opaque = chal.get('opaque', None) |
|
942 except KeyError: |
|
943 return None |
|
944 |
|
945 H, KD = self.get_algorithm_impls(algorithm) |
|
946 if H is None: |
|
947 return None |
|
948 |
|
949 user, pw = self.passwd.find_user_password(realm, req.get_full_url()) |
|
950 if user is None: |
|
951 return None |
|
952 |
|
953 # XXX not implemented yet |
|
954 if req.has_data(): |
|
955 entdig = self.get_entity_digest(req.get_data(), chal) |
|
956 else: |
|
957 entdig = None |
|
958 |
|
959 A1 = "%s:%s:%s" % (user, realm, pw) |
|
960 A2 = "%s:%s" % (req.get_method(), |
|
961 # XXX selector: what about proxies and full urls |
|
962 req.get_selector()) |
|
963 if qop == 'auth': |
|
964 self.nonce_count += 1 |
|
965 ncvalue = '%08x' % self.nonce_count |
|
966 cnonce = self.get_cnonce(nonce) |
|
967 noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2)) |
|
968 respdig = KD(H(A1), noncebit) |
|
969 elif qop is None: |
|
970 respdig = KD(H(A1), "%s:%s" % (nonce, H(A2))) |
|
971 else: |
|
972 # XXX handle auth-int. |
|
973 raise URLError("qop '%s' is not supported." % qop) |
|
974 |
|
975 # XXX should the partial digests be encoded too? |
|
976 |
|
977 base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \ |
|
978 'response="%s"' % (user, realm, nonce, req.get_selector(), |
|
979 respdig) |
|
980 if opaque: |
|
981 base += ', opaque="%s"' % opaque |
|
982 if entdig: |
|
983 base += ', digest="%s"' % entdig |
|
984 base += ', algorithm="%s"' % algorithm |
|
985 if qop: |
|
986 base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce) |
|
987 return base |
|
988 |
|
989 def get_algorithm_impls(self, algorithm): |
|
990 # algorithm should be case-insensitive according to RFC2617 |
|
991 algorithm = algorithm.upper() |
|
992 # lambdas assume digest modules are imported at the top level |
|
993 if algorithm == 'MD5': |
|
994 H = lambda x: hashlib.md5(x).hexdigest() |
|
995 elif algorithm == 'SHA': |
|
996 H = lambda x: hashlib.sha1(x).hexdigest() |

else: |

    # unsupported algorithm: leave H as None; get_authorization() |

    # checks for this and gives up instead of crashing |

    H = None |

997 # XXX MD5-sess |

998 KD = lambda s, d: H("%s:%s" % (s, d)) |
|
999 return H, KD |
|
1000 |
|
1001 def get_entity_digest(self, data, chal): |
|
1002 # XXX not implemented yet |
|
1003 return None |
|
1004 |
|
1005 |
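# Illustrative sketch (not part of the module): the RFC 2617 "auth" response
# that get_authorization() computes, spelled out with hashlib directly.
# All of the input values below are made-up placeholders.
def _example_digest_response():
    H = lambda x: hashlib.md5(x).hexdigest()
    user, realm, password = 'klem', 'testrealm@host.com', 'geheim'
    nonce, ncvalue, cnonce, qop = 'dcd98b7102dd2f0e', '00000001', '0a4f113b', 'auth'
    method, selector = 'GET', '/dir/index.html'
    A1 = '%s:%s:%s' % (user, realm, password)
    A2 = '%s:%s' % (method, selector)
    # response = H( H(A1) : nonce : nc : cnonce : qop : H(A2) )
    return H('%s:%s:%s:%s:%s:%s' % (H(A1), nonce, ncvalue, cnonce, qop, H(A2)))
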
|
1006 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): |
|
1007 """An authentication protocol defined by RFC 2069 |
|
1008 |
|
1009 Digest authentication improves on basic authentication because it |
|
1010 does not transmit passwords in the clear. |
|
1011 """ |
|
1012 |
|
1013 auth_header = 'Authorization' |
|
1014 handler_order = 490 # before Basic auth |
|
1015 |
|
1016 def http_error_401(self, req, fp, code, msg, headers): |
|
1017 host = urlparse.urlparse(req.get_full_url())[1] |
|
1018 retry = self.http_error_auth_reqed('www-authenticate', |
|
1019 host, req, headers) |
|
1020 self.reset_retry_count() |
|
1021 return retry |
|
1022 |
|
1023 |
|
1024 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler): |
|
1025 |
|
1026 auth_header = 'Proxy-Authorization' |
|
1027 handler_order = 490 # before Basic auth |
|
1028 |
|
1029 def http_error_407(self, req, fp, code, msg, headers): |
|
1030 host = req.get_host() |
|
1031 retry = self.http_error_auth_reqed('proxy-authenticate', |
|
1032 host, req, headers) |
|
1033 self.reset_retry_count() |
|
1034 return retry |
|
1035 |
|
1036 class AbstractHTTPHandler(BaseHandler): |
|
1037 |
|
1038 def __init__(self, debuglevel=0): |
|
1039 self._debuglevel = debuglevel |
|
1040 |
|
1041 def set_http_debuglevel(self, level): |
|
1042 self._debuglevel = level |
|
1043 |
|
1044 def do_request_(self, request): |
|
1045 host = request.get_host() |
|
1046 if not host: |
|
1047 raise URLError('no host given') |
|
1048 |
|
1049 if request.has_data(): # POST |
|
1050 data = request.get_data() |
|
1051 if not request.has_header('Content-type'): |
|
1052 request.add_unredirected_header( |
|
1053 'Content-type', |
|
1054 'application/x-www-form-urlencoded') |
|
1055 if not request.has_header('Content-length'): |
|
1056 request.add_unredirected_header( |
|
1057 'Content-length', '%d' % len(data)) |
|
1058 |
|
1059 sel_host = host |
|
1060 if request.has_proxy(): |
|
1061 scheme, sel = splittype(request.get_selector()) |
|
1062 sel_host, sel_path = splithost(sel) |
|
1063 |
|
1064 if not request.has_header('Host'): |
|
1065 request.add_unredirected_header('Host', sel_host) |
|
1066 for name, value in self.parent.addheaders: |
|
1067 name = name.capitalize() |
|
1068 if not request.has_header(name): |
|
1069 request.add_unredirected_header(name, value) |
|
1070 |
|
1071 return request |
|
1072 |
|
1073 def do_open(self, http_class, req): |
|
1074 """Return an addinfourl object for the request, using http_class. |
|
1075 |
|
1076 http_class must implement the HTTPConnection API from httplib. |
|
1077 The addinfourl return value is a file-like object. It also |
|
1078 has methods and attributes including: |
|
1079 - info(): return a mimetools.Message object for the headers |
|
1080 - geturl(): return the original request URL |
|
1081 - code: HTTP status code |
|
1082 """ |
|
1083 host = req.get_host() |
|
1084 if not host: |
|
1085 raise URLError('no host given') |
|
1086 |
|
1087 h = http_class(host, timeout=req.timeout) # will parse host:port |
|
1088 h.set_debuglevel(self._debuglevel) |
|
1089 |
|
1090 headers = dict(req.headers) |
|
1091 headers.update(req.unredirected_hdrs) |
|
1092 # We want to make an HTTP/1.1 request, but the addinfourl |
|
1093 # class isn't prepared to deal with a persistent connection. |
|
1094 # It will try to read all remaining data from the socket, |
|
1095 # which will block while the server waits for the next request. |
|
1096 # So make sure the connection gets closed after the (only) |
|
1097 # request. |
|
1098 headers["Connection"] = "close" |
|
1099 headers = dict( |
|
1100 (name.title(), val) for name, val in headers.items()) |
|
1101 try: |
|
1102 h.request(req.get_method(), req.get_selector(), req.data, headers) |
|
1103 r = h.getresponse() |
|
1104 except socket.error, err: # XXX what error? |
|
1105 raise URLError(err) |
|
1106 |
|
1107 # Pick apart the HTTPResponse object to get the addinfourl |
|
1108 # object initialized properly. |
|
1109 |
|
1110 # Wrap the HTTPResponse object in socket's file object adapter |
|
1111 # for Windows. That adapter calls recv(), so delegate recv() |
|
1112 # to read(). This weird wrapping allows the returned object to |
|
1113 # have readline() and readlines() methods. |
|
1114 |
|
1115 # XXX It might be better to extract the read buffering code |
|
1116 # out of socket._fileobject() and into a base class. |
|
1117 |
|
1118 r.recv = r.read |
|
1119 fp = socket._fileobject(r, close=True) |
|
1120 |
|
1121 resp = addinfourl(fp, r.msg, req.get_full_url()) |
|
1122 resp.code = r.status |
|
1123 resp.msg = r.reason |
|
1124 return resp |
|
1125 |
|
1126 |
|
1127 class HTTPHandler(AbstractHTTPHandler): |
|
1128 |
|
1129 def http_open(self, req): |
|
1130 return self.do_open(httplib.HTTPConnection, req) |
|
1131 |
|
1132 http_request = AbstractHTTPHandler.do_request_ |
|
1133 |
|
1134 if hasattr(httplib, 'HTTPS'): |
|
1135 class HTTPSHandler(AbstractHTTPHandler): |
|
1136 |
|
1137 def https_open(self, req): |
|
1138 return self.do_open(httplib.HTTPSConnection, req) |
|
1139 |
|
1140 https_request = AbstractHTTPHandler.do_request_ |
|
1141 |
|
1142 class HTTPCookieProcessor(BaseHandler): |
|
1143 def __init__(self, cookiejar=None): |
|
1144 import cookielib |
|
1145 if cookiejar is None: |
|
1146 cookiejar = cookielib.CookieJar() |
|
1147 self.cookiejar = cookiejar |
|
1148 |
|
1149 def http_request(self, request): |
|
1150 self.cookiejar.add_cookie_header(request) |
|
1151 return request |
|
1152 |
|
1153 def http_response(self, request, response): |
|
1154 self.cookiejar.extract_cookies(response, request) |
|
1155 return response |
|
1156 |
|
1157 https_request = http_request |
|
1158 https_response = http_response |
|
1159 |
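# Illustrative sketch (not part of the module): an opener that keeps cookies
# across requests by sharing one CookieJar between http_request() and
# http_response().  The URL is a placeholder.
def _example_cookie_opener():
    import cookielib
    jar = cookielib.CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))
    opener.open('http://www.example.com/')   # any Set-Cookie headers land in jar
    return jar
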
|
1160 class UnknownHandler(BaseHandler): |
|
1161 def unknown_open(self, req): |
|
1162 type = req.get_type() |
|
1163 raise URLError('unknown url type: %s' % type) |
|
1164 |
|
1165 def parse_keqv_list(l): |
|
1166 """Parse list of key=value strings where keys are not duplicated.""" |
|
1167 parsed = {} |
|
1168 for elt in l: |
|
1169 k, v = elt.split('=', 1) |
|
1170 if v[0] == '"' and v[-1] == '"': |
|
1171 v = v[1:-1] |
|
1172 parsed[k] = v |
|
1173 return parsed |
|
1174 |
|
1175 def parse_http_list(s): |
|
1176 """Parse lists as described by RFC 2068 Section 2. |
|
1177 |
|
1178 In particular, parse comma-separated lists where the elements of |
|
1179 the list may include quoted-strings. A quoted-string could |
|
1180 contain a comma. A non-quoted string could have quotes in the |
|
1181 middle. Neither commas nor quotes count if they are escaped. |
|
1182 Only double-quotes count, not single-quotes. |
|
1183 """ |
|
1184 res = [] |
|
1185 part = '' |
|
1186 |
|
1187 escape = quote = False |
|
1188 for cur in s: |
|
1189 if escape: |
|
1190 part += cur |
|
1191 escape = False |
|
1192 continue |
|
1193 if quote: |
|
1194 if cur == '\\': |
|
1195 escape = True |
|
1196 continue |
|
1197 elif cur == '"': |
|
1198 quote = False |
|
1199 part += cur |
|
1200 continue |
|
1201 |
|
1202 if cur == ',': |
|
1203 res.append(part) |
|
1204 part = '' |
|
1205 continue |
|
1206 |
|
1207 if cur == '"': |
|
1208 quote = True |
|
1209 |
|
1210 part += cur |
|
1211 |
|
1212 # append last part |
|
1213 if part: |
|
1214 res.append(part) |
|
1215 |
|
1216 return [part.strip() for part in res] |
|
1217 |
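# Illustrative sketch (not part of the module): how the two parsers above are
# combined (as in retry_http_digest_auth) to turn a challenge into a dict.
# The challenge string is a made-up example.
def _example_parse_challenge():
    challenge = ('realm="testrealm@host.com", qop="auth,auth-int", '
                 'nonce="dcd98b71", opaque="5ccc0699"')
    return parse_keqv_list(parse_http_list(challenge))
    # -> {'realm': 'testrealm@host.com', 'qop': 'auth,auth-int',
    #     'nonce': 'dcd98b71', 'opaque': '5ccc0699'}
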
|
1218 class FileHandler(BaseHandler): |
|
1219 # Use local file or FTP depending on form of URL |
|
1220 def file_open(self, req): |
|
1221 url = req.get_selector() |
|
1222 if url[:2] == '//' and url[2:3] != '/': |
|
1223 req.type = 'ftp' |
|
1224 return self.parent.open(req) |
|
1225 else: |
|
1226 return self.open_local_file(req) |
|
1227 |
|
1228 # names for the localhost |
|
1229 names = None |
|
1230 def get_names(self): |
|
1231 if FileHandler.names is None: |
|
1232 try: |
|
1233 FileHandler.names = (socket.gethostbyname('localhost'), |
|
1234 socket.gethostbyname(socket.gethostname())) |
|
1235 except socket.gaierror: |
|
1236 FileHandler.names = (socket.gethostbyname('localhost'),) |
|
1237 return FileHandler.names |
|
1238 |
|
1239 # not entirely sure what the rules are here |
|
1240 def open_local_file(self, req): |
|
1241 import email.utils |
|
1242 import mimetypes |
|
1243 host = req.get_host() |
|
1244 file = req.get_selector() |
|
1245 localfile = url2pathname(file) |
|
1246 try: |
|
1247 stats = os.stat(localfile) |
|
1248 size = stats.st_size |
|
1249 modified = email.utils.formatdate(stats.st_mtime, usegmt=True) |
|
1250 mtype = mimetypes.guess_type(file)[0] |
|
1251 headers = mimetools.Message(StringIO( |
|
1252 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % |
|
1253 (mtype or 'text/plain', size, modified))) |
|
1254 if host: |
|
1255 host, port = splitport(host) |
|
1256 if not host or \ |
|
1257 (not port and socket.gethostbyname(host) in self.get_names()): |
|
1258 return addinfourl(open(localfile, 'rb'), |
|
1259 headers, 'file:'+file) |
|
1260 except OSError, msg: |
|
1261 # urllib2 users shouldn't expect OSErrors coming from urlopen() |
|
1262 raise URLError(msg) |
|
1263 raise URLError('file not on local host') |
|
1264 |
|
1265 class FTPHandler(BaseHandler): |
|
1266 def ftp_open(self, req): |
|
1267 import ftplib |
|
1268 import mimetypes |
|
1269 host = req.get_host() |
|
1270 if not host: |
|
1271 raise URLError('ftp error: no host given') |
|
1272 host, port = splitport(host) |
|
1273 if port is None: |
|
1274 port = ftplib.FTP_PORT |
|
1275 else: |
|
1276 port = int(port) |
|
1277 |
|
1278 # username/password handling |
|
1279 user, host = splituser(host) |
|
1280 if user: |
|
1281 user, passwd = splitpasswd(user) |
|
1282 else: |
|
1283 passwd = None |
|
1284 host = unquote(host) |
|
1285 user = unquote(user or '') |
|
1286 passwd = unquote(passwd or '') |
|
1287 |
|
1288 try: |
|
1289 host = socket.gethostbyname(host) |
|
1290 except socket.error, msg: |
|
1291 raise URLError(msg) |
|
1292 path, attrs = splitattr(req.get_selector()) |
|
1293 dirs = path.split('/') |
|
1294 dirs = map(unquote, dirs) |
|
1295 dirs, file = dirs[:-1], dirs[-1] |
|
1296 if dirs and not dirs[0]: |
|
1297 dirs = dirs[1:] |
|
1298 try: |
|
1299 fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout) |
|
1300 type = file and 'I' or 'D' |
|
1301 for attr in attrs: |
|
1302 attr, value = splitvalue(attr) |
|
1303 if attr.lower() == 'type' and \ |
|
1304 value in ('a', 'A', 'i', 'I', 'd', 'D'): |
|
1305 type = value.upper() |
|
1306 fp, retrlen = fw.retrfile(file, type) |
|
1307 headers = "" |
|
1308 mtype = mimetypes.guess_type(req.get_full_url())[0] |
|
1309 if mtype: |
|
1310 headers += "Content-type: %s\n" % mtype |
|
1311 if retrlen is not None and retrlen >= 0: |
|
1312 headers += "Content-length: %d\n" % retrlen |
|
1313 sf = StringIO(headers) |
|
1314 headers = mimetools.Message(sf) |
|
1315 return addinfourl(fp, headers, req.get_full_url()) |
|
1316 except ftplib.all_errors, msg: |
|
1317 raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2] |
|
1318 |
|
1319 def connect_ftp(self, user, passwd, host, port, dirs, timeout): |
|
1320 fw = ftpwrapper(user, passwd, host, port, dirs, timeout) |
|
1321 ## fw.ftp.set_debuglevel(1) |
|
1322 return fw |
|
1323 |
|
1324 class CacheFTPHandler(FTPHandler): |
|
1325 # XXX would be nice to have pluggable cache strategies |
|
1326 # XXX this stuff is definitely not thread safe |
|
1327 def __init__(self): |
|
1328 self.cache = {} |
|
1329 self.timeout = {} |
|
1330 self.soonest = 0 |
|
1331 self.delay = 60 |
|
1332 self.max_conns = 16 |
|
1333 |
|
1334 def setTimeout(self, t): |
|
1335 self.delay = t |
|
1336 |
|
1337 def setMaxConns(self, m): |
|
1338 self.max_conns = m |
|
1339 |
|
1340 def connect_ftp(self, user, passwd, host, port, dirs, timeout): |
|
1341 key = user, host, port, '/'.join(dirs), timeout |
|
1342 if key in self.cache: |
|
1343 self.timeout[key] = time.time() + self.delay |
|
1344 else: |
|
1345 self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout) |
|
1346 self.timeout[key] = time.time() + self.delay |
|
1347 self.check_cache() |
|
1348 return self.cache[key] |
|
1349 |
|
1350 def check_cache(self): |
|
1351 # first check for old ones |
|
1352 t = time.time() |
|
1353 if self.soonest <= t: |
|
1354 for k, v in self.timeout.items(): |
|
1355 if v < t: |
|
1356 self.cache[k].close() |
|
1357 del self.cache[k] |
|
1358 del self.timeout[k] |
|
1359 self.soonest = min(self.timeout.values()) |
|
1360 |
|
1361 # then check the size |
|
1362 if len(self.cache) == self.max_conns: |
|
1363 for k, v in self.timeout.items(): |
|
1364 if v == self.soonest: |
|
1365 del self.cache[k] |
|
1366 del self.timeout[k] |
|
1367 break |
|
1368 self.soonest = min(self.timeout.values()) |
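
# Illustrative sketch (not part of the module): tuning the connection cache.
# CacheFTPHandler keeps FTP connections alive between requests; setTimeout()
# and setMaxConns() control how long and how many.
def _example_cached_ftp_opener():
    handler = CacheFTPHandler()
    handler.setTimeout(30)      # drop idle connections after 30 seconds
    handler.setMaxConns(4)      # keep at most 4 cached connections
    return build_opener(handler)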