symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/urllib2.py
       
     1 """An extensible library for opening URLs using a variety of protocols
       
     2 
       
     3 The simplest way to use this module is to call the urlopen function,
       
     4 which accepts a string containing a URL or a Request object (described
       
      5 below).  It opens the URL and returns the result as a file-like
       
     6 object; the returned object has some extra methods described below.
       
     7 
       
     8 The OpenerDirector manages a collection of Handler objects that do
       
     9 all the actual work.  Each Handler implements a particular protocol or
       
    10 option.  The OpenerDirector is a composite object that invokes the
       
    11 Handlers needed to open the requested URL.  For example, the
       
    12 HTTPHandler performs HTTP GET and POST requests and deals with
       
    13 non-error returns.  The HTTPRedirectHandler automatically deals with
       
    14 HTTP 301, 302, 303 and 307 redirect errors, and the HTTPDigestAuthHandler
       
    15 deals with digest authentication.
       
    16 
       
     17 urlopen(url, data=None) -- Basic usage is the same as the original
       
     18 urllib module.  Pass the url and, optionally, data to post to an HTTP URL, and
       
    19 get a file-like object back.  One difference is that you can also pass
       
     20 a Request instance instead of a URL.  Raises a URLError (subclass of
       
    21 IOError); for HTTP errors, raises an HTTPError, which can also be
       
    22 treated as a valid response.
       
    23 
       
    24 build_opener -- Function that creates a new OpenerDirector instance.
       
    25 Will install the default handlers.  Accepts one or more Handlers as
       
    26 arguments, either instances or Handler classes that it will
       
     27 instantiate.  If one of the arguments is a subclass of the default
       
    28 handler, the argument will be installed instead of the default.
       
    29 
       
    30 install_opener -- Installs a new opener as the default opener.
       
    31 
       
    32 objects of interest:
       
     33 OpenerDirector -- Opens URLs by delegating to its collection of registered Handler objects.
       
    34 
       
    35 Request -- An object that encapsulates the state of a request.  The
       
    36 state can be as simple as the URL.  It can also include extra HTTP
       
    37 headers, e.g. a User-Agent.
       
    38 
       
     39 BaseHandler -- Parent class of all handlers; provides add_parent(), close() and the handler_order used for sorting.
       
    40 
       
    41 exceptions:
       
     42 URLError -- A subclass of IOError; individual protocols have their own
       
    43 specific subclass.
       
    44 
       
    45 HTTPError -- Also a valid HTTP response, so you can treat an HTTP error
       
    46 as an exceptional event or valid response.
       
    47 
       
    48 internals:
       
    49 BaseHandler and parent
       
    50 _call_chain conventions
       
    51 
       
    52 Example usage:
       
    53 
       
    54 import urllib2
       
    55 
       
    56 # set up authentication info
       
    57 authinfo = urllib2.HTTPBasicAuthHandler()
       
    58 authinfo.add_password(realm='PDQ Application',
       
    59                       uri='https://mahler:8092/site-updates.py',
       
    60                       user='klem',
       
    61                       passwd='geheim$parole')
       
    62 
       
    63 proxy_support = urllib2.ProxyHandler({"http" : "http://ahad-haam:3128"})
       
    64 
       
    65 # build a new opener that adds authentication and caching FTP handlers
       
    66 opener = urllib2.build_opener(proxy_support, authinfo, urllib2.CacheFTPHandler)
       
    67 
       
    68 # install it
       
    69 urllib2.install_opener(opener)
       
    70 
       
    71 f = urllib2.urlopen('http://www.python.org/')
       
    72 
       
    73 
       
    74 """
       
    75 
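# Illustrative sketch (hypothetical helper, placeholder URL): the docstring
# above notes that an HTTPError can be treated either as an exception or as
# a valid response.  Both uses:
def _example_open_or_error(url='http://www.example.com/missing'):
    try:
        return urlopen(url)
    except HTTPError, e:
        # the error object still behaves like a response: it carries the
        # status code, reason phrase, headers (via .info()) and a body
        print 'server answered with status', e.code, e.msg
        return e
    except URLError, e:
        # no HTTP response at all (DNS failure, refused connection, ...)
        print 'failed to reach the server:', e.reason
        raise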
       
    76 # XXX issues:
       
     77 # If an authentication error handler tries to perform
       
    78 # authentication for some reason but fails, how should the error be
       
    79 # signalled?  The client needs to know the HTTP error code.  But if
       
     80 # the handler knows what the problem was, e.g., that it didn't know
       
     81 # the hash algorithm requested in the challenge, it would be good to
       
    82 # pass that information along to the client, too.
       
    83 # ftp errors aren't handled cleanly
       
    84 # check digest against correct (i.e. non-apache) implementation
       
    85 
       
    86 # Possible extensions:
       
    87 # complex proxies  XXX not sure what exactly was meant by this
       
    88 # abstract factory for opener
       
    89 
       
    90 import base64
       
    91 import hashlib
       
    92 import httplib
       
    93 import mimetools
       
    94 import os
       
    95 import posixpath
       
    96 import random
       
    97 import re
       
    98 import socket
       
    99 import sys
       
   100 import time
       
   101 import urlparse
       
   102 import bisect
       
   103 
       
   104 try:
       
   105     from cStringIO import StringIO
       
   106 except ImportError:
       
   107     from StringIO import StringIO
       
   108 
       
   109 from urllib import (unwrap, unquote, splittype, splithost, quote,
       
   110      addinfourl, splitport,
       
   111      splitattr, ftpwrapper, splituser, splitpasswd, splitvalue)
       
   112 
       
   113 # support for FileHandler, proxies via environment variables
       
   114 from urllib import localhost, url2pathname, getproxies
       
   115 
       
    116 # version string used in the User-Agent header sent by this module
       
   117 __version__ = sys.version[:3]
       
   118 
       
   119 _opener = None
       
   120 def urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
       
   121     global _opener
       
   122     if _opener is None:
       
   123         _opener = build_opener()
       
   124     return _opener.open(url, data, timeout)
       
   125 
       
   126 def install_opener(opener):
       
   127     global _opener
       
   128     _opener = opener
       
   129 
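# Illustrative sketch (hypothetical helper, placeholder host and form data):
# build an opener, install it as the module-wide default, then POST through
# urlopen() by handing it a Request that carries data.
def _example_install_and_post():
    opener = build_opener()            # default handler set only
    install_opener(opener)             # later urlopen() calls go through it
    req = Request('http://www.example.com/form',
                  data='field=value',
                  headers={'X-Example': 'demo'})
    return urlopen(req)                # POST, because the request has data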
       
   130 # do these error classes make sense?
       
   131 # make sure all of the IOError stuff is overridden.  we just want to be
       
   132 # subtypes.
       
   133 
       
   134 class URLError(IOError):
       
   135     # URLError is a sub-type of IOError, but it doesn't share any of
       
   136     # the implementation.  need to override __init__ and __str__.
       
   137     # It sets self.args for compatibility with other EnvironmentError
       
   138     # subclasses, but args doesn't have the typical format with errno in
       
   139     # slot 0 and strerror in slot 1.  This may be better than nothing.
       
   140     def __init__(self, reason):
       
   141         self.args = reason,
       
   142         self.reason = reason
       
   143 
       
   144     def __str__(self):
       
   145         return '<urlopen error %s>' % self.reason
       
   146 
       
   147 class HTTPError(URLError, addinfourl):
       
    148     """Raised when an HTTP error occurs, but also acts like a non-error return"""
       
   149     __super_init = addinfourl.__init__
       
   150 
       
   151     def __init__(self, url, code, msg, hdrs, fp):
       
   152         self.code = code
       
   153         self.msg = msg
       
   154         self.hdrs = hdrs
       
   155         self.fp = fp
       
   156         self.filename = url
       
   157         # The addinfourl classes depend on fp being a valid file
       
   158         # object.  In some cases, the HTTPError may not have a valid
       
   159         # file object.  If this happens, the simplest workaround is to
       
   160         # not initialize the base classes.
       
   161         if fp is not None:
       
   162             self.__super_init(fp, hdrs, url, code)
       
   163 
       
   164     def __str__(self):
       
   165         return 'HTTP Error %s: %s' % (self.code, self.msg)
       
   166 
       
   167 # copied from cookielib.py
       
   168 _cut_port_re = re.compile(r":\d+$")
       
   169 def request_host(request):
       
   170     """Return request-host, as defined by RFC 2965.
       
   171 
       
   172     Variation from RFC: returned value is lowercased, for convenient
       
   173     comparison.
       
   174 
       
   175     """
       
   176     url = request.get_full_url()
       
   177     host = urlparse.urlparse(url)[1]
       
   178     if host == "":
       
   179         host = request.get_header("Host", "")
       
   180 
       
   181     # remove port, if present
       
   182     host = _cut_port_re.sub("", host, 1)
       
   183     return host.lower()
       
   184 
       
   185 class Request:
       
   186 
       
   187     def __init__(self, url, data=None, headers={},
       
   188                  origin_req_host=None, unverifiable=False):
       
   189         # unwrap('<URL:type://host/path>') --> 'type://host/path'
       
   190         self.__original = unwrap(url)
       
   191         self.type = None
       
   192         # self.__r_type is what's left after doing the splittype
       
   193         self.host = None
       
   194         self.port = None
       
   195         self.data = data
       
   196         self.headers = {}
       
   197         for key, value in headers.items():
       
   198             self.add_header(key, value)
       
   199         self.unredirected_hdrs = {}
       
   200         if origin_req_host is None:
       
   201             origin_req_host = request_host(self)
       
   202         self.origin_req_host = origin_req_host
       
   203         self.unverifiable = unverifiable
       
   204 
       
   205     def __getattr__(self, attr):
       
   206         # XXX this is a fallback mechanism to guard against these
       
   207         # methods getting called in a non-standard order.  this may be
       
   208         # too complicated and/or unnecessary.
       
   209         # XXX should the __r_XXX attributes be public?
       
   210         if attr[:12] == '_Request__r_':
       
   211             name = attr[12:]
       
   212             if hasattr(Request, 'get_' + name):
       
   213                 getattr(self, 'get_' + name)()
       
   214                 return getattr(self, attr)
       
   215         raise AttributeError, attr
       
   216 
       
   217     def get_method(self):
       
   218         if self.has_data():
       
   219             return "POST"
       
   220         else:
       
   221             return "GET"
       
   222 
       
   223     # XXX these helper methods are lame
       
   224 
       
   225     def add_data(self, data):
       
   226         self.data = data
       
   227 
       
   228     def has_data(self):
       
   229         return self.data is not None
       
   230 
       
   231     def get_data(self):
       
   232         return self.data
       
   233 
       
   234     def get_full_url(self):
       
   235         return self.__original
       
   236 
       
   237     def get_type(self):
       
   238         if self.type is None:
       
   239             self.type, self.__r_type = splittype(self.__original)
       
   240             if self.type is None:
       
   241                 raise ValueError, "unknown url type: %s" % self.__original
       
   242         return self.type
       
   243 
       
   244     def get_host(self):
       
   245         if self.host is None:
       
   246             self.host, self.__r_host = splithost(self.__r_type)
       
   247             if self.host:
       
   248                 self.host = unquote(self.host)
       
   249         return self.host
       
   250 
       
   251     def get_selector(self):
       
   252         return self.__r_host
       
   253 
       
   254     def set_proxy(self, host, type):
       
   255         self.host, self.type = host, type
       
   256         self.__r_host = self.__original
       
   257 
       
   258     def has_proxy(self):
       
   259         return self.__r_host == self.__original
       
   260 
       
   261     def get_origin_req_host(self):
       
   262         return self.origin_req_host
       
   263 
       
   264     def is_unverifiable(self):
       
   265         return self.unverifiable
       
   266 
       
   267     def add_header(self, key, val):
       
   268         # useful for something like authentication
       
   269         self.headers[key.capitalize()] = val
       
   270 
       
   271     def add_unredirected_header(self, key, val):
       
   272         # will not be added to a redirected request
       
   273         self.unredirected_hdrs[key.capitalize()] = val
       
   274 
       
   275     def has_header(self, header_name):
       
   276         return (header_name in self.headers or
       
   277                 header_name in self.unredirected_hdrs)
       
   278 
       
   279     def get_header(self, header_name, default=None):
       
   280         return self.headers.get(
       
   281             header_name,
       
   282             self.unredirected_hdrs.get(header_name, default))
       
   283 
       
   284     def header_items(self):
       
   285         hdrs = self.unredirected_hdrs.copy()
       
   286         hdrs.update(self.headers)
       
   287         return hdrs.items()
       
   288 
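# Illustrative sketch (hypothetical helper, placeholder URL and header
# values) exercising the Request accessors defined above.
def _example_request_accessors():
    req = Request('http://www.example.com/path?q=1',
                  headers={'User-Agent': 'example/1.0'})
    req.add_unredirected_header('Authorization', 'Basic placeholder')
    assert req.get_method() == 'GET'            # no data, so GET
    assert req.get_type() == 'http'             # scheme taken from the URL
    assert req.get_host() == 'www.example.com'
    assert req.get_selector() == '/path?q=1'    # what is sent to the server
    return req.header_items()                   # merged view of both dicts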
       
   289 class OpenerDirector:
       
   290     def __init__(self):
       
   291         client_version = "Python-urllib/%s" % __version__
       
   292         self.addheaders = [('User-agent', client_version)]
       
   293         # manage the individual handlers
       
   294         self.handlers = []
       
   295         self.handle_open = {}
       
   296         self.handle_error = {}
       
   297         self.process_response = {}
       
   298         self.process_request = {}
       
   299 
       
   300     def add_handler(self, handler):
       
   301         if not hasattr(handler, "add_parent"):
       
   302             raise TypeError("expected BaseHandler instance, got %r" %
       
   303                             type(handler))
       
   304 
       
   305         added = False
       
   306         for meth in dir(handler):
       
   307             if meth in ["redirect_request", "do_open", "proxy_open"]:
       
   308                 # oops, coincidental match
       
   309                 continue
       
   310 
       
   311             i = meth.find("_")
       
   312             protocol = meth[:i]
       
   313             condition = meth[i+1:]
       
   314 
       
   315             if condition.startswith("error"):
       
   316                 j = condition.find("_") + i + 1
       
   317                 kind = meth[j+1:]
       
   318                 try:
       
   319                     kind = int(kind)
       
   320                 except ValueError:
       
   321                     pass
       
   322                 lookup = self.handle_error.get(protocol, {})
       
   323                 self.handle_error[protocol] = lookup
       
   324             elif condition == "open":
       
   325                 kind = protocol
       
   326                 lookup = self.handle_open
       
   327             elif condition == "response":
       
   328                 kind = protocol
       
   329                 lookup = self.process_response
       
   330             elif condition == "request":
       
   331                 kind = protocol
       
   332                 lookup = self.process_request
       
   333             else:
       
   334                 continue
       
   335 
       
   336             handlers = lookup.setdefault(kind, [])
       
   337             if handlers:
       
   338                 bisect.insort(handlers, handler)
       
   339             else:
       
   340                 handlers.append(handler)
       
   341             added = True
       
   342 
       
   343         if added:
       
    344             # the handlers must work in a specific order, the order
       
    345             # is specified by each handler's handler_order attribute
       
   346             bisect.insort(self.handlers, handler)
       
   347             handler.add_parent(self)
       
   348 
       
   349     def close(self):
       
   350         # Only exists for backwards compatibility.
       
   351         pass
       
   352 
       
   353     def _call_chain(self, chain, kind, meth_name, *args):
       
   354         # Handlers raise an exception if no one else should try to handle
       
   355         # the request, or return None if they can't but another handler
       
   356         # could.  Otherwise, they return the response.
       
   357         handlers = chain.get(kind, ())
       
   358         for handler in handlers:
       
   359             func = getattr(handler, meth_name)
       
   360 
       
   361             result = func(*args)
       
   362             if result is not None:
       
   363                 return result
       
   364 
       
   365     def open(self, fullurl, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
       
   366         # accept a URL or a Request object
       
   367         if isinstance(fullurl, basestring):
       
   368             req = Request(fullurl, data)
       
   369         else:
       
   370             req = fullurl
       
   371             if data is not None:
       
   372                 req.add_data(data)
       
   373 
       
   374         req.timeout = timeout
       
   375         protocol = req.get_type()
       
   376 
       
   377         # pre-process request
       
   378         meth_name = protocol+"_request"
       
   379         for processor in self.process_request.get(protocol, []):
       
   380             meth = getattr(processor, meth_name)
       
   381             req = meth(req)
       
   382 
       
   383         response = self._open(req, data)
       
   384 
       
   385         # post-process response
       
   386         meth_name = protocol+"_response"
       
   387         for processor in self.process_response.get(protocol, []):
       
   388             meth = getattr(processor, meth_name)
       
   389             response = meth(req, response)
       
   390 
       
   391         return response
       
   392 
       
   393     def _open(self, req, data=None):
       
   394         result = self._call_chain(self.handle_open, 'default',
       
   395                                   'default_open', req)
       
   396         if result:
       
   397             return result
       
   398 
       
   399         protocol = req.get_type()
       
   400         result = self._call_chain(self.handle_open, protocol, protocol +
       
   401                                   '_open', req)
       
   402         if result:
       
   403             return result
       
   404 
       
   405         return self._call_chain(self.handle_open, 'unknown',
       
   406                                 'unknown_open', req)
       
   407 
       
   408     def error(self, proto, *args):
       
   409         if proto in ('http', 'https'):
       
   410             # XXX http[s] protocols are special-cased
       
    411             dict = self.handle_error['http'] # https is no different from http
       
   412             proto = args[2]  # YUCK!
       
   413             meth_name = 'http_error_%s' % proto
       
   414             http_err = 1
       
   415             orig_args = args
       
   416         else:
       
   417             dict = self.handle_error
       
   418             meth_name = proto + '_error'
       
   419             http_err = 0
       
   420         args = (dict, proto, meth_name) + args
       
   421         result = self._call_chain(*args)
       
   422         if result:
       
   423             return result
       
   424 
       
   425         if http_err:
       
   426             args = (dict, 'default', 'http_error_default') + orig_args
       
   427             return self._call_chain(*args)
       
   428 
       
   429 # XXX probably also want an abstract factory that knows when it makes
       
   430 # sense to skip a superclass in favor of a subclass and when it might
       
   431 # make sense to include both
       
   432 
       
   433 def build_opener(*handlers):
       
   434     """Create an opener object from a list of handlers.
       
   435 
       
   436     The opener will use several default handlers, including support
       
   437     for HTTP and FTP.
       
   438 
       
   439     If any of the handlers passed as arguments are subclasses of the
       
   440     default handlers, the default handlers will not be used.
       
   441     """
       
   442     import types
       
   443     def isclass(obj):
       
   444         return isinstance(obj, types.ClassType) or hasattr(obj, "__bases__")
       
   445 
       
   446     opener = OpenerDirector()
       
   447     default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
       
   448                        HTTPDefaultErrorHandler, HTTPRedirectHandler,
       
   449                        FTPHandler, FileHandler, HTTPErrorProcessor]
       
   450     if hasattr(httplib, 'HTTPS'):
       
   451         default_classes.append(HTTPSHandler)
       
   452     skip = set()
       
   453     for klass in default_classes:
       
   454         for check in handlers:
       
   455             if isclass(check):
       
   456                 if issubclass(check, klass):
       
   457                     skip.add(klass)
       
   458             elif isinstance(check, klass):
       
   459                 skip.add(klass)
       
   460     for klass in skip:
       
   461         default_classes.remove(klass)
       
   462 
       
   463     for klass in default_classes:
       
   464         opener.add_handler(klass())
       
   465 
       
   466     for h in handlers:
       
   467         if isclass(h):
       
   468             h = h()
       
   469         opener.add_handler(h)
       
   470     return opener
       
   471 
       
   472 class BaseHandler:
       
   473     handler_order = 500
       
   474 
       
   475     def add_parent(self, parent):
       
   476         self.parent = parent
       
   477 
       
   478     def close(self):
       
   479         # Only exists for backwards compatibility
       
   480         pass
       
   481 
       
   482     def __lt__(self, other):
       
   483         if not hasattr(other, "handler_order"):
       
   484             # Try to preserve the old behavior of having custom classes
       
   485             # inserted after default ones (works only for custom user
       
   486             # classes which are not aware of handler_order).
       
   487             return True
       
   488         return self.handler_order < other.handler_order
       
   489 
       
   490 
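# Illustrative sketch (hypothetical class and header name): add_handler()
# above discovers what a handler can do purely from its method names
# ("<protocol>_open", "<protocol>_request", "<protocol>_response",
# "<protocol>_error_<code>"), and handler_order together with __lt__
# decides where it sorts in the chain.  A processor that stamps a header
# on every outgoing HTTP(S) request could look like this:
class _ExampleHeaderProcessor(BaseHandler):
    handler_order = 400                 # sorts before the defaults (500)

    def http_request(self, request):
        request.add_header('X-Example', 'demo')
        return request

    https_request = http_request
# e.g. build_opener(_ExampleHeaderProcessor()) registers it in the "http"
# and "https" request-processing chains.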
       
   491 class HTTPErrorProcessor(BaseHandler):
       
   492     """Process HTTP error responses."""
       
   493     handler_order = 1000  # after all other processing
       
   494 
       
   495     def http_response(self, request, response):
       
   496         code, msg, hdrs = response.code, response.msg, response.info()
       
   497 
       
   498         # According to RFC 2616, "2xx" code indicates that the client's
       
   499         # request was successfully received, understood, and accepted.
       
   500         if not (200 <= code < 300):
       
   501             response = self.parent.error(
       
   502                 'http', request, response, code, msg, hdrs)
       
   503 
       
   504         return response
       
   505 
       
   506     https_response = http_response
       
   507 
       
   508 class HTTPDefaultErrorHandler(BaseHandler):
       
   509     def http_error_default(self, req, fp, code, msg, hdrs):
       
   510         raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
       
   511 
       
   512 class HTTPRedirectHandler(BaseHandler):
       
   513     # maximum number of redirections to any single URL
       
   514     # this is needed because of the state that cookies introduce
       
   515     max_repeats = 4
       
   516     # maximum total number of redirections (regardless of URL) before
       
   517     # assuming we're in a loop
       
   518     max_redirections = 10
       
   519 
       
   520     def redirect_request(self, req, fp, code, msg, headers, newurl):
       
   521         """Return a Request or None in response to a redirect.
       
   522 
       
   523         This is called by the http_error_30x methods when a
       
   524         redirection response is received.  If a redirection should
       
   525         take place, return a new Request to allow http_error_30x to
       
   526         perform the redirect.  Otherwise, raise HTTPError if no-one
       
   527         else should try to handle this url.  Return None if you can't
       
   528         but another Handler might.
       
   529         """
       
   530         m = req.get_method()
       
   531         if (code in (301, 302, 303, 307) and m in ("GET", "HEAD")
       
   532             or code in (301, 302, 303) and m == "POST"):
       
   533             # Strictly (according to RFC 2616), 301 or 302 in response
       
   534             # to a POST MUST NOT cause a redirection without confirmation
       
   535             # from the user (of urllib2, in this case).  In practice,
       
   536             # essentially all clients do redirect in this case, so we
       
   537             # do the same.
       
    538             # be tolerant of URIs containing a space
       
   539             newurl = newurl.replace(' ', '%20')
       
   540             newheaders = dict((k,v) for k,v in req.headers.items()
       
   541                               if k.lower() not in ("content-length", "content-type")
       
   542                              )
       
   543             return Request(newurl,
       
   544                            headers=newheaders,
       
   545                            origin_req_host=req.get_origin_req_host(),
       
   546                            unverifiable=True)
       
   547         else:
       
   548             raise HTTPError(req.get_full_url(), code, msg, headers, fp)
       
   549 
       
   550     # Implementation note: To avoid the server sending us into an
       
   551     # infinite loop, the request object needs to track what URLs we
       
   552     # have already seen.  Do this by adding a handler-specific
       
   553     # attribute to the Request object.
       
   554     def http_error_302(self, req, fp, code, msg, headers):
       
   555         # Some servers (incorrectly) return multiple Location headers
       
   556         # (so probably same goes for URI).  Use first header.
       
   557         if 'location' in headers:
       
   558             newurl = headers.getheaders('location')[0]
       
   559         elif 'uri' in headers:
       
   560             newurl = headers.getheaders('uri')[0]
       
   561         else:
       
   562             return
       
   563 
       
    564         # fix a possibly malformed URL
       
   565         urlparts = urlparse.urlparse(newurl)
       
   566         if not urlparts.path:
       
   567             urlparts = list(urlparts)
       
   568             urlparts[2] = "/"
       
   569         newurl = urlparse.urlunparse(urlparts)
       
   570 
       
   571         newurl = urlparse.urljoin(req.get_full_url(), newurl)
       
   572 
       
   573         # XXX Probably want to forget about the state of the current
       
   574         # request, although that might interact poorly with other
       
   575         # handlers that also use handler-specific request attributes
       
   576         new = self.redirect_request(req, fp, code, msg, headers, newurl)
       
   577         if new is None:
       
   578             return
       
   579 
       
   580         # loop detection
       
   581         # .redirect_dict has a key url if url was previously visited.
       
   582         if hasattr(req, 'redirect_dict'):
       
   583             visited = new.redirect_dict = req.redirect_dict
       
   584             if (visited.get(newurl, 0) >= self.max_repeats or
       
   585                 len(visited) >= self.max_redirections):
       
   586                 raise HTTPError(req.get_full_url(), code,
       
   587                                 self.inf_msg + msg, headers, fp)
       
   588         else:
       
   589             visited = new.redirect_dict = req.redirect_dict = {}
       
   590         visited[newurl] = visited.get(newurl, 0) + 1
       
   591 
       
   592         # Don't close the fp until we are sure that we won't use it
       
   593         # with HTTPError.
       
   594         fp.read()
       
   595         fp.close()
       
   596 
       
   597         return self.parent.open(new)
       
   598 
       
   599     http_error_301 = http_error_303 = http_error_307 = http_error_302
       
   600 
       
   601     inf_msg = "The HTTP server returned a redirect error that would " \
       
   602               "lead to an infinite loop.\n" \
       
   603               "The last 30x error message was:\n"
       
   604 
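# Illustrative sketch (hypothetical subclass): redirect_request() above is
# the intended override point.  Raising the redirect status as an HTTPError
# refuses to follow redirects; passing an instance to build_opener() would
# replace the default HTTPRedirectHandler, since it is a subclass of it.
class _ExampleNoRedirectHandler(HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        raise HTTPError(req.get_full_url(), code, msg, headers, fp)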
       
   605 
       
   606 def _parse_proxy(proxy):
       
   607     """Return (scheme, user, password, host/port) given a URL or an authority.
       
   608 
       
   609     If a URL is supplied, it must have an authority (host:port) component.
       
   610     According to RFC 3986, having an authority component means the URL must
       
   611     have two slashes after the scheme:
       
   612 
       
   613     >>> _parse_proxy('file:/ftp.example.com/')
       
   614     Traceback (most recent call last):
       
   615     ValueError: proxy URL with no authority: 'file:/ftp.example.com/'
       
   616 
       
   617     The first three items of the returned tuple may be None.
       
   618 
       
   619     Examples of authority parsing:
       
   620 
       
   621     >>> _parse_proxy('proxy.example.com')
       
   622     (None, None, None, 'proxy.example.com')
       
   623     >>> _parse_proxy('proxy.example.com:3128')
       
   624     (None, None, None, 'proxy.example.com:3128')
       
   625 
       
   626     The authority component may optionally include userinfo (assumed to be
       
   627     username:password):
       
   628 
       
   629     >>> _parse_proxy('joe:password@proxy.example.com')
       
   630     (None, 'joe', 'password', 'proxy.example.com')
       
   631     >>> _parse_proxy('joe:password@proxy.example.com:3128')
       
   632     (None, 'joe', 'password', 'proxy.example.com:3128')
       
   633 
       
   634     Same examples, but with URLs instead:
       
   635 
       
   636     >>> _parse_proxy('http://proxy.example.com/')
       
   637     ('http', None, None, 'proxy.example.com')
       
   638     >>> _parse_proxy('http://proxy.example.com:3128/')
       
   639     ('http', None, None, 'proxy.example.com:3128')
       
   640     >>> _parse_proxy('http://joe:password@proxy.example.com/')
       
   641     ('http', 'joe', 'password', 'proxy.example.com')
       
   642     >>> _parse_proxy('http://joe:password@proxy.example.com:3128')
       
   643     ('http', 'joe', 'password', 'proxy.example.com:3128')
       
   644 
       
   645     Everything after the authority is ignored:
       
   646 
       
   647     >>> _parse_proxy('ftp://joe:password@proxy.example.com/rubbish:3128')
       
   648     ('ftp', 'joe', 'password', 'proxy.example.com')
       
   649 
       
   650     Test for no trailing '/' case:
       
   651 
       
   652     >>> _parse_proxy('http://joe:password@proxy.example.com')
       
   653     ('http', 'joe', 'password', 'proxy.example.com')
       
   654 
       
   655     """
       
   656     scheme, r_scheme = splittype(proxy)
       
   657     if not r_scheme.startswith("/"):
       
   658         # authority
       
   659         scheme = None
       
   660         authority = proxy
       
   661     else:
       
   662         # URL
       
   663         if not r_scheme.startswith("//"):
       
   664             raise ValueError("proxy URL with no authority: %r" % proxy)
       
   665         # We have an authority, so for RFC 3986-compliant URLs (by ss 3.
       
   666         # and 3.3.), path is empty or starts with '/'
       
   667         end = r_scheme.find("/", 2)
       
   668         if end == -1:
       
   669             end = None
       
   670         authority = r_scheme[2:end]
       
   671     userinfo, hostport = splituser(authority)
       
   672     if userinfo is not None:
       
   673         user, password = splitpasswd(userinfo)
       
   674     else:
       
   675         user = password = None
       
   676     return scheme, user, password, hostport
       
   677 
       
   678 class ProxyHandler(BaseHandler):
       
   679     # Proxies must be in front
       
   680     handler_order = 100
       
   681 
       
   682     def __init__(self, proxies=None):
       
   683         if proxies is None:
       
   684             proxies = getproxies()
       
   685         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
       
   686         self.proxies = proxies
       
   687         for type, url in proxies.items():
       
   688             setattr(self, '%s_open' % type,
       
   689                     lambda r, proxy=url, type=type, meth=self.proxy_open: \
       
   690                     meth(r, proxy, type))
       
   691 
       
   692     def proxy_open(self, req, proxy, type):
       
   693         orig_type = req.get_type()
       
   694         proxy_type, user, password, hostport = _parse_proxy(proxy)
       
   695         if proxy_type is None:
       
   696             proxy_type = orig_type
       
   697         if user and password:
       
   698             user_pass = '%s:%s' % (unquote(user), unquote(password))
       
   699             creds = base64.b64encode(user_pass).strip()
       
   700             req.add_header('Proxy-authorization', 'Basic ' + creds)
       
   701         hostport = unquote(hostport)
       
   702         req.set_proxy(hostport, proxy_type)
       
   703         if orig_type == proxy_type:
       
   704             # let other handlers take care of it
       
   705             return None
       
   706         else:
       
   707             # need to start over, because the other handlers don't
       
   708             # grok the proxy's URL type
       
   709             # e.g. if we have a constructor arg proxies like so:
       
   710             # {'http': 'ftp://proxy.example.com'}, we may end up turning
       
   711             # a request for http://acme.example.com/a into one for
       
   712             # ftp://proxy.example.com/a
       
   713             return self.parent.open(req)
       
   714 
       
   715 class HTTPPasswordMgr:
       
   716 
       
   717     def __init__(self):
       
   718         self.passwd = {}
       
   719 
       
   720     def add_password(self, realm, uri, user, passwd):
       
   721         # uri could be a single URI or a sequence
       
   722         if isinstance(uri, basestring):
       
   723             uri = [uri]
       
    724         if realm not in self.passwd:
       
   725             self.passwd[realm] = {}
       
   726         for default_port in True, False:
       
   727             reduced_uri = tuple(
       
   728                 [self.reduce_uri(u, default_port) for u in uri])
       
   729             self.passwd[realm][reduced_uri] = (user, passwd)
       
   730 
       
   731     def find_user_password(self, realm, authuri):
       
   732         domains = self.passwd.get(realm, {})
       
   733         for default_port in True, False:
       
   734             reduced_authuri = self.reduce_uri(authuri, default_port)
       
   735             for uris, authinfo in domains.iteritems():
       
   736                 for uri in uris:
       
   737                     if self.is_suburi(uri, reduced_authuri):
       
   738                         return authinfo
       
   739         return None, None
       
   740 
       
   741     def reduce_uri(self, uri, default_port=True):
       
   742         """Accept authority or URI and extract only the authority and path."""
       
   743         # note HTTP URLs do not have a userinfo component
       
   744         parts = urlparse.urlsplit(uri)
       
   745         if parts[1]:
       
   746             # URI
       
   747             scheme = parts[0]
       
   748             authority = parts[1]
       
   749             path = parts[2] or '/'
       
   750         else:
       
   751             # host or host:port
       
   752             scheme = None
       
   753             authority = uri
       
   754             path = '/'
       
   755         host, port = splitport(authority)
       
   756         if default_port and port is None and scheme is not None:
       
   757             dport = {"http": 80,
       
   758                      "https": 443,
       
   759                      }.get(scheme)
       
   760             if dport is not None:
       
   761                 authority = "%s:%d" % (host, dport)
       
   762         return authority, path
       
   763 
       
   764     def is_suburi(self, base, test):
       
   765         """Check if test is below base in a URI tree
       
   766 
       
   767         Both args must be URIs in reduced form.
       
   768         """
       
   769         if base == test:
       
   770             return True
       
   771         if base[0] != test[0]:
       
   772             return False
       
   773         common = posixpath.commonprefix((base[1], test[1]))
       
   774         if len(common) == len(base[1]):
       
   775             return True
       
   776         return False
       
   777 
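# Illustrative sketch (hypothetical realm, URIs and credentials): passwords
# registered for a URI prefix are found again for any URI below that prefix,
# thanks to reduce_uri() and is_suburi() above.
def _example_password_lookup():
    mgr = HTTPPasswordMgr()
    mgr.add_password('Example Realm', 'http://www.example.com/private/',
                     'alice', 'secret')
    # match: same authority, requested path lies under the stored prefix
    user, pw = mgr.find_user_password(
        'Example Realm', 'http://www.example.com/private/report')
    assert (user, pw) == ('alice', 'secret')
    # no match: unknown realm
    assert mgr.find_user_password(
        'Other Realm', 'http://www.example.com/') == (None, None)
    return mgr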
       
   778 
       
   779 class HTTPPasswordMgrWithDefaultRealm(HTTPPasswordMgr):
       
   780 
       
   781     def find_user_password(self, realm, authuri):
       
   782         user, password = HTTPPasswordMgr.find_user_password(self, realm,
       
   783                                                             authuri)
       
   784         if user is not None:
       
   785             return user, password
       
   786         return HTTPPasswordMgr.find_user_password(self, None, authuri)
       
   787 
       
   788 
       
   789 class AbstractBasicAuthHandler:
       
   790 
       
   791     # XXX this allows for multiple auth-schemes, but will stupidly pick
       
   792     # the last one with a realm specified.
       
   793 
       
   794     # allow for double- and single-quoted realm values
       
   795     # (single quotes are a violation of the RFC, but appear in the wild)
       
   796     rx = re.compile('(?:.*,)*[ \t]*([^ \t]+)[ \t]+'
       
   797                     'realm=(["\'])(.*?)\\2', re.I)
       
   798 
       
   799     # XXX could pre-emptively send auth info already accepted (RFC 2617,
       
   800     # end of section 2, and section 1.2 immediately after "credentials"
       
   801     # production).
       
   802 
       
   803     def __init__(self, password_mgr=None):
       
   804         if password_mgr is None:
       
   805             password_mgr = HTTPPasswordMgr()
       
   806         self.passwd = password_mgr
       
   807         self.add_password = self.passwd.add_password
       
   808 
       
   809     def http_error_auth_reqed(self, authreq, host, req, headers):
       
   810         # host may be an authority (without userinfo) or a URL with an
       
   811         # authority
       
   812         # XXX could be multiple headers
       
   813         authreq = headers.get(authreq, None)
       
   814         if authreq:
       
   815             mo = AbstractBasicAuthHandler.rx.search(authreq)
       
   816             if mo:
       
   817                 scheme, quote, realm = mo.groups()
       
   818                 if scheme.lower() == 'basic':
       
   819                     return self.retry_http_basic_auth(host, req, realm)
       
   820 
       
   821     def retry_http_basic_auth(self, host, req, realm):
       
   822         user, pw = self.passwd.find_user_password(realm, host)
       
   823         if pw is not None:
       
   824             raw = "%s:%s" % (user, pw)
       
   825             auth = 'Basic %s' % base64.b64encode(raw).strip()
       
   826             if req.headers.get(self.auth_header, None) == auth:
       
   827                 return None
       
   828             req.add_header(self.auth_header, auth)
       
   829             return self.parent.open(req)
       
   830         else:
       
   831             return None
       
   832 
       
   833 
       
   834 class HTTPBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
       
   835 
       
   836     auth_header = 'Authorization'
       
   837 
       
   838     def http_error_401(self, req, fp, code, msg, headers):
       
   839         url = req.get_full_url()
       
   840         return self.http_error_auth_reqed('www-authenticate',
       
   841                                           url, req, headers)
       
   842 
       
   843 
       
   844 class ProxyBasicAuthHandler(AbstractBasicAuthHandler, BaseHandler):
       
   845 
       
   846     auth_header = 'Proxy-authorization'
       
   847 
       
   848     def http_error_407(self, req, fp, code, msg, headers):
       
   849         # http_error_auth_reqed requires that there is no userinfo component in
       
   850         # authority.  Assume there isn't one, since urllib2 does not (and
       
   851         # should not, RFC 3986 s. 3.2.1) support requests for URLs containing
       
   852         # userinfo.
       
   853         authority = req.get_host()
       
   854         return self.http_error_auth_reqed('proxy-authenticate',
       
   855                                           authority, req, headers)
       
   856 
       
   857 
       
   858 def randombytes(n):
       
   859     """Return n random bytes."""
       
   860     # Use /dev/urandom if it is available.  Fall back to random module
       
   861     # if not.  It might be worthwhile to extend this function to use
       
   862     # other platform-specific mechanisms for getting random bytes.
       
   863     if os.path.exists("/dev/urandom"):
       
   864         f = open("/dev/urandom")
       
   865         s = f.read(n)
       
   866         f.close()
       
   867         return s
       
   868     else:
       
   869         L = [chr(random.randrange(0, 256)) for i in range(n)]
       
   870         return "".join(L)
       
   871 
       
   872 class AbstractDigestAuthHandler:
       
   873     # Digest authentication is specified in RFC 2617.
       
   874 
       
   875     # XXX The client does not inspect the Authentication-Info header
       
   876     # in a successful response.
       
   877 
       
   878     # XXX It should be possible to test this implementation against
       
   879     # a mock server that just generates a static set of challenges.
       
   880 
       
    881     # XXX qop="auth-int" support is shaky
       
   882 
       
   883     def __init__(self, passwd=None):
       
   884         if passwd is None:
       
   885             passwd = HTTPPasswordMgr()
       
   886         self.passwd = passwd
       
   887         self.add_password = self.passwd.add_password
       
   888         self.retried = 0
       
   889         self.nonce_count = 0
       
   890 
       
   891     def reset_retry_count(self):
       
   892         self.retried = 0
       
   893 
       
   894     def http_error_auth_reqed(self, auth_header, host, req, headers):
       
   895         authreq = headers.get(auth_header, None)
       
   896         if self.retried > 5:
       
   897             # Don't fail endlessly - if we failed once, we'll probably
       
   898             # fail a second time. Hm. Unless the Password Manager is
       
   899             # prompting for the information. Crap. This isn't great
       
   900             # but it's better than the current 'repeat until recursion
       
   901             # depth exceeded' approach <wink>
       
   902             raise HTTPError(req.get_full_url(), 401, "digest auth failed",
       
   903                             headers, None)
       
   904         else:
       
   905             self.retried += 1
       
   906         if authreq:
       
   907             scheme = authreq.split()[0]
       
   908             if scheme.lower() == 'digest':
       
   909                 return self.retry_http_digest_auth(req, authreq)
       
   910 
       
   911     def retry_http_digest_auth(self, req, auth):
       
   912         token, challenge = auth.split(' ', 1)
       
   913         chal = parse_keqv_list(parse_http_list(challenge))
       
   914         auth = self.get_authorization(req, chal)
       
   915         if auth:
       
   916             auth_val = 'Digest %s' % auth
       
   917             if req.headers.get(self.auth_header, None) == auth_val:
       
   918                 return None
       
   919             req.add_unredirected_header(self.auth_header, auth_val)
       
   920             resp = self.parent.open(req)
       
   921             return resp
       
   922 
       
   923     def get_cnonce(self, nonce):
       
   924         # The cnonce-value is an opaque
       
   925         # quoted string value provided by the client and used by both client
       
   926         # and server to avoid chosen plaintext attacks, to provide mutual
       
   927         # authentication, and to provide some message integrity protection.
       
   928         # This isn't a fabulous effort, but it's probably Good Enough.
       
   929         dig = hashlib.sha1("%s:%s:%s:%s" % (self.nonce_count, nonce, time.ctime(),
       
   930                                             randombytes(8))).hexdigest()
       
   931         return dig[:16]
       
   932 
       
   933     def get_authorization(self, req, chal):
       
   934         try:
       
   935             realm = chal['realm']
       
   936             nonce = chal['nonce']
       
   937             qop = chal.get('qop')
       
   938             algorithm = chal.get('algorithm', 'MD5')
       
   939             # mod_digest doesn't send an opaque, even though it isn't
       
   940             # supposed to be optional
       
   941             opaque = chal.get('opaque', None)
       
   942         except KeyError:
       
   943             return None
       
   944 
       
   945         H, KD = self.get_algorithm_impls(algorithm)
       
   946         if H is None:
       
   947             return None
       
   948 
       
   949         user, pw = self.passwd.find_user_password(realm, req.get_full_url())
       
   950         if user is None:
       
   951             return None
       
   952 
       
   953         # XXX not implemented yet
       
   954         if req.has_data():
       
   955             entdig = self.get_entity_digest(req.get_data(), chal)
       
   956         else:
       
   957             entdig = None
       
   958 
       
   959         A1 = "%s:%s:%s" % (user, realm, pw)
       
   960         A2 = "%s:%s" % (req.get_method(),
       
   961                         # XXX selector: what about proxies and full urls
       
   962                         req.get_selector())
       
   963         if qop == 'auth':
       
   964             self.nonce_count += 1
       
   965             ncvalue = '%08x' % self.nonce_count
       
   966             cnonce = self.get_cnonce(nonce)
       
   967             noncebit = "%s:%s:%s:%s:%s" % (nonce, ncvalue, cnonce, qop, H(A2))
       
   968             respdig = KD(H(A1), noncebit)
       
   969         elif qop is None:
       
   970             respdig = KD(H(A1), "%s:%s" % (nonce, H(A2)))
       
   971         else:
       
   972             # XXX handle auth-int.
       
   973             raise URLError("qop '%s' is not supported." % qop)
       
   974 
       
   975         # XXX should the partial digests be encoded too?
       
   976 
       
   977         base = 'username="%s", realm="%s", nonce="%s", uri="%s", ' \
       
   978                'response="%s"' % (user, realm, nonce, req.get_selector(),
       
   979                                   respdig)
       
   980         if opaque:
       
   981             base += ', opaque="%s"' % opaque
       
   982         if entdig:
       
   983             base += ', digest="%s"' % entdig
       
   984         base += ', algorithm="%s"' % algorithm
       
   985         if qop:
       
   986             base += ', qop=auth, nc=%s, cnonce="%s"' % (ncvalue, cnonce)
       
   987         return base
       
   988 
       
   989     def get_algorithm_impls(self, algorithm):
       
   990         # algorithm should be case-insensitive according to RFC2617
       
   991         algorithm = algorithm.upper()
       
   992         # lambdas assume digest modules are imported at the top level
       
   993         if algorithm == 'MD5':
       
   994             H = lambda x: hashlib.md5(x).hexdigest()
       
   995         elif algorithm == 'SHA':
       
   996             H = lambda x: hashlib.sha1(x).hexdigest()
       
   997         # XXX MD5-sess
       
   998         KD = lambda s, d: H("%s:%s" % (s, d))
       
   999         return H, KD
       
  1000 
       
  1001     def get_entity_digest(self, data, chal):
       
  1002         # XXX not implemented yet
       
  1003         return None
       
  1004 
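# Illustrative sketch (placeholder credentials and nonce): the H/KD pair
# returned by get_algorithm_impls() is combined exactly as in RFC 2617,
# i.e. response = KD(H(A1), nonce ":" H(A2)) when no qop was offered.
def _example_digest_response():
    H, KD = AbstractDigestAuthHandler().get_algorithm_impls('MD5')
    A1 = 'alice:Example Realm:secret'        # username:realm:password
    A2 = 'GET:/protected/index.html'         # method:digest-uri
    return KD(H(A1), '%s:%s' % ('abc123nonce', H(A2)))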
       
  1005 
       
  1006 class HTTPDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
       
   1007     """An authentication protocol defined by RFC 2069 and updated by RFC 2617
       
  1008 
       
  1009     Digest authentication improves on basic authentication because it
       
  1010     does not transmit passwords in the clear.
       
  1011     """
       
  1012 
       
  1013     auth_header = 'Authorization'
       
  1014     handler_order = 490  # before Basic auth
       
  1015 
       
  1016     def http_error_401(self, req, fp, code, msg, headers):
       
  1017         host = urlparse.urlparse(req.get_full_url())[1]
       
  1018         retry = self.http_error_auth_reqed('www-authenticate',
       
  1019                                            host, req, headers)
       
  1020         self.reset_retry_count()
       
  1021         return retry
       
  1022 
       
  1023 
       
  1024 class ProxyDigestAuthHandler(BaseHandler, AbstractDigestAuthHandler):
       
  1025 
       
  1026     auth_header = 'Proxy-Authorization'
       
  1027     handler_order = 490  # before Basic auth
       
  1028 
       
  1029     def http_error_407(self, req, fp, code, msg, headers):
       
  1030         host = req.get_host()
       
  1031         retry = self.http_error_auth_reqed('proxy-authenticate',
       
  1032                                            host, req, headers)
       
  1033         self.reset_retry_count()
       
  1034         return retry
       
  1035 
       
  1036 class AbstractHTTPHandler(BaseHandler):
       
  1037 
       
  1038     def __init__(self, debuglevel=0):
       
  1039         self._debuglevel = debuglevel
       
  1040 
       
  1041     def set_http_debuglevel(self, level):
       
  1042         self._debuglevel = level
       
  1043 
       
  1044     def do_request_(self, request):
       
  1045         host = request.get_host()
       
  1046         if not host:
       
  1047             raise URLError('no host given')
       
  1048 
       
  1049         if request.has_data():  # POST
       
  1050             data = request.get_data()
       
  1051             if not request.has_header('Content-type'):
       
  1052                 request.add_unredirected_header(
       
  1053                     'Content-type',
       
  1054                     'application/x-www-form-urlencoded')
       
  1055             if not request.has_header('Content-length'):
       
  1056                 request.add_unredirected_header(
       
  1057                     'Content-length', '%d' % len(data))
       
  1058 
       
  1059         sel_host = host
       
  1060         if request.has_proxy():
       
  1061             scheme, sel = splittype(request.get_selector())
       
  1062             sel_host, sel_path = splithost(sel)
       
  1063 
       
  1064         if not request.has_header('Host'):
       
  1065             request.add_unredirected_header('Host', sel_host)
       
  1066         for name, value in self.parent.addheaders:
       
  1067             name = name.capitalize()
       
  1068             if not request.has_header(name):
       
  1069                 request.add_unredirected_header(name, value)
       
  1070 
       
  1071         return request
       
  1072 
       
  1073     def do_open(self, http_class, req):
       
  1074         """Return an addinfourl object for the request, using http_class.
       
  1075 
       
  1076         http_class must implement the HTTPConnection API from httplib.
       
  1077         The addinfourl return value is a file-like object.  It also
       
  1078         has methods and attributes including:
       
  1079             - info(): return a mimetools.Message object for the headers
       
  1080             - geturl(): return the original request URL
       
  1081             - code: HTTP status code
       
  1082         """
       
  1083         host = req.get_host()
       
  1084         if not host:
       
  1085             raise URLError('no host given')
       
  1086 
       
  1087         h = http_class(host, timeout=req.timeout) # will parse host:port
       
  1088         h.set_debuglevel(self._debuglevel)
       
  1089 
       
  1090         headers = dict(req.headers)
       
  1091         headers.update(req.unredirected_hdrs)
       
  1092         # We want to make an HTTP/1.1 request, but the addinfourl
       
  1093         # class isn't prepared to deal with a persistent connection.
       
  1094         # It will try to read all remaining data from the socket,
       
  1095         # which will block while the server waits for the next request.
       
  1096         # So make sure the connection gets closed after the (only)
       
  1097         # request.
       
  1098         headers["Connection"] = "close"
       
  1099         headers = dict(
       
  1100             (name.title(), val) for name, val in headers.items())
       
  1101         try:
       
  1102             h.request(req.get_method(), req.get_selector(), req.data, headers)
       
  1103             r = h.getresponse()
       
  1104         except socket.error, err: # XXX what error?
       
  1105             raise URLError(err)
       
  1106 
       
  1107         # Pick apart the HTTPResponse object to get the addinfourl
       
  1108         # object initialized properly.
       
  1109 
       
  1110         # Wrap the HTTPResponse object in socket's file object adapter
       
  1111         # for Windows.  That adapter calls recv(), so delegate recv()
       
  1112         # to read().  This weird wrapping allows the returned object to
       
  1113         # have readline() and readlines() methods.
       
  1114 
       
  1115         # XXX It might be better to extract the read buffering code
       
  1116         # out of socket._fileobject() and into a base class.
       
  1117 
       
  1118         r.recv = r.read
       
  1119         fp = socket._fileobject(r, close=True)
       
  1120 
       
  1121         resp = addinfourl(fp, r.msg, req.get_full_url())
       
  1122         resp.code = r.status
       
  1123         resp.msg = r.reason
       
  1124         return resp
       
  1125 
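An illustrative sketch (not from the original file) of how the addinfourl described in the docstring above reaches callers through urlopen(); the URL is a placeholder:

    import urllib2

    try:
        resp = urllib2.urlopen('http://www.example.com/', timeout=10)
    except urllib2.URLError, err:
        print 'open failed:', err
    else:
        print resp.code                  # HTTP status code set in do_open()
        print resp.geturl()              # the full request URL
        print resp.info().gettype()      # headers as a mimetools.Message
        body = resp.read()
        resp.close()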
       
  1126 
       
  1127 class HTTPHandler(AbstractHTTPHandler):
       
  1128 
       
  1129     def http_open(self, req):
       
  1130         return self.do_open(httplib.HTTPConnection, req)
       
  1131 
       
  1132     http_request = AbstractHTTPHandler.do_request_
       
  1133 
       
  1134 if hasattr(httplib, 'HTTPS'):
       
  1135     class HTTPSHandler(AbstractHTTPHandler):
       
  1136 
       
  1137         def https_open(self, req):
       
  1138             return self.do_open(httplib.HTTPSConnection, req)
       
  1139 
       
  1140         https_request = AbstractHTTPHandler.do_request_
       
  1141 
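Because HTTPSHandler is only defined when httplib was built with SSL support, portable code may want to test for it first; a hedged sketch (placeholder URL):

    import httplib
    import urllib2

    if hasattr(httplib, 'HTTPS'):
        # SSL support is compiled in, so urllib2 registered HTTPSHandler.
        resp = urllib2.urlopen('https://www.example.com/')
    else:
        # Without SSL, an https:// URL ends up in UnknownHandler and
        # raises URLError('unknown url type: https').
        print 'this Python build cannot open https URLs'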
       
  1142 class HTTPCookieProcessor(BaseHandler):
       
  1143     def __init__(self, cookiejar=None):
       
  1144         import cookielib
       
  1145         if cookiejar is None:
       
  1146             cookiejar = cookielib.CookieJar()
       
  1147         self.cookiejar = cookiejar
       
  1148 
       
  1149     def http_request(self, request):
       
  1150         self.cookiejar.add_cookie_header(request)
       
  1151         return request
       
  1152 
       
  1153     def http_response(self, request, response):
       
  1154         self.cookiejar.extract_cookies(response, request)
       
  1155         return response
       
  1156 
       
  1157     https_request = http_request
       
  1158     https_response = http_response
       
  1159 
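A short usage sketch for the cookie processor above (the URLs are placeholders); the CookieJar is shared by all requests made through the same opener:

    import cookielib
    import urllib2

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

    # Cookies set by the first response are replayed on the second request.
    opener.open('http://www.example.com/login')
    opener.open('http://www.example.com/private')
    for cookie in cj:
        print cookie.name, cookie.value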
       
  1160 class UnknownHandler(BaseHandler):
       
  1161     def unknown_open(self, req):
       
  1162         type = req.get_type()
       
  1163         raise URLError('unknown url type: %s' % type)
       
  1164 
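For illustration, opening a URL whose scheme has no registered handler (the 'foo' scheme below is deliberately made up) ends in UnknownHandler:

    import urllib2

    try:
        urllib2.urlopen('foo://example.org/')
    except urllib2.URLError, err:
        print err.reason      # 'unknown url type: foo'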
       
  1165 def parse_keqv_list(l):
       
  1166     """Parse list of key=value strings where keys are not duplicated."""
       
  1167     parsed = {}
       
  1168     for elt in l:
       
  1169         k, v = elt.split('=', 1)
       
  1170         if v[0] == '"' and v[-1] == '"':
       
  1171             v = v[1:-1]
       
  1172         parsed[k] = v
       
  1173     return parsed
       
  1174 
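This helper is what the digest-auth code uses to turn a parsed challenge into a dict; a small illustration with made-up field values:

    import urllib2

    fields = urllib2.parse_keqv_list(['realm="PDQ"', 'nonce="abc123"', 'qop=auth'])
    # -> {'realm': 'PDQ', 'nonce': 'abc123', 'qop': 'auth'}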
       
  1175 def parse_http_list(s):
       
  1176     """Parse lists as described by RFC 2068 Section 2.
       
  1177 
       
  1178     In particular, parse comma-separated lists where the elements of
       
  1179     the list may include quoted-strings.  A quoted-string could
       
  1180     contain a comma.  A non-quoted string could have quotes in the
       
  1181     middle.  Neither commas nor quotes count if they are escaped.
       
  1182     Only double-quotes count, not single-quotes.
       
  1183     """
       
  1184     res = []
       
  1185     part = ''
       
  1186 
       
  1187     escape = quote = False
       
  1188     for cur in s:
       
  1189         if escape:
       
  1190             part += cur
       
  1191             escape = False
       
  1192             continue
       
  1193         if quote:
       
  1194             if cur == '\\':
       
  1195                 escape = True
       
  1196                 continue
       
  1197             elif cur == '"':
       
  1198                 quote = False
       
  1199             part += cur
       
  1200             continue
       
  1201 
       
  1202         if cur == ',':
       
  1203             res.append(part)
       
  1204             part = ''
       
  1205             continue
       
  1206 
       
  1207         if cur == '"':
       
  1208             quote = True
       
  1209 
       
  1210         part += cur
       
  1211 
       
  1212     # append last part
       
  1213     if part:
       
  1214         res.append(part)
       
  1215 
       
  1216     return [part.strip() for part in res]
       
  1217 
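A quick illustration of the quoting rules described in the docstring above, using a made-up challenge-style string:

    import urllib2

    print urllib2.parse_http_list('Basic realm="a, b", Digest qop=auth')
    # -> ['Basic realm="a, b"', 'Digest qop=auth']
    # The comma inside the quoted string does not split the list.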
       
  1218 class FileHandler(BaseHandler):
       
  1219     # Use local file or FTP depending on form of URL
       
  1220     def file_open(self, req):
       
  1221         url = req.get_selector()
       
  1222         if url[:2] == '//' and url[2:3] != '/':
       
  1223             req.type = 'ftp'
       
  1224             return self.parent.open(req)
       
  1225         else:
       
  1226             return self.open_local_file(req)
       
  1227 
       
  1228     # names for the localhost
       
  1229     names = None
       
  1230     def get_names(self):
       
  1231         if FileHandler.names is None:
       
  1232             try:
       
  1233                 FileHandler.names = (socket.gethostbyname('localhost'),
       
  1234                                     socket.gethostbyname(socket.gethostname()))
       
  1235             except socket.gaierror:
       
  1236                 FileHandler.names = (socket.gethostbyname('localhost'),)
       
  1237         return FileHandler.names
       
  1238 
       
  1239     # not entirely sure what the rules are here
       
  1240     def open_local_file(self, req):
       
  1241         import email.utils
       
  1242         import mimetypes
       
  1243         host = req.get_host()
       
  1244         file = req.get_selector()
       
  1245         localfile = url2pathname(file)
       
  1246         try:
       
  1247             stats = os.stat(localfile)
       
  1248             size = stats.st_size
       
  1249             modified = email.utils.formatdate(stats.st_mtime, usegmt=True)
       
  1250             mtype = mimetypes.guess_type(file)[0]
       
  1251             headers = mimetools.Message(StringIO(
       
  1252                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
       
  1253                 (mtype or 'text/plain', size, modified)))
       
  1254             if host:
       
  1255                 host, port = splitport(host)
       
  1256             if not host or \
       
  1257                 (not port and socket.gethostbyname(host) in self.get_names()):
       
  1258                 return addinfourl(open(localfile, 'rb'),
       
  1259                                   headers, 'file:'+file)
       
  1260         except OSError, msg:
       
  1261             # urllib2 users shouldn't expect OSErrors coming from urlopen()
       
  1262             raise URLError(msg)
       
  1263         raise URLError('file not on local host')
       
  1264 
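A sketch of exercising the handler above end to end: it writes a throwaway file and fetches it back through the file: scheme (the path is whatever tempfile picks):

    import os
    import tempfile
    import urllib
    import urllib2

    path = os.path.join(tempfile.mkdtemp(), 'hello.txt')
    open(path, 'wb').write('hello\n')

    # pathname2url() builds a selector that url2pathname() can reverse.
    resp = urllib2.urlopen('file:' + urllib.pathname2url(path))
    print resp.info().getheader('Content-length')   # '6'
    print resp.read()                               # 'hello\n'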
       
  1265 class FTPHandler(BaseHandler):
       
  1266     def ftp_open(self, req):
       
  1267         import ftplib
       
  1268         import mimetypes
       
  1269         host = req.get_host()
       
  1270         if not host:
       
  1271             raise URLError('ftp error: no host given')
       
  1272         host, port = splitport(host)
       
  1273         if port is None:
       
  1274             port = ftplib.FTP_PORT
       
  1275         else:
       
  1276             port = int(port)
       
  1277 
       
  1278         # username/password handling
       
  1279         user, host = splituser(host)
       
  1280         if user:
       
  1281             user, passwd = splitpasswd(user)
       
  1282         else:
       
  1283             passwd = None
       
  1284         host = unquote(host)
       
  1285         user = unquote(user or '')
       
  1286         passwd = unquote(passwd or '')
       
  1287 
       
  1288         try:
       
  1289             host = socket.gethostbyname(host)
       
  1290         except socket.error, msg:
       
  1291             raise URLError(msg)
       
  1292         path, attrs = splitattr(req.get_selector())
       
  1293         dirs = path.split('/')
       
  1294         dirs = map(unquote, dirs)
       
  1295         dirs, file = dirs[:-1], dirs[-1]
       
  1296         if dirs and not dirs[0]:
       
  1297             dirs = dirs[1:]
       
  1298         try:
       
  1299             fw = self.connect_ftp(user, passwd, host, port, dirs, req.timeout)
       
  1300             type = file and 'I' or 'D'
                                                         # binary ('I') transfer for a file, directory listing ('D') otherwise
       
  1301             for attr in attrs:
       
  1302                 attr, value = splitvalue(attr)
       
  1303                 if attr.lower() == 'type' and \
       
  1304                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
       
  1305                     type = value.upper()
       
  1306             fp, retrlen = fw.retrfile(file, type)
       
  1307             headers = ""
       
  1308             mtype = mimetypes.guess_type(req.get_full_url())[0]
       
  1309             if mtype:
       
  1310                 headers += "Content-type: %s\n" % mtype
       
  1311             if retrlen is not None and retrlen >= 0:
       
  1312                 headers += "Content-length: %d\n" % retrlen
       
  1313             sf = StringIO(headers)
       
  1314             headers = mimetools.Message(sf)
       
  1315             return addinfourl(fp, headers, req.get_full_url())
       
  1316         except ftplib.all_errors, msg:
       
  1317             raise URLError, ('ftp error: %s' % msg), sys.exc_info()[2]
       
  1318 
       
  1319     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
       
  1320         fw = ftpwrapper(user, passwd, host, port, dirs, timeout)
       
  1321 ##        fw.ftp.set_debuglevel(1)
       
  1322         return fw
       
  1323 
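As a usage sketch (placeholder host), the ';type=' attribute parsed in ftp_open() selects the transfer mode explicitly; without it the handler defaults to binary ('I') for files and a directory listing ('D') otherwise:

    import urllib2

    listing = urllib2.urlopen('ftp://ftp.example.org/pub/;type=d').read()
    readme  = urllib2.urlopen('ftp://ftp.example.org/pub/README;type=a').read()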
       
  1324 class CacheFTPHandler(FTPHandler):
       
  1325     # XXX would be nice to have pluggable cache strategies
       
  1326     # XXX this stuff is definitely not thread safe
       
  1327     def __init__(self):
       
  1328         self.cache = {}
       
  1329         self.timeout = {}
       
  1330         self.soonest = 0
       
  1331         self.delay = 60
       
  1332         self.max_conns = 16
       
  1333 
       
  1334     def setTimeout(self, t):
       
  1335         self.delay = t
       
  1336 
       
  1337     def setMaxConns(self, m):
       
  1338         self.max_conns = m
       
  1339 
       
  1340     def connect_ftp(self, user, passwd, host, port, dirs, timeout):
       
  1341         key = user, host, port, '/'.join(dirs), timeout
       
  1342         if key in self.cache:
       
  1343             self.timeout[key] = time.time() + self.delay
       
  1344         else:
       
  1345             self.cache[key] = ftpwrapper(user, passwd, host, port, dirs, timeout)
       
  1346             self.timeout[key] = time.time() + self.delay
       
  1347         self.check_cache()
       
  1348         return self.cache[key]
       
  1349 
       
  1350     def check_cache(self):
       
  1351         # first check for old ones
       
  1352         t = time.time()
       
  1353         if self.soonest <= t:
       
  1354             for k, v in self.timeout.items():
       
  1355                 if v < t:
       
  1356                     self.cache[k].close()
       
  1357                     del self.cache[k]
       
  1358                     del self.timeout[k]
       
  1359         # guard against an empty cache (e.g. every connection expired)
  1359         self.soonest = min(self.timeout.values()) if self.timeout else 0
       
  1360 
       
  1361         # then check the size
       
  1362         if len(self.cache) == self.max_conns:
       
  1363             for k, v in self.timeout.items():
       
  1364                 if v == self.soonest:
       
  1365                     del self.cache[k]
       
  1366                     del self.timeout[k]
       
  1367                     break
       
  1368             self.soonest = min(self.timeout.values()) if self.timeout else 0
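A closing sketch (placeholder host, arbitrary limits) of installing the caching handler so that requests to the same FTP directory reuse one connection:

    import urllib2

    cache_ftp = urllib2.CacheFTPHandler()
    cache_ftp.setTimeout(30)      # keep idle FTP connections for 30 seconds
    cache_ftp.setMaxConns(4)      # cache at most 4 of them

    urllib2.install_opener(urllib2.build_opener(cache_ftp))

    # Both URLs share the cache key (user, host, port, dirs, timeout), so
    # the second open() reuses the connection cached by the first.
    urllib2.urlopen('ftp://ftp.example.org/pub/a.txt').read()
    urllib2.urlopen('ftp://ftp.example.org/pub/b.txt').read()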