python-2.5.2/win32/Lib/urllib.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 """Open an arbitrary URL.
       
     2 
       
     3 See the following document for more info on URLs:
       
     4 "Names and Addresses, URIs, URLs, URNs, URCs", at
       
     5 http://www.w3.org/pub/WWW/Addressing/Overview.html
       
     6 
       
     7 See also the HTTP spec (from which the error codes are derived):
       
     8 "HTTP - Hypertext Transfer Protocol", at
       
     9 http://www.w3.org/pub/WWW/Protocols/
       
    10 
       
    11 Related standards and specs:
       
    12 - RFC1808: the "relative URL" spec. (authoritative status)
       
    13 - RFC1738 - the "URL standard". (authoritative status)
       
    14 - RFC1630 - the "URI spec". (informational status)
       
    15 
       
    16 The object returned by URLopener().open(file) will differ per
       
     17 protocol.  All you know is that it has methods read(), readline(),
       
    18 readlines(), fileno(), close() and info().  The read*(), fileno()
       
    19 and close() methods work like those of open files.
       
    20 The info() method returns a mimetools.Message object which can be
       
    21 used to query various info about the object, if available.
       
    22 (mimetools.Message objects are queried with the getheader() method.)
       
    23 """
       
    24 
       
    25 import string
       
    26 import socket
       
    27 import os
       
    28 import time
       
    29 import sys
       
    30 from urlparse import urljoin as basejoin
       
    31 
       
# Public names exported by "from urllib import *".
__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.17'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size
       
    43 
       
    44 # Helper for non-unix systems
       
    45 if os.name == 'mac':
       
    46     from macurl2path import url2pathname, pathname2url
       
    47 elif os.name == 'nt':
       
    48     from nturl2path import url2pathname, pathname2url
       
    49 elif os.name == 'riscos':
       
    50     from rourl2path import url2pathname, pathname2url
       
    51 else:
       
    52     def url2pathname(pathname):
       
    53         """OS-specific conversion from a relative URL of the 'file' scheme
       
    54         to a file system path; not recommended for general use."""
       
    55         return unquote(pathname)
       
    56 
       
    57     def pathname2url(pathname):
       
    58         """OS-specific conversion from a file system path to a relative URL
       
    59         of the 'file' scheme; not recommended for general use."""
       
    60         return quote(pathname)
       
    61 
       
    62 # This really consists of two pieces:
       
    63 # (1) a class which handles opening of all sorts of URLs
       
    64 #     (plus assorted utilities etc.)
       
    65 # (2) a set of functions for parsing URLs
       
    66 # XXX Should these be separated out into different modules?
       
    67 
       
    68 
       
    69 # Shortcut for basic usage
       
    70 _urlopener = None
       
    71 def urlopen(url, data=None, proxies=None):
       
    72     """urlopen(url [, data]) -> open file-like object"""
       
    73     global _urlopener
       
    74     if proxies is not None:
       
    75         opener = FancyURLopener(proxies=proxies)
       
    76     elif not _urlopener:
       
    77         opener = FancyURLopener()
       
    78         _urlopener = opener
       
    79     else:
       
    80         opener = _urlopener
       
    81     if data is None:
       
    82         return opener.open(url)
       
    83     else:
       
    84         return opener.open(url, data)
       
    85 def urlretrieve(url, filename=None, reporthook=None, data=None):
       
    86     global _urlopener
       
    87     if not _urlopener:
       
    88         _urlopener = FancyURLopener()
       
    89     return _urlopener.retrieve(url, filename, reporthook, data)
       
    90 def urlcleanup():
       
    91     if _urlopener:
       
    92         _urlopener.cleanup()
       
    93 
       
    94 # exception raised when downloaded size does not match content-length
       
    95 class ContentTooShortError(IOError):
       
    96     def __init__(self, message, content):
       
    97         IOError.__init__(self, message)
       
    98         self.content = content
       
    99 
       
   100 ftpcache = {}
       
   101 class URLopener:
       
   102     """Class to open URLs.
       
   103     This is a class rather than just a subroutine because we may need
       
   104     more than one set of global protocol-specific options.
       
   105     Note -- this is a base class for those who don't want the
       
   106     automatic handling of errors type 302 (relocated) and 401
       
   107     (authorization needed)."""
       
   108 
       
   109     __tempfiles = None
       
   110 
       
   111     version = "Python-urllib/%s" % __version__
       
   112 
       
   113     # Constructor
       
   114     def __init__(self, proxies=None, **x509):
       
   115         if proxies is None:
       
   116             proxies = getproxies()
       
   117         assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
       
   118         self.proxies = proxies
       
   119         self.key_file = x509.get('key_file')
       
   120         self.cert_file = x509.get('cert_file')
       
   121         self.addheaders = [('User-Agent', self.version)]
       
   122         self.__tempfiles = []
       
   123         self.__unlink = os.unlink # See cleanup()
       
   124         self.tempcache = None
       
   125         # Undocumented feature: if you assign {} to tempcache,
       
   126         # it is used to cache files retrieved with
       
   127         # self.retrieve().  This is not enabled by default
       
   128         # since it does not work for changing documents (and I
       
   129         # haven't got the logic to check expiration headers
       
   130         # yet).
       
   131         self.ftpcache = ftpcache
       
   132         # Undocumented feature: you can use a different
       
   133         # ftp cache by assigning to the .ftpcache member;
       
   134         # in case you want logically independent URL openers
       
   135         # XXX This is not threadsafe.  Bah.
       
   136 
       
   137     def __del__(self):
       
   138         self.close()
       
   139 
       
   140     def close(self):
       
   141         self.cleanup()
       
   142 
       
   143     def cleanup(self):
       
   144         # This code sometimes runs when the rest of this module
       
   145         # has already been deleted, so it can't use any globals
       
   146         # or import anything.
       
   147         if self.__tempfiles:
       
   148             for file in self.__tempfiles:
       
   149                 try:
       
   150                     self.__unlink(file)
       
   151                 except OSError:
       
   152                     pass
       
   153             del self.__tempfiles[:]
       
   154         if self.tempcache:
       
   155             self.tempcache.clear()
       
   156 
       
   157     def addheader(self, *args):
       
   158         """Add a header to be used by the HTTP interface only
       
   159         e.g. u.addheader('Accept', 'sound/basic')"""
       
   160         self.addheaders.append(args)
       
   161 
       
   162     # External interface
       
   163     def open(self, fullurl, data=None):
       
   164         """Use URLopener().open(file) instead of open(file, 'r')."""
       
   165         fullurl = unwrap(toBytes(fullurl))
       
   166         if self.tempcache and fullurl in self.tempcache:
       
   167             filename, headers = self.tempcache[fullurl]
       
   168             fp = open(filename, 'rb')
       
   169             return addinfourl(fp, headers, fullurl)
       
   170         urltype, url = splittype(fullurl)
       
   171         if not urltype:
       
   172             urltype = 'file'
       
   173         if urltype in self.proxies:
       
   174             proxy = self.proxies[urltype]
       
   175             urltype, proxyhost = splittype(proxy)
       
   176             host, selector = splithost(proxyhost)
       
   177             url = (host, fullurl) # Signal special case to open_*()
       
   178         else:
       
   179             proxy = None
       
   180         name = 'open_' + urltype
       
   181         self.type = urltype
       
   182         name = name.replace('-', '_')
       
   183         if not hasattr(self, name):
       
   184             if proxy:
       
   185                 return self.open_unknown_proxy(proxy, fullurl, data)
       
   186             else:
       
   187                 return self.open_unknown(fullurl, data)
       
   188         try:
       
   189             if data is None:
       
   190                 return getattr(self, name)(url)
       
   191             else:
       
   192                 return getattr(self, name)(url, data)
       
   193         except socket.error, msg:
       
   194             raise IOError, ('socket error', msg), sys.exc_info()[2]
       
   195 
       
   196     def open_unknown(self, fullurl, data=None):
       
   197         """Overridable interface to open unknown URL type."""
       
   198         type, url = splittype(fullurl)
       
   199         raise IOError, ('url error', 'unknown url type', type)
       
   200 
       
   201     def open_unknown_proxy(self, proxy, fullurl, data=None):
       
   202         """Overridable interface to open unknown URL type."""
       
   203         type, url = splittype(fullurl)
       
   204         raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)
       
   205 
       
   206     # External interface
       
   207     def retrieve(self, url, filename=None, reporthook=None, data=None):
       
   208         """retrieve(url) returns (filename, headers) for a local object
       
   209         or (tempfilename, headers) for a remote object."""
       
   210         url = unwrap(toBytes(url))
       
   211         if self.tempcache and url in self.tempcache:
       
   212             return self.tempcache[url]
       
   213         type, url1 = splittype(url)
       
   214         if filename is None and (not type or type == 'file'):
       
   215             try:
       
   216                 fp = self.open_local_file(url1)
       
   217                 hdrs = fp.info()
       
   218                 del fp
       
   219                 return url2pathname(splithost(url1)[1]), hdrs
       
   220             except IOError, msg:
       
   221                 pass
       
   222         fp = self.open(url, data)
       
   223         headers = fp.info()
       
   224         if filename:
       
   225             tfp = open(filename, 'wb')
       
   226         else:
       
   227             import tempfile
       
   228             garbage, path = splittype(url)
       
   229             garbage, path = splithost(path or "")
       
   230             path, garbage = splitquery(path or "")
       
   231             path, garbage = splitattr(path or "")
       
   232             suffix = os.path.splitext(path)[1]
       
   233             (fd, filename) = tempfile.mkstemp(suffix)
       
   234             self.__tempfiles.append(filename)
       
   235             tfp = os.fdopen(fd, 'wb')
       
   236         result = filename, headers
       
   237         if self.tempcache is not None:
       
   238             self.tempcache[url] = result
       
   239         bs = 1024*8
       
   240         size = -1
       
   241         read = 0
       
   242         blocknum = 0
       
   243         if reporthook:
       
   244             if "content-length" in headers:
       
   245                 size = int(headers["Content-Length"])
       
   246             reporthook(blocknum, bs, size)
       
   247         while 1:
       
   248             block = fp.read(bs)
       
   249             if block == "":
       
   250                 break
       
   251             read += len(block)
       
   252             tfp.write(block)
       
   253             blocknum += 1
       
   254             if reporthook:
       
   255                 reporthook(blocknum, bs, size)
       
   256         fp.close()
       
   257         tfp.close()
       
   258         del fp
       
   259         del tfp
       
   260 
       
   261         # raise exception if actual size does not match content-length header
       
   262         if size >= 0 and read < size:
       
   263             raise ContentTooShortError("retrieval incomplete: got only %i out "
       
   264                                        "of %i bytes" % (read, size), result)
       
   265 
       
   266         return result
       
   267 
       
   268     # Each method named open_<type> knows how to open that type of URL
       
   269 
       
   270     def open_http(self, url, data=None):
       
   271         """Use HTTP protocol."""
       
   272         import httplib
       
   273         user_passwd = None
       
   274         proxy_passwd= None
       
   275         if isinstance(url, str):
       
   276             host, selector = splithost(url)
       
   277             if host:
       
   278                 user_passwd, host = splituser(host)
       
   279                 host = unquote(host)
       
   280             realhost = host
       
   281         else:
       
   282             host, selector = url
       
   283             # check whether the proxy contains authorization information
       
   284             proxy_passwd, host = splituser(host)
       
   285             # now we proceed with the url we want to obtain
       
   286             urltype, rest = splittype(selector)
       
   287             url = rest
       
   288             user_passwd = None
       
   289             if urltype.lower() != 'http':
       
   290                 realhost = None
       
   291             else:
       
   292                 realhost, rest = splithost(rest)
       
   293                 if realhost:
       
   294                     user_passwd, realhost = splituser(realhost)
       
   295                 if user_passwd:
       
   296                     selector = "%s://%s%s" % (urltype, realhost, rest)
       
   297                 if proxy_bypass(realhost):
       
   298                     host = realhost
       
   299 
       
   300             #print "proxy via http:", host, selector
       
   301         if not host: raise IOError, ('http error', 'no host given')
       
   302 
       
   303         if proxy_passwd:
       
   304             import base64
       
   305             proxy_auth = base64.b64encode(proxy_passwd).strip()
       
   306         else:
       
   307             proxy_auth = None
       
   308 
       
   309         if user_passwd:
       
   310             import base64
       
   311             auth = base64.b64encode(user_passwd).strip()
       
   312         else:
       
   313             auth = None
       
   314         h = httplib.HTTP(host)
       
   315         if data is not None:
       
   316             h.putrequest('POST', selector)
       
   317             h.putheader('Content-Type', 'application/x-www-form-urlencoded')
       
   318             h.putheader('Content-Length', '%d' % len(data))
       
   319         else:
       
   320             h.putrequest('GET', selector)
       
   321         if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
       
   322         if auth: h.putheader('Authorization', 'Basic %s' % auth)
       
   323         if realhost: h.putheader('Host', realhost)
       
   324         for args in self.addheaders: h.putheader(*args)
       
   325         h.endheaders()
       
   326         if data is not None:
       
   327             h.send(data)
       
   328         errcode, errmsg, headers = h.getreply()
       
   329         if errcode == -1:
       
   330             # something went wrong with the HTTP status line
       
   331             raise IOError, ('http protocol error', 0,
       
   332                             'got a bad status line', None)
       
   333         fp = h.getfile()
       
   334         if errcode == 200:
       
   335             return addinfourl(fp, headers, "http:" + url)
       
   336         else:
       
   337             if data is None:
       
   338                 return self.http_error(url, fp, errcode, errmsg, headers)
       
   339             else:
       
   340                 return self.http_error(url, fp, errcode, errmsg, headers, data)
       
   341 
       
   342     def http_error(self, url, fp, errcode, errmsg, headers, data=None):
       
   343         """Handle http errors.
       
   344         Derived class can override this, or provide specific handlers
       
   345         named http_error_DDD where DDD is the 3-digit error code."""
       
   346         # First check if there's a specific handler for this error
       
   347         name = 'http_error_%d' % errcode
       
   348         if hasattr(self, name):
       
   349             method = getattr(self, name)
       
   350             if data is None:
       
   351                 result = method(url, fp, errcode, errmsg, headers)
       
   352             else:
       
   353                 result = method(url, fp, errcode, errmsg, headers, data)
       
   354             if result: return result
       
   355         return self.http_error_default(url, fp, errcode, errmsg, headers)
       
   356 
       
   357     def http_error_default(self, url, fp, errcode, errmsg, headers):
       
   358         """Default error handler: close the connection and raise IOError."""
       
   359         void = fp.read()
       
   360         fp.close()
       
   361         raise IOError, ('http error', errcode, errmsg, headers)
       
   362 
       
   363     if hasattr(socket, "ssl"):
       
   364         def open_https(self, url, data=None):
       
   365             """Use HTTPS protocol."""
       
   366             import httplib
       
   367             user_passwd = None
       
   368             proxy_passwd = None
       
   369             if isinstance(url, str):
       
   370                 host, selector = splithost(url)
       
   371                 if host:
       
   372                     user_passwd, host = splituser(host)
       
   373                     host = unquote(host)
       
   374                 realhost = host
       
   375             else:
       
   376                 host, selector = url
       
   377                 # here, we determine, whether the proxy contains authorization information
       
   378                 proxy_passwd, host = splituser(host)
       
   379                 urltype, rest = splittype(selector)
       
   380                 url = rest
       
   381                 user_passwd = None
       
   382                 if urltype.lower() != 'https':
       
   383                     realhost = None
       
   384                 else:
       
   385                     realhost, rest = splithost(rest)
       
   386                     if realhost:
       
   387                         user_passwd, realhost = splituser(realhost)
       
   388                     if user_passwd:
       
   389                         selector = "%s://%s%s" % (urltype, realhost, rest)
       
   390                 #print "proxy via https:", host, selector
       
   391             if not host: raise IOError, ('https error', 'no host given')
       
   392             if proxy_passwd:
       
   393                 import base64
       
   394                 proxy_auth = base64.b64encode(proxy_passwd).strip()
       
   395             else:
       
   396                 proxy_auth = None
       
   397             if user_passwd:
       
   398                 import base64
       
   399                 auth = base64.b64encode(user_passwd).strip()
       
   400             else:
       
   401                 auth = None
       
   402             h = httplib.HTTPS(host, 0,
       
   403                               key_file=self.key_file,
       
   404                               cert_file=self.cert_file)
       
   405             if data is not None:
       
   406                 h.putrequest('POST', selector)
       
   407                 h.putheader('Content-Type',
       
   408                             'application/x-www-form-urlencoded')
       
   409                 h.putheader('Content-Length', '%d' % len(data))
       
   410             else:
       
   411                 h.putrequest('GET', selector)
       
   412             if proxy_auth: h.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
       
   413             if auth: h.putheader('Authorization', 'Basic %s' % auth)
       
   414             if realhost: h.putheader('Host', realhost)
       
   415             for args in self.addheaders: h.putheader(*args)
       
   416             h.endheaders()
       
   417             if data is not None:
       
   418                 h.send(data)
       
   419             errcode, errmsg, headers = h.getreply()
       
   420             if errcode == -1:
       
   421                 # something went wrong with the HTTP status line
       
   422                 raise IOError, ('http protocol error', 0,
       
   423                                 'got a bad status line', None)
       
   424             fp = h.getfile()
       
   425             if errcode == 200:
       
   426                 return addinfourl(fp, headers, "https:" + url)
       
   427             else:
       
   428                 if data is None:
       
   429                     return self.http_error(url, fp, errcode, errmsg, headers)
       
   430                 else:
       
   431                     return self.http_error(url, fp, errcode, errmsg, headers,
       
   432                                            data)
       
   433 
       
   434     def open_gopher(self, url):
       
   435         """Use Gopher protocol."""
       
   436         if not isinstance(url, str):
       
   437             raise IOError, ('gopher error', 'proxy support for gopher protocol currently not implemented')
       
   438         import gopherlib
       
   439         host, selector = splithost(url)
       
   440         if not host: raise IOError, ('gopher error', 'no host given')
       
   441         host = unquote(host)
       
   442         type, selector = splitgophertype(selector)
       
   443         selector, query = splitquery(selector)
       
   444         selector = unquote(selector)
       
   445         if query:
       
   446             query = unquote(query)
       
   447             fp = gopherlib.send_query(selector, query, host)
       
   448         else:
       
   449             fp = gopherlib.send_selector(selector, host)
       
   450         return addinfourl(fp, noheaders(), "gopher:" + url)
       
   451 
       
   452     def open_file(self, url):
       
   453         """Use local file or FTP depending on form of URL."""
       
   454         if not isinstance(url, str):
       
   455             raise IOError, ('file error', 'proxy support for file protocol currently not implemented')
       
   456         if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
       
   457             return self.open_ftp(url)
       
   458         else:
       
   459             return self.open_local_file(url)
       
   460 
       
   461     def open_local_file(self, url):
       
   462         """Use local file."""
       
   463         import mimetypes, mimetools, email.Utils
       
   464         try:
       
   465             from cStringIO import StringIO
       
   466         except ImportError:
       
   467             from StringIO import StringIO
       
   468         host, file = splithost(url)
       
   469         localname = url2pathname(file)
       
   470         try:
       
   471             stats = os.stat(localname)
       
   472         except OSError, e:
       
   473             raise IOError(e.errno, e.strerror, e.filename)
       
   474         size = stats.st_size
       
   475         modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
       
   476         mtype = mimetypes.guess_type(url)[0]
       
   477         headers = mimetools.Message(StringIO(
       
   478             'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
       
   479             (mtype or 'text/plain', size, modified)))
       
   480         if not host:
       
   481             urlfile = file
       
   482             if file[:1] == '/':
       
   483                 urlfile = 'file://' + file
       
   484             return addinfourl(open(localname, 'rb'),
       
   485                               headers, urlfile)
       
   486         host, port = splitport(host)
       
   487         if not port \
       
   488            and socket.gethostbyname(host) in (localhost(), thishost()):
       
   489             urlfile = file
       
   490             if file[:1] == '/':
       
   491                 urlfile = 'file://' + file
       
   492             return addinfourl(open(localname, 'rb'),
       
   493                               headers, urlfile)
       
   494         raise IOError, ('local file error', 'not on local host')
       
   495 
       
   496     def open_ftp(self, url):
       
   497         """Use FTP protocol."""
       
   498         if not isinstance(url, str):
       
   499             raise IOError, ('ftp error', 'proxy support for ftp protocol currently not implemented')
       
   500         import mimetypes, mimetools
       
   501         try:
       
   502             from cStringIO import StringIO
       
   503         except ImportError:
       
   504             from StringIO import StringIO
       
   505         host, path = splithost(url)
       
   506         if not host: raise IOError, ('ftp error', 'no host given')
       
   507         host, port = splitport(host)
       
   508         user, host = splituser(host)
       
   509         if user: user, passwd = splitpasswd(user)
       
   510         else: passwd = None
       
   511         host = unquote(host)
       
   512         user = unquote(user or '')
       
   513         passwd = unquote(passwd or '')
       
   514         host = socket.gethostbyname(host)
       
   515         if not port:
       
   516             import ftplib
       
   517             port = ftplib.FTP_PORT
       
   518         else:
       
   519             port = int(port)
       
   520         path, attrs = splitattr(path)
       
   521         path = unquote(path)
       
   522         dirs = path.split('/')
       
   523         dirs, file = dirs[:-1], dirs[-1]
       
   524         if dirs and not dirs[0]: dirs = dirs[1:]
       
   525         if dirs and not dirs[0]: dirs[0] = '/'
       
   526         key = user, host, port, '/'.join(dirs)
       
   527         # XXX thread unsafe!
       
   528         if len(self.ftpcache) > MAXFTPCACHE:
       
   529             # Prune the cache, rather arbitrarily
       
   530             for k in self.ftpcache.keys():
       
   531                 if k != key:
       
   532                     v = self.ftpcache[k]
       
   533                     del self.ftpcache[k]
       
   534                     v.close()
       
   535         try:
       
   536             if not key in self.ftpcache:
       
   537                 self.ftpcache[key] = \
       
   538                     ftpwrapper(user, passwd, host, port, dirs)
       
   539             if not file: type = 'D'
       
   540             else: type = 'I'
       
   541             for attr in attrs:
       
   542                 attr, value = splitvalue(attr)
       
   543                 if attr.lower() == 'type' and \
       
   544                    value in ('a', 'A', 'i', 'I', 'd', 'D'):
       
   545                     type = value.upper()
       
   546             (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
       
   547             mtype = mimetypes.guess_type("ftp:" + url)[0]
       
   548             headers = ""
       
   549             if mtype:
       
   550                 headers += "Content-Type: %s\n" % mtype
       
   551             if retrlen is not None and retrlen >= 0:
       
   552                 headers += "Content-Length: %d\n" % retrlen
       
   553             headers = mimetools.Message(StringIO(headers))
       
   554             return addinfourl(fp, headers, "ftp:" + url)
       
   555         except ftperrors(), msg:
       
   556             raise IOError, ('ftp error', msg), sys.exc_info()[2]
       
   557 
       
   558     def open_data(self, url, data=None):
       
   559         """Use "data" URL."""
       
   560         if not isinstance(url, str):
       
   561             raise IOError, ('data error', 'proxy support for data protocol currently not implemented')
       
   562         # ignore POSTed data
       
   563         #
       
   564         # syntax of data URLs:
       
   565         # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
       
   566         # mediatype := [ type "/" subtype ] *( ";" parameter )
       
   567         # data      := *urlchar
       
   568         # parameter := attribute "=" value
       
   569         import mimetools
       
   570         try:
       
   571             from cStringIO import StringIO
       
   572         except ImportError:
       
   573             from StringIO import StringIO
       
   574         try:
       
   575             [type, data] = url.split(',', 1)
       
   576         except ValueError:
       
   577             raise IOError, ('data error', 'bad data URL')
       
   578         if not type:
       
   579             type = 'text/plain;charset=US-ASCII'
       
   580         semi = type.rfind(';')
       
   581         if semi >= 0 and '=' not in type[semi:]:
       
   582             encoding = type[semi+1:]
       
   583             type = type[:semi]
       
   584         else:
       
   585             encoding = ''
       
   586         msg = []
       
   587         msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
       
   588                                             time.gmtime(time.time())))
       
   589         msg.append('Content-type: %s' % type)
       
   590         if encoding == 'base64':
       
   591             import base64
       
   592             data = base64.decodestring(data)
       
   593         else:
       
   594             data = unquote(data)
       
   595         msg.append('Content-Length: %d' % len(data))
       
   596         msg.append('')
       
   597         msg.append(data)
       
   598         msg = '\n'.join(msg)
       
   599         f = StringIO(msg)
       
   600         headers = mimetools.Message(f, 0)
       
   601         #f.fileno = None     # needed for addinfourl
       
   602         return addinfourl(f, headers, url)
       
   603 
       
   604 
       
   605 class FancyURLopener(URLopener):
       
   606     """Derived class with handlers for errors we can handle (perhaps)."""
       
   607 
       
   608     def __init__(self, *args, **kwargs):
       
   609         URLopener.__init__(self, *args, **kwargs)
       
   610         self.auth_cache = {}
       
   611         self.tries = 0
       
   612         self.maxtries = 10
       
   613 
       
   614     def http_error_default(self, url, fp, errcode, errmsg, headers):
       
   615         """Default error handling -- don't raise an exception."""
       
   616         return addinfourl(fp, headers, "http:" + url)
       
   617 
       
   618     def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
       
   619         """Error 302 -- relocated (temporarily)."""
       
   620         self.tries += 1
       
   621         if self.maxtries and self.tries >= self.maxtries:
       
   622             if hasattr(self, "http_error_500"):
       
   623                 meth = self.http_error_500
       
   624             else:
       
   625                 meth = self.http_error_default
       
   626             self.tries = 0
       
   627             return meth(url, fp, 500,
       
   628                         "Internal Server Error: Redirect Recursion", headers)
       
   629         result = self.redirect_internal(url, fp, errcode, errmsg, headers,
       
   630                                         data)
       
   631         self.tries = 0
       
   632         return result
       
   633 
       
   634     def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
       
   635         if 'location' in headers:
       
   636             newurl = headers['location']
       
   637         elif 'uri' in headers:
       
   638             newurl = headers['uri']
       
   639         else:
       
   640             return
       
   641         void = fp.read()
       
   642         fp.close()
       
   643         # In case the server sent a relative URL, join with original:
       
   644         newurl = basejoin(self.type + ":" + url, newurl)
       
   645         return self.open(newurl)
       
   646 
       
   647     def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
       
   648         """Error 301 -- also relocated (permanently)."""
       
   649         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
       
   650 
       
   651     def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
       
   652         """Error 303 -- also relocated (essentially identical to 302)."""
       
   653         return self.http_error_302(url, fp, errcode, errmsg, headers, data)
       
   654 
       
   655     def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
       
   656         """Error 307 -- relocated, but turn POST into error."""
       
   657         if data is None:
       
   658             return self.http_error_302(url, fp, errcode, errmsg, headers, data)
       
   659         else:
       
   660             return self.http_error_default(url, fp, errcode, errmsg, headers)
       
   661 
       
   662     def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
       
   663         """Error 401 -- authentication required.
       
   664         This function supports Basic authentication only."""
       
   665         if not 'www-authenticate' in headers:
       
   666             URLopener.http_error_default(self, url, fp,
       
   667                                          errcode, errmsg, headers)
       
   668         stuff = headers['www-authenticate']
       
   669         import re
       
   670         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
       
   671         if not match:
       
   672             URLopener.http_error_default(self, url, fp,
       
   673                                          errcode, errmsg, headers)
       
   674         scheme, realm = match.groups()
       
   675         if scheme.lower() != 'basic':
       
   676             URLopener.http_error_default(self, url, fp,
       
   677                                          errcode, errmsg, headers)
       
   678         name = 'retry_' + self.type + '_basic_auth'
       
   679         if data is None:
       
   680             return getattr(self,name)(url, realm)
       
   681         else:
       
   682             return getattr(self,name)(url, realm, data)
       
   683 
       
   684     def http_error_407(self, url, fp, errcode, errmsg, headers, data=None):
       
   685         """Error 407 -- proxy authentication required.
       
   686         This function supports Basic authentication only."""
       
   687         if not 'proxy-authenticate' in headers:
       
   688             URLopener.http_error_default(self, url, fp,
       
   689                                          errcode, errmsg, headers)
       
   690         stuff = headers['proxy-authenticate']
       
   691         import re
       
   692         match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
       
   693         if not match:
       
   694             URLopener.http_error_default(self, url, fp,
       
   695                                          errcode, errmsg, headers)
       
   696         scheme, realm = match.groups()
       
   697         if scheme.lower() != 'basic':
       
   698             URLopener.http_error_default(self, url, fp,
       
   699                                          errcode, errmsg, headers)
       
   700         name = 'retry_proxy_' + self.type + '_basic_auth'
       
   701         if data is None:
       
   702             return getattr(self,name)(url, realm)
       
   703         else:
       
   704             return getattr(self,name)(url, realm, data)
       
   705 
       
   706     def retry_proxy_http_basic_auth(self, url, realm, data=None):
       
   707         host, selector = splithost(url)
       
   708         newurl = 'http://' + host + selector
       
   709         proxy = self.proxies['http']
       
   710         urltype, proxyhost = splittype(proxy)
       
   711         proxyhost, proxyselector = splithost(proxyhost)
       
   712         i = proxyhost.find('@') + 1
       
   713         proxyhost = proxyhost[i:]
       
   714         user, passwd = self.get_user_passwd(proxyhost, realm, i)
       
   715         if not (user or passwd): return None
       
   716         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
       
   717         self.proxies['http'] = 'http://' + proxyhost + proxyselector
       
   718         if data is None:
       
   719             return self.open(newurl)
       
   720         else:
       
   721             return self.open(newurl, data)
       
   722 
       
   723     def retry_proxy_https_basic_auth(self, url, realm, data=None):
       
   724         host, selector = splithost(url)
       
   725         newurl = 'https://' + host + selector
       
   726         proxy = self.proxies['https']
       
   727         urltype, proxyhost = splittype(proxy)
       
   728         proxyhost, proxyselector = splithost(proxyhost)
       
   729         i = proxyhost.find('@') + 1
       
   730         proxyhost = proxyhost[i:]
       
   731         user, passwd = self.get_user_passwd(proxyhost, realm, i)
       
   732         if not (user or passwd): return None
       
   733         proxyhost = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + proxyhost
       
   734         self.proxies['https'] = 'https://' + proxyhost + proxyselector
       
   735         if data is None:
       
   736             return self.open(newurl)
       
   737         else:
       
   738             return self.open(newurl, data)
       
   739 
       
   740     def retry_http_basic_auth(self, url, realm, data=None):
       
   741         host, selector = splithost(url)
       
   742         i = host.find('@') + 1
       
   743         host = host[i:]
       
   744         user, passwd = self.get_user_passwd(host, realm, i)
       
   745         if not (user or passwd): return None
       
   746         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
       
   747         newurl = 'http://' + host + selector
       
   748         if data is None:
       
   749             return self.open(newurl)
       
   750         else:
       
   751             return self.open(newurl, data)
       
   752 
       
   753     def retry_https_basic_auth(self, url, realm, data=None):
       
   754         host, selector = splithost(url)
       
   755         i = host.find('@') + 1
       
   756         host = host[i:]
       
   757         user, passwd = self.get_user_passwd(host, realm, i)
       
   758         if not (user or passwd): return None
       
   759         host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
       
   760         newurl = 'https://' + host + selector
       
   761         if data is None:
       
   762             return self.open(newurl)
       
   763         else:
       
   764             return self.open(newurl, data)
       
   765 
       
   766     def get_user_passwd(self, host, realm, clear_cache = 0):
       
   767         key = realm + '@' + host.lower()
       
   768         if key in self.auth_cache:
       
   769             if clear_cache:
       
   770                 del self.auth_cache[key]
       
   771             else:
       
   772                 return self.auth_cache[key]
       
   773         user, passwd = self.prompt_user_passwd(host, realm)
       
   774         if user or passwd: self.auth_cache[key] = (user, passwd)
       
   775         return user, passwd
       
   776 
       
   777     def prompt_user_passwd(self, host, realm):
       
   778         """Override this in a GUI environment!"""
       
   779         import getpass
       
   780         try:
       
   781             user = raw_input("Enter username for %s at %s: " % (realm,
       
   782                                                                 host))
       
   783             passwd = getpass.getpass("Enter password for %s in %s at %s: " %
       
   784                 (user, realm, host))
       
   785             return user, passwd
       
   786         except KeyboardInterrupt:
       
   787             print
       
   788             return None, None
       
   789 
       
   790 
       
   791 # Utility functions
       
   792 
       
   793 _localhost = None
       
   794 def localhost():
       
   795     """Return the IP address of the magic hostname 'localhost'."""
       
   796     global _localhost
       
   797     if _localhost is None:
       
   798         _localhost = socket.gethostbyname('localhost')
       
   799     return _localhost
       
   800 
       
   801 _thishost = None
       
   802 def thishost():
       
   803     """Return the IP address of the current host."""
       
   804     global _thishost
       
   805     if _thishost is None:
       
   806         _thishost = socket.gethostbyname(socket.gethostname())
       
   807     return _thishost
       
   808 
       
   809 _ftperrors = None
       
   810 def ftperrors():
       
   811     """Return the set of errors raised by the FTP class."""
       
   812     global _ftperrors
       
   813     if _ftperrors is None:
       
   814         import ftplib
       
   815         _ftperrors = ftplib.all_errors
       
   816     return _ftperrors
       
   817 
       
   818 _noheaders = None
       
   819 def noheaders():
       
   820     """Return an empty mimetools.Message object."""
       
   821     global _noheaders
       
   822     if _noheaders is None:
       
   823         import mimetools
       
   824         try:
       
   825             from cStringIO import StringIO
       
   826         except ImportError:
       
   827             from StringIO import StringIO
       
   828         _noheaders = mimetools.Message(StringIO(), 0)
       
   829         _noheaders.fp.close()   # Recycle file descriptor
       
   830     return _noheaders
       
   831 
       
   832 
       
   833 # Utility classes
       
   834 
       
   835 class ftpwrapper:
       
   836     """Class used by open_ftp() for cache of open FTP connections."""
       
   837 
       
   838     def __init__(self, user, passwd, host, port, dirs):
       
   839         self.user = user
       
   840         self.passwd = passwd
       
   841         self.host = host
       
   842         self.port = port
       
   843         self.dirs = dirs
       
   844         self.init()
       
   845 
       
   846     def init(self):
       
   847         import ftplib
       
   848         self.busy = 0
       
   849         self.ftp = ftplib.FTP()
       
   850         self.ftp.connect(self.host, self.port)
       
   851         self.ftp.login(self.user, self.passwd)
       
   852         for dir in self.dirs:
       
   853             self.ftp.cwd(dir)
       
   854 
       
   855     def retrfile(self, file, type):
       
   856         import ftplib
       
   857         self.endtransfer()
       
   858         if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
       
   859         else: cmd = 'TYPE ' + type; isdir = 0
       
   860         try:
       
   861             self.ftp.voidcmd(cmd)
       
   862         except ftplib.all_errors:
       
   863             self.init()
       
   864             self.ftp.voidcmd(cmd)
       
   865         conn = None
       
   866         if file and not isdir:
       
   867             # Try to retrieve as a file
       
   868             try:
       
   869                 cmd = 'RETR ' + file
       
   870                 conn = self.ftp.ntransfercmd(cmd)
       
   871             except ftplib.error_perm, reason:
       
   872                 if str(reason)[:3] != '550':
       
   873                     raise IOError, ('ftp error', reason), sys.exc_info()[2]
       
   874         if not conn:
       
   875             # Set transfer mode to ASCII!
       
   876             self.ftp.voidcmd('TYPE A')
       
   877             # Try a directory listing
       
   878             if file: cmd = 'LIST ' + file
       
   879             else: cmd = 'LIST'
       
   880             conn = self.ftp.ntransfercmd(cmd)
       
   881         self.busy = 1
       
   882         # Pass back both a suitably decorated object and a retrieval length
       
   883         return (addclosehook(conn[0].makefile('rb'),
       
   884                              self.endtransfer), conn[1])
       
   885     def endtransfer(self):
       
   886         if not self.busy:
       
   887             return
       
   888         self.busy = 0
       
   889         try:
       
   890             self.ftp.voidresp()
       
   891         except ftperrors():
       
   892             pass
       
   893 
       
   894     def close(self):
       
   895         self.endtransfer()
       
   896         try:
       
   897             self.ftp.close()
       
   898         except ftperrors():
       
   899             pass
       
   900 
       
   901 class addbase:
       
   902     """Base class for addinfo and addclosehook."""
       
   903 
       
   904     def __init__(self, fp):
       
   905         self.fp = fp
       
   906         self.read = self.fp.read
       
   907         self.readline = self.fp.readline
       
   908         if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
       
   909         if hasattr(self.fp, "fileno"):
       
   910             self.fileno = self.fp.fileno
       
   911         else:
       
   912             self.fileno = lambda: None
       
   913         if hasattr(self.fp, "__iter__"):
       
   914             self.__iter__ = self.fp.__iter__
       
   915             if hasattr(self.fp, "next"):
       
   916                 self.next = self.fp.next
       
   917 
       
   918     def __repr__(self):
       
   919         return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
       
   920                                              id(self), self.fp)
       
   921 
       
   922     def close(self):
       
   923         self.read = None
       
   924         self.readline = None
       
   925         self.readlines = None
       
   926         self.fileno = None
       
   927         if self.fp: self.fp.close()
       
   928         self.fp = None
       
   929 
       
   930 class addclosehook(addbase):
       
   931     """Class to add a close hook to an open file."""
       
   932 
       
   933     def __init__(self, fp, closehook, *hookargs):
       
   934         addbase.__init__(self, fp)
       
   935         self.closehook = closehook
       
   936         self.hookargs = hookargs
       
   937 
       
   938     def close(self):
       
   939         addbase.close(self)
       
   940         if self.closehook:
       
   941             self.closehook(*self.hookargs)
       
   942             self.closehook = None
       
   943             self.hookargs = None
       
   944 
       
   945 class addinfo(addbase):
       
   946     """class to add an info() method to an open file."""
       
   947 
       
   948     def __init__(self, fp, headers):
       
   949         addbase.__init__(self, fp)
       
   950         self.headers = headers
       
   951 
       
   952     def info(self):
       
   953         return self.headers
       
   954 
       
   955 class addinfourl(addbase):
       
   956     """class to add info() and geturl() methods to an open file."""
       
   957 
       
   958     def __init__(self, fp, headers, url):
       
   959         addbase.__init__(self, fp)
       
   960         self.headers = headers
       
   961         self.url = url
       
   962 
       
   963     def info(self):
       
   964         return self.headers
       
   965 
       
   966     def geturl(self):
       
   967         return self.url
       
   968 
       
   969 
       
   970 # Utilities to parse URLs (most of these return None for missing parts):
       
   971 # unwrap('<URL:type://host/path>') --> 'type://host/path'
       
   972 # splittype('type:opaquestring') --> 'type', 'opaquestring'
       
   973 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
       
   974 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
       
   975 # splitpasswd('user:passwd') -> 'user', 'passwd'
       
   976 # splitport('host:port') --> 'host', 'port'
       
   977 # splitquery('/path?query') --> '/path', 'query'
       
   978 # splittag('/path#tag') --> '/path', 'tag'
       
   979 # splitattr('/path;attr1=value1;attr2=value2;...') ->
       
   980 #   '/path', ['attr1=value1', 'attr2=value2', ...]
       
   981 # splitvalue('attr=value') --> 'attr', 'value'
       
   982 # splitgophertype('/Xselector') --> 'X', 'selector'
       
   983 # unquote('abc%20def') -> 'abc def'
       
   984 # quote('abc def') -> 'abc%20def'
       
   985 
       
   986 try:
       
   987     unicode
       
   988 except NameError:
       
   989     def _is_unicode(x):
       
   990         return 0
       
   991 else:
       
   992     def _is_unicode(x):
       
   993         return isinstance(x, unicode)
       
   994 
       
   995 def toBytes(url):
       
   996     """toBytes(u"URL") --> 'URL'."""
       
   997     # Most URL schemes require ASCII. If that changes, the conversion
       
   998     # can be relaxed
       
   999     if _is_unicode(url):
       
  1000         try:
       
  1001             url = url.encode("ASCII")
       
  1002         except UnicodeError:
       
  1003             raise UnicodeError("URL " + repr(url) +
       
  1004                                " contains non-ASCII characters")
       
  1005     return url
       
  1006 
       
  1007 def unwrap(url):
       
  1008     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
       
  1009     url = url.strip()
       
  1010     if url[:1] == '<' and url[-1:] == '>':
       
  1011         url = url[1:-1].strip()
       
  1012     if url[:4] == 'URL:': url = url[4:].strip()
       
  1013     return url
       
  1014 
       
  1015 _typeprog = None
       
  1016 def splittype(url):
       
  1017     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
       
  1018     global _typeprog
       
  1019     if _typeprog is None:
       
  1020         import re
       
  1021         _typeprog = re.compile('^([^/:]+):')
       
  1022 
       
  1023     match = _typeprog.match(url)
       
  1024     if match:
       
  1025         scheme = match.group(1)
       
  1026         return scheme.lower(), url[len(scheme) + 1:]
       
  1027     return None, url
       
  1028 
       
  1029 _hostprog = None
       
  1030 def splithost(url):
       
  1031     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
       
  1032     global _hostprog
       
  1033     if _hostprog is None:
       
  1034         import re
       
  1035         _hostprog = re.compile('^//([^/?]*)(.*)$')
       
  1036 
       
  1037     match = _hostprog.match(url)
       
  1038     if match: return match.group(1, 2)
       
  1039     return None, url
       
  1040 
       
  1041 _userprog = None
       
  1042 def splituser(host):
       
  1043     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
       
  1044     global _userprog
       
  1045     if _userprog is None:
       
  1046         import re
       
  1047         _userprog = re.compile('^(.*)@(.*)$')
       
  1048 
       
  1049     match = _userprog.match(host)
       
  1050     if match: return map(unquote, match.group(1, 2))
       
  1051     return None, host
       
  1052 
       
  1053 _passwdprog = None
       
  1054 def splitpasswd(user):
       
  1055     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
       
  1056     global _passwdprog
       
  1057     if _passwdprog is None:
       
  1058         import re
       
  1059         _passwdprog = re.compile('^([^:]*):(.*)$')
       
  1060 
       
  1061     match = _passwdprog.match(user)
       
  1062     if match: return match.group(1, 2)
       
  1063     return user, None
       
  1064 
       
  1065 # splitport('host:port') --> 'host', 'port'
       
  1066 _portprog = None
       
  1067 def splitport(host):
       
  1068     """splitport('host:port') --> 'host', 'port'."""
       
  1069     global _portprog
       
  1070     if _portprog is None:
       
  1071         import re
       
  1072         _portprog = re.compile('^(.*):([0-9]+)$')
       
  1073 
       
  1074     match = _portprog.match(host)
       
  1075     if match: return match.group(1, 2)
       
  1076     return host, None
       
  1077 
       
  1078 _nportprog = None
       
  1079 def splitnport(host, defport=-1):
       
  1080     """Split host and port, returning numeric port.
       
  1081     Return given default port if no ':' found; defaults to -1.
       
  1082     Return numerical port if a valid number are found after ':'.
       
  1083     Return None if ':' but not a valid number."""
       
  1084     global _nportprog
       
  1085     if _nportprog is None:
       
  1086         import re
       
  1087         _nportprog = re.compile('^(.*):(.*)$')
       
  1088 
       
  1089     match = _nportprog.match(host)
       
  1090     if match:
       
  1091         host, port = match.group(1, 2)
       
  1092         try:
       
  1093             if not port: raise ValueError, "no digits"
       
  1094             nport = int(port)
       
  1095         except ValueError:
       
  1096             nport = None
       
  1097         return host, nport
       
  1098     return host, defport
       
  1099 
       
  1100 _queryprog = None
       
  1101 def splitquery(url):
       
  1102     """splitquery('/path?query') --> '/path', 'query'."""
       
  1103     global _queryprog
       
  1104     if _queryprog is None:
       
  1105         import re
       
  1106         _queryprog = re.compile('^(.*)\?([^?]*)$')
       
  1107 
       
  1108     match = _queryprog.match(url)
       
  1109     if match: return match.group(1, 2)
       
  1110     return url, None
       
  1111 
       
  1112 _tagprog = None
       
  1113 def splittag(url):
       
  1114     """splittag('/path#tag') --> '/path', 'tag'."""
       
  1115     global _tagprog
       
  1116     if _tagprog is None:
       
  1117         import re
       
  1118         _tagprog = re.compile('^(.*)#([^#]*)$')
       
  1119 
       
  1120     match = _tagprog.match(url)
       
  1121     if match: return match.group(1, 2)
       
  1122     return url, None
       
  1123 
       
  1124 def splitattr(url):
       
  1125     """splitattr('/path;attr1=value1;attr2=value2;...') ->
       
  1126         '/path', ['attr1=value1', 'attr2=value2', ...]."""
       
  1127     words = url.split(';')
       
  1128     return words[0], words[1:]
       
  1129 
       
  1130 _valueprog = None
       
  1131 def splitvalue(attr):
       
  1132     """splitvalue('attr=value') --> 'attr', 'value'."""
       
  1133     global _valueprog
       
  1134     if _valueprog is None:
       
  1135         import re
       
  1136         _valueprog = re.compile('^([^=]*)=(.*)$')
       
  1137 
       
  1138     match = _valueprog.match(attr)
       
  1139     if match: return match.group(1, 2)
       
  1140     return attr, None
       
  1141 
       
  1142 def splitgophertype(selector):
       
  1143     """splitgophertype('/Xselector') --> 'X', 'selector'."""
       
  1144     if selector[:1] == '/' and selector[1:2]:
       
  1145         return selector[1], selector[2:]
       
  1146     return None, selector
       
  1147 
       
  1148 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
       
  1149 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
       
  1150 
       
  1151 def unquote(s):
       
  1152     """unquote('abc%20def') -> 'abc def'."""
       
  1153     res = s.split('%')
       
  1154     for i in xrange(1, len(res)):
       
  1155         item = res[i]
       
  1156         try:
       
  1157             res[i] = _hextochr[item[:2]] + item[2:]
       
  1158         except KeyError:
       
  1159             res[i] = '%' + item
       
  1160         except UnicodeDecodeError:
       
  1161             res[i] = unichr(int(item[:2], 16)) + item[2:]
       
  1162     return "".join(res)
       
  1163 
       
  1164 def unquote_plus(s):
       
  1165     """unquote('%7e/abc+def') -> '~/abc def'"""
       
  1166     s = s.replace('+', ' ')
       
  1167     return unquote(s)
       
  1168 
       
  1169 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
       
  1170                'abcdefghijklmnopqrstuvwxyz'
       
  1171                '0123456789' '_.-')
       
  1172 _safemaps = {}
       
  1173 
       
  1174 def quote(s, safe = '/'):
       
  1175     """quote('abc def') -> 'abc%20def'
       
  1176 
       
  1177     Each part of a URL, e.g. the path info, the query, etc., has a
       
  1178     different set of reserved characters that must be quoted.
       
  1179 
       
  1180     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
       
  1181     the following reserved characters.
       
  1182 
       
  1183     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
       
  1184                   "$" | ","
       
  1185 
       
  1186     Each of these characters is reserved in some component of a URL,
       
  1187     but not necessarily in all of them.
       
  1188 
       
  1189     By default, the quote function is intended for quoting the path
       
  1190     section of a URL.  Thus, it will not encode '/'.  This character
       
  1191     is reserved, but in typical usage the quote function is being
       
  1192     called on a path where the existing slash characters are used as
       
  1193     reserved characters.
       
  1194     """
       
  1195     cachekey = (safe, always_safe)
       
  1196     try:
       
  1197         safe_map = _safemaps[cachekey]
       
  1198     except KeyError:
       
  1199         safe += always_safe
       
  1200         safe_map = {}
       
  1201         for i in range(256):
       
  1202             c = chr(i)
       
  1203             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
       
  1204         _safemaps[cachekey] = safe_map
       
  1205     res = map(safe_map.__getitem__, s)
       
  1206     return ''.join(res)
       
  1207 
       
  1208 def quote_plus(s, safe = ''):
       
  1209     """Quote the query fragment of a URL; replacing ' ' with '+'"""
       
  1210     if ' ' in s:
       
  1211         s = quote(s, safe + ' ')
       
  1212         return s.replace(' ', '+')
       
  1213     return quote(s, safe)
       
  1214 
       
  1215 def urlencode(query,doseq=0):
       
  1216     """Encode a sequence of two-element tuples or dictionary into a URL query string.
       
  1217 
       
  1218     If any values in the query arg are sequences and doseq is true, each
       
  1219     sequence element is converted to a separate parameter.
       
  1220 
       
  1221     If the query arg is a sequence of two-element tuples, the order of the
       
  1222     parameters in the output will match the order of parameters in the
       
  1223     input.
       
  1224     """
       
  1225 
       
  1226     if hasattr(query,"items"):
       
  1227         # mapping objects
       
  1228         query = query.items()
       
  1229     else:
       
  1230         # it's a bother at times that strings and string-like objects are
       
  1231         # sequences...
       
  1232         try:
       
  1233             # non-sequence items should not work with len()
       
  1234             # non-empty strings will fail this
       
  1235             if len(query) and not isinstance(query[0], tuple):
       
  1236                 raise TypeError
       
  1237             # zero-length sequences of all types will get here and succeed,
       
  1238             # but that's a minor nit - since the original implementation
       
  1239             # allowed empty dicts that type of behavior probably should be
       
  1240             # preserved for consistency
       
  1241         except TypeError:
       
  1242             ty,va,tb = sys.exc_info()
       
  1243             raise TypeError, "not a valid non-string sequence or mapping object", tb
       
  1244 
       
  1245     l = []
       
  1246     if not doseq:
       
  1247         # preserve old behavior
       
  1248         for k, v in query:
       
  1249             k = quote_plus(str(k))
       
  1250             v = quote_plus(str(v))
       
  1251             l.append(k + '=' + v)
       
  1252     else:
       
  1253         for k, v in query:
       
  1254             k = quote_plus(str(k))
       
  1255             if isinstance(v, str):
       
  1256                 v = quote_plus(v)
       
  1257                 l.append(k + '=' + v)
       
  1258             elif _is_unicode(v):
       
  1259                 # is there a reasonable way to convert to ASCII?
       
  1260                 # encode generates a string, but "replace" or "ignore"
       
  1261                 # lose information and "strict" can raise UnicodeError
       
  1262                 v = quote_plus(v.encode("ASCII","replace"))
       
  1263                 l.append(k + '=' + v)
       
  1264             else:
       
  1265                 try:
       
  1266                     # is this a sufficient test for sequence-ness?
       
  1267                     x = len(v)
       
  1268                 except TypeError:
       
  1269                     # not a sequence
       
  1270                     v = quote_plus(str(v))
       
  1271                     l.append(k + '=' + v)
       
  1272                 else:
       
  1273                     # loop over the sequence
       
  1274                     for elt in v:
       
  1275                         l.append(k + '=' + quote_plus(str(elt)))
       
  1276     return '&'.join(l)
       
  1277 
       
  1278 # Proxy handling
       
  1279 def getproxies_environment():
       
  1280     """Return a dictionary of scheme -> proxy server URL mappings.
       
  1281 
       
  1282     Scan the environment for variables named <scheme>_proxy;
       
  1283     this seems to be the standard convention.  If you need a
       
  1284     different way, you can pass a proxies dictionary to the
       
  1285     [Fancy]URLopener constructor.
       
  1286 
       
  1287     """
       
  1288     proxies = {}
       
  1289     for name, value in os.environ.items():
       
  1290         name = name.lower()
       
  1291         if value and name[-6:] == '_proxy':
       
  1292             proxies[name[:-6]] = value
       
  1293     return proxies
       
  1294 
       
  1295 if sys.platform == 'darwin':
       
  1296     def getproxies_internetconfig():
       
  1297         """Return a dictionary of scheme -> proxy server URL mappings.
       
  1298 
       
  1299         By convention the mac uses Internet Config to store
       
  1300         proxies.  An HTTP proxy, for instance, is stored under
       
  1301         the HttpProxy key.
       
  1302 
       
  1303         """
       
  1304         try:
       
  1305             import ic
       
  1306         except ImportError:
       
  1307             return {}
       
  1308 
       
  1309         try:
       
  1310             config = ic.IC()
       
  1311         except ic.error:
       
  1312             return {}
       
  1313         proxies = {}
       
  1314         # HTTP:
       
  1315         if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
       
  1316             try:
       
  1317                 value = config['HTTPProxyHost']
       
  1318             except ic.error:
       
  1319                 pass
       
  1320             else:
       
  1321                 proxies['http'] = 'http://%s' % value
       
  1322         # FTP: XXXX To be done.
       
  1323         # Gopher: XXXX To be done.
       
  1324         return proxies
       
  1325 
       
    def proxy_bypass(x):
        """Return 0 for every argument: no host is reported as bypassing the proxy."""
        return 0
       
  1328 
       
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        The environment settings are returned when there are any;
        Internet Config is consulted only when the environment yields
        an empty mapping.

        """
        return getproxies_environment() or getproxies_internetconfig()
       
  1331 
       
  1332 elif os.name == 'nt':
       
  1333     def getproxies_registry():
       
  1334         """Return a dictionary of scheme -> proxy server URL mappings.
       
  1335 
       
  1336         Win32 uses the registry to store proxies.
       
  1337 
       
  1338         """
       
  1339         proxies = {}
       
  1340         try:
       
  1341             import _winreg
       
  1342         except ImportError:
       
  1343             # Std module, so should be around - but you never know!
       
  1344             return proxies
       
  1345         try:
       
  1346             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
       
  1347                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
       
  1348             proxyEnable = _winreg.QueryValueEx(internetSettings,
       
  1349                                                'ProxyEnable')[0]
       
  1350             if proxyEnable:
       
  1351                 # Returned as Unicode but problems if not converted to ASCII
       
  1352                 proxyServer = str(_winreg.QueryValueEx(internetSettings,
       
  1353                                                        'ProxyServer')[0])
       
  1354                 if '=' in proxyServer:
       
  1355                     # Per-protocol settings
       
  1356                     for p in proxyServer.split(';'):
       
  1357                         protocol, address = p.split('=', 1)
       
  1358                         # See if address has a type:// prefix
       
  1359                         import re
       
  1360                         if not re.match('^([^/:]+)://', address):
       
  1361                             address = '%s://%s' % (protocol, address)
       
  1362                         proxies[protocol] = address
       
  1363                 else:
       
  1364                     # Use one setting for all protocols
       
  1365                     if proxyServer[:5] == 'http:':
       
  1366                         proxies['http'] = proxyServer
       
  1367                     else:
       
  1368                         proxies['http'] = 'http://%s' % proxyServer
       
  1369                         proxies['ftp'] = 'ftp://%s' % proxyServer
       
  1370             internetSettings.Close()
       
  1371         except (WindowsError, ValueError, TypeError):
       
  1372             # Either registry key not found etc, or the value in an
       
  1373             # unexpected format.
       
  1374             # proxies already set up to be empty so nothing to do
       
  1375             pass
       
  1376         return proxies
       
  1377 
       
    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns the settings gathered from the environment when there
        are any; the registry is consulted only when the environment
        yields an empty mapping.

        """
        return getproxies_environment() or getproxies_registry()
       
  1386 
       
  1387     def proxy_bypass(host):
       
  1388         try:
       
  1389             import _winreg
       
  1390             import re
       
  1391         except ImportError:
       
  1392             # Std modules, so should be around - but you never know!
       
  1393             return 0
       
  1394         try:
       
  1395             internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
       
  1396                 r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
       
  1397             proxyEnable = _winreg.QueryValueEx(internetSettings,
       
  1398                                                'ProxyEnable')[0]
       
  1399             proxyOverride = str(_winreg.QueryValueEx(internetSettings,
       
  1400                                                      'ProxyOverride')[0])
       
  1401             # ^^^^ Returned as Unicode but problems if not converted to ASCII
       
  1402         except WindowsError:
       
  1403             return 0
       
  1404         if not proxyEnable or not proxyOverride:
       
  1405             return 0
       
  1406         # try to make a host list from name and IP address.
       
  1407         rawHost, port = splitport(host)
       
  1408         host = [rawHost]
       
  1409         try:
       
  1410             addr = socket.gethostbyname(rawHost)
       
  1411             if addr != rawHost:
       
  1412                 host.append(addr)
       
  1413         except socket.error:
       
  1414             pass
       
  1415         try:
       
  1416             fqdn = socket.getfqdn(rawHost)
       
  1417             if fqdn != rawHost:
       
  1418                 host.append(fqdn)
       
  1419         except socket.error:
       
  1420             pass
       
  1421         # make a check value list from the registry entry: replace the
       
  1422         # '<local>' string by the localhost entry and the corresponding
       
  1423         # canonical entry.
       
  1424         proxyOverride = proxyOverride.split(';')
       
  1425         i = 0
       
  1426         while i < len(proxyOverride):
       
  1427             if proxyOverride[i] == '<local>':
       
  1428                 proxyOverride[i:i+1] = ['localhost',
       
  1429                                         '127.0.0.1',
       
  1430                                         socket.gethostname(),
       
  1431                                         socket.gethostbyname(
       
  1432                                             socket.gethostname())]
       
  1433             i += 1
       
  1434         # print proxyOverride
       
  1435         # now check if we match one of the registry values.
       
  1436         for test in proxyOverride:
       
  1437             test = test.replace(".", r"\.")     # mask dots
       
  1438             test = test.replace("*", r".*")     # change glob sequence
       
  1439             test = test.replace("?", r".")      # change glob char
       
  1440             for val in host:
       
  1441                 # print "%s <--> %s" %( test, val )
       
  1442                 if re.match(test, val, re.I):
       
  1443                     return 1
       
  1444         return 0
       
  1445 
       
  1446 else:
       
  1447     # By default use environment variables
       
  1448     getproxies = getproxies_environment
       
  1449 
       
    def proxy_bypass(host):
        """Return 0 for every host: no host is reported as bypassing the proxy."""
        return 0
       
  1452 
       
  1453 # Test and time quote() and unquote()
       
  1454 def test1():
       
  1455     s = ''
       
  1456     for i in range(256): s = s + chr(i)
       
  1457     s = s*4
       
  1458     t0 = time.time()
       
  1459     qs = quote(s)
       
  1460     uqs = unquote(qs)
       
  1461     t1 = time.time()
       
  1462     if uqs != s:
       
  1463         print 'Wrong!'
       
  1464     print repr(s)
       
  1465     print repr(qs)
       
  1466     print repr(uqs)
       
  1467     print round(t1 - t0, 3), 'sec'
       
  1468 
       
  1469 
       
  1470 def reporthook(blocknum, blocksize, totalsize):
       
  1471     # Report during remote transfers
       
  1472     print "Block number: %d, Block size: %d, Total size: %d" % (
       
  1473         blocknum, blocksize, totalsize)
       
  1474 
       
  1475 # Test program
       
  1476 def test(args=[]):
       
  1477     if not args:
       
  1478         args = [
       
  1479             '/etc/passwd',
       
  1480             'file:/etc/passwd',
       
  1481             'file://localhost/etc/passwd',
       
  1482             'ftp://ftp.gnu.org/pub/README',
       
  1483 ##          'gopher://gopher.micro.umn.edu/1/',
       
  1484             'http://www.python.org/index.html',
       
  1485             ]
       
  1486         if hasattr(URLopener, "open_https"):
       
  1487             args.append('https://synergy.as.cmu.edu/~geek/')
       
  1488     try:
       
  1489         for url in args:
       
  1490             print '-'*10, url, '-'*10
       
  1491             fn, h = urlretrieve(url, None, reporthook)
       
  1492             print fn
       
  1493             if h:
       
  1494                 print '======'
       
  1495                 for k in h.keys(): print k + ':', h[k]
       
  1496                 print '======'
       
  1497             fp = open(fn, 'rb')
       
  1498             data = fp.read()
       
  1499             del fp
       
  1500             if '\r' in data:
       
  1501                 table = string.maketrans("", "")
       
  1502                 data = data.translate(table, "\r")
       
  1503             print data
       
  1504             fn, h = None, None
       
  1505         print '-'*40
       
  1506     finally:
       
  1507         urlcleanup()
       
  1508 
       
  1509 def main():
       
  1510     import getopt, sys
       
  1511     try:
       
  1512         opts, args = getopt.getopt(sys.argv[1:], "th")
       
  1513     except getopt.error, msg:
       
  1514         print msg
       
  1515         print "Use -h for help"
       
  1516         return
       
  1517     t = 0
       
  1518     for o, a in opts:
       
  1519         if o == '-t':
       
  1520             t = t + 1
       
  1521         if o == '-h':
       
  1522             print "Usage: python urllib.py [-t] [url ...]"
       
  1523             print "-t runs self-test;",
       
  1524             print "otherwise, contents of urls are printed"
       
  1525             return
       
  1526     if t:
       
  1527         if t > 1:
       
  1528             test1()
       
  1529         test(args)
       
  1530     else:
       
  1531         if not args:
       
  1532             print "Use -h for help"
       
  1533         for url in args:
       
  1534             print urlopen(url).read(),
       
  1535 
       
# Run the command-line driver when this module is executed as a script
# rather than imported: main() prints the given URLs, or runs the
# self-test when -t is passed.
if __name__ == '__main__':
    main()