symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/urlparse.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 """Parse (absolute and relative) URLs.
       
     2 
       
     3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
       
     4 UC Irvine, June 1995.
       
     5 """
       
     6 
       
     7 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
       
     8            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
       
     9 
       
    # Scheme classification tables.  The empty string stands for "no scheme",
    # so listing '' makes a rule apply to scheme-less URLs by default.
    # These are plain lists on purpose: they are public, mutable module data.

    # urljoin() only resolves a relative reference for these schemes.
    uses_relative = [
        'ftp', 'http', 'gopher', 'nntp', 'imap',
        'wais', 'file', 'https', 'shttp', 'mms',
        'prospero', 'rtsp', 'rtspu', '', 'sftp',
    ]

    # urlsplit() extracts a '//netloc' part only for these schemes.
    uses_netloc = [
        'ftp', 'http', 'gopher', 'nntp', 'telnet',
        'imap', 'wais', 'file', 'mms', 'https', 'shttp',
        'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
        'svn', 'svn+ssh', 'sftp',
    ]

    # NOTE(review): not referenced by any function visible in this file.
    non_hierarchical = [
        'gopher', 'hdl', 'mailto', 'news',
        'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
    ]

    # urlparse() splits ';params' off the path only for these schemes.
    uses_params = [
        'ftp', 'hdl', 'prospero', 'http', 'imap',
        'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
        'mms', '', 'sftp',
    ]

    # urlsplit() splits off a '?query' only for these schemes.
    uses_query = [
        'http', 'wais', 'imap', 'https', 'shttp', 'mms',
        'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '',
    ]

    # urlsplit() splits off a '#fragment' only for these schemes.
    uses_fragment = [
        'ftp', 'hdl', 'http', 'gopher', 'news',
        'nntp', 'wais', 'https', 'shttp', 'snews',
        'file', 'prospero', '',
    ]

    # Characters valid in scheme names: letters, digits, '+', '-' and '.'.
    scheme_chars = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                    '0123456789' '+-.')

    # urlsplit() empties _parse_cache once it holds this many entries.
    MAX_CACHE_SIZE = 20
    # Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
    # SplitResult that urlsplit() computed for those arguments.
    _parse_cache = {}
       
    37 
       
    def clear_cache():
        """Empty the module-level cache of urlsplit() results, in place."""
        # Mutate the existing dict rather than rebinding the name, so any
        # other reference to the same dict object sees it emptied too.
        _parse_cache.clear()
       
    41 
       
    42 
       
    class ResultMixin(object):
        """Shared methods for the parsed result objects.

        The host class must provide a ``netloc`` string attribute of the
        form ``[user[:password]@]host[:port]``.  Each property returns None
        when the corresponding part is absent.
        """

        @property
        def username(self):
            """User name from the netloc, or None if there is no '@'."""
            userinfo = self._userinfo()
            if userinfo is None:
                return None
            return userinfo.split(":", 1)[0]

        @property
        def password(self):
            """Password from the netloc, or None if none was given."""
            userinfo = self._userinfo()
            if userinfo is None or ":" not in userinfo:
                return None
            return userinfo.split(":", 1)[1]

        @property
        def hostname(self):
            """Lower-cased host name, or None if the host part is empty.

            A bracketed IPv6 literal such as ``[::1]`` is returned without
            its brackets.
            """
            return self._hostport()[0].lower() or None

        @property
        def port(self):
            """Port as an int, or None if no port (or an empty one) is given.

            Raises ValueError if the port text is not a decimal integer.
            """
            port = self._hostport()[1]
            if not port:
                # Both "host" and "host:" mean "no explicit port".
                return None
            return int(port, 10)

        def _userinfo(self):
            """Return the text before the last '@' in netloc, or None."""
            netloc = self.netloc
            if "@" not in netloc:
                return None
            return netloc.rsplit("@", 1)[0]

        def _hostport(self):
            """Return (host, port_text) from netloc; port_text may be ''."""
            hostport = self.netloc.rsplit("@", 1)[-1]
            if hostport.startswith("[") and "]" in hostport:
                # IPv6 literal: the colons inside the brackets belong to the
                # address, so a port can only follow the closing bracket.
                host, _, rest = hostport[1:].partition("]")
                return host, rest.partition(":")[2]
            host, _, port = hostport.partition(":")
            return host, port
       
    83 
       
    84 from collections import namedtuple
       
    85 
       
    class SplitResult(
            namedtuple('SplitResult',
                       ['scheme', 'netloc', 'path', 'query', 'fragment']),
            ResultMixin):
        """Named 5-tuple (scheme, netloc, path, query, fragment).

        Built by urlsplit(); ResultMixin adds the netloc-derived properties.
        """

        # No per-instance attributes are declared beyond the tuple fields.
        __slots__ = ()

        def geturl(self):
            """Reassemble the five fields into a URL string via urlunsplit()."""
            return urlunsplit(self)
       
    92 
       
    93 
       
    class ParseResult(
            namedtuple('ParseResult',
                       ['scheme', 'netloc', 'path', 'params', 'query',
                        'fragment']),
            ResultMixin):
        """Named 6-tuple (scheme, netloc, path, params, query, fragment).

        Built by urlparse(); ResultMixin adds the netloc-derived properties.
        """

        # No per-instance attributes are declared beyond the tuple fields.
        __slots__ = ()

        def geturl(self):
            """Reassemble the six fields into a URL string via urlunparse()."""
            return urlunparse(self)
       
   100 
       
   101 
       
   def urlparse(url, scheme='', allow_fragments=True):
       """Parse a URL into 6 components:
       scheme://netloc/path;params?query#fragment
       Return a ParseResult 6-tuple:
       (scheme, netloc, path, params, query, fragment).
       The components are not broken up into smaller bits (e.g. netloc
       stays a single string) and % escapes are not expanded."""
       scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                        allow_fragments)
       params = ''
       # Parameters are only split off for schemes listed in uses_params.
       if scheme in uses_params and ';' in path:
           path, params = _splitparams(path)
       return ParseResult(scheme, netloc, path, params, query, fragment)
       
   115 
       
   def _splitparams(url):
       """Split the ';params' suffix off the last segment of a path.

       Returns (path, params).  Only a ';' at or after the last '/' counts;
       params is '' when there is none.  Previously a path with neither
       '/' nor ';' lost its final character and was echoed back as params.
       """
       # rfind() yields -1 when there is no '/'.  Clamp it to 0 so find()
       # scans from the start instead of from the last character.
       search_from = max(url.rfind('/'), 0)
       semi = url.find(';', search_from)
       if semi < 0:
           return url, ''
       return url[:semi], url[semi + 1:]
       
   124 
       
   def _splitnetloc(url, start=0):
       """Split url at the first '/', '?' or '#' found at or after start.

       Returns (url[start:cut], url[cut:]), i.e. (domain, rest).  When none
       of the delimiters occurs, the domain runs to the end of url and rest
       is ''.
       """
       found = [pos for pos in (url.find(ch, start) for ch in '/?#')
                if pos >= 0]
       # The earliest delimiter wins, whichever character it is.
       cut = min(found) if found else len(url)
       return url[start:cut], url[cut:]
       
   132 
       
   def urlsplit(url, scheme='', allow_fragments=True):
       """Parse a URL into 5 components:
       scheme://netloc/path?query#fragment
       Return a SplitResult 5-tuple: (scheme, netloc, path, query, fragment).
       The components are not broken up into smaller bits (e.g. netloc
       stays a single string) and % escapes are not expanded.
       Results are memoized in the module-level parse cache."""
       allow_fragments = bool(allow_fragments)
       # The argument types are part of the key, so inputs that compare
       # equal but differ in type get separate cache entries.
       cache_key = url, scheme, allow_fragments, type(url), type(scheme)
       hit = _parse_cache.get(cache_key, None)
       if hit:
           return hit
       if len(_parse_cache) >= MAX_CACHE_SIZE:
           # Avoid runaway growth: drop everything once the cache is full.
           clear_cache()

       netloc = query = fragment = ''
       colon = url.find(':')
       if colon > 0 and url[:colon] == 'http':
           # Fast path for the common case; the scheme tables are skipped.
           scheme = url[:colon].lower()
           url = url[colon + 1:]
           if url[:2] == '//':
               netloc, url = _splitnetloc(url, 2)
           if allow_fragments and '#' in url:
               url, fragment = url.split('#', 1)
           if '?' in url:
               url, query = url.split('?', 1)
       else:
           # A leading "xxx:" is only taken as a scheme when every character
           # before the colon is a valid scheme character.
           if colon > 0 and all(ch in scheme_chars for ch in url[:colon]):
               scheme, url = url[:colon].lower(), url[colon + 1:]
           if scheme in uses_netloc and url[:2] == '//':
               netloc, url = _splitnetloc(url, 2)
           if allow_fragments and scheme in uses_fragment and '#' in url:
               url, fragment = url.split('#', 1)
           if scheme in uses_query and '?' in url:
               url, query = url.split('?', 1)

       result = SplitResult(scheme, netloc, url, query, fragment)
       _parse_cache[cache_key] = result
       return result
       
   175 
       
   def urlunparse(data):
       """Rebuild a URL string from a 6-item
       (scheme, netloc, path, params, query, fragment) sequence.

       The result may differ slightly from the URL that was parsed while
       still being equivalent, e.g. when the original had redundant
       delimiters such as a '?' followed by an empty query (the draft
       states that these are equivalent)."""
       scheme, netloc, path, params, query, fragment = data
       if params:
           # Fold the parameters back onto the path before the 5-part join.
           path = "%s;%s" % (path, params)
       return urlunsplit((scheme, netloc, path, query, fragment))
       
   185 
       
   def urlunsplit(data):
       """Rebuild a URL string from a 5-item
       (scheme, netloc, path, query, fragment) sequence.

       Empty scheme, query and fragment components are omitted together
       with their delimiters."""
       scheme, netloc, path, query, fragment = data
       # An authority part is written when a netloc is present, or when the
       # scheme takes one and the path does not already begin with '//'.
       wants_authority = netloc or (scheme and scheme in uses_netloc
                                    and path[:2] != '//')
       if wants_authority:
           if path and path[:1] != '/':
               path = '/' + path
           path = '//' + (netloc or '') + path
       if scheme:
           path = scheme + ':' + path
       if query:
           path = path + '?' + query
       if fragment:
           path = path + '#' + fragment
       return path
       
   198 
       
   def urljoin(base, url, allow_fragments=True):
       """Combine a base URL with a possibly relative URL and return the
       absolute form of the latter.

       An empty base returns url unchanged; an empty url returns base."""
       if not base:
           return url
       if not url:
           return base
       bscheme, bnetloc, bpath, bparams, bquery, _bfragment = \
               urlparse(base, '', allow_fragments)
       scheme, netloc, path, params, query, fragment = \
               urlparse(url, bscheme, allow_fragments)

       # A different scheme, or one that has no relative form, means url is
       # already as absolute as it is going to get.
       if scheme != bscheme or scheme not in uses_relative:
           return url
       if scheme in uses_netloc:
           if netloc:
               return urlunparse((scheme, netloc, path,
                                  params, query, fragment))
           netloc = bnetloc
       if path[:1] == '/':
           return urlunparse((scheme, netloc, path,
                              params, query, fragment))
       if not path:
           if params:
               # url has params but no path: the base path is used with its
               # final character dropped, and the query is not inherited.
               path = bpath[:-1]
           else:
               path = bpath
               params = bparams
               if not query:
                   query = bquery
           return urlunparse((scheme, netloc, path,
                              params, query, fragment))

       # Relative path: replace the last segment of the base path with the
       # segments of url, then normalise '.' and '..'.
       segments = bpath.split('/')[:-1] + path.split('/')
       # XXX The stuff below is bogus in various ways...
       if segments[-1] == '.':
           segments[-1] = ''
       segments = [seg for seg in segments if seg != '.']
       while True:
           # Collapse one "name/.." pair per pass, never touching the first
           # or last segment, until a full pass finds nothing to collapse.
           for idx in range(1, len(segments) - 1):
               if (segments[idx] == '..'
                       and segments[idx - 1] not in ('', '..')):
                   del segments[idx - 1:idx + 1]
                   break
           else:
               break
       if segments == ['', '..']:
           segments[-1] = ''
       elif len(segments) >= 2 and segments[-1] == '..':
           segments[-2:] = ['']
       return urlunparse((scheme, netloc, '/'.join(segments),
                          params, query, fragment))
       
   255 
       
   def urldefrag(url):
       """Strip the fragment, if any, from url.

       Returns a (defragmented_url, fragment) tuple.  When url contains no
       '#', it is returned unchanged and the fragment is the empty string.
       """
       if '#' not in url:
           return url, ''
       scheme, netloc, path, params, query, fragment = urlparse(url)
       stripped = urlunparse((scheme, netloc, path, params, query, ''))
       return stripped, fragment
       
   269 
       
   # Local unquote() for parse_qs() and parse_qsl().
   # It cannot be taken directly from urllib, because that would create a
   # circular reference: urllib itself uses this module (urljoin).

   # Maps every two-digit hex escape to its character.  All case
   # combinations are included so that mixed-case escapes such as '%aB'
   # decode as well; previously only 'ab' and 'AB' style keys existed.
   _hexdig = '0123456789ABCDEFabcdef'
   _hextochr = dict((hi + lo, chr(int(hi + lo, 16)))
                    for hi in _hexdig for lo in _hexdig)

   def unquote(s):
       """unquote('abc%20def') -> 'abc def'.

       Replaces each '%XX' escape (XX being two hex digits in any case) by
       the character it encodes.  A '%' that is not followed by two hex
       digits is left in the output unchanged.
       """
       pieces = s.split('%')
       # Text before the first '%' can never contain an escape.
       decoded = [pieces[0]]
       for item in pieces[1:]:
           try:
               decoded.append(_hextochr[item[:2]] + item[2:])
           except KeyError:
               # Not a valid escape: put the '%' back and keep the text.
               decoded.append('%' + item)
           except UnicodeDecodeError:
               # Python 2: a byte above 0x7f could not be concatenated with
               # unicode text, so build the character as unicode instead.
               decoded.append(unichr(int(item[:2], 16)) + item[2:])
       return "".join(decoded)
       
   289 
       
   def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
       """Parse a URL-encoded query string into a dictionary.

           Arguments:

           qs: URL-encoded query string to be parsed

           keep_blank_values: flag indicating whether blank values in
               URL encoded queries should be treated as blank strings.
               A true value keeps them as blank strings.  The default
               false value ignores them, as if they were not included.

           strict_parsing: flag indicating what to do with parsing errors.
               If false (the default), errors are silently ignored.
               If true, errors raise a ValueError exception.

           Returns a dict mapping each field name to the list of its
           values, in the order parse_qsl() produced them.
       """
       grouped = {}
       for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
           grouped.setdefault(name, []).append(value)
       return grouped
       
   315 
       
   def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
       """Parse a URL-encoded query string into a list of pairs.

       Arguments:

       qs: URL-encoded query string to be parsed

       keep_blank_values: flag indicating whether blank values in
           URL encoded queries should be treated as blank strings.  A
           true value keeps them as blank strings.  The default false
           value ignores them, as if they were not included.

       strict_parsing: flag indicating what to do with parsing errors. If
           false (the default), errors are silently ignored. If true,
           errors raise a ValueError exception.

       Returns a list of (name, value) tuples in query order.  In both
       names and values '+' is turned into a space and % escapes are
       decoded.
       """
       # Fields may be separated by either '&' or ';'.
       fields = [field for chunk in qs.split('&')
                 for field in chunk.split(';')]
       result = []
       for field in fields:
           if not field and not strict_parsing:
               continue
           name, sep, value = field.partition('=')
           if not sep:
               if strict_parsing:
                   # Call form of raise: valid on Python 2 and Python 3.
                   raise ValueError("bad query field: %r" % (field,))
               # A control name with no equal sign is only kept (with a
               # blank value) when blank values were asked for.
               if not keep_blank_values:
                   continue
           if value or keep_blank_values:
               name = unquote(name.replace('+', ' '))
               value = unquote(value.replace('+', ' '))
               result.append((name, value))

       return result
       
   355 
       
   356 
       
   # Self-test data: the first non-blank line is the base URL; every later
   # line is "relative = <URL:expected>" and is checked by test().
   test_input = """
         http://a/b/c/d

         g:h        = <URL:g:h>
         http:g     = <URL:http://a/b/c/g>
         http:      = <URL:http://a/b/c/d>
         g          = <URL:http://a/b/c/g>
         ./g        = <URL:http://a/b/c/g>
         g/         = <URL:http://a/b/c/g/>
         /g         = <URL:http://a/g>
         //g        = <URL:http://g>
         ?y         = <URL:http://a/b/c/d?y>
         g?y        = <URL:http://a/b/c/g?y>
         g?y/./x    = <URL:http://a/b/c/g?y/./x>
         .          = <URL:http://a/b/c/>
         ./         = <URL:http://a/b/c/>
         ..         = <URL:http://a/b/>
         ../        = <URL:http://a/b/>
         ../g       = <URL:http://a/b/g>
         ../..      = <URL:http://a/>
         ../../g    = <URL:http://a/g>
         ../../../g = <URL:http://a/../g>
         ./../g     = <URL:http://a/b/g>
         ./g/.      = <URL:http://a/b/c/g/>
         /./g       = <URL:http://a/./g>
         g/./h      = <URL:http://a/b/c/g/h>
         g/../h     = <URL:http://a/b/c/h>
         http:g     = <URL:http://a/b/c/g>
         http:      = <URL:http://a/b/c/d>
         http:?y         = <URL:http://a/b/c/d?y>
         http:g?y        = <URL:http://a/b/c/g?y>
         http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
   """

   def test():
       """Run the urljoin() self-test and print the results.

       Input is read from the file named by sys.argv[1] ('-' means stdin),
       or from the built-in test_input when no argument is given.  The
       first URL read becomes the base; each later URL is parsed, joined
       against the base and printed.  A line of the form
       "url = <URL:expected>" is flagged with EXPECTED when the join
       result differs.
       """
       import sys
       base = ''
       opened = None   # file this function opened itself, if any
       if sys.argv[1:]:
           fn = sys.argv[1]
           if fn == '-':
               fp = sys.stdin
           else:
               fp = opened = open(fn)
       else:
           try:
               from cStringIO import StringIO
           except ImportError:
               from StringIO import StringIO
           fp = StringIO(test_input)
       try:
           for line in fp:
               words = line.split()
               if not words:
                   continue
               url = words[0]
               parts = urlparse(url)
               # Single-argument print(...) gives identical output whether
               # print is a statement or a function.
               print('%-10s : %s' % (url, parts))
               joined = urljoin(base, url)
               if not base:
                   base = joined
               wrapped = '<URL:%s>' % joined
               print('%-10s = %s' % (url, wrapped))
               if len(words) == 3 and words[1] == '=':
                   if wrapped != words[2]:
                       print('EXPECTED %s !!!!!!!!!!' % (words[2],))
       finally:
           # Only close what was opened here; stdin is left alone.
           if opened is not None:
               opened.close()

   if __name__ == '__main__':
       test()