symbian-qemu-0.9.1-12/python-2.6.1/Tools/webchecker/websucker.py
changeset 1 2fb8b9db1c86
       
#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision: 28654 $"

import os
import sys
import urllib
import getopt

import webchecker

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]

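# main() drives a mirroring run: it parses the -q/-v verbosity flags, creates
# a Sucker, adds a websucker User-agent header to its urlopener, registers
# each command-line argument as a root URL, and starts the crawl.
# Typical invocation (hypothetical URL): websucker.py -v http://www.example.com/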
       
def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
            ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()

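# Sucker subclasses webchecker.Checker and turns the link-checking crawl into
# a mirroring one: checkext = 0 tells webchecker not to check offsite links,
# and nonames = 1 suppresses name-anchor checking.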
       
class Sucker(webchecker.Checker):

    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

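    # readhtml() is the main mirroring hook.  If the page was already saved
    # under savefilename(url) it is re-read from disk; otherwise it is fetched
    # with openpage(), written to disk, and its text is returned only when the
    # server (or the URL) says it is HTML.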
       
    def readhtml(self, url_pair):
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

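    # savefile() writes the downloaded bytes to the mirror path, creating any
    # missing directories first; a failed write is reported but not fatal.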
       
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))

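    # savefilename() maps a URL to a local path of the form <host>/<path>,
    # turning an empty path or trailing "/" into index.html and translating
    # "/" to the platform's directory separator when they differ.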
       
    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                path = os.sep + path
        path = os.path.join(host, path)
        return path

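# makedirs() creates the directory for a saved file, recursively building any
# missing parents; if a plain file already sits where a directory is needed,
# it is moved aside to become that directory's index.html.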
       
def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh?  Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)

if __name__ == '__main__':
    sys.exit(main() or 0)