|
1 #! /usr/bin/env python |
|
2 |
|
3 """A variant on webchecker that creates a mirror copy of a remote site.""" |
|
4 |
|
5 __version__ = "$Revision: 28654 $" |
|
6 |
|
7 import os |
|
8 import sys |
|
9 import urllib |
|
10 import getopt |
|
11 |
|
12 import webchecker |
|
13 |
|
# The revision keyword expands to "$Revision: NNNNN $".  When the string has
# exactly that three-field shape, keep only the number in the middle;
# otherwise leave __version__ untouched.
if __version__[0] == '$':
    _v = __version__.split()
    __version__ = _v[1] if len(_v) == 3 else __version__
|
19 |
|
def main():
    """Parse the command line and mirror every root URL given on it.

    Options:
        -q  reset the verbosity to 0
        -v  raise the verbosity by one (may be repeated)

    The verbosity starts at ``webchecker.VERBOSE``.  Every remaining
    argument is registered as a root URL on a ``Sucker`` instance, which is
    then run.

    Returns 2 when the options cannot be parsed, and None otherwise.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    # Neither option takes a value, so only the flag itself is inspected.
    for o, _ in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Replace the opener's headers so requests identify this tool and its
    # version.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        c.addroot(arg)
    print("Run...")
    c.run()
|
43 |
|
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it fetches.

    Each page is stored under a relative path made of the lower-cased host
    name followed by the URL path (see ``savefilename``).  A page whose
    local file already exists is read from disk instead of being fetched.
    """

    # Settings read by webchecker.Checker, which is defined outside this
    # file.  NOTE(review): their exact meaning should be confirmed there.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return ``(text, url)`` for the page named by *url_pair*.

        Only ``url_pair[0]`` is used to locate the local copy.  If that copy
        exists it is read from disk; otherwise the page is opened with
        ``self.openpage``, saved via ``self.savefile`` and returned.

        *text* is None when the page could not be opened or when
        ``self.checkforhtml`` rejects it.  *url* is the requested URL, or
        the one reported by the opened page's ``geturl()`` when that
        differs.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch the page and mirror it.
            f = self.openpage(url_pair)
            if f:
                # Close the page even if reading it fails part-way.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # Store the page under the URL it was actually
                        # served from.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                # The copy is saved whatever its type; only the returned
                # text depends on the checkforhtml verdict.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # A local copy exists.  There are no response headers for it,
            # so checkforhtml is given an empty mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write *text* to *path*, creating parent directories as needed.

        The outcome is reported through ``self.message``.  I/O and OS errors
        raised while creating the directories or writing the file are
        reported the same way and are not propagated to the caller.
        """
        dirname = os.path.dirname(path)
        try:
            makedirs(dirname)
            # The with-block closes the file even when write() fails.
            with open(path, "wb") as f:
                f.write(text)
        except (IOError, OSError) as msg:
            self.message("didn't save %s: %s", path, str(msg))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the relative local path under which *url* is mirrored.

        The result is ``<host>/<path>``: the host is lower-cased and stripped
        of any user and port part, "index.html" is appended to an empty or
        slash-terminated path, and "/" is converted to ``os.sep``.

        "." and ".." segments are resolved here and can never climb above
        the host directory, so a URL cannot name a file outside the mirror
        tree.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()
        # Resolve dot segments by hand: URLs come from remote pages and must
        # not be able to escape the mirror directory via "..".
        kept = []
        for segment in path.lstrip("/").split("/"):
            if segment == "..":
                if kept:
                    kept.pop()
            elif segment != ".":
                kept.append(segment)
        if segment in (".", ".."):
            # A trailing dot segment names a directory, not a file.
            kept.append("")
        # Popping can expose an empty leading segment; strip it again so the
        # join with the host below stays relative.
        path = "/".join(kept).lstrip("/")
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
            if os.name == "mac":
                # NOTE(review): presumably a leading separator marks a
                # relative path on classic Mac OS -- confirm.
                path = os.sep + path
        path = os.path.join(host, path)
        return path
|
104 |
|
def makedirs(dir):
    """Create directory *dir* together with any missing parent directories.

    Nothing happens when *dir* is empty or is already a directory.

    When *dir* exists but is not a directory, it is renamed to
    ``<dir>/index.html`` inside a newly created directory of the same name.
    That conversion is best effort: OS errors are ignored, but if the
    directory could not be created the entry is moved back to its original
    name.

    A trailing path separator is accepted (``"a/b/"`` creates ``a/b``).

    Raises OSError when a missing directory cannot be created.
    """
    if not dir:
        return
    if os.path.isdir(dir):
        return
    if os.path.exists(dir):
        # Something that is not a directory occupies the name (for example
        # a page saved before any of its sub-pages were seen).
        backup = dir + ".bak"
        try:
            os.rename(dir, backup)
            os.mkdir(dir)
            os.rename(backup, os.path.join(dir, "index.html"))
        except OSError:
            # Best effort, but do not leave the file stranded as ".bak" when
            # the directory never came into existence.
            if os.path.exists(backup) and not os.path.exists(dir):
                try:
                    os.rename(backup, dir)
                except OSError:
                    pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Only a trailing separator was split off; create the directory
            # it names.
            makedirs(head)
            return
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
|
123 |
|
if __name__ == '__main__':
    # main() returns 2 on a usage error and None otherwise; None becomes
    # exit status 0.
    exit_status = main() or 0
    sys.exit(exit_status)