#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. If it is interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better. Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""


__version__ = "$Revision: 50851 $"


import sys
import os
from types import *
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib
import cgi

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking


# Global variables


def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = int(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = int(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", "\n ".join(c.roots)
    return c
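
# A minimal sketch of driving Checker programmatically, mirroring what
# main() does above (the root URL is hypothetical):
#
#   c = Checker()
#   c.setflags(verbose=0, checkext=0)
#   c.addroot("http://www.example.com/")
#   c.run()
#   c.report()
#   c.save_pickle(DUMPFILE)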


class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                urlparse.urlparse(root)
            i = path.rfind("/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))
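
    # For instance (illustrative URL), addroot("http://www.example.com/docs/intro.html")
    # stores "http://www.example.com/docs/" as the root used for prefix
    # matching in inroots(), while the full URL itself is queued for checking.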

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)
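
    # Each round processes at most self.roundsize pending URLs, taken in
    # sorted order from self.todo; dopage() moves each one to self.done
    # and may add newly discovered links back into self.todo.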

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        try:
            page = self.getpage(url_pair)
        except sgmllib.SGMLParseError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error parsing %s: %s",
                      self.format_url(url_pair), msg)
            # Don't actually mark the URL as bad - it exists, we just
            # can't parse it!
            page = None
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)
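
    # In outline, getpage() works as follows: a URL already recorded in
    # name_table is served from that cache; mailto/news/javascript/telnet
    # URLs are skipped; external URLs are merely opened (and safely closed)
    # to verify that they exist; internal HTML documents are parsed into a
    # Page object.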

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = cgi.parse_header(info['content-type'])[0].lower()
            if ';' in ctype:
                # handle content-type: text/html; charset=iso8859-1 :
                ctype = ctype.split(';', 1)[0].strip()
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The page is parsed in __init__() in order to initialize the
        # list of names the file contains. The parser is stored in an
        # instance variable, and the URL is passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # The page text was read and parsed in __init__(); self.parser
        # indicates whether parsing succeeded.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos
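
    # For example (illustrative URL), a page at http://www.example.com/a/
    # containing <A HREF="b.html#sec1"> yields the info tuple
    # ("http://www.example.com/a/b.html", "b.html", "sec1").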


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                exc_type, exc_value, exc_tb = sys.exc_info()
                raise IOError, msg, exc_tb
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)
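
    # For a directory URL without an index.html file, the synthesized
    # listing looks roughly like this (the path shown is the DEFROOT
    # default; the entries are whatever os.listdir() returns):
    #   <BASE HREF="file:/usr/local/etc/httpd/htdocs/">
    #   <A HREF="manual.html">manual.html</A>
    #   ...
    # so webchecker can walk a local document tree as if it were served
    # by an HTTP daemon.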


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def check_name_id(self, attributes):
        """ Check the name or id attributes on an element.
        """
        # We must rescue the NAME or id (name is deprecated in XHTML)
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name" or name == "id":
                if value in self.names:
                    self.checker.message("WARNING: duplicate ID name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def unknown_starttag(self, tag, attributes):
        """ In XHTML, you can have id attributes on any element.
        """
        self.check_name_id(attributes)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')
        self.check_name_id(attributes)

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')
        self.check_name_id(attributes)

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')
        self.check_name_id(attributes)

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')
        self.check_name_id(attributes)

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = value.lower().split()
                if (parts == ["stylesheet"]
                    or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")
                    break
        self.check_name_id(attributes)

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')
        self.check_name_id(attributes)

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')
        self.check_name_id(attributes)

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')
        self.check_name_id(attributes)

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = value.strip()
                if value: self.links[value] = None
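
    # For example, do_img() above passes 'src' and 'lowsrc' to link_attr(),
    # so <IMG SRC="pic.gif" LOWSRC="pic-small.gif"> (illustrative file
    # names) records both URLs as links to be checked.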

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = value.strip()
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value
        self.check_name_id(attributes)

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base


if __name__ == '__main__':
    main()