""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse
import urllib

__all__ = ["RobotFileParser"]


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

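    # A minimal usage sketch (hedged: the URL below is a placeholder, and
    # network access is only needed for read()):
    #
    #   rp = RobotFileParser()
    #   rp.set_url("http://www.example.com/robots.txt")
    #   rp.read()
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/page.html")
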
    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

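    # A hedged sketch of the refresh pattern mtime()/modified() support
    # (MAX_AGE is a hypothetical crawler policy, not part of this module):
    #
    #   if time.time() - rp.mtime() > MAX_AGE:
    #       rp.read()
    #       rp.modified()
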
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # the file exists but is access-protected: disallow everything
            self.disallow_all = True
        elif self.errcode >= 400:
            # any other error: act as if there is no robots.txt at all
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                # a blank line terminates the current record
                if state == 1:
                    # user-agent lines without any rules: discard the entry
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # a new record starts: store the collected entry
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # register a trailing record through _add_entry() so a final
            # "User-agent: *" block becomes the default entry
            self._add_entry(entry)

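    # A hedged trace of the state machine above on a tiny input
    # (agent name and path are invented for the example):
    #
    #   "User-agent: figtree"  -> state 1, useragents == ["figtree"]
    #   "Disallow: /tmp"       -> state 2, rulelines == [Disallow: /tmp]
    #   ""                     -> state 0, entry stored via _add_entry()
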
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

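    # Worked example of the normalization above (hypothetical URL): for
    # "http://example.com/a%2Fb?x=1" the unquote/urlparse/quote chain
    # extracts the path "/a/b"; an empty path falls back to "/".
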
    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


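# A hedged example of the prefix matching in RuleLine.applies_to (the paths
# are invented):
#
#   RuleLine("/private", False).applies_to("/private/data.html")  -> True
#   RuleLine("", False) becomes an allow-all rule, since an empty
#   Disallow: value means every path may be fetched.

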
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

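    # A hedged illustration of the matching above: an entry for "figtree"
    # applies to the header "FigTree/3.0 Robot libwww-perl/5.04", because
    # the name token ("figtree" after stripping the version and lowercasing)
    # contains the entry's agent name as a substring.
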
    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                # the first matching rule line decides
                return line.allowance
        return True


class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
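

# A minimal, hedged demo of the parser on an inline robots.txt (agent names
# and paths are invented for illustration; no network access is needed):
if __name__ == '__main__':
    rp = RobotFileParser()
    rp.parse([
        "User-agent: figtree",
        "Disallow: /tmp",
        "",
        "User-agent: *",
        "Disallow: /cgi-bin",
    ])
    print rp.can_fetch("figtree", "http://example.com/tmp/x")       # False
    print rp.can_fetch("figtree", "http://example.com/index.html")  # True
    print rp.can_fetch("Other", "http://example.com/cgi-bin/x")     # False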