|
1 import unittest, StringIO, robotparser |
|
2 from test import test_support |
|
3 |
|
4 class RobotTestCase(unittest.TestCase): |
|
5 def __init__(self, index, parser, url, good, agent): |
|
6 unittest.TestCase.__init__(self) |
|
7 if good: |
|
8 self.str = "RobotTest(%d, good, %s)" % (index, url) |
|
9 else: |
|
10 self.str = "RobotTest(%d, bad, %s)" % (index, url) |
|
11 self.parser = parser |
|
12 self.url = url |
|
13 self.good = good |
|
14 self.agent = agent |
|
15 |
|
16 def runTest(self): |
|
17 if isinstance(self.url, tuple): |
|
18 agent, url = self.url |
|
19 else: |
|
20 url = self.url |
|
21 agent = self.agent |
|
22 if self.good: |
|
23 self.failUnless(self.parser.can_fetch(agent, url)) |
|
24 else: |
|
25 self.failIf(self.parser.can_fetch(agent, url)) |
|
26 |
|
27 def __str__(self): |
|
28 return self.str |
|
29 |
|
# Suite that collects every RobotTestCase generated below; test_main() runs it.
tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse `robots_txt` once and queue one RobotTestCase per URL.

    index      -- example number, passed through to each RobotTestCase.
    robots_txt -- the robots.txt document as a single string.
    good_urls  -- entries queued with good=1.
    bad_urls   -- entries queued with good=0.
    agent      -- user agent passed through to each RobotTestCase.

    All generated cases share the same parser instance and are appended to
    the module-level `tests` suite, good URLs first.
    """
    robots_lines = StringIO.StringIO(robots_txt).readlines()
    rules = robotparser.RobotFileParser()
    rules.parse(robots_lines)
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for url in urls:
            tests.addTest(RobotTestCase(index, rules, url, expected, agent))
|
42 |
|
# Each example below rebinds `doc`, `good` and `bad` and then calls
# RobotTest(), which queues one test case per listed URL.

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. A single "User-agent: *" record; two Disallow lines carry trailing
#    "#" comments.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2. A "*" record plus a "cybermapper" record with an empty Disallow.
#    The tuple entry in `good` is checked as agent 'cybermapper' instead
#    of the default agent.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3. "Disallow: /" for every agent; no URL is expected to be fetchable.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4. and 5. Percent-encoded paths in Disallow lines.  The same document and
#    URL lists are checked under two different agent strings.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6. Variant of the previous document for all agents: "/tmp/" (with the
#    trailing slash) is disallowed, and '/tmp' itself is expected to be
#    fetchable.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7. "Disallow: /." -- only '/foo.html' is checked, and it is expected to
#    be fetchable.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8. An Allow line for one file, followed by a Disallow for its folder.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9. This file is incorrect because "Googlebot" is a substring of
#    "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

# 10. Same document as #9, checked as "Googlebot-Mobile".
good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")

# 11. Get the order correct: the "Googlebot-Mobile" record now comes
#     before the "Googlebot" record.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

# 12. Same document as #11, checked as "Googlebot-Mobile".
good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")


# 13. Google also got the order wrong in #8. You need to specify the
#     URLs from more specific to more general.
#     The document repeats #8; the agent is given in lowercase here.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")
|
203 |
|
204 |
|
205 |
|
class TestCase(unittest.TestCase):
    """Network check against a live site's robots.txt."""

    def runTest(self):
        """Expect can_fetch() to be False for the site's own robots.txt.

        Requires the 'network' test resource.  The site is expected to be
        entirely password-protected (per the original author's note).
        """
        test_support.requires('network')
        # whole site is password-protected.
        site = 'http://mueblesmoraleda.com'
        rules = robotparser.RobotFileParser()
        rules.set_url(site)
        rules.read()
        allowed = rules.can_fetch("*", site + "/robots.txt")
        self.assertEqual(allowed, False)
|
215 |
|
def test_main():
    """Run the generated robots.txt checks, then the network check.

    Both are run through test_support.run_unittest so that the outcome of
    the network case is reported.  Calling ``TestCase().run()`` with no
    result argument records the outcome in a temporary result object that
    nothing inspects, so a failure there would go unnoticed.
    """
    test_support.run_unittest(tests)
    # Pass an instance: the class defines runTest() rather than test* methods.
    test_support.run_unittest(TestCase())
|
219 |
|
if __name__ == "__main__":
    # Executed directly as a script: turn on verbose output, then run
    # everything.
    test_support.verbose = 1
    test_main()