|
1 # Very simple test - Parse a file and print what happens |
|
2 |
|
3 # XXX TypeErrors on calling handlers, or on bad return values from a |
|
4 # handler, are obscure and unhelpful. |
|
5 |
|
6 import pyexpat |
|
7 from xml.parsers import expat |
|
8 |
|
9 from test.test_support import sortdict, TestFailed |
|
10 |
|
11 class Outputter: |
|
12 def StartElementHandler(self, name, attrs): |
|
13 print 'Start element:\n\t', repr(name), sortdict(attrs) |
|
14 |
|
15 def EndElementHandler(self, name): |
|
16 print 'End element:\n\t', repr(name) |
|
17 |
|
18 def CharacterDataHandler(self, data): |
|
19 data = data.strip() |
|
20 if data: |
|
21 print 'Character data:' |
|
22 print '\t', repr(data) |
|
23 |
|
24 def ProcessingInstructionHandler(self, target, data): |
|
25 print 'PI:\n\t', repr(target), repr(data) |
|
26 |
|
27 def StartNamespaceDeclHandler(self, prefix, uri): |
|
28 print 'NS decl:\n\t', repr(prefix), repr(uri) |
|
29 |
|
30 def EndNamespaceDeclHandler(self, prefix): |
|
31 print 'End of NS decl:\n\t', repr(prefix) |
|
32 |
|
33 def StartCdataSectionHandler(self): |
|
34 print 'Start of CDATA section' |
|
35 |
|
36 def EndCdataSectionHandler(self): |
|
37 print 'End of CDATA section' |
|
38 |
|
39 def CommentHandler(self, text): |
|
40 print 'Comment:\n\t', repr(text) |
|
41 |
|
42 def NotationDeclHandler(self, *args): |
|
43 name, base, sysid, pubid = args |
|
44 print 'Notation declared:', args |
|
45 |
|
46 def UnparsedEntityDeclHandler(self, *args): |
|
47 entityName, base, systemId, publicId, notationName = args |
|
48 print 'Unparsed entity decl:\n\t', args |
|
49 |
|
50 def NotStandaloneHandler(self, userData): |
|
51 print 'Not standalone' |
|
52 return 1 |
|
53 |
|
54 def ExternalEntityRefHandler(self, *args): |
|
55 context, base, sysId, pubId = args |
|
56 print 'External entity ref:', args[1:] |
|
57 return 1 |
|
58 |
|
59 def DefaultHandler(self, userData): |
|
60 pass |
|
61 |
|
62 def DefaultHandlerExpand(self, userData): |
|
63 pass |
|
64 |
|
65 |
|
def confirm(ok):
    """Print "OK." when ok is true and "Not OK." otherwise."""
    if not ok:
        print("Not OK.")
        return
    print("OK.")
|
71 |
|
out = Outputter()
parser = expat.ParserCreate(namespace_separator='!')

# Test getting/setting returns_unicode, ordered_attributes and
# specified_attributes, in that order.  Each flag is assigned 0, 1, 2, 0
# in turn and has to read back as 0, 1, 1, 0 (assigning 2 is expected to
# read back as 1).
for flag_name in ('returns_unicode',
                  'ordered_attributes',
                  'specified_attributes'):
    for assigned, read_back in ((0, 0), (1, 1), (2, 1), (0, 0)):
        setattr(parser, flag_name, assigned)
        confirm(getattr(parser, flag_name) == read_back)
|
92 |
|
# Parser attributes that receive the Outputter method of the same name.
HANDLER_NAMES = [
    'StartElementHandler',
    'EndElementHandler',
    'CharacterDataHandler',
    'ProcessingInstructionHandler',
    'UnparsedEntityDeclHandler',
    'NotationDeclHandler',
    'StartNamespaceDeclHandler',
    'EndNamespaceDeclHandler',
    'CommentHandler',
    'StartCdataSectionHandler',
    'EndCdataSectionHandler',
    'DefaultHandler',
    'DefaultHandlerExpand',
    #'NotStandaloneHandler',
    'ExternalEntityRefHandler',
    ]
for name in HANDLER_NAMES:
    callback = getattr(out, name)
    setattr(parser, name, callback)
|
106 |
|
# Sample document handed to Parse()/ParseFile() by the tests in this file.
# Characters outside ASCII are spelled as numeric character references
# (&#226; and &#8000;): the document declares iso-8859-1, which cannot
# represent U+1F40 directly, and this source file has no coding declaration.
data = '''\
<?xml version="1.0" encoding="iso-8859-1" standalone="no"?>
<?xml-stylesheet href="stylesheet.css"?>
<!-- comment data -->
<!DOCTYPE quotations SYSTEM "quotations.dtd" [
<!ELEMENT root ANY>
<!NOTATION notation SYSTEM "notation.jpeg">
<!ENTITY acirc "&#226;">
<!ENTITY external_entity SYSTEM "entity.file">
<!ENTITY unparsed_entity SYSTEM "entity.file" NDATA notation>
%unparsed_entity;
]>

<root attr1="value1" attr2="value2&#8000;">
<myns:subelement xmlns:myns="http://www.python.org/namespace">
Contents of subelements
</myns:subelement>
<sub2><![CDATA[contents of CDATA section]]></sub2>
&external_entity;
</root>
'''
|
128 |
|
129 # Produce UTF-8 output |
|
130 parser.returns_unicode = 0 |
|
131 try: |
|
132 parser.Parse(data, 1) |
|
133 except expat.error: |
|
134 print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode) |
|
135 print '** Line', parser.ErrorLineNumber |
|
136 print '** Column', parser.ErrorColumnNumber |
|
137 print '** Byte', parser.ErrorByteIndex |
|
138 |
|
139 # Try the parse again, this time producing Unicode output |
|
140 parser = expat.ParserCreate(namespace_separator='!') |
|
141 parser.returns_unicode = 1 |
|
142 |
|
143 for name in HANDLER_NAMES: |
|
144 setattr(parser, name, getattr(out, name)) |
|
145 try: |
|
146 parser.Parse(data, 1) |
|
147 except expat.error: |
|
148 print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode) |
|
149 print '** Line', parser.ErrorLineNumber |
|
150 print '** Column', parser.ErrorColumnNumber |
|
151 print '** Byte', parser.ErrorByteIndex |
|
152 |
|
153 # Try parsing a file |
|
154 parser = expat.ParserCreate(namespace_separator='!') |
|
155 parser.returns_unicode = 1 |
|
156 |
|
157 for name in HANDLER_NAMES: |
|
158 setattr(parser, name, getattr(out, name)) |
|
159 import StringIO |
|
160 file = StringIO.StringIO(data) |
|
161 try: |
|
162 parser.ParseFile(file) |
|
163 except expat.error: |
|
164 print '** Error', parser.ErrorCode, expat.ErrorString(parser.ErrorCode) |
|
165 print '** Line', parser.ErrorLineNumber |
|
166 print '** Column', parser.ErrorColumnNumber |
|
167 print '** Byte', parser.ErrorByteIndex |
|
168 |
|
169 |
|
170 # Tests that make sure we get errors when the namespace_separator value |
|
171 # is illegal, and that we don't for good values: |
|
172 print |
|
173 print "Testing constructor for proper handling of namespace_separator values:" |
|
174 expat.ParserCreate() |
|
175 expat.ParserCreate(namespace_separator=None) |
|
176 expat.ParserCreate(namespace_separator=' ') |
|
177 print "Legal values tested o.k." |
|
178 try: |
|
179 expat.ParserCreate(namespace_separator=42) |
|
180 except TypeError, e: |
|
181 print "Caught expected TypeError:" |
|
182 print e |
|
183 else: |
|
184 print "Failed to catch expected TypeError." |
|
185 |
|
186 try: |
|
187 expat.ParserCreate(namespace_separator='too long') |
|
188 except ValueError, e: |
|
189 print "Caught expected ValueError:" |
|
190 print e |
|
191 else: |
|
192 print "Failed to catch expected ValueError." |
|
193 |
|
194 # ParserCreate() needs to accept a namespace_separator of zero length |
|
195 # to satisfy the requirements of RDF applications that are required |
|
196 # to simply glue together the namespace URI and the localname. Though |
|
197 # considered a wart of the RDF specifications, it needs to be supported. |
|
198 # |
|
199 # See XML-SIG mailing list thread starting with |
|
200 # http://mail.python.org/pipermail/xml-sig/2001-April/005202.html |
|
201 # |
|
202 expat.ParserCreate(namespace_separator='') # too short |
|
203 |
|
204 # Test the interning machinery. |
|
205 p = expat.ParserCreate() |
|
206 L = [] |
|
207 def collector(name, *args): |
|
208 L.append(name) |
|
209 p.StartElementHandler = collector |
|
210 p.EndElementHandler = collector |
|
211 p.Parse("<e> <e/> <e></e> </e>", 1) |
|
212 tag = L[0] |
|
213 if len(L) != 6: |
|
214 print "L should only contain 6 entries; found", len(L) |
|
215 for entry in L: |
|
216 if tag is not entry: |
|
217 print "expected L to contain many references to the same string", |
|
218 print "(it didn't)" |
|
219 print "L =", repr(L) |
|
220 break |
|
221 |
|
222 # Tests of the buffer_text attribute. |
|
223 import sys |
|
224 |
|
class TextCollector:
    """Record parser events as strings in self.stuff, in arrival order.

    parser is the expat parser the collector is attached to.  It is kept
    on the instance so StartElementHandler can switch that parser's
    buffer_text attribute instead of relying on a module-level name.
    """

    def __init__(self, parser):
        self.parser = parser
        self.stuff = []

    def check(self, expected, label):
        """Fail through require() with label unless stuff == expected."""
        require(self.stuff == expected,
                "%s\nstuff = %r\nexpected = %r"
                % (label, self.stuff, map(unicode, expected)))

    def CharacterDataHandler(self, text):
        """Record a chunk of character data unchanged."""
        self.stuff.append(text)

    def StartElementHandler(self, name, attrs):
        """Record "<name>" and honour a buffer-text="yes"/"no" attribute."""
        self.stuff.append("<%s>" % name)
        bt = attrs.get("buffer-text")
        if bt == "yes":
            self.parser.buffer_text = 1
        elif bt == "no":
            self.parser.buffer_text = 0

    def EndElementHandler(self, name):
        """Record "</name>"."""
        self.stuff.append("</%s>" % name)

    def CommentHandler(self, data):
        """Record the comment wrapped in "<!--" and "-->"."""
        self.stuff.append("<!--%s-->" % data)
|
250 |
|
def require(cond, label):
    """Raise TestFailed(label) unless cond is true.

    Similar to confirm(), but produces no output.
    """
    if cond:
        return
    raise TestFailed(label)
|
255 |
|
def setup(handlers=()):
    """Create a parser with buffer_text switched on and a TextCollector.

    handlers is an iterable of handler attribute names; for each of them
    the collector's method of that name is installed on the parser.  The
    collector's CharacterDataHandler is always installed.

    Returns the (parser, collector) pair.  Raises TestFailed (through
    require) when a freshly created parser already has buffer_text set.
    """
    parser = expat.ParserCreate()
    require(not parser.buffer_text,
            "buffer_text not disabled by default")
    parser.buffer_text = 1
    handler = TextCollector(parser)
    parser.CharacterDataHandler = handler.CharacterDataHandler
    for name in handlers:
        setattr(parser, name, getattr(handler, name))
    return parser, handler
|
266 |
|
parser, handler = setup()
require(parser.buffer_text,
        "text buffering either not acknowledged or not enabled")
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(["123"],
              "buffered text not properly collapsed")

# XXX This test exposes more detail of Expat's text chunking than we
# XXX like, but it tests what we need to concisely.
parser, handler = setup(["StartElementHandler"])
parser.Parse("<a>1<b buffer-text='no'/>2\n3<c buffer-text='yes'/>4\n5</a>", 1)
handler.check(["<a>", "1", "<b>", "2", "\n", "3", "<c>", "4\n5"],
              "buffering control not reacting as expected")

# The literal '<' and '>' of the expected text have to be written as
# &lt; and &gt; in the document; a bare "<2>" is not well-formed XML.
parser, handler = setup()
parser.Parse("<a>1<b/>&lt;2&gt;<c/> \n 3</a>", 1)
handler.check(["1<2> \n 3"],
              "buffered text not properly collapsed")

parser, handler = setup(["StartElementHandler"])
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(["<a>", "1", "<b>", "2", "<c>", "3"],
              "buffered text not properly split")

# With the character data handler removed only element events are recorded.
parser, handler = setup(["StartElementHandler", "EndElementHandler"])
parser.CharacterDataHandler = None
parser.Parse("<a>1<b/>2<c/>3</a>", 1)
handler.check(["<a>", "<b>", "</b>", "<c>", "</c>", "</a>"],
              "huh?")

parser, handler = setup(["StartElementHandler", "EndElementHandler"])
parser.Parse("<a>1<b></b>2<c/>3</a>", 1)
handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3", "</a>"],
              "huh?")

parser, handler = setup(["CommentHandler", "EndElementHandler",
                         "StartElementHandler"])
parser.Parse("<a>1<b/>2<c></c>345</a> ", 1)
handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "345", "</a>"],
              "buffered text not properly split")

parser, handler = setup(["CommentHandler", "EndElementHandler",
                         "StartElementHandler"])
parser.Parse("<a>1<b/>2<c></c>3<!--abc-->4<!--def-->5</a> ", 1)
handler.check(["<a>", "1", "<b>", "</b>", "2", "<c>", "</c>", "3",
               "<!--abc-->", "4", "<!--def-->", "5", "</a>"],
              "buffered text not properly split")
|
314 |
|
315 # Test handling of exception from callback: |
|
316 def StartElementHandler(name, attrs): |
|
317 raise RuntimeError(name) |
|
318 |
|
319 parser = expat.ParserCreate() |
|
320 parser.StartElementHandler = StartElementHandler |
|
321 |
|
322 try: |
|
323 parser.Parse("<a><b><c/></b></a>", 1) |
|
324 except RuntimeError, e: |
|
325 if e.args[0] != "a": |
|
326 print "Expected RuntimeError for element 'a'; found %r" % e.args[0] |
|
327 else: |
|
328 print "Expected RuntimeError for 'a'" |
|
329 |
|
330 # Test Current* members: |
|
class PositionTest:
    """Compare the parser's Current* attributes with a list of positions.

    expected_list holds one (event, byte index, line, column) tuple per
    start ('s') or end ('e') element event, in the order they must occur.
    """

    def __init__(self, expected_list, parser):
        self.expected_list = expected_list
        self.upto = 0
        self.parser = parser
        # Hook this object up for both kinds of element events.
        parser.StartElementHandler = self.StartElementHandler
        parser.EndElementHandler = self.EndElementHandler

    def StartElementHandler(self, name, attrs):
        """Check the position reported for a start tag."""
        self.check_pos('s')

    def EndElementHandler(self, name):
        """Check the position reported for an end tag."""
        self.check_pos('e')

    def check_pos(self, event):
        """Require that the current position equals the next expected one."""
        source = self.parser
        pos = (event,
               source.CurrentByteIndex,
               source.CurrentLineNumber,
               source.CurrentColumnNumber)
        index = self.upto
        require(index < len(self.expected_list),
                'too many parser events')
        expected = self.expected_list[index]
        require(pos == expected,
                'expected position %s, got %s' % (expected, pos))
        self.upto = index + 1
|
357 |
|
358 |
|
parser = expat.ParserCreate()
handler = PositionTest([('s', 0, 1, 0), ('s', 5, 2, 1), ('s', 11, 3, 2),
                        ('e', 15, 3, 6), ('e', 17, 4, 1), ('e', 22, 5, 0)],
                       parser)
# The newlines and leading spaces are significant: the expected byte
# indexes and column numbers above are computed from them, so they are
# spelled out explicitly instead of relying on source layout.
parser.Parse('<a>\n'
             ' <b>\n'
             '  <c/>\n'
             ' </b>\n'
             '</a>', 1)
|
368 |
|
369 |
|
def test_parse_only_xml_data():
    """Parse a document whose character data handler always raises.

    Regression test for http://python.org/sf/1296433.  It passes as long
    as the interpreter survives the call; whatever exception Parse()
    propagates is discarded.
    """
    xml = "<?xml version='1.0' encoding='iso8859'?><s>%s</s>" % ('a' * 1025)
    # this one doesn't crash
    #xml = "<?xml version='1.0'?><s>%s</s>" % ('a' * 10000)

    def handler(text):
        raise Exception

    parser = expat.ParserCreate()
    parser.CharacterDataHandler = handler

    try:
        parser.Parse(xml)
    except Exception:
        # Only ordinary errors are expected here (the handler's Exception
        # or an expat error); KeyboardInterrupt and SystemExit pass through.
        pass

test_parse_only_xml_data()