|
1 # $Id: DTDDecls.pm,v 1.7 2005/10/14 20:31:20 matt Exp $ |
|
2 |
|
3 package XML::SAX::PurePerl; |
|
4 |
|
5 use strict; |
|
6 use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar); |
|
7 |
|
8 sub elementdecl { |
|
9 my ($self, $reader) = @_; |
|
10 |
|
11 my $data = $reader->data(9); |
|
12 return 0 unless $data =~ /^<!ELEMENT/; |
|
13 $reader->move_along(9); |
|
14 |
|
15 $self->skip_whitespace($reader) || |
|
16 $self->parser_error("No whitespace after ELEMENT declaration", $reader); |
|
17 |
|
18 my $name = $self->Name($reader); |
|
19 |
|
20 $self->skip_whitespace($reader) || |
|
21 $self->parser_error("No whitespace after ELEMENT's name", $reader); |
|
22 |
|
23 $self->contentspec($reader, $name); |
|
24 |
|
25 $self->skip_whitespace($reader); |
|
26 |
|
27 $reader->match('>') or $self->parser_error("Closing angle bracket not found on ELEMENT declaration", $reader); |
|
28 |
|
29 return 1; |
|
30 } |
|
31 |
|
32 sub contentspec { |
|
33 my ($self, $reader, $name) = @_; |
|
34 |
|
35 my $data = $reader->data(5); |
|
36 |
|
37 my $model; |
|
38 if ($data =~ /^EMPTY/) { |
|
39 $reader->move_along(5); |
|
40 $model = 'EMPTY'; |
|
41 } |
|
42 elsif ($data =~ /^ANY/) { |
|
43 $reader->move_along(3); |
|
44 $model = 'ANY'; |
|
45 } |
|
46 else { |
|
47 $model = $self->Mixed_or_children($reader); |
|
48 } |
|
49 |
|
50 if ($model) { |
|
51 # call SAX callback now. |
|
52 $self->element_decl({Name => $name, Model => $model}); |
|
53 return 1; |
|
54 } |
|
55 |
|
56 $self->parser_error("contentspec not found in ELEMENT declaration", $reader); |
|
57 } |
|
58 |
|
59 sub Mixed_or_children { |
|
60 my ($self, $reader) = @_; |
|
61 |
|
62 my $data = $reader->data(8); |
|
63 $data =~ /^\(/ or return; # $self->parser_error("No opening bracket in Mixed or children", $reader); |
|
64 |
|
65 if ($data =~ /^\(\s*\#PCDATA/) { |
|
66 $reader->match('('); |
|
67 $self->skip_whitespace($reader); |
|
68 $reader->move_along(7); |
|
69 my $model = $self->Mixed($reader); |
|
70 return $model; |
|
71 } |
|
72 |
|
73 # not matched - must be Children |
|
74 return $self->children($reader); |
|
75 } |
|
76 |
|
77 # Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' '*' ) |
|
78 # | ( '(' S* PCDATA S* ')' ) |
|
79 sub Mixed { |
|
80 my ($self, $reader) = @_; |
|
81 |
|
82 # Mixed_or_children already matched '(' S* '#PCDATA' |
|
83 |
|
84 my $model = '(#PCDATA'; |
|
85 |
|
86 $self->skip_whitespace($reader); |
|
87 |
|
88 my %seen; |
|
89 |
|
90 while (1) { |
|
91 last unless $reader->match('|'); |
|
92 $self->skip_whitespace($reader); |
|
93 |
|
94 my $name = $self->Name($reader) || |
|
95 $self->parser_error("No 'Name' after Mixed content '|'", $reader); |
|
96 |
|
97 if ($seen{$name}) { |
|
98 $self->parser_error("Element '$name' has already appeared in this group", $reader); |
|
99 } |
|
100 $seen{$name}++; |
|
101 |
|
102 $model .= "|$name"; |
|
103 |
|
104 $self->skip_whitespace($reader); |
|
105 } |
|
106 |
|
107 $reader->match(')') || $self->parser_error("no closing bracket on mixed content", $reader); |
|
108 |
|
109 $model .= ")"; |
|
110 |
|
111 if ($reader->match('*')) { |
|
112 $model .= "*"; |
|
113 } |
|
114 |
|
115 return $model; |
|
116 } |
|
117 |
|
118 # [[47]] Children ::= ChoiceOrSeq Cardinality? |
|
119 # [[48]] Cp ::= ( QName | ChoiceOrSeq ) Cardinality? |
|
120 # ChoiceOrSeq ::= '(' S* Cp ( Choice | Seq )? S* ')' |
|
121 # [[49]] Choice ::= ( S* '|' S* Cp )+ |
|
122 # [[50]] Seq ::= ( S* ',' S* Cp )+ |
|
123 # // Children ::= (Choice | Seq) Cardinality? |
|
124 # // Cp ::= ( QName | Choice | Seq) Cardinality? |
|
125 # // Choice ::= '(' S* Cp ( S* '|' S* Cp )+ S* ')' |
|
126 # // Seq ::= '(' S* Cp ( S* ',' S* Cp )* S* ')' |
|
127 # [[51]] Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' MixedCardinality ) |
|
128 # | ( '(' S* PCDATA S* ')' ) |
|
129 # Cardinality ::= '?' | '+' | '*' |
|
130 # MixedCardinality ::= '*' |
|
131 sub children { |
|
132 my ($self, $reader) = @_; |
|
133 |
|
134 return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader); |
|
135 } |
|
136 |
|
137 sub ChoiceOrSeq { |
|
138 my ($self, $reader) = @_; |
|
139 |
|
140 $reader->match('(') or $self->parser_error("choice/seq contains no opening bracket", $reader); |
|
141 |
|
142 my $model = '('; |
|
143 |
|
144 $self->skip_whitespace($reader); |
|
145 |
|
146 $model .= $self->Cp($reader); |
|
147 |
|
148 if (my $choice = $self->Choice($reader)) { |
|
149 $model .= $choice; |
|
150 } |
|
151 else { |
|
152 $model .= $self->Seq($reader); |
|
153 } |
|
154 |
|
155 $self->skip_whitespace($reader); |
|
156 |
|
157 $reader->match(')') or $self->parser_error("choice/seq contains no closing bracket", $reader); |
|
158 |
|
159 $model .= ')'; |
|
160 |
|
161 return $model; |
|
162 } |
|
163 |
|
164 sub Cardinality { |
|
165 my ($self, $reader) = @_; |
|
166 # cardinality is always optional |
|
167 my $data = $reader->data; |
|
168 if ($data =~ /^([\?\+\*])/) { |
|
169 $reader->move_along(1); |
|
170 return $1; |
|
171 } |
|
172 return ''; |
|
173 } |
|
174 |
|
175 sub Cp { |
|
176 my ($self, $reader) = @_; |
|
177 |
|
178 my $model; |
|
179 my $name = eval |
|
180 { |
|
181 if (my $name = $self->Name($reader)) { |
|
182 return $name . $self->Cardinality($reader); |
|
183 } |
|
184 }; |
|
185 return $name if defined $name; |
|
186 return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader); |
|
187 } |
|
188 |
|
189 sub Choice { |
|
190 my ($self, $reader) = @_; |
|
191 |
|
192 my $model = ''; |
|
193 $self->skip_whitespace($reader); |
|
194 |
|
195 while ($reader->match('|')) { |
|
196 $self->skip_whitespace($reader); |
|
197 $model .= '|'; |
|
198 $model .= $self->Cp($reader); |
|
199 $self->skip_whitespace($reader); |
|
200 } |
|
201 |
|
202 return $model; |
|
203 } |
|
204 |
|
205 sub Seq { |
|
206 my ($self, $reader) = @_; |
|
207 |
|
208 my $model = ''; |
|
209 $self->skip_whitespace($reader); |
|
210 |
|
211 while ($reader->match(',')) { |
|
212 $self->skip_whitespace($reader); |
|
213 my $cp = $self->Cp($reader); |
|
214 if ($cp) { |
|
215 $model .= ','; |
|
216 $model .= $cp; |
|
217 } |
|
218 $self->skip_whitespace($reader); |
|
219 } |
|
220 |
|
221 return $model; |
|
222 } |
|
223 |
|
224 sub AttlistDecl { |
|
225 my ($self, $reader) = @_; |
|
226 |
|
227 my $data = $reader->data(9); |
|
228 if ($data =~ /^<!ATTLIST/) { |
|
229 # It's an attlist |
|
230 |
|
231 $reader->move_along(9); |
|
232 |
|
233 $self->skip_whitespace($reader) || |
|
234 $self->parser_error("No whitespace after ATTLIST declaration", $reader); |
|
235 my $name = $self->Name($reader); |
|
236 |
|
237 $self->AttDefList($reader, $name); |
|
238 |
|
239 $self->skip_whitespace($reader); |
|
240 |
|
241 $reader->match('>') or $self->parser_error("Closing angle bracket not found on ATTLIST declaration", $reader); |
|
242 |
|
243 return 1; |
|
244 } |
|
245 |
|
246 return 0; |
|
247 } |
|
248 |
|
249 sub AttDefList { |
|
250 my ($self, $reader, $name) = @_; |
|
251 |
|
252 1 while $self->AttDef($reader, $name); |
|
253 } |
|
254 |
|
255 sub AttDef { |
|
256 my ($self, $reader, $el_name) = @_; |
|
257 |
|
258 $self->skip_whitespace($reader) || return 0; |
|
259 my $att_name = $self->Name($reader) || return 0; |
|
260 $self->skip_whitespace($reader) || |
|
261 $self->parser_error("No whitespace after Name in attribute definition", $reader); |
|
262 my $att_type = $self->AttType($reader); |
|
263 |
|
264 $self->skip_whitespace($reader) || |
|
265 $self->parser_error("No whitespace after AttType in attribute definition", $reader); |
|
266 my ($mode, $value) = $self->DefaultDecl($reader); |
|
267 |
|
268 # fire SAX event here! |
|
269 $self->attribute_decl({ |
|
270 eName => $el_name, |
|
271 aName => $att_name, |
|
272 Type => $att_type, |
|
273 Mode => $mode, |
|
274 Value => $value, |
|
275 }); |
|
276 return 1; |
|
277 } |
|
278 |
|
279 sub AttType { |
|
280 my ($self, $reader) = @_; |
|
281 |
|
282 return $self->StringType($reader) || |
|
283 $self->TokenizedType($reader) || |
|
284 $self->EnumeratedType($reader) || |
|
285 $self->parser_error("Can't match AttType", $reader); |
|
286 } |
|
287 |
|
288 sub StringType { |
|
289 my ($self, $reader) = @_; |
|
290 |
|
291 my $data = $reader->data(5); |
|
292 return unless $data =~ /^CDATA/; |
|
293 $reader->move_along(5); |
|
294 return 'CDATA'; |
|
295 } |
|
296 |
|
297 sub TokenizedType { |
|
298 my ($self, $reader) = @_; |
|
299 |
|
300 my $data = $reader->data(8); |
|
301 if ($data =~ /^(IDREFS?|ID|ENTITIES|ENTITY|NMTOKENS?)/) { |
|
302 $reader->move_along(length($1)); |
|
303 return $1; |
|
304 } |
|
305 return; |
|
306 } |
|
307 |
|
308 sub EnumeratedType { |
|
309 my ($self, $reader) = @_; |
|
310 return $self->NotationType($reader) || $self->Enumeration($reader); |
|
311 } |
|
312 |
|
313 sub NotationType { |
|
314 my ($self, $reader) = @_; |
|
315 |
|
316 my $data = $reader->data(8); |
|
317 return unless $data =~ /^NOTATION/; |
|
318 $reader->move_along(8); |
|
319 |
|
320 $self->skip_whitespace($reader) || |
|
321 $self->parser_error("No whitespace after NOTATION", $reader); |
|
322 $reader->match('(') or $self->parser_error("No opening bracket in notation section", $reader); |
|
323 |
|
324 $self->skip_whitespace($reader); |
|
325 my $model = 'NOTATION ('; |
|
326 my $name = $self->Name($reader) || |
|
327 $self->parser_error("No name in notation section", $reader); |
|
328 $model .= $name; |
|
329 $self->skip_whitespace($reader); |
|
330 $data = $reader->data; |
|
331 while ($data =~ /^\|/) { |
|
332 $reader->move_along(1); |
|
333 $model .= '|'; |
|
334 $self->skip_whitespace($reader); |
|
335 my $name = $self->Name($reader) || |
|
336 $self->parser_error("No name in notation section", $reader); |
|
337 $model .= $name; |
|
338 $self->skip_whitespace($reader); |
|
339 $data = $reader->data; |
|
340 } |
|
341 $data =~ /^\)/ or $self->parser_error("No closing bracket in notation section", $reader); |
|
342 $reader->move_along(1); |
|
343 |
|
344 $model .= ')'; |
|
345 |
|
346 return $model; |
|
347 } |
|
348 |
|
349 sub Enumeration { |
|
350 my ($self, $reader) = @_; |
|
351 |
|
352 return unless $reader->match('('); |
|
353 |
|
354 $self->skip_whitespace($reader); |
|
355 my $model = '('; |
|
356 my $nmtoken = $self->Nmtoken($reader) || |
|
357 $self->parser_error("No Nmtoken in enumerated declaration", $reader); |
|
358 $model .= $nmtoken; |
|
359 $self->skip_whitespace($reader); |
|
360 my $data = $reader->data; |
|
361 while ($data =~ /^\|/) { |
|
362 $model .= '|'; |
|
363 $reader->move_along(1); |
|
364 $self->skip_whitespace($reader); |
|
365 my $nmtoken = $self->Nmtoken($reader) || |
|
366 $self->parser_error("No Nmtoken in enumerated declaration", $reader); |
|
367 $model .= $nmtoken; |
|
368 $self->skip_whitespace($reader); |
|
369 $data = $reader->data; |
|
370 } |
|
371 $data =~ /^\)/ or $self->parser_error("No closing bracket in enumerated declaration", $reader); |
|
372 $reader->move_along(1); |
|
373 |
|
374 $model .= ')'; |
|
375 |
|
376 return $model; |
|
377 } |
|
378 |
|
379 sub Nmtoken { |
|
380 my ($self, $reader) = @_; |
|
381 return $self->Name($reader); |
|
382 } |
|
383 |
|
384 sub DefaultDecl { |
|
385 my ($self, $reader) = @_; |
|
386 |
|
387 my $data = $reader->data(9); |
|
388 if ($data =~ /^(\#REQUIRED|\#IMPLIED)/) { |
|
389 $reader->move_along(length($1)); |
|
390 return $1; |
|
391 } |
|
392 my $model = ''; |
|
393 if ($data =~ /^\#FIXED/) { |
|
394 $reader->move_along(6); |
|
395 $self->skip_whitespace($reader) || $self->parser_error( |
|
396 "no whitespace after FIXED specifier", $reader); |
|
397 my $value = $self->AttValue($reader); |
|
398 return "#FIXED", $value; |
|
399 } |
|
400 my $value = $self->AttValue($reader); |
|
401 return undef, $value; |
|
402 } |
|
403 |
|
404 sub EntityDecl { |
|
405 my ($self, $reader) = @_; |
|
406 |
|
407 my $data = $reader->data(8); |
|
408 return 0 unless $data =~ /^<!ENTITY/; |
|
409 $reader->move_along(8); |
|
410 |
|
411 $self->skip_whitespace($reader) || $self->parser_error( |
|
412 "No whitespace after ENTITY declaration", $reader); |
|
413 |
|
414 $self->PEDecl($reader) || $self->GEDecl($reader); |
|
415 |
|
416 $self->skip_whitespace($reader); |
|
417 |
|
418 $reader->match('>') or $self->parser_error("No closing '>' in entity definition", $reader); |
|
419 |
|
420 return 1; |
|
421 } |
|
422 |
|
423 sub GEDecl { |
|
424 my ($self, $reader) = @_; |
|
425 |
|
426 my $name = $self->Name($reader) || $self->parser_error("No entity name given", $reader); |
|
427 $self->skip_whitespace($reader) || $self->parser_error("No whitespace after entity name", $reader); |
|
428 |
|
429 # TODO: ExternalID calls lexhandler method. Wrong place for it. |
|
430 my $value; |
|
431 if ($value = $self->ExternalID($reader)) { |
|
432 $value .= $self->NDataDecl($reader); |
|
433 } |
|
434 else { |
|
435 $value = $self->EntityValue($reader); |
|
436 } |
|
437 |
|
438 if ($self->{ParseOptions}{entities}{$name}) { |
|
439 warn("entity $name already exists\n"); |
|
440 } else { |
|
441 $self->{ParseOptions}{entities}{$name} = 1; |
|
442 $self->{ParseOptions}{expanded_entity}{$name} = $value; # ??? |
|
443 } |
|
444 # do callback? |
|
445 return 1; |
|
446 } |
|
447 |
|
448 sub PEDecl { |
|
449 my ($self, $reader) = @_; |
|
450 |
|
451 return 0 unless $reader->match('%'); |
|
452 |
|
453 $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity marker", $reader); |
|
454 my $name = $self->Name($reader) || $self->parser_error("No parameter entity name given", $reader); |
|
455 $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity name", $reader); |
|
456 my $value = $self->ExternalID($reader) || |
|
457 $self->EntityValue($reader) || |
|
458 $self->parser_error("PE is not a value or an external resource", $reader); |
|
459 # do callback? |
|
460 return 1; |
|
461 } |
|
462 |
|
463 my $quotre = qr/[^%&\"]/; |
|
464 my $aposre = qr/[^%&\']/; |
|
465 |
|
466 sub EntityValue { |
|
467 my ($self, $reader) = @_; |
|
468 |
|
469 my $data = $reader->data; |
|
470 my $quote = '"'; |
|
471 my $re = $quotre; |
|
472 if (!$data =~ /^"/) { |
|
473 $data =~ /^'/ or $self->parser_error("Not a quote character", $reader); |
|
474 $quote = "'"; |
|
475 $re = $aposre; |
|
476 } |
|
477 $reader->move_along(1); |
|
478 |
|
479 my $value = ''; |
|
480 |
|
481 while (1) { |
|
482 my $data = $reader->data; |
|
483 |
|
484 $self->parser_error("EOF found while reading entity value", $reader) |
|
485 unless length($data); |
|
486 |
|
487 if ($data =~ /^($re+)/) { |
|
488 my $match = $1; |
|
489 $value .= $match; |
|
490 $reader->move_along(length($match)); |
|
491 } |
|
492 elsif ($reader->match('&')) { |
|
493 # if it's a char ref, expand now: |
|
494 if ($reader->match('#')) { |
|
495 my $char; |
|
496 my $ref = ''; |
|
497 if ($reader->match('x')) { |
|
498 my $data = $reader->data; |
|
499 while (1) { |
|
500 $self->parser_error("EOF looking for reference end", $reader) |
|
501 unless length($data); |
|
502 if ($data !~ /^([0-9a-fA-F]*)/) { |
|
503 last; |
|
504 } |
|
505 $ref .= $1; |
|
506 $reader->move_along(length($1)); |
|
507 if (length($1) == length($data)) { |
|
508 $data = $reader->data; |
|
509 } |
|
510 else { |
|
511 last; |
|
512 } |
|
513 } |
|
514 $char = chr_ref(hex($ref)); |
|
515 $ref = "x$ref"; |
|
516 } |
|
517 else { |
|
518 my $data = $reader->data; |
|
519 while (1) { |
|
520 $self->parser_error("EOF looking for reference end", $reader) |
|
521 unless length($data); |
|
522 if ($data !~ /^([0-9]*)/) { |
|
523 last; |
|
524 } |
|
525 $ref .= $1; |
|
526 $reader->move_along(length($1)); |
|
527 if (length($1) == length($data)) { |
|
528 $data = $reader->data; |
|
529 } |
|
530 else { |
|
531 last; |
|
532 } |
|
533 } |
|
534 $char = chr($ref); |
|
535 } |
|
536 $reader->match(';') || |
|
537 $self->parser_error("No semi-colon found after character reference", $reader); |
|
538 if ($char !~ $SingleChar) { # match a single character |
|
539 $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader); |
|
540 } |
|
541 $value .= $char; |
|
542 } |
|
543 else { |
|
544 # entity refs in entities get expanded later, so don't parse now. |
|
545 $value .= '&'; |
|
546 } |
|
547 } |
|
548 elsif ($reader->match('%')) { |
|
549 $value .= $self->PEReference($reader); |
|
550 } |
|
551 elsif ($reader->match($quote)) { |
|
552 # end of attrib |
|
553 last; |
|
554 } |
|
555 else { |
|
556 $self->parser_error("Invalid character in attribute value: " . substr($reader->data, 0, 1), $reader); |
|
557 } |
|
558 } |
|
559 |
|
560 return $value; |
|
561 } |
|
562 |
|
563 sub NDataDecl { |
|
564 my ($self, $reader) = @_; |
|
565 $self->skip_whitespace($reader) || return ''; |
|
566 my $data = $reader->data(5); |
|
567 return '' unless $data =~ /^NDATA/; |
|
568 $reader->move_along(5); |
|
569 $self->skip_whitespace($reader) || $self->parser_error("No whitespace after NDATA declaration", $reader); |
|
570 my $name = $self->Name($reader) || $self->parser_error("NDATA declaration lacks a proper Name", $reader); |
|
571 return " NDATA $name"; |
|
572 } |
|
573 |
|
574 sub NotationDecl { |
|
575 my ($self, $reader) = @_; |
|
576 |
|
577 my $data = $reader->data(10); |
|
578 return 0 unless $data =~ /^<!NOTATION/; |
|
579 $reader->move_along(10); |
|
580 $self->skip_whitespace($reader) || |
|
581 $self->parser_error("No whitespace after NOTATION declaration", $reader); |
|
582 $data = $reader->data; |
|
583 my $value = ''; |
|
584 while(1) { |
|
585 $self->parser_error("EOF found while looking for end of NotationDecl", $reader) |
|
586 unless length($data); |
|
587 |
|
588 if ($data =~ /^([^>]*)>/) { |
|
589 $value .= $1; |
|
590 $reader->move_along(length($1) + 1); |
|
591 $self->notation_decl({Name => "FIXME", SystemId => "FIXME", PublicId => "FIXME" }); |
|
592 last; |
|
593 } |
|
594 else { |
|
595 $value .= $data; |
|
596 $reader->move_along(length($data)); |
|
597 $data = $reader->data; |
|
598 } |
|
599 } |
|
600 return 1; |
|
601 } |
|
602 |
|
603 1; |