uh_parser/XML/SAX/PurePerl/DTDDecls.pm
changeset 177 6d3c3db11e72
equal deleted inserted replaced
176:266a7e9b9237 177:6d3c3db11e72
       
     1 # $Id: DTDDecls.pm,v 1.7 2005/10/14 20:31:20 matt Exp $
       
     2 
       
     3 package XML::SAX::PurePerl;
       
     4 
       
     5 use strict;
       
     6 use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar);
       
     7 
       
     8 sub elementdecl {
       
     9     my ($self, $reader) = @_;
       
    10     
       
    11     my $data = $reader->data(9);
       
    12     return 0 unless $data =~ /^<!ELEMENT/;
       
    13     $reader->move_along(9);
       
    14     
       
    15     $self->skip_whitespace($reader) ||
       
    16         $self->parser_error("No whitespace after ELEMENT declaration", $reader);
       
    17     
       
    18     my $name = $self->Name($reader);
       
    19     
       
    20     $self->skip_whitespace($reader) ||
       
    21         $self->parser_error("No whitespace after ELEMENT's name", $reader);
       
    22         
       
    23     $self->contentspec($reader, $name);
       
    24     
       
    25     $self->skip_whitespace($reader);
       
    26     
       
    27     $reader->match('>') or $self->parser_error("Closing angle bracket not found on ELEMENT declaration", $reader);
       
    28     
       
    29     return 1;
       
    30 }
       
    31 
       
    32 sub contentspec {
       
    33     my ($self, $reader, $name) = @_;
       
    34     
       
    35     my $data = $reader->data(5);
       
    36     
       
    37     my $model;
       
    38     if ($data =~ /^EMPTY/) {
       
    39         $reader->move_along(5);
       
    40         $model = 'EMPTY';
       
    41     }
       
    42     elsif ($data =~ /^ANY/) {
       
    43         $reader->move_along(3);
       
    44         $model = 'ANY';
       
    45     }
       
    46     else {
       
    47         $model = $self->Mixed_or_children($reader);
       
    48     }
       
    49 
       
    50     if ($model) {
       
    51         # call SAX callback now.
       
    52         $self->element_decl({Name => $name, Model => $model});
       
    53         return 1;
       
    54     }
       
    55     
       
    56     $self->parser_error("contentspec not found in ELEMENT declaration", $reader);
       
    57 }
       
    58 
       
    59 sub Mixed_or_children {
       
    60     my ($self, $reader) = @_;
       
    61 
       
    62     my $data = $reader->data(8);
       
    63     $data =~ /^\(/ or return; # $self->parser_error("No opening bracket in Mixed or children", $reader);
       
    64     
       
    65     if ($data =~ /^\(\s*\#PCDATA/) {
       
    66         $reader->match('(');
       
    67         $self->skip_whitespace($reader);
       
    68         $reader->move_along(7);
       
    69         my $model = $self->Mixed($reader);
       
    70         return $model;
       
    71     }
       
    72 
       
    73     # not matched - must be Children
       
    74     return $self->children($reader);
       
    75 }
       
    76 
       
    77 # Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' '*' )
       
    78 #               | ( '(' S* PCDATA S* ')' )
       
    79 sub Mixed {
       
    80     my ($self, $reader) = @_;
       
    81 
       
    82     # Mixed_or_children already matched '(' S* '#PCDATA'
       
    83 
       
    84     my $model = '(#PCDATA';
       
    85             
       
    86     $self->skip_whitespace($reader);
       
    87 
       
    88     my %seen;
       
    89     
       
    90     while (1) {
       
    91         last unless $reader->match('|');
       
    92         $self->skip_whitespace($reader);
       
    93 
       
    94         my $name = $self->Name($reader) || 
       
    95             $self->parser_error("No 'Name' after Mixed content '|'", $reader);
       
    96 
       
    97         if ($seen{$name}) {
       
    98             $self->parser_error("Element '$name' has already appeared in this group", $reader);
       
    99         }
       
   100         $seen{$name}++;
       
   101 
       
   102         $model .= "|$name";
       
   103         
       
   104         $self->skip_whitespace($reader);
       
   105     }
       
   106     
       
   107     $reader->match(')') || $self->parser_error("no closing bracket on mixed content", $reader);
       
   108 
       
   109     $model .= ")";
       
   110 
       
   111     if ($reader->match('*')) {
       
   112         $model .= "*";
       
   113     }
       
   114     
       
   115     return $model;
       
   116 }
       
   117 
       
   118 # [[47]] Children ::= ChoiceOrSeq Cardinality?
       
   119 # [[48]] Cp ::= ( QName | ChoiceOrSeq ) Cardinality?
       
   120 #       ChoiceOrSeq ::= '(' S* Cp ( Choice | Seq )? S* ')'
       
   121 # [[49]] Choice ::= ( S* '|' S* Cp )+
       
   122 # [[50]] Seq    ::= ( S* ',' S* Cp )+
       
   123 #        // Children ::= (Choice | Seq) Cardinality?
       
   124 #        // Cp ::= ( QName | Choice | Seq) Cardinality?
       
   125 #        // Choice ::= '(' S* Cp ( S* '|' S* Cp )+ S* ')'
       
   126 #        // Seq    ::= '(' S* Cp ( S* ',' S* Cp )* S* ')'
       
   127 # [[51]] Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' MixedCardinality )
       
   128 #                | ( '(' S* PCDATA S* ')' )
       
   129 #        Cardinality ::= '?' | '+' | '*'
       
   130 #        MixedCardinality ::= '*'
       
   131 sub children {
       
   132     my ($self, $reader) = @_;
       
   133     
       
   134     return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
       
   135 }
       
   136 
       
   137 sub ChoiceOrSeq {
       
   138     my ($self, $reader) = @_;
       
   139     
       
   140     $reader->match('(') or $self->parser_error("choice/seq contains no opening bracket", $reader);
       
   141     
       
   142     my $model = '(';
       
   143     
       
   144     $self->skip_whitespace($reader);
       
   145 
       
   146     $model .= $self->Cp($reader);
       
   147     
       
   148     if (my $choice = $self->Choice($reader)) {
       
   149         $model .= $choice;
       
   150     }
       
   151     else {
       
   152         $model .= $self->Seq($reader);
       
   153     }
       
   154 
       
   155     $self->skip_whitespace($reader);
       
   156 
       
   157     $reader->match(')') or $self->parser_error("choice/seq contains no closing bracket", $reader);
       
   158 
       
   159     $model .= ')';
       
   160     
       
   161     return $model;
       
   162 }
       
   163 
       
   164 sub Cardinality {
       
   165     my ($self, $reader) = @_;
       
   166     # cardinality is always optional
       
   167     my $data = $reader->data;
       
   168     if ($data =~ /^([\?\+\*])/) {
       
   169         $reader->move_along(1);
       
   170         return $1;
       
   171     }
       
   172     return '';
       
   173 }
       
   174 
       
   175 sub Cp {
       
   176     my ($self, $reader) = @_;
       
   177 
       
   178     my $model;
       
   179     my $name = eval
       
   180     {
       
   181 	if (my $name = $self->Name($reader)) {
       
   182 	    return $name . $self->Cardinality($reader);
       
   183 	}
       
   184     };
       
   185     return $name if defined $name;
       
   186     return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
       
   187 }
       
   188 
       
   189 sub Choice {
       
   190     my ($self, $reader) = @_;
       
   191     
       
   192     my $model = '';
       
   193     $self->skip_whitespace($reader);
       
   194     
       
   195     while ($reader->match('|')) {
       
   196         $self->skip_whitespace($reader);
       
   197         $model .= '|';
       
   198         $model .= $self->Cp($reader);
       
   199         $self->skip_whitespace($reader);
       
   200     }
       
   201 
       
   202     return $model;
       
   203 }
       
   204 
       
   205 sub Seq {
       
   206     my ($self, $reader) = @_;
       
   207     
       
   208     my $model = '';
       
   209     $self->skip_whitespace($reader);
       
   210     
       
   211     while ($reader->match(',')) {
       
   212         $self->skip_whitespace($reader);
       
   213         my $cp = $self->Cp($reader);
       
   214         if ($cp) {
       
   215             $model .= ',';
       
   216             $model .= $cp;
       
   217         }
       
   218         $self->skip_whitespace($reader);
       
   219     }
       
   220 
       
   221     return $model;
       
   222 }
       
   223 
       
   224 sub AttlistDecl {
       
   225     my ($self, $reader) = @_;
       
   226     
       
   227     my $data = $reader->data(9);
       
   228     if ($data =~ /^<!ATTLIST/) {
       
   229         # It's an attlist
       
   230         
       
   231         $reader->move_along(9);
       
   232         
       
   233         $self->skip_whitespace($reader) || 
       
   234             $self->parser_error("No whitespace after ATTLIST declaration", $reader);
       
   235         my $name = $self->Name($reader);
       
   236 
       
   237         $self->AttDefList($reader, $name);
       
   238 
       
   239         $self->skip_whitespace($reader);
       
   240         
       
   241         $reader->match('>') or $self->parser_error("Closing angle bracket not found on ATTLIST declaration", $reader);
       
   242         
       
   243         return 1;
       
   244     }
       
   245     
       
   246     return 0;
       
   247 }
       
   248 
       
   249 sub AttDefList {
       
   250     my ($self, $reader, $name) = @_;
       
   251 
       
   252     1 while $self->AttDef($reader, $name);
       
   253 }
       
   254 
       
   255 sub AttDef {
       
   256     my ($self, $reader, $el_name) = @_;
       
   257 
       
   258     $self->skip_whitespace($reader) || return 0;
       
   259     my $att_name = $self->Name($reader) || return 0;
       
   260     $self->skip_whitespace($reader) || 
       
   261         $self->parser_error("No whitespace after Name in attribute definition", $reader);
       
   262     my $att_type = $self->AttType($reader);
       
   263 
       
   264     $self->skip_whitespace($reader) ||
       
   265         $self->parser_error("No whitespace after AttType in attribute definition", $reader);
       
   266     my ($mode, $value) = $self->DefaultDecl($reader);
       
   267     
       
   268     # fire SAX event here!
       
   269     $self->attribute_decl({
       
   270             eName => $el_name, 
       
   271             aName => $att_name, 
       
   272             Type => $att_type, 
       
   273             Mode => $mode, 
       
   274             Value => $value,
       
   275             });
       
   276     return 1;
       
   277 }
       
   278 
       
   279 sub AttType {
       
   280     my ($self, $reader) = @_;
       
   281 
       
   282     return $self->StringType($reader) ||
       
   283             $self->TokenizedType($reader) ||
       
   284             $self->EnumeratedType($reader) ||
       
   285             $self->parser_error("Can't match AttType", $reader);
       
   286 }
       
   287 
       
   288 sub StringType {
       
   289     my ($self, $reader) = @_;
       
   290     
       
   291     my $data = $reader->data(5);
       
   292     return unless $data =~ /^CDATA/;
       
   293     $reader->move_along(5);
       
   294     return 'CDATA';
       
   295 }
       
   296 
       
   297 sub TokenizedType {
       
   298     my ($self, $reader) = @_;
       
   299     
       
   300     my $data = $reader->data(8);
       
   301     if ($data =~ /^(IDREFS?|ID|ENTITIES|ENTITY|NMTOKENS?)/) {
       
   302         $reader->move_along(length($1));
       
   303         return $1;
       
   304     }
       
   305     return;
       
   306 }
       
   307 
       
   308 sub EnumeratedType {
       
   309     my ($self, $reader) = @_;
       
   310     return $self->NotationType($reader) || $self->Enumeration($reader);
       
   311 }
       
   312 
       
   313 sub NotationType {
       
   314     my ($self, $reader) = @_;
       
   315     
       
   316     my $data = $reader->data(8);
       
   317     return unless $data =~ /^NOTATION/;
       
   318     $reader->move_along(8);
       
   319     
       
   320     $self->skip_whitespace($reader) ||
       
   321         $self->parser_error("No whitespace after NOTATION", $reader);
       
   322     $reader->match('(') or $self->parser_error("No opening bracket in notation section", $reader);
       
   323     
       
   324     $self->skip_whitespace($reader);
       
   325     my $model = 'NOTATION (';
       
   326     my $name = $self->Name($reader) ||
       
   327         $self->parser_error("No name in notation section", $reader);
       
   328     $model .= $name;
       
   329     $self->skip_whitespace($reader);
       
   330     $data = $reader->data;
       
   331     while ($data =~ /^\|/) {
       
   332         $reader->move_along(1);
       
   333         $model .= '|';
       
   334         $self->skip_whitespace($reader);
       
   335         my $name = $self->Name($reader) ||
       
   336             $self->parser_error("No name in notation section", $reader);
       
   337         $model .= $name;
       
   338         $self->skip_whitespace($reader);
       
   339         $data = $reader->data;
       
   340     }
       
   341     $data =~ /^\)/ or $self->parser_error("No closing bracket in notation section", $reader);
       
   342     $reader->move_along(1);
       
   343     
       
   344     $model .= ')';
       
   345 
       
   346     return $model;
       
   347 }
       
   348 
       
   349 sub Enumeration {
       
   350     my ($self, $reader) = @_;
       
   351     
       
   352     return unless $reader->match('(');
       
   353     
       
   354     $self->skip_whitespace($reader);
       
   355     my $model = '(';
       
   356     my $nmtoken = $self->Nmtoken($reader) ||
       
   357         $self->parser_error("No Nmtoken in enumerated declaration", $reader);
       
   358     $model .= $nmtoken;
       
   359     $self->skip_whitespace($reader);
       
   360     my $data = $reader->data;
       
   361     while ($data =~ /^\|/) {
       
   362         $model .= '|';
       
   363         $reader->move_along(1);
       
   364         $self->skip_whitespace($reader);
       
   365         my $nmtoken = $self->Nmtoken($reader) ||
       
   366             $self->parser_error("No Nmtoken in enumerated declaration", $reader);
       
   367         $model .= $nmtoken;
       
   368         $self->skip_whitespace($reader);
       
   369         $data = $reader->data;
       
   370     }
       
   371     $data =~ /^\)/ or $self->parser_error("No closing bracket in enumerated declaration", $reader);
       
   372     $reader->move_along(1);
       
   373     
       
   374     $model .= ')';
       
   375 
       
   376     return $model;
       
   377 }
       
   378 
       
   379 sub Nmtoken {
       
   380     my ($self, $reader) = @_;
       
   381     return $self->Name($reader);
       
   382 }
       
   383 
       
   384 sub DefaultDecl {
       
   385     my ($self, $reader) = @_;
       
   386     
       
   387     my $data = $reader->data(9);
       
   388     if ($data =~ /^(\#REQUIRED|\#IMPLIED)/) {
       
   389         $reader->move_along(length($1));
       
   390         return $1;
       
   391     }
       
   392     my $model = '';
       
   393     if ($data =~ /^\#FIXED/) {
       
   394         $reader->move_along(6);
       
   395         $self->skip_whitespace($reader) || $self->parser_error(
       
   396                 "no whitespace after FIXED specifier", $reader);
       
   397         my $value = $self->AttValue($reader);
       
   398         return "#FIXED", $value;
       
   399     }
       
   400     my $value = $self->AttValue($reader);
       
   401     return undef, $value;
       
   402 }
       
   403 
       
   404 sub EntityDecl {
       
   405     my ($self, $reader) = @_;
       
   406     
       
   407     my $data = $reader->data(8);
       
   408     return 0 unless $data =~ /^<!ENTITY/;
       
   409     $reader->move_along(8);
       
   410     
       
   411     $self->skip_whitespace($reader) || $self->parser_error(
       
   412         "No whitespace after ENTITY declaration", $reader);
       
   413     
       
   414     $self->PEDecl($reader) || $self->GEDecl($reader);
       
   415     
       
   416     $self->skip_whitespace($reader);
       
   417     
       
   418     $reader->match('>') or $self->parser_error("No closing '>' in entity definition", $reader);
       
   419     
       
   420     return 1;
       
   421 }
       
   422 
       
   423 sub GEDecl {
       
   424     my ($self, $reader) = @_;
       
   425 
       
   426     my $name = $self->Name($reader) || $self->parser_error("No entity name given", $reader);
       
   427     $self->skip_whitespace($reader) || $self->parser_error("No whitespace after entity name", $reader);
       
   428 
       
   429     # TODO: ExternalID calls lexhandler method. Wrong place for it.
       
   430     my $value;
       
   431     if ($value = $self->ExternalID($reader)) {
       
   432         $value .= $self->NDataDecl($reader);
       
   433     }
       
   434     else {
       
   435         $value = $self->EntityValue($reader);
       
   436     }
       
   437 
       
   438     if ($self->{ParseOptions}{entities}{$name}) {
       
   439         warn("entity $name already exists\n");
       
   440     } else {
       
   441         $self->{ParseOptions}{entities}{$name} = 1;
       
   442         $self->{ParseOptions}{expanded_entity}{$name} = $value; # ???
       
   443     }
       
   444     # do callback?
       
   445     return 1;
       
   446 }
       
   447 
       
   448 sub PEDecl {
       
   449     my ($self, $reader) = @_;
       
   450     
       
   451     return 0 unless $reader->match('%');
       
   452 
       
   453     $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity marker", $reader);
       
   454     my $name = $self->Name($reader) || $self->parser_error("No parameter entity name given", $reader);
       
   455     $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity name", $reader);
       
   456     my $value = $self->ExternalID($reader) ||
       
   457                 $self->EntityValue($reader) ||
       
   458                 $self->parser_error("PE is not a value or an external resource", $reader);
       
   459     # do callback?
       
   460     return 1;
       
   461 }
       
   462 
       
   463 my $quotre = qr/[^%&\"]/;
       
   464 my $aposre = qr/[^%&\']/;
       
   465 
       
   466 sub EntityValue {
       
   467     my ($self, $reader) = @_;
       
   468     
       
   469     my $data = $reader->data;
       
   470     my $quote = '"';
       
   471     my $re = $quotre;
       
   472     if (!$data =~ /^"/) {
       
   473         $data =~ /^'/ or $self->parser_error("Not a quote character", $reader);
       
   474         $quote = "'";
       
   475         $re = $aposre;
       
   476     }
       
   477     $reader->move_along(1);
       
   478     
       
   479     my $value = '';
       
   480     
       
   481     while (1) {
       
   482         my $data = $reader->data;
       
   483 
       
   484         $self->parser_error("EOF found while reading entity value", $reader)
       
   485             unless length($data);
       
   486         
       
   487         if ($data =~ /^($re+)/) {
       
   488             my $match = $1;
       
   489             $value .= $match;
       
   490             $reader->move_along(length($match));
       
   491         }
       
   492         elsif ($reader->match('&')) {
       
   493             # if it's a char ref, expand now:
       
   494             if ($reader->match('#')) {
       
   495                 my $char;
       
   496                 my $ref = '';
       
   497                 if ($reader->match('x')) {
       
   498                     my $data = $reader->data;
       
   499                     while (1) {
       
   500                         $self->parser_error("EOF looking for reference end", $reader)
       
   501                             unless length($data);
       
   502                         if ($data !~ /^([0-9a-fA-F]*)/) {
       
   503                             last;
       
   504                         }
       
   505                         $ref .= $1;
       
   506                         $reader->move_along(length($1));
       
   507                         if (length($1) == length($data)) {
       
   508                             $data = $reader->data;
       
   509                         }
       
   510                         else {
       
   511                             last;
       
   512                         }
       
   513                     }
       
   514                     $char = chr_ref(hex($ref));
       
   515                     $ref = "x$ref";
       
   516                 }
       
   517                 else {
       
   518                     my $data = $reader->data;
       
   519                     while (1) {
       
   520                         $self->parser_error("EOF looking for reference end", $reader)
       
   521                             unless length($data);
       
   522                         if ($data !~ /^([0-9]*)/) {
       
   523                             last;
       
   524                         }
       
   525                         $ref .= $1;
       
   526                         $reader->move_along(length($1));
       
   527                         if (length($1) == length($data)) {
       
   528                             $data = $reader->data;
       
   529                         }
       
   530                         else {
       
   531                             last;
       
   532                         }
       
   533                     }
       
   534                     $char = chr($ref);
       
   535                 }
       
   536                 $reader->match(';') ||
       
   537                     $self->parser_error("No semi-colon found after character reference", $reader);
       
   538                 if ($char !~ $SingleChar) { # match a single character
       
   539                     $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader);
       
   540                 }
       
   541                 $value .= $char;
       
   542             }
       
   543             else {
       
   544                 # entity refs in entities get expanded later, so don't parse now.
       
   545                 $value .= '&';
       
   546             }
       
   547         }
       
   548         elsif ($reader->match('%')) {
       
   549             $value .= $self->PEReference($reader);
       
   550         }
       
   551         elsif ($reader->match($quote)) {
       
   552             # end of attrib
       
   553             last;
       
   554         }
       
   555         else {
       
   556             $self->parser_error("Invalid character in attribute value: " . substr($reader->data, 0, 1), $reader);
       
   557         }
       
   558     }
       
   559     
       
   560     return $value;
       
   561 }
       
   562 
       
   563 sub NDataDecl {
       
   564     my ($self, $reader) = @_;
       
   565     $self->skip_whitespace($reader) || return '';
       
   566     my $data = $reader->data(5);
       
   567     return '' unless $data =~ /^NDATA/;
       
   568     $reader->move_along(5);
       
   569     $self->skip_whitespace($reader) || $self->parser_error("No whitespace after NDATA declaration", $reader);
       
   570     my $name = $self->Name($reader) || $self->parser_error("NDATA declaration lacks a proper Name", $reader);
       
   571     return " NDATA $name";
       
   572 }
       
   573 
       
   574 sub NotationDecl {
       
   575     my ($self, $reader) = @_;
       
   576     
       
   577     my $data = $reader->data(10);
       
   578     return 0 unless $data =~ /^<!NOTATION/;
       
   579     $reader->move_along(10);
       
   580     $self->skip_whitespace($reader) ||
       
   581         $self->parser_error("No whitespace after NOTATION declaration", $reader);
       
   582     $data = $reader->data;
       
   583     my $value = '';
       
   584     while(1) {
       
   585         $self->parser_error("EOF found while looking for end of NotationDecl", $reader)
       
   586             unless length($data);
       
   587         
       
   588         if ($data =~ /^([^>]*)>/) {
       
   589             $value .= $1;
       
   590             $reader->move_along(length($1) + 1);
       
   591             $self->notation_decl({Name => "FIXME", SystemId => "FIXME", PublicId => "FIXME" });
       
   592             last;
       
   593         }
       
   594         else {
       
   595             $value .= $data;
       
   596             $reader->move_along(length($data));
       
   597             $data = $reader->data;
       
   598         }
       
   599     }
       
   600     return 1;
       
   601 }
       
   602 
       
   603 1;