diff -r 000000000000 -r 02cd6b52f378 dummy_foundation/lib/XML/Checker.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dummy_foundation/lib/XML/Checker.pm Thu May 28 10:10:03 2009 +0100 @@ -0,0 +1,2006 @@ +# +# +# TO DO +# - update docs regarding PerlSAX interface +# - add current node to error context when checking DOM subtrees +# - add parsed Entity to test XML files +# - free circular references +# - Implied handler? +# - Notation, Entity, Unparsed checks, Default handler? +# - check no root element (it's checked by expat) ? + +package XML::Checker::Term; +use strict; + +sub new +{ + my ($class, %h) = @_; + bless \%h, $class; +} + +sub str +{ + '<' . $_[0]->{C} . $_[0]->{N} . '>' +} + +sub re +{ + $_[0]->{S} +} + +sub rel +{ + my $self = shift; + defined $self->{SL} ? @{ $self->{SL} } : ( $self->{S} ); +} + +sub debug +{ + my $t = shift; + my ($c, $n, $s) = ($t->{C}, $t->{N}, $t->{S}); + my @sl = $t->rel; + "{C=$c N=$n S=$s SL=@sl}"; +} + +#------------------------------------------------------------------------- + +package XML::Checker::Context; + +sub new +{ + my ($class) = @_; + my $scalar; + bless \$scalar, $class; +} + +sub Start {} +sub End {} +sub Char {} + +# +# The initial Context when checking an entire XML Document +# +package XML::Checker::DocContext; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::Context ); + +sub new +{ +#??checker not used + my ($class, $checker) = @_; + bless { }, $class; +} + +sub setRootElement +{ + $_[0]->{RootElement} = $_[1]; +} + +sub Start +{ + my ($self, $checker, $tag) = @_; + if (exists $self->{Elem}) + { + my $tags = join (", ", @{$self->{Elem}}); + $checker->fail (155, "more than one root Element [$tags]"); + push @{$self->{Elem}}, $tag; + } + else + { + $self->{Elem} = [ $tag ]; + } + + my $exp_root = $self->{RootElement}; + $checker->fail (156, "unexpected root Element [$tag], expected [$exp_root]") + if defined ($exp_root) and $tag ne $exp_root; +} + +sub debug +{ + my $self = shift; + "DocContext[Count=" . $self->{Count} . ",Root=" . + $self->{RootElement} . "]"; +} + +package XML::Checker::Context::ANY; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::Context ); + +# No overrides, because everything is accepted + +sub debug { "XML::Checker::Context::ANY" } + +package XML::Checker::Context::EMPTY; +use vars qw( @ISA $ALLOW_WHITE_SPACE ); +@ISA = qw( XML::Checker::Context ); + +$ALLOW_WHITE_SPACE = 0; + +sub debug { "XML::Checker::Context::EMPTY" } + +sub Start +{ + my ($self, $checker, $tag) = @_; + $checker->fail (152, "Element should be EMPTY, found Element [$tag]"); +} + +sub Char +{ + my ($self, $checker, $str) = @_; + $checker->fail (153, "Element should be EMPTY, found text [$str]") + unless ($ALLOW_WHITE_SPACE and $checker->isWS ($str)); + + # NOTE: if $ALLOW_WHITE_SPACE = 1, the isWS call does not only check + # whether it is whitespace, but it also informs the checker that this + # might be insignificant whitespace +} + +#?? what about Comments + +package XML::Checker::Context::Children; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::Context ); + +sub new +{ + my ($class, $rule) = @_; + bless { Name => $rule->{Name}, RE => $rule->{RE}, Buf => "", N => 0 }, $class; +} + +sub phash +{ + my $href = shift; + my $str = ""; + for (keys %$href) + { + $str .= ' ' if $str; + $str .= $_ . '=' . $href->{$_}; + } + $str; +} + +sub debug +{ + my $self = shift; + "Context::Children[Name=(" . phash ($self->{Name}) . ",N=" . $self->{N} . + ",RE=" . $self->{RE} . ",Buf=[" . $self->{Buf} . "]"; +} + +sub Start +{ + my ($self, $checker, $tag) = @_; + +#print "Children.Start tag=$tag rule=$checker drule=" . $checker->debug . "\n"; + + if (exists $self->{Name}->{$tag}) + { +#print "Buf=[".$self->{Buf}. "] tag=[" . $self->{Name}->{$tag}->{S} . "]\n"; + $self->{Buf} .= $self->{Name}->{$tag}->{S}; + } + else + { + $checker->fail (157, "unexpected Element [$tag]", + ChildElementIndex => $self->{N}) + } + $self->{N}++; +} + +sub decode +{ + my ($self) = @_; + my $re = $self->{RE}; + my $name = $self->{Name}; + my $buf = $self->{Buf}; + + my %s = (); + while (my ($key, $val) = each %$name) + { + $s{$val->{S}} = $key; + } + + my ($len) = scalar (keys %$name); + $len = length $len; + my $dots = "[^()*+?]" x $len; + + $buf =~ s/($dots)/$s{$1} . ","/ge; + chop $buf; + + $re =~ s/($dots)/"(" . $s{$1} . ")"/ge; + + "Found=[$buf] RE=[$re]" +} + +sub End +{ + my ($self, $checker) = @_; + my $re = $self->{RE}; + +#print "End " . $self->debug . "\n"; + $checker->fail (154, "bad order of Elements " . $self->decode) + unless $self->{Buf} =~ /^$re$/; +} + +sub Char +{ + my ($self, $checker, $str) = @_; + + # Inform the checker that this might be insignificant whitespace + $checker->isWS ($str); +} + +package XML::Checker::Context::Mixed; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::Context ); + +sub new +{ + my ($class, $rule) = @_; + bless { Name => $rule->{Name}, N => 0 }, $class; +} + +sub debug +{ + my $self = shift; + "Context::Mixed[Name=" . $self->{Name} . ",N=" , $self->{N} . "]"; +} + +sub Start +{ + my ($self, $checker, $tag) = @_; + + $checker->fail (157, "unexpected Element [$tag]", + ChildElementIndex => $self->{N}) + unless exists $self->{Name}->{$tag}; + $self->{N}++; +} + +package XML::Checker::ERule; + +package XML::Checker::ERule::EMPTY; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::ERule ); + +sub new +{ + my ($class) = @_; + bless {}, $class; +} + +my $context = new XML::Checker::Context::EMPTY; +sub context { $context } # share the context + +sub debug { "EMPTY" } + +package XML::Checker::ERule::ANY; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::ERule ); + +sub new +{ + my ($class) = @_; + bless {}, $class; +} + +my $any_context = new XML::Checker::Context::ANY; +sub context { $any_context } # share the context + +sub debug { "ANY" } + +package XML::Checker::ERule::Mixed; +use vars qw( @ISA ); +@ISA = qw( XML::Checker::ERule ); + +sub new +{ + my ($class) = @_; + bless { Name => {} }, $class; +} + +sub context +{ + my ($self) = @_; + new XML::Checker::Context::Mixed ($self); +} + +sub setModel +{ + my ($self, $model) = @_; + my $rule = $model; + + # Mixed := '(' '#PCDATA' ')' '*'? + if ($rule =~ /^\(\s*#PCDATA\s*\)(\*)?$/) + { +#? how do we interpret the '*' ?? + return 1; + } + else # Mixed := '(' '#PCDATA' ('|' Name)* ')*' + { + return 0 unless $rule =~ s/^\(\s*#PCDATA\s*//; + return 0 unless $rule =~ s/\s*\)\*$//; + + my %names = (); + while ($rule =~ s/^\s*\|\s*($XML::RegExp::Name)//) + { + $names{$1} = 1; + } + if ($rule eq "") + { + $self->{Name} = \%names; + return 1; + } + } + return 0; +} + +sub debug +{ + my ($self) = @_; + "Mixed[Names=" . join("|", keys %{$self->{Name}}) . "]"; +} + +package XML::Checker::ERule::Children; +use vars qw( @ISA %_name %_map $_n ); +@ISA = qw( XML::Checker::ERule ); + +sub new +{ + my ($class) = @_; + bless {}, $class; +} + +sub context +{ + my ($self) = @_; + new XML::Checker::Context::Children ($self); +} + +sub _add # static +{ + my $exp = new XML::Checker::Term (@_); + $_map{$exp->{N}} = $exp; + $exp->str; +} + +my $IDS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +sub _tokenize +{ + my ($self, $rule) = @_; + + # Replace names with Terms of the form "", e.g. "". + # Lookup already used names and store new names in %_name. + # + $$rule =~ s/($XML::RegExp::Name)(?!>)/ + if (exists $_name{$1}) # name already used? + { + $_name{$1}->str; + } + else + { + my $exp = new XML::Checker::Term (C => 'n', N => $_n++, + Name => $1); + $_name{$1} = $_map{$exp->{N}} = $exp; + $exp->str; + } + /eg; + + if ($_n < length $IDS) + { + # Generate regular expression for the name Term, i.e. + # a single character from $IDS + my $i = 0; + for (values %_name) + { + $_->{S} = substr ($IDS, $i++, 1); +#print "tokenized " . $_->{Name} . " num=" . $_->{N} . " to " . $_->{S} . "\n"; + } + } + else + { + # Generate RE, convert Term->{N} to hex string a la "(#)", + # e.g. "(03d)". Calculate needed length of hex string first. + my $len = 1; + for (my $n = $_n - 1; ($n >> 4) > 0; $len++) {} + + my $i = 0; + for (values %_name) + { + $_->{S} = sprintf ("(0${len}lx)", $i++); +#print "tokenized " . $_->{Name} . " num=" . $_->{N} . " to " . $_->{S} . "\n"; + } + } +} + +sub setModel +{ + my ($self, $rule) = @_; + + local $_n = 0; + local %_map = (); + local %_name = (); + + $self->_tokenize (\$rule); + +#?? check for single name - die "!ELEMENT contents can't be just a NAME" if $rule =~ /^$XML::RegExp::Name$/; + + for ($rule) + { + my $n = 1; + while ($n) + { + $n = 0; + + # cp := ( name | choice | seq ) ('?' | '*' | '+')? + $n++ while s/<[ncs](\d+)>([?*+]?)/_add + (C => 'a', N => $_n++, + S => ($_map{$1}->re . $2))/eg; + + # choice := '(' ch_l ')' + $n++ while s/\(\s*<[ad](\d+)>\s*\)/_add + (C => 'c', N => $_n++, + S => "(" . join ("|", $_map{$1}->rel) . ")")/eg; + + # ch_l := ( cp | ch_l ) '|' ( cp | ch_l ) + $n++ while s/<[ad](\d+)>\s*\|\s*<[ad](\d+)>/_add + (C => 'd', N => $_n++, + SL => [ $_map{$1}->rel, $_map{$2}->rel ])/eg; + + # seq := '(' (seq_l ')' + $n++ while s/\(\s*<[at](\d+)>\s*\)/_add + (C => 's', N => $_n++, + S => "(".join("", $_map{$1}->rel).")")/eg; + + # seq_l := ( cp | seq_l ) ',' ( cp | seq_l ) + $n++ while s/<[at](\d+)>\s*,\s*<[at](\d+)>/_add + (C => 't', N => $_n++, + SL => [ $_map{$1}->rel, $_map{$2}->rel ])/eg; + } + } + + return 0 if ($rule !~ /^$/); + + $self->{Name} = \%_name; + $self->{RE} = $_map{$1}->re; + + return 1; +} + +sub debug +{ + my ($self) = @_; + "Children[RE=" . $self->{RE} . "]"; +} + + +package XML::Checker::ARule; +use XML::RegExp; + +sub new +{ + my ($class, $elem, $checker) = @_; + bless { Elem => $elem, Checker => $checker, Required => {} }, $class; +} + +sub Attlist +{ + my ($self, $attr, $type, $default, $fixed, $checker) = @_; + my ($c1, $c2); + + if ($self->{Defined}->{$attr}) + { + my $tag = $self->{Elem}; + $self->fail ($attr, 110, "attribute [$attr] of element [$tag] already defined"); + } + else + { + $self->{Defined}->{$attr} = 1; + } + + if ($default =~ /^\#(REQUIRED|IMPLIED)$/) + { + $c1 = $1; + + # Keep list of all required attributes + if ($default eq '#REQUIRED') + { + $self->{Required}->{$attr} = 1; + } + } + else + { + $self->fail ($attr, 122, "invalid default attribute value [$default]") + unless $default =~ /^$XML::RegExp::AttValue$/; + + $default = substr ($default, 1, length($default)-2); + $self->{Default}->{$attr} = $default; + $c1 = 'FIXED' if $fixed; + } + + if ($type eq 'ID') + { + $self->fail ($attr, 123, "invalid default ID [$default], must be #REQUIRED or #IMPLIED") + unless $default =~ /^#(REQUIRED|IMPLIED)$/; + + if (exists ($self->{ID}) && $self->{ID} ne $attr) + { + $self->fail ($attr, 151, "only one ID allowed per ELEMENT " . + "first=[" . $self->{ID} . "]"); + } + else + { + $self->{ID} = $attr; + } + $c2 = 'ID'; + } + elsif ($type =~ /^(IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS)$/) + { + my $def = $self->{Default}->{$attr}; + if (defined $def) + { + my $re = ($type =~ /^[IE]/) ? $XML::RegExp::Name : $XML::RegExp::NmToken; + if ($type =~ /S$/) + { + for (split (/\s+/, $def)) + { + $self->fail ($attr, 121, + "invalid default [$_] in $type [$def]") + unless $_ =~ /^$re$/; + } + } + else # singular + { + $self->fail ($attr, 120, "invalid default $type [$def]") + unless $def =~ /^$re$/; + } + } + $c2 = $type; + } + elsif ($type ne 'CDATA') # Enumerated := NotationType | Enumeration + { + if ($type =~ /^\s*NOTATION\s*\(\s*($XML::RegExp::Name(\s*\|\s*$XML::RegExp::Name)*)\s*\)\s*$/) + { + $self->fail ($attr, 135, "empty NOTATION list in ATTLIST") + unless defined $1; + + my @tok = split (/\s*\|\s*/, $1); + for (@tok) + { + $self->fail ($attr, 100, "undefined NOTATION [$_] in ATTLIST") + unless exists $checker->{NOTATION}->{$_}; + } + + my $re = join ("|", @tok); + $self->{NotationRE} = "^($re)\$"; + $c2 = 'NotationType'; + } + elsif ($type =~ /^\s*\(\s*($XML::RegExp::NmToken(\s*\|\s*$XML::RegExp::NmToken)*)\s*\)\s*$/) + { + # Enumeration + + $self->fail ($attr, 136, "empty Enumeration list in ATTLIST") + unless defined $1; + + my @tok = split (/\s*\|\s*/, $1); + for (@tok) + { + $self->fail ($attr, 134, + "invalid Enumeration value [$_] in ATTLIST") + unless $_ =~ /^$XML::RegExp::NmToken$/; + } + $self->{EnumRE}->{$attr} = '^(' . join ("|", @tok) . ')$'; #'; + $c2 = 'Enumeration'; + } + else + { + $self->fail ($attr, 137, "invalid ATTLIST type [$type]"); + } + } + + $self->{Check1}->{$attr} = $c1 if $c1; + $self->{Check2}->{$attr} = $c2 if $c2; +} + +sub fail +{ + my $self = shift; + my $attr = shift; + $self->{Checker}->fail (@_, Element => $self->{Elem}, Attr => $attr); +} + +sub check +{ + my ($self, $attr) = @_; + my $func1 = $self->{Check1}->{$attr}; + my $func2 = $self->{Check2}->{$attr}; +# print "check func1=$func1 func2=$func2 @_\n"; + + if (exists $self->{ReqNotSeen}->{$attr}) + { + delete $self->{ReqNotSeen}->{$attr}; + } + no strict; + + &$func1 (@_) if defined $func1; + &$func2 (@_) if defined $func2; +} + +# Copies the list of all required attributes from $self->{Required} to +# $self->{ReqNotSeen}. +# When check() encounters a required attribute, it is removed from ReqNotSeen. +# In EndAttr we look at which attribute names are still in ReqNotSeen - those +# are the ones that were not specified and are, therefore, in error. +sub StartAttr +{ + my $self = shift; + my %not_seen = %{ $self->{Required} }; + $self->{ReqNotSeen} = \%not_seen; +} + +# Checks which of the #REQUIRED attributes were not specified +sub EndAttr +{ + my $self = shift; + + for my $attr (keys %{ $self->{ReqNotSeen} }) + { + $self->fail ($attr, 159, + "unspecified value for \#REQUIRED attribute [$attr]"); + } +} + +sub FIXED +{ + my ($self, $attr, $val, $specified) = @_; + + my $default = $self->{Default}->{$attr}; + $self->fail ($attr, 150, + "bad \#FIXED attribute value [$val], it should be [$default]") + unless ($val eq $default); +} + +sub IMPLIED +{ + my ($self, $attr, $val, $specified) = @_; + +#?? should #IMPLIED be specified? + $self->fail ($attr, 158, + "unspecified value for \#IMPLIED attribute [$attr]") + unless $specified; + +#?? Implied handler ? +} + +# This is called when an attribute is passed to the check() method by +# XML::Checker::Attr(), i.e. when the attribute was specified explicitly +# or defaulted by the parser (which should never happen), *NOT* when the +# attribute was omitted. (The latter is checked by StartAttr/EndAttr) +sub REQUIRED +{ + my ($self, $attr, $val, $specified) = @_; +# print "REQUIRED attr=$attr val=$val spec=$specified\n"; + + $self->fail ($attr, 159, + "unspecified value for \#REQUIRED attribute [$attr]") + unless $specified; +} + +sub ID # must be #IMPLIED or #REQUIRED +{ + my ($self, $attr, $val, $specified) = @_; + + $self->fail ($attr, 131, "invalid ID [$val]") + unless $val =~ /^$XML::RegExp::Name$/; + + $self->fail ($attr, 111, "ID [$val] already defined") + if $self->{Checker}->{ID}->{$val}++; +} + +sub IDREF +{ + my ($self, $attr, $val, $specified) = @_; + + $self->fail ($attr, 132, "invalid IDREF [$val]") + unless $val =~ /^$XML::RegExp::Name$/; + + $self->{Checker}->{IDREF}->{$val}++; +} + +sub IDREFS +{ + my ($self, $attr, $val, $specified) = @_; + for (split /\s+/, $val) + { + $self->IDREF ($attr, $_); + } +} + +sub ENTITY +{ + my ($self, $attr, $val, $specified) = @_; +#?? should it be specified? + + $self->fail ($attr, 133, "invalid ENTITY name [$val]") + unless $val =~ /^$XML::RegExp::Name$/; + + $self->fail ($attr, 102, "undefined unparsed ENTITY [$val]") + unless exists $self->{Checker}->{Unparsed}->{$val}; +} + +sub ENTITIES +{ + my ($self, $attr, $val, $specified) = @_; + for (split /\s+/, $val) + { + $self->ENTITY ($attr, $_); + } +} + +sub NMTOKEN +{ + my ($self, $attr, $val, $specified) = @_; + $self->fail ($attr, 130, "invalid NMTOKEN [$val]") + unless $val =~ /^$XML::RegExp::NmToken$/; +} + +sub NMTOKENS +{ + my ($self, $attr, $val, $specified) = @_; + for (split /\s+/, $val) + { + $self->NMTOKEN ($attr, $_, $specified); + } +} + +sub Enumeration +{ + my ($self, $attr, $val, $specified) = @_; + my $re = $self->{EnumRE}->{$attr}; + + $self->fail ($attr, 160, "invalid Enumeration value [$val]") + unless $val =~ /$re/; +} + +sub NotationType +{ + my ($self, $attr, $val, $specified) = @_; + my $re = $self->{NotationRE}; + + $self->fail ($attr, 161, "invalid NOTATION value [$val]") + unless $val =~ /$re/; + + $self->fail ($attr, 162, "undefined NOTATION [$val]") + unless exists $self->{Checker}->{NOTATION}->{$val}; +} + +package XML::Checker; +use vars qw ( $VERSION $FAIL $INSIGNIF_WS ); + +BEGIN +{ + $VERSION = '0.09'; +} + +$FAIL = \&print_error; + +# Whether the last seen Char data was insignicant whitespace +$INSIGNIF_WS = 0; + +sub new +{ + my ($class, %args) = @_; + + $args{ERule} = {}; + $args{ARule} = {}; + $args{InCDATA} = 0; + +# $args{Debug} = 1; + bless \%args, $class; +} + +# PerlSAX API +sub element_decl +{ + my ($self, $hash) = @_; + $self->Element ($hash->{Name}, $hash->{Model}); +} + +# Same parameter order as the Element handler in XML::Parser module +sub Element +{ + my ($self, $name, $model) = @_; + + if (defined $self->{ERule}->{$name}) + { + $self->fail (115, "ELEMENT [$name] already defined", + Element => $name); + } + + if ($model eq "EMPTY") + { + $self->{ERule}->{$name} = new XML::Checker::ERule::EMPTY; + } + elsif ($model eq "ANY") + { + $self->{ERule}->{$name} = new XML::Checker::ERule::ANY; + } + elsif ($model =~ /#PCDATA/) + { + my $rule = new XML::Checker::ERule::Mixed; + if ($rule->setModel ($model)) + { + $self->{ERule}->{$name} = $rule; + } + else + { + $self->fail (124, "bad model [$model] for ELEMENT [$name]", + Element => $name); + } + } + else + { + my $rule = new XML::Checker::ERule::Children; + if ($rule->setModel ($model)) + { + $self->{ERule}->{$name} = $rule; + } + else + { + $self->fail (124, "bad model [$model] for ELEMENT [$name]", + Element => $name); + } + } + my $rule = $self->{ERule}->{$name}; + print "added ELEMENT model for $name: " . $rule->debug . "\n" + if $rule and $self->{Debug}; +} + +# PerlSAX API +sub attlist_decl +{ + my ($self, $hash) = @_; + $self->Attlist ($hash->{ElementName}, $hash->{AttributeName}, + $hash->{Type}, $hash->{Default}, $hash->{Fixed}); +} + +sub Attlist +{ + my ($self, $tag, $attrName, $type, $default, $fixed) = @_; + my $arule = $self->{ARule}->{$tag} ||= + new XML::Checker::ARule ($tag, $self); + + $arule->Attlist ($attrName, $type, $default, $fixed, $self); +} + +# Initializes the context stack to check an XML::DOM::Element +sub InitDomElem +{ + my $self = shift; + + # initialize Context stack + $self->{Context} = [ new XML::Checker::Context::ANY ($self) ]; + $self->{InCDATA} = 0; +} + +# Clears the context stack after checking an XML::DOM::Element +sub FinalDomElem +{ + my $self = shift; + delete $self->{Context}; +} + +# PerlSAX API +sub start_document +{ + shift->Init; +} + +sub Init +{ + my $self = shift; + + # initialize Context stack + $self->{Context} = [ new XML::Checker::DocContext ($self) ]; + $self->{InCDATA} = 0; +} + +# PerlSAX API +sub end_document +{ + shift->Final; +} + +sub Final +{ + my $self = shift; +#?? could add more statistics: unreferenced Unparsed, ID + + for (keys %{ $self->{IDREF} }) + { + my $n = $self->{IDREF}->{$_}; + $self->fail (200, "undefined ID [$_] was referenced [$n] times") + unless defined $self->{ID}->{$_}; + } + + for (keys %{ $self->{ID} }) + { + my $n = $self->{IDREF}->{$_} || 0; + $self->fail (300, "[$n] references to ID [$_]"); + } + + delete $self->{Context}; +} + +sub getRootElement +{ + my $self = shift; +# print "getRoot $self " . $self->{RootElement} . "\n"; + $_[0]->{RootElement}; +} + +# PerlSAX API +sub doctype_decl +{ + my ($self, $hash) = @_; + $self->Doctype ($hash->{Name}, $hash->{SystemId}, + $hash->{PublicId}, $hash->{Internal}); +} + +sub Doctype +{ + my ($self, $name, $sysid, $pubid, $internal) = @_; + $self->{RootElement} = $name; + + my $context = $self->{Context}->[0]; + $context->setRootElement ($name); + +#?? what else +} + +sub Attr +{ + my ($self, $tag, $attr, $val, $specified) = @_; + +#print "Attr for tag=$tag attr=$attr val=$val spec=$specified\n"; + + my $arule = $self->{ARule}->{$tag}; + if (defined $arule && $arule->{Defined}->{$attr}) + { + $arule->check ($attr, $val, $specified); + } + else + { + $self->fail (103, "undefined attribute [$attr]", Element => $tag); + } +} + +sub EndAttr +{ + my $self = shift; + + my $arule = $self->{CurrARule}; + if (defined $arule) + { + $arule->EndAttr; + } +} + +# PerlSAX API +sub start_element +{ + my ($self, $hash) = @_; + my $tag = $hash->{Name}; + my $attr = $hash->{Attributes}; + + $self->Start ($tag); + + if (exists $hash->{AttributeOrder}) + { + my $defaulted = $hash->{Defaulted}; + my @order = @{ $hash->{AttributeOrder} }; + + # Specified attributes + for (my $i = 0; $i < $defaulted; $i++) + { + my $a = $order[$i]; + $self->Attr ($tag, $a, $attr->{$a}, 1); + } + + # Defaulted attributes + for (my $i = $defaulted; $i < @order; $i++) + { + my $attr = $order[$i]; + $self->Attr ($tag, $a, $attr->{$a}, 0); + } + } + else + { + # Assume all attributes were specified + my @attr = %$attr; + my ($key, $val); + while ($key = shift @attr) + { + $val = shift @attr; + + $self->Attr ($tag, $key, $val, 1); + } + } + $self->EndAttr; +} + +sub Start +{ + my ($self, $tag) = @_; +#?? if first tag, check with root element - or does expat check this already? + + my $context = $self->{Context}; + $context->[0]->Start ($self, $tag); + + my $erule = $self->{ERule}->{$tag}; + if (defined $erule) + { + unshift @$context, $erule->context; + } + else + { + # It's not a real error according to the XML Spec. + $self->fail (101, "undefined ELEMENT [$tag]"); + unshift @$context, new XML::Checker::Context::ANY; + } + +#?? what about ARule ?? + my $arule = $self->{ARule}->{$tag}; + if (defined $arule) + { + $self->{CurrARule} = $arule; + $arule->StartAttr; + } +} + +# PerlSAX API +sub end_element +{ + shift->End; +} + +sub End +{ + my ($self) = @_; + my $context = $self->{Context}; + + $context->[0]->End ($self); + shift @$context; +} + +# PerlSAX API +sub characters +{ + my ($self, $hash) = @_; + my $data = $hash->{Data}; + + if ($self->{InCDATA}) + { + $self->CData ($data); + } + else + { + $self->Char ($data); + } +} + +# PerlSAX API +sub start_cdata +{ + $_[0]->{InCDATA} = 1; +} + +# PerlSAX API +sub end_cdata +{ + $_[0]->{InCDATA} = 0; +} + +sub Char +{ + my ($self, $text) = @_; + my $context = $self->{Context}; + + # NOTE: calls to isWS may set this to 1. + $INSIGNIF_WS = 0; + + $context->[0]->Char ($self, $text); +} + +# Treat CDATASection same as Char (Text) +sub CData +{ + my ($self, $cdata) = @_; + my $context = $self->{Context}; + + $context->[0]->Char ($self, $cdata); + + # CDATASection can never be insignificant whitespace + $INSIGNIF_WS = 0; +#?? I'm not sure if this assumption is correct +} + +# PerlSAX API +sub comment +{ + my ($self, $hash) = @_; + $self->Comment ($hash->{Data}); +} + +sub Comment +{ +# ?? what can be checked here? +} + +# PerlSAX API +sub entity_reference +{ + my ($self, $hash) = @_; + $self->EntityRef ($hash->{Name}, 0); +#?? parameter entities (like %par;) are NOT supported! +# PerlSAX::handle_default should be fixed! +} + +sub EntityRef +{ + my ($self, $ref, $isParam) = @_; + + if ($isParam) + { + # expand to "%name;" + print STDERR "XML::Checker::Entity - parameter Entity (%ent;) not implemented\n"; + } + else + { + # Treat same as Char - for now + my $context = $self->{Context}; + $context->[0]->Char ($self, "&$ref;"); + $INSIGNIF_WS = 0; +#?? I could count the number of times each Entity is referenced + } +} + +# PerlSAX API +sub unparsed_entity_decl +{ + my ($self, $hash) = @_; + $self->Unparsed ($hash->{Name}); +#?? what about Base, SytemId, PublicId ? +} + +sub Unparsed +{ + my ($self, $entity) = @_; +# print "ARule::Unparsed $entity\n"; + if ($self->{Unparsed}->{$entity}) + { + $self->fail (112, "unparsed ENTITY [$entity] already defined"); + } + else + { + $self->{Unparsed}->{$entity} = 1; + } +} + +# PerlSAX API +sub notation_decl +{ + my ($self, $hash) = @_; + $self->Notation ($hash->{Name}); +#?? what about Base, SytemId, PublicId ? +} + +sub Notation +{ + my ($self, $notation) = @_; + if ($self->{NOTATION}->{$notation}) + { + $self->fail (113, "NOTATION [$notation] already defined"); + } + else + { + $self->{NOTATION}->{$notation} = 1; + } +} + +# PerlSAX API +sub entity_decl +{ + my ($self, $hash) = @_; + + $self->Entity ($hash->{Name}, $hash->{Value}, $hash->{SystemId}, + $hash->{PublicId}, $hash->{'Notation'}); +} + +sub Entity +{ + my ($self, $name, $val, $sysId, $pubId, $ndata) = @_; + + if (exists $self->{ENTITY}->{$name}) + { + $self->fail (114, "ENTITY [$name] already defined"); + } + else + { + $self->{ENTITY}->{$name} = $val; + } +} + +# PerlSAX API +#sub xml_decl {} $hash=> Version, Encoding, Standalone +# Don't implement resolve_entity() which is called by ExternEnt! +#sub processing_instruction {} $hash=> Target, Data + +# Returns whether the Char data is whitespace and also updates the +# $INSIGNIF_WS variable to indicate whether it is insignificant whitespace. +# Note that this method is only called in places where potential whitespace +# can be insignificant (i.e. when the ERule is Children or EMPTY) +sub isWS +{ + $INSIGNIF_WS = ($_[1] =~ /^\s*$/); +} + +sub isInsignifWS +{ + $INSIGNIF_WS; +} + +sub fail +{ + my $self = shift; + &$FAIL (@_); +} + +sub print_error # static +{ + my $str = error_string (@_); + print STDERR $str; +} + +sub error_string # static +{ + my $code = shift; + my $msg = shift; + + my @a = (); + my ($key, $val); + while ($key = shift) + { + $val = shift; + push @a, ("$key " . (defined $val ? $val : "(undef)")); + } + + my $cat = $code >= 200 ? ($code >= 300 ? "INFO" : "WARNING") : "ERROR"; + my $str = join (", ", @a); + $str = length($str) ? "\tContext: $str\n" : ""; + + "XML::Checker $cat-$code: $msg\n$str"; +} + +sub debug +{ + my ($self) = @_; + my $context = $self->{Context}->[0]; + my $c = $context ? $context->debug : "no context"; + my $root = $self->{RootElement}; + + "Checker[$c,RootElement=$root]"; +} + +1; # package return code + +__END__ + +=head1 NAME + +XML::Checker - A perl module for validating XML + +=head1 SYNOPSIS + +L - an L that validates at parse time + +L - an L that validates at parse time + +(Some of the package names may change! This is only an alpha release...) + +=head1 DESCRIPTION + +XML::Checker can be used in different ways to validate XML. See the manual +pages of L and L +for more information. + +This document only describes common topics like error handling +and the XML::Checker class itself. + +WARNING: Not all errors are currently checked. Almost everything is subject to +change. Some reported errors may not be real errors. + +=head1 ERROR HANDLING + +Whenever XML::Checker (or one of the packages that uses XML::Checker) detects a +potential error, the 'fail handler' is called. It is currently also called +to report information, like how many times an Entity was referenced. +(The whole error handling mechanism is subject to change, I'm afraid...) + +The default fail handler is XML::Checker::print_error(), which prints an error +message to STDERR. It does not stop the XML::Checker, so it will continue +looking for other errors. +The error message is created with XML::Checker::error_string(). + +You can define your +own fail handler in two ways, locally and globally. Use a local variable to +temporarily override the fail handler. This way the default fail handler is restored +when the local variable goes out of scope, esp. when exceptions are thrown e.g. + + # Using a local variable to temporarily override the fail handler (preferred) + { # new block - start of local scope + local $XML::Checker::FAIL = \&my_fail; + ... your code here ... + } # end of block - the previous fail handler is restored + +You can also set the error handler globally, risking that your code may not +be reusable or may clash with other modules that use XML::Checker. + + # Globally setting the fail handler (not recommended) + $XML::Checker::FAIL = \&my_fail; + ... rest of your code ... + +The fail handler is called with the following parameters ($code, $msg, @context), +where $code is the error code, $msg is the error description and +@context contains information on where the error occurred. The @context is +a (ordered) list of (key,value) pairs and can easily be turned into a hash. +It contains the following information: + + Element - tag name of Element node (if applicable) + Attr - attribute name (if applicable) + ChildElementIndex - if applicable (see error 157) + line - only when parsing + column - only when parsing + byte - only when parsing (-1 means: end of file) + +Some examples of fail handlers: + + # Don't print info messages + sub my_fail + { + my $code = shift; + print STDERR XML::Checker::error_message ($code, @_) + if $code < 300; + } + + # Die when the first error is encountered - this will stop + # the parsing process. Ignore information messages. + sub my_fail + { + my $code = shift; + die XML::Checker::error_message ($code, @_) if $code < 300; + } + + # Count the number of undefined NOTATION references + # and print the error as usual + sub my_fail + { + my $code = shift; + $count_undef_notations++ if $code == 100; + XML::Checker::print_error ($code, @_); + } + + # Die when an error is encountered. + # Don't die if a warning or info message is encountered, just print a message. + sub my_fail { + my $code = shift; + die XML::Checker::error_string ($code, @_) if $code < 200; + XML::Checker::print_error ($code, @_); + } + +=head1 INSIGNIFICANT WHITESPACE + +XML::Checker keeps track of whether whitespace found in character data +is significant or not. It is considered insignicant if it is found inside +an element that has a ELEMENT rule that is not of type Mixed or of type ANY. +(A Mixed ELEMENT rule does contains the #PCDATA keyword. +An ANY rule contains the ANY keyword. See the XML spec for more info.) + +XML::Checker can not determine whether whitespace is insignificant in those two +cases, because they both allow regular character data to appear within +XML elements and XML::Checker can therefore not deduce whether whitespace +is part of the actual data or was just added for readability of the XML file. + +XML::Checker::Parser and XML::DOM::ValParser both have the option to skip +insignificant whitespace when setting B to 1 in their constructor. +If set, they will not call the Char handler when insignificant whitespace is +encountered. This means that in XML::DOM::ValParser no Text nodes are created +for insignificant whitespace. + +Regardless of whether the SkipInsignifWS options is set, XML::Checker always +keeps track of whether whitespace is insignificant. After making a call to +XML::Checker's Char handler, you can find out if it was insignificant whitespace +by calling the isInsignifWS method. + +When using multiple (nested) XML::Checker instances or when using XML::Checker +without using XML::Checker::Parser or XML::DOM::ValParser (which hardly anybody +probably will), make sure to set a local variable in the scope of your checking +code, e.g. + + { # new block - start of local scope + local $XML::Checker::INSIGNIF_WS = 0; + ... insert your code here ... + } # end of scope + +=head1 ERROR CODES + +There are 3 categories, errors, warnings and info messages. +(The codes are still subject to change, as well the error descriptions.) + +Most errors have a link to the appropriate Validaty Constraint (B) +or other section in the XML specification. + +=head2 ERROR Messages + +=head2 100 - 109 + +=over 4 + +=item * + +B<100> - undefined NOTATION [$notation] in ATTLIST + +The ATTLIST contained a Notation reference that was not defined in a +NOTATION definition. +B L + + +=item * + +B<101> - undefined ELEMENT [$tagName] + +The specified Element was never defined in an ELEMENT definition. +This is not an error according to the XML spec. +See L + + +=item * + +B<102> - undefined unparsed ENTITY [$entity] + +The attribute value referenced an undefined unparsed entity. +B L + + +=item * + +B<103> - undefined attribute [$attrName] + +The specified attribute was not defined in an ATTLIST for that Element. +B L + + +=back + +=head2 110 - 119 + +=over 4 + +=item * + +B<110> - attribute [$attrName] of element [$tagName] already defined + +The specified attribute was already defined in this ATTLIST definition or +in a previous one. +This is not an error according to the XML spec. +See L + + +=item * + +B<111> - ID [$value] already defined + +An ID with the specified value was already defined in an attribute +within the same document. +B L + + +=item * + +B<112> - unparsed ENTITY [$entity] already defined + +This is not an error according to the XML spec. +See L + + +=item * + +B<113> - NOTATION [$notation] already defined + + +=item * + +B<114> - ENTITY [$entity] already defined + +This is not an error according to the XML spec. +See L + + +=item * + +B<115> - ELEMENT [$name] already defined +B L + + +=back + +=head2 120 - 129 + +=over 4 + +=item * + +B<120> - invalid default ENTITY [$default] + +(Or IDREF or NMTOKEN instead of ENTITY.) +The ENTITY, IDREF or NMTOKEN reference in the default attribute +value for an attribute with types ENTITY, IDREF or NMTOKEN was not +valid. +B L + + +=item * + +B<121> - invalid default [$token] in ENTITIES [$default] + +(Or IDREFS or NMTOKENS instead of ENTITIES) +One of the ENTITY, IDREF or NMTOKEN references in the default attribute +value for an attribute with types ENTITIES, IDREFS or NMTOKENS was not +valid. +B L + + +=item * + +B<122> - invalid default attribute value [$default] + +The specified default attribute value is not a valid attribute value. +B L + + +=item * + +B<123> - invalid default ID [$default], must be #REQUIRED or #IMPLIED + +The default attribute value for an attribute of type ID has to be +#REQUIRED or #IMPLIED. +B L + + +=item * + +B<124> - bad model [$model] for ELEMENT [$name] + +The model in the ELEMENT definition did not conform to the XML syntax +for Mixed models. +See L + + +=back + +=head2 130 - 139 + +=over 4 + +=item * + +B<130> - invalid NMTOKEN [$attrValue] + +The attribute value is not a valid NmToken token. +B L + + +=item * + +B<131> - invalid ID [$attrValue] + +The specified attribute value is not a valid Name token. +B L + + +=item * + +B<132> - invalid IDREF [$value] + +The specified attribute value is not a valid Name token. +B L + + +=item * + +B<133> - invalid ENTITY name [$name] + +The specified attribute value is not a valid Name token. +B L + + +=item * + +B<134> - invalid Enumeration value [$value] in ATTLIST + +The specified value is not a valid NmToken (see XML spec for def.) +See definition of L + + +=item * + +B<135> - empty NOTATION list in ATTLIST + +The NOTATION list of the ATTLIST definition did not contain any NOTATION +references. +See definition of L + + +=item * + +B<136> - empty Enumeration list in ATTLIST + +The ATTLIST definition of the attribute of type Enumeration did not +contain any values. +See definition of L + + +=item * + +B<137> - invalid ATTLIST type [$type] + +The attribute type has to be one of: ID, IDREF, IDREFS, ENTITY, ENTITIES, +NMTOKEN, NMTOKENS, CDATA, NOTATION or an Enumeration. +See definition of L + + +=back + +=head2 150 - 159 + +=over 4 + +=item * + +B<150> - bad #FIXED attribute value [$value], it should be [$default] + +The specified attribute was defined as #FIXED in the ATTLIST definition +and the found attribute $value differs from the specified $default value. +B L + + +=item * + +B<151> - only one ID allowed in ATTLIST per element first=[$attrName] + +The ATTLIST definitions for an Element may contain only one attribute +with the type ID. The specified $attrName is the one that was found first. +B L + + +=item * + +B<152> - Element should be EMPTY, found Element [$tagName] + +The ELEMENT definition for the specified Element said it should be +EMPTY, but a child Element was found. +B L + + +=item * + +B<153> - Element should be EMPTY, found text [$text] + +The ELEMENT definition for the specified Element said it should be +EMPTY, but text was found. Currently, whitespace is not allowed between the +open and close tag. (This may be wrong, please give feedback.) +To allow whitespace (subject to change), set: + + $XML::Checker::Context::EMPTY::ALLOW_WHITE_SPACE = 1; + +B L + + +=item * + +B<154> - bad order of Elements Found=[$found] RE=[$re] + +The child elements of the specified Element did not match the +regular expression found in the ELEMENT definition. $found contains +a comma separated list of all the child element tag names that were found. +$re contains the (decoded) regular expression that was used internally. +B L + + +=item * + +B<155> - more than one root Element [$tags] + +An XML Document may only contain one Element. +$tags is a comma separated list of element tag names encountered sofar. +L (expat) throws 'no element found' exception. +See two_roots.xml for an example. +See definition of L + + +=item * + +B<156> - unexpected root Element [$tagName], expected [$rootTagName] + +The tag name of the root Element of the XML Document differs from the name +specified in the DOCTYPE section. +L (expat) throws 'not well-formed' exception. +See bad_root.xml for an example. +B L + + +=item * + +B<157> - unexpected Element [$tagName] + +The ELEMENT definition for the specified Element does not allow child +Elements with the specified $tagName. +B L + +The error context contains ChildElementIndex which is the index within +its parent Element (counting only Element nodes.) + + +=item * + +B<158> - unspecified value for #IMPLIED attribute [$attrName] + +The ATTLIST for the specified attribute said the attribute was #IMPLIED, +which means the user application should supply a value, but the attribute +value was not specified. (User applications should pass a value and set +$specified to 1 in the Attr handler.) + + +=item * + +B<159> - unspecified value for #REQUIRED attribute [$attrName] + +The ATTLIST for the specified attribute said the attribute was #REQUIRED, +which means that a value should have been specified. +B L + + +=back + +=head2 160 - 169 + +=over 4 + +=item * + +B<160> - invalid Enumeration value [$attrValue] + +The specified attribute value does not match one of the Enumeration values +in the ATTLIST. +B L + + +=item * + +B<161> - invalid NOTATION value [$attrValue] + +The specifed attribute value was not found in the list of possible NOTATION +references as found in the ATTLIST definition. +B L + + +=item * + +B<162> - undefined NOTATION [$attrValue] + +The NOTATION referenced by the specified attribute value was not defined. +B L + + +=back + +=head2 WARNING Messages (200 and up) + +=over 4 + +=item * + +B<200> - undefined ID [$id] was referenced [$n] times + +The specified ID was referenced $n times, but never defined in an attribute +value with type ID. +B L + + +=back + +=head2 INFO Messages (300 and up) + +=over 4 + +=item * + +B<300> - [$n] references to ID [$id] + +The specified ID was referenced $n times. + + +=back + +=head2 Not checked + +The following errors are already checked by L (expat) and +are currently not checked by XML::Checker: + +(?? TODO - add more info) + +=over 4 + +=item root element is missing + +L (expat) throws 'no element found' exception. +See no_root.xml for an example. + +=back + +=head1 XML::Checker + +XML::Checker can be easily plugged into your application. +It uses mostly the same style of event handlers (or callbacks) as L. +See L manual page for descriptions of most handlers. + +It also implements PerlSAX style event handlers. See L. + +Currently, the XML::Checker object is a blessed hash with the following +(potentially useful) entries: + + $checker->{RootElement} - root element name as found in the DOCTYPE + $checker->{NOTATION}->{$notation} - is 1 if the NOTATION was defined + $checker->{ENTITY}->{$name} - contains the (first) ENTITY value if defined + $checker->{Unparsed}->{$entity} - is 1 if the unparsed ENTITY was defined + $checker->{ID}->{$id} - is 1 if the ID was defined + $checker->{IDREF}->{$id} - number of times the ID was referenced + + # Less useful: + $checker->{ERule}->{$tag} - the ELEMENT rules by Element tag name + $checker->{ARule}->{$tag} - the ATTLIST rules by Element tag name + $checker->{Context} - context stack used internally + $checker->{CurrARule} - current ATTLIST rule for the current Element + +=head2 XML:Checker methods + +This section is only interesting when using XML::Checker directly. +XML::Checker supports most event handlers that L supports with minor +differences. Note that the XML::Checker event handler methods are +instance methods and not static, so don't forget to call them like this, +without passing $expat (as in the L) handlers: + + $checker->Start($tagName); + +=over 4 + +=item Constructor + + $checker = new XML::Checker; + $checker = new XML::Checker (%user_args); + +User data may be stored by client applications. Only $checker->{User} is +guaranteed not to clash with internal hash keys. + +=item getRootElement () + + $tagName = $checker->getRootElement; + +Returns the root element name as found in the DOCTYPE + +=back + +=head2 Expat interface + +XML::Checker supports what I call the I interface, which is +the collection of methods you normally specify as the callback handlers +when using XML::Parser. + +Only the following L handlers are currently supported: +Init, Final, Char, Start, End, Element, Attlist, Doctype, +Unparsed, Entity, Notation. + +I don't know how to correctly support the Default handler for all L +releases. The Start handler works a little different (see below) and I +added Attr, InitDomElem, FinalDomElem, CDATA and EntityRef handlers. +See L for a description of the handlers that are not listed below. + +Note that this interface may disappear, when the PerlSAX interface stabilizes. + +=over 4 + +=item Start ($tag) + + $checker->Start($tag); + +Call this when an Element with the specified $tag name is encountered. +Different from the Start handler in L, in that no attributes +are passed in (use the Attr handler for those.) + +=item Attr ($tag, $attrName, $attrValue, $isSpecified) + + $checker->Attr($tag,$attrName,$attrValue,$spec); + +Checks an attribute with the specified $attrName and $attrValue against the +ATTLIST definition of the element with the specified $tag name. +$isSpecified means whether the attribute was specified (1) or defaulted (0). + +=item EndAttr () + + $checker->EndAttr; + +This should be called after all attributes are passed with Attr(). +It will check which of the #REQUIRED attributes were not specified and generate +the appropriate error (159) for each one that is missing. + +=item CDATA ($text) + + $checker->CDATA($text); + +This should be called whenever CDATASections are encountered. +Similar to Char handler (but might perform different checks later...) + +=item EntityRef ($entity, $isParameterEntity) + + $checker->EntityRef($entity,$isParameterEntity); + +Checks the ENTITY reference. Set $isParameterEntity to 1 for +entity references that start with '%'. + +=item InitDomElem () and FinalDomElem () + +Used by XML::DOM::Element::check() to initialize (and cleanup) the +context stack when checking a single element. + +=back + +=head2 PerlSAX interface + +XML::Checker now also supports the PerlSAX interface, so you can use XML::Checker +wherever you use PerlSAX handlers. + +XML::Checker implements the following methods: start_document, end_document, +start_element, end_element, characters, processing_instruction, comment, +start_cdata, end_cdata, entity_reference, notation_decl, unparsed_entity_decl, +entity_decl, element_decl, attlist_decl, doctype_decl, xml_decl + +Not implemented: set_document_locator, ignorable_whitespace + +See PerlSAX.pod for details. (It is called lib/PerlSAX.pod in the libxml-perl +distribution which can be found at CPAN.) + +=head1 CAVEATS + +This is an alpha release. Almost everything is subject to change. + +=head1 AUTHOR + +Send bug reports, hints, tips, suggestions to Enno Derksen at +>. + +=head1 SEE ALSO + +The home page of XML::Checker at L + +The XML spec (Extensible Markup Language 1.0) at L + +The L and L manual pages. + +The other packages that come with XML::Checker: +L, L + +The DOM Level 1 specification at L + +The PerlSAX specification. It is currently in lib/PerlSAX.pod in the +libxml-perl distribution by Ken MacLeod. + +The original SAX specification (Simple API for XML) can be found at +L and L