diff -r 8b87ea768cb8 -r 60053dab7e2a dummy_foundation/lib/XML/Checker.pm --- a/dummy_foundation/lib/XML/Checker.pm Wed Jun 03 18:33:51 2009 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2006 +0,0 @@ -# -# -# TO DO -# - update docs regarding PerlSAX interface -# - add current node to error context when checking DOM subtrees -# - add parsed Entity to test XML files -# - free circular references -# - Implied handler? -# - Notation, Entity, Unparsed checks, Default handler? -# - check no root element (it's checked by expat) ? - -package XML::Checker::Term; -use strict; - -sub new -{ - my ($class, %h) = @_; - bless \%h, $class; -} - -sub str -{ - '<' . $_[0]->{C} . $_[0]->{N} . '>' -} - -sub re -{ - $_[0]->{S} -} - -sub rel -{ - my $self = shift; - defined $self->{SL} ? @{ $self->{SL} } : ( $self->{S} ); -} - -sub debug -{ - my $t = shift; - my ($c, $n, $s) = ($t->{C}, $t->{N}, $t->{S}); - my @sl = $t->rel; - "{C=$c N=$n S=$s SL=@sl}"; -} - -#------------------------------------------------------------------------- - -package XML::Checker::Context; - -sub new -{ - my ($class) = @_; - my $scalar; - bless \$scalar, $class; -} - -sub Start {} -sub End {} -sub Char {} - -# -# The initial Context when checking an entire XML Document -# -package XML::Checker::DocContext; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::Context ); - -sub new -{ -#??checker not used - my ($class, $checker) = @_; - bless { }, $class; -} - -sub setRootElement -{ - $_[0]->{RootElement} = $_[1]; -} - -sub Start -{ - my ($self, $checker, $tag) = @_; - if (exists $self->{Elem}) - { - my $tags = join (", ", @{$self->{Elem}}); - $checker->fail (155, "more than one root Element [$tags]"); - push @{$self->{Elem}}, $tag; - } - else - { - $self->{Elem} = [ $tag ]; - } - - my $exp_root = $self->{RootElement}; - $checker->fail (156, "unexpected root Element [$tag], expected [$exp_root]") - if defined ($exp_root) and $tag ne $exp_root; -} - -sub debug -{ - my $self = shift; - "DocContext[Count=" . $self->{Count} . ",Root=" . - $self->{RootElement} . "]"; -} - -package XML::Checker::Context::ANY; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::Context ); - -# No overrides, because everything is accepted - -sub debug { "XML::Checker::Context::ANY" } - -package XML::Checker::Context::EMPTY; -use vars qw( @ISA $ALLOW_WHITE_SPACE ); -@ISA = qw( XML::Checker::Context ); - -$ALLOW_WHITE_SPACE = 0; - -sub debug { "XML::Checker::Context::EMPTY" } - -sub Start -{ - my ($self, $checker, $tag) = @_; - $checker->fail (152, "Element should be EMPTY, found Element [$tag]"); -} - -sub Char -{ - my ($self, $checker, $str) = @_; - $checker->fail (153, "Element should be EMPTY, found text [$str]") - unless ($ALLOW_WHITE_SPACE and $checker->isWS ($str)); - - # NOTE: if $ALLOW_WHITE_SPACE = 1, the isWS call does not only check - # whether it is whitespace, but it also informs the checker that this - # might be insignificant whitespace -} - -#?? what about Comments - -package XML::Checker::Context::Children; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::Context ); - -sub new -{ - my ($class, $rule) = @_; - bless { Name => $rule->{Name}, RE => $rule->{RE}, Buf => "", N => 0 }, $class; -} - -sub phash -{ - my $href = shift; - my $str = ""; - for (keys %$href) - { - $str .= ' ' if $str; - $str .= $_ . '=' . $href->{$_}; - } - $str; -} - -sub debug -{ - my $self = shift; - "Context::Children[Name=(" . phash ($self->{Name}) . ",N=" . $self->{N} . - ",RE=" . $self->{RE} . ",Buf=[" . $self->{Buf} . "]"; -} - -sub Start -{ - my ($self, $checker, $tag) = @_; - -#print "Children.Start tag=$tag rule=$checker drule=" . $checker->debug . "\n"; - - if (exists $self->{Name}->{$tag}) - { -#print "Buf=[".$self->{Buf}. "] tag=[" . $self->{Name}->{$tag}->{S} . "]\n"; - $self->{Buf} .= $self->{Name}->{$tag}->{S}; - } - else - { - $checker->fail (157, "unexpected Element [$tag]", - ChildElementIndex => $self->{N}) - } - $self->{N}++; -} - -sub decode -{ - my ($self) = @_; - my $re = $self->{RE}; - my $name = $self->{Name}; - my $buf = $self->{Buf}; - - my %s = (); - while (my ($key, $val) = each %$name) - { - $s{$val->{S}} = $key; - } - - my ($len) = scalar (keys %$name); - $len = length $len; - my $dots = "[^()*+?]" x $len; - - $buf =~ s/($dots)/$s{$1} . ","/ge; - chop $buf; - - $re =~ s/($dots)/"(" . $s{$1} . ")"/ge; - - "Found=[$buf] RE=[$re]" -} - -sub End -{ - my ($self, $checker) = @_; - my $re = $self->{RE}; - -#print "End " . $self->debug . "\n"; - $checker->fail (154, "bad order of Elements " . $self->decode) - unless $self->{Buf} =~ /^$re$/; -} - -sub Char -{ - my ($self, $checker, $str) = @_; - - # Inform the checker that this might be insignificant whitespace - $checker->isWS ($str); -} - -package XML::Checker::Context::Mixed; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::Context ); - -sub new -{ - my ($class, $rule) = @_; - bless { Name => $rule->{Name}, N => 0 }, $class; -} - -sub debug -{ - my $self = shift; - "Context::Mixed[Name=" . $self->{Name} . ",N=" , $self->{N} . "]"; -} - -sub Start -{ - my ($self, $checker, $tag) = @_; - - $checker->fail (157, "unexpected Element [$tag]", - ChildElementIndex => $self->{N}) - unless exists $self->{Name}->{$tag}; - $self->{N}++; -} - -package XML::Checker::ERule; - -package XML::Checker::ERule::EMPTY; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::ERule ); - -sub new -{ - my ($class) = @_; - bless {}, $class; -} - -my $context = new XML::Checker::Context::EMPTY; -sub context { $context } # share the context - -sub debug { "EMPTY" } - -package XML::Checker::ERule::ANY; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::ERule ); - -sub new -{ - my ($class) = @_; - bless {}, $class; -} - -my $any_context = new XML::Checker::Context::ANY; -sub context { $any_context } # share the context - -sub debug { "ANY" } - -package XML::Checker::ERule::Mixed; -use vars qw( @ISA ); -@ISA = qw( XML::Checker::ERule ); - -sub new -{ - my ($class) = @_; - bless { Name => {} }, $class; -} - -sub context -{ - my ($self) = @_; - new XML::Checker::Context::Mixed ($self); -} - -sub setModel -{ - my ($self, $model) = @_; - my $rule = $model; - - # Mixed := '(' '#PCDATA' ')' '*'? - if ($rule =~ /^\(\s*#PCDATA\s*\)(\*)?$/) - { -#? how do we interpret the '*' ?? - return 1; - } - else # Mixed := '(' '#PCDATA' ('|' Name)* ')*' - { - return 0 unless $rule =~ s/^\(\s*#PCDATA\s*//; - return 0 unless $rule =~ s/\s*\)\*$//; - - my %names = (); - while ($rule =~ s/^\s*\|\s*($XML::RegExp::Name)//) - { - $names{$1} = 1; - } - if ($rule eq "") - { - $self->{Name} = \%names; - return 1; - } - } - return 0; -} - -sub debug -{ - my ($self) = @_; - "Mixed[Names=" . join("|", keys %{$self->{Name}}) . "]"; -} - -package XML::Checker::ERule::Children; -use vars qw( @ISA %_name %_map $_n ); -@ISA = qw( XML::Checker::ERule ); - -sub new -{ - my ($class) = @_; - bless {}, $class; -} - -sub context -{ - my ($self) = @_; - new XML::Checker::Context::Children ($self); -} - -sub _add # static -{ - my $exp = new XML::Checker::Term (@_); - $_map{$exp->{N}} = $exp; - $exp->str; -} - -my $IDS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - -sub _tokenize -{ - my ($self, $rule) = @_; - - # Replace names with Terms of the form "", e.g. "". - # Lookup already used names and store new names in %_name. - # - $$rule =~ s/($XML::RegExp::Name)(?!>)/ - if (exists $_name{$1}) # name already used? - { - $_name{$1}->str; - } - else - { - my $exp = new XML::Checker::Term (C => 'n', N => $_n++, - Name => $1); - $_name{$1} = $_map{$exp->{N}} = $exp; - $exp->str; - } - /eg; - - if ($_n < length $IDS) - { - # Generate regular expression for the name Term, i.e. - # a single character from $IDS - my $i = 0; - for (values %_name) - { - $_->{S} = substr ($IDS, $i++, 1); -#print "tokenized " . $_->{Name} . " num=" . $_->{N} . " to " . $_->{S} . "\n"; - } - } - else - { - # Generate RE, convert Term->{N} to hex string a la "(#)", - # e.g. "(03d)". Calculate needed length of hex string first. - my $len = 1; - for (my $n = $_n - 1; ($n >> 4) > 0; $len++) {} - - my $i = 0; - for (values %_name) - { - $_->{S} = sprintf ("(0${len}lx)", $i++); -#print "tokenized " . $_->{Name} . " num=" . $_->{N} . " to " . $_->{S} . "\n"; - } - } -} - -sub setModel -{ - my ($self, $rule) = @_; - - local $_n = 0; - local %_map = (); - local %_name = (); - - $self->_tokenize (\$rule); - -#?? check for single name - die "!ELEMENT contents can't be just a NAME" if $rule =~ /^$XML::RegExp::Name$/; - - for ($rule) - { - my $n = 1; - while ($n) - { - $n = 0; - - # cp := ( name | choice | seq ) ('?' | '*' | '+')? - $n++ while s/<[ncs](\d+)>([?*+]?)/_add - (C => 'a', N => $_n++, - S => ($_map{$1}->re . $2))/eg; - - # choice := '(' ch_l ')' - $n++ while s/\(\s*<[ad](\d+)>\s*\)/_add - (C => 'c', N => $_n++, - S => "(" . join ("|", $_map{$1}->rel) . ")")/eg; - - # ch_l := ( cp | ch_l ) '|' ( cp | ch_l ) - $n++ while s/<[ad](\d+)>\s*\|\s*<[ad](\d+)>/_add - (C => 'd', N => $_n++, - SL => [ $_map{$1}->rel, $_map{$2}->rel ])/eg; - - # seq := '(' (seq_l ')' - $n++ while s/\(\s*<[at](\d+)>\s*\)/_add - (C => 's', N => $_n++, - S => "(".join("", $_map{$1}->rel).")")/eg; - - # seq_l := ( cp | seq_l ) ',' ( cp | seq_l ) - $n++ while s/<[at](\d+)>\s*,\s*<[at](\d+)>/_add - (C => 't', N => $_n++, - SL => [ $_map{$1}->rel, $_map{$2}->rel ])/eg; - } - } - - return 0 if ($rule !~ /^$/); - - $self->{Name} = \%_name; - $self->{RE} = $_map{$1}->re; - - return 1; -} - -sub debug -{ - my ($self) = @_; - "Children[RE=" . $self->{RE} . "]"; -} - - -package XML::Checker::ARule; -use XML::RegExp; - -sub new -{ - my ($class, $elem, $checker) = @_; - bless { Elem => $elem, Checker => $checker, Required => {} }, $class; -} - -sub Attlist -{ - my ($self, $attr, $type, $default, $fixed, $checker) = @_; - my ($c1, $c2); - - if ($self->{Defined}->{$attr}) - { - my $tag = $self->{Elem}; - $self->fail ($attr, 110, "attribute [$attr] of element [$tag] already defined"); - } - else - { - $self->{Defined}->{$attr} = 1; - } - - if ($default =~ /^\#(REQUIRED|IMPLIED)$/) - { - $c1 = $1; - - # Keep list of all required attributes - if ($default eq '#REQUIRED') - { - $self->{Required}->{$attr} = 1; - } - } - else - { - $self->fail ($attr, 122, "invalid default attribute value [$default]") - unless $default =~ /^$XML::RegExp::AttValue$/; - - $default = substr ($default, 1, length($default)-2); - $self->{Default}->{$attr} = $default; - $c1 = 'FIXED' if $fixed; - } - - if ($type eq 'ID') - { - $self->fail ($attr, 123, "invalid default ID [$default], must be #REQUIRED or #IMPLIED") - unless $default =~ /^#(REQUIRED|IMPLIED)$/; - - if (exists ($self->{ID}) && $self->{ID} ne $attr) - { - $self->fail ($attr, 151, "only one ID allowed per ELEMENT " . - "first=[" . $self->{ID} . "]"); - } - else - { - $self->{ID} = $attr; - } - $c2 = 'ID'; - } - elsif ($type =~ /^(IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS)$/) - { - my $def = $self->{Default}->{$attr}; - if (defined $def) - { - my $re = ($type =~ /^[IE]/) ? $XML::RegExp::Name : $XML::RegExp::NmToken; - if ($type =~ /S$/) - { - for (split (/\s+/, $def)) - { - $self->fail ($attr, 121, - "invalid default [$_] in $type [$def]") - unless $_ =~ /^$re$/; - } - } - else # singular - { - $self->fail ($attr, 120, "invalid default $type [$def]") - unless $def =~ /^$re$/; - } - } - $c2 = $type; - } - elsif ($type ne 'CDATA') # Enumerated := NotationType | Enumeration - { - if ($type =~ /^\s*NOTATION\s*\(\s*($XML::RegExp::Name(\s*\|\s*$XML::RegExp::Name)*)\s*\)\s*$/) - { - $self->fail ($attr, 135, "empty NOTATION list in ATTLIST") - unless defined $1; - - my @tok = split (/\s*\|\s*/, $1); - for (@tok) - { - $self->fail ($attr, 100, "undefined NOTATION [$_] in ATTLIST") - unless exists $checker->{NOTATION}->{$_}; - } - - my $re = join ("|", @tok); - $self->{NotationRE} = "^($re)\$"; - $c2 = 'NotationType'; - } - elsif ($type =~ /^\s*\(\s*($XML::RegExp::NmToken(\s*\|\s*$XML::RegExp::NmToken)*)\s*\)\s*$/) - { - # Enumeration - - $self->fail ($attr, 136, "empty Enumeration list in ATTLIST") - unless defined $1; - - my @tok = split (/\s*\|\s*/, $1); - for (@tok) - { - $self->fail ($attr, 134, - "invalid Enumeration value [$_] in ATTLIST") - unless $_ =~ /^$XML::RegExp::NmToken$/; - } - $self->{EnumRE}->{$attr} = '^(' . join ("|", @tok) . ')$'; #'; - $c2 = 'Enumeration'; - } - else - { - $self->fail ($attr, 137, "invalid ATTLIST type [$type]"); - } - } - - $self->{Check1}->{$attr} = $c1 if $c1; - $self->{Check2}->{$attr} = $c2 if $c2; -} - -sub fail -{ - my $self = shift; - my $attr = shift; - $self->{Checker}->fail (@_, Element => $self->{Elem}, Attr => $attr); -} - -sub check -{ - my ($self, $attr) = @_; - my $func1 = $self->{Check1}->{$attr}; - my $func2 = $self->{Check2}->{$attr}; -# print "check func1=$func1 func2=$func2 @_\n"; - - if (exists $self->{ReqNotSeen}->{$attr}) - { - delete $self->{ReqNotSeen}->{$attr}; - } - no strict; - - &$func1 (@_) if defined $func1; - &$func2 (@_) if defined $func2; -} - -# Copies the list of all required attributes from $self->{Required} to -# $self->{ReqNotSeen}. -# When check() encounters a required attribute, it is removed from ReqNotSeen. -# In EndAttr we look at which attribute names are still in ReqNotSeen - those -# are the ones that were not specified and are, therefore, in error. -sub StartAttr -{ - my $self = shift; - my %not_seen = %{ $self->{Required} }; - $self->{ReqNotSeen} = \%not_seen; -} - -# Checks which of the #REQUIRED attributes were not specified -sub EndAttr -{ - my $self = shift; - - for my $attr (keys %{ $self->{ReqNotSeen} }) - { - $self->fail ($attr, 159, - "unspecified value for \#REQUIRED attribute [$attr]"); - } -} - -sub FIXED -{ - my ($self, $attr, $val, $specified) = @_; - - my $default = $self->{Default}->{$attr}; - $self->fail ($attr, 150, - "bad \#FIXED attribute value [$val], it should be [$default]") - unless ($val eq $default); -} - -sub IMPLIED -{ - my ($self, $attr, $val, $specified) = @_; - -#?? should #IMPLIED be specified? - $self->fail ($attr, 158, - "unspecified value for \#IMPLIED attribute [$attr]") - unless $specified; - -#?? Implied handler ? -} - -# This is called when an attribute is passed to the check() method by -# XML::Checker::Attr(), i.e. when the attribute was specified explicitly -# or defaulted by the parser (which should never happen), *NOT* when the -# attribute was omitted. (The latter is checked by StartAttr/EndAttr) -sub REQUIRED -{ - my ($self, $attr, $val, $specified) = @_; -# print "REQUIRED attr=$attr val=$val spec=$specified\n"; - - $self->fail ($attr, 159, - "unspecified value for \#REQUIRED attribute [$attr]") - unless $specified; -} - -sub ID # must be #IMPLIED or #REQUIRED -{ - my ($self, $attr, $val, $specified) = @_; - - $self->fail ($attr, 131, "invalid ID [$val]") - unless $val =~ /^$XML::RegExp::Name$/; - - $self->fail ($attr, 111, "ID [$val] already defined") - if $self->{Checker}->{ID}->{$val}++; -} - -sub IDREF -{ - my ($self, $attr, $val, $specified) = @_; - - $self->fail ($attr, 132, "invalid IDREF [$val]") - unless $val =~ /^$XML::RegExp::Name$/; - - $self->{Checker}->{IDREF}->{$val}++; -} - -sub IDREFS -{ - my ($self, $attr, $val, $specified) = @_; - for (split /\s+/, $val) - { - $self->IDREF ($attr, $_); - } -} - -sub ENTITY -{ - my ($self, $attr, $val, $specified) = @_; -#?? should it be specified? - - $self->fail ($attr, 133, "invalid ENTITY name [$val]") - unless $val =~ /^$XML::RegExp::Name$/; - - $self->fail ($attr, 102, "undefined unparsed ENTITY [$val]") - unless exists $self->{Checker}->{Unparsed}->{$val}; -} - -sub ENTITIES -{ - my ($self, $attr, $val, $specified) = @_; - for (split /\s+/, $val) - { - $self->ENTITY ($attr, $_); - } -} - -sub NMTOKEN -{ - my ($self, $attr, $val, $specified) = @_; - $self->fail ($attr, 130, "invalid NMTOKEN [$val]") - unless $val =~ /^$XML::RegExp::NmToken$/; -} - -sub NMTOKENS -{ - my ($self, $attr, $val, $specified) = @_; - for (split /\s+/, $val) - { - $self->NMTOKEN ($attr, $_, $specified); - } -} - -sub Enumeration -{ - my ($self, $attr, $val, $specified) = @_; - my $re = $self->{EnumRE}->{$attr}; - - $self->fail ($attr, 160, "invalid Enumeration value [$val]") - unless $val =~ /$re/; -} - -sub NotationType -{ - my ($self, $attr, $val, $specified) = @_; - my $re = $self->{NotationRE}; - - $self->fail ($attr, 161, "invalid NOTATION value [$val]") - unless $val =~ /$re/; - - $self->fail ($attr, 162, "undefined NOTATION [$val]") - unless exists $self->{Checker}->{NOTATION}->{$val}; -} - -package XML::Checker; -use vars qw ( $VERSION $FAIL $INSIGNIF_WS ); - -BEGIN -{ - $VERSION = '0.09'; -} - -$FAIL = \&print_error; - -# Whether the last seen Char data was insignicant whitespace -$INSIGNIF_WS = 0; - -sub new -{ - my ($class, %args) = @_; - - $args{ERule} = {}; - $args{ARule} = {}; - $args{InCDATA} = 0; - -# $args{Debug} = 1; - bless \%args, $class; -} - -# PerlSAX API -sub element_decl -{ - my ($self, $hash) = @_; - $self->Element ($hash->{Name}, $hash->{Model}); -} - -# Same parameter order as the Element handler in XML::Parser module -sub Element -{ - my ($self, $name, $model) = @_; - - if (defined $self->{ERule}->{$name}) - { - $self->fail (115, "ELEMENT [$name] already defined", - Element => $name); - } - - if ($model eq "EMPTY") - { - $self->{ERule}->{$name} = new XML::Checker::ERule::EMPTY; - } - elsif ($model eq "ANY") - { - $self->{ERule}->{$name} = new XML::Checker::ERule::ANY; - } - elsif ($model =~ /#PCDATA/) - { - my $rule = new XML::Checker::ERule::Mixed; - if ($rule->setModel ($model)) - { - $self->{ERule}->{$name} = $rule; - } - else - { - $self->fail (124, "bad model [$model] for ELEMENT [$name]", - Element => $name); - } - } - else - { - my $rule = new XML::Checker::ERule::Children; - if ($rule->setModel ($model)) - { - $self->{ERule}->{$name} = $rule; - } - else - { - $self->fail (124, "bad model [$model] for ELEMENT [$name]", - Element => $name); - } - } - my $rule = $self->{ERule}->{$name}; - print "added ELEMENT model for $name: " . $rule->debug . "\n" - if $rule and $self->{Debug}; -} - -# PerlSAX API -sub attlist_decl -{ - my ($self, $hash) = @_; - $self->Attlist ($hash->{ElementName}, $hash->{AttributeName}, - $hash->{Type}, $hash->{Default}, $hash->{Fixed}); -} - -sub Attlist -{ - my ($self, $tag, $attrName, $type, $default, $fixed) = @_; - my $arule = $self->{ARule}->{$tag} ||= - new XML::Checker::ARule ($tag, $self); - - $arule->Attlist ($attrName, $type, $default, $fixed, $self); -} - -# Initializes the context stack to check an XML::DOM::Element -sub InitDomElem -{ - my $self = shift; - - # initialize Context stack - $self->{Context} = [ new XML::Checker::Context::ANY ($self) ]; - $self->{InCDATA} = 0; -} - -# Clears the context stack after checking an XML::DOM::Element -sub FinalDomElem -{ - my $self = shift; - delete $self->{Context}; -} - -# PerlSAX API -sub start_document -{ - shift->Init; -} - -sub Init -{ - my $self = shift; - - # initialize Context stack - $self->{Context} = [ new XML::Checker::DocContext ($self) ]; - $self->{InCDATA} = 0; -} - -# PerlSAX API -sub end_document -{ - shift->Final; -} - -sub Final -{ - my $self = shift; -#?? could add more statistics: unreferenced Unparsed, ID - - for (keys %{ $self->{IDREF} }) - { - my $n = $self->{IDREF}->{$_}; - $self->fail (200, "undefined ID [$_] was referenced [$n] times") - unless defined $self->{ID}->{$_}; - } - - for (keys %{ $self->{ID} }) - { - my $n = $self->{IDREF}->{$_} || 0; - $self->fail (300, "[$n] references to ID [$_]"); - } - - delete $self->{Context}; -} - -sub getRootElement -{ - my $self = shift; -# print "getRoot $self " . $self->{RootElement} . "\n"; - $_[0]->{RootElement}; -} - -# PerlSAX API -sub doctype_decl -{ - my ($self, $hash) = @_; - $self->Doctype ($hash->{Name}, $hash->{SystemId}, - $hash->{PublicId}, $hash->{Internal}); -} - -sub Doctype -{ - my ($self, $name, $sysid, $pubid, $internal) = @_; - $self->{RootElement} = $name; - - my $context = $self->{Context}->[0]; - $context->setRootElement ($name); - -#?? what else -} - -sub Attr -{ - my ($self, $tag, $attr, $val, $specified) = @_; - -#print "Attr for tag=$tag attr=$attr val=$val spec=$specified\n"; - - my $arule = $self->{ARule}->{$tag}; - if (defined $arule && $arule->{Defined}->{$attr}) - { - $arule->check ($attr, $val, $specified); - } - else - { - $self->fail (103, "undefined attribute [$attr]", Element => $tag); - } -} - -sub EndAttr -{ - my $self = shift; - - my $arule = $self->{CurrARule}; - if (defined $arule) - { - $arule->EndAttr; - } -} - -# PerlSAX API -sub start_element -{ - my ($self, $hash) = @_; - my $tag = $hash->{Name}; - my $attr = $hash->{Attributes}; - - $self->Start ($tag); - - if (exists $hash->{AttributeOrder}) - { - my $defaulted = $hash->{Defaulted}; - my @order = @{ $hash->{AttributeOrder} }; - - # Specified attributes - for (my $i = 0; $i < $defaulted; $i++) - { - my $a = $order[$i]; - $self->Attr ($tag, $a, $attr->{$a}, 1); - } - - # Defaulted attributes - for (my $i = $defaulted; $i < @order; $i++) - { - my $attr = $order[$i]; - $self->Attr ($tag, $a, $attr->{$a}, 0); - } - } - else - { - # Assume all attributes were specified - my @attr = %$attr; - my ($key, $val); - while ($key = shift @attr) - { - $val = shift @attr; - - $self->Attr ($tag, $key, $val, 1); - } - } - $self->EndAttr; -} - -sub Start -{ - my ($self, $tag) = @_; -#?? if first tag, check with root element - or does expat check this already? - - my $context = $self->{Context}; - $context->[0]->Start ($self, $tag); - - my $erule = $self->{ERule}->{$tag}; - if (defined $erule) - { - unshift @$context, $erule->context; - } - else - { - # It's not a real error according to the XML Spec. - $self->fail (101, "undefined ELEMENT [$tag]"); - unshift @$context, new XML::Checker::Context::ANY; - } - -#?? what about ARule ?? - my $arule = $self->{ARule}->{$tag}; - if (defined $arule) - { - $self->{CurrARule} = $arule; - $arule->StartAttr; - } -} - -# PerlSAX API -sub end_element -{ - shift->End; -} - -sub End -{ - my ($self) = @_; - my $context = $self->{Context}; - - $context->[0]->End ($self); - shift @$context; -} - -# PerlSAX API -sub characters -{ - my ($self, $hash) = @_; - my $data = $hash->{Data}; - - if ($self->{InCDATA}) - { - $self->CData ($data); - } - else - { - $self->Char ($data); - } -} - -# PerlSAX API -sub start_cdata -{ - $_[0]->{InCDATA} = 1; -} - -# PerlSAX API -sub end_cdata -{ - $_[0]->{InCDATA} = 0; -} - -sub Char -{ - my ($self, $text) = @_; - my $context = $self->{Context}; - - # NOTE: calls to isWS may set this to 1. - $INSIGNIF_WS = 0; - - $context->[0]->Char ($self, $text); -} - -# Treat CDATASection same as Char (Text) -sub CData -{ - my ($self, $cdata) = @_; - my $context = $self->{Context}; - - $context->[0]->Char ($self, $cdata); - - # CDATASection can never be insignificant whitespace - $INSIGNIF_WS = 0; -#?? I'm not sure if this assumption is correct -} - -# PerlSAX API -sub comment -{ - my ($self, $hash) = @_; - $self->Comment ($hash->{Data}); -} - -sub Comment -{ -# ?? what can be checked here? -} - -# PerlSAX API -sub entity_reference -{ - my ($self, $hash) = @_; - $self->EntityRef ($hash->{Name}, 0); -#?? parameter entities (like %par;) are NOT supported! -# PerlSAX::handle_default should be fixed! -} - -sub EntityRef -{ - my ($self, $ref, $isParam) = @_; - - if ($isParam) - { - # expand to "%name;" - print STDERR "XML::Checker::Entity - parameter Entity (%ent;) not implemented\n"; - } - else - { - # Treat same as Char - for now - my $context = $self->{Context}; - $context->[0]->Char ($self, "&$ref;"); - $INSIGNIF_WS = 0; -#?? I could count the number of times each Entity is referenced - } -} - -# PerlSAX API -sub unparsed_entity_decl -{ - my ($self, $hash) = @_; - $self->Unparsed ($hash->{Name}); -#?? what about Base, SytemId, PublicId ? -} - -sub Unparsed -{ - my ($self, $entity) = @_; -# print "ARule::Unparsed $entity\n"; - if ($self->{Unparsed}->{$entity}) - { - $self->fail (112, "unparsed ENTITY [$entity] already defined"); - } - else - { - $self->{Unparsed}->{$entity} = 1; - } -} - -# PerlSAX API -sub notation_decl -{ - my ($self, $hash) = @_; - $self->Notation ($hash->{Name}); -#?? what about Base, SytemId, PublicId ? -} - -sub Notation -{ - my ($self, $notation) = @_; - if ($self->{NOTATION}->{$notation}) - { - $self->fail (113, "NOTATION [$notation] already defined"); - } - else - { - $self->{NOTATION}->{$notation} = 1; - } -} - -# PerlSAX API -sub entity_decl -{ - my ($self, $hash) = @_; - - $self->Entity ($hash->{Name}, $hash->{Value}, $hash->{SystemId}, - $hash->{PublicId}, $hash->{'Notation'}); -} - -sub Entity -{ - my ($self, $name, $val, $sysId, $pubId, $ndata) = @_; - - if (exists $self->{ENTITY}->{$name}) - { - $self->fail (114, "ENTITY [$name] already defined"); - } - else - { - $self->{ENTITY}->{$name} = $val; - } -} - -# PerlSAX API -#sub xml_decl {} $hash=> Version, Encoding, Standalone -# Don't implement resolve_entity() which is called by ExternEnt! -#sub processing_instruction {} $hash=> Target, Data - -# Returns whether the Char data is whitespace and also updates the -# $INSIGNIF_WS variable to indicate whether it is insignificant whitespace. -# Note that this method is only called in places where potential whitespace -# can be insignificant (i.e. when the ERule is Children or EMPTY) -sub isWS -{ - $INSIGNIF_WS = ($_[1] =~ /^\s*$/); -} - -sub isInsignifWS -{ - $INSIGNIF_WS; -} - -sub fail -{ - my $self = shift; - &$FAIL (@_); -} - -sub print_error # static -{ - my $str = error_string (@_); - print STDERR $str; -} - -sub error_string # static -{ - my $code = shift; - my $msg = shift; - - my @a = (); - my ($key, $val); - while ($key = shift) - { - $val = shift; - push @a, ("$key " . (defined $val ? $val : "(undef)")); - } - - my $cat = $code >= 200 ? ($code >= 300 ? "INFO" : "WARNING") : "ERROR"; - my $str = join (", ", @a); - $str = length($str) ? "\tContext: $str\n" : ""; - - "XML::Checker $cat-$code: $msg\n$str"; -} - -sub debug -{ - my ($self) = @_; - my $context = $self->{Context}->[0]; - my $c = $context ? $context->debug : "no context"; - my $root = $self->{RootElement}; - - "Checker[$c,RootElement=$root]"; -} - -1; # package return code - -__END__ - -=head1 NAME - -XML::Checker - A perl module for validating XML - -=head1 SYNOPSIS - -L - an L that validates at parse time - -L - an L that validates at parse time - -(Some of the package names may change! This is only an alpha release...) - -=head1 DESCRIPTION - -XML::Checker can be used in different ways to validate XML. See the manual -pages of L and L -for more information. - -This document only describes common topics like error handling -and the XML::Checker class itself. - -WARNING: Not all errors are currently checked. Almost everything is subject to -change. Some reported errors may not be real errors. - -=head1 ERROR HANDLING - -Whenever XML::Checker (or one of the packages that uses XML::Checker) detects a -potential error, the 'fail handler' is called. It is currently also called -to report information, like how many times an Entity was referenced. -(The whole error handling mechanism is subject to change, I'm afraid...) - -The default fail handler is XML::Checker::print_error(), which prints an error -message to STDERR. It does not stop the XML::Checker, so it will continue -looking for other errors. -The error message is created with XML::Checker::error_string(). - -You can define your -own fail handler in two ways, locally and globally. Use a local variable to -temporarily override the fail handler. This way the default fail handler is restored -when the local variable goes out of scope, esp. when exceptions are thrown e.g. - - # Using a local variable to temporarily override the fail handler (preferred) - { # new block - start of local scope - local $XML::Checker::FAIL = \&my_fail; - ... your code here ... - } # end of block - the previous fail handler is restored - -You can also set the error handler globally, risking that your code may not -be reusable or may clash with other modules that use XML::Checker. - - # Globally setting the fail handler (not recommended) - $XML::Checker::FAIL = \&my_fail; - ... rest of your code ... - -The fail handler is called with the following parameters ($code, $msg, @context), -where $code is the error code, $msg is the error description and -@context contains information on where the error occurred. The @context is -a (ordered) list of (key,value) pairs and can easily be turned into a hash. -It contains the following information: - - Element - tag name of Element node (if applicable) - Attr - attribute name (if applicable) - ChildElementIndex - if applicable (see error 157) - line - only when parsing - column - only when parsing - byte - only when parsing (-1 means: end of file) - -Some examples of fail handlers: - - # Don't print info messages - sub my_fail - { - my $code = shift; - print STDERR XML::Checker::error_message ($code, @_) - if $code < 300; - } - - # Die when the first error is encountered - this will stop - # the parsing process. Ignore information messages. - sub my_fail - { - my $code = shift; - die XML::Checker::error_message ($code, @_) if $code < 300; - } - - # Count the number of undefined NOTATION references - # and print the error as usual - sub my_fail - { - my $code = shift; - $count_undef_notations++ if $code == 100; - XML::Checker::print_error ($code, @_); - } - - # Die when an error is encountered. - # Don't die if a warning or info message is encountered, just print a message. - sub my_fail { - my $code = shift; - die XML::Checker::error_string ($code, @_) if $code < 200; - XML::Checker::print_error ($code, @_); - } - -=head1 INSIGNIFICANT WHITESPACE - -XML::Checker keeps track of whether whitespace found in character data -is significant or not. It is considered insignicant if it is found inside -an element that has a ELEMENT rule that is not of type Mixed or of type ANY. -(A Mixed ELEMENT rule does contains the #PCDATA keyword. -An ANY rule contains the ANY keyword. See the XML spec for more info.) - -XML::Checker can not determine whether whitespace is insignificant in those two -cases, because they both allow regular character data to appear within -XML elements and XML::Checker can therefore not deduce whether whitespace -is part of the actual data or was just added for readability of the XML file. - -XML::Checker::Parser and XML::DOM::ValParser both have the option to skip -insignificant whitespace when setting B to 1 in their constructor. -If set, they will not call the Char handler when insignificant whitespace is -encountered. This means that in XML::DOM::ValParser no Text nodes are created -for insignificant whitespace. - -Regardless of whether the SkipInsignifWS options is set, XML::Checker always -keeps track of whether whitespace is insignificant. After making a call to -XML::Checker's Char handler, you can find out if it was insignificant whitespace -by calling the isInsignifWS method. - -When using multiple (nested) XML::Checker instances or when using XML::Checker -without using XML::Checker::Parser or XML::DOM::ValParser (which hardly anybody -probably will), make sure to set a local variable in the scope of your checking -code, e.g. - - { # new block - start of local scope - local $XML::Checker::INSIGNIF_WS = 0; - ... insert your code here ... - } # end of scope - -=head1 ERROR CODES - -There are 3 categories, errors, warnings and info messages. -(The codes are still subject to change, as well the error descriptions.) - -Most errors have a link to the appropriate Validaty Constraint (B) -or other section in the XML specification. - -=head2 ERROR Messages - -=head2 100 - 109 - -=over 4 - -=item * - -B<100> - undefined NOTATION [$notation] in ATTLIST - -The ATTLIST contained a Notation reference that was not defined in a -NOTATION definition. -B L - - -=item * - -B<101> - undefined ELEMENT [$tagName] - -The specified Element was never defined in an ELEMENT definition. -This is not an error according to the XML spec. -See L - - -=item * - -B<102> - undefined unparsed ENTITY [$entity] - -The attribute value referenced an undefined unparsed entity. -B L - - -=item * - -B<103> - undefined attribute [$attrName] - -The specified attribute was not defined in an ATTLIST for that Element. -B L - - -=back - -=head2 110 - 119 - -=over 4 - -=item * - -B<110> - attribute [$attrName] of element [$tagName] already defined - -The specified attribute was already defined in this ATTLIST definition or -in a previous one. -This is not an error according to the XML spec. -See L - - -=item * - -B<111> - ID [$value] already defined - -An ID with the specified value was already defined in an attribute -within the same document. -B L - - -=item * - -B<112> - unparsed ENTITY [$entity] already defined - -This is not an error according to the XML spec. -See L - - -=item * - -B<113> - NOTATION [$notation] already defined - - -=item * - -B<114> - ENTITY [$entity] already defined - -This is not an error according to the XML spec. -See L - - -=item * - -B<115> - ELEMENT [$name] already defined -B L - - -=back - -=head2 120 - 129 - -=over 4 - -=item * - -B<120> - invalid default ENTITY [$default] - -(Or IDREF or NMTOKEN instead of ENTITY.) -The ENTITY, IDREF or NMTOKEN reference in the default attribute -value for an attribute with types ENTITY, IDREF or NMTOKEN was not -valid. -B L - - -=item * - -B<121> - invalid default [$token] in ENTITIES [$default] - -(Or IDREFS or NMTOKENS instead of ENTITIES) -One of the ENTITY, IDREF or NMTOKEN references in the default attribute -value for an attribute with types ENTITIES, IDREFS or NMTOKENS was not -valid. -B L - - -=item * - -B<122> - invalid default attribute value [$default] - -The specified default attribute value is not a valid attribute value. -B L - - -=item * - -B<123> - invalid default ID [$default], must be #REQUIRED or #IMPLIED - -The default attribute value for an attribute of type ID has to be -#REQUIRED or #IMPLIED. -B L - - -=item * - -B<124> - bad model [$model] for ELEMENT [$name] - -The model in the ELEMENT definition did not conform to the XML syntax -for Mixed models. -See L - - -=back - -=head2 130 - 139 - -=over 4 - -=item * - -B<130> - invalid NMTOKEN [$attrValue] - -The attribute value is not a valid NmToken token. -B L - - -=item * - -B<131> - invalid ID [$attrValue] - -The specified attribute value is not a valid Name token. -B L - - -=item * - -B<132> - invalid IDREF [$value] - -The specified attribute value is not a valid Name token. -B L - - -=item * - -B<133> - invalid ENTITY name [$name] - -The specified attribute value is not a valid Name token. -B L - - -=item * - -B<134> - invalid Enumeration value [$value] in ATTLIST - -The specified value is not a valid NmToken (see XML spec for def.) -See definition of L - - -=item * - -B<135> - empty NOTATION list in ATTLIST - -The NOTATION list of the ATTLIST definition did not contain any NOTATION -references. -See definition of L - - -=item * - -B<136> - empty Enumeration list in ATTLIST - -The ATTLIST definition of the attribute of type Enumeration did not -contain any values. -See definition of L - - -=item * - -B<137> - invalid ATTLIST type [$type] - -The attribute type has to be one of: ID, IDREF, IDREFS, ENTITY, ENTITIES, -NMTOKEN, NMTOKENS, CDATA, NOTATION or an Enumeration. -See definition of L - - -=back - -=head2 150 - 159 - -=over 4 - -=item * - -B<150> - bad #FIXED attribute value [$value], it should be [$default] - -The specified attribute was defined as #FIXED in the ATTLIST definition -and the found attribute $value differs from the specified $default value. -B L - - -=item * - -B<151> - only one ID allowed in ATTLIST per element first=[$attrName] - -The ATTLIST definitions for an Element may contain only one attribute -with the type ID. The specified $attrName is the one that was found first. -B L - - -=item * - -B<152> - Element should be EMPTY, found Element [$tagName] - -The ELEMENT definition for the specified Element said it should be -EMPTY, but a child Element was found. -B L - - -=item * - -B<153> - Element should be EMPTY, found text [$text] - -The ELEMENT definition for the specified Element said it should be -EMPTY, but text was found. Currently, whitespace is not allowed between the -open and close tag. (This may be wrong, please give feedback.) -To allow whitespace (subject to change), set: - - $XML::Checker::Context::EMPTY::ALLOW_WHITE_SPACE = 1; - -B L - - -=item * - -B<154> - bad order of Elements Found=[$found] RE=[$re] - -The child elements of the specified Element did not match the -regular expression found in the ELEMENT definition. $found contains -a comma separated list of all the child element tag names that were found. -$re contains the (decoded) regular expression that was used internally. -B L - - -=item * - -B<155> - more than one root Element [$tags] - -An XML Document may only contain one Element. -$tags is a comma separated list of element tag names encountered sofar. -L (expat) throws 'no element found' exception. -See two_roots.xml for an example. -See definition of L - - -=item * - -B<156> - unexpected root Element [$tagName], expected [$rootTagName] - -The tag name of the root Element of the XML Document differs from the name -specified in the DOCTYPE section. -L (expat) throws 'not well-formed' exception. -See bad_root.xml for an example. -B L - - -=item * - -B<157> - unexpected Element [$tagName] - -The ELEMENT definition for the specified Element does not allow child -Elements with the specified $tagName. -B L - -The error context contains ChildElementIndex which is the index within -its parent Element (counting only Element nodes.) - - -=item * - -B<158> - unspecified value for #IMPLIED attribute [$attrName] - -The ATTLIST for the specified attribute said the attribute was #IMPLIED, -which means the user application should supply a value, but the attribute -value was not specified. (User applications should pass a value and set -$specified to 1 in the Attr handler.) - - -=item * - -B<159> - unspecified value for #REQUIRED attribute [$attrName] - -The ATTLIST for the specified attribute said the attribute was #REQUIRED, -which means that a value should have been specified. -B L - - -=back - -=head2 160 - 169 - -=over 4 - -=item * - -B<160> - invalid Enumeration value [$attrValue] - -The specified attribute value does not match one of the Enumeration values -in the ATTLIST. -B L - - -=item * - -B<161> - invalid NOTATION value [$attrValue] - -The specifed attribute value was not found in the list of possible NOTATION -references as found in the ATTLIST definition. -B L - - -=item * - -B<162> - undefined NOTATION [$attrValue] - -The NOTATION referenced by the specified attribute value was not defined. -B L - - -=back - -=head2 WARNING Messages (200 and up) - -=over 4 - -=item * - -B<200> - undefined ID [$id] was referenced [$n] times - -The specified ID was referenced $n times, but never defined in an attribute -value with type ID. -B L - - -=back - -=head2 INFO Messages (300 and up) - -=over 4 - -=item * - -B<300> - [$n] references to ID [$id] - -The specified ID was referenced $n times. - - -=back - -=head2 Not checked - -The following errors are already checked by L (expat) and -are currently not checked by XML::Checker: - -(?? TODO - add more info) - -=over 4 - -=item root element is missing - -L (expat) throws 'no element found' exception. -See no_root.xml for an example. - -=back - -=head1 XML::Checker - -XML::Checker can be easily plugged into your application. -It uses mostly the same style of event handlers (or callbacks) as L. -See L manual page for descriptions of most handlers. - -It also implements PerlSAX style event handlers. See L. - -Currently, the XML::Checker object is a blessed hash with the following -(potentially useful) entries: - - $checker->{RootElement} - root element name as found in the DOCTYPE - $checker->{NOTATION}->{$notation} - is 1 if the NOTATION was defined - $checker->{ENTITY}->{$name} - contains the (first) ENTITY value if defined - $checker->{Unparsed}->{$entity} - is 1 if the unparsed ENTITY was defined - $checker->{ID}->{$id} - is 1 if the ID was defined - $checker->{IDREF}->{$id} - number of times the ID was referenced - - # Less useful: - $checker->{ERule}->{$tag} - the ELEMENT rules by Element tag name - $checker->{ARule}->{$tag} - the ATTLIST rules by Element tag name - $checker->{Context} - context stack used internally - $checker->{CurrARule} - current ATTLIST rule for the current Element - -=head2 XML:Checker methods - -This section is only interesting when using XML::Checker directly. -XML::Checker supports most event handlers that L supports with minor -differences. Note that the XML::Checker event handler methods are -instance methods and not static, so don't forget to call them like this, -without passing $expat (as in the L) handlers: - - $checker->Start($tagName); - -=over 4 - -=item Constructor - - $checker = new XML::Checker; - $checker = new XML::Checker (%user_args); - -User data may be stored by client applications. Only $checker->{User} is -guaranteed not to clash with internal hash keys. - -=item getRootElement () - - $tagName = $checker->getRootElement; - -Returns the root element name as found in the DOCTYPE - -=back - -=head2 Expat interface - -XML::Checker supports what I call the I interface, which is -the collection of methods you normally specify as the callback handlers -when using XML::Parser. - -Only the following L handlers are currently supported: -Init, Final, Char, Start, End, Element, Attlist, Doctype, -Unparsed, Entity, Notation. - -I don't know how to correctly support the Default handler for all L -releases. The Start handler works a little different (see below) and I -added Attr, InitDomElem, FinalDomElem, CDATA and EntityRef handlers. -See L for a description of the handlers that are not listed below. - -Note that this interface may disappear, when the PerlSAX interface stabilizes. - -=over 4 - -=item Start ($tag) - - $checker->Start($tag); - -Call this when an Element with the specified $tag name is encountered. -Different from the Start handler in L, in that no attributes -are passed in (use the Attr handler for those.) - -=item Attr ($tag, $attrName, $attrValue, $isSpecified) - - $checker->Attr($tag,$attrName,$attrValue,$spec); - -Checks an attribute with the specified $attrName and $attrValue against the -ATTLIST definition of the element with the specified $tag name. -$isSpecified means whether the attribute was specified (1) or defaulted (0). - -=item EndAttr () - - $checker->EndAttr; - -This should be called after all attributes are passed with Attr(). -It will check which of the #REQUIRED attributes were not specified and generate -the appropriate error (159) for each one that is missing. - -=item CDATA ($text) - - $checker->CDATA($text); - -This should be called whenever CDATASections are encountered. -Similar to Char handler (but might perform different checks later...) - -=item EntityRef ($entity, $isParameterEntity) - - $checker->EntityRef($entity,$isParameterEntity); - -Checks the ENTITY reference. Set $isParameterEntity to 1 for -entity references that start with '%'. - -=item InitDomElem () and FinalDomElem () - -Used by XML::DOM::Element::check() to initialize (and cleanup) the -context stack when checking a single element. - -=back - -=head2 PerlSAX interface - -XML::Checker now also supports the PerlSAX interface, so you can use XML::Checker -wherever you use PerlSAX handlers. - -XML::Checker implements the following methods: start_document, end_document, -start_element, end_element, characters, processing_instruction, comment, -start_cdata, end_cdata, entity_reference, notation_decl, unparsed_entity_decl, -entity_decl, element_decl, attlist_decl, doctype_decl, xml_decl - -Not implemented: set_document_locator, ignorable_whitespace - -See PerlSAX.pod for details. (It is called lib/PerlSAX.pod in the libxml-perl -distribution which can be found at CPAN.) - -=head1 CAVEATS - -This is an alpha release. Almost everything is subject to change. - -=head1 AUTHOR - -Send bug reports, hints, tips, suggestions to Enno Derksen at ->. - -=head1 SEE ALSO - -The home page of XML::Checker at L - -The XML spec (Extensible Markup Language 1.0) at L - -The L and L manual pages. - -The other packages that come with XML::Checker: -L, L - -The DOM Level 1 specification at L - -The PerlSAX specification. It is currently in lib/PerlSAX.pod in the -libxml-perl distribution by Ken MacLeod. - -The original SAX specification (Simple API for XML) can be found at -L and L