diff -r 228efacd68af -r 989c70555820 common/tools/raptor/XML/SAX/PurePerl.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/common/tools/raptor/XML/SAX/PurePerl.pm Tue Aug 04 14:40:11 2009 +0100 @@ -0,0 +1,744 @@ +# $Id: PurePerl.pm,v 1.21 2007/02/07 09:33:50 grant Exp $ + +package XML::SAX::PurePerl; + +use strict; +use vars qw/$VERSION/; + +$VERSION = '0.91'; + +use XML::SAX::PurePerl::Productions qw($Any $CharMinusDash $SingleChar); +use XML::SAX::PurePerl::Reader; +use XML::SAX::PurePerl::EncodingDetect (); +use XML::SAX::Exception; +use XML::SAX::PurePerl::DocType (); +use XML::SAX::PurePerl::DTDDecls (); +use XML::SAX::PurePerl::XMLDecl (); +use XML::SAX::DocumentLocator (); +use XML::SAX::Base (); +use XML::SAX qw(Namespaces); +use XML::NamespaceSupport (); +use IO::File; + +if ($] < 5.006) { + require XML::SAX::PurePerl::NoUnicodeExt; +} +else { + require XML::SAX::PurePerl::UnicodeExt; +} + +use vars qw(@ISA); +@ISA = ('XML::SAX::Base'); + +my %int_ents = ( + amp => '&', + lt => '<', + gt => '>', + quot => '"', + apos => "'", + ); + +my $xmlns_ns = "http://www.w3.org/2000/xmlns/"; +my $xml_ns = "http://www.w3.org/XML/1998/namespace"; + +use Carp; +sub _parse_characterstream { + my $self = shift; + my ($fh) = @_; + confess("CharacterStream is not yet correctly implemented"); + my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); + return $self->_parse($reader); +} + +sub _parse_bytestream { + my $self = shift; + my ($fh) = @_; + my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); + return $self->_parse($reader); +} + +sub _parse_string { + my $self = shift; + my ($str) = @_; + my $reader = XML::SAX::PurePerl::Reader::String->new($str); + return $self->_parse($reader); +} + +sub _parse_systemid { + my $self = shift; + my ($uri) = @_; + my $reader = XML::SAX::PurePerl::Reader::URI->new($uri); + return $self->_parse($reader); +} + +sub _parse { + my ($self, $reader) = @_; + + $reader->public_id($self->{ParseOptions}{Source}{PublicId}); + $reader->system_id($self->{ParseOptions}{Source}{SystemId}); + + $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1}); + + $self->set_document_locator( + XML::SAX::DocumentLocator->new( + sub { $reader->public_id }, + sub { $reader->system_id }, + sub { $reader->line }, + sub { $reader->column }, + sub { $reader->get_encoding }, + sub { $reader->get_xml_version }, + ), + ); + + $self->start_document({}); + + if (defined $self->{ParseOptions}{Source}{Encoding}) { + $reader->set_encoding($self->{ParseOptions}{Source}{Encoding}); + } + else { + $self->encoding_detect($reader); + } + + # parse a document + $self->document($reader); + + return $self->end_document({}); +} + +sub parser_error { + my $self = shift; + my ($error, $reader) = @_; + +# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n"); + my $exception = XML::SAX::Exception::Parse->new( + Message => $error, + ColumnNumber => $reader->column, + LineNumber => $reader->line, + PublicId => $reader->public_id, + SystemId => $reader->system_id, + ); + + $self->fatal_error($exception); + $exception->throw; +} + +sub document { + my ($self, $reader) = @_; + + # document ::= prolog element Misc* + + $self->prolog($reader); + $self->element($reader) || + $self->parser_error("Document requires an element", $reader); + + while(length($reader->data)) { + $self->Misc($reader) || + $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader); + } +} + +sub prolog { + my ($self, $reader) = @_; + + $self->XMLDecl($reader); + + # consume all misc bits + 1 while($self->Misc($reader)); + + if ($self->doctypedecl($reader)) { + while (length($reader->data)) { + $self->Misc($reader) || last; + } + } +} + +sub element { + my ($self, $reader) = @_; + + return 0 unless $reader->match('<'); + + my $name = $self->Name($reader) || $self->parser_error("Invalid element name", $reader); + + my %attribs; + + while( my ($k, $v) = $self->Attribute($reader) ) { + $attribs{$k} = $v; + } + + my $have_namespaces = $self->get_feature(Namespaces); + + # Namespace processing + $self->{NSHelper}->push_context; + my @new_ns; +# my %attrs = @attribs; +# while (my ($k,$v) = each %attrs) { + if ($have_namespaces) { + while ( my ($k, $v) = each %attribs ) { + if ($k =~ m/^xmlns(:(.*))?$/) { + my $prefix = $2 || ''; + $self->{NSHelper}->declare_prefix($prefix, $v); + my $ns = + { + Prefix => $prefix, + NamespaceURI => $v, + }; + push @new_ns, $ns; + $self->SUPER::start_prefix_mapping($ns); + } + } + } + + # Create element object and fire event + my %attrib_hash; + while (my ($name, $value) = each %attribs ) { + # TODO normalise value here + my ($ns, $prefix, $lname); + if ($have_namespaces) { + ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name); + } + $ns ||= ''; $prefix ||= ''; $lname ||= ''; + $attrib_hash{"{$ns}$lname"} = { + Name => $name, + LocalName => $lname, + Prefix => $prefix, + NamespaceURI => $ns, + Value => $value, + }; + } + + %attribs = (); # lose the memory since we recurse deep + + my ($ns, $prefix, $lname); + if ($self->get_feature(Namespaces)) { + ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name); + } + else { + $lname = $name; + } + $ns ||= ''; $prefix ||= ''; $lname ||= ''; + + # Process remainder of start_element + $self->skip_whitespace($reader); + my $have_content; + my $data = $reader->data(2); + if ($data =~ /^\/>/) { + $reader->move_along(2); + } + else { + $data =~ /^>/ or $self->parser_error("No close element tag", $reader); + $reader->move_along(1); + $have_content++; + } + + my $el = + { + Name => $name, + LocalName => $lname, + Prefix => $prefix, + NamespaceURI => $ns, + Attributes => \%attrib_hash, + }; + $self->start_element($el); + + # warn("($name\n"); + + if ($have_content) { + $self->content($reader); + + my $data = $reader->data(2); + $data =~ /^<\// or $self->parser_error("No close tag marker", $reader); + $reader->move_along(2); + my $end_name = $self->Name($reader); + $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader); + $self->skip_whitespace($reader); + $reader->match('>') or $self->parser_error("No close '>' on end tag", $reader); + } + + my %end_el = %$el; + delete $end_el{Attributes}; + $self->end_element(\%end_el); + + for my $ns (@new_ns) { + $self->end_prefix_mapping($ns); + } + $self->{NSHelper}->pop_context; + + return 1; +} + +sub content { + my ($self, $reader) = @_; + + while (1) { + $self->CharData($reader); + + my $data = $reader->data(2); + + if ($data =~ /^<\//) { + return 1; + } + elsif ($data =~ /^&/) { + $self->Reference($reader) or $self->parser_error("bare & not allowed in content", $reader); + next; + } + elsif ($data =~ /^CDSect($reader) + or + $self->Comment($reader)) + and next; + } + elsif ($data =~ /^<\?/) { + $self->PI($reader) and next; + } + elsif ($data =~ /^element($reader) and next; + } + last; + } + + return 1; +} + +sub CDSect { + my ($self, $reader) = @_; + + my $data = $reader->data(9); + return 0 unless $data =~ /^move_along(9); + + $self->start_cdata({}); + + $data = $reader->data; + while (1) { + $self->parser_error("EOF looking for CDATA section end", $reader) + unless length($data); + + if ($data =~ /^(.*?)\]\]>/s) { + my $chars = $1; + $reader->move_along(length($chars) + 3); + $self->characters({Data => $chars}); + last; + } + else { + $self->characters({Data => $data}); + $reader->move_along(length($data)); + $data = $reader->data; + } + } + $self->end_cdata({}); + return 1; +} + +sub CharData { + my ($self, $reader) = @_; + + my $data = $reader->data; + + while (1) { + return unless length($data); + + if ($data =~ /^([^<&]*)[<&]/s) { + my $chars = $1; + $self->parser_error("String ']]>' not allowed in character data", $reader) + if $chars =~ /\]\]>/; + $reader->move_along(length($chars)); + $self->characters({Data => $chars}) if length($chars); + last; + } + else { + $self->characters({Data => $data}); + $reader->move_along(length($data)); + $data = $reader->data; + } + } +} + +sub Misc { + my ($self, $reader) = @_; + if ($self->Comment($reader)) { + return 1; + } + elsif ($self->PI($reader)) { + return 1; + } + elsif ($self->skip_whitespace($reader)) { + return 1; + } + + return 0; +} + +sub Reference { + my ($self, $reader) = @_; + + return 0 unless $reader->match('&'); + + my $data = $reader->data; + + if ($data =~ /^#x([0-9a-fA-F]+);/) { + my $ref = $1; + $reader->move_along(length($ref) + 3); + my $char = chr_ref(hex($ref)); + $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader) + unless $char =~ /$SingleChar/o; + $self->characters({ Data => $char }); + return 1; + } + elsif ($data =~ /^#([0-9]+);/) { + my $ref = $1; + $reader->move_along(length($ref) + 2); + my $char = chr_ref($ref); + $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader) + unless $char =~ /$SingleChar/o; + $self->characters({ Data => $char }); + return 1; + } + else { + # EntityRef + my $name = $self->Name($reader) + || $self->parser_error("Invalid name in entity", $reader); + $reader->match(';') or $self->parser_error("No semi-colon found after entity name", $reader); + + # warn("got entity: \&$name;\n"); + + # expand it + if ($self->_is_entity($name)) { + + if ($self->_is_external($name)) { + my $value = $self->_get_entity($name); + my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value); + $self->encoding_detect($ent_reader); + $self->extParsedEnt($ent_reader); + } + else { + my $value = $self->_stringify_entity($name); + my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value); + $self->content($ent_reader); + } + return 1; + } + elsif ($name =~ /^(?:amp|gt|lt|quot|apos)$/) { + $self->characters({ Data => $int_ents{$name} }); + return 1; + } + else { + $self->parser_error("Undeclared entity", $reader); + } + } +} + +sub AttReference { + my ($self, $name, $reader) = @_; + if ($name =~ /^#x([0-9a-fA-F]+)$/) { + my $chr = chr_ref(hex($1)); + $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader); + return $chr; + } + elsif ($name =~ /^#([0-9]+)$/) { + my $chr = chr_ref($1); + $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader); + return $chr; + } + else { + if ($self->_is_entity($name)) { + if ($self->_is_external($name)) { + $self->parser_error("No external entity references allowed in attribute values", $reader); + } + else { + my $value = $self->_stringify_entity($name); + return $value; + } + } + elsif ($name =~ /^(?:amp|lt|gt|quot|apos)$/) { + return $int_ents{$name}; + } + else { + $self->parser_error("Undeclared entity '$name'", $reader); + } + } +} + +sub extParsedEnt { + my ($self, $reader) = @_; + + $self->TextDecl($reader); + $self->content($reader); +} + +sub _is_external { + my ($self, $name) = @_; +# TODO: Fix this to use $reader to store the entities perhaps. + if ($self->{ParseOptions}{external_entities}{$name}) { + return 1; + } + return ; +} + +sub _is_entity { + my ($self, $name) = @_; +# TODO: ditto above + if (exists $self->{ParseOptions}{entities}{$name}) { + return 1; + } + return 0; +} + +sub _stringify_entity { + my ($self, $name) = @_; +# TODO: ditto above + if (exists $self->{ParseOptions}{expanded_entity}{$name}) { + return $self->{ParseOptions}{expanded_entity}{$name}; + } + # expand + my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name}); + my $ent = ''; + while(1) { + my $data = $reader->data; + $ent .= $data; + $reader->move_along(length($data)) or last; + } + return $self->{ParseOptions}{expanded_entity}{$name} = $ent; +} + +sub _get_entity { + my ($self, $name) = @_; +# TODO: ditto above + return $self->{ParseOptions}{entities}{$name}; +} + +sub skip_whitespace { + my ($self, $reader) = @_; + + my $data = $reader->data; + + my $found = 0; + while ($data =~ s/^([\x20\x0A\x0D\x09]*)//) { + last unless length($1); + $found++; + $reader->move_along(length($1)); + $data = $reader->data; + } + + return $found; +} + +sub Attribute { + my ($self, $reader) = @_; + + $self->skip_whitespace($reader) || return; + + my $data = $reader->data(2); + return if $data =~ /^\/?>/; + + if (my $name = $self->Name($reader)) { + $self->skip_whitespace($reader); + $reader->match('=') or $self->parser_error("No '=' in Attribute", $reader); + $self->skip_whitespace($reader); + my $value = $self->AttValue($reader); + + if (!$self->cdata_attrib($name)) { + $value =~ s/^\x20*//; # discard leading spaces + $value =~ s/\x20*$//; # discard trailing spaces + $value =~ s/ {1,}/ /g; # all >1 space to single space + } + + return $name, $value; + } + + return; +} + +sub cdata_attrib { + # TODO implement this! + return 1; +} + +sub AttValue { + my ($self, $reader) = @_; + + my $quote = $self->quote($reader); + + my $value = ''; + + while (1) { + my $data = $reader->data; + $self->parser_error("EOF found while looking for the end of attribute value", $reader) + unless length($data); + if ($data =~ /^([^$quote]*)$quote/) { + $reader->move_along(length($1) + 1); + $value .= $1; + last; + } + else { + $value .= $data; + $reader->move_along(length($data)); + } + } + + if ($value =~ /parser_error("< character not allowed in attribute values", $reader); + } + + $value =~ s/[\x09\x0A\x0D]/\x20/g; + $value =~ s/&(#(x[0-9a-fA-F]+)|([0-9]+)|\w+);/$self->AttReference($1, $reader)/geo; + + return $value; +} + +sub Comment { + my ($self, $reader) = @_; + + my $data = $reader->data(4); + if ($data =~ /^/s) { + $comment_str .= $1; + $self->parser_error("Invalid comment (dash)", $reader) if $comment_str =~ /-$/; + $reader->move_along(length($1) + 3); + last; + } + else { + $comment_str .= $data; + $reader->move_along(length($data)); + } + } + + $self->comment({ Data => $comment_str }); + + return 1; + } + return 0; +} + +sub PI { + my ($self, $reader) = @_; + + my $data = $reader->data(2); + + if ($data =~ /^<\?/) { + $reader->move_along(2); + my ($target, $data); + $target = $self->Name($reader) || + $self->parser_error("PI has no target", $reader); + if ($self->skip_whitespace($reader)) { + $target = ''; + while (1) { + my $data = $reader->data; + $self->parser_error("End of data seen while looking for close PI marker", $reader) + unless length($data); + if ($data =~ /^(.*?)\?>/s) { + $target .= $1; + $reader->move_along(length($1) + 2); + last; + } + else { + $target .= $data; + $reader->move_along(length($data)); + } + } + } + else { + my $data = $reader->data(2); + $data =~ /^\?>/ or $self->parser_error("PI closing sequence not found", $reader); + $reader->move_along(2); + } + $self->processing_instruction({ Target => $target, Data => $data }); + + return 1; + } + return 0; +} + +sub Name { + my ($self, $reader) = @_; + + my $name = ''; + while(1) { + my $data = $reader->data; + return unless length($data); + $data =~ /^([^\s>\/&\?;=<\)\(\[\],\%\#\!\*]*)/ or return; + $name .= $1; + my $len = length($1); + $reader->move_along($len); + last if ($len != length($data)); + } + + return unless length($name); + + $name =~ /$NameChar/o or $self->parser_error("Name <$name> does not match NameChar production", $reader); + + return $name; +} + +sub quote { + my ($self, $reader) = @_; + + my $data = $reader->data; + + $data =~ /^(['"])/ or $self->parser_error("Invalid quote token", $reader); + $reader->move_along(1); + return $1; +} + +1; +__END__ + +=head1 NAME + +XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface + +=head1 SYNOPSIS + + use XML::Handler::Foo; + use XML::SAX::PurePerl; + my $handler = XML::Handler::Foo->new(); + my $parser = XML::SAX::PurePerl->new(Handler => $handler); + $parser->parse_uri("myfile.xml"); + +=head1 DESCRIPTION + +This module implements an XML parser in pure perl. It is written around the +upcoming perl 5.8's unicode support and support for multiple document +encodings (using the PerlIO layer), however it has been ported to work with +ASCII/UTF8 documents under lower perl versions. + +The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in +the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a +better location soon. + +Please refer to the SAX2 documentation for how to use this module - it is merely a +front end to SAX2, and implements nothing that is not in that spec (or at least tries +not to - please email me if you find errors in this implementation). + +=head1 BUGS + +XML::SAX::PurePerl is B. Very slow. I suggest you use something else +in fact. However it is great as a fallback parser for XML::SAX, where the +user might not be able to install an XS based parser or C library. + +Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations, +though the code is in place to start doing this. Also parsing parameter entity +references is causing me much confusion, since it's not exactly what I would call +trivial, or well documented in the XML grammar. XML documents with internal subsets +are likely to fail. + +I am however trying to work towards full conformance using the Oasis test suite. + +=head1 AUTHOR + +Matt Sergeant, matt@sergeant.org. Copyright 2001. + +Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com. + +=head1 LICENSE + +This is free software. You may use it or redistribute it under the same terms as +Perl 5.7.2 itself. + +=cut +