dummy_foundation/lib/XML/Checker/Parser.pm
changeset 0 02cd6b52f378
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dummy_foundation/lib/XML/Checker/Parser.pm	Thu May 28 10:10:03 2009 +0100
@@ -0,0 +1,683 @@
+package XML::Checker::Parser;
+use strict;
+use XML::Parser;
+use XML::Checker;
+
+use vars qw( @ISA @InterceptedHandlers @SGML_SEARCH_PATH %URI_MAP
+	     $_checker $_prevFAIL
+	     $_Init $_Final $_Char $_Start $_End $_Element $_Attlist 
+	     $_Doctype $_Unparsed $_Notation $_Entity $_skipInsignifWS
+	     $_EndOfDoc
+	   );
+
+@ISA = qw( XML::Parser );
+
+@InterceptedHandlers = qw( Init Final Char Start End Element Attlist 
+			   Doctype Unparsed Notation Entity );
+
+# Where to search for external DTDs (in local file system)
+@SGML_SEARCH_PATH = ();
+
+# Where to search for external DTDs as referred to by public ID in a 
+# <!DOCTYPE ...> statement, e.g. "-//W3C//DTD HTML 4.0//EN"
+# E.g. it could map "-//W3C//DTD HTML 4.0//EN" to "file:/user/html.dtd"
+%URI_MAP = ();
+
+sub new
+{
+    my ($class, %args) = @_;
+
+    my $super = new XML::Parser (%args);
+    $super->{Checker} = new XML::Checker (%args);
+
+    my %handlers = %{$super->{Handlers}};
+
+    # Don't need Comment handler - assuming comments are allowed anywhere
+#?? What should Default handler do?
+#?? Check XMLDecl, ExternEnt, Proc?  No, for now.
+#?? Add CdataStart, CdataEnd support?
+
+    for (@InterceptedHandlers)
+    {
+	my $func = "XML::Checker::Parser::$_";
+	$handlers{$_} = \&$func;
+    }
+
+    $super->{UserHandlers} = $super->{Handlers};
+    $super->{Handlers} = \%handlers;
+
+    bless $super, $class;
+}
+
+sub getChecker
+{
+    $_[0]->{Checker}
+}
+
+sub parse
+{
+    my $self = shift;
+    my $uh = $self->{UserHandlers};
+
+    local $_checker = $self->{Checker};
+
+    local $_Init = $uh->{Init};
+    local $_Final = $uh->{Final};
+    local $_Start = $uh->{Start};
+    local $_End = $uh->{End};
+    local $_Char = $uh->{Char};
+    local $_Element = $uh->{'Element'};
+    local $_Attlist = $uh->{'Attlist'};
+    local $_Doctype = $uh->{Doctype};
+    local $_Unparsed = $uh->{Unparsed};
+    local $_Notation = $uh->{Notation};
+    local $_Entity = $uh->{Entity};
+
+    local $_prevFAIL = $XML::Checker::FAIL;
+    local $XML::Checker::FAIL = \&fail_add_context;
+
+    local $XML::Checker::INSIGNIF_WS = 0;
+    local $_skipInsignifWS = $self->{SkipInsignifWS};
+
+    local $_EndOfDoc = 0;
+    
+    $self->SUPER::parse (@_);
+}
+
+my $LWP_USER_AGENT;
+sub set_LWP_UserAgent	# static
+{
+    $LWP_USER_AGENT = shift;
+}
+
+sub load_URL		# static
+{
+    my ($url, $lwp_user_agent) = @_;
+    my $result;
+
+    # Read the file from the web with LWP.
+    #
+    # Note that we read in the entire file, which may not be ideal
+    # for large files. LWP::UserAgent also provides a callback style
+    # request, which we could convert to a stream with a fork()...
+    
+    my $response;
+    eval
+    {
+	use LWP::UserAgent;
+	
+	my $ua = $lwp_user_agent;
+	unless (defined $ua)
+	{
+	    unless (defined $LWP_USER_AGENT)
+	    {
+		$LWP_USER_AGENT = LWP::UserAgent->new;
+		
+		# Load proxy settings from environment variables, i.e.:
+		# http_proxy, ftp_proxy, no_proxy etc. (see LWP::UserAgent(3))
+		# You need these to go thru firewalls.
+		$LWP_USER_AGENT->env_proxy;
+	    }
+	    $ua = $LWP_USER_AGENT;
+	}
+	my $req = new HTTP::Request 'GET', $url;
+	$response = $LWP_USER_AGENT->request ($req);
+	$result = $response->content;
+    };
+    if ($@)
+    {
+	die "Couldn't load URL [$url] with LWP: $@";
+    }
+    if (!$result)
+    {
+	my $message = $response->as_string;
+	die "Couldn't load URL [$url] with LWP: $message";
+    }
+    return $result;
+}
+
+sub parsefile
+{
+    my $self = shift;
+    my $url = shift;
+
+    # Any other URL schemes?
+    if ($url =~ /^(https?|ftp|wais|gopher|file):/)
+    {
+	my $xml = load_URL ($url, $self->{LWP_UserAgent});
+	my $result;
+	eval
+	{
+	    # Parse the result of the HTTP request
+	    $result = $self->parse ($xml, @_);
+	};
+	if ($@)
+	{
+	    die "Couldn't parsefile [$url]: $@";
+	}
+	return $result;
+    }
+    else
+    {
+	return $self->SUPER::parsefile ($url, @_);
+    }
+}
+
+sub Init
+{
+    my $expat = shift;
+    $_checker->{Expat} = $expat;
+
+    $_checker->Init (@_);
+    &$_Init ($expat) if $_Init;
+}
+
+sub Final
+{
+    my $expat = shift;
+    $_EndOfDoc = 1;
+
+    $_checker->Final (@_);
+    my $result = &$_Final ($expat) if $_Final;
+
+    # Decouple Expat from Checker
+    delete $_checker->{Expat};
+
+    # NOTE: Checker is not decoupled
+    return $result;
+}
+
+sub Start
+{
+    my ($expat, $tag, @attr) = @_;
+
+    $_checker->Start ($tag);
+
+    my $num_spec = $expat->specified_attr;
+    for (my $i = 0; $i < @attr; $i++)
+    {
+	my $spec = ($i < $num_spec);
+	my $attr = $attr[$i];
+	my $val = $attr[++$i];
+
+#	print "--- $tag $attr $val $spec\n";
+	$_checker->Attr ($tag, $attr, $val, $spec);
+    }
+    $_checker->EndAttr;
+
+    &$_Start ($expat, $tag, @attr) if $_Start;
+}
+
+sub End
+{
+    my $expat = shift;
+    $_checker->End (@_);
+    &$_End ($expat, @_) if $_End;
+}
+
+sub Char
+{
+    my $expat = shift;
+    $_checker->Char (@_);
+    &$_Char ($expat, @_) 
+	if $_Char && !($XML::Checker::INSIGNIF_WS && $_skipInsignifWS);
+    # Skip insignificant whitespace
+}
+
+sub Element
+{
+    my $expat = shift;
+    $_checker->Element (@_);
+    &$_Element ($expat, @_) if $_Element;
+}
+
+sub Attlist
+{
+    my $expat = shift;
+    $_checker->Attlist (@_);
+    &$_Attlist ($expat, @_) if $_Attlist;
+}
+
+
+sub Doctype
+{
+    my $expat = shift;
+    my ($name, $sysid, $pubid, $internal) = @_;
+
+    my $dtd;
+    unless ($_checker->{SkipExternalDTD}) 
+    {
+	if ($sysid)
+	{
+	    # External DTD...
+	    
+	    #?? I'm not sure if we should die here or keep going?	    
+	    $dtd = load_DTD ($sysid, $expat->{LWP_UserAgent});
+	}
+	elsif ($pubid)
+	{
+	    $dtd = load_DTD ($pubid, $expat->{LWP_UserAgent});
+	}
+    }
+
+    if (defined $dtd)
+    {
+#?? what about passing ProtocolEncoding, Namespaces, Stream_Delimiter ?
+	my $parser = new XML::Parser (
+	    Checker => $_checker, 
+	    ErrorContext => $expat->{ErrorContext},
+	    Handlers => { 
+		Entity => \&XML::Checker::Parser::ExternalDTD::Entity,
+		Notation => \&XML::Checker::Parser::ExternalDTD::Notation,
+		Element => \&XML::Checker::Parser::ExternalDTD::Element,
+		Attlist => \&XML::Checker::Parser::ExternalDTD::Attlist,
+		Unparsed => \&XML::Checker::Parser::ExternalDTD::Unparsed,
+	    });
+
+	eval 
+	{
+	    $parser->parse ("<!DOCTYPE $name SYSTEM '$sysid' [\n$dtd\n]>\n<$name/>");
+	};
+	if ($@)
+	{
+	    die "Couldn't parse contents of external DTD <$sysid> :$@";
+	}
+    }
+    $_checker->Doctype (@_);
+    &$_Doctype ($expat, @_) if $_Doctype;
+}
+
+sub Unparsed
+{
+    my $expat = shift;
+    $_checker->Unparsed (@_);
+    &$_Unparsed ($expat, @_) if $_Unparsed;
+}
+
+sub Entity
+{
+    my $expat = shift;
+    $_checker->Entity (@_);
+    &$_Entity ($expat, @_) if $_Entity;
+}
+
+sub Notation
+{
+    my $expat = shift;
+    $_checker->Notation (@_);
+    &$_Notation ($expat, @_) if $_Notation;
+}
+
+sub Default
+{
+#?? what can I check here?
+#    print "Default handler got[" . join (", ", @_) . "]";
+}
+
+#sub XMLDecl
+#{
+#?? support later?
+#}
+
+sub setHandlers
+{
+    my ($self, %h) = @_;
+
+    for my $name (@InterceptedHandlers)
+    {
+	if (exists $h{$name})
+	{
+	    eval "\$_$name = \$h{$name}";
+	    delete $h{$name};
+	}
+    }
+
+    # Pass remaining handlers to the parent class (XML::Parser)
+    $self->SUPER::setHandlers (%h);
+}
+
+# Add (line, column, byte) to error context (unless it's EOF)
+sub fail_add_context	# static
+{
+    my $e = $_checker->{Expat};
+
+    my $byte = $e->current_byte;	# -1 means: end of XML document
+    if ($byte != -1 && !$_EndOfDoc)
+    {
+	push @_, (line => $e->current_line, 
+		  column => $e->current_column, 
+		  byte => $byte);
+    }
+    &$_prevFAIL (@_);
+}
+
+#-------- STATIC METHODS related to External DTDs ---------------------------
+
+sub load_DTD		# static
+{
+    my ($sysid, $lwp_user_agent) = @_;
+
+    # See if it is defined in the %URI_MAP
+    # (Public IDs are stored here, e.g. "-//W3C//DTD HTML 4.0//EN")
+    if (exists $URI_MAP{$sysid})
+    {
+	$sysid = $URI_MAP{$sysid};
+    }
+    elsif ($sysid !~ /^\w+:/) 
+    {
+	# Prefix the sysid with 'file:' if it has no protocol identifier
+	unless ($sysid =~ /^\//) 
+	{
+	    # Not an absolute path. See if it's in SGML_SEARCH_PATH.
+	    my $relative_sysid = $sysid;
+
+	    $sysid = find_in_sgml_search_path ($sysid);
+	    if (! $sysid) 
+	    {
+		if ($ENV{'SGML_SEARCH_PATH'}) 
+		{
+		    die "Couldn't find external DTD [$relative_sysid] in SGML_SEARCH_PATH ($ENV{'SGML_SEARCH_PATH'})";
+		}
+		else 
+		{
+		    die "Couldn't find external DTD [$relative_sysid], may be you should set SGML_SEARCH_PATH";
+		}
+	    }
+	}
+	$sysid = "file:$sysid";
+    }
+
+    return load_URL ($sysid, $lwp_user_agent);
+}
+
+sub map_uri			# static
+{
+    %URI_MAP = (%URI_MAP, @_);
+}
+
+sub set_sgml_search_path	# static
+{
+    @SGML_SEARCH_PATH = @_;
+}
+
+sub find_in_sgml_search_path	# static
+{
+    my $file = shift;
+
+    my @dirs = @SGML_SEARCH_PATH;
+    unless (@dirs)
+    {
+	my $path = $ENV{SGML_SEARCH_PATH};
+	if ($path)
+	{
+	    @dirs = split (':', $path);
+	}
+	else
+	{
+	    my $home = $ENV{HOME};
+	    @dirs = (".", "$home/.sgml", "/usr/lib/sgml", "/usr/share/sgml");
+	}
+    }
+
+    for my $directory (@dirs) 
+    {
+	if (-e "$directory/$file") 
+	{
+	    return "$directory/$file";
+	}
+    }
+    return undef;
+}
+
+package XML::Checker::Parser::ExternalDTD;
+
+sub Element {
+	my $expat = shift;
+	$expat->{Checker}->Element(@_);
+}
+
+sub Attlist {
+	my $expat = shift;
+	$expat->{Checker}->Attlist(@_);
+}
+
+sub Unparsed {
+	my $expat = shift;
+	$expat->{Checker}->Unparsed(@_);
+}
+
+sub Notation {
+	my $expat = shift;
+	$expat->{Checker}->Notation(@_);
+}
+
+sub Entity {
+	my $expat = shift;
+#	print "Entity: $expat\n";
+	$expat->{Checker}->Entity(@_);
+}
+
+1; # package return code
+
+__END__
+
+=head1 NAME
+
+XML::Checker::Parser - an XML::Parser that validates at parse time
+
+=head1 SYNOPSIS
+
+ use XML::Checker::Parser;
+
+ my %expat_options = (KeepCDATA => 1, 
+		      Handlers => [ Unparsed => \&my_Unparsed_handler ]);
+ my $parser = new XML::Checker::Parser (%expat_options);
+
+ eval {
+     local $XML::Checker::FAIL = \&my_fail;
+     $parser->parsefile ("fail.xml");
+ };
+ if ($@) {
+     # Either XML::Parser (expat) threw an exception or my_fail() died.
+     ... your error handling code here ...
+ }
+
+ # Throws an exception (with die) when an error is encountered, this
+ # will stop the parsing process.
+ # Don't die if a warning or info message is encountered, just print a message.
+ sub my_fail {
+     my $code = shift;
+     die XML::Checker::error_string ($code, @_) if $code < 200;
+     XML::Checker::print_error ($code, @_);
+ }
+
+=head1 DESCRIPTION
+
+XML::Checker::Parser extends L<XML::Parser>
+
+I hope the example in the SYNOPSIS says it all, just use 
+L<XML::Checker::Parser> as if it were an XML::Parser. 
+See L<XML::Parser> for the supported (expat) options.
+
+You can also derive your parser from XML::Checker::Parser instead of 
+from XML::Parser. All you should have to do is replace:
+
+ package MyParser;
+ @ISA = qw( XML::Parser );
+
+with:
+
+ package MyParser;
+ @ISA = qw( XML::Checker::Parser );
+
+=head1 XML::Checker::Parser constructor
+
+ $parser = new XML::Checker::Parser (SkipExternalDTD => 1, SkipInsignifWS => 1);
+
+The constructor takes the same parameters as L<XML::Parser> with the following additions:
+
+=over 4
+
+=item SkipExternalDTD
+
+By default, it will try to load external DTDs using LWP. You can disable this
+by setting SkipExternalDTD to 1. See L<External DTDs|"External DTDs"> for details.
+
+=item SkipInsignifWS
+
+By default, it will treat insignificant whitespace as regular Char data.
+By setting SkipInsignifWS to 1, the user Char handler will not be called
+if insignificant whitespace is encountered. 
+See L<XML::Checker/INSIGNIFICANT_WHITESPACE> for details.
+
+=item LWP_UserAgent
+
+When calling parsefile() with a URL (instead of a filename) or when loading
+external DTDs, we use LWP to download the
+remote file. By default it will use a L<LWP::UserAgent> that is created as follows:
+
+ use LWP::UserAgent;
+ $LWP_USER_AGENT = LWP::UserAgent->new;
+ $LWP_USER_AGENT->env_proxy;
+
+Note that L<env_proxy> reads proxy settings from your environment variables, 
+which is what I need to do to get thru our firewall. 
+If you want to use a different LWP::UserAgent, you can either set
+it globally with:
+
+ XML::Checker::Parser::set_LWP_UserAgent ($my_agent);
+
+or, you can specify it for a specific XML::Checker::Parser by passing it to 
+the constructor:
+
+ my $parser = new XML::Checker::Parser (LWP_UserAgent => $my_agent);
+
+Currently, LWP is used when the filename (passed to parsefile) starts with one of
+the following URL schemes: http, https, ftp, wais, gopher, or file 
+(followed by a colon.) If I missed one, please let me know. 
+
+The LWP modules are part of libwww-perl which is available at CPAN.
+
+=back
+
+=head1 External DTDs
+
+XML::Checker::Parser will try to load and parse external DTDs that are 
+referenced in DOCTYPE definitions unless you set the B<SkipExternalDTD>
+option to 1 (the default setting is 0.) 
+See L<CAVEATS|"CAVEATS"> for details on what is not supported by XML::Checker::Parser.
+
+L<XML::Parser> (version 2.27 and up) does a much better job at reading external 
+DTDs, because recently external DTD parsing was added to expat.
+Make sure you set the L<XML::Parser> option B<ParseParamEnt> to 1 and the 
+XML::Checker::Parser option B<SkipExternalDTD> to 1. 
+(They can both be set in the XML::Checker::Parser constructor.)
+
+When external DTDs are parsed by XML::Checker::Parser, they are
+located in the following order:
+
+=over 4
+
+=item *
+
+With the %URI_MAP, which can be set using B<map_uri>.
+This hash maps external resource ids (like system ID's and public ID's)
+to full path URI's.
+It was meant to aid in resolving PUBLIC IDs found in DOCTYPE declarations 
+after the PUBLIC keyword, e.g.
+
+  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
+
+However, you can also use this to force L<XML::Checker> to read DTDs from a
+different URL than was specified (e.g. from the local file system for
+performance reasons.)
+
+=item * 
+
+on the Internet, if their system identifier starts with a protocol 
+(like http://...)
+
+=item *
+
+on the local disk, if their system identifier starts with a slash 
+(absolute path)
+
+=item *
+
+in the SGML_SEARCH_PATH, if their system identifier is a 
+relative file name. It will use @SGML_SEARCH_PATH if it was set with
+B<set_sgml_search_path()>, or the colon-separated $ENV{SGML_SEARCH_PATH},
+or (if that isn't set) the list (".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
+"/usr/share/sgml"), which includes the
+current directory, so it should do the right thing in most cases.
+
+=back
+
+=head2 Static methods related to External DTDs
+
+=over 4
+
+=item set_sgml_search_path (dir1, dir2, ...)
+
+External DTDs with relative file paths are looked up using the @SGML_SEARCH_PATH,
+which can be set with this method. If @SGML_SEARCH_PATH is never set, it
+will use the colon-separated $ENV{SGML_SEARCH_PATH} instead. If neither are set
+it uses the list: ".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
+"/usr/share/sgml".
+
+set_sgml_search_path is a static method.
+
+=item map_uri (pubid => uri, ...)
+
+To define the location of PUBLIC ids, as found in DOCTYPE declarations 
+after the PUBLIC keyword, e.g.
+
+  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
+
+call this method, e.g.
+
+  XML::Checker::Parser::map_uri (
+	"-//W3C//DTD HTML 4.0//EN" => "file:/user/html.dtd");
+
+See L<External DTDs|"External DTDs"> for more info.
+
+XML::Checker::Parser::map_uri is a static method.
+
+=back
+
+=head1 Switching user handlers at parse time
+
+You should be able to use setHandlers() just as in L<XML::Parser>.
+(Using setHandlers has not been tested yet.)
+
+=head1 Error handling
+
+XML::Checker::Parser routes the fail handler through 
+XML::Checker::Parser::fail_add_context() before calling your fail handler
+(i.e. the global fail handler: $XML::Checker::FAIL. 
+See L<XML::Checker/ERROR_HANDLING>.)
+It adds the (line, column, byte) information from L<XML::Parser> to the 
+error context (unless it was the end of the XML document.)
+
+=head1 Supported XML::Parser handlers
+
+Only the following L<XML::Parser> handlers are currently routed through
+L<XML::Checker>: Init, Final, Char, Start, End, Element, Attlist, Doctype,
+Unparsed, Notation.
+
+=head1 CAVEATS
+
+When using XML::Checker::Parser to parse external DTDs 
+(i.e. with SkipExternalDTD => 0),
+expect trouble when your external DTD contains parameter entities inside 
+declarations or conditional sections. The external DTD should probably have
+the same encoding as the orignal XML document.
+
+=head1 AUTHOR
+
+Send bug reports, hints, tips, suggestions to Enno Derksen at
+<F<enno@att.com>>.
+
+=head1 SEE ALSO
+
+L<XML::Checker> (L<XML::Checker/SEE_ALSO>), L<XML::Parser>