dummy_foundation/lib/XML/Checker/Parser.pm
changeset 4 60053dab7e2a
parent 3 8b87ea768cb8
child 5 842a773e65f2
child 6 c34a018f3291
--- a/dummy_foundation/lib/XML/Checker/Parser.pm	Wed Jun 03 18:33:51 2009 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,683 +0,0 @@
-package XML::Checker::Parser;
-use strict;
-use XML::Parser;
-use XML::Checker;
-
-use vars qw( @ISA @InterceptedHandlers @SGML_SEARCH_PATH %URI_MAP
-	     $_checker $_prevFAIL
-	     $_Init $_Final $_Char $_Start $_End $_Element $_Attlist 
-	     $_Doctype $_Unparsed $_Notation $_Entity $_skipInsignifWS
-	     $_EndOfDoc
-	   );
-
-@ISA = qw( XML::Parser );
-
-@InterceptedHandlers = qw( Init Final Char Start End Element Attlist 
-			   Doctype Unparsed Notation Entity );
-
-# Where to search for external DTDs (in local file system)
-@SGML_SEARCH_PATH = ();
-
-# Where to search for external DTDs as referred to by public ID in a 
-# <!DOCTYPE ...> statement, e.g. "-//W3C//DTD HTML 4.0//EN"
-# E.g. it could map "-//W3C//DTD HTML 4.0//EN" to "file:/user/html.dtd"
-%URI_MAP = ();
-
-sub new
-{
-    my ($class, %args) = @_;
-
-    my $super = new XML::Parser (%args);
-    $super->{Checker} = new XML::Checker (%args);
-
-    my %handlers = %{$super->{Handlers}};
-
-    # Don't need Comment handler - assuming comments are allowed anywhere
-#?? What should Default handler do?
-#?? Check XMLDecl, ExternEnt, Proc?  No, for now.
-#?? Add CdataStart, CdataEnd support?
-
-    for (@InterceptedHandlers)
-    {
-	my $func = "XML::Checker::Parser::$_";
-	$handlers{$_} = \&$func;
-    }
-
-    $super->{UserHandlers} = $super->{Handlers};
-    $super->{Handlers} = \%handlers;
-
-    bless $super, $class;
-}
-
-sub getChecker
-{
-    $_[0]->{Checker}
-}
-
-sub parse
-{
-    my $self = shift;
-    my $uh = $self->{UserHandlers};
-
-    local $_checker = $self->{Checker};
-
-    local $_Init = $uh->{Init};
-    local $_Final = $uh->{Final};
-    local $_Start = $uh->{Start};
-    local $_End = $uh->{End};
-    local $_Char = $uh->{Char};
-    local $_Element = $uh->{'Element'};
-    local $_Attlist = $uh->{'Attlist'};
-    local $_Doctype = $uh->{Doctype};
-    local $_Unparsed = $uh->{Unparsed};
-    local $_Notation = $uh->{Notation};
-    local $_Entity = $uh->{Entity};
-
-    local $_prevFAIL = $XML::Checker::FAIL;
-    local $XML::Checker::FAIL = \&fail_add_context;
-
-    local $XML::Checker::INSIGNIF_WS = 0;
-    local $_skipInsignifWS = $self->{SkipInsignifWS};
-
-    local $_EndOfDoc = 0;
-    
-    $self->SUPER::parse (@_);
-}
-
-my $LWP_USER_AGENT;
-sub set_LWP_UserAgent	# static
-{
-    $LWP_USER_AGENT = shift;
-}
-
-sub load_URL		# static
-{
-    my ($url, $lwp_user_agent) = @_;
-    my $result;
-
-    # Read the file from the web with LWP.
-    #
-    # Note that we read in the entire file, which may not be ideal
-    # for large files. LWP::UserAgent also provides a callback style
-    # request, which we could convert to a stream with a fork()...
-    
-    my $response;
-    eval
-    {
-	use LWP::UserAgent;
-	
-	my $ua = $lwp_user_agent;
-	unless (defined $ua)
-	{
-	    unless (defined $LWP_USER_AGENT)
-	    {
-		$LWP_USER_AGENT = LWP::UserAgent->new;
-		
-		# Load proxy settings from environment variables, i.e.:
-		# http_proxy, ftp_proxy, no_proxy etc. (see LWP::UserAgent(3))
-		# You need these to go thru firewalls.
-		$LWP_USER_AGENT->env_proxy;
-	    }
-	    $ua = $LWP_USER_AGENT;
-	}
-	my $req = new HTTP::Request 'GET', $url;
-	$response = $LWP_USER_AGENT->request ($req);
-	$result = $response->content;
-    };
-    if ($@)
-    {
-	die "Couldn't load URL [$url] with LWP: $@";
-    }
-    if (!$result)
-    {
-	my $message = $response->as_string;
-	die "Couldn't load URL [$url] with LWP: $message";
-    }
-    return $result;
-}
-
-sub parsefile
-{
-    my $self = shift;
-    my $url = shift;
-
-    # Any other URL schemes?
-    if ($url =~ /^(https?|ftp|wais|gopher|file):/)
-    {
-	my $xml = load_URL ($url, $self->{LWP_UserAgent});
-	my $result;
-	eval
-	{
-	    # Parse the result of the HTTP request
-	    $result = $self->parse ($xml, @_);
-	};
-	if ($@)
-	{
-	    die "Couldn't parsefile [$url]: $@";
-	}
-	return $result;
-    }
-    else
-    {
-	return $self->SUPER::parsefile ($url, @_);
-    }
-}
-
-sub Init
-{
-    my $expat = shift;
-    $_checker->{Expat} = $expat;
-
-    $_checker->Init (@_);
-    &$_Init ($expat) if $_Init;
-}
-
-sub Final
-{
-    my $expat = shift;
-    $_EndOfDoc = 1;
-
-    $_checker->Final (@_);
-    my $result = &$_Final ($expat) if $_Final;
-
-    # Decouple Expat from Checker
-    delete $_checker->{Expat};
-
-    # NOTE: Checker is not decoupled
-    return $result;
-}
-
-sub Start
-{
-    my ($expat, $tag, @attr) = @_;
-
-    $_checker->Start ($tag);
-
-    my $num_spec = $expat->specified_attr;
-    for (my $i = 0; $i < @attr; $i++)
-    {
-	my $spec = ($i < $num_spec);
-	my $attr = $attr[$i];
-	my $val = $attr[++$i];
-
-#	print "--- $tag $attr $val $spec\n";
-	$_checker->Attr ($tag, $attr, $val, $spec);
-    }
-    $_checker->EndAttr;
-
-    &$_Start ($expat, $tag, @attr) if $_Start;
-}
-
-sub End
-{
-    my $expat = shift;
-    $_checker->End (@_);
-    &$_End ($expat, @_) if $_End;
-}
-
-sub Char
-{
-    my $expat = shift;
-    $_checker->Char (@_);
-    &$_Char ($expat, @_) 
-	if $_Char && !($XML::Checker::INSIGNIF_WS && $_skipInsignifWS);
-    # Skip insignificant whitespace
-}
-
-sub Element
-{
-    my $expat = shift;
-    $_checker->Element (@_);
-    &$_Element ($expat, @_) if $_Element;
-}
-
-sub Attlist
-{
-    my $expat = shift;
-    $_checker->Attlist (@_);
-    &$_Attlist ($expat, @_) if $_Attlist;
-}
-
-
-sub Doctype
-{
-    my $expat = shift;
-    my ($name, $sysid, $pubid, $internal) = @_;
-
-    my $dtd;
-    unless ($_checker->{SkipExternalDTD}) 
-    {
-	if ($sysid)
-	{
-	    # External DTD...
-	    
-	    #?? I'm not sure if we should die here or keep going?	    
-	    $dtd = load_DTD ($sysid, $expat->{LWP_UserAgent});
-	}
-	elsif ($pubid)
-	{
-	    $dtd = load_DTD ($pubid, $expat->{LWP_UserAgent});
-	}
-    }
-
-    if (defined $dtd)
-    {
-#?? what about passing ProtocolEncoding, Namespaces, Stream_Delimiter ?
-	my $parser = new XML::Parser (
-	    Checker => $_checker, 
-	    ErrorContext => $expat->{ErrorContext},
-	    Handlers => { 
-		Entity => \&XML::Checker::Parser::ExternalDTD::Entity,
-		Notation => \&XML::Checker::Parser::ExternalDTD::Notation,
-		Element => \&XML::Checker::Parser::ExternalDTD::Element,
-		Attlist => \&XML::Checker::Parser::ExternalDTD::Attlist,
-		Unparsed => \&XML::Checker::Parser::ExternalDTD::Unparsed,
-	    });
-
-	eval 
-	{
-	    $parser->parse ("<!DOCTYPE $name SYSTEM '$sysid' [\n$dtd\n]>\n<$name/>");
-	};
-	if ($@)
-	{
-	    die "Couldn't parse contents of external DTD <$sysid> :$@";
-	}
-    }
-    $_checker->Doctype (@_);
-    &$_Doctype ($expat, @_) if $_Doctype;
-}
-
-sub Unparsed
-{
-    my $expat = shift;
-    $_checker->Unparsed (@_);
-    &$_Unparsed ($expat, @_) if $_Unparsed;
-}
-
-sub Entity
-{
-    my $expat = shift;
-    $_checker->Entity (@_);
-    &$_Entity ($expat, @_) if $_Entity;
-}
-
-sub Notation
-{
-    my $expat = shift;
-    $_checker->Notation (@_);
-    &$_Notation ($expat, @_) if $_Notation;
-}
-
-sub Default
-{
-#?? what can I check here?
-#    print "Default handler got[" . join (", ", @_) . "]";
-}
-
-#sub XMLDecl
-#{
-#?? support later?
-#}
-
-sub setHandlers
-{
-    my ($self, %h) = @_;
-
-    for my $name (@InterceptedHandlers)
-    {
-	if (exists $h{$name})
-	{
-	    eval "\$_$name = \$h{$name}";
-	    delete $h{$name};
-	}
-    }
-
-    # Pass remaining handlers to the parent class (XML::Parser)
-    $self->SUPER::setHandlers (%h);
-}
-
-# Add (line, column, byte) to error context (unless it's EOF)
-sub fail_add_context	# static
-{
-    my $e = $_checker->{Expat};
-
-    my $byte = $e->current_byte;	# -1 means: end of XML document
-    if ($byte != -1 && !$_EndOfDoc)
-    {
-	push @_, (line => $e->current_line, 
-		  column => $e->current_column, 
-		  byte => $byte);
-    }
-    &$_prevFAIL (@_);
-}
-
-#-------- STATIC METHODS related to External DTDs ---------------------------
-
-sub load_DTD		# static
-{
-    my ($sysid, $lwp_user_agent) = @_;
-
-    # See if it is defined in the %URI_MAP
-    # (Public IDs are stored here, e.g. "-//W3C//DTD HTML 4.0//EN")
-    if (exists $URI_MAP{$sysid})
-    {
-	$sysid = $URI_MAP{$sysid};
-    }
-    elsif ($sysid !~ /^\w+:/) 
-    {
-	# Prefix the sysid with 'file:' if it has no protocol identifier
-	unless ($sysid =~ /^\//) 
-	{
-	    # Not an absolute path. See if it's in SGML_SEARCH_PATH.
-	    my $relative_sysid = $sysid;
-
-	    $sysid = find_in_sgml_search_path ($sysid);
-	    if (! $sysid) 
-	    {
-		if ($ENV{'SGML_SEARCH_PATH'}) 
-		{
-		    die "Couldn't find external DTD [$relative_sysid] in SGML_SEARCH_PATH ($ENV{'SGML_SEARCH_PATH'})";
-		}
-		else 
-		{
-		    die "Couldn't find external DTD [$relative_sysid], may be you should set SGML_SEARCH_PATH";
-		}
-	    }
-	}
-	$sysid = "file:$sysid";
-    }
-
-    return load_URL ($sysid, $lwp_user_agent);
-}
-
-sub map_uri			# static
-{
-    %URI_MAP = (%URI_MAP, @_);
-}
-
-sub set_sgml_search_path	# static
-{
-    @SGML_SEARCH_PATH = @_;
-}
-
-sub find_in_sgml_search_path	# static
-{
-    my $file = shift;
-
-    my @dirs = @SGML_SEARCH_PATH;
-    unless (@dirs)
-    {
-	my $path = $ENV{SGML_SEARCH_PATH};
-	if ($path)
-	{
-	    @dirs = split (':', $path);
-	}
-	else
-	{
-	    my $home = $ENV{HOME};
-	    @dirs = (".", "$home/.sgml", "/usr/lib/sgml", "/usr/share/sgml");
-	}
-    }
-
-    for my $directory (@dirs) 
-    {
-	if (-e "$directory/$file") 
-	{
-	    return "$directory/$file";
-	}
-    }
-    return undef;
-}
-
-package XML::Checker::Parser::ExternalDTD;
-
-sub Element {
-	my $expat = shift;
-	$expat->{Checker}->Element(@_);
-}
-
-sub Attlist {
-	my $expat = shift;
-	$expat->{Checker}->Attlist(@_);
-}
-
-sub Unparsed {
-	my $expat = shift;
-	$expat->{Checker}->Unparsed(@_);
-}
-
-sub Notation {
-	my $expat = shift;
-	$expat->{Checker}->Notation(@_);
-}
-
-sub Entity {
-	my $expat = shift;
-#	print "Entity: $expat\n";
-	$expat->{Checker}->Entity(@_);
-}
-
-1; # package return code
-
-__END__
-
-=head1 NAME
-
-XML::Checker::Parser - an XML::Parser that validates at parse time
-
-=head1 SYNOPSIS
-
- use XML::Checker::Parser;
-
- my %expat_options = (KeepCDATA => 1, 
-		      Handlers => [ Unparsed => \&my_Unparsed_handler ]);
- my $parser = new XML::Checker::Parser (%expat_options);
-
- eval {
-     local $XML::Checker::FAIL = \&my_fail;
-     $parser->parsefile ("fail.xml");
- };
- if ($@) {
-     # Either XML::Parser (expat) threw an exception or my_fail() died.
-     ... your error handling code here ...
- }
-
- # Throws an exception (with die) when an error is encountered, this
- # will stop the parsing process.
- # Don't die if a warning or info message is encountered, just print a message.
- sub my_fail {
-     my $code = shift;
-     die XML::Checker::error_string ($code, @_) if $code < 200;
-     XML::Checker::print_error ($code, @_);
- }
-
-=head1 DESCRIPTION
-
-XML::Checker::Parser extends L<XML::Parser>
-
-I hope the example in the SYNOPSIS says it all, just use 
-L<XML::Checker::Parser> as if it were an XML::Parser. 
-See L<XML::Parser> for the supported (expat) options.
-
-You can also derive your parser from XML::Checker::Parser instead of 
-from XML::Parser. All you should have to do is replace:
-
- package MyParser;
- @ISA = qw( XML::Parser );
-
-with:
-
- package MyParser;
- @ISA = qw( XML::Checker::Parser );
-
-=head1 XML::Checker::Parser constructor
-
- $parser = new XML::Checker::Parser (SkipExternalDTD => 1, SkipInsignifWS => 1);
-
-The constructor takes the same parameters as L<XML::Parser> with the following additions:
-
-=over 4
-
-=item SkipExternalDTD
-
-By default, it will try to load external DTDs using LWP. You can disable this
-by setting SkipExternalDTD to 1. See L<External DTDs|"External DTDs"> for details.
-
-=item SkipInsignifWS
-
-By default, it will treat insignificant whitespace as regular Char data.
-By setting SkipInsignifWS to 1, the user Char handler will not be called
-if insignificant whitespace is encountered. 
-See L<XML::Checker/INSIGNIFICANT_WHITESPACE> for details.
-
-=item LWP_UserAgent
-
-When calling parsefile() with a URL (instead of a filename) or when loading
-external DTDs, we use LWP to download the
-remote file. By default it will use a L<LWP::UserAgent> that is created as follows:
-
- use LWP::UserAgent;
- $LWP_USER_AGENT = LWP::UserAgent->new;
- $LWP_USER_AGENT->env_proxy;
-
-Note that L<env_proxy> reads proxy settings from your environment variables, 
-which is what I need to do to get thru our firewall. 
-If you want to use a different LWP::UserAgent, you can either set
-it globally with:
-
- XML::Checker::Parser::set_LWP_UserAgent ($my_agent);
-
-or, you can specify it for a specific XML::Checker::Parser by passing it to 
-the constructor:
-
- my $parser = new XML::Checker::Parser (LWP_UserAgent => $my_agent);
-
-Currently, LWP is used when the filename (passed to parsefile) starts with one of
-the following URL schemes: http, https, ftp, wais, gopher, or file 
-(followed by a colon.) If I missed one, please let me know. 
-
-The LWP modules are part of libwww-perl which is available at CPAN.
-
-=back
-
-=head1 External DTDs
-
-XML::Checker::Parser will try to load and parse external DTDs that are 
-referenced in DOCTYPE definitions unless you set the B<SkipExternalDTD>
-option to 1 (the default setting is 0.) 
-See L<CAVEATS|"CAVEATS"> for details on what is not supported by XML::Checker::Parser.
-
-L<XML::Parser> (version 2.27 and up) does a much better job at reading external 
-DTDs, because recently external DTD parsing was added to expat.
-Make sure you set the L<XML::Parser> option B<ParseParamEnt> to 1 and the 
-XML::Checker::Parser option B<SkipExternalDTD> to 1. 
-(They can both be set in the XML::Checker::Parser constructor.)
-
-When external DTDs are parsed by XML::Checker::Parser, they are
-located in the following order:
-
-=over 4
-
-=item *
-
-With the %URI_MAP, which can be set using B<map_uri>.
-This hash maps external resource ids (like system ID's and public ID's)
-to full path URI's.
-It was meant to aid in resolving PUBLIC IDs found in DOCTYPE declarations 
-after the PUBLIC keyword, e.g.
-
-  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
-
-However, you can also use this to force L<XML::Checker> to read DTDs from a
-different URL than was specified (e.g. from the local file system for
-performance reasons.)
-
-=item * 
-
-on the Internet, if their system identifier starts with a protocol 
-(like http://...)
-
-=item *
-
-on the local disk, if their system identifier starts with a slash 
-(absolute path)
-
-=item *
-
-in the SGML_SEARCH_PATH, if their system identifier is a 
-relative file name. It will use @SGML_SEARCH_PATH if it was set with
-B<set_sgml_search_path()>, or the colon-separated $ENV{SGML_SEARCH_PATH},
-or (if that isn't set) the list (".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
-"/usr/share/sgml"), which includes the
-current directory, so it should do the right thing in most cases.
-
-=back
-
-=head2 Static methods related to External DTDs
-
-=over 4
-
-=item set_sgml_search_path (dir1, dir2, ...)
-
-External DTDs with relative file paths are looked up using the @SGML_SEARCH_PATH,
-which can be set with this method. If @SGML_SEARCH_PATH is never set, it
-will use the colon-separated $ENV{SGML_SEARCH_PATH} instead. If neither are set
-it uses the list: ".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
-"/usr/share/sgml".
-
-set_sgml_search_path is a static method.
-
-=item map_uri (pubid => uri, ...)
-
-To define the location of PUBLIC ids, as found in DOCTYPE declarations 
-after the PUBLIC keyword, e.g.
-
-  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
-
-call this method, e.g.
-
-  XML::Checker::Parser::map_uri (
-	"-//W3C//DTD HTML 4.0//EN" => "file:/user/html.dtd");
-
-See L<External DTDs|"External DTDs"> for more info.
-
-XML::Checker::Parser::map_uri is a static method.
-
-=back
-
-=head1 Switching user handlers at parse time
-
-You should be able to use setHandlers() just as in L<XML::Parser>.
-(Using setHandlers has not been tested yet.)
-
-=head1 Error handling
-
-XML::Checker::Parser routes the fail handler through 
-XML::Checker::Parser::fail_add_context() before calling your fail handler
-(i.e. the global fail handler: $XML::Checker::FAIL. 
-See L<XML::Checker/ERROR_HANDLING>.)
-It adds the (line, column, byte) information from L<XML::Parser> to the 
-error context (unless it was the end of the XML document.)
-
-=head1 Supported XML::Parser handlers
-
-Only the following L<XML::Parser> handlers are currently routed through
-L<XML::Checker>: Init, Final, Char, Start, End, Element, Attlist, Doctype,
-Unparsed, Notation.
-
-=head1 CAVEATS
-
-When using XML::Checker::Parser to parse external DTDs 
-(i.e. with SkipExternalDTD => 0),
-expect trouble when your external DTD contains parameter entities inside 
-declarations or conditional sections. The external DTD should probably have
-the same encoding as the orignal XML document.
-
-=head1 AUTHOR
-
-Send bug reports, hints, tips, suggestions to Enno Derksen at
-<F<enno@att.com>>.
-
-=head1 SEE ALSO
-
-L<XML::Checker> (L<XML::Checker/SEE_ALSO>), L<XML::Parser>