655
|
1 |
package XML::Checker::Parser;

use strict;
use XML::Parser;
use XML::Checker;

# Package globals.  The $_... scalars carry the state of the parse that is
# currently running (active checker, user handlers, flags); parse() gives
# each of them a local value for the duration of one parse.
use vars qw(
    @ISA
    @InterceptedHandlers
    @SGML_SEARCH_PATH
    %URI_MAP
    $_checker
    $_prevFAIL
    $_Init $_Final $_Char $_Start $_End $_Element $_Attlist
    $_Doctype $_Unparsed $_Notation $_Entity
    $_skipInsignifWS
    $_EndOfDoc
);

@ISA = qw( XML::Parser );

# XML::Parser handler types for which new() installs this package's own
# handler in front of the user's handler.
@InterceptedHandlers = qw(
    Init Final Char Start End Element Attlist
    Doctype Unparsed Notation Entity
);

# Where to search for external DTDs (in local file system)
@SGML_SEARCH_PATH = ();

# Where to search for external DTDs as referred to by public ID in a
# <!DOCTYPE ...> statement, e.g. "-//W3C//DTD HTML 4.0//EN"
# E.g. it could map "-//W3C//DTD HTML 4.0//EN" to "file:/user/html.dtd"
%URI_MAP = ();
|
|
25 |
|
|
26 |
# Constructor.  Builds a regular XML::Parser from %args, attaches an
# XML::Checker (created from the same %args) and puts this package's
# handlers in front of the user's handlers for every intercepted type.
# Returns the parser object, reblessed into $class.
sub new {
    my ($class, %args) = @_;

    my $self = XML::Parser->new(%args);
    $self->{Checker} = XML::Checker->new(%args);

    # The handler table as set up by XML::Parser from the caller's options.
    my $user_handlers = $self->{Handlers};

    # Don't need Comment handler - assuming comments are allowed anywhere
    #?? What should Default handler do?
    #?? Check XMLDecl, ExternEnt, Proc? No, for now.
    #?? Add CdataStart, CdataEnd support?

    # Work on a copy so the user's table stays untouched, then point every
    # intercepted type at the sub of the same name in this package.
    my %installed = %{$user_handlers};
    for my $type (@InterceptedHandlers) {
        my $wrapper = "XML::Checker::Parser::$type";
        $installed{$type} = \&$wrapper;
    }

    $self->{UserHandlers} = $user_handlers;
    $self->{Handlers}     = \%installed;

    return bless $self, $class;
}
|
|
51 |
|
|
52 |
# Accessor: returns the XML::Checker object stored in this parser's
# {Checker} slot.
sub getChecker {
    my $self = $_[0];
    return $self->{Checker};
}
|
|
56 |
|
|
57 |
# Parses a document (same arguments as XML::Parser::parse) with validation.
# All per-parse state lives in package globals that are given local values
# here, so they are restored when the parse returns or dies.
sub parse {
    my $self = shift;
    my $user = $self->{UserHandlers};

    # The checker that the intercepting handlers talk to.
    local $_checker = $self->{Checker};

    # The user's own handlers; the intercepting handlers forward to these.
    local $_Init     = $user->{Init};
    local $_Final    = $user->{Final};
    local $_Start    = $user->{Start};
    local $_End      = $user->{End};
    local $_Char     = $user->{Char};
    local $_Element  = $user->{'Element'};
    local $_Attlist  = $user->{'Attlist'};
    local $_Doctype  = $user->{Doctype};
    local $_Unparsed = $user->{Unparsed};
    local $_Notation = $user->{Notation};
    local $_Entity   = $user->{Entity};

    # Chain the fail handler: remember the current one first, then install
    # fail_add_context, which calls the remembered one.
    local $_prevFAIL          = $XML::Checker::FAIL;
    local $XML::Checker::FAIL = \&fail_add_context;

    # Flags consulted by Char() and fail_add_context().
    local $XML::Checker::INSIGNIF_WS = 0;
    local $_skipInsignifWS           = $self->{SkipInsignifWS};
    local $_EndOfDoc                 = 0;

    return $self->SUPER::parse(@_);
}
|
|
86 |
|
|
87 |
# File-wide default user agent, shared by every parser that has no
# LWP_UserAgent of its own.  load_URL() creates it on demand when unset.
my $LWP_USER_AGENT;

# Static: replaces the file-wide default user agent with the given object
# and returns it.
sub set_LWP_UserAgent    # static
{
    my $agent = shift;
    return $LWP_USER_AGENT = $agent;
}
|
|
92 |
|
|
93 |
# Static: fetches $url with LWP and returns the response body.
#
# Parameters:
#   $url            - URL to fetch (any scheme LWP can handle)
#   $lwp_user_agent - optional LWP::UserAgent to use for this request; when
#                     undefined, the file-wide default agent is used (and
#                     created first if necessary)
#
# Dies with "Couldn't load URL [...] with LWP: ..." when the request throws,
# when the server answers with a non-success status, or when the body is
# empty.
sub load_URL    # static
{
    my ($url, $lwp_user_agent) = @_;
    my $result;

    # Read the file from the web with LWP.
    #
    # Note that we read in the entire file, which may not be ideal
    # for large files. LWP::UserAgent also provides a callback style
    # request, which we could convert to a stream with a fork()...

    my $response;
    eval {
        use LWP::UserAgent;

        my $ua = $lwp_user_agent;
        unless (defined $ua) {
            unless (defined $LWP_USER_AGENT) {
                $LWP_USER_AGENT = LWP::UserAgent->new;

                # Load proxy settings from environment variables, i.e.:
                # http_proxy, ftp_proxy, no_proxy etc. (see LWP::UserAgent(3))
                # You need these to go thru firewalls.
                $LWP_USER_AGENT->env_proxy;
            }
            $ua = $LWP_USER_AGENT;
        }

        my $req = HTTP::Request->new('GET', $url);

        # Send the request through the agent selected above.  Using the
        # file-wide default here would ignore a caller-supplied agent.
        $response = $ua->request($req);
        $result   = $response->content;
    };
    if ($@) {
        die "Couldn't load URL [$url] with LWP: $@";
    }

    # An error response may still carry a body (e.g. an HTML error page);
    # that body must not be handed back as if it were the document.
    if (!$response->is_success || !$result) {
        my $message = $response->as_string;
        die "Couldn't load URL [$url] with LWP: $message";
    }
    return $result;
}
|
|
138 |
|
|
139 |
# Parses the document named by $url.  Names that start with one of the
# listed URL schemes are downloaded with load_URL() and parsed from memory;
# everything else is passed to XML::Parser::parsefile as a file name.
# Remaining arguments are passed along unchanged.
sub parsefile {
    my $self = shift;
    my $url  = shift;

    # Any other URL schemes?
    unless ($url =~ /^(https?|ftp|wais|gopher|file):/) {
        return $self->SUPER::parsefile($url, @_);
    }

    my $xml = load_URL($url, $self->{LWP_UserAgent});

    # Parse the result of the HTTP request
    my $result = eval { $self->parse($xml, @_) };
    if ($@) {
        die "Couldn't parsefile [$url]: $@";
    }
    return $result;
}
|
|
165 |
|
|
166 |
# Init handler: stores the Expat object on the checker (fail_add_context
# reads it back for position information), initialises the checker and
# then calls the user's Init handler, if any.
sub Init {
    my $expat = shift;

    $_checker->{Expat} = $expat;
    $_checker->Init(@_);

    if ($_Init) {
        $_Init->($expat);
    }
}
|
|
174 |
|
|
175 |
# Final handler: marks the end of the document, lets the checker run its
# final checks, calls the user's Final handler (if any) and detaches the
# Expat object from the checker.
#
# Returns whatever the user's Final handler returned (evaluated in scalar
# context), or undef when there is no user handler.
sub Final {
    my $expat = shift;

    # From here on fail_add_context no longer adds position information.
    $_EndOfDoc = 1;

    $_checker->Final(@_);

    # Declare first, assign conditionally: "my $x = ... if COND" has
    # undefined behaviour in Perl and can leak a value between calls.
    my $result;
    $result = $_Final->($expat) if $_Final;

    # Decouple Expat from Checker
    delete $_checker->{Expat};

    # NOTE: Checker is not decoupled
    return $result;
}
|
|
189 |
|
|
190 |
# Start handler: reports the start tag and each of its attributes to the
# checker, then calls the user's Start handler (if any) with the original
# arguments.
sub Start {
    my ($expat, $tag, @attr) = @_;

    $_checker->Start($tag);

    # @attr is a flat (name, value, name, value, ...) list.  Entries whose
    # name index lies below specified_attr are flagged as specified when
    # passed to the checker.
    my $specified_count = $expat->specified_attr;
    my $idx = 0;
    while ($idx < @attr) {
        my ($name, $value) = @attr[$idx, $idx + 1];

        # print "--- $tag $attr $val $spec\n";
        $_checker->Attr($tag, $name, $value, $idx < $specified_count);
        $idx += 2;
    }
    $_checker->EndAttr;

    if ($_Start) {
        $_Start->($expat, $tag, @attr);
    }
}
|
|
210 |
|
|
211 |
# End handler: passes the end tag to the checker, then to the user's End
# handler, if any.
sub End {
    my $expat = shift;

    $_checker->End(@_);

    if ($_End) {
        $_End->($expat, @_);
    }
}
|
|
217 |
|
|
218 |
# Char handler: passes character data to the checker, then to the user's
# Char handler.  The user handler is not called while
# $XML::Checker::INSIGNIF_WS is set and the parser was created with
# SkipInsignifWS.
sub Char {
    my $expat = shift;

    $_checker->Char(@_);

    # Skip insignificant whitespace
    if ($_Char && (!$XML::Checker::INSIGNIF_WS || !$_skipInsignifWS)) {
        $_Char->($expat, @_);
    }
}
|
|
226 |
|
|
227 |
# Element handler: passes the element declaration to the checker, then to
# the user's Element handler, if any.
sub Element {
    my $expat = shift;

    $_checker->Element(@_);

    if ($_Element) {
        $_Element->($expat, @_);
    }
}
|
|
233 |
|
|
234 |
# Attlist handler: passes the attribute-list declaration to the checker,
# then to the user's Attlist handler, if any.
sub Attlist {
    my $expat = shift;

    $_checker->Attlist(@_);

    if ($_Attlist) {
        $_Attlist->($expat, @_);
    }
}
|
|
240 |
|
|
241 |
|
|
242 |
# Doctype handler.  Unless the checker has SkipExternalDTD set, the external
# DTD named by the system id (or, failing that, the public id) is loaded
# with load_DTD() and fed to the checker by parsing it as the internal
# subset of a small synthetic document.  Afterwards the DOCTYPE itself is
# passed to the checker and to the user's Doctype handler, if any.
#
# Arguments (after the Expat object): name, system id, public id and the
# internal subset, as supplied by XML::Parser.
#
# Dies when the external DTD cannot be loaded or cannot be parsed.
sub Doctype {
    my $expat = shift;
    my ($name, $sysid, $pubid) = @_;

    my $dtd;
    unless ($_checker->{SkipExternalDTD}) {
        if ($sysid) {
            # External DTD...

            #?? I'm not sure if we should die here or keep going?
            $dtd = load_DTD($sysid, $expat->{LWP_UserAgent});
        }
        elsif ($pubid) {
            $dtd = load_DTD($pubid, $expat->{LWP_UserAgent});
        }
    }

    if (defined $dtd) {
        #?? what about passing ProtocolEncoding, Namespaces, Stream_Delimiter ?
        my $parser = XML::Parser->new(
            Checker      => $_checker,
            ErrorContext => $expat->{ErrorContext},
            Handlers     => {
                Entity   => \&XML::Checker::Parser::ExternalDTD::Entity,
                Notation => \&XML::Checker::Parser::ExternalDTD::Notation,
                Element  => \&XML::Checker::Parser::ExternalDTD::Element,
                Attlist  => \&XML::Checker::Parser::ExternalDTD::Attlist,
                Unparsed => \&XML::Checker::Parser::ExternalDTD::Unparsed,
            });

        # The system id is empty when the DTD was found through the public
        # id.  A system literal may be delimited by either quote character,
        # so pick the one that does not occur in the id itself.
        my $literal = defined $sysid ? $sysid : '';
        my $quote   = ($literal =~ /'/) ? '"' : "'";

        eval {
            $parser->parse(
                "<!DOCTYPE $name SYSTEM $quote$literal$quote [\n$dtd\n]>\n<$name/>");
        };
        if ($@) {
            # Name the identifier that was actually used to load the DTD.
            my $source = $sysid || $pubid;
            die "Couldn't parse contents of external DTD <$source> :$@";
        }
    }

    $_checker->Doctype(@_);
    $_Doctype->($expat, @_) if $_Doctype;
}
|
|
289 |
|
|
290 |
# Unparsed handler: passes the unparsed entity declaration to the checker,
# then to the user's Unparsed handler, if any.
sub Unparsed {
    my $expat = shift;

    $_checker->Unparsed(@_);

    if ($_Unparsed) {
        $_Unparsed->($expat, @_);
    }
}
|
|
296 |
|
|
297 |
# Entity handler: passes the entity declaration to the checker, then to the
# user's Entity handler, if any.
sub Entity {
    my $expat = shift;

    $_checker->Entity(@_);

    if ($_Entity) {
        $_Entity->($expat, @_);
    }
}
|
|
303 |
|
|
304 |
# Notation handler: passes the notation declaration to the checker, then to
# the user's Notation handler, if any.
sub Notation {
    my $expat = shift;

    $_checker->Notation(@_);

    if ($_Notation) {
        $_Notation->($expat, @_);
    }
}
|
|
310 |
|
|
311 |
# Placeholder Default handler.  It performs no checks and returns nothing;
# it is not listed in @InterceptedHandlers, so new() does not install it.
sub Default {
    #?? what can I check here?
    # print "Default handler got[" . join (", ", @_) . "]";
    return;
}
|
|
316 |
|
|
317 |
#sub XMLDecl
|
|
318 |
#{
|
|
319 |
#?? support later?
|
|
320 |
#}
|
|
321 |
|
|
322 |
# Sets or replaces handlers, like XML::Parser::setHandlers.
#
# Takes (type => handler, ...) pairs.  Handlers for intercepted types are
# kept on this side: they become the user handler that the intercepting
# handler forwards to, both for a parse that is currently running (package
# variable) and for later calls to parse() (the UserHandlers table that
# parse() reads).  All other types are passed on to XML::Parser.
#
# Returns whatever XML::Parser::setHandlers returns for the types that were
# passed on.
sub setHandlers {
    my ($self, %h) = @_;

    for my $name (@InterceptedHandlers) {
        next unless exists $h{$name};

        my $handler = delete $h{$name};

        # Assign the package variable $_<name> through a symbolic reference
        # rather than by compiling a string with eval.
        {
            no strict 'refs';
            ${ __PACKAGE__ . "::_$name" } = $handler;
        }

        # parse() re-reads this table when it starts; without this entry a
        # handler set before parse() would be lost.
        $self->{UserHandlers}{$name} = $handler;
    }

    # Pass remaining handlers to the parent class (XML::Parser)
    $self->SUPER::setHandlers(%h);
}
|
|
338 |
|
|
339 |
# Static: the fail handler that parse() installs in $XML::Checker::FAIL.
# Appends (line, column, byte) of the current parse position to the
# arguments -- unless the end of the document has been reached -- and then
# calls the fail handler that was in effect before parse() started.
sub fail_add_context    # static
{
    my $expat = $_checker->{Expat};

    my $byte = $expat->current_byte;    # -1 means: end of XML document
    unless ($byte == -1 || $_EndOfDoc) {
        push @_,
            line   => $expat->current_line,
            column => $expat->current_column,
            byte   => $byte;
    }
    $_prevFAIL->(@_);
}
|
|
353 |
|
|
354 |
#-------- STATIC METHODS related to External DTDs ---------------------------
|
|
355 |
|
|
356 |
# Static: resolves an external DTD identifier to a URL and returns the DTD
# text as loaded by load_URL().
#
# Resolution order:
#   1. an entry in %URI_MAP for the identifier;
#   2. the identifier itself, when it starts with a scheme ("word:");
#   3. an absolute path, prefixed with "file:";
#   4. a relative path looked up with find_in_sgml_search_path(), prefixed
#      with "file:".
#
# Dies when a relative path cannot be found.
sub load_DTD    # static
{
    my ($sysid, $lwp_user_agent) = @_;

    # See if it is defined in the %URI_MAP
    # (Public IDs are stored here, e.g. "-//W3C//DTD HTML 4.0//EN")
    if (exists $URI_MAP{$sysid}) {
        $sysid = $URI_MAP{$sysid};
    }
    elsif ($sysid !~ /^\w+:/) {
        # No protocol identifier: treat it as a local file.
        if ($sysid !~ /^\//) {
            # Not an absolute path. See if it's in SGML_SEARCH_PATH.
            my $relative_sysid = $sysid;

            $sysid = find_in_sgml_search_path($relative_sysid);
            unless ($sysid) {
                die "Couldn't find external DTD [$relative_sysid] in SGML_SEARCH_PATH ($ENV{'SGML_SEARCH_PATH'})"
                    if $ENV{'SGML_SEARCH_PATH'};
                die "Couldn't find external DTD [$relative_sysid], may be you should set SGML_SEARCH_PATH";
            }
        }

        # Prefix the sysid with 'file:' if it has no protocol identifier
        $sysid = "file:$sysid";
    }

    return load_URL($sysid, $lwp_user_agent);
}
|
|
392 |
|
|
393 |
# Static: merges the given (id => uri, ...) pairs into %URI_MAP.  A pair
# whose id is already mapped replaces the existing entry.
sub map_uri    # static
{
    my @pairs = @_;
    return %URI_MAP = (%URI_MAP, @pairs);
}
|
|
397 |
|
|
398 |
# Static: replaces @SGML_SEARCH_PATH with the given list of directories.
sub set_sgml_search_path    # static
{
    my @directories = @_;
    return @SGML_SEARCH_PATH = @directories;
}
|
|
402 |
|
|
403 |
# Static: looks for $file in the SGML search path and returns
# "<directory>/<file>" for the first directory in which it exists, or undef
# when it is found nowhere.
#
# The directories searched are, in order of preference:
#   - @SGML_SEARCH_PATH, when it is not empty;
#   - the colon-separated list in $ENV{SGML_SEARCH_PATH}, when set;
#   - ".", "$ENV{HOME}/.sgml" (only when HOME is set), "/usr/lib/sgml" and
#     "/usr/share/sgml".
sub find_in_sgml_search_path    # static
{
    my $file = shift;

    my @dirs = @SGML_SEARCH_PATH;
    unless (@dirs) {
        my $path = $ENV{SGML_SEARCH_PATH};
        if ($path) {
            @dirs = split(':', $path);
        }
        else {
            my $home = $ENV{HOME};

            @dirs = (".");

            # Without HOME the entry would degrade to "/.sgml" and probe the
            # root of the file system, so leave it out in that case.
            push @dirs, "$home/.sgml" if defined $home && length $home;
            push @dirs, "/usr/lib/sgml", "/usr/share/sgml";
        }
    }

    for my $directory (@dirs) {
        my $candidate = "$directory/$file";
        return $candidate if -e $candidate;
    }

    # Explicit undef (rather than an empty list) is kept as the "not found"
    # value so existing callers see the same result as before.
    return undef;
}
|
|
431 |
|
|
432 |
package XML::Checker::Parser::ExternalDTD;

# Handlers for the nested parser that XML::Checker::Parser::Doctype creates
# to read an external DTD.  Each one takes the checker from the Expat
# object's {Checker} slot and hands it the declaration; no user handler is
# called.

# Element declaration.
sub Element {
    my $checker = shift->{Checker};
    return $checker->Element(@_);
}

# Attribute-list declaration.
sub Attlist {
    my $checker = shift->{Checker};
    return $checker->Attlist(@_);
}

# Unparsed entity declaration.
sub Unparsed {
    my $checker = shift->{Checker};
    return $checker->Unparsed(@_);
}

# Notation declaration.
sub Notation {
    my $checker = shift->{Checker};
    return $checker->Notation(@_);
}

# Entity declaration.
sub Entity {
    my $checker = shift->{Checker};

    # print "Entity: $expat\n";
    return $checker->Entity(@_);
}
|
|
459 |
|
|
460 |
1; # package return code
|
|
461 |
|
|
462 |
__END__
|
|
463 |
|
|
464 |
=head1 NAME
|
|
465 |
|
|
466 |
XML::Checker::Parser - an XML::Parser that validates at parse time
|
|
467 |
|
|
468 |
=head1 SYNOPSIS
|
|
469 |
|
|
470 |
use XML::Checker::Parser;
|
|
471 |
|
|
472 |
my %expat_options = (KeepCDATA => 1,
|
|
473 |
Handlers => { Unparsed => \&my_Unparsed_handler });
|
|
474 |
my $parser = new XML::Checker::Parser (%expat_options);
|
|
475 |
|
|
476 |
eval {
|
|
477 |
local $XML::Checker::FAIL = \&my_fail;
|
|
478 |
$parser->parsefile ("fail.xml");
|
|
479 |
};
|
|
480 |
if ($@) {
|
|
481 |
# Either XML::Parser (expat) threw an exception or my_fail() died.
|
|
482 |
... your error handling code here ...
|
|
483 |
}
|
|
484 |
|
|
485 |
# Throws an exception (with die) when an error is encountered, this
|
|
486 |
# will stop the parsing process.
|
|
487 |
# Don't die if a warning or info message is encountered, just print a message.
|
|
488 |
sub my_fail {
|
|
489 |
my $code = shift;
|
|
490 |
die XML::Checker::error_string ($code, @_) if $code < 200;
|
|
491 |
XML::Checker::print_error ($code, @_);
|
|
492 |
}
|
|
493 |
|
|
494 |
=head1 DESCRIPTION
|
|
495 |
|
|
496 |
XML::Checker::Parser extends L<XML::Parser>.
|
|
497 |
|
|
498 |
I hope the example in the SYNOPSIS says it all, just use
|
|
499 |
L<XML::Checker::Parser> as if it were an XML::Parser.
|
|
500 |
See L<XML::Parser> for the supported (expat) options.
|
|
501 |
|
|
502 |
You can also derive your parser from XML::Checker::Parser instead of
|
|
503 |
from XML::Parser. All you should have to do is replace:
|
|
504 |
|
|
505 |
package MyParser;
|
|
506 |
@ISA = qw( XML::Parser );
|
|
507 |
|
|
508 |
with:
|
|
509 |
|
|
510 |
package MyParser;
|
|
511 |
@ISA = qw( XML::Checker::Parser );
|
|
512 |
|
|
513 |
=head1 XML::Checker::Parser constructor
|
|
514 |
|
|
515 |
$parser = new XML::Checker::Parser (SkipExternalDTD => 1, SkipInsignifWS => 1);
|
|
516 |
|
|
517 |
The constructor takes the same parameters as L<XML::Parser> with the following additions:
|
|
518 |
|
|
519 |
=over 4
|
|
520 |
|
|
521 |
=item SkipExternalDTD
|
|
522 |
|
|
523 |
By default, it will try to load external DTDs using LWP. You can disable this
|
|
524 |
by setting SkipExternalDTD to 1. See L<External DTDs|"External DTDs"> for details.
|
|
525 |
|
|
526 |
=item SkipInsignifWS
|
|
527 |
|
|
528 |
By default, it will treat insignificant whitespace as regular Char data.
|
|
529 |
By setting SkipInsignifWS to 1, the user Char handler will not be called
|
|
530 |
if insignificant whitespace is encountered.
|
|
531 |
See L<XML::Checker/INSIGNIFICANT_WHITESPACE> for details.
|
|
532 |
|
|
533 |
=item LWP_UserAgent
|
|
534 |
|
|
535 |
When calling parsefile() with a URL (instead of a filename) or when loading
|
|
536 |
external DTDs, we use LWP to download the
|
|
537 |
remote file. By default it will use a L<LWP::UserAgent> that is created as follows:
|
|
538 |
|
|
539 |
use LWP::UserAgent;
|
|
540 |
$LWP_USER_AGENT = LWP::UserAgent->new;
|
|
541 |
$LWP_USER_AGENT->env_proxy;
|
|
542 |
|
|
543 |
Note that L<env_proxy|LWP::UserAgent/env_proxy> reads proxy settings from your environment variables,
|
|
544 |
which is what I need to do to get thru our firewall.
|
|
545 |
If you want to use a different LWP::UserAgent, you can either set
|
|
546 |
it globally with:
|
|
547 |
|
|
548 |
XML::Checker::Parser::set_LWP_UserAgent ($my_agent);
|
|
549 |
|
|
550 |
or, you can specify it for a specific XML::Checker::Parser by passing it to
|
|
551 |
the constructor:
|
|
552 |
|
|
553 |
my $parser = new XML::Checker::Parser (LWP_UserAgent => $my_agent);
|
|
554 |
|
|
555 |
Currently, LWP is used when the filename (passed to parsefile) starts with one of
|
|
556 |
the following URL schemes: http, https, ftp, wais, gopher, or file
|
|
557 |
(followed by a colon.) If I missed one, please let me know.
|
|
558 |
|
|
559 |
The LWP modules are part of libwww-perl which is available at CPAN.
|
|
560 |
|
|
561 |
=back
|
|
562 |
|
|
563 |
=head1 External DTDs
|
|
564 |
|
|
565 |
XML::Checker::Parser will try to load and parse external DTDs that are
|
|
566 |
referenced in DOCTYPE definitions unless you set the B<SkipExternalDTD>
|
|
567 |
option to 1 (the default setting is 0.)
|
|
568 |
See L<CAVEATS|"CAVEATS"> for details on what is not supported by XML::Checker::Parser.
|
|
569 |
|
|
570 |
L<XML::Parser> (version 2.27 and up) does a much better job at reading external
|
|
571 |
DTDs, because recently external DTD parsing was added to expat.
|
|
572 |
Make sure you set the L<XML::Parser> option B<ParseParamEnt> to 1 and the
|
|
573 |
XML::Checker::Parser option B<SkipExternalDTD> to 1.
|
|
574 |
(They can both be set in the XML::Checker::Parser constructor.)
|
|
575 |
|
|
576 |
When external DTDs are parsed by XML::Checker::Parser, they are
|
|
577 |
located in the following order:
|
|
578 |
|
|
579 |
=over 4
|
|
580 |
|
|
581 |
=item *
|
|
582 |
|
|
583 |
With the %URI_MAP, which can be set using B<map_uri>.
|
|
584 |
This hash maps external resource ids (like system ID's and public ID's)
|
|
585 |
to full path URI's.
|
|
586 |
It was meant to aid in resolving PUBLIC IDs found in DOCTYPE declarations
|
|
587 |
after the PUBLIC keyword, e.g.
|
|
588 |
|
|
589 |
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
|
|
590 |
|
|
591 |
However, you can also use this to force L<XML::Checker> to read DTDs from a
|
|
592 |
different URL than was specified (e.g. from the local file system for
|
|
593 |
performance reasons.)
|
|
594 |
|
|
595 |
=item *
|
|
596 |
|
|
597 |
on the Internet, if their system identifier starts with a protocol
|
|
598 |
(like http://...)
|
|
599 |
|
|
600 |
=item *
|
|
601 |
|
|
602 |
on the local disk, if their system identifier starts with a slash
|
|
603 |
(absolute path)
|
|
604 |
|
|
605 |
=item *
|
|
606 |
|
|
607 |
in the SGML_SEARCH_PATH, if their system identifier is a
|
|
608 |
relative file name. It will use @SGML_SEARCH_PATH if it was set with
|
|
609 |
B<set_sgml_search_path()>, or the colon-separated $ENV{SGML_SEARCH_PATH},
|
|
610 |
or (if that isn't set) the list (".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
|
|
611 |
"/usr/share/sgml"), which includes the
|
|
612 |
current directory, so it should do the right thing in most cases.
|
|
613 |
|
|
614 |
=back
|
|
615 |
|
|
616 |
=head2 Static methods related to External DTDs
|
|
617 |
|
|
618 |
=over 4
|
|
619 |
|
|
620 |
=item set_sgml_search_path (dir1, dir2, ...)
|
|
621 |
|
|
622 |
External DTDs with relative file paths are looked up using the @SGML_SEARCH_PATH,
|
|
623 |
which can be set with this method. If @SGML_SEARCH_PATH is never set, it
|
|
624 |
will use the colon-separated $ENV{SGML_SEARCH_PATH} instead. If neither is set
|
|
625 |
it uses the list: ".", "$ENV{'HOME'}/.sgml", "/usr/lib/sgml",
|
|
626 |
"/usr/share/sgml".
|
|
627 |
|
|
628 |
set_sgml_search_path is a static method.
|
|
629 |
|
|
630 |
=item map_uri (pubid => uri, ...)
|
|
631 |
|
|
632 |
To define the location of PUBLIC ids, as found in DOCTYPE declarations
|
|
633 |
after the PUBLIC keyword, e.g.
|
|
634 |
|
|
635 |
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN">
|
|
636 |
|
|
637 |
call this method, e.g.
|
|
638 |
|
|
639 |
XML::Checker::Parser::map_uri (
|
|
640 |
"-//W3C//DTD HTML 4.0//EN" => "file:/user/html.dtd");
|
|
641 |
|
|
642 |
See L<External DTDs|"External DTDs"> for more info.
|
|
643 |
|
|
644 |
XML::Checker::Parser::map_uri is a static method.
|
|
645 |
|
|
646 |
=back
|
|
647 |
|
|
648 |
=head1 Switching user handlers at parse time
|
|
649 |
|
|
650 |
You should be able to use setHandlers() just as in L<XML::Parser>.
|
|
651 |
(Using setHandlers has not been tested yet.)
|
|
652 |
|
|
653 |
=head1 Error handling
|
|
654 |
|
|
655 |
XML::Checker::Parser routes the fail handler through
|
|
656 |
XML::Checker::Parser::fail_add_context() before calling your fail handler
|
|
657 |
(i.e. the global fail handler: $XML::Checker::FAIL.
|
|
658 |
See L<XML::Checker/ERROR_HANDLING>.)
|
|
659 |
It adds the (line, column, byte) information from L<XML::Parser> to the
|
|
660 |
error context (unless it was the end of the XML document.)
|
|
661 |
|
|
662 |
=head1 Supported XML::Parser handlers
|
|
663 |
|
|
664 |
Only the following L<XML::Parser> handlers are currently routed through
|
|
665 |
L<XML::Checker>: Init, Final, Char, Start, End, Element, Attlist, Doctype,
|
|
666 |
Unparsed, Notation, Entity.
|
|
667 |
|
|
668 |
=head1 CAVEATS
|
|
669 |
|
|
670 |
When using XML::Checker::Parser to parse external DTDs
|
|
671 |
(i.e. with SkipExternalDTD => 0),
|
|
672 |
expect trouble when your external DTD contains parameter entities inside
|
|
673 |
declarations or conditional sections. The external DTD should probably have
|
|
674 |
the same encoding as the original XML document.
|
|
675 |
|
|
676 |
=head1 AUTHOR
|
|
677 |
|
|
678 |
Send bug reports, hints, tips, suggestions to Enno Derksen at
|
|
679 |
<F<enno@att.com>>.
|
|
680 |
|
|
681 |
=head1 SEE ALSO
|
|
682 |
|
|
683 |
L<XML::Checker> (L<XML::Checker/SEE_ALSO>), L<XML::Parser>
|