|
1 package XML::Filter::DetectWS; |
|
2 use strict; |
|
3 use XML::Filter::SAXT; |
|
4 |
|
5 #---------------------------------------------------------------------- |
|
6 # CONSTANT DEFINITIONS |
|
7 #---------------------------------------------------------------------- |
|
8 |
|
9 # Locations of whitespace |
|
10 sub WS_START (%) { 1 } # just after <a> |
|
11 sub WS_END (%) { 2 } # just before </a> |
|
12 sub WS_INTER (%) { 0 } # not at the start or end (i.e. intermediate) |
|
13 sub WS_ONLY (%) { 3 } # both START and END, i.e. between <a> and </a> |
|
14 |
|
15 # The states of the WhiteSpace detection code |
|
16 # for regular elements, i.e. elements that: |
|
17 # 1) don't have xml:space="preserve" |
|
18 # 2) have an ELEMENT model that allows text children (i.e. ANY or Mixed content) |
|
19 |
|
20 sub START (%) { 0 } # just saw <elem> |
|
21 sub ONLY_WS (%) { 1 } # saw <elem> followed by whitespace (only) |
|
22 sub ENDS_IN_WS (%) { 2 } # ends in whitespace (sofar) |
|
23 sub ENDS_IN_NON_WS (%) { 3 } # ends in non-ws text or non-text node (sofar) |
|
24 |
|
25 # NO_TEXT States: when <!ELEMENT> model does not allow text |
|
26 # (we assume that all text children are whitespace) |
|
27 sub NO_TEXT_START (%) { 4 } # just saw <elem> |
|
28 sub NO_TEXT_ONLY_WS (%) { 5 } # saw <elem> followed by whitespace (only) |
|
29 sub NO_TEXT_ENDS_IN_WS (%) { 6 } # ends in whitespace (sofar) |
|
30 sub NO_TEXT_ENDS_IN_NON_WS (%) { 7 } # ends in non-text node (sofar) |
|
31 |
|
32 # State for elements with xml:space="preserve" (all text is non-WS) |
|
33 sub PRESERVE_WS (%) { 8 } |
|
34 |
|
35 #---------------------------------------------------------------------- |
|
36 # METHOD DEFINITIONS |
|
37 #---------------------------------------------------------------------- |
|
38 |
|
39 # Constructor options: |
|
40 # |
|
41 # SkipIgnorableWS 1 means: don't forward ignorable_whitespace events |
|
42 # Handler SAX Handler that will receive the resulting events |
|
43 # |
|
44 |
|
45 sub new |
|
46 { |
|
47 my ($class, %options) = @_; |
|
48 |
|
49 my $self = bless \%options, $class; |
|
50 |
|
51 $self->init_handlers; |
|
52 |
|
53 $self; |
|
54 } |
|
55 |
|
56 # Does nothing |
|
57 sub noop {} |
|
58 |
|
59 sub init_handlers |
|
60 { |
|
61 my ($self) = @_; |
|
62 my %handlers; |
|
63 |
|
64 my $handler = $self->{Handler}; |
|
65 |
|
66 for my $cb (map { @{$_} } values %XML::Filter::SAXT::SAX_HANDLERS) |
|
67 { |
|
68 if (UNIVERSAL::can ($handler, $cb)) |
|
69 { |
|
70 $handlers{$cb} = eval "sub { \$handler->$cb (\@_) }"; |
|
71 } |
|
72 else |
|
73 { |
|
74 $handlers{$cb} = \&noop; |
|
75 } |
|
76 } |
|
77 |
|
78 if ($self->{SkipIgnorableWS}) |
|
79 { |
|
80 delete $handlers{ignorable_whitespace}; # if it exists |
|
81 } |
|
82 elsif (UNIVERSAL::can ($handler, 'ignorable_whitespace')) |
|
83 { |
|
84 # Support ignorable_whitespace callback if it exists |
|
85 # (if not, just use characters callback) |
|
86 $handlers{ignorable_whitespace} = |
|
87 sub { $handler->ignorable_whitespace (@_) }; |
|
88 } |
|
89 else |
|
90 { |
|
91 $handlers{ignorable_whitespace} = $handlers{characters}; |
|
92 } |
|
93 |
|
94 $handlers{ws} = $handlers{characters}; |
|
95 #?? were should whitespace go? |
|
96 |
|
97 # NOTE: 'cdata' is not a valid PerlSAX callback |
|
98 if (UNIVERSAL::can ($handler, 'start_cdata') && |
|
99 UNIVERSAL::can ($handler, 'end_cdata')) |
|
100 { |
|
101 $handlers{cdata} = sub { |
|
102 $handler->start_cdata; |
|
103 $handler->characters (@_); |
|
104 $handler->end_cdata; |
|
105 } |
|
106 } |
|
107 else # pass CDATA as regular characters |
|
108 { |
|
109 $handlers{cdata} = $handlers{characters}; |
|
110 } |
|
111 |
|
112 $self->{Callback} = \%handlers; |
|
113 } |
|
114 |
|
115 sub start_cdata |
|
116 { |
|
117 my ($self, $event) = @_; |
|
118 |
|
119 $self->{InCDATA} = 1; |
|
120 } |
|
121 |
|
122 sub end_cdata |
|
123 { |
|
124 my ($self, $event) = @_; |
|
125 |
|
126 $self->{InCDATA} = 0; |
|
127 } |
|
128 |
|
129 sub entity_reference |
|
130 { |
|
131 my ($self, $event) = @_; |
|
132 |
|
133 $self->push_event ('entity_reference', $event); |
|
134 |
|
135 my $parent = $self->{ParentStack}->[-1]; |
|
136 $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS; |
|
137 } |
|
138 |
|
139 sub comment |
|
140 { |
|
141 my ($self, $event) = @_; |
|
142 |
|
143 $self->push_event ('comment', $event); |
|
144 |
|
145 my $parent = $self->{ParentStack}->[-1]; |
|
146 $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS; |
|
147 } |
|
148 |
|
149 sub processing_instruction |
|
150 { |
|
151 my ($self, $event) = @_; |
|
152 |
|
153 $self->push_event ('processing_instruction', $event); |
|
154 |
|
155 my $parent = $self->{ParentStack}->[-1]; |
|
156 $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS; |
|
157 } |
|
158 |
|
159 sub start_document |
|
160 { |
|
161 my ($self, $event) = @_; |
|
162 |
|
163 # Initialize initial state |
|
164 $self->{ParentStack} = []; |
|
165 $self->{EventQ} = []; |
|
166 $self->{InCDATA} = 0; |
|
167 |
|
168 $self->init_handlers; |
|
169 |
|
170 $event = {} unless defined $event; |
|
171 # Don't preserve WS by default (unless specified by the user) |
|
172 $event->{PreserveWS} = defined ($self->{PreserveWS}) ? |
|
173 $self->{PreserveWS} : 0; |
|
174 |
|
175 # We don't need whitespace detection at the document level |
|
176 $event->{State} = PRESERVE_WS; |
|
177 |
|
178 $self->push_event ('start_document', $event); |
|
179 push @{ $self->{ParentStack} }, $event; |
|
180 } |
|
181 |
|
182 sub end_document |
|
183 { |
|
184 my ($self, $event) = @_; |
|
185 $event = {} unless defined $event; |
|
186 |
|
187 $self->push_event ('end_document', $event); |
|
188 |
|
189 $self->flush; |
|
190 } |
|
191 |
|
192 sub start_element |
|
193 { |
|
194 my ($self, $event) = @_; |
|
195 |
|
196 my $pres = $event->{Attributes}->{'xml:space'}; |
|
197 if (defined $pres) |
|
198 { |
|
199 $event->{PreserveWS} = $pres eq "preserve"; |
|
200 } |
|
201 else |
|
202 { |
|
203 $event->{PreserveWS} = $self->{ParentStack}->[-1]->{PreserveWS}; |
|
204 } |
|
205 |
|
206 if ($self->{NoText}->{ $event->{Name} }) |
|
207 { |
|
208 $event->{NoText} = 1; |
|
209 } |
|
210 |
|
211 $event->{State} = $self->get_init_state ($event); |
|
212 |
|
213 $self->push_event ('start_element', $event); |
|
214 push @{ $self->{ParentStack} }, $event; |
|
215 } |
|
216 |
|
217 sub end_element |
|
218 { |
|
219 my ($self, $event) = @_; |
|
220 |
|
221 # Mark previous whitespace event as the last event (WS_END) |
|
222 # (if it's there) |
|
223 my $prev = $self->{EventQ}->[-1]; |
|
224 $prev->{Loc} |= WS_END if exists $prev->{Loc}; |
|
225 |
|
226 $self->push_event ('end_element', $event); |
|
227 |
|
228 my $elem = pop @{ $self->{ParentStack} }; |
|
229 delete $elem->{State}; |
|
230 } |
|
231 |
|
232 sub characters |
|
233 { |
|
234 my ($self, $event) = @_; |
|
235 |
|
236 if ($self->{InCDATA}) |
|
237 { |
|
238 # NOTE: 'cdata' is not a valid PerlSAX callback |
|
239 $self->push_event ('cdata', $event); |
|
240 |
|
241 my $parent = $self->{ParentStack}->[-1]; |
|
242 $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS; |
|
243 return; |
|
244 } |
|
245 |
|
246 my $text = $event->{Data}; |
|
247 return unless length ($text); |
|
248 |
|
249 my $state = $self->{ParentStack}->[-1]->{State}; |
|
250 if ($state == PRESERVE_WS) |
|
251 { |
|
252 $self->push_event ('characters', $event); |
|
253 } |
|
254 elsif ($state == NO_TEXT_START) |
|
255 { |
|
256 # ELEMENT model does not allow regular text. |
|
257 # All characters are whitespace. |
|
258 $self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_START }); |
|
259 $state = NO_TEXT_ONLY_WS; |
|
260 } |
|
261 elsif ($state == NO_TEXT_ONLY_WS) |
|
262 { |
|
263 $self->merge_text ($text, 'ignorable_whitespace', WS_START ); |
|
264 } |
|
265 elsif ($state == NO_TEXT_ENDS_IN_NON_WS) |
|
266 { |
|
267 $self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_INTER }); |
|
268 $state = NO_TEXT_ENDS_IN_WS; |
|
269 } |
|
270 elsif ($state == NO_TEXT_ENDS_IN_WS) |
|
271 { |
|
272 $self->merge_text ($text, 'ignorable_whitespace', WS_INTER ); |
|
273 } |
|
274 elsif ($state == START) |
|
275 { |
|
276 #?? add support for full Unicode |
|
277 $text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/; |
|
278 if (length $1) |
|
279 { |
|
280 $self->push_event ('ws', { Data => $1, Loc => WS_START }); |
|
281 $state = ONLY_WS; |
|
282 } |
|
283 if (length $2) |
|
284 { |
|
285 $self->push_event ('characters', { Data => $2 }); |
|
286 $state = ENDS_IN_NON_WS; |
|
287 } |
|
288 if (length $3) |
|
289 { |
|
290 $self->push_event ('ws', { Data => $3, Loc => WS_INTER }); |
|
291 $state = ENDS_IN_WS; |
|
292 } |
|
293 } |
|
294 elsif ($state == ONLY_WS) |
|
295 { |
|
296 $text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/; |
|
297 if (length $1) |
|
298 { |
|
299 $self->merge_text ($1, 'ws', WS_START); |
|
300 } |
|
301 if (length $2) |
|
302 { |
|
303 $self->push_event ('characters', { Data => $2 }); |
|
304 $state = ENDS_IN_NON_WS; |
|
305 } |
|
306 if (length $3) |
|
307 { |
|
308 $self->push_event ('ws', { Data => $3, Loc => WS_INTER }); |
|
309 $state = ENDS_IN_WS; |
|
310 } |
|
311 } |
|
312 else # state == ENDS_IN_WS or ENDS_IN_NON_WS |
|
313 { |
|
314 $text =~ /^(.*\S)?(\s*)$/; |
|
315 if (length $1) |
|
316 { |
|
317 if ($state == ENDS_IN_NON_WS) |
|
318 { |
|
319 $self->merge_text ($1, 'characters'); |
|
320 } |
|
321 else |
|
322 { |
|
323 $self->push_event ('characters', { Data => $1 }); |
|
324 $state = ENDS_IN_NON_WS; |
|
325 } |
|
326 } |
|
327 if (length $2) |
|
328 { |
|
329 if ($state == ENDS_IN_WS) |
|
330 { |
|
331 $self->merge_text ($2, 'ws', WS_INTER); |
|
332 } |
|
333 else |
|
334 { |
|
335 $self->push_event ('ws', { Data => $2, Loc => WS_INTER }); |
|
336 $state = ENDS_IN_WS; |
|
337 } |
|
338 } |
|
339 } |
|
340 |
|
341 $self->{ParentStack}->[-1]->{State} = $state; |
|
342 } |
|
343 |
|
344 sub element_decl |
|
345 { |
|
346 my ($self, $event) = @_; |
|
347 my $tag = $event->{Name}; |
|
348 my $model = $event->{Model}; |
|
349 |
|
350 # Check the model to see if the elements may contain regular text |
|
351 $self->{NoText}->{$tag} = ($model eq 'EMPTY' || $model !~ /\#PCDATA/); |
|
352 |
|
353 $self->push_event ('element_decl', $event); |
|
354 } |
|
355 |
|
356 sub attlist_decl |
|
357 { |
|
358 my ($self, $event) = @_; |
|
359 |
|
360 my $prev = $self->{EventQ}->[-1]; |
|
361 if ($prev->{EventType} eq 'attlist_decl' && |
|
362 $prev->{ElementName} eq $event->{ElementName}) |
|
363 { |
|
364 $prev->{MoreFollow} = 1; |
|
365 $event->{First} = 0; |
|
366 } |
|
367 else |
|
368 { |
|
369 $event->{First} = 1; |
|
370 } |
|
371 |
|
372 $self->push_event ('attlist_decl', $event); |
|
373 } |
|
374 |
|
375 sub notation_decl |
|
376 { |
|
377 my ($self, $event) = @_; |
|
378 $self->push_event ('notation_decl', $event); |
|
379 } |
|
380 |
|
381 sub unparsed_entity_decl |
|
382 { |
|
383 my ($self, $event) = @_; |
|
384 $self->push_event ('unparsed_entity_decl', $event); |
|
385 } |
|
386 |
|
387 sub entity_decl |
|
388 { |
|
389 my ($self, $event) = @_; |
|
390 $self->push_event ('entity_decl', $event); |
|
391 } |
|
392 |
|
393 sub doctype_decl |
|
394 { |
|
395 my ($self, $event) = @_; |
|
396 $self->push_event ('doctype_decl', $event); |
|
397 } |
|
398 |
|
399 sub xml_decl |
|
400 { |
|
401 my ($self, $event) = @_; |
|
402 $self->push_event ('xml_decl', $event); |
|
403 } |
|
404 |
|
405 #?? what about set_document_locator, resolve_entity |
|
406 |
|
407 # |
|
408 # Determine the initial State for the current Element. |
|
409 # By default, we look at the PreserveWS property (i.e. value of xml:space.) |
|
410 # The user can override this to force xml:space="preserve" for a particular |
|
411 # element with e.g. |
|
412 # |
|
413 # sub get_init_state |
|
414 # { |
|
415 # my ($self, $event) = @_; |
|
416 # ($event->{Name} eq 'foo' || $event->{PreserveWS}) ? PRESERVE_WS : START; |
|
417 # } |
|
418 # |
|
419 sub get_init_state |
|
420 { |
|
421 my ($self, $event) = @_; |
|
422 my $tag = $event->{Name}; |
|
423 |
|
424 if ($self->{NoText}->{$tag}) # ELEMENT model does not allow text |
|
425 { |
|
426 return NO_TEXT_START; |
|
427 } |
|
428 $event->{PreserveWS} ? PRESERVE_WS : START; |
|
429 } |
|
430 |
|
431 sub push_event |
|
432 { |
|
433 my ($self, $type, $event) = @_; |
|
434 |
|
435 $event->{EventType} = $type; |
|
436 |
|
437 $self->flush; |
|
438 push @{ $self->{EventQ} }, $event; |
|
439 } |
|
440 |
|
441 # Merge text with previous event (if it has the same EventType) |
|
442 # or push a new text event |
|
443 sub merge_text |
|
444 { |
|
445 my ($self, $str, $eventType, $wsLocation) = @_; |
|
446 my $q = $self->{EventQ}; |
|
447 |
|
448 my $prev = $q->[-1]; |
|
449 if (defined $prev && $prev->{EventType} eq $eventType) |
|
450 { |
|
451 $prev->{Data} .= $str; |
|
452 } |
|
453 else |
|
454 { |
|
455 my $event = { Data => $str }; |
|
456 $event->{Loc} = $wsLocation if defined $wsLocation; |
|
457 $self->push_event ($eventType, $event); |
|
458 } |
|
459 } |
|
460 |
|
461 # Forward all events on the EventQ |
|
462 sub flush |
|
463 { |
|
464 my ($self) = @_; |
|
465 |
|
466 my $q = $self->{EventQ}; |
|
467 while (@$q) |
|
468 { |
|
469 my $event = shift @$q; |
|
470 my $type = $event->{EventType}; |
|
471 delete $event->{EventType}; |
|
472 |
|
473 $self->{Callback}->{$type}->($event); |
|
474 } |
|
475 } |
|
476 |
|
477 1; # package return code |
|
478 |
|
479 __END__ |
|
480 |
|
481 =head1 NAME |
|
482 |
|
483 XML::Filter::DetectWS - A PerlSAX filter that detects ignorable whitespace |
|
484 |
|
485 =head1 SYNOPSIS |
|
486 |
|
487 use XML::Filter::DetectWS; |
|
488 |
|
489 my $detect = new XML::Filter::DetectWS (Handler => $handler, |
|
490 SkipIgnorableWS => 1); |
|
491 |
|
492 =head1 DESCRIPTION |
|
493 |
|
494 This a PerlSAX filter that detects which character data contains |
|
495 ignorable whitespace and optionally filters it. |
|
496 |
|
497 Note that this is just a first stab at the implementation and it may |
|
498 change completely in the near future. Please provide feedback whether |
|
499 you like it or not, so I know whether I should change it. |
|
500 |
|
501 The XML spec defines ignorable whitespace as the character data found in elements |
|
502 that were defined in an <!ELEMENT> declaration with a model of 'EMPTY' or |
|
503 'Children' (Children is the rule that does not contain '#PCDATA'.) |
|
504 |
|
505 In addition, XML::Filter::DetectWS allows the user to define other whitespace to |
|
506 be I<ignorable>. The ignorable whitespace is passed to the PerlSAX Handler with |
|
507 the B<ignorable_whitespace> handler, provided that the Handler implements this |
|
508 method. (Otherwise it is passed to the characters handler.) |
|
509 If the B<SkipIgnorableWS> is set, the ignorable whitespace is simply |
|
510 discarded. |
|
511 |
|
512 XML::Filter::DetectWS also takes xml:space attributes into account. See below |
|
513 for details. |
|
514 |
|
515 CDATA sections are passed in the standard PerlSAX way (i.e. with surrounding |
|
516 start_cdata and end_cdata events), unless the Handler does not implement these |
|
517 methods. In that case, the CDATA section is simply passed to the characters |
|
518 method. |
|
519 |
|
520 =head1 Constructor Options |
|
521 |
|
522 =over 4 |
|
523 |
|
524 =item * SkipIgnorableWS (Default: 0) |
|
525 |
|
526 When set, detected ignorable whitespace is discarded. |
|
527 |
|
528 =item * Handler |
|
529 |
|
530 The PerlSAX handler (or filter) that will receive the PerlSAX events from this |
|
531 filter. |
|
532 |
|
533 =back |
|
534 |
|
535 =head1 Current Implementation |
|
536 |
|
537 When determining which whitespace is ignorable, it first looks at the |
|
538 xml:space attribute of the parent element node (and its ancestors.) |
|
539 If the attribute value is "preserve", then it is *NOT* ignorable. |
|
540 (If someone took the trouble of adding xml:space="preserve", then that is |
|
541 the final answer...) |
|
542 |
|
543 If xml:space="default", then we look at the <!ELEMENT> definition of the parent |
|
544 element. If the model is 'EMPTY' or follows the 'Children' rule (i.e. does not |
|
545 contain '#PCDATA') then we know that the whitespace is ignorable. |
|
546 Otherwise we need input from the user somehow. |
|
547 |
|
548 The idea is that the API of DetectWS will be extended, so that you can |
|
549 specify/override e.g. which elements should behave as if xml:space="preserve" |
|
550 were set, and/or which elements should behave as if the <!ELEMENT> model was |
|
551 defined a certain way, etc. |
|
552 |
|
553 Please send feedback! |
|
554 |
|
555 The current implementation also detects whitespace after an element-start tag, |
|
556 whitespace before an element-end tag. |
|
557 It also detects whitespace before an element-start and after an element-end tag |
|
558 and before or after comments, processing instruction, cdata sections etc., |
|
559 but this needs to be reimplemented. |
|
560 In either case, the detected whitespace is split off into its own PerlSAX |
|
561 characters event and an extra property 'Loc' is added. It can have 4 possible |
|
562 values: |
|
563 |
|
564 =over 4 |
|
565 |
|
566 =item * 1 (WS_START) - whitespace immediately after element-start tag |
|
567 |
|
568 =item * 2 (WS_END) - whitespace just before element-end tag |
|
569 |
|
570 =item * 3 (WS_ONLY) - both WS_START and WS_END, i.e. it's the only text found between the start and end tag and it's all whitespace |
|
571 |
|
572 =item * 0 (WS_INTER) - none of the above, probably before an element-start tag, |
|
573 after an element-end tag, or before or after a comment, PI, cdata section etc. |
|
574 |
|
575 =back |
|
576 |
|
577 Note that WS_INTER may not be that useful, so this may change. |
|
578 |
|
579 =head1 xml:space attribute |
|
580 |
|
581 The XML spec states that: A special attribute |
|
582 named xml:space may be attached to an element |
|
583 to signal an intention that in that element, |
|
584 white space should be preserved by applications. |
|
585 In valid documents, this attribute, like any other, must be |
|
586 declared if it is used. |
|
587 When declared, it must be given as an |
|
588 enumerated type whose only |
|
589 possible values are "default" and "preserve". |
|
590 For example: |
|
591 |
|
592 <!ATTLIST poem xml:space (default|preserve) 'preserve'> |
|
593 |
|
594 The value "default" signals that applications' |
|
595 default white-space processing modes are acceptable for this element; the |
|
596 value "preserve" indicates the intent that applications preserve |
|
597 all the white space. |
|
598 This declared intent is considered to apply to all elements within the content |
|
599 of the element where it is specified, unless overriden with another instance |
|
600 of the xml:space attribute. |
|
601 |
|
602 The root element of any document |
|
603 is considered to have signaled no intentions as regards application space |
|
604 handling, unless it provides a value for |
|
605 this attribute or the attribute is declared with a default value. |
|
606 |
|
607 [... end of excerpt ...] |
|
608 |
|
609 =head1 CAVEATS |
|
610 |
|
611 This code is highly experimental! |
|
612 It has not been tested well and the API may change. |
|
613 |
|
614 The code that detects of blocks of whitespace at potential indent positions |
|
615 may need some work. See |
|
616 |
|
617 =head1 AUTHOR |
|
618 |
|
619 Send bug reports, hints, tips, suggestions to Enno Derksen at |
|
620 <F<enno@att.com>>. |
|
621 |
|
622 =cut |