dummy_foundation/lib/XML/Filter/DetectWS.pm
changeset 0 02cd6b52f378
equal deleted inserted replaced
-1:000000000000 0:02cd6b52f378
       
     1 package XML::Filter::DetectWS;
       
     2 use strict;
       
     3 use XML::Filter::SAXT;
       
     4 
       
     5 #----------------------------------------------------------------------
       
     6 #	CONSTANT DEFINITIONS
       
     7 #----------------------------------------------------------------------
       
     8 
       
     9 # Locations of whitespace
       
    10 sub WS_START	(%) { 1 }	# just after <a>
       
    11 sub WS_END	(%) { 2 }	# just before </a>
       
    12 sub WS_INTER	(%) { 0 }	# not at the start or end (i.e. intermediate)
       
    13 sub WS_ONLY	(%) { 3 }	# both START and END, i.e. between <a> and </a>
       
    14 
       
    15 # The states of the WhiteSpace detection code
       
    16 # for regular elements, i.e. elements that:
       
    17 # 1) don't have xml:space="preserve"
       
    18 # 2) have an ELEMENT model that allows text children (i.e. ANY or Mixed content)
       
    19 
       
    20 sub START          (%) { 0 }	# just saw <elem>
       
    21 sub ONLY_WS        (%) { 1 }	# saw <elem> followed by whitespace (only)
       
    22 sub ENDS_IN_WS	   (%) { 2 }	# ends in whitespace (sofar)
       
    23 sub ENDS_IN_NON_WS (%) { 3 }	# ends in non-ws text or non-text node (sofar)
       
    24 
       
    25 # NO_TEXT States: when <!ELEMENT> model does not allow text
       
    26 # (we assume that all text children are whitespace)
       
    27 sub NO_TEXT_START	   (%) { 4 }	# just saw <elem>
       
    28 sub NO_TEXT_ONLY_WS        (%) { 5 }	# saw <elem> followed by whitespace (only)
       
    29 sub NO_TEXT_ENDS_IN_WS	   (%) { 6 }	# ends in whitespace (sofar)
       
    30 sub NO_TEXT_ENDS_IN_NON_WS (%) { 7 }	# ends in non-text node (sofar)
       
    31 
       
    32 # State for elements with xml:space="preserve" (all text is non-WS)
       
    33 sub PRESERVE_WS    (%) { 8 }
       
    34 
       
    35 #----------------------------------------------------------------------
       
    36 #	METHOD DEFINITIONS
       
    37 #----------------------------------------------------------------------
       
    38 
       
    39 # Constructor options:
       
    40 #
       
    41 # SkipIgnorableWS	1 means: don't forward ignorable_whitespace events
       
    42 # Handler		SAX Handler that will receive the resulting events
       
    43 #
       
    44 
       
    45 sub new
       
    46 {
       
    47     my ($class, %options) = @_;
       
    48 
       
    49     my $self = bless \%options, $class;
       
    50 
       
    51     $self->init_handlers;
       
    52 
       
    53     $self;
       
    54 }
       
    55 
       
    56 # Does nothing
       
    57 sub noop {}
       
    58 
       
    59 sub init_handlers
       
    60 {
       
    61     my ($self) = @_;
       
    62     my %handlers;
       
    63     
       
    64     my $handler = $self->{Handler};
       
    65     
       
    66     for my $cb (map { @{$_} } values %XML::Filter::SAXT::SAX_HANDLERS)
       
    67     {
       
    68 	if (UNIVERSAL::can ($handler, $cb))
       
    69 	{
       
    70 	    $handlers{$cb} = eval "sub { \$handler->$cb (\@_) }";
       
    71 	}
       
    72 	else
       
    73 	{
       
    74 	    $handlers{$cb} = \&noop;
       
    75 	}
       
    76     }
       
    77 
       
    78     if ($self->{SkipIgnorableWS})
       
    79     {
       
    80 	delete $handlers{ignorable_whitespace};	# if it exists
       
    81     }
       
    82     elsif (UNIVERSAL::can ($handler, 'ignorable_whitespace'))
       
    83     {
       
    84 	# Support ignorable_whitespace callback if it exists
       
    85 	# (if not, just use characters callback)
       
    86 	$handlers{ignorable_whitespace} = 
       
    87 	    sub { $handler->ignorable_whitespace (@_) };
       
    88     }
       
    89     else
       
    90     {
       
    91 	$handlers{ignorable_whitespace} = $handlers{characters};
       
    92     }
       
    93 
       
    94     $handlers{ws} = $handlers{characters};    
       
    95 #?? were should whitespace go?
       
    96 
       
    97     # NOTE: 'cdata' is not a valid PerlSAX callback
       
    98     if (UNIVERSAL::can ($handler, 'start_cdata') &&
       
    99 	UNIVERSAL::can ($handler, 'end_cdata'))
       
   100     {
       
   101 	$handlers{cdata} = sub {
       
   102 	    $handler->start_cdata;
       
   103 	    $handler->characters (@_);
       
   104 	    $handler->end_cdata;
       
   105 	}
       
   106     }
       
   107     else	# pass CDATA as regular characters
       
   108     {
       
   109 	$handlers{cdata} = $handlers{characters};
       
   110     }
       
   111 
       
   112     $self->{Callback} = \%handlers;
       
   113 }
       
   114 
       
   115 sub start_cdata
       
   116 {
       
   117     my ($self, $event) = @_;
       
   118 
       
   119     $self->{InCDATA} = 1;
       
   120 }
       
   121 
       
   122 sub end_cdata
       
   123 {
       
   124     my ($self, $event) = @_;
       
   125 
       
   126     $self->{InCDATA} = 0;
       
   127 }
       
   128 
       
   129 sub entity_reference
       
   130 {
       
   131     my ($self, $event) = @_;
       
   132     
       
   133     $self->push_event ('entity_reference', $event);
       
   134 
       
   135     my $parent = $self->{ParentStack}->[-1];
       
   136     $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
       
   137 }
       
   138 
       
   139 sub comment
       
   140 {
       
   141     my ($self, $event) = @_;
       
   142     
       
   143     $self->push_event ('comment', $event);
       
   144 
       
   145     my $parent = $self->{ParentStack}->[-1];
       
   146     $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
       
   147 }
       
   148 
       
   149 sub processing_instruction
       
   150 {
       
   151     my ($self, $event) = @_;
       
   152     
       
   153     $self->push_event ('processing_instruction', $event);
       
   154 
       
   155     my $parent = $self->{ParentStack}->[-1];
       
   156     $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
       
   157 }
       
   158 
       
   159 sub start_document
       
   160 {
       
   161     my ($self, $event) = @_;
       
   162 
       
   163     # Initialize initial state
       
   164     $self->{ParentStack} = [];
       
   165     $self->{EventQ} = [];
       
   166     $self->{InCDATA} = 0;
       
   167 
       
   168     $self->init_handlers;
       
   169 
       
   170     $event = {} unless defined $event;
       
   171     # Don't preserve WS by default (unless specified by the user)
       
   172     $event->{PreserveWS} = defined ($self->{PreserveWS}) ? 
       
   173 					$self->{PreserveWS} : 0;
       
   174 
       
   175     # We don't need whitespace detection at the document level
       
   176     $event->{State} = PRESERVE_WS;
       
   177 
       
   178     $self->push_event ('start_document', $event);
       
   179     push @{ $self->{ParentStack} }, $event;
       
   180 }
       
   181 
       
   182 sub end_document
       
   183 {
       
   184     my ($self, $event) = @_;
       
   185     $event = {} unless defined $event;
       
   186 
       
   187     $self->push_event ('end_document', $event);
       
   188 
       
   189     $self->flush;
       
   190 }
       
   191 
       
   192 sub start_element
       
   193 {
       
   194     my ($self, $event) = @_;
       
   195 
       
   196     my $pres = $event->{Attributes}->{'xml:space'};
       
   197     if (defined $pres)
       
   198     {
       
   199 	$event->{PreserveWS} = $pres eq "preserve";
       
   200     }
       
   201     else
       
   202     {
       
   203 	$event->{PreserveWS} = $self->{ParentStack}->[-1]->{PreserveWS};
       
   204     }
       
   205 
       
   206     if ($self->{NoText}->{ $event->{Name} })
       
   207     {
       
   208 	$event->{NoText} = 1;
       
   209     }
       
   210 
       
   211     $event->{State} = $self->get_init_state ($event);
       
   212 
       
   213     $self->push_event ('start_element', $event);
       
   214     push @{ $self->{ParentStack} }, $event;
       
   215 }
       
   216 
       
   217 sub end_element
       
   218 {
       
   219     my ($self, $event) = @_;
       
   220 
       
   221     # Mark previous whitespace event as the last event (WS_END)
       
   222     # (if it's there)
       
   223     my $prev = $self->{EventQ}->[-1];
       
   224     $prev->{Loc} |= WS_END if exists $prev->{Loc};
       
   225 
       
   226     $self->push_event ('end_element', $event);
       
   227     
       
   228     my $elem = pop @{ $self->{ParentStack} };
       
   229     delete $elem->{State};
       
   230 }
       
   231 
       
   232 sub characters
       
   233 {
       
   234     my ($self, $event) = @_;
       
   235 
       
   236     if ($self->{InCDATA})
       
   237     {
       
   238 	# NOTE: 'cdata' is not a valid PerlSAX callback
       
   239 	$self->push_event ('cdata', $event);
       
   240 	
       
   241 	my $parent = $self->{ParentStack}->[-1];
       
   242 	$parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
       
   243 	return;
       
   244     }
       
   245 
       
   246     my $text = $event->{Data};
       
   247     return unless length ($text);
       
   248 
       
   249     my $state = $self->{ParentStack}->[-1]->{State};
       
   250     if ($state == PRESERVE_WS)
       
   251     {
       
   252 	$self->push_event ('characters', $event);
       
   253     }
       
   254     elsif ($state == NO_TEXT_START)
       
   255     {
       
   256 	# ELEMENT model does not allow regular text.
       
   257 	# All characters are whitespace.
       
   258 	$self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_START });
       
   259 	$state = NO_TEXT_ONLY_WS;
       
   260     }
       
   261     elsif ($state == NO_TEXT_ONLY_WS)
       
   262     {
       
   263 	$self->merge_text ($text, 'ignorable_whitespace', WS_START );
       
   264     }
       
   265     elsif ($state == NO_TEXT_ENDS_IN_NON_WS)
       
   266     {
       
   267 	$self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_INTER });
       
   268 	$state = NO_TEXT_ENDS_IN_WS;
       
   269     }
       
   270     elsif ($state == NO_TEXT_ENDS_IN_WS)
       
   271     {
       
   272 	$self->merge_text ($text, 'ignorable_whitespace', WS_INTER );
       
   273     }
       
   274     elsif ($state == START)
       
   275     {
       
   276 #?? add support for full Unicode
       
   277 	$text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/;
       
   278 	if (length $1)
       
   279 	{
       
   280 	    $self->push_event ('ws', { Data => $1, Loc => WS_START });
       
   281 	    $state = ONLY_WS;
       
   282 	}
       
   283 	if (length $2)
       
   284 	{
       
   285 	    $self->push_event ('characters', { Data => $2 });
       
   286 	    $state = ENDS_IN_NON_WS;
       
   287 	}
       
   288 	if (length $3)
       
   289 	{
       
   290 	    $self->push_event ('ws', { Data => $3, Loc => WS_INTER });
       
   291 	    $state = ENDS_IN_WS;
       
   292 	}
       
   293     }
       
   294     elsif ($state == ONLY_WS)
       
   295     {
       
   296 	$text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/;
       
   297 	if (length $1)
       
   298 	{
       
   299 	    $self->merge_text ($1, 'ws', WS_START);
       
   300 	}
       
   301 	if (length $2)
       
   302 	{
       
   303 	    $self->push_event ('characters', { Data => $2 });
       
   304 	    $state = ENDS_IN_NON_WS;	    
       
   305 	}
       
   306 	if (length $3)
       
   307 	{
       
   308 	    $self->push_event ('ws', { Data => $3, Loc => WS_INTER });
       
   309 	    $state = ENDS_IN_WS;	    
       
   310 	}
       
   311     }
       
   312     else # state == ENDS_IN_WS or ENDS_IN_NON_WS
       
   313     {
       
   314 	$text =~ /^(.*\S)?(\s*)$/;
       
   315 	if (length $1)
       
   316 	{
       
   317 	    if ($state == ENDS_IN_NON_WS)
       
   318 	    {
       
   319 		$self->merge_text ($1, 'characters');
       
   320 	    }
       
   321 	    else
       
   322 	    {
       
   323 		$self->push_event ('characters', { Data => $1 });
       
   324 		$state = ENDS_IN_NON_WS;	    
       
   325 	    }
       
   326 	}
       
   327 	if (length $2)
       
   328 	{
       
   329 	    if ($state == ENDS_IN_WS)
       
   330 	    {
       
   331 		$self->merge_text ($2, 'ws', WS_INTER);
       
   332 	    }
       
   333 	    else
       
   334 	    {
       
   335 		$self->push_event ('ws', { Data => $2, Loc => WS_INTER });
       
   336 		$state = ENDS_IN_WS;
       
   337 	    }
       
   338 	}
       
   339     }
       
   340 
       
   341     $self->{ParentStack}->[-1]->{State} = $state;
       
   342 }
       
   343 
       
   344 sub element_decl
       
   345 {
       
   346     my ($self, $event) = @_;
       
   347     my $tag = $event->{Name};
       
   348     my $model = $event->{Model};
       
   349 
       
   350     # Check the model to see if the elements may contain regular text
       
   351     $self->{NoText}->{$tag} = ($model eq 'EMPTY' || $model !~ /\#PCDATA/);
       
   352 
       
   353     $self->push_event ('element_decl', $event);
       
   354 }
       
   355 
       
   356 sub attlist_decl
       
   357 {
       
   358     my ($self, $event) = @_;
       
   359     
       
   360     my $prev = $self->{EventQ}->[-1];
       
   361     if ($prev->{EventType} eq 'attlist_decl' && 
       
   362 	$prev->{ElementName} eq $event->{ElementName})
       
   363     {
       
   364 	$prev->{MoreFollow} = 1;
       
   365 	$event->{First} = 0;
       
   366     }
       
   367     else
       
   368     {
       
   369 	$event->{First} = 1;
       
   370     }
       
   371 
       
   372     $self->push_event ('attlist_decl', $event);
       
   373 }
       
   374 
       
   375 sub notation_decl
       
   376 {
       
   377     my ($self, $event) = @_;
       
   378     $self->push_event ('notation_decl', $event);
       
   379 }
       
   380 
       
   381 sub unparsed_entity_decl
       
   382 {
       
   383     my ($self, $event) = @_;
       
   384     $self->push_event ('unparsed_entity_decl', $event);
       
   385 }
       
   386 
       
   387 sub entity_decl
       
   388 {
       
   389     my ($self, $event) = @_;
       
   390     $self->push_event ('entity_decl', $event);
       
   391 }
       
   392 
       
   393 sub doctype_decl
       
   394 {
       
   395     my ($self, $event) = @_;
       
   396     $self->push_event ('doctype_decl', $event);
       
   397 }
       
   398 
       
   399 sub xml_decl
       
   400 {
       
   401     my ($self, $event) = @_;
       
   402     $self->push_event ('xml_decl', $event);
       
   403 }
       
   404 
       
   405 #?? what about set_document_locator, resolve_entity
       
   406 
       
   407 #
       
   408 # Determine the initial State for the current Element.
       
   409 # By default, we look at the PreserveWS property (i.e. value of xml:space.)
       
   410 # The user can override this to force xml:space="preserve" for a particular
       
   411 # element with e.g.
       
   412 #
       
   413 # sub get_init_state
       
   414 # {
       
   415 #    my ($self, $event) = @_;
       
   416 #    ($event->{Name} eq 'foo' || $event->{PreserveWS}) ? PRESERVE_WS : START;
       
   417 # }
       
   418 #
       
   419 sub get_init_state
       
   420 {
       
   421     my ($self, $event) = @_;
       
   422     my $tag = $event->{Name};
       
   423 
       
   424     if ($self->{NoText}->{$tag})	# ELEMENT model does not allow text
       
   425     {
       
   426 	return NO_TEXT_START;
       
   427     }
       
   428     $event->{PreserveWS} ? PRESERVE_WS : START;
       
   429 }
       
   430 
       
   431 sub push_event
       
   432 {
       
   433     my ($self, $type, $event) = @_;
       
   434 
       
   435     $event->{EventType} = $type;
       
   436 
       
   437     $self->flush;
       
   438     push @{ $self->{EventQ} }, $event;
       
   439 }
       
   440 
       
   441 # Merge text with previous event (if it has the same EventType)
       
   442 # or push a new text event
       
   443 sub merge_text
       
   444 {
       
   445     my ($self, $str, $eventType, $wsLocation) = @_;
       
   446     my $q = $self->{EventQ};
       
   447 
       
   448     my $prev = $q->[-1];
       
   449     if (defined $prev && $prev->{EventType} eq $eventType)
       
   450     {
       
   451 	$prev->{Data} .= $str;
       
   452     }
       
   453     else
       
   454     {
       
   455 	my $event = { Data => $str };
       
   456 	$event->{Loc} = $wsLocation if defined $wsLocation;
       
   457 	$self->push_event ($eventType, $event);
       
   458     }
       
   459 }
       
   460 
       
   461 # Forward all events on the EventQ
       
   462 sub flush
       
   463 {
       
   464     my ($self) = @_;
       
   465 
       
   466     my $q = $self->{EventQ};
       
   467     while (@$q)
       
   468     {
       
   469 	my $event = shift @$q;
       
   470 	my $type = $event->{EventType};
       
   471 	delete $event->{EventType};
       
   472 
       
   473 	$self->{Callback}->{$type}->($event);
       
   474     }
       
   475 }
       
   476 
       
   477 1; # package return code
       
   478 
       
   479 __END__
       
   480 
       
   481 =head1 NAME
       
   482 
       
   483 XML::Filter::DetectWS - A PerlSAX filter that detects ignorable whitespace
       
   484 
       
   485 =head1 SYNOPSIS
       
   486 
       
   487  use XML::Filter::DetectWS;
       
   488 
       
   489  my $detect = new XML::Filter::DetectWS (Handler => $handler,
       
   490 					 SkipIgnorableWS => 1);
       
   491 
       
   492 =head1 DESCRIPTION
       
   493 
       
   494 This a PerlSAX filter that detects which character data contains 
       
   495 ignorable whitespace and optionally filters it.
       
   496 
       
   497 Note that this is just a first stab at the implementation and it may
       
   498 change completely in the near future. Please provide feedback whether
       
   499 you like it or not, so I know whether I should change it.
       
   500 
       
   501 The XML spec defines ignorable whitespace as the character data found in elements
       
   502 that were defined in an <!ELEMENT> declaration with a model of 'EMPTY' or
       
   503 'Children' (Children is the rule that does not contain '#PCDATA'.)
       
   504 
       
   505 In addition, XML::Filter::DetectWS allows the user to define other whitespace to 
       
   506 be I<ignorable>. The ignorable whitespace is passed to the PerlSAX Handler with
       
   507 the B<ignorable_whitespace> handler, provided that the Handler implements this 
       
   508 method. (Otherwise it is passed to the characters handler.)
       
   509 If the B<SkipIgnorableWS> is set, the ignorable whitespace is simply
       
   510 discarded.
       
   511 
       
   512 XML::Filter::DetectWS also takes xml:space attributes into account. See below
       
   513 for details.
       
   514 
       
   515 CDATA sections are passed in the standard PerlSAX way (i.e. with surrounding
       
   516 start_cdata and end_cdata events), unless the Handler does not implement these
       
   517 methods. In that case, the CDATA section is simply passed to the characters 
       
   518 method.
       
   519 
       
   520 =head1 Constructor Options
       
   521 
       
   522 =over 4
       
   523 
       
   524 =item * SkipIgnorableWS (Default: 0)
       
   525 
       
   526 When set, detected ignorable whitespace is discarded.
       
   527 
       
   528 =item * Handler
       
   529 
       
   530 The PerlSAX handler (or filter) that will receive the PerlSAX events from this 
       
   531 filter.
       
   532 
       
   533 =back
       
   534 
       
   535 =head1 Current Implementation
       
   536 
       
   537 When determining which whitespace is ignorable, it first looks at the
       
   538 xml:space attribute of the parent element node (and its ancestors.) 
       
   539 If the attribute value is "preserve", then it is *NOT* ignorable.
       
   540 (If someone took the trouble of adding xml:space="preserve", then that is
       
   541 the final answer...)
       
   542 
       
   543 If xml:space="default", then we look at the <!ELEMENT> definition of the parent
       
   544 element. If the model is 'EMPTY' or follows the 'Children' rule (i.e. does not
       
   545 contain '#PCDATA') then we know that the whitespace is ignorable.
       
   546 Otherwise we need input from the user somehow.
       
   547 
       
   548 The idea is that the API of DetectWS will be extended, so that you can
       
   549 specify/override e.g. which elements should behave as if xml:space="preserve" 
       
   550 were set, and/or which elements should behave as if the <!ELEMENT> model was
       
   551 defined a certain way, etc.
       
   552 
       
   553 Please send feedback!
       
   554 
       
   555 The current implementation also detects whitespace after an element-start tag,
       
   556 whitespace before an element-end tag. 
       
   557 It also detects whitespace before an element-start and after an element-end tag
       
   558 and before or after comments, processing instruction, cdata sections etc.,
       
   559 but this needs to be reimplemented.
       
   560 In either case, the detected whitespace is split off into its own PerlSAX
       
   561 characters event and an extra property 'Loc' is added. It can have 4 possible
       
   562 values:
       
   563 
       
   564 =over 4
       
   565 
       
   566 =item * 1 (WS_START) - whitespace immediately after element-start tag
       
   567 
       
   568 =item * 2 (WS_END) - whitespace just before element-end tag
       
   569 
       
   570 =item * 3 (WS_ONLY) - both WS_START and WS_END, i.e. it's the only text found between the start and end tag and it's all whitespace
       
   571 
       
   572 =item * 0 (WS_INTER) - none of the above, probably before an element-start tag,
       
   573 after an element-end tag, or before or after a comment, PI, cdata section etc.
       
   574 
       
   575 =back
       
   576 
       
   577 Note that WS_INTER may not be that useful, so this may change.
       
   578 
       
   579 =head1 xml:space attribute
       
   580 
       
   581 The XML spec states that: A special attribute
       
   582 named xml:space may be attached to an element
       
   583 to signal an intention that in that element,
       
   584 white space should be preserved by applications.
       
   585 In valid documents, this attribute, like any other, must be 
       
   586 declared if it is used.
       
   587 When declared, it must be given as an 
       
   588 enumerated type whose only
       
   589 possible values are "default" and "preserve".
       
   590 For example:
       
   591 
       
   592  <!ATTLIST poem   xml:space (default|preserve) 'preserve'>
       
   593 
       
   594 The value "default" signals that applications'
       
   595 default white-space processing modes are acceptable for this element; the
       
   596 value "preserve" indicates the intent that applications preserve
       
   597 all the white space.
       
   598 This declared intent is considered to apply to all elements within the content
       
   599 of the element where it is specified, unless overriden with another instance
       
   600 of the xml:space attribute.
       
   601 
       
   602 The root element of any document
       
   603 is considered to have signaled no intentions as regards application space
       
   604 handling, unless it provides a value for 
       
   605 this attribute or the attribute is declared with a default value.
       
   606 
       
   607 [... end of excerpt ...]
       
   608 
       
   609 =head1 CAVEATS
       
   610 
       
   611 This code is highly experimental! 
       
   612 It has not been tested well and the API may change.
       
   613 
       
   614 The code that detects of blocks of whitespace at potential indent positions
       
   615 may need some work. See 
       
   616 
       
   617 =head1 AUTHOR
       
   618 
       
   619 Send bug reports, hints, tips, suggestions to Enno Derksen at
       
   620 <F<enno@att.com>>. 
       
   621 
       
   622 =cut