--- a/dummy_foundation/lib/XML/Filter/DetectWS.pm Wed Jun 03 18:33:51 2009 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,622 +0,0 @@
-package XML::Filter::DetectWS;
-use strict;
-use XML::Filter::SAXT;
-
-#----------------------------------------------------------------------
-# CONSTANT DEFINITIONS
-#----------------------------------------------------------------------
-
-# Locations of whitespace
-sub WS_START (%) { 1 } # just after <a>
-sub WS_END (%) { 2 } # just before </a>
-sub WS_INTER (%) { 0 } # not at the start or end (i.e. intermediate)
-sub WS_ONLY (%) { 3 } # both START and END, i.e. between <a> and </a>
-
-# The states of the WhiteSpace detection code
-# for regular elements, i.e. elements that:
-# 1) don't have xml:space="preserve"
-# 2) have an ELEMENT model that allows text children (i.e. ANY or Mixed content)
-
-sub START (%) { 0 } # just saw <elem>
-sub ONLY_WS (%) { 1 } # saw <elem> followed by whitespace (only)
-sub ENDS_IN_WS (%) { 2 } # ends in whitespace (sofar)
-sub ENDS_IN_NON_WS (%) { 3 } # ends in non-ws text or non-text node (sofar)
-
-# NO_TEXT States: when <!ELEMENT> model does not allow text
-# (we assume that all text children are whitespace)
-sub NO_TEXT_START (%) { 4 } # just saw <elem>
-sub NO_TEXT_ONLY_WS (%) { 5 } # saw <elem> followed by whitespace (only)
-sub NO_TEXT_ENDS_IN_WS (%) { 6 } # ends in whitespace (sofar)
-sub NO_TEXT_ENDS_IN_NON_WS (%) { 7 } # ends in non-text node (sofar)
-
-# State for elements with xml:space="preserve" (all text is non-WS)
-sub PRESERVE_WS (%) { 8 }
-
-#----------------------------------------------------------------------
-# METHOD DEFINITIONS
-#----------------------------------------------------------------------
-
-# Constructor options:
-#
-# SkipIgnorableWS 1 means: don't forward ignorable_whitespace events
-# Handler SAX Handler that will receive the resulting events
-#
-
-sub new
-{
- my ($class, %options) = @_;
-
- my $self = bless \%options, $class;
-
- $self->init_handlers;
-
- $self;
-}
-
-# Does nothing
-sub noop {}
-
-sub init_handlers
-{
- my ($self) = @_;
- my %handlers;
-
- my $handler = $self->{Handler};
-
- for my $cb (map { @{$_} } values %XML::Filter::SAXT::SAX_HANDLERS)
- {
- if (UNIVERSAL::can ($handler, $cb))
- {
- $handlers{$cb} = eval "sub { \$handler->$cb (\@_) }";
- }
- else
- {
- $handlers{$cb} = \&noop;
- }
- }
-
- if ($self->{SkipIgnorableWS})
- {
- delete $handlers{ignorable_whitespace}; # if it exists
- }
- elsif (UNIVERSAL::can ($handler, 'ignorable_whitespace'))
- {
- # Support ignorable_whitespace callback if it exists
- # (if not, just use characters callback)
- $handlers{ignorable_whitespace} =
- sub { $handler->ignorable_whitespace (@_) };
- }
- else
- {
- $handlers{ignorable_whitespace} = $handlers{characters};
- }
-
- $handlers{ws} = $handlers{characters};
-#?? were should whitespace go?
-
- # NOTE: 'cdata' is not a valid PerlSAX callback
- if (UNIVERSAL::can ($handler, 'start_cdata') &&
- UNIVERSAL::can ($handler, 'end_cdata'))
- {
- $handlers{cdata} = sub {
- $handler->start_cdata;
- $handler->characters (@_);
- $handler->end_cdata;
- }
- }
- else # pass CDATA as regular characters
- {
- $handlers{cdata} = $handlers{characters};
- }
-
- $self->{Callback} = \%handlers;
-}
-
-sub start_cdata
-{
- my ($self, $event) = @_;
-
- $self->{InCDATA} = 1;
-}
-
-sub end_cdata
-{
- my ($self, $event) = @_;
-
- $self->{InCDATA} = 0;
-}
-
-sub entity_reference
-{
- my ($self, $event) = @_;
-
- $self->push_event ('entity_reference', $event);
-
- my $parent = $self->{ParentStack}->[-1];
- $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
-}
-
-sub comment
-{
- my ($self, $event) = @_;
-
- $self->push_event ('comment', $event);
-
- my $parent = $self->{ParentStack}->[-1];
- $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
-}
-
-sub processing_instruction
-{
- my ($self, $event) = @_;
-
- $self->push_event ('processing_instruction', $event);
-
- my $parent = $self->{ParentStack}->[-1];
- $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
-}
-
-sub start_document
-{
- my ($self, $event) = @_;
-
- # Initialize initial state
- $self->{ParentStack} = [];
- $self->{EventQ} = [];
- $self->{InCDATA} = 0;
-
- $self->init_handlers;
-
- $event = {} unless defined $event;
- # Don't preserve WS by default (unless specified by the user)
- $event->{PreserveWS} = defined ($self->{PreserveWS}) ?
- $self->{PreserveWS} : 0;
-
- # We don't need whitespace detection at the document level
- $event->{State} = PRESERVE_WS;
-
- $self->push_event ('start_document', $event);
- push @{ $self->{ParentStack} }, $event;
-}
-
-sub end_document
-{
- my ($self, $event) = @_;
- $event = {} unless defined $event;
-
- $self->push_event ('end_document', $event);
-
- $self->flush;
-}
-
-sub start_element
-{
- my ($self, $event) = @_;
-
- my $pres = $event->{Attributes}->{'xml:space'};
- if (defined $pres)
- {
- $event->{PreserveWS} = $pres eq "preserve";
- }
- else
- {
- $event->{PreserveWS} = $self->{ParentStack}->[-1]->{PreserveWS};
- }
-
- if ($self->{NoText}->{ $event->{Name} })
- {
- $event->{NoText} = 1;
- }
-
- $event->{State} = $self->get_init_state ($event);
-
- $self->push_event ('start_element', $event);
- push @{ $self->{ParentStack} }, $event;
-}
-
-sub end_element
-{
- my ($self, $event) = @_;
-
- # Mark previous whitespace event as the last event (WS_END)
- # (if it's there)
- my $prev = $self->{EventQ}->[-1];
- $prev->{Loc} |= WS_END if exists $prev->{Loc};
-
- $self->push_event ('end_element', $event);
-
- my $elem = pop @{ $self->{ParentStack} };
- delete $elem->{State};
-}
-
-sub characters
-{
- my ($self, $event) = @_;
-
- if ($self->{InCDATA})
- {
- # NOTE: 'cdata' is not a valid PerlSAX callback
- $self->push_event ('cdata', $event);
-
- my $parent = $self->{ParentStack}->[-1];
- $parent->{State} |= ENDS_IN_NON_WS unless $parent->{State} == PRESERVE_WS;
- return;
- }
-
- my $text = $event->{Data};
- return unless length ($text);
-
- my $state = $self->{ParentStack}->[-1]->{State};
- if ($state == PRESERVE_WS)
- {
- $self->push_event ('characters', $event);
- }
- elsif ($state == NO_TEXT_START)
- {
- # ELEMENT model does not allow regular text.
- # All characters are whitespace.
- $self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_START });
- $state = NO_TEXT_ONLY_WS;
- }
- elsif ($state == NO_TEXT_ONLY_WS)
- {
- $self->merge_text ($text, 'ignorable_whitespace', WS_START );
- }
- elsif ($state == NO_TEXT_ENDS_IN_NON_WS)
- {
- $self->push_event ('ignorable_whitespace', { Data => $text, Loc => WS_INTER });
- $state = NO_TEXT_ENDS_IN_WS;
- }
- elsif ($state == NO_TEXT_ENDS_IN_WS)
- {
- $self->merge_text ($text, 'ignorable_whitespace', WS_INTER );
- }
- elsif ($state == START)
- {
-#?? add support for full Unicode
- $text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/;
- if (length $1)
- {
- $self->push_event ('ws', { Data => $1, Loc => WS_START });
- $state = ONLY_WS;
- }
- if (length $2)
- {
- $self->push_event ('characters', { Data => $2 });
- $state = ENDS_IN_NON_WS;
- }
- if (length $3)
- {
- $self->push_event ('ws', { Data => $3, Loc => WS_INTER });
- $state = ENDS_IN_WS;
- }
- }
- elsif ($state == ONLY_WS)
- {
- $text =~ /^(\s*)(\S(?:.*\S)?)?(\s*)$/;
- if (length $1)
- {
- $self->merge_text ($1, 'ws', WS_START);
- }
- if (length $2)
- {
- $self->push_event ('characters', { Data => $2 });
- $state = ENDS_IN_NON_WS;
- }
- if (length $3)
- {
- $self->push_event ('ws', { Data => $3, Loc => WS_INTER });
- $state = ENDS_IN_WS;
- }
- }
- else # state == ENDS_IN_WS or ENDS_IN_NON_WS
- {
- $text =~ /^(.*\S)?(\s*)$/;
- if (length $1)
- {
- if ($state == ENDS_IN_NON_WS)
- {
- $self->merge_text ($1, 'characters');
- }
- else
- {
- $self->push_event ('characters', { Data => $1 });
- $state = ENDS_IN_NON_WS;
- }
- }
- if (length $2)
- {
- if ($state == ENDS_IN_WS)
- {
- $self->merge_text ($2, 'ws', WS_INTER);
- }
- else
- {
- $self->push_event ('ws', { Data => $2, Loc => WS_INTER });
- $state = ENDS_IN_WS;
- }
- }
- }
-
- $self->{ParentStack}->[-1]->{State} = $state;
-}
-
-sub element_decl
-{
- my ($self, $event) = @_;
- my $tag = $event->{Name};
- my $model = $event->{Model};
-
- # Check the model to see if the elements may contain regular text
- $self->{NoText}->{$tag} = ($model eq 'EMPTY' || $model !~ /\#PCDATA/);
-
- $self->push_event ('element_decl', $event);
-}
-
-sub attlist_decl
-{
- my ($self, $event) = @_;
-
- my $prev = $self->{EventQ}->[-1];
- if ($prev->{EventType} eq 'attlist_decl' &&
- $prev->{ElementName} eq $event->{ElementName})
- {
- $prev->{MoreFollow} = 1;
- $event->{First} = 0;
- }
- else
- {
- $event->{First} = 1;
- }
-
- $self->push_event ('attlist_decl', $event);
-}
-
-sub notation_decl
-{
- my ($self, $event) = @_;
- $self->push_event ('notation_decl', $event);
-}
-
-sub unparsed_entity_decl
-{
- my ($self, $event) = @_;
- $self->push_event ('unparsed_entity_decl', $event);
-}
-
-sub entity_decl
-{
- my ($self, $event) = @_;
- $self->push_event ('entity_decl', $event);
-}
-
-sub doctype_decl
-{
- my ($self, $event) = @_;
- $self->push_event ('doctype_decl', $event);
-}
-
-sub xml_decl
-{
- my ($self, $event) = @_;
- $self->push_event ('xml_decl', $event);
-}
-
-#?? what about set_document_locator, resolve_entity
-
-#
-# Determine the initial State for the current Element.
-# By default, we look at the PreserveWS property (i.e. value of xml:space.)
-# The user can override this to force xml:space="preserve" for a particular
-# element with e.g.
-#
-# sub get_init_state
-# {
-# my ($self, $event) = @_;
-# ($event->{Name} eq 'foo' || $event->{PreserveWS}) ? PRESERVE_WS : START;
-# }
-#
-sub get_init_state
-{
- my ($self, $event) = @_;
- my $tag = $event->{Name};
-
- if ($self->{NoText}->{$tag}) # ELEMENT model does not allow text
- {
- return NO_TEXT_START;
- }
- $event->{PreserveWS} ? PRESERVE_WS : START;
-}
-
-sub push_event
-{
- my ($self, $type, $event) = @_;
-
- $event->{EventType} = $type;
-
- $self->flush;
- push @{ $self->{EventQ} }, $event;
-}
-
-# Merge text with previous event (if it has the same EventType)
-# or push a new text event
-sub merge_text
-{
- my ($self, $str, $eventType, $wsLocation) = @_;
- my $q = $self->{EventQ};
-
- my $prev = $q->[-1];
- if (defined $prev && $prev->{EventType} eq $eventType)
- {
- $prev->{Data} .= $str;
- }
- else
- {
- my $event = { Data => $str };
- $event->{Loc} = $wsLocation if defined $wsLocation;
- $self->push_event ($eventType, $event);
- }
-}
-
-# Forward all events on the EventQ
-sub flush
-{
- my ($self) = @_;
-
- my $q = $self->{EventQ};
- while (@$q)
- {
- my $event = shift @$q;
- my $type = $event->{EventType};
- delete $event->{EventType};
-
- $self->{Callback}->{$type}->($event);
- }
-}
-
-1; # package return code
-
-__END__
-
-=head1 NAME
-
-XML::Filter::DetectWS - A PerlSAX filter that detects ignorable whitespace
-
-=head1 SYNOPSIS
-
- use XML::Filter::DetectWS;
-
- my $detect = new XML::Filter::DetectWS (Handler => $handler,
- SkipIgnorableWS => 1);
-
-=head1 DESCRIPTION
-
-This a PerlSAX filter that detects which character data contains
-ignorable whitespace and optionally filters it.
-
-Note that this is just a first stab at the implementation and it may
-change completely in the near future. Please provide feedback whether
-you like it or not, so I know whether I should change it.
-
-The XML spec defines ignorable whitespace as the character data found in elements
-that were defined in an <!ELEMENT> declaration with a model of 'EMPTY' or
-'Children' (Children is the rule that does not contain '#PCDATA'.)
-
-In addition, XML::Filter::DetectWS allows the user to define other whitespace to
-be I<ignorable>. The ignorable whitespace is passed to the PerlSAX Handler with
-the B<ignorable_whitespace> handler, provided that the Handler implements this
-method. (Otherwise it is passed to the characters handler.)
-If the B<SkipIgnorableWS> is set, the ignorable whitespace is simply
-discarded.
-
-XML::Filter::DetectWS also takes xml:space attributes into account. See below
-for details.
-
-CDATA sections are passed in the standard PerlSAX way (i.e. with surrounding
-start_cdata and end_cdata events), unless the Handler does not implement these
-methods. In that case, the CDATA section is simply passed to the characters
-method.
-
-=head1 Constructor Options
-
-=over 4
-
-=item * SkipIgnorableWS (Default: 0)
-
-When set, detected ignorable whitespace is discarded.
-
-=item * Handler
-
-The PerlSAX handler (or filter) that will receive the PerlSAX events from this
-filter.
-
-=back
-
-=head1 Current Implementation
-
-When determining which whitespace is ignorable, it first looks at the
-xml:space attribute of the parent element node (and its ancestors.)
-If the attribute value is "preserve", then it is *NOT* ignorable.
-(If someone took the trouble of adding xml:space="preserve", then that is
-the final answer...)
-
-If xml:space="default", then we look at the <!ELEMENT> definition of the parent
-element. If the model is 'EMPTY' or follows the 'Children' rule (i.e. does not
-contain '#PCDATA') then we know that the whitespace is ignorable.
-Otherwise we need input from the user somehow.
-
-The idea is that the API of DetectWS will be extended, so that you can
-specify/override e.g. which elements should behave as if xml:space="preserve"
-were set, and/or which elements should behave as if the <!ELEMENT> model was
-defined a certain way, etc.
-
-Please send feedback!
-
-The current implementation also detects whitespace after an element-start tag,
-whitespace before an element-end tag.
-It also detects whitespace before an element-start and after an element-end tag
-and before or after comments, processing instruction, cdata sections etc.,
-but this needs to be reimplemented.
-In either case, the detected whitespace is split off into its own PerlSAX
-characters event and an extra property 'Loc' is added. It can have 4 possible
-values:
-
-=over 4
-
-=item * 1 (WS_START) - whitespace immediately after element-start tag
-
-=item * 2 (WS_END) - whitespace just before element-end tag
-
-=item * 3 (WS_ONLY) - both WS_START and WS_END, i.e. it's the only text found between the start and end tag and it's all whitespace
-
-=item * 0 (WS_INTER) - none of the above, probably before an element-start tag,
-after an element-end tag, or before or after a comment, PI, cdata section etc.
-
-=back
-
-Note that WS_INTER may not be that useful, so this may change.
-
-=head1 xml:space attribute
-
-The XML spec states that: A special attribute
-named xml:space may be attached to an element
-to signal an intention that in that element,
-white space should be preserved by applications.
-In valid documents, this attribute, like any other, must be
-declared if it is used.
-When declared, it must be given as an
-enumerated type whose only
-possible values are "default" and "preserve".
-For example:
-
- <!ATTLIST poem xml:space (default|preserve) 'preserve'>
-
-The value "default" signals that applications'
-default white-space processing modes are acceptable for this element; the
-value "preserve" indicates the intent that applications preserve
-all the white space.
-This declared intent is considered to apply to all elements within the content
-of the element where it is specified, unless overriden with another instance
-of the xml:space attribute.
-
-The root element of any document
-is considered to have signaled no intentions as regards application space
-handling, unless it provides a value for
-this attribute or the attribute is declared with a default value.
-
-[... end of excerpt ...]
-
-=head1 CAVEATS
-
-This code is highly experimental!
-It has not been tested well and the API may change.
-
-The code that detects of blocks of whitespace at potential indent positions
-may need some work. See
-
-=head1 AUTHOR
-
-Send bug reports, hints, tips, suggestions to Enno Derksen at
-<F<enno@att.com>>.
-
-=cut