dummy_foundation/lib/XML/Filter/Reindent.pm
changeset 4 60053dab7e2a
parent 3 8b87ea768cb8
child 5 842a773e65f2
child 6 c34a018f3291
equal deleted inserted replaced
3:8b87ea768cb8 4:60053dab7e2a
     1 package XML::Filter::Reindent;
       
     2 use strict;
       
     3 use XML::Filter::DetectWS;
       
     4 
       
     5 use vars qw{ @ISA };
       
     6 @ISA = qw{ XML::Filter::DetectWS };
       
     7 
       
     8 sub MAYBE (%) { 2 }
       
     9 
       
    10 sub new
       
    11 {
       
    12     my $class = shift;
       
    13     my $self = $class->SUPER::new (@_);
       
    14 
       
    15     # Use one space per indent level (by default)
       
    16     $self->{Tab} = " " unless defined $self->{Tab};
       
    17 
       
    18     # Note that this is a PerlSAX filter so we use the XML newline ("\x0A"),
       
    19     # not the Perl output newline ("\n"), by default.
       
    20     $self->{Newline} = "\x0A" unless defined $self->{Newline};
       
    21 
       
    22     $self;
       
    23 }
       
    24 
       
    25 # Indent the element if its parent element says so
       
    26 sub indent_element
       
    27 {
       
    28     my ($self, $event, $parent_says_indent) = @_;
       
    29     return $parent_says_indent;
       
    30 }
       
    31 
       
    32 # Always indent children unless element (or its ancestor) has 
       
    33 # xml:space="preserve" attribute
       
    34 sub indent_children
       
    35 {
       
    36     my ($self, $event) = @_;
       
    37     return $event->{PreserveWS} ? 0 : MAYBE;
       
    38 }
       
    39 
       
    40 sub start_element
       
    41 {
       
    42     my ($self, $event) = @_;
       
    43 
       
    44     my $parent = $self->{ParentStack}->[-1];
       
    45     my $level = $self->{Level}++;
       
    46     $self->SUPER::start_element ($event);
       
    47 
       
    48     my $parent_says_indent = $parent->{IndentChildren} ? 1 : 0;
       
    49     # init with 1 if parent says MAYBE
       
    50     $event->{Indent} = $self->indent_element ($event, $parent_says_indent) ?
       
    51 			$level : undef;
       
    52 
       
    53     $event->{IndentChildren} = $self->indent_children ($event);
       
    54 }
       
    55 
       
    56 sub end_element
       
    57 {
       
    58     my ($self, $event) = @_;
       
    59     my $start_element = $self->{ParentStack}->[-1];
       
    60 
       
    61     if ($start_element->{IndentChildren} == MAYBE)
       
    62     {
       
    63 	my $q = $self->{EventQ};
       
    64 	my $prev = $q->[-1];
       
    65 
       
    66 	if ($prev == $start_element)
       
    67 	{
       
    68 	    # End tag follows start tag: compress tag
       
    69 	    $start_element->{Compress} = 1;
       
    70 	    $event->{Compress} = 1;
       
    71 #?? could detect if it contains only ignorable_ws
       
    72 	}
       
    73 	elsif ($prev->{EventType} eq 'characters')
       
    74 	{
       
    75 	    if ($q->[-2] == $start_element)
       
    76 	    {
       
    77 		# Element has only one child, a text node.
       
    78 		# Print element as: <a>text here</a>
       
    79 		delete $prev->{Indent};
       
    80 		$start_element->{IndentChildren} = 0;
       
    81 	    }
       
    82 	}
       
    83     }
       
    84 
       
    85     my $level = --$self->{Level};
       
    86     $event->{Indent} = $start_element->{IndentChildren} ? $level : undef;
       
    87 
       
    88     my $compress = $start_element->{Compress};
       
    89     if ($compress)
       
    90     {
       
    91 	$event->{Compress} = $compress;
       
    92 	delete $event->{Indent};
       
    93     }
       
    94 
       
    95     $self->SUPER::end_element ($event);
       
    96 }
       
    97 
       
    98 sub end_document
       
    99 {
       
   100     my ($self, $event) = @_;
       
   101 
       
   102     $self->push_event ('end_document', $event || {});
       
   103     $self->flush (0);	# send remaining events
       
   104 }
       
   105 
       
   106 sub push_event
       
   107 {
       
   108     my ($self, $type, $event) = @_;
       
   109 
       
   110     $event->{EventType} = $type;
       
   111     if ($type =~ /^(characters|comment|processing_instruction|entity_reference|cdata)$/)
       
   112     {
       
   113 	my $indent_kids = $self->{ParentStack}->[-1]->{IndentChildren} ? 1 : 0;
       
   114 	$event->{Indent} =  $indent_kids ? $self->{Level} : undef;
       
   115     }
       
   116 
       
   117     my $q = $self->{EventQ};
       
   118     push @$q, $event;
       
   119 
       
   120     $self->flush (4);	# keep 4 events on the stack (maybe 3 is enough)
       
   121 }
       
   122 
       
   123 sub flush
       
   124 {
       
   125     my ($self, $keep) = @_;
       
   126     my $q = $self->{EventQ};
       
   127 
       
   128     while (@$q > $keep)
       
   129     {
       
   130 	my $head = $q->[0];
       
   131 #	print "head=" . $head->{EventType} . " indent=" . $head->{Indent} . "\n";
       
   132 
       
   133 	if ($head->{EventType} =~ /ws|ignorable/)
       
   134 	{
       
   135 	    my $next = $q->[1];
       
   136 	    my $indent = $next->{Indent};
       
   137 
       
   138 	    if (defined $indent)	# fix existing indent
       
   139 	    {
       
   140 		$head->{Data} = $self->{Newline} . ($self->{Tab} x $indent);
       
   141 		$self->send (2);
       
   142 	    }
       
   143 	    else		# remove existing indent
       
   144 	    {
       
   145 		shift @$q;
       
   146 		$self->send (1);
       
   147 	    }
       
   148 #?? remove keys: Indent, ...
       
   149 	}
       
   150 	else
       
   151 	{
       
   152 	    my $indent = $head->{Indent};
       
   153 
       
   154 	    if (defined $indent)	# insert indent
       
   155 	    {
       
   156 		unshift @$q, { EventType => 'ws', 
       
   157 			       Data => $self->{Newline} . ($self->{Tab} x $indent) };
       
   158 		$self->send (2);
       
   159 	    }
       
   160 	    else		# no indent - leave as is
       
   161 	    {
       
   162 		$self->send (1);
       
   163 	    }
       
   164 	}
       
   165     }
       
   166 }
       
   167 
       
   168 sub send
       
   169 {
       
   170     my ($self, $i) = @_;
       
   171     
       
   172     my $q = $self->{EventQ};
       
   173 
       
   174     while ($i--)
       
   175     {
       
   176 	my $event = shift @$q;
       
   177 	my $type = $event->{EventType};
       
   178 	delete $event->{EventType};
       
   179 
       
   180 #print "TYPE=$type " . join(",", map { "$_=" . $event->{$_} } keys %$event) . "\n";
       
   181 	$self->{Callback}->{$type}->($event);
       
   182     }
       
   183 }
       
   184 
       
   185 1;	# package return code
       
   186 
       
   187 =head1 NAME
       
   188 
       
   189 XML::Filter::Reindent - Reformats whitespace for pretty printing XML
       
   190 
       
   191 =head1 SYNOPSIS
       
   192 
       
   193  use XML::Handler::Composer;
       
   194  use XML::Filter::Reindent;
       
   195 
       
   196  my $composer = new XML::Handler::Composer (%OPTIONS);
       
   197  my $indent = new XML::Filter::Reindent (Handler => $composer, %OPTIONS);
       
   198 
       
   199 =head1 DESCRIPTION
       
   200 
       
   201 XML::Filter::Reindent is a sub class of L<XML::Filter::DetectWS>.
       
   202 
       
   203 XML::Filter::Reindent can be used as a PerlSAX filter to reformat an
       
   204 XML document before sending it to a PerlSAX handler that prints it
       
   205 (like L<XML::Handler::Composer>.)
       
   206 
       
   207 Like L<XML::Filter::DetectWS>, it detects ignorable whitespace and blocks of
       
   208 whitespace characters in certain places. It uses this information and
       
   209 information supplied by the user to determine where whitespace may be
       
   210 modified, deleted or inserted. 
       
   211 Based on the indent settings, it then modifies, inserts and deletes characters
       
   212 and ignorable_whitespace events accordingly.
       
   213 
       
   214 This is just a first stab at the implementation.
       
   215 It may be buggy and may change completely!
       
   216 
       
   217 =head1 Constructor Options
       
   218 
       
   219 =over 4
       
   220 
       
   221 =item * Handler
       
   222 
       
   223 The PerlSAX handler (or filter) that will receive the PerlSAX events from this 
       
   224 filter.
       
   225 
       
   226 =item * Tab (Default: one space)
       
   227 
       
   228 The number of spaces per indent level for elements etc. in document content.
       
   229 
       
   230 =item * Newline (Default: "\x0A")
       
   231 
       
   232 The newline to use when re-indenting. 
       
   233 The default is the internal newline used by L<XML::Parser>, L<XML::DOM> etc.,
       
   234 and should be fine when used in combination with L<XML::Handler::Composer>.
       
   235 
       
   236 =back
       
   237 
       
   238 =head1 $self->indent_children ($start_element_event)
       
   239 
       
   240 This method determines whether children of a certain element
       
   241 may be reformatted. 
       
   242 The default implementation checks the PreserveWS parameter of the specified
       
   243 start_element event and returns 0 if it is set or MAYBE otherwise.
       
   244 The value MAYBE (2) indicates that further investigation is needed, e.g.
       
   245 by examining the element contents. A value of 1 means yes, indent the
       
   246 child nodes, no further investigation is needed.
       
   247 
       
   248 NOTE: the PreserveWS parameter is set by the parent class, 
       
   249 L<XML::Filter::DetectWS>, when the element or one of its ancestors has
       
   250 the attribute xml:space="preserve".
       
   251 
       
   252 Override this method to tweak the behavior of this class.
       
   253 
       
   254 =head1 $self->indent_element ($start_element_event, $parent_says_indent)
       
   255 
       
   256 This method determines whether a certain element may be re-indented. 
       
   257 The default implementation returns the value of the $parent_says_indent
       
   258 parameter, which was set to the value returned by indent_children for the
       
   259 parent element. In other words, the element will be re-indented if the
       
   260 parent element allows it.
       
   261 
       
   262 Override this method to tweak the behavior of this class.
       
   263 I'm not sure how useful this hook is. Please provide feedback!
       
   264 
       
   265 =head1 Current Implementation
       
   266 
       
   267 The current implementation puts all incoming Perl SAX events in a queue for
       
   268 further processing. When determining which nodes should be re-indented,
       
   269 it sometimes needs information from previous events, hence the use of the 
       
   270 queue.
       
   271 
       
   272 The parameter (Compress => 1) is added to 
       
   273 matching start_element and end_element events with no events in between
       
   274 This indicates to an XML printer that a compressed notation can be used, 
       
   275 e.g <foo/>.
       
   276 
       
   277 If an element allows reformatting of its contents (xml:space="preserve" was 
       
   278 not active and indent_children returned MAYBE), the element
       
   279 contents will be reformatted unless it only has one child node and that
       
   280 child is a regular text node (characters event.) 
       
   281 In that case, the element will be printed as <foo>text contents</foo>.
       
   282 
       
   283 If you want element nodes with just one text child to be reindented as well,
       
   284 simply override indent_children to return 1 instead of MAYBE (2.)
       
   285 
       
   286 This behavior may be changed or extended in the future.
       
   287 
       
   288 =head1 CAVEATS
       
   289 
       
   290 This code is highly experimental! 
       
   291 It has not been tested well and the API may change.
       
   292 
       
   293 The code that detects blocks of whitespace at potential indent positions
       
   294 may need some work.
       
   295 
       
   296 =head1 AUTHOR
       
   297 
       
   298 Send bug reports, hints, tips, suggestions to Enno Derksen at
       
   299 <F<enno@att.com>>. 
       
   300 
       
   301 =cut