1 package XML::Filter::Reindent; |
|
2 use strict; |
|
3 use XML::Filter::DetectWS; |
|
4 |
|
5 use vars qw{ @ISA }; |
|
6 @ISA = qw{ XML::Filter::DetectWS }; |
|
7 |
|
8 sub MAYBE (%) { 2 } |
|
9 |
|
10 sub new |
|
11 { |
|
12 my $class = shift; |
|
13 my $self = $class->SUPER::new (@_); |
|
14 |
|
15 # Use one space per indent level (by default) |
|
16 $self->{Tab} = " " unless defined $self->{Tab}; |
|
17 |
|
18 # Note that this is a PerlSAX filter so we use the XML newline ("\x0A"), |
|
19 # not the Perl output newline ("\n"), by default. |
|
20 $self->{Newline} = "\x0A" unless defined $self->{Newline}; |
|
21 |
|
22 $self; |
|
23 } |
|
24 |
|
25 # Indent the element if its parent element says so |
|
26 sub indent_element |
|
27 { |
|
28 my ($self, $event, $parent_says_indent) = @_; |
|
29 return $parent_says_indent; |
|
30 } |
|
31 |
|
32 # Always indent children unless element (or its ancestor) has |
|
33 # xml:space="preserve" attribute |
|
34 sub indent_children |
|
35 { |
|
36 my ($self, $event) = @_; |
|
37 return $event->{PreserveWS} ? 0 : MAYBE; |
|
38 } |
|
39 |
|
40 sub start_element |
|
41 { |
|
42 my ($self, $event) = @_; |
|
43 |
|
44 my $parent = $self->{ParentStack}->[-1]; |
|
45 my $level = $self->{Level}++; |
|
46 $self->SUPER::start_element ($event); |
|
47 |
|
48 my $parent_says_indent = $parent->{IndentChildren} ? 1 : 0; |
|
49 # init with 1 if parent says MAYBE |
|
50 $event->{Indent} = $self->indent_element ($event, $parent_says_indent) ? |
|
51 $level : undef; |
|
52 |
|
53 $event->{IndentChildren} = $self->indent_children ($event); |
|
54 } |
|
55 |
|
56 sub end_element |
|
57 { |
|
58 my ($self, $event) = @_; |
|
59 my $start_element = $self->{ParentStack}->[-1]; |
|
60 |
|
61 if ($start_element->{IndentChildren} == MAYBE) |
|
62 { |
|
63 my $q = $self->{EventQ}; |
|
64 my $prev = $q->[-1]; |
|
65 |
|
66 if ($prev == $start_element) |
|
67 { |
|
68 # End tag follows start tag: compress tag |
|
69 $start_element->{Compress} = 1; |
|
70 $event->{Compress} = 1; |
|
71 #?? could detect if it contains only ignorable_ws |
|
72 } |
|
73 elsif ($prev->{EventType} eq 'characters') |
|
74 { |
|
75 if ($q->[-2] == $start_element) |
|
76 { |
|
77 # Element has only one child, a text node. |
|
78 # Print element as: <a>text here</a> |
|
79 delete $prev->{Indent}; |
|
80 $start_element->{IndentChildren} = 0; |
|
81 } |
|
82 } |
|
83 } |
|
84 |
|
85 my $level = --$self->{Level}; |
|
86 $event->{Indent} = $start_element->{IndentChildren} ? $level : undef; |
|
87 |
|
88 my $compress = $start_element->{Compress}; |
|
89 if ($compress) |
|
90 { |
|
91 $event->{Compress} = $compress; |
|
92 delete $event->{Indent}; |
|
93 } |
|
94 |
|
95 $self->SUPER::end_element ($event); |
|
96 } |
|
97 |
|
98 sub end_document |
|
99 { |
|
100 my ($self, $event) = @_; |
|
101 |
|
102 $self->push_event ('end_document', $event || {}); |
|
103 $self->flush (0); # send remaining events |
|
104 } |
|
105 |
|
106 sub push_event |
|
107 { |
|
108 my ($self, $type, $event) = @_; |
|
109 |
|
110 $event->{EventType} = $type; |
|
111 if ($type =~ /^(characters|comment|processing_instruction|entity_reference|cdata)$/) |
|
112 { |
|
113 my $indent_kids = $self->{ParentStack}->[-1]->{IndentChildren} ? 1 : 0; |
|
114 $event->{Indent} = $indent_kids ? $self->{Level} : undef; |
|
115 } |
|
116 |
|
117 my $q = $self->{EventQ}; |
|
118 push @$q, $event; |
|
119 |
|
120 $self->flush (4); # keep 4 events on the stack (maybe 3 is enough) |
|
121 } |
|
122 |
|
123 sub flush |
|
124 { |
|
125 my ($self, $keep) = @_; |
|
126 my $q = $self->{EventQ}; |
|
127 |
|
128 while (@$q > $keep) |
|
129 { |
|
130 my $head = $q->[0]; |
|
131 # print "head=" . $head->{EventType} . " indent=" . $head->{Indent} . "\n"; |
|
132 |
|
133 if ($head->{EventType} =~ /ws|ignorable/) |
|
134 { |
|
135 my $next = $q->[1]; |
|
136 my $indent = $next->{Indent}; |
|
137 |
|
138 if (defined $indent) # fix existing indent |
|
139 { |
|
140 $head->{Data} = $self->{Newline} . ($self->{Tab} x $indent); |
|
141 $self->send (2); |
|
142 } |
|
143 else # remove existing indent |
|
144 { |
|
145 shift @$q; |
|
146 $self->send (1); |
|
147 } |
|
148 #?? remove keys: Indent, ... |
|
149 } |
|
150 else |
|
151 { |
|
152 my $indent = $head->{Indent}; |
|
153 |
|
154 if (defined $indent) # insert indent |
|
155 { |
|
156 unshift @$q, { EventType => 'ws', |
|
157 Data => $self->{Newline} . ($self->{Tab} x $indent) }; |
|
158 $self->send (2); |
|
159 } |
|
160 else # no indent - leave as is |
|
161 { |
|
162 $self->send (1); |
|
163 } |
|
164 } |
|
165 } |
|
166 } |
|
167 |
|
168 sub send |
|
169 { |
|
170 my ($self, $i) = @_; |
|
171 |
|
172 my $q = $self->{EventQ}; |
|
173 |
|
174 while ($i--) |
|
175 { |
|
176 my $event = shift @$q; |
|
177 my $type = $event->{EventType}; |
|
178 delete $event->{EventType}; |
|
179 |
|
180 #print "TYPE=$type " . join(",", map { "$_=" . $event->{$_} } keys %$event) . "\n"; |
|
181 $self->{Callback}->{$type}->($event); |
|
182 } |
|
183 } |
|
184 |
|
185 1; # package return code |
|
186 |
|
187 =head1 NAME |
|
188 |
|
189 XML::Filter::Reindent - Reformats whitespace for pretty printing XML |
|
190 |
|
191 =head1 SYNOPSIS |
|
192 |
|
193 use XML::Handler::Composer; |
|
194 use XML::Filter::Reindent; |
|
195 |
|
196 my $composer = new XML::Handler::Composer (%OPTIONS); |
|
197 my $indent = new XML::Filter::Reindent (Handler => $composer, %OPTIONS); |
|
198 |
|
199 =head1 DESCRIPTION |
|
200 |
|
201 XML::Filter::Reindent is a sub class of L<XML::Filter::DetectWS>. |
|
202 |
|
203 XML::Filter::Reindent can be used as a PerlSAX filter to reformat an |
|
204 XML document before sending it to a PerlSAX handler that prints it |
|
205 (like L<XML::Handler::Composer>.) |
|
206 |
|
207 Like L<XML::Filter::DetectWS>, it detects ignorable whitespace and blocks of |
|
208 whitespace characters in certain places. It uses this information and |
|
209 information supplied by the user to determine where whitespace may be |
|
210 modified, deleted or inserted. |
|
211 Based on the indent settings, it then modifies, inserts and deletes characters |
|
212 and ignorable_whitespace events accordingly. |
|
213 |
|
214 This is just a first stab at the implementation. |
|
215 It may be buggy and may change completely! |
|
216 |
|
217 =head1 Constructor Options |
|
218 |
|
219 =over 4 |
|
220 |
|
221 =item * Handler |
|
222 |
|
223 The PerlSAX handler (or filter) that will receive the PerlSAX events from this |
|
224 filter. |
|
225 |
|
226 =item * Tab (Default: one space) |
|
227 |
|
228 The number of spaces per indent level for elements etc. in document content. |
|
229 |
|
230 =item * Newline (Default: "\x0A") |
|
231 |
|
232 The newline to use when re-indenting. |
|
233 The default is the internal newline used by L<XML::Parser>, L<XML::DOM> etc., |
|
234 and should be fine when used in combination with L<XML::Handler::Composer>. |
|
235 |
|
236 =back |
|
237 |
|
238 =head1 $self->indent_children ($start_element_event) |
|
239 |
|
240 This method determines whether children of a certain element |
|
241 may be reformatted. |
|
242 The default implementation checks the PreserveWS parameter of the specified |
|
243 start_element event and returns 0 if it is set or MAYBE otherwise. |
|
244 The value MAYBE (2) indicates that further investigation is needed, e.g. |
|
245 by examining the element contents. A value of 1 means yes, indent the |
|
246 child nodes, no further investigation is needed. |
|
247 |
|
248 NOTE: the PreserveWS parameter is set by the parent class, |
|
249 L<XML::Filter::DetectWS>, when the element or one of its ancestors has |
|
250 the attribute xml:space="preserve". |
|
251 |
|
252 Override this method to tweak the behavior of this class. |
|
253 |
|
254 =head1 $self->indent_element ($start_element_event, $parent_says_indent) |
|
255 |
|
256 This method determines whether a certain element may be re-indented. |
|
257 The default implementation returns the value of the $parent_says_indent |
|
258 parameter, which was set to the value returned by indent_children for the |
|
259 parent element. In other words, the element will be re-indented if the |
|
260 parent element allows it. |
|
261 |
|
262 Override this method to tweak the behavior of this class. |
|
263 I'm not sure how useful this hook is. Please provide feedback! |
|
264 |
|
265 =head1 Current Implementation |
|
266 |
|
267 The current implementation puts all incoming Perl SAX events in a queue for |
|
268 further processing. When determining which nodes should be re-indented, |
|
269 it sometimes needs information from previous events, hence the use of the |
|
270 queue. |
|
271 |
|
272 The parameter (Compress => 1) is added to |
|
273 matching start_element and end_element events with no events in between |
|
274 This indicates to an XML printer that a compressed notation can be used, |
|
275 e.g <foo/>. |
|
276 |
|
277 If an element allows reformatting of its contents (xml:space="preserve" was |
|
278 not active and indent_children returned MAYBE), the element |
|
279 contents will be reformatted unless it only has one child node and that |
|
280 child is a regular text node (characters event.) |
|
281 In that case, the element will be printed as <foo>text contents</foo>. |
|
282 |
|
283 If you want element nodes with just one text child to be reindented as well, |
|
284 simply override indent_children to return 1 instead of MAYBE (2.) |
|
285 |
|
286 This behavior may be changed or extended in the future. |
|
287 |
|
288 =head1 CAVEATS |
|
289 |
|
290 This code is highly experimental! |
|
291 It has not been tested well and the API may change. |
|
292 |
|
293 The code that detects blocks of whitespace at potential indent positions |
|
294 may need some work. |
|
295 |
|
296 =head1 AUTHOR |
|
297 |
|
298 Send bug reports, hints, tips, suggestions to Enno Derksen at |
|
299 <F<enno@att.com>>. |
|
300 |
|
301 =cut |
|