diff -r 000000000000 -r 02cd6b52f378 dummy_foundation/lib/XML/Handler/Composer.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dummy_foundation/lib/XML/Handler/Composer.pm Thu May 28 10:10:03 2009 +0100 @@ -0,0 +1,821 @@ +package XML::Handler::Composer; +use strict; +use XML::UM; +use Carp; + +use vars qw{ %DEFAULT_QUOTES %XML_MAPPING_CRITERIA }; + +%DEFAULT_QUOTES = ( + XMLDecl => '"', + Attr => '"', + Entity => '"', + SystemLiteral => '"', + ); + +%XML_MAPPING_CRITERIA = +( + Text => + { + '<' => '<', + '&' => '&', + + ']]>' => ']]>', + }, + + CDataSection => + { + ']]>' => ']]>', # NOTE: this won't be translated back correctly + }, + + Attr => # attribute value (assuming double quotes "" are used) + { +# '"' => '"', # Use ("'" => ''') when using single quotes + '<' => '<', + '&' => '&', + }, + + Entity => # entity value (assuming double quotes "" are used) + { +# '"' => '"', # Use ("'" => ''') when using single quotes + '%' => '%', + '&' => '&', + }, + + Comment => + { + '--' => '--', # NOTE: this won't be translated back correctly + }, + + ProcessingInstruction => + { + '?>' => '?>', # not sure if this will be translated back correctly + }, + + # The SYSTEM and PUBLIC identifiers in DOCTYPE declaration (quoted strings) + SystemLiteral => + { +# '"' => '"', # Use ("'" => ''') when using single quotes + }, + +); + +sub new +{ + my ($class, %options) = @_; + my $self = bless \%options, $class; + + $self->{EndWithNewline} = 1 unless defined $self->{EndWithNewline}; + + if (defined $self->{Newline}) + { + $self->{ConvertNewlines} = 1; + } + else + { + # Use this when printing newlines in case the user didn't specify one + $self->{Newline} = "\x0A"; + } + + $self->{DocTypeIndent} = $self->{Newline} . " " + unless defined $self->{DocTypeIndent}; + + $self->{IndentAttlist} = " " unless defined $self->{IndentAttlist}; + + $self->{Print} = sub { print @_ } unless defined $self->{Print}; + + $self->{Quote} ||= {}; + for my $q (keys %DEFAULT_QUOTES) + { + $self->{Quote}->{$q} ||= $DEFAULT_QUOTES{$q}; + } + + # Convert to UTF-8 by default, i.e. when is missing + # and no {Encoding} is specified. + # Note that the internal representation *is* UTF-8, so we + # simply return the (string) parameter. + $self->{Encode} = sub { shift } unless defined $self->{Encode}; + + # Convert unmapped characters to hexadecimal constants a la '号' + $self->{EncodeUnmapped} = \&XML::UM::encode_unmapped_hex + unless defined $self->{EncodeUnmapped}; + + my $encoding = $self->{Encoding}; + $self->setEncoding ($encoding) if defined $encoding; + + $self->initMappers; + + $self; +} + +# +# Setup the mapping routines that convert '<' to '<' etc. +# for the specific XML constructs. +# +sub initMappers +{ + my $self = shift; + my %escape; + my $convert_newlines = $self->{ConvertNewlines}; + + for my $n (qw{ Text Comment CDataSection Attr SystemLiteral + ProcessingInstruction Entity }) + { + $escape{$n} = $self->create_utf8_mapper ($n, $convert_newlines); + } + + # Text with xml:space="preserve", should not have newlines converted. + $escape{TextPreserveNL} = $self->create_utf8_mapper ('Text', 0); + # (If newline conversion is inactive, $escape{TextPreserveNL} does the + # same as $escape{Text} defined above ...) + + $self->{Escape} = \%escape; +} + +sub setEncoding +{ + my ($self, $encoding) = @_; + + $self->{Encode} = XML::UM::get_encode ( + Encoding => $encoding, EncodeUnmapped => $self->{EncodeUnmapped}); +} + +sub create_utf8_mapper +{ + my ($self, $construct, $convert_newlines) = @_; + + my $c = $XML_MAPPING_CRITERIA{$construct}; + croak "no XML mapping criteria defined for $construct" + unless defined $c; + + my %hash = %$c; + + # If this construct appears between quotes in the XML document + # (and it has a quoting character defined), + # ensure that the quoting character is appropriately converted + # to " or ' + my $quote = $self->{Quote}->{$construct}; + if (defined $quote) + { + $hash{$quote} = $quote eq '"' ? '"' : '''; + } + + if ($convert_newlines) + { + $hash{"\x0A"} = $self->{Newline}; + } + + gen_utf8_subst (%hash); +} + +# +# Converts a string literal e.g. "ABC" into '\x41\x42\x43' +# so it can be stuffed into a regular expression +# +sub str_to_hex # static +{ + my $s = shift; + + $s =~ s/(.)/ sprintf ("\\x%02x", ord ($1)) /egos; + + $s; +} + +# +# In later perl versions (5.005_55 and up) we can simply say: +# +# use utf8; +# $literals = join ("|", map { str_to_hex ($_) } keys %hash); +# $s =~ s/($literals)/$hash{$1}/ego; +# + +sub gen_utf8_subst # static +{ + my (%hash) = @_; + + my $code = 'sub { my $s = shift; $s =~ s/('; + $code .= join ("|", map { str_to_hex ($_) } keys %hash); + $code .= ')|('; + $code .= '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...'; + $code .= ')/ defined ($1) ? $hash{$1} : $2 /ego; $s }'; + + my $f = eval $code; + croak "XML::Handler::Composer - can't eval code: $code\nReason: $@" if $@; + + $f; +} + +# This should be optimized! +sub print +{ + my ($self, $str) = @_; + $self->{Print}->($self->{Encode}->($str)); +} + +# Used by start_element. It determines the style in which empty elements +# are printed. The default implementation returns "/>" so they are printed +# like this: +# Override this method to support e.g. XHTML style tags. +sub get_compressed_element_suffix +{ + my ($self, $event) = @_; + + "/>"; + + # return " />" for XHTML style, or + # "><$tagName/>" for uncompressed tags (where $tagName is $event->{Name}) +} + +#----- PerlSAX handlers ------------------------------------------------------- + +sub start_document +{ + my ($self) = @_; + + $self->{InCDATA} = 0; + $self->{DTD} = undef; + $self->{PreserveWS} = 0; # root element has xml:space="default" + $self->{PreserveStack} = []; + $self->{PrintedXmlDecl} = 0; # whether was printed +} + +sub end_document +{ + my ($self) = @_; + + # Print final Newline at the end of the XML document (if desired) + $self->print ($self->{Newline}) if $self->{EndWithNewline}; +} + +# This event is received *AFTER* the Notation, Element, Attlist etc. events +# that are contained within the DTD. +sub doctype_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $q = $self->{Quote}->{SystemLiteral}; + my $escape_literal = $self->{Escape}->{SystemLiteral}; + + my $name = $event->{Name}; + my $sysId = $event->{SystemId}; + $sysId = &$escape_literal ($sysId) if defined $sysId; + my $pubId = $event->{PublicId}; + $pubId = &$escape_literal ($pubId) if defined $pubId; + + my $str = "{DTD}; + my $nl = $self->{Newline}; + + if (defined $dtd_contents) + { + delete $self->{DTD}; + + $str .= " [$dtd_contents$nl]>$nl"; + } + else + { + $str .= ">$nl"; + } + $self->print ($str); +} + +sub start_element +{ + my ($self, $event) = @_; + + my $preserve_stack = $self->{PreserveStack}; + if (@$preserve_stack == 0) + { + # This is the root element. Print the declaration now if + # it wasn't printed and it should be. + $self->flush_xml_decl; + } + + my $str = "<" . $event->{Name}; + + my $suffix = ">"; + if ($event->{Compress}) + { + $suffix = $self->get_compressed_element_suffix ($event); + } + + # Push PreserveWS state of parent element on the stack + push @{ $preserve_stack }, $self->{PreserveWS}; + $self->{PreserveWS} = $event->{PreserveWS}; + + my $ha = $event->{Attributes}; + my @attr; + if (exists $event->{AttributeOrder}) + { + my $defaulted = $event->{Defaulted}; + if (defined $defaulted && !$self->{PrintDefaultAttr}) + { + if ($defaulted > 0) + { + @attr = @{ $event->{AttributeOrder} }[0 .. $defaulted - 1]; + } + # else: all attributes are defaulted i.e. @attr = (); + } + else # no attr are defaulted + { + @attr = @{ $event->{AttributeOrder} }; + } + } + else # no attr order defined + { + @attr = keys %$ha; + } + + my $escape = $self->{Escape}->{Attr}; + my $q = $self->{Quote}->{Attr}; + + for (my $i = 0; $i < @attr; $i++) + { +#?? could print a newline every so often... + my $name = $attr[$i]; + my $val = &$escape ($ha->{$name}); + $str .= " $name=$q$val$q"; + } + $str .= $suffix; + + $self->print ($str); +} + +sub end_element +{ + my ($self, $event) = @_; + + $self->{PreserveWS} = pop @{ $self->{PreserveStack} }; + + return if $event->{Compress}; + + $self->print ("{Name} . ">"); +} + +sub characters +{ + my ($self, $event) = @_; + + if ($self->{InCDATA}) + { +#?? should this use $self->{PreserveWS} ? + + my $esc = $self->{Escape}->{CDataSection}; + $self->print (&$esc ($event->{Data})); + } + else # regular text + { + my $esc = $self->{PreserveWS} ? + $self->{Escape}->{TextPreserveNL} : + $self->{Escape}->{Text}; + + $self->print (&$esc ($event->{Data})); + } +} + +sub start_cdata +{ + my $self = shift; + $self->{InCDATA} = 1; + + $self->print ("{InCDATA} = 0; + + $self->print ("]]>"); +} + +sub comment +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $esc = $self->{Escape}->{Comment}; +#?? still need to support comments in the DTD + + $self->print (""); +} + +sub entity_reference +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $par = $event->{Parameter} ? '%' : '&'; +#?? parameter entities (like %par;) are NOT supported! +# PerlSAX::handle_default should be fixed! + + $self->print ($par . $event->{Name} . ";"); +} + +sub unparsed_entity_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + $self->entity_decl ($event); +} + +sub notation_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $name = $event->{Name}; + my $sysId = $event->{SystemId}; + my $pubId = $event->{PublicId}; + + my $q = $self->{Quote}->{SystemLiteral}; + my $escape = $self->{Escape}->{SystemLiteral}; + + $sysId = &$escape ($sysId) if defined $sysId; + $pubId = &$escape ($pubId) if defined $pubId; + + my $str = $self->{DocTypeIndent} . ""; + + $self->{DTD} .= $str; +} + +sub element_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $name = $event->{Name}; + my $model = $event->{Model}; + + $self->{DTD} .= $self->{DocTypeIndent} . ""; +} + +sub entity_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $name = $event->{Name}; + + my $par = ""; + if ($name =~ /^%/) + { + # It's a parameter entity (i.e. %ent; instead of &ent;) + $name = substr ($name, 1); + $par = "% "; + } + + my $str = $self->{DocTypeIndent} . "{Value}; + my $sysId = $event->{SysId}; + my $pubId = $event->{PubId}; + my $ndata = $event->{Ndata}; + + my $q = $self->{Quote}->{SystemLiteral}; + my $escape = $self->{Escape}->{SystemLiteral}; + + if (defined $value) + { +#?? use {Entity} quote etc... + my $esc = $self->{Escape}->{Entity}; + my $p = $self->{Quote}->{Entity}; + $str .= " $p" . &$esc ($value) . $p; + } + if (defined $pubId) + { + $str .= " PUBLIC $q" . &$escape ($pubId) . $q; + } + elsif (defined $sysId) + { + $str .= " SYSTEM"; + } + + if (defined $sysId) + { + $str .= " $q" . &$escape ($sysId) . $q; + } + $str .= " NDATA $ndata" if defined $ndata; + $str .= ">"; + + $self->{DTD} .= $str; +} + +sub attlist_decl +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $elem = $event->{ElementName}; + + my $str = $event->{AttributeName} . " " . $event->{Type}; + $str .= " #FIXED" if defined $event->{Fixed}; + + $str = $str; + + my $def = $event->{Default}; + if (defined $def) + { + $str .= " $def"; + + # Note sometimes Default is a value with quotes. + # We'll use the existing quotes in that case... + } + + my $indent; + if (!exists($event->{First}) || $event->{First}) + { + $self->{DTD} .= $self->{DocTypeIndent} . "{MoreFollow}) + { + $indent = $self->{Newline} . $self->{IndentAttlist}; + } + else + { + $indent = " "; + } + } + else + { + $indent = $self->{Newline} . $self->{IndentAttlist}; + } + + $self->{DTD} .= $indent . $str; + + unless ($event->{MoreFollow}) + { + $self->{DTD} .= '>'; + } +} + +sub xml_decl +{ + my ($self, $event) = @_; + return if $self->{PrintedXmlDecl}; # already printed it + + my $version = $event->{Version}; + my $encoding = $event->{Encoding}; + if (defined $self->{Encoding}) + { + $encoding = $self->{Encoding}; + } + else + { + $self->setEncoding ($encoding) if defined $encoding; + } + + my $standalone = $event->{Standalone}; + $standalone = ($standalone ? "yes" : "no") if defined $standalone; + + my $q = $self->{Quote}->{XMLDecl}; + my $nl = $self->{Newline}; + + my $str = "$nl$nl"; + + $self->print ($str); + $self->{PrintedXmlDecl} = 1; +} + +# +# Prints the declaration if it wasn't already printed +# *and* the user wanted it to be printed (because s/he set $self->{Encoding}) +# +sub flush_xml_decl +{ + my ($self) = @_; + return if $self->{PrintedXmlDecl}; + + if (defined $self->{Encoding}) + { + $self->xml_decl ({ Version => '1.0', Encoding => $self->{Encoding} }); + } + + # If it wasn't printed just now, it doesn't need to be printed at all, + # so pretend we did print it. + $self->{PrintedXmlDecl} = 1; +} + +sub processing_instruction +{ + my ($self, $event) = @_; + $self->flush_xml_decl; + + my $escape = $self->{Escape}->{ProcessingInstruction}; + + my $str = "{Target} . " " . + &$escape ($event->{Data}). "?>"; + + $self->print ($str); +} + +1; # package return code + +__END__ + +=head1 NAME + +XML::Handler::Composer - Another XML printer/writer/generator + +=head1 SYNOPSIS + +use XML::Handler::Composer; + +my $composer = new XML::Handler::Composer ( [OPTIONS] ); + +=head1 DESCRIPTION + +XML::Handler::Composer is similar to XML::Writer, XML::Handler::XMLWriter, +XML::Handler::YAWriter etc. in that it generates XML output. + +This implementation may not be fast and it may not be the best solution for +your particular problem, but it has some features that may be missing in the +other implementations: + +=over 4 + +=item * Supports every output encoding that L supports + +L supports every encoding for which there is a mapping file +in the L distribution. + +=item * Pretty printing + +When used with L. + +=item * Fine control over which kind of quotes are used + +See options below. + +=item * Supports PerlSAX interface + +=back + +=head1 Constructor Options + +=over 4 + +=item * EndWithNewline (Default: 1) + +Whether to print a newline at the end of the file (i.e. after the root element) + +=item * Newline (Default: undef) + +If defined, which newline to use for printing. +(Note that XML::Parser etc. convert newlines into "\x0A".) + +If undef, newlines will not be converted and XML::Handler::Composer will +use "\x0A" when printing. + +A value of "\n" will convert the internal newlines into the platform +specific line separator. + +See the PreserveWS option in the characters event (below) for finer control +over when newline conversion is active. + +=item * DocTypeIndent (Default: a Newline and 2 spaces) + +Newline plus indent that is used to separate lines inside the DTD. + +=item * IndentAttList (Default: 8 spaces) + +Indent used when printing an declaration that has more than one +attribute definition, e.g. + + + +=item * Quote (Default: { XMLDecl => '"', Attr => '"', Entity => '"', SystemLiteral => '"' }) + +Quote contains a reference to a hash that defines which quoting characters +to use when printing XML declarations (XMLDecl), attribute values (Attr), + values (Entity) and system/public literals (SystemLiteral) +as found in , declarations etc. + +=item * PrintDefaultAttr (Default: 0) + +If 1, prints attribute values regardless of whether they are default +attribute values (as defined in declarations.) +Normally, default attributes are not printed. + +=item * Encoding (Default: undef) + +Defines the output encoding (if specified.) +Note that future calls to the xml_decl() handler may override this setting +(if they contain an Encoding definition.) + +=item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec) + +Defines how Unicode characters not found in the mapping file (of the +specified encoding) are printed. +By default, they are converted to decimal entity references, like '{' + +Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '«' + +=item * Print (Default: sub { print @_ }, which prints to stdout) + +The subroutine that is used to print the encoded XML output. +The default prints the string to stdout. + +=back + +=head1 Method: get_compressed_element_suffix ($event) + +Override this method to support the different styles for printing +empty elements in compressed notation, e.g.

,

,

,

. + +The default returns "/>", which results in

. +Use " />" for XHTML style elements or ">" for certain HTML style elements. + +The $event parameter is the hash reference that was received from the +start_element() handler. + +=head1 Extra PerlSAX event information + +XML::Handler::Composer relies on hints from previous SAX filters to +format certain parts of the XML. +These SAX filters (e.g. XML::Filter::Reindent) pass extra information by adding +name/value pairs to the appropriate PerlSAX events (the events themselves are +hash references.) + +=over 4 + +=item * entity_reference: Parameter => 1 + +If Parameter is 1, it means that it is a parameter entity reference. +A parameter entity is referenced with %ent; instead of &ent; and the +entity declaration starts with instead of + +NOTE: This should be added to the PerlSAX interface! + +=item * start_element/end_element: Compress => 1 + +If Compress is 1 in both the start_element and end_element event, the element +will be printed in compressed form, e.g. instead of . + +=item * start_element: PreserveWS => 1 + +If newline conversion is active (i.e. Newline was defined in the constructor), +then newlines will *NOT* be converted in text (character events) within this +element. + +=item * attlist_decl: First, MoreFollow + +The First and MoreFollow options can be used to force successive +declarations for the same element to be merged, e.g. + + + +In this example, the attlist_decl event for foo should contain +(First => 1, MoreFollow => 1) and the event for bar should contain +(MoreFollow => 1). The quux event should have no extra info. + +'First' indicates that the event is the first of a sequence. +'MoreFollow' indicates that more events will follow in this sequence. + +If neither option is set by the preceding PerlSAX filter, each attribute +definition will be printed as a separate line. + +=back + +=head1 CAVEATS + +This code is highly experimental! +It has not been tested well and the API may change. + +=head1 AUTHOR + +Send bug reports, hints, tips, suggestions to Enno Derksen at +>. + +=cut