diff -r 000000000000 -r 02cd6b52f378 dummy_foundation/lib/XML/UM.pm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dummy_foundation/lib/XML/UM.pm Thu May 28 10:10:03 2009 +0100 @@ -0,0 +1,466 @@ +# +# TO DO: +# +# - Implement SlowMappers for expat builtin encodings (for which there +# are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1. +# - Instead of parsing the .xml file with XML::Encoding, we should use XS. +# If this will not be implemented for a while, we could try reading the +# .enc file directly, instead of the .xml file. +# I started writing XML::UM::EncParser to do this (see EOF), but got stuck. +# + +use strict; + +package XML::UM::SlowMapper; +use Carp; +use XML::Encoding; + +use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS }; + +my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...'; + +# +# The directory that contains the .xml files that come with XML::Encoding. +# Include the terminating '\' or '/' !! +# +$ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/"; +#$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\"; + +# +# From xmlparse.h in expat distribution: +# +# Expat places certain restrictions on the encodings that are supported +# using this mechanism. +# +# 1. Every ASCII character that can appear in a well-formed XML document, +# other than the characters +# +# $@\^`{}~ +# +# must be represented by a single byte, and that byte must be the +# same byte that represents that character in ASCII. +# +# [end of excerpt] + +#?? Which 'ASCII characters can appear in a well-formed XML document ?? + +# All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~ +%DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63, + 65 .. 91, 93, 95, + 97 .. 122, 124, 127); + +sub new +{ + my ($class, %hash) = @_; + my $self = bless \%hash, $class; + + $self->read_encoding_file; + + $self; +} + +sub dispose +{ + my $self = shift; + $self->{Factory}->dispose_mapper ($self); + delete $self->{Encode}; +} + +# Reads the XML file that contains the encoding definition. +# These files come with XML::Encoding. +sub read_encoding_file +{ +#?? This should parse the .enc files (the .xml files are not installed) !! + + my ($self) = @_; + my $encoding = $self->{Encoding}; + + # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple + # so here it goes... + if ($encoding eq 'US-ASCII') + { + $self->{EncMapName} = 'US-ASCII'; + $self->{Map} = \%DEFAULT_ASCII_MAPPINGS; # I hope this is right + return; + } + + my $file = $self->find_encoding_file ($encoding); + + my %uni = %DEFAULT_ASCII_MAPPINGS; + my $prefix = ""; + my $DIR = "file:$ENCDIR"; + + my $enc = new XML::Encoding (Handlers => { + Init => + sub { + my $base = shift->base ($DIR); + } + }, + + PushPrefixFcn => + sub { + $prefix .= chr (shift); + undef; + }, + + PopPrefixFcn => + sub { + chop $prefix; + undef; + }, + + RangeSetFcn => + sub { + my ($byte, $uni, $len) = @_; + for (my $i = $uni; $len--; $uni++) + { + $uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++); + } + undef; + }); + + $self->{EncMapName} = $enc->parsefile ($file); + +#print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n"; + + $self->{Map} = \%uni; +} + +sub find_encoding_file +{ + my ($self, $enc) = @_; + + return "$ENCDIR\L$enc\E.xml"; # .xml filename is lower case +} + +# Returns a closure (method) that converts a UTF-8 encoded string to an +# encoded byte sequence. +sub get_encode +{ + my ($self, %hash) = @_; + my $MAP = $self->{Map}; + my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec; + + my $code = "sub {\n my \$str = shift;\n \$str =~ s/"; + + $code .= "($UTFCHAR)/\n"; + $code .= "defined \$MAP->{\$1} ? \$MAP->{\$1} : "; + $code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n"; + + $code .= "\$str }\n"; +# print $code; + + my $func = eval $code; + croak "could not eval generated code=[$code]: $@" if $@; + + $func; +} + +# +# Optimized version for when the encoding is UTF-8. +# (In that case no conversion takes place.) +# +package XML::UM::SlowMapper::UTF8; +use vars qw{ @ISA }; +@ISA = qw{ XML::UM::SlowMapper }; + +sub read_encoding_file +{ + # ignore it +} + +sub get_encode +{ + \&dont_convert; +} + +sub dont_convert # static +{ + shift # return argument unchanged +} + +package XML::UM::SlowMapperFactory; + +sub new +{ + my ($class, %hash) = @_; + bless \%hash, $class; +} + +sub get_encode +{ + my ($self, %options) = @_; + my $encoding = $options{Encoding}; + + my $mapper = $self->get_mapper ($encoding); + return $mapper->get_encode (%options); +} + +sub get_mapper +{ + my ($self, $encoding) = @_; + $self->{Mapper}->{$encoding} ||= + ($encoding eq "UTF-8" ? + new XML::UM::SlowMapper::UTF8 (Encoding => $encoding, + Factory => $self) : + new XML::UM::SlowMapper (Encoding => $encoding, + Factory => $self)); +} + +# +# Prepare for garbage collection (remove circular refs) +# +sub dispose_encoding +{ + my ($self, $encoding) = @_; + my $mapper = $self->{Mapper}->{$encoding}; + return unless defined $mapper; + + delete $mapper->{Factory}; + delete $self->{Mapper}->{$encoding}; +} + +package XML::UM; +use Carp; + +use vars qw{ $FACTORY %XML_MAPPING_CRITERIA }; +$FACTORY = XML::UM::SlowMapperFactory->new; + +sub get_encode # static +{ + $FACTORY->get_encode (@_); +} + +sub dispose_encoding # static +{ + $FACTORY->dispose_encoding (@_); +} + +# Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string +sub encode_unmapped_hex # static +{ + my $n = utf8_to_unicode (shift); + sprintf ("&#x%X;", $n); +} + +sub encode_unmapped_dec # static +{ + my $n = utf8_to_unicode (shift); + "&#$n;" +} + +# Converts a UTF-8 byte sequence that represents one character, +# to its Unicode index. +sub utf8_to_unicode # static +{ + my $str = shift; + my $len = length ($str); + + if ($len == 1) + { + return ord ($str); + } + if ($len == 2) + { + my @n = unpack "C2", $str; + return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); + } + elsif ($len == 3) + { + my @n = unpack "C3", $str; + return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + + ($n[2] & 0x3f); + } + elsif ($len == 4) + { + my @n = unpack "C4", $str; + return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); + } + else + { + croak "bad UTF8 sequence [$str] hex=" . hb($str); + } +} + +# Converts a Unicode character index to the byte sequence +# that represents that character in UTF-8. +sub unicode_to_utf8 # static +{ + my $n = shift; + if ($n < 0x80) + { + return chr ($n); + } + elsif ($n < 0x800) + { + return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x10000) + { + return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80), + (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x110000) + { + return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80), + ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80)); + } + croak "number [$n] is too large for Unicode in \&unicode_to_utf8"; +} + +#?? The following package is unfinished. +#?? It should parse the .enc file and create an array that maps +#?? Unicode-index to encoded-str. I got stuck... + +# package XML::UM::EncParser; +# +# sub new +# { +# my ($class, %hash) = @_; +# my $self = bless \%hash, $class; +# $self; +# } +# +# sub parse +# { +# my ($self, $filename) = @_; +# open (FILE, $filename) || die "can't open .enc file $filename"; +# binmode (FILE); +# +# my $buf; +# read (FILE, $buf, 4 + 40 + 2 + 2 + 1024); +# +# my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf); +# printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic; +# +# if ($magic != 0xFEEBFACE) +# { +# close FILE; +# die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic); +# } +# +# for (my $i = 0; $i < 256; $i++) +# { +# printf "[%d]=%d ", $i, $map[$i]; +# print "\n" if ($i % 8 == 7); +# } +# +# for (my $i = 0; $i < $pfsize; $i++) +# { +# print "----- PrefixMap $i ----\n"; +# read (FILE, $buf, 2 + 2 + 32 + 32); +# my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf); +# my (@ischar) = splice @ispfx, 32, 32, (); +# #?? could use b256 instead of C32 for bitvector a la vec() +# +# print "ispfx=@ispfx\n"; +# print "ischar=@ischar\n"; +# $len = 256 if $len == 0; +# +# print " min=$min len=$len bmap_start=$bmap_start\n"; +# } +# +# close FILE; +# } + +1; # package return code + +__END__ + +=head1 NAME + +XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding + +=head1 SYNOPSIS + + use XML::UM; + + # Set directory with .xml files that comes with XML::Encoding distribution + # Always include the trailing slash! + $XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/'; + + # Create the encoding routine + my $encode = XML::UM::get_encode ( + Encoding => 'ISO-8859-2', + EncodeUnmapped => \&XML::UM::encode_unmapped_dec); + + # Convert a string from UTF-8 to the specified Encoding + my $encoded_str = $encode->($utf8_str); + + # Remove circular references for garbage collection + XML::UM::dispose_encoding ('ISO-8859-2'); + +=head1 DESCRIPTION + +This module provides methods to convert UTF-8 strings to any XML encoding +that L supports. It creates mapping routines from the .xml +files that can be found in the maps/ directory in the L +distribution. Note that the XML::Encoding distribution does install the +.enc files in your perl directory, but not the.xml files they were created +from. That's why you have to specify $ENCDIR as in the SYNOPSIS. + +This implementation uses the XML::Encoding class to parse the .xml +file and creates a hash that maps UTF-8 characters (each consisting of up +to 4 bytes) to their equivalent byte sequence in the specified encoding. +Note that large mappings may consume a lot of memory! + +Future implementations may parse the .enc files directly, or +do the conversions entirely in XS (i.e. C code.) + +=head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB) + +The central entry point to this module is the XML::UM::get_encode() method. +It forwards the call to the global $XML::UM::FACTORY, which is defined as +an instance of XML::UM::SlowMapperFactory by default. Override this variable +to plug in your own mapper factory. + +The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper +(and caches it for subsequent use) that reads in the .xml encoding file and +creates a hash that maps UTF-8 characters to encoded characters. + +The get_encode() method of XML::UM::SlowMapper is called, finally, which +generates an anonimous subroutine that uses the hash to convert +multi-character UTF-8 blocks to the proper encoding. + +=head1 dispose_encoding ($encoding_name) + +Call this to free the memory used by the SlowMapper for a specific encoding. +Note that in order to free the big conversion hash, the user should no longer +have references to the subroutines generated by get_encode(). + +The parameters to the get_encode() method (defined as name/value pairs) are: + +=over 4 + +=item * Encoding + +The name of the desired encoding, e.g. 'ISO-8859-2' + +=item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec) + +Defines how Unicode characters not found in the mapping file (of the +specified encoding) are printed. +By default, they are converted to decimal entity references, like '{' + +Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '«' + +=back + +=head1 CAVEATS + +I'm not exactly sure about which Unicode characters in the range (0 .. 127) +should be mapped to themselves. See comments in XML/UM.pm near +%DEFAULT_ASCII_MAPPINGS. + +The encodings that expat supports by default are currently not supported, +(e.g. UTF-16, ISO-8859-1), +because there are no .enc files available for these encodings. +This module needs some more work. If you have the time, please help! + +=head1 AUTHOR + +Send bug reports, hints, tips, suggestions to Enno Derksen at +>. + +=cut