diff -r 8b87ea768cb8 -r 60053dab7e2a dummy_foundation/lib/XML/UM.pm --- a/dummy_foundation/lib/XML/UM.pm Wed Jun 03 18:33:51 2009 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,466 +0,0 @@ -# -# TO DO: -# -# - Implement SlowMappers for expat builtin encodings (for which there -# are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1. -# - Instead of parsing the .xml file with XML::Encoding, we should use XS. -# If this will not be implemented for a while, we could try reading the -# .enc file directly, instead of the .xml file. -# I started writing XML::UM::EncParser to do this (see EOF), but got stuck. -# - -use strict; - -package XML::UM::SlowMapper; -use Carp; -use XML::Encoding; - -use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS }; - -# -# Regex source (alternation) that matches one UTF-8 encoded character, chosen -# by its first byte: 0x00-0xBF = 1 byte, 0xC0-0xDF = 2 bytes, -# 0xE0-0xEF = 3 bytes, 0xF0-0xFF = 4 bytes. The trailing bytes are matched -# with '.', so they are not validated. get_encode() interpolates this string -# into the s///egs pattern of the code it generates. -# -my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...'; - -# -# The directory that contains the .xml files that come with XML::Encoding. -# Include the terminating '\' or '/' !! -# -$ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/"; -#$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\"; - -# -# From xmlparse.h in expat distribution: -# -# Expat places certain restrictions on the encodings that are supported -# using this mechanism. -# -# 1. Every ASCII character that can appear in a well-formed XML document, -# other than the characters -# -# $@\^`{}~ -# -# must be represented by a single byte, and that byte must be the -# same byte that represents that character in ASCII. -# -# [end of excerpt] - -#?? Which ASCII characters can appear in a well-formed XML document ?? - -# All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~ -%DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63, - 65 .. 91, 93, 95, - 97 .. 
122, 124, 127); - -# Constructor. Blesses the given name/value pairs (Encoding, Factory) and -# immediately loads the mapping for $self->{Encoding}. -sub new -{ - my ($class, %hash) = @_; - my $self = bless \%hash, $class; - - $self->read_encoding_file; - - $self; -} - -# Detaches this mapper from its factory so that the circular reference -# between the two can be garbage collected. -# XML::UM::SlowMapperFactory defines dispose_encoding() but no -# dispose_mapper(), so fall back to dispose_encoding() unless the factory -# really provides dispose_mapper(). Does nothing to the factory if the -# mapper was already detached. -sub dispose -{ - my $self = shift; - my $factory = $self->{Factory}; - if (defined $factory) - { - if ($factory->can ('dispose_mapper')) - { - $factory->dispose_mapper ($self); - } - else - { - $factory->dispose_encoding ($self->{Encoding}); - } - } - delete $self->{Encode}; -} - -# Reads the XML file that contains the encoding definition. -# These files come with XML::Encoding. -# Stores the resulting hash (UTF-8 character => encoded byte sequence) in -# $self->{Map} and the map name in $self->{EncMapName}. -sub read_encoding_file -{ -#?? This should parse the .enc files (the .xml files are not installed) !! - - my ($self) = @_; - my $encoding = $self->{Encoding}; - - # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple - # so here it goes... - if ($encoding eq 'US-ASCII') - { - $self->{EncMapName} = 'US-ASCII'; - $self->{Map} = \%DEFAULT_ASCII_MAPPINGS; # I hope this is right - return; - } - - my $file = $self->find_encoding_file ($encoding); - - my %uni = %DEFAULT_ASCII_MAPPINGS; - my $prefix = ""; - my $DIR = "file:$ENCDIR"; - - my $enc = new XML::Encoding (Handlers => { - Init => - sub { - my $base = shift->base ($DIR); - } - }, - - PushPrefixFcn => - sub { - $prefix .= chr (shift); - undef; - }, - - PopPrefixFcn => - sub { - chop $prefix; - undef; - }, - - RangeSetFcn => - sub { - my ($byte, $uni, $len) = @_; - for (my $i = $uni; $len--; $uni++) - { - $uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++); - } - undef; - }); - - $self->{EncMapName} = $enc->parsefile ($file); - -#print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n"; - - $self->{Map} = \%uni; -} - -# Returns the path of the .xml map file for encoding $enc inside $ENCDIR. -sub find_encoding_file -{ - my ($self, $enc) = @_; - - return "$ENCDIR\L$enc\E.xml"; # .xml filename is lower case -} - -# Returns a closure (method) that converts a UTF-8 encoded string to an -# encoded byte sequence. -sub get_encode -{ - my ($self, %hash) = @_; - my $MAP = $self->{Map}; - my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec; - - my $code = "sub {\n my \$str = shift;\n \$str =~ s/"; - - $code .= "($UTFCHAR)/\n"; - $code .= "defined \$MAP->{\$1} ? 
\$MAP->{\$1} : "; - $code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n"; - - $code .= "\$str }\n"; -# print $code; - - my $func = eval $code; - croak "could not eval generated code=[$code]: $@" if $@; - - $func; -} - -# -# Optimized version for when the encoding is UTF-8. -# (In that case no conversion takes place.) -# -package XML::UM::SlowMapper::UTF8; -use vars qw{ @ISA }; -@ISA = qw{ XML::UM::SlowMapper }; - -sub read_encoding_file -{ - # ignore it -} - -sub get_encode -{ - \&dont_convert; -} - -sub dont_convert # static -{ - shift # return argument unchanged -} - -package XML::UM::SlowMapperFactory; - -# Constructor. Blesses the given name/value pairs; mappers are cached -# per encoding name under $self->{Mapper}. -sub new -{ - my ($class, %hash) = @_; - bless \%hash, $class; -} - -# Returns the encoding closure for the encoding named by the 'Encoding' -# option. All options are passed on to the mapper's get_encode(). -# Croaks when 'Encoding' is missing, instead of failing later while trying -# to parse a map file with an empty name. -sub get_encode -{ - my ($self, %options) = @_; - my $encoding = $options{Encoding}; - Carp::croak ("get_encode: missing required option 'Encoding'") - unless defined $encoding; - - my $mapper = $self->get_mapper ($encoding); - return $mapper->get_encode (%options); -} - -# Returns the cached mapper for $encoding, creating it on first use. -# "UTF-8" gets the pass-through XML::UM::SlowMapper::UTF8. -sub get_mapper -{ - my ($self, $encoding) = @_; - $self->{Mapper}->{$encoding} ||= - ($encoding eq "UTF-8" ? - XML::UM::SlowMapper::UTF8->new (Encoding => $encoding, - Factory => $self) : - XML::UM::SlowMapper->new (Encoding => $encoding, - Factory => $self)); -} - -# -# Prepare for garbage collection (remove circular refs) -# -sub dispose_encoding -{ - my ($self, $encoding) = @_; - my $mapper = $self->{Mapper}->{$encoding}; - return unless defined $mapper; - - delete $mapper->{Factory}; - delete $self->{Mapper}->{$encoding}; -} - -package XML::UM; -use Carp; - -use vars qw{ $FACTORY %XML_MAPPING_CRITERIA }; -$FACTORY = XML::UM::SlowMapperFactory->new; - -# Forwards to the get_encode() method of the global $FACTORY. -sub get_encode # static -{ - $FACTORY->get_encode (@_); -} - -# Forwards to the dispose_encoding() method of the global $FACTORY. -sub dispose_encoding # static -{ - $FACTORY->dispose_encoding (@_); -} - -# Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string -sub encode_unmapped_hex # static -{ - my $n = utf8_to_unicode (shift); - sprintf ("&#x%X;", $n); -} - -# Convert UTF-8 byte sequence to Unicode index; then to '&#NN;' string -sub encode_unmapped_dec # static -{ - my $n = utf8_to_unicode (shift); - "&#$n;" -} - -# Converts a UTF-8 byte sequence that represents one character, -# to its Unicode 
index. -# Croaks if the sequence is empty or longer than 4 bytes. -# The first-byte masks below are one bit wider than the payload of the lead -# byte; that extra bit is always 0 in a valid 2-, 3- or 4-byte lead byte, so -# the result is the same as with the narrower masks. -sub utf8_to_unicode # static -{ - my $str = shift; - my $len = length ($str); - - if ($len == 1) - { - return ord ($str); - } - if ($len == 2) - { - my @n = unpack "C2", $str; - return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); - } - elsif ($len == 3) - { - my @n = unpack "C3", $str; - return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + - ($n[2] & 0x3f); - } - elsif ($len == 4) - { - my @n = unpack "C4", $str; - return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + - (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); - } - else - { - # unpack "H*" gives the hex dump; the hb() helper that was called here - # is not defined anywhere, so the real error was never reported. - croak "bad UTF8 sequence [$str] hex=" . unpack ("H*", $str); - } -} - -# Converts a Unicode character index to the byte sequence -# that represents that character in UTF-8. -# Croaks for indexes of 0x110000 and above. -sub unicode_to_utf8 # static -{ - my $n = shift; - if ($n < 0x80) - { - return chr ($n); - } - elsif ($n < 0x800) - { - return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80)); - } - elsif ($n < 0x10000) - { - return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80), - (($n & 0x3f) | 0x80)); - } - elsif ($n < 0x110000) - { - return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80), - ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80)); - } - croak "number [$n] is too large for Unicode in \&unicode_to_utf8"; -} - -#?? The following package is unfinished. -#?? It should parse the .enc file and create an array that maps -#?? Unicode-index to encoded-str. I got stuck... 
- -# package XML::UM::EncParser; -# -# sub new -# { -# my ($class, %hash) = @_; -# my $self = bless \%hash, $class; -# $self; -# } -# -# sub parse -# { -# my ($self, $filename) = @_; -# open (FILE, $filename) || die "can't open .enc file $filename"; -# binmode (FILE); -# -# my $buf; -# read (FILE, $buf, 4 + 40 + 2 + 2 + 1024); -# -# my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf); -# printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic; -# -# if ($magic != 0xFEEBFACE) -# { -# close FILE; -# die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic); -# } -# -# for (my $i = 0; $i < 256; $i++) -# { -# printf "[%d]=%d ", $i, $map[$i]; -# print "\n" if ($i % 8 == 7); -# } -# -# for (my $i = 0; $i < $pfsize; $i++) -# { -# print "----- PrefixMap $i ----\n"; -# read (FILE, $buf, 2 + 2 + 32 + 32); -# my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf); -# my (@ischar) = splice @ispfx, 32, 32, (); -# #?? could use b256 instead of C32 for bitvector a la vec() -# -# print "ispfx=@ispfx\n"; -# print "ischar=@ischar\n"; -# $len = 256 if $len == 0; -# -# print " min=$min len=$len bmap_start=$bmap_start\n"; -# } -# -# close FILE; -# } - -1; # package return code - -__END__ - -=head1 NAME - -XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding - -=head1 SYNOPSIS - - use XML::UM; - - # Set directory with .xml files that comes with XML::Encoding distribution - # Always include the trailing slash! 
- $XML::UM::SlowMapper::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/'; - - # Create the encoding routine - my $encode = XML::UM::get_encode ( - Encoding => 'ISO-8859-2', - EncodeUnmapped => \&XML::UM::encode_unmapped_dec); - - # Convert a string from UTF-8 to the specified Encoding - my $encoded_str = $encode->($utf8_str); - - # Remove circular references for garbage collection - XML::UM::dispose_encoding ('ISO-8859-2'); - -=head1 DESCRIPTION - -This module provides methods to convert UTF-8 strings to any XML encoding -that L<XML::Encoding> supports. It creates mapping routines from the .xml -files that can be found in the maps/ directory in the L<XML::Encoding> -distribution. Note that the XML::Encoding distribution does install the -.enc files in your perl directory, but not the .xml files they were created -from. That's why you have to specify $ENCDIR as in the SYNOPSIS. - -This implementation uses the XML::Encoding class to parse the .xml -file and creates a hash that maps UTF-8 characters (each consisting of up -to 4 bytes) to their equivalent byte sequence in the specified encoding. -Note that large mappings may consume a lot of memory! - -Future implementations may parse the .enc files directly, or -do the conversions entirely in XS (i.e. C code.) - -=head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB) - -The central entry point to this module is the XML::UM::get_encode() method. -It forwards the call to the global $XML::UM::FACTORY, which is defined as -an instance of XML::UM::SlowMapperFactory by default. Override this variable -to plug in your own mapper factory. - -The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper -(and caches it for subsequent use) that reads in the .xml encoding file and -creates a hash that maps UTF-8 characters to encoded characters. - -The get_encode() method of XML::UM::SlowMapper is called, finally, which -generates an anonymous subroutine that uses the hash to convert -multi-character UTF-8 blocks to the proper encoding. 
- -=head1 dispose_encoding ($encoding_name) - -Call this to free the memory used by the SlowMapper for a specific encoding. -Note that in order to free the big conversion hash, the user should no longer -have references to the subroutines generated by get_encode(). - -The parameters to the get_encode() method (defined as name/value pairs) are: - -=over 4 - -=item * Encoding - -The name of the desired encoding, e.g. 'ISO-8859-2' - -=item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec) - -Defines how Unicode characters not found in the mapping file (of the -specified encoding) are printed. -By default, they are converted to decimal entity references, like '&#123;' - -Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '&#xAB;' - -=back - -=head1 CAVEATS - -I'm not exactly sure about which Unicode characters in the range (0 .. 127) -should be mapped to themselves. See comments in XML/UM.pm near -%DEFAULT_ASCII_MAPPINGS. - -The encodings that expat supports by default are currently not supported, -(e.g. UTF-16, ISO-8859-1), -because there are no .enc files available for these encodings. -This module needs some more work. If you have the time, please help! - -=head1 AUTHOR - -Send bug reports, hints, tips, suggestions to Enno Derksen at ->. - -=cut