dummy_foundation/lib/XML/UM.pm
changeset 4 60053dab7e2a
parent 3 8b87ea768cb8
child 5 c34a018f3291
--- a/dummy_foundation/lib/XML/UM.pm	Wed Jun 03 18:33:51 2009 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,466 +0,0 @@
-#
-# TO DO:
-#
-# - Implement SlowMappers for expat builtin encodings (for which there
-#   are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1.
-# - Instead of parsing the .xml file with XML::Encoding, we should use XS.
-#   If this will not be implemented for a while, we could try reading the
-#   .enc file directly, instead of the .xml file.
-#   I started writing XML::UM::EncParser to do this (see EOF), but got stuck.
-#
-
-use strict;
-
-package XML::UM::SlowMapper;
-use Carp;
-use XML::Encoding;
-
-use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS };
-
-my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...';
-
-#
-# The directory that contains the .xml files that come with XML::Encoding.
-# Include the terminating '\' or '/' !!
-#
-$ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/";
-#$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\";
-
-#
-# From xmlparse.h in expat distribution:
-#
-# Expat places certain restrictions on the encodings that are supported
-# using this mechanism.
-#
-# 1. Every ASCII character that can appear in a well-formed XML document,
-# other than the characters
-#
-#  $@\^`{}~
-#
-# must be represented by a single byte, and that byte must be the
-# same byte that represents that character in ASCII.
-#
-# [end of excerpt]
-
-#?? Which 'ASCII characters can appear in a well-formed XML document ??
-
-# All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~
-%DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63, 
-						      65 .. 91, 93, 95, 
-						      97 .. 122, 124, 127);
-
-sub new
-{
-    my ($class, %hash) = @_;
-    my $self = bless \%hash, $class;
-    
-    $self->read_encoding_file;
-
-    $self;
-}
-
-sub dispose
-{
-    my $self = shift;
-    $self->{Factory}->dispose_mapper ($self);
-    delete $self->{Encode};
-}
-
-# Reads the XML file that contains the encoding definition.
-# These files come with XML::Encoding.
-sub read_encoding_file
-{
-#?? This should parse the .enc files (the .xml files are not installed) !!
-
-    my ($self) = @_;
-    my $encoding = $self->{Encoding};
-
-    # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple
-    # so here it goes...
-    if ($encoding eq 'US-ASCII')
-    {
-	$self->{EncMapName} = 'US-ASCII';
-	$self->{Map} = \%DEFAULT_ASCII_MAPPINGS;	# I hope this is right
-	return;
-    }
-
-    my $file = $self->find_encoding_file ($encoding);
-    
-    my %uni = %DEFAULT_ASCII_MAPPINGS;
-    my $prefix = "";
-    my $DIR = "file:$ENCDIR";
-
-    my $enc = new XML::Encoding (Handlers => { 
-					       Init =>
-					       sub {
-						   my $base = shift->base ($DIR);
-					       }
-				 },
-
-				 PushPrefixFcn => 
-				 sub { 
-				     $prefix .= chr (shift); 
-				     undef;
-				 },
-
-				 PopPrefixFcn => 
-				 sub {
-				     chop $prefix;
-				     undef;
-				 },
-
-				 RangeSetFcn => 
-				 sub {
-				     my ($byte, $uni, $len) = @_;
-				     for (my $i = $uni; $len--; $uni++)
-				     {
-					 $uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++);
-				     }
-				     undef;
-				 });
-
-    $self->{EncMapName} = $enc->parsefile ($file);
-
-#print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n";
-
-    $self->{Map} = \%uni;
-}
-
-sub find_encoding_file
-{
-    my ($self, $enc) = @_;
-
-    return "$ENCDIR\L$enc\E.xml";	 # .xml filename is lower case
-}
-
-# Returns a closure (method) that converts a UTF-8 encoded string to an 
-# encoded byte sequence.
-sub get_encode
-{
-    my ($self, %hash) = @_;
-    my $MAP = $self->{Map};
-    my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec;
-
-    my $code = "sub {\n    my \$str = shift;\n    \$str =~ s/";
-
-    $code .= "($UTFCHAR)/\n";
-    $code .= "defined \$MAP->{\$1} ? \$MAP->{\$1} : ";
-    $code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n";
-
-    $code .= "\$str }\n";
-#    print $code;
-
-    my $func = eval $code;
-    croak "could not eval generated code=[$code]: $@" if $@;
-
-    $func;
-}
-
-#
-# Optimized version for when the encoding is UTF-8.
-# (In that case no conversion takes place.)
-#
-package XML::UM::SlowMapper::UTF8;
-use vars qw{ @ISA };
-@ISA = qw{ XML::UM::SlowMapper };
-
-sub read_encoding_file
-{
-    # ignore it
-}
-
-sub get_encode
-{
-    \&dont_convert;
-}
-
-sub dont_convert	# static
-{
-    shift		# return argument unchanged
-}
-
-package XML::UM::SlowMapperFactory;
-
-sub new
-{
-    my ($class, %hash) = @_;
-    bless \%hash, $class;
-}
-
-sub get_encode
-{
-    my ($self, %options) = @_;
-    my $encoding = $options{Encoding};
-
-    my $mapper = $self->get_mapper ($encoding);
-    return $mapper->get_encode (%options);
-}
-
-sub get_mapper
-{
-    my ($self, $encoding) = @_;
-    $self->{Mapper}->{$encoding} ||= 
-	($encoding eq "UTF-8" ?
-	 new XML::UM::SlowMapper::UTF8 (Encoding => $encoding, 
-					Factory => $self) :
-	 new XML::UM::SlowMapper (Encoding => $encoding, 
-				  Factory => $self));
-}
-
-#
-# Prepare for garbage collection (remove circular refs)
-#
-sub dispose_encoding
-{
-    my ($self, $encoding) = @_;
-    my $mapper = $self->{Mapper}->{$encoding};
-    return unless defined $mapper;
-
-    delete $mapper->{Factory};
-    delete $self->{Mapper}->{$encoding};
-}
-
-package XML::UM;
-use Carp;
-
-use vars qw{ $FACTORY %XML_MAPPING_CRITERIA };
-$FACTORY = XML::UM::SlowMapperFactory->new;
-
-sub get_encode		# static
-{
-    $FACTORY->get_encode (@_);
-}
-
-sub dispose_encoding	# static
-{
-    $FACTORY->dispose_encoding (@_);
-}
-
-# Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string
-sub encode_unmapped_hex	# static
-{
-    my $n = utf8_to_unicode (shift);
-    sprintf ("&#x%X;", $n);
-}
-
-sub encode_unmapped_dec	# static
-{
-    my $n = utf8_to_unicode (shift);
-    "&#$n;"
-}
-
-# Converts a UTF-8 byte sequence that represents one character,
-# to its Unicode index.
-sub utf8_to_unicode	 # static
-{
-    my $str = shift;
-    my $len = length ($str);
-
-    if ($len == 1)
-    {
-	return ord ($str);
-    }
-    if ($len == 2)
-    {
-	my @n = unpack "C2", $str;
-	return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
-    }
-    elsif ($len == 3)
-    {
-	my @n = unpack "C3", $str;
-	return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + 
-		($n[2] & 0x3f);
-    }
-    elsif ($len == 4)
-    {
-	my @n = unpack "C4", $str;
-	return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + 
-		(($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
-    }
-    else
-    {
-	croak "bad UTF8 sequence [$str] hex=" . hb($str);
-    }
-}
-
-# Converts a Unicode character index to the byte sequence
-# that represents that character in UTF-8.
-sub unicode_to_utf8	# static
-{
-    my $n = shift;
-    if ($n < 0x80)
-    {
-	return chr ($n);
-    }
-    elsif ($n < 0x800)
-    {
-	return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80));
-    }
-    elsif ($n < 0x10000)
-    {
-	return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80),
-		     (($n & 0x3f) | 0x80));
-    }
-    elsif ($n < 0x110000)
-    {
-	return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80),
-		     ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80));
-    }
-    croak "number [$n] is too large for Unicode in \&unicode_to_utf8";
-}
-
-#?? The following package is unfinished. 
-#?? It should parse the .enc file and create an array that maps
-#?? Unicode-index to encoded-str. I got stuck...
-
-# package XML::UM::EncParser;
-#
-# sub new
-# {
-#     my ($class, %hash) = @_;
-#     my $self = bless \%hash, $class;
-#     $self;
-# }
-#
-# sub parse
-# {
-#     my ($self, $filename) = @_;
-#     open (FILE, $filename) || die "can't open .enc file $filename";
-#     binmode (FILE);
-#
-#     my $buf;
-#     read (FILE, $buf, 4 + 40 + 2 + 2 + 1024);
-#    
-#     my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf);
-#     printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic;
-#    
-#     if ($magic != 0xFEEBFACE)
-#     {
-# 	close FILE;
-# 	die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic);
-#     }
-#
-#     for (my $i = 0; $i < 256; $i++)
-#     {
-# 	printf "[%d]=%d ", $i, $map[$i];
-# 	print "\n" if ($i % 8 == 7);
-#     }
-#
-#     for (my $i = 0; $i < $pfsize; $i++)
-#     {
-# 	print "----- PrefixMap $i ----\n";
-# 	read (FILE, $buf, 2 + 2 + 32 + 32);
-# 	my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf);
-# 	my (@ischar) = splice @ispfx, 32, 32, ();
-# #?? could use b256 instead of C32 for bitvector a la vec()
-#
-# 	print "ispfx=@ispfx\n";
-# 	print "ischar=@ischar\n";
-# 	$len = 256 if $len == 0;
-#
-# 	print " min=$min len=$len bmap_start=$bmap_start\n";
-#     }
-#
-#     close FILE;
-# }
-
-1; # package return code
-
-__END__
-
-=head1 NAME
-
-XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding
-
-=head1 SYNOPSIS
-
- use XML::UM;
-
- # Set directory with .xml files that comes with XML::Encoding distribution
- # Always include the trailing slash!
- $XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/';
-
- # Create the encoding routine
- my $encode = XML::UM::get_encode (
-	Encoding => 'ISO-8859-2',
-	EncodeUnmapped => \&XML::UM::encode_unmapped_dec);
-
- # Convert a string from UTF-8 to the specified Encoding
- my $encoded_str = $encode->($utf8_str);
-
- # Remove circular references for garbage collection
- XML::UM::dispose_encoding ('ISO-8859-2');
-
-=head1 DESCRIPTION
-
-This module provides methods to convert UTF-8 strings to any XML encoding
-that L<XML::Encoding> supports. It creates mapping routines from the .xml
-files that can be found in the maps/ directory in the L<XML::Encoding>
-distribution. Note that the XML::Encoding distribution does install the 
-.enc files in your perl directory, but not the.xml files they were created
-from. That's why you have to specify $ENCDIR as in the SYNOPSIS.
-
-This implementation uses the XML::Encoding class to parse the .xml
-file and creates a hash that maps UTF-8 characters (each consisting of up
-to 4 bytes) to their equivalent byte sequence in the specified encoding. 
-Note that large mappings may consume a lot of memory! 
-
-Future implementations may parse the .enc files directly, or
-do the conversions entirely in XS (i.e. C code.)
-
-=head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB)
-
-The central entry point to this module is the XML::UM::get_encode() method. 
-It forwards the call to the global $XML::UM::FACTORY, which is defined as
-an instance of XML::UM::SlowMapperFactory by default. Override this variable
-to plug in your own mapper factory.
-
-The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper
-(and caches it for subsequent use) that reads in the .xml encoding file and
-creates a hash that maps UTF-8 characters to encoded characters.
-
-The get_encode() method of XML::UM::SlowMapper is called, finally, which
-generates an anonimous subroutine that uses the hash to convert 
-multi-character UTF-8 blocks to the proper encoding.
-
-=head1 dispose_encoding ($encoding_name)
-
-Call this to free the memory used by the SlowMapper for a specific encoding.
-Note that in order to free the big conversion hash, the user should no longer
-have references to the subroutines generated by get_encode().
-
-The parameters to the get_encode() method (defined as name/value pairs) are:
-
-=over 4
-
-=item * Encoding
-
-The name of the desired encoding, e.g. 'ISO-8859-2'
-
-=item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec)
-
-Defines how Unicode characters not found in the mapping file (of the 
-specified encoding) are printed. 
-By default, they are converted to decimal entity references, like '&#123;'
-
-Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '&#xAB;'
-
-=back
-
-=head1 CAVEATS
-
-I'm not exactly sure about which Unicode characters in the range (0 .. 127) 
-should be mapped to themselves. See comments in XML/UM.pm near
-%DEFAULT_ASCII_MAPPINGS.
-
-The encodings that expat supports by default are currently not supported, 
-(e.g. UTF-16, ISO-8859-1),
-because there are no .enc files available for these encodings.
-This module needs some more work. If you have the time, please help!
-
-=head1 AUTHOR
-
-Send bug reports, hints, tips, suggestions to Enno Derksen at
-<F<enno@att.com>>. 
-
-=cut