dummy_foundation/lib/XML/UM.pm
changeset 0 02cd6b52f378
equal deleted inserted replaced
-1:000000000000 0:02cd6b52f378
       
     1 #
       
     2 # TO DO:
       
     3 #
       
     4 # - Implement SlowMappers for expat builtin encodings (for which there
       
     5 #   are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1.
       
     6 # - Instead of parsing the .xml file with XML::Encoding, we should use XS.
       
     7 #   If this will not be implemented for a while, we could try reading the
       
     8 #   .enc file directly, instead of the .xml file.
       
     9 #   I started writing XML::UM::EncParser to do this (see EOF), but got stuck.
       
    10 #
       
    11 
       
    12 use strict;
       
    13 
       
    14 package XML::UM::SlowMapper;
       
    15 use Carp;
       
    16 use XML::Encoding;
       
    17 
       
    18 use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS };
       
    19 
       
    20 my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...';
       
    21 
       
    22 #
       
    23 # The directory that contains the .xml files that come with XML::Encoding.
       
    24 # Include the terminating '\' or '/' !!
       
    25 #
       
    26 $ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/";
       
    27 #$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\";
       
    28 
       
    29 #
       
    30 # From xmlparse.h in expat distribution:
       
    31 #
       
    32 # Expat places certain restrictions on the encodings that are supported
       
    33 # using this mechanism.
       
    34 #
       
    35 # 1. Every ASCII character that can appear in a well-formed XML document,
       
    36 # other than the characters
       
    37 #
       
    38 #  $@\^`{}~
       
    39 #
       
    40 # must be represented by a single byte, and that byte must be the
       
    41 # same byte that represents that character in ASCII.
       
    42 #
       
    43 # [end of excerpt]
       
    44 
       
    45 #?? Which 'ASCII characters can appear in a well-formed XML document ??
       
    46 
       
    47 # All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~
       
    48 %DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63, 
       
    49 						      65 .. 91, 93, 95, 
       
    50 						      97 .. 122, 124, 127);
       
    51 
       
    52 sub new
       
    53 {
       
    54     my ($class, %hash) = @_;
       
    55     my $self = bless \%hash, $class;
       
    56     
       
    57     $self->read_encoding_file;
       
    58 
       
    59     $self;
       
    60 }
       
    61 
       
    62 sub dispose
       
    63 {
       
    64     my $self = shift;
       
    65     $self->{Factory}->dispose_mapper ($self);
       
    66     delete $self->{Encode};
       
    67 }
       
    68 
       
    69 # Reads the XML file that contains the encoding definition.
       
    70 # These files come with XML::Encoding.
       
    71 sub read_encoding_file
       
    72 {
       
    73 #?? This should parse the .enc files (the .xml files are not installed) !!
       
    74 
       
    75     my ($self) = @_;
       
    76     my $encoding = $self->{Encoding};
       
    77 
       
    78     # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple
       
    79     # so here it goes...
       
    80     if ($encoding eq 'US-ASCII')
       
    81     {
       
    82 	$self->{EncMapName} = 'US-ASCII';
       
    83 	$self->{Map} = \%DEFAULT_ASCII_MAPPINGS;	# I hope this is right
       
    84 	return;
       
    85     }
       
    86 
       
    87     my $file = $self->find_encoding_file ($encoding);
       
    88     
       
    89     my %uni = %DEFAULT_ASCII_MAPPINGS;
       
    90     my $prefix = "";
       
    91     my $DIR = "file:$ENCDIR";
       
    92 
       
    93     my $enc = new XML::Encoding (Handlers => { 
       
    94 					       Init =>
       
    95 					       sub {
       
    96 						   my $base = shift->base ($DIR);
       
    97 					       }
       
    98 				 },
       
    99 
       
   100 				 PushPrefixFcn => 
       
   101 				 sub { 
       
   102 				     $prefix .= chr (shift); 
       
   103 				     undef;
       
   104 				 },
       
   105 
       
   106 				 PopPrefixFcn => 
       
   107 				 sub {
       
   108 				     chop $prefix;
       
   109 				     undef;
       
   110 				 },
       
   111 
       
   112 				 RangeSetFcn => 
       
   113 				 sub {
       
   114 				     my ($byte, $uni, $len) = @_;
       
   115 				     for (my $i = $uni; $len--; $uni++)
       
   116 				     {
       
   117 					 $uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++);
       
   118 				     }
       
   119 				     undef;
       
   120 				 });
       
   121 
       
   122     $self->{EncMapName} = $enc->parsefile ($file);
       
   123 
       
   124 #print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n";
       
   125 
       
   126     $self->{Map} = \%uni;
       
   127 }
       
   128 
       
   129 sub find_encoding_file
       
   130 {
       
   131     my ($self, $enc) = @_;
       
   132 
       
   133     return "$ENCDIR\L$enc\E.xml";	 # .xml filename is lower case
       
   134 }
       
   135 
       
   136 # Returns a closure (method) that converts a UTF-8 encoded string to an 
       
   137 # encoded byte sequence.
       
   138 sub get_encode
       
   139 {
       
   140     my ($self, %hash) = @_;
       
   141     my $MAP = $self->{Map};
       
   142     my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec;
       
   143 
       
   144     my $code = "sub {\n    my \$str = shift;\n    \$str =~ s/";
       
   145 
       
   146     $code .= "($UTFCHAR)/\n";
       
   147     $code .= "defined \$MAP->{\$1} ? \$MAP->{\$1} : ";
       
   148     $code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n";
       
   149 
       
   150     $code .= "\$str }\n";
       
   151 #    print $code;
       
   152 
       
   153     my $func = eval $code;
       
   154     croak "could not eval generated code=[$code]: $@" if $@;
       
   155 
       
   156     $func;
       
   157 }
       
   158 
       
   159 #
       
   160 # Optimized version for when the encoding is UTF-8.
       
   161 # (In that case no conversion takes place.)
       
   162 #
       
   163 package XML::UM::SlowMapper::UTF8;
       
   164 use vars qw{ @ISA };
       
   165 @ISA = qw{ XML::UM::SlowMapper };
       
   166 
       
   167 sub read_encoding_file
       
   168 {
       
   169     # ignore it
       
   170 }
       
   171 
       
   172 sub get_encode
       
   173 {
       
   174     \&dont_convert;
       
   175 }
       
   176 
       
   177 sub dont_convert	# static
       
   178 {
       
   179     shift		# return argument unchanged
       
   180 }
       
   181 
       
   182 package XML::UM::SlowMapperFactory;
       
   183 
       
   184 sub new
       
   185 {
       
   186     my ($class, %hash) = @_;
       
   187     bless \%hash, $class;
       
   188 }
       
   189 
       
   190 sub get_encode
       
   191 {
       
   192     my ($self, %options) = @_;
       
   193     my $encoding = $options{Encoding};
       
   194 
       
   195     my $mapper = $self->get_mapper ($encoding);
       
   196     return $mapper->get_encode (%options);
       
   197 }
       
   198 
       
   199 sub get_mapper
       
   200 {
       
   201     my ($self, $encoding) = @_;
       
   202     $self->{Mapper}->{$encoding} ||= 
       
   203 	($encoding eq "UTF-8" ?
       
   204 	 new XML::UM::SlowMapper::UTF8 (Encoding => $encoding, 
       
   205 					Factory => $self) :
       
   206 	 new XML::UM::SlowMapper (Encoding => $encoding, 
       
   207 				  Factory => $self));
       
   208 }
       
   209 
       
   210 #
       
   211 # Prepare for garbage collection (remove circular refs)
       
   212 #
       
   213 sub dispose_encoding
       
   214 {
       
   215     my ($self, $encoding) = @_;
       
   216     my $mapper = $self->{Mapper}->{$encoding};
       
   217     return unless defined $mapper;
       
   218 
       
   219     delete $mapper->{Factory};
       
   220     delete $self->{Mapper}->{$encoding};
       
   221 }
       
   222 
       
   223 package XML::UM;
       
   224 use Carp;
       
   225 
       
   226 use vars qw{ $FACTORY %XML_MAPPING_CRITERIA };
       
   227 $FACTORY = XML::UM::SlowMapperFactory->new;
       
   228 
       
   229 sub get_encode		# static
       
   230 {
       
   231     $FACTORY->get_encode (@_);
       
   232 }
       
   233 
       
   234 sub dispose_encoding	# static
       
   235 {
       
   236     $FACTORY->dispose_encoding (@_);
       
   237 }
       
   238 
       
   239 # Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string
       
   240 sub encode_unmapped_hex	# static
       
   241 {
       
   242     my $n = utf8_to_unicode (shift);
       
   243     sprintf ("&#x%X;", $n);
       
   244 }
       
   245 
       
   246 sub encode_unmapped_dec	# static
       
   247 {
       
   248     my $n = utf8_to_unicode (shift);
       
   249     "&#$n;"
       
   250 }
       
   251 
       
   252 # Converts a UTF-8 byte sequence that represents one character,
       
   253 # to its Unicode index.
       
   254 sub utf8_to_unicode	 # static
       
   255 {
       
   256     my $str = shift;
       
   257     my $len = length ($str);
       
   258 
       
   259     if ($len == 1)
       
   260     {
       
   261 	return ord ($str);
       
   262     }
       
   263     if ($len == 2)
       
   264     {
       
   265 	my @n = unpack "C2", $str;
       
   266 	return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
       
   267     }
       
   268     elsif ($len == 3)
       
   269     {
       
   270 	my @n = unpack "C3", $str;
       
   271 	return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + 
       
   272 		($n[2] & 0x3f);
       
   273     }
       
   274     elsif ($len == 4)
       
   275     {
       
   276 	my @n = unpack "C4", $str;
       
   277 	return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + 
       
   278 		(($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
       
   279     }
       
   280     else
       
   281     {
       
   282 	croak "bad UTF8 sequence [$str] hex=" . hb($str);
       
   283     }
       
   284 }
       
   285 
       
   286 # Converts a Unicode character index to the byte sequence
       
   287 # that represents that character in UTF-8.
       
   288 sub unicode_to_utf8	# static
       
   289 {
       
   290     my $n = shift;
       
   291     if ($n < 0x80)
       
   292     {
       
   293 	return chr ($n);
       
   294     }
       
   295     elsif ($n < 0x800)
       
   296     {
       
   297 	return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80));
       
   298     }
       
   299     elsif ($n < 0x10000)
       
   300     {
       
   301 	return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80),
       
   302 		     (($n & 0x3f) | 0x80));
       
   303     }
       
   304     elsif ($n < 0x110000)
       
   305     {
       
   306 	return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80),
       
   307 		     ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80));
       
   308     }
       
   309     croak "number [$n] is too large for Unicode in \&unicode_to_utf8";
       
   310 }
       
   311 
       
   312 #?? The following package is unfinished. 
       
   313 #?? It should parse the .enc file and create an array that maps
       
   314 #?? Unicode-index to encoded-str. I got stuck...
       
   315 
       
   316 # package XML::UM::EncParser;
       
   317 #
       
   318 # sub new
       
   319 # {
       
   320 #     my ($class, %hash) = @_;
       
   321 #     my $self = bless \%hash, $class;
       
   322 #     $self;
       
   323 # }
       
   324 #
       
   325 # sub parse
       
   326 # {
       
   327 #     my ($self, $filename) = @_;
       
   328 #     open (FILE, $filename) || die "can't open .enc file $filename";
       
   329 #     binmode (FILE);
       
   330 #
       
   331 #     my $buf;
       
   332 #     read (FILE, $buf, 4 + 40 + 2 + 2 + 1024);
       
   333 #    
       
   334 #     my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf);
       
   335 #     printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic;
       
   336 #    
       
   337 #     if ($magic != 0xFEEBFACE)
       
   338 #     {
       
   339 # 	close FILE;
       
   340 # 	die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic);
       
   341 #     }
       
   342 #
       
   343 #     for (my $i = 0; $i < 256; $i++)
       
   344 #     {
       
   345 # 	printf "[%d]=%d ", $i, $map[$i];
       
   346 # 	print "\n" if ($i % 8 == 7);
       
   347 #     }
       
   348 #
       
   349 #     for (my $i = 0; $i < $pfsize; $i++)
       
   350 #     {
       
   351 # 	print "----- PrefixMap $i ----\n";
       
   352 # 	read (FILE, $buf, 2 + 2 + 32 + 32);
       
   353 # 	my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf);
       
   354 # 	my (@ischar) = splice @ispfx, 32, 32, ();
       
   355 # #?? could use b256 instead of C32 for bitvector a la vec()
       
   356 #
       
   357 # 	print "ispfx=@ispfx\n";
       
   358 # 	print "ischar=@ischar\n";
       
   359 # 	$len = 256 if $len == 0;
       
   360 #
       
   361 # 	print " min=$min len=$len bmap_start=$bmap_start\n";
       
   362 #     }
       
   363 #
       
   364 #     close FILE;
       
   365 # }
       
   366 
       
   367 1; # package return code
       
   368 
       
   369 __END__
       
   370 
       
   371 =head1 NAME
       
   372 
       
   373 XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding
       
   374 
       
   375 =head1 SYNOPSIS
       
   376 
       
   377  use XML::UM;
       
   378 
       
   379  # Set directory with .xml files that comes with XML::Encoding distribution
       
   380  # Always include the trailing slash!
       
   381  $XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/';
       
   382 
       
   383  # Create the encoding routine
       
   384  my $encode = XML::UM::get_encode (
       
   385 	Encoding => 'ISO-8859-2',
       
   386 	EncodeUnmapped => \&XML::UM::encode_unmapped_dec);
       
   387 
       
   388  # Convert a string from UTF-8 to the specified Encoding
       
   389  my $encoded_str = $encode->($utf8_str);
       
   390 
       
   391  # Remove circular references for garbage collection
       
   392  XML::UM::dispose_encoding ('ISO-8859-2');
       
   393 
       
   394 =head1 DESCRIPTION
       
   395 
       
   396 This module provides methods to convert UTF-8 strings to any XML encoding
       
   397 that L<XML::Encoding> supports. It creates mapping routines from the .xml
       
   398 files that can be found in the maps/ directory in the L<XML::Encoding>
       
   399 distribution. Note that the XML::Encoding distribution does install the 
       
   400 .enc files in your perl directory, but not the.xml files they were created
       
   401 from. That's why you have to specify $ENCDIR as in the SYNOPSIS.
       
   402 
       
   403 This implementation uses the XML::Encoding class to parse the .xml
       
   404 file and creates a hash that maps UTF-8 characters (each consisting of up
       
   405 to 4 bytes) to their equivalent byte sequence in the specified encoding. 
       
   406 Note that large mappings may consume a lot of memory! 
       
   407 
       
   408 Future implementations may parse the .enc files directly, or
       
   409 do the conversions entirely in XS (i.e. C code.)
       
   410 
       
   411 =head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB)
       
   412 
       
   413 The central entry point to this module is the XML::UM::get_encode() method. 
       
   414 It forwards the call to the global $XML::UM::FACTORY, which is defined as
       
   415 an instance of XML::UM::SlowMapperFactory by default. Override this variable
       
   416 to plug in your own mapper factory.
       
   417 
       
   418 The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper
       
   419 (and caches it for subsequent use) that reads in the .xml encoding file and
       
   420 creates a hash that maps UTF-8 characters to encoded characters.
       
   421 
       
   422 The get_encode() method of XML::UM::SlowMapper is called, finally, which
       
   423 generates an anonimous subroutine that uses the hash to convert 
       
   424 multi-character UTF-8 blocks to the proper encoding.
       
   425 
       
   426 =head1 dispose_encoding ($encoding_name)
       
   427 
       
   428 Call this to free the memory used by the SlowMapper for a specific encoding.
       
   429 Note that in order to free the big conversion hash, the user should no longer
       
   430 have references to the subroutines generated by get_encode().
       
   431 
       
   432 The parameters to the get_encode() method (defined as name/value pairs) are:
       
   433 
       
   434 =over 4
       
   435 
       
   436 =item * Encoding
       
   437 
       
   438 The name of the desired encoding, e.g. 'ISO-8859-2'
       
   439 
       
   440 =item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec)
       
   441 
       
   442 Defines how Unicode characters not found in the mapping file (of the 
       
   443 specified encoding) are printed. 
       
   444 By default, they are converted to decimal entity references, like '&#123;'
       
   445 
       
   446 Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '&#xAB;'
       
   447 
       
   448 =back
       
   449 
       
   450 =head1 CAVEATS
       
   451 
       
   452 I'm not exactly sure about which Unicode characters in the range (0 .. 127) 
       
   453 should be mapped to themselves. See comments in XML/UM.pm near
       
   454 %DEFAULT_ASCII_MAPPINGS.
       
   455 
       
   456 The encodings that expat supports by default are currently not supported, 
       
   457 (e.g. UTF-16, ISO-8859-1),
       
   458 because there are no .enc files available for these encodings.
       
   459 This module needs some more work. If you have the time, please help!
       
   460 
       
   461 =head1 AUTHOR
       
   462 
       
   463 Send bug reports, hints, tips, suggestions to Enno Derksen at
       
   464 <F<enno@att.com>>. 
       
   465 
       
   466 =cut