655
|
1 |
#
|
|
2 |
# TO DO:
|
|
3 |
#
|
|
4 |
# - Implement SlowMappers for expat builtin encodings (for which there
|
|
5 |
# are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1.
|
|
6 |
# - Instead of parsing the .xml file with XML::Encoding, we should use XS.
|
|
7 |
# If this will not be implemented for a while, we could try reading the
|
|
8 |
# .enc file directly, instead of the .xml file.
|
|
9 |
# I started writing XML::UM::EncParser to do this (see EOF), but got stuck.
|
|
10 |
#
|
|
11 |
|
|
12 |
use strict;
|
|
13 |
|
|
14 |
package XML::UM::SlowMapper;
|
|
15 |
use Carp;
|
|
16 |
use XML::Encoding;
|
|
17 |
|
|
18 |
use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS };
|
|
19 |
|
|
20 |
my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...';
|
|
21 |
|
|
22 |
#
|
|
23 |
# The directory that contains the .xml files that come with XML::Encoding.
|
|
24 |
# Include the terminating '\' or '/' !!
|
|
25 |
#
|
|
26 |
$ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/";
|
|
27 |
#$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\";
|
|
28 |
|
|
29 |
#
|
|
30 |
# From xmlparse.h in expat distribution:
|
|
31 |
#
|
|
32 |
# Expat places certain restrictions on the encodings that are supported
|
|
33 |
# using this mechanism.
|
|
34 |
#
|
|
35 |
# 1. Every ASCII character that can appear in a well-formed XML document,
|
|
36 |
# other than the characters
|
|
37 |
#
|
|
38 |
# $@\^`{}~
|
|
39 |
#
|
|
40 |
# must be represented by a single byte, and that byte must be the
|
|
41 |
# same byte that represents that character in ASCII.
|
|
42 |
#
|
|
43 |
# [end of excerpt]
|
|
44 |
|
|
45 |
#?? Which 'ASCII characters can appear in a well-formed XML document ??
|
|
46 |
|
|
47 |
# All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~
|
|
48 |
%DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63,
|
|
49 |
65 .. 91, 93, 95,
|
|
50 |
97 .. 122, 124, 127);
|
|
51 |
|
|
52 |
sub new
|
|
53 |
{
|
|
54 |
my ($class, %hash) = @_;
|
|
55 |
my $self = bless \%hash, $class;
|
|
56 |
|
|
57 |
$self->read_encoding_file;
|
|
58 |
|
|
59 |
$self;
|
|
60 |
}
|
|
61 |
|
|
62 |
sub dispose
|
|
63 |
{
|
|
64 |
my $self = shift;
|
|
65 |
$self->{Factory}->dispose_mapper ($self);
|
|
66 |
delete $self->{Encode};
|
|
67 |
}
|
|
68 |
|
|
69 |
# Reads the XML file that contains the encoding definition.
|
|
70 |
# These files come with XML::Encoding.
|
|
71 |
sub read_encoding_file
|
|
72 |
{
|
|
73 |
#?? This should parse the .enc files (the .xml files are not installed) !!
|
|
74 |
|
|
75 |
my ($self) = @_;
|
|
76 |
my $encoding = $self->{Encoding};
|
|
77 |
|
|
78 |
# There is no .enc (or .xml) file for US-ASCII, but the mapping is simple
|
|
79 |
# so here it goes...
|
|
80 |
if ($encoding eq 'US-ASCII')
|
|
81 |
{
|
|
82 |
$self->{EncMapName} = 'US-ASCII';
|
|
83 |
$self->{Map} = \%DEFAULT_ASCII_MAPPINGS; # I hope this is right
|
|
84 |
return;
|
|
85 |
}
|
|
86 |
|
|
87 |
my $file = $self->find_encoding_file ($encoding);
|
|
88 |
|
|
89 |
my %uni = %DEFAULT_ASCII_MAPPINGS;
|
|
90 |
my $prefix = "";
|
|
91 |
my $DIR = "file:$ENCDIR";
|
|
92 |
|
|
93 |
my $enc = new XML::Encoding (Handlers => {
|
|
94 |
Init =>
|
|
95 |
sub {
|
|
96 |
my $base = shift->base ($DIR);
|
|
97 |
}
|
|
98 |
},
|
|
99 |
|
|
100 |
PushPrefixFcn =>
|
|
101 |
sub {
|
|
102 |
$prefix .= chr (shift);
|
|
103 |
undef;
|
|
104 |
},
|
|
105 |
|
|
106 |
PopPrefixFcn =>
|
|
107 |
sub {
|
|
108 |
chop $prefix;
|
|
109 |
undef;
|
|
110 |
},
|
|
111 |
|
|
112 |
RangeSetFcn =>
|
|
113 |
sub {
|
|
114 |
my ($byte, $uni, $len) = @_;
|
|
115 |
for (my $i = $uni; $len--; $uni++)
|
|
116 |
{
|
|
117 |
$uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++);
|
|
118 |
}
|
|
119 |
undef;
|
|
120 |
});
|
|
121 |
|
|
122 |
$self->{EncMapName} = $enc->parsefile ($file);
|
|
123 |
|
|
124 |
#print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n";
|
|
125 |
|
|
126 |
$self->{Map} = \%uni;
|
|
127 |
}
|
|
128 |
|
|
129 |
sub find_encoding_file
|
|
130 |
{
|
|
131 |
my ($self, $enc) = @_;
|
|
132 |
|
|
133 |
return "$ENCDIR\L$enc\E.xml"; # .xml filename is lower case
|
|
134 |
}
|
|
135 |
|
|
136 |
# Returns a closure (method) that converts a UTF-8 encoded string to an
|
|
137 |
# encoded byte sequence.
|
|
138 |
sub get_encode
|
|
139 |
{
|
|
140 |
my ($self, %hash) = @_;
|
|
141 |
my $MAP = $self->{Map};
|
|
142 |
my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec;
|
|
143 |
|
|
144 |
my $code = "sub {\n my \$str = shift;\n \$str =~ s/";
|
|
145 |
|
|
146 |
$code .= "($UTFCHAR)/\n";
|
|
147 |
$code .= "defined \$MAP->{\$1} ? \$MAP->{\$1} : ";
|
|
148 |
$code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n";
|
|
149 |
|
|
150 |
$code .= "\$str }\n";
|
|
151 |
# print $code;
|
|
152 |
|
|
153 |
my $func = eval $code;
|
|
154 |
croak "could not eval generated code=[$code]: $@" if $@;
|
|
155 |
|
|
156 |
$func;
|
|
157 |
}
|
|
158 |
|
|
159 |
#
|
|
160 |
# Optimized version for when the encoding is UTF-8.
|
|
161 |
# (In that case no conversion takes place.)
|
|
162 |
#
|
|
163 |
package XML::UM::SlowMapper::UTF8;
|
|
164 |
use vars qw{ @ISA };
|
|
165 |
@ISA = qw{ XML::UM::SlowMapper };
|
|
166 |
|
|
167 |
sub read_encoding_file
|
|
168 |
{
|
|
169 |
# ignore it
|
|
170 |
}
|
|
171 |
|
|
172 |
sub get_encode
|
|
173 |
{
|
|
174 |
\&dont_convert;
|
|
175 |
}
|
|
176 |
|
|
177 |
sub dont_convert # static
|
|
178 |
{
|
|
179 |
shift # return argument unchanged
|
|
180 |
}
|
|
181 |
|
|
182 |
package XML::UM::SlowMapperFactory;
|
|
183 |
|
|
184 |
sub new
|
|
185 |
{
|
|
186 |
my ($class, %hash) = @_;
|
|
187 |
bless \%hash, $class;
|
|
188 |
}
|
|
189 |
|
|
190 |
sub get_encode
|
|
191 |
{
|
|
192 |
my ($self, %options) = @_;
|
|
193 |
my $encoding = $options{Encoding};
|
|
194 |
|
|
195 |
my $mapper = $self->get_mapper ($encoding);
|
|
196 |
return $mapper->get_encode (%options);
|
|
197 |
}
|
|
198 |
|
|
199 |
sub get_mapper
|
|
200 |
{
|
|
201 |
my ($self, $encoding) = @_;
|
|
202 |
$self->{Mapper}->{$encoding} ||=
|
|
203 |
($encoding eq "UTF-8" ?
|
|
204 |
new XML::UM::SlowMapper::UTF8 (Encoding => $encoding,
|
|
205 |
Factory => $self) :
|
|
206 |
new XML::UM::SlowMapper (Encoding => $encoding,
|
|
207 |
Factory => $self));
|
|
208 |
}
|
|
209 |
|
|
210 |
#
|
|
211 |
# Prepare for garbage collection (remove circular refs)
|
|
212 |
#
|
|
213 |
sub dispose_encoding
|
|
214 |
{
|
|
215 |
my ($self, $encoding) = @_;
|
|
216 |
my $mapper = $self->{Mapper}->{$encoding};
|
|
217 |
return unless defined $mapper;
|
|
218 |
|
|
219 |
delete $mapper->{Factory};
|
|
220 |
delete $self->{Mapper}->{$encoding};
|
|
221 |
}
|
|
222 |
|
|
223 |
package XML::UM;
|
|
224 |
use Carp;
|
|
225 |
|
|
226 |
use vars qw{ $FACTORY %XML_MAPPING_CRITERIA };
|
|
227 |
$FACTORY = XML::UM::SlowMapperFactory->new;
|
|
228 |
|
|
229 |
sub get_encode # static
|
|
230 |
{
|
|
231 |
$FACTORY->get_encode (@_);
|
|
232 |
}
|
|
233 |
|
|
234 |
sub dispose_encoding # static
|
|
235 |
{
|
|
236 |
$FACTORY->dispose_encoding (@_);
|
|
237 |
}
|
|
238 |
|
|
239 |
# Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string
|
|
240 |
sub encode_unmapped_hex # static
|
|
241 |
{
|
|
242 |
my $n = utf8_to_unicode (shift);
|
|
243 |
sprintf ("&#x%X;", $n);
|
|
244 |
}
|
|
245 |
|
|
246 |
sub encode_unmapped_dec # static
|
|
247 |
{
|
|
248 |
my $n = utf8_to_unicode (shift);
|
|
249 |
"&#$n;"
|
|
250 |
}
|
|
251 |
|
|
252 |
# Converts a UTF-8 byte sequence that represents one character,
|
|
253 |
# to its Unicode index.
|
|
254 |
sub utf8_to_unicode # static
|
|
255 |
{
|
|
256 |
my $str = shift;
|
|
257 |
my $len = length ($str);
|
|
258 |
|
|
259 |
if ($len == 1)
|
|
260 |
{
|
|
261 |
return ord ($str);
|
|
262 |
}
|
|
263 |
if ($len == 2)
|
|
264 |
{
|
|
265 |
my @n = unpack "C2", $str;
|
|
266 |
return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
|
|
267 |
}
|
|
268 |
elsif ($len == 3)
|
|
269 |
{
|
|
270 |
my @n = unpack "C3", $str;
|
|
271 |
return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) +
|
|
272 |
($n[2] & 0x3f);
|
|
273 |
}
|
|
274 |
elsif ($len == 4)
|
|
275 |
{
|
|
276 |
my @n = unpack "C4", $str;
|
|
277 |
return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) +
|
|
278 |
(($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
|
|
279 |
}
|
|
280 |
else
|
|
281 |
{
|
|
282 |
croak "bad UTF8 sequence [$str] hex=" . hb($str);
|
|
283 |
}
|
|
284 |
}
|
|
285 |
|
|
286 |
# Converts a Unicode character index to the byte sequence
|
|
287 |
# that represents that character in UTF-8.
|
|
288 |
sub unicode_to_utf8 # static
|
|
289 |
{
|
|
290 |
my $n = shift;
|
|
291 |
if ($n < 0x80)
|
|
292 |
{
|
|
293 |
return chr ($n);
|
|
294 |
}
|
|
295 |
elsif ($n < 0x800)
|
|
296 |
{
|
|
297 |
return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80));
|
|
298 |
}
|
|
299 |
elsif ($n < 0x10000)
|
|
300 |
{
|
|
301 |
return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80),
|
|
302 |
(($n & 0x3f) | 0x80));
|
|
303 |
}
|
|
304 |
elsif ($n < 0x110000)
|
|
305 |
{
|
|
306 |
return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80),
|
|
307 |
((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80));
|
|
308 |
}
|
|
309 |
croak "number [$n] is too large for Unicode in \&unicode_to_utf8";
|
|
310 |
}
|
|
311 |
|
|
312 |
#?? The following package is unfinished.
|
|
313 |
#?? It should parse the .enc file and create an array that maps
|
|
314 |
#?? Unicode-index to encoded-str. I got stuck...
|
|
315 |
|
|
316 |
# package XML::UM::EncParser;
|
|
317 |
#
|
|
318 |
# sub new
|
|
319 |
# {
|
|
320 |
# my ($class, %hash) = @_;
|
|
321 |
# my $self = bless \%hash, $class;
|
|
322 |
# $self;
|
|
323 |
# }
|
|
324 |
#
|
|
325 |
# sub parse
|
|
326 |
# {
|
|
327 |
# my ($self, $filename) = @_;
|
|
328 |
# open (FILE, $filename) || die "can't open .enc file $filename";
|
|
329 |
# binmode (FILE);
|
|
330 |
#
|
|
331 |
# my $buf;
|
|
332 |
# read (FILE, $buf, 4 + 40 + 2 + 2 + 1024);
|
|
333 |
#
|
|
334 |
# my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf);
|
|
335 |
# printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic;
|
|
336 |
#
|
|
337 |
# if ($magic != 0xFEEBFACE)
|
|
338 |
# {
|
|
339 |
# close FILE;
|
|
340 |
# die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic);
|
|
341 |
# }
|
|
342 |
#
|
|
343 |
# for (my $i = 0; $i < 256; $i++)
|
|
344 |
# {
|
|
345 |
# printf "[%d]=%d ", $i, $map[$i];
|
|
346 |
# print "\n" if ($i % 8 == 7);
|
|
347 |
# }
|
|
348 |
#
|
|
349 |
# for (my $i = 0; $i < $pfsize; $i++)
|
|
350 |
# {
|
|
351 |
# print "----- PrefixMap $i ----\n";
|
|
352 |
# read (FILE, $buf, 2 + 2 + 32 + 32);
|
|
353 |
# my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf);
|
|
354 |
# my (@ischar) = splice @ispfx, 32, 32, ();
|
|
355 |
# #?? could use b256 instead of C32 for bitvector a la vec()
|
|
356 |
#
|
|
357 |
# print "ispfx=@ispfx\n";
|
|
358 |
# print "ischar=@ischar\n";
|
|
359 |
# $len = 256 if $len == 0;
|
|
360 |
#
|
|
361 |
# print " min=$min len=$len bmap_start=$bmap_start\n";
|
|
362 |
# }
|
|
363 |
#
|
|
364 |
# close FILE;
|
|
365 |
# }
|
|
366 |
|
|
367 |
1; # package return code
|
|
368 |
|
|
369 |
__END__
|
|
370 |
|
|
371 |
=head1 NAME
|
|
372 |
|
|
373 |
XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding
|
|
374 |
|
|
375 |
=head1 SYNOPSIS
|
|
376 |
|
|
377 |
use XML::UM;
|
|
378 |
|
|
379 |
# Set directory with .xml files that comes with XML::Encoding distribution
|
|
380 |
# Always include the trailing slash!
|
|
381 |
$XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/';
|
|
382 |
|
|
383 |
# Create the encoding routine
|
|
384 |
my $encode = XML::UM::get_encode (
|
|
385 |
Encoding => 'ISO-8859-2',
|
|
386 |
EncodeUnmapped => \&XML::UM::encode_unmapped_dec);
|
|
387 |
|
|
388 |
# Convert a string from UTF-8 to the specified Encoding
|
|
389 |
my $encoded_str = $encode->($utf8_str);
|
|
390 |
|
|
391 |
# Remove circular references for garbage collection
|
|
392 |
XML::UM::dispose_encoding ('ISO-8859-2');
|
|
393 |
|
|
394 |
=head1 DESCRIPTION
|
|
395 |
|
|
396 |
This module provides methods to convert UTF-8 strings to any XML encoding
|
|
397 |
that L<XML::Encoding> supports. It creates mapping routines from the .xml
|
|
398 |
files that can be found in the maps/ directory in the L<XML::Encoding>
|
|
399 |
distribution. Note that the XML::Encoding distribution does install the
|
|
400 |
.enc files in your perl directory, but not the.xml files they were created
|
|
401 |
from. That's why you have to specify $ENCDIR as in the SYNOPSIS.
|
|
402 |
|
|
403 |
This implementation uses the XML::Encoding class to parse the .xml
|
|
404 |
file and creates a hash that maps UTF-8 characters (each consisting of up
|
|
405 |
to 4 bytes) to their equivalent byte sequence in the specified encoding.
|
|
406 |
Note that large mappings may consume a lot of memory!
|
|
407 |
|
|
408 |
Future implementations may parse the .enc files directly, or
|
|
409 |
do the conversions entirely in XS (i.e. C code.)
|
|
410 |
|
|
411 |
=head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB)
|
|
412 |
|
|
413 |
The central entry point to this module is the XML::UM::get_encode() method.
|
|
414 |
It forwards the call to the global $XML::UM::FACTORY, which is defined as
|
|
415 |
an instance of XML::UM::SlowMapperFactory by default. Override this variable
|
|
416 |
to plug in your own mapper factory.
|
|
417 |
|
|
418 |
The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper
|
|
419 |
(and caches it for subsequent use) that reads in the .xml encoding file and
|
|
420 |
creates a hash that maps UTF-8 characters to encoded characters.
|
|
421 |
|
|
422 |
The get_encode() method of XML::UM::SlowMapper is called, finally, which
|
|
423 |
generates an anonimous subroutine that uses the hash to convert
|
|
424 |
multi-character UTF-8 blocks to the proper encoding.
|
|
425 |
|
|
426 |
=head1 dispose_encoding ($encoding_name)
|
|
427 |
|
|
428 |
Call this to free the memory used by the SlowMapper for a specific encoding.
|
|
429 |
Note that in order to free the big conversion hash, the user should no longer
|
|
430 |
have references to the subroutines generated by get_encode().
|
|
431 |
|
|
432 |
The parameters to the get_encode() method (defined as name/value pairs) are:
|
|
433 |
|
|
434 |
=over 4
|
|
435 |
|
|
436 |
=item * Encoding
|
|
437 |
|
|
438 |
The name of the desired encoding, e.g. 'ISO-8859-2'
|
|
439 |
|
|
440 |
=item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec)
|
|
441 |
|
|
442 |
Defines how Unicode characters not found in the mapping file (of the
|
|
443 |
specified encoding) are printed.
|
|
444 |
By default, they are converted to decimal entity references, like '{'
|
|
445 |
|
|
446 |
Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '«'
|
|
447 |
|
|
448 |
=back
|
|
449 |
|
|
450 |
=head1 CAVEATS
|
|
451 |
|
|
452 |
I'm not exactly sure about which Unicode characters in the range (0 .. 127)
|
|
453 |
should be mapped to themselves. See comments in XML/UM.pm near
|
|
454 |
%DEFAULT_ASCII_MAPPINGS.
|
|
455 |
|
|
456 |
The encodings that expat supports by default are currently not supported,
|
|
457 |
(e.g. UTF-16, ISO-8859-1),
|
|
458 |
because there are no .enc files available for these encodings.
|
|
459 |
This module needs some more work. If you have the time, please help!
|
|
460 |
|
|
461 |
=head1 AUTHOR
|
|
462 |
|
|
463 |
Send bug reports, hints, tips, suggestions to Enno Derksen at
|
|
464 |
<F<enno@att.com>>.
|
|
465 |
|
|
466 |
=cut
|