|
1 # |
|
2 # TO DO: |
|
3 # |
|
4 # - Implement SlowMappers for expat builtin encodings (for which there |
|
5 # are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1. |
|
6 # - Instead of parsing the .xml file with XML::Encoding, we should use XS. |
|
7 # If this will not be implemented for a while, we could try reading the |
|
8 # .enc file directly, instead of the .xml file. |
|
9 # I started writing XML::UM::EncParser to do this (see EOF), but got stuck. |
|
10 # |
|
11 |
|
12 use strict; |
|
13 |
|
14 package XML::UM::SlowMapper; |
|
15 use Carp; |
|
16 use XML::Encoding; |
|
17 |
|
18 use vars qw{ $ENCDIR %DEFAULT_ASCII_MAPPINGS }; |
|
19 |
|
20 my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...'; |
|
21 |
|
22 # |
|
23 # The directory that contains the .xml files that come with XML::Encoding. |
|
24 # Include the terminating '\' or '/' !! |
|
25 # |
|
26 $ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/"; |
|
27 #$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\"; |
|
28 |
|
29 # |
|
30 # From xmlparse.h in expat distribution: |
|
31 # |
|
32 # Expat places certain restrictions on the encodings that are supported |
|
33 # using this mechanism. |
|
34 # |
|
35 # 1. Every ASCII character that can appear in a well-formed XML document, |
|
36 # other than the characters |
|
37 # |
|
38 # $@\^`{}~ |
|
39 # |
|
40 # must be represented by a single byte, and that byte must be the |
|
41 # same byte that represents that character in ASCII. |
|
42 # |
|
43 # [end of excerpt] |
|
44 |
|
45 #?? Which 'ASCII characters can appear in a well-formed XML document ?? |
|
46 |
|
47 # All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~ |
|
48 %DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) } (0 .. 35, 37 .. 63, |
|
49 65 .. 91, 93, 95, |
|
50 97 .. 122, 124, 127); |
|
51 |
|
52 sub new |
|
53 { |
|
54 my ($class, %hash) = @_; |
|
55 my $self = bless \%hash, $class; |
|
56 |
|
57 $self->read_encoding_file; |
|
58 |
|
59 $self; |
|
60 } |
|
61 |
|
62 sub dispose |
|
63 { |
|
64 my $self = shift; |
|
65 $self->{Factory}->dispose_mapper ($self); |
|
66 delete $self->{Encode}; |
|
67 } |
|
68 |
|
69 # Reads the XML file that contains the encoding definition. |
|
70 # These files come with XML::Encoding. |
|
71 sub read_encoding_file |
|
72 { |
|
73 #?? This should parse the .enc files (the .xml files are not installed) !! |
|
74 |
|
75 my ($self) = @_; |
|
76 my $encoding = $self->{Encoding}; |
|
77 |
|
78 # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple |
|
79 # so here it goes... |
|
80 if ($encoding eq 'US-ASCII') |
|
81 { |
|
82 $self->{EncMapName} = 'US-ASCII'; |
|
83 $self->{Map} = \%DEFAULT_ASCII_MAPPINGS; # I hope this is right |
|
84 return; |
|
85 } |
|
86 |
|
87 my $file = $self->find_encoding_file ($encoding); |
|
88 |
|
89 my %uni = %DEFAULT_ASCII_MAPPINGS; |
|
90 my $prefix = ""; |
|
91 my $DIR = "file:$ENCDIR"; |
|
92 |
|
93 my $enc = new XML::Encoding (Handlers => { |
|
94 Init => |
|
95 sub { |
|
96 my $base = shift->base ($DIR); |
|
97 } |
|
98 }, |
|
99 |
|
100 PushPrefixFcn => |
|
101 sub { |
|
102 $prefix .= chr (shift); |
|
103 undef; |
|
104 }, |
|
105 |
|
106 PopPrefixFcn => |
|
107 sub { |
|
108 chop $prefix; |
|
109 undef; |
|
110 }, |
|
111 |
|
112 RangeSetFcn => |
|
113 sub { |
|
114 my ($byte, $uni, $len) = @_; |
|
115 for (my $i = $uni; $len--; $uni++) |
|
116 { |
|
117 $uni{XML::UM::unicode_to_utf8($uni)} = $prefix . chr ($byte++); |
|
118 } |
|
119 undef; |
|
120 }); |
|
121 |
|
122 $self->{EncMapName} = $enc->parsefile ($file); |
|
123 |
|
124 #print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n"; |
|
125 |
|
126 $self->{Map} = \%uni; |
|
127 } |
|
128 |
|
129 sub find_encoding_file |
|
130 { |
|
131 my ($self, $enc) = @_; |
|
132 |
|
133 return "$ENCDIR\L$enc\E.xml"; # .xml filename is lower case |
|
134 } |
|
135 |
|
136 # Returns a closure (method) that converts a UTF-8 encoded string to an |
|
137 # encoded byte sequence. |
|
138 sub get_encode |
|
139 { |
|
140 my ($self, %hash) = @_; |
|
141 my $MAP = $self->{Map}; |
|
142 my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec; |
|
143 |
|
144 my $code = "sub {\n my \$str = shift;\n \$str =~ s/"; |
|
145 |
|
146 $code .= "($UTFCHAR)/\n"; |
|
147 $code .= "defined \$MAP->{\$1} ? \$MAP->{\$1} : "; |
|
148 $code .= "\&\$ENCODE_UNMAPPED(\$1) /egs;\n"; |
|
149 |
|
150 $code .= "\$str }\n"; |
|
151 # print $code; |
|
152 |
|
153 my $func = eval $code; |
|
154 croak "could not eval generated code=[$code]: $@" if $@; |
|
155 |
|
156 $func; |
|
157 } |
|
158 |
|
159 # |
|
160 # Optimized version for when the encoding is UTF-8. |
|
161 # (In that case no conversion takes place.) |
|
162 # |
|
163 package XML::UM::SlowMapper::UTF8; |
|
164 use vars qw{ @ISA }; |
|
165 @ISA = qw{ XML::UM::SlowMapper }; |
|
166 |
|
167 sub read_encoding_file |
|
168 { |
|
169 # ignore it |
|
170 } |
|
171 |
|
172 sub get_encode |
|
173 { |
|
174 \&dont_convert; |
|
175 } |
|
176 |
|
177 sub dont_convert # static |
|
178 { |
|
179 shift # return argument unchanged |
|
180 } |
|
181 |
|
182 package XML::UM::SlowMapperFactory; |
|
183 |
|
184 sub new |
|
185 { |
|
186 my ($class, %hash) = @_; |
|
187 bless \%hash, $class; |
|
188 } |
|
189 |
|
190 sub get_encode |
|
191 { |
|
192 my ($self, %options) = @_; |
|
193 my $encoding = $options{Encoding}; |
|
194 |
|
195 my $mapper = $self->get_mapper ($encoding); |
|
196 return $mapper->get_encode (%options); |
|
197 } |
|
198 |
|
199 sub get_mapper |
|
200 { |
|
201 my ($self, $encoding) = @_; |
|
202 $self->{Mapper}->{$encoding} ||= |
|
203 ($encoding eq "UTF-8" ? |
|
204 new XML::UM::SlowMapper::UTF8 (Encoding => $encoding, |
|
205 Factory => $self) : |
|
206 new XML::UM::SlowMapper (Encoding => $encoding, |
|
207 Factory => $self)); |
|
208 } |
|
209 |
|
210 # |
|
211 # Prepare for garbage collection (remove circular refs) |
|
212 # |
|
213 sub dispose_encoding |
|
214 { |
|
215 my ($self, $encoding) = @_; |
|
216 my $mapper = $self->{Mapper}->{$encoding}; |
|
217 return unless defined $mapper; |
|
218 |
|
219 delete $mapper->{Factory}; |
|
220 delete $self->{Mapper}->{$encoding}; |
|
221 } |
|
222 |
|
223 package XML::UM; |
|
224 use Carp; |
|
225 |
|
226 use vars qw{ $FACTORY %XML_MAPPING_CRITERIA }; |
|
227 $FACTORY = XML::UM::SlowMapperFactory->new; |
|
228 |
|
229 sub get_encode # static |
|
230 { |
|
231 $FACTORY->get_encode (@_); |
|
232 } |
|
233 |
|
234 sub dispose_encoding # static |
|
235 { |
|
236 $FACTORY->dispose_encoding (@_); |
|
237 } |
|
238 |
|
239 # Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string |
|
240 sub encode_unmapped_hex # static |
|
241 { |
|
242 my $n = utf8_to_unicode (shift); |
|
243 sprintf ("&#x%X;", $n); |
|
244 } |
|
245 |
|
246 sub encode_unmapped_dec # static |
|
247 { |
|
248 my $n = utf8_to_unicode (shift); |
|
249 "&#$n;" |
|
250 } |
|
251 |
|
252 # Converts a UTF-8 byte sequence that represents one character, |
|
253 # to its Unicode index. |
|
254 sub utf8_to_unicode # static |
|
255 { |
|
256 my $str = shift; |
|
257 my $len = length ($str); |
|
258 |
|
259 if ($len == 1) |
|
260 { |
|
261 return ord ($str); |
|
262 } |
|
263 if ($len == 2) |
|
264 { |
|
265 my @n = unpack "C2", $str; |
|
266 return (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); |
|
267 } |
|
268 elsif ($len == 3) |
|
269 { |
|
270 my @n = unpack "C3", $str; |
|
271 return (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + |
|
272 ($n[2] & 0x3f); |
|
273 } |
|
274 elsif ($len == 4) |
|
275 { |
|
276 my @n = unpack "C4", $str; |
|
277 return (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) + |
|
278 (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); |
|
279 } |
|
280 else |
|
281 { |
|
282 croak "bad UTF8 sequence [$str] hex=" . hb($str); |
|
283 } |
|
284 } |
|
285 |
|
286 # Converts a Unicode character index to the byte sequence |
|
287 # that represents that character in UTF-8. |
|
288 sub unicode_to_utf8 # static |
|
289 { |
|
290 my $n = shift; |
|
291 if ($n < 0x80) |
|
292 { |
|
293 return chr ($n); |
|
294 } |
|
295 elsif ($n < 0x800) |
|
296 { |
|
297 return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80)); |
|
298 } |
|
299 elsif ($n < 0x10000) |
|
300 { |
|
301 return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80), |
|
302 (($n & 0x3f) | 0x80)); |
|
303 } |
|
304 elsif ($n < 0x110000) |
|
305 { |
|
306 return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80), |
|
307 ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80)); |
|
308 } |
|
309 croak "number [$n] is too large for Unicode in \&unicode_to_utf8"; |
|
310 } |
|
311 |
|
312 #?? The following package is unfinished. |
|
313 #?? It should parse the .enc file and create an array that maps |
|
314 #?? Unicode-index to encoded-str. I got stuck... |
|
315 |
|
316 # package XML::UM::EncParser; |
|
317 # |
|
318 # sub new |
|
319 # { |
|
320 # my ($class, %hash) = @_; |
|
321 # my $self = bless \%hash, $class; |
|
322 # $self; |
|
323 # } |
|
324 # |
|
325 # sub parse |
|
326 # { |
|
327 # my ($self, $filename) = @_; |
|
328 # open (FILE, $filename) || die "can't open .enc file $filename"; |
|
329 # binmode (FILE); |
|
330 # |
|
331 # my $buf; |
|
332 # read (FILE, $buf, 4 + 40 + 2 + 2 + 1024); |
|
333 # |
|
334 # my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf); |
|
335 # printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic; |
|
336 # |
|
337 # if ($magic != 0xFEEBFACE) |
|
338 # { |
|
339 # close FILE; |
|
340 # die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic); |
|
341 # } |
|
342 # |
|
343 # for (my $i = 0; $i < 256; $i++) |
|
344 # { |
|
345 # printf "[%d]=%d ", $i, $map[$i]; |
|
346 # print "\n" if ($i % 8 == 7); |
|
347 # } |
|
348 # |
|
349 # for (my $i = 0; $i < $pfsize; $i++) |
|
350 # { |
|
351 # print "----- PrefixMap $i ----\n"; |
|
352 # read (FILE, $buf, 2 + 2 + 32 + 32); |
|
353 # my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf); |
|
354 # my (@ischar) = splice @ispfx, 32, 32, (); |
|
355 # #?? could use b256 instead of C32 for bitvector a la vec() |
|
356 # |
|
357 # print "ispfx=@ispfx\n"; |
|
358 # print "ischar=@ischar\n"; |
|
359 # $len = 256 if $len == 0; |
|
360 # |
|
361 # print " min=$min len=$len bmap_start=$bmap_start\n"; |
|
362 # } |
|
363 # |
|
364 # close FILE; |
|
365 # } |
|
366 |
|
367 1; # package return code |
|
368 |
|
369 __END__ |
|
370 |
|
371 =head1 NAME |
|
372 |
|
373 XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding |
|
374 |
|
375 =head1 SYNOPSIS |
|
376 |
|
377 use XML::UM; |
|
378 |
|
379 # Set directory with .xml files that comes with XML::Encoding distribution |
|
380 # Always include the trailing slash! |
|
381 $XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/'; |
|
382 |
|
383 # Create the encoding routine |
|
384 my $encode = XML::UM::get_encode ( |
|
385 Encoding => 'ISO-8859-2', |
|
386 EncodeUnmapped => \&XML::UM::encode_unmapped_dec); |
|
387 |
|
388 # Convert a string from UTF-8 to the specified Encoding |
|
389 my $encoded_str = $encode->($utf8_str); |
|
390 |
|
391 # Remove circular references for garbage collection |
|
392 XML::UM::dispose_encoding ('ISO-8859-2'); |
|
393 |
|
394 =head1 DESCRIPTION |
|
395 |
|
396 This module provides methods to convert UTF-8 strings to any XML encoding |
|
397 that L<XML::Encoding> supports. It creates mapping routines from the .xml |
|
398 files that can be found in the maps/ directory in the L<XML::Encoding> |
|
399 distribution. Note that the XML::Encoding distribution does install the |
|
400 .enc files in your perl directory, but not the.xml files they were created |
|
401 from. That's why you have to specify $ENCDIR as in the SYNOPSIS. |
|
402 |
|
403 This implementation uses the XML::Encoding class to parse the .xml |
|
404 file and creates a hash that maps UTF-8 characters (each consisting of up |
|
405 to 4 bytes) to their equivalent byte sequence in the specified encoding. |
|
406 Note that large mappings may consume a lot of memory! |
|
407 |
|
408 Future implementations may parse the .enc files directly, or |
|
409 do the conversions entirely in XS (i.e. C code.) |
|
410 |
|
411 =head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB) |
|
412 |
|
413 The central entry point to this module is the XML::UM::get_encode() method. |
|
414 It forwards the call to the global $XML::UM::FACTORY, which is defined as |
|
415 an instance of XML::UM::SlowMapperFactory by default. Override this variable |
|
416 to plug in your own mapper factory. |
|
417 |
|
418 The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper |
|
419 (and caches it for subsequent use) that reads in the .xml encoding file and |
|
420 creates a hash that maps UTF-8 characters to encoded characters. |
|
421 |
|
422 The get_encode() method of XML::UM::SlowMapper is called, finally, which |
|
423 generates an anonimous subroutine that uses the hash to convert |
|
424 multi-character UTF-8 blocks to the proper encoding. |
|
425 |
|
426 =head1 dispose_encoding ($encoding_name) |
|
427 |
|
428 Call this to free the memory used by the SlowMapper for a specific encoding. |
|
429 Note that in order to free the big conversion hash, the user should no longer |
|
430 have references to the subroutines generated by get_encode(). |
|
431 |
|
432 The parameters to the get_encode() method (defined as name/value pairs) are: |
|
433 |
|
434 =over 4 |
|
435 |
|
436 =item * Encoding |
|
437 |
|
438 The name of the desired encoding, e.g. 'ISO-8859-2' |
|
439 |
|
440 =item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec) |
|
441 |
|
442 Defines how Unicode characters not found in the mapping file (of the |
|
443 specified encoding) are printed. |
|
444 By default, they are converted to decimal entity references, like '{' |
|
445 |
|
446 Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '«' |
|
447 |
|
448 =back |
|
449 |
|
450 =head1 CAVEATS |
|
451 |
|
452 I'm not exactly sure about which Unicode characters in the range (0 .. 127) |
|
453 should be mapped to themselves. See comments in XML/UM.pm near |
|
454 %DEFAULT_ASCII_MAPPINGS. |
|
455 |
|
456 The encodings that expat supports by default are currently not supported, |
|
457 (e.g. UTF-16, ISO-8859-1), |
|
458 because there are no .enc files available for these encodings. |
|
459 This module needs some more work. If you have the time, please help! |
|
460 |
|
461 =head1 AUTHOR |
|
462 |
|
463 Send bug reports, hints, tips, suggestions to Enno Derksen at |
|
464 <F<enno@att.com>>. |
|
465 |
|
466 =cut |