|
1 # |
|
2 # Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 # All rights reserved. |
|
4 # This component and the accompanying materials are made available |
|
5 # under the terms of "Eclipse Public License v1.0" |
|
6 # which accompanies this distribution, and is available |
|
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 # |
|
9 # Initial Contributors: |
|
10 # Nokia Corporation - initial contribution. |
|
11 # |
|
12 # Contributors: |
|
13 # |
|
14 # Description: |
|
15 # |
|
16 |
|
17 use strict; |
|
18 use integer; |
|
19 |
|
20 sub PerlScriptPath |
|
21 { |
|
22 my $perlScriptPath=$0; |
|
23 my $os = $^O; #get the OS type |
|
24 #check OS type |
|
25 if($os=~/MSWin32/) #Windows OS |
|
26 { |
|
27 $perlScriptPath=~s/\//\\/g; # replace any forward-slashes with back-slashes |
|
28 $perlScriptPath=~s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name |
|
29 } |
|
30 else #Unix OS |
|
31 { |
|
32 $perlScriptPath=~s/\\/\//g; # replace any back-slashes with forward-slashes |
|
33 $perlScriptPath=~s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name |
|
34 } |
|
35 return $perlScriptPath; |
|
36 } |
|
37 BEGIN |
|
38 { |
|
39 unshift(@INC, &PerlScriptPath()); # can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names |
|
40 } |
|
41 use PARSER; |
|
42 use UTF; |
|
43 |
|
44 # The following numbers are used for byte-orders: |
|
45 # 0 means unspecified |
|
46 # 1 means big-endian |
|
47 # 2 means little-endian |
|
48 |
|
49 FixParametersToWorkWithWindows98(\@ARGV); |
|
50 my $versionNumber = 3; |
|
51 my $outputByteOrderMark = 0; |
|
52 my $unicodeByteOrder = 0; |
|
53 my $inputEncoding = ""; |
|
54 my $outputEncoding = ""; |
|
55 my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key |
|
56 my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key |
|
57 |
|
58 |
|
59 my $inputFile=\*STDIN; |
|
60 my $outputFile=\*STDOUT; |
|
61 ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile); |
|
62 HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile); |
|
63 DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters); |
|
64 if ($inputFile!=\*STDIN) |
|
65 { |
|
66 close($inputFile) or die; |
|
67 } |
|
68 if ($outputFile!=\*STDOUT) |
|
69 { |
|
70 close($outputFile) or die; |
|
71 } |
|
72 |
|
73 sub FixParametersToWorkWithWindows98 |
|
74 { |
|
75 my $parameters=shift; |
|
76 my $i; |
|
77 for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters |
|
78 { |
|
79 if (($parameters->[$i]=~/^(-input)$/i) || |
|
80 ($parameters->[$i]=~/^(-output)$/i)) |
|
81 { |
|
82 $parameters->[$i].='='.$parameters->[$i+1]; |
|
83 splice(@$parameters, $i+1, 1); |
|
84 } |
|
85 } |
|
86 } |
|
87 |
|
88 sub PrintUsage |
|
89 { |
|
90 print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n"; |
|
91 print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t"; |
|
92 print "options := [-big|-little][-byteordermark]\n\t"; |
|
93 print "inputspec := -input=<format> [<input_file>]\n\t"; |
|
94 print "outputspec := -output=<format> [<output_file>]\n\t"; |
|
95 print "format := unicode|utf8|big5|gb2312...\n\n"; |
|
96 } |
|
97 |
|
98 sub Assert |
|
99 { |
|
100 my $condition = shift; |
|
101 my $errorMessage = shift; |
|
102 if (!($condition)) # find out where this is used and work this out |
|
103 { |
|
104 die("Error: $errorMessage"); |
|
105 } |
|
106 } |
|
107 |
|
108 sub PrintWarning |
|
109 { |
|
110 my $warningMessage = shift; |
|
111 print STDERR "Warning: $warningMessage\n"; |
|
112 } |
|
113 |
|
114 |
|
115 sub TryFileParameter |
|
116 { |
|
117 my $args = shift; |
|
118 my $argindex = shift; |
|
119 my $inputoroutput = shift; |
|
120 my $encoding = shift; |
|
121 my $filehandle = shift; |
|
122 my $prefix = "-$inputoroutput="; |
|
123 |
|
124 if ($args->[$$argindex] =~ /^$prefix(.*)/) |
|
125 { |
|
126 Assert($$encoding eq "", "\"$prefix...\" is specified more than once"); |
|
127 $$encoding = $1; |
|
128 ++$$argindex; |
|
129 if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/)) |
|
130 { |
|
131 --$$argindex; |
|
132 } |
|
133 else |
|
134 { |
|
135 if ($inputoroutput =~ /input/i) |
|
136 { |
|
137 open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!"; |
|
138 $$filehandle=\*INPUT_FILE; |
|
139 } |
|
140 else |
|
141 { |
|
142 open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!"; |
|
143 $$filehandle=\*OUTPUT_FILE; |
|
144 } |
|
145 } |
|
146 binmode $$filehandle; |
|
147 return 1; |
|
148 } |
|
149 return 0; |
|
150 } |
|
151 |
|
152 sub ReadParameters |
|
153 { |
|
154 my $args = shift; |
|
155 my $outputbyteordermark = shift; |
|
156 my $unicodebyteorder = shift; |
|
157 my $inputencoding = shift; |
|
158 my $outputencoding = shift; |
|
159 my $inputhandle = shift; |
|
160 my $outputhandle = shift; |
|
161 my $i; |
|
162 my $range; |
|
163 if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?")) |
|
164 { |
|
165 PrintUsage(); |
|
166 exit; |
|
167 } |
|
168 |
|
169 for ($i = 0; $i < @$args ; ++$i) |
|
170 { |
|
171 if ( $args->[$i]=~ /-byteordermark/i) |
|
172 { |
|
173 Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once"); |
|
174 $$outputbyteordermark = 1; |
|
175 } |
|
176 elsif ($args->[$i]=~ /-big/i) |
|
177 { |
|
178 Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once"); |
|
179 $$unicodebyteorder = 1; |
|
180 } |
|
181 elsif ($args->[$i]=~ /-little/i) |
|
182 { |
|
183 Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once"); |
|
184 $$unicodebyteorder = 2; |
|
185 } |
|
186 else |
|
187 { |
|
188 Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) || |
|
189 TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\""); |
|
190 } |
|
191 } |
|
192 Assert($$inputencoding ne "", "no input encoding is specified"); |
|
193 Assert($$outputencoding ne "", "no output encoding is specified"); |
|
194 } |
|
195 |
|
196 sub ReadFromFile |
|
197 { |
|
198 my $buffer = shift; |
|
199 my $numOfBytesToRead = shift; |
|
200 my $inputhandle = shift; |
|
201 my $numOfBytesRead = 0; |
|
202 my $numOfBytesToReadThisTime = $numOfBytesToRead; |
|
203 |
|
204 for(;;) |
|
205 { |
|
206 for(;;) |
|
207 { |
|
208 my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead; |
|
209 if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead) |
|
210 { |
|
211 $numOfBytesToReadThisTime = $remainingNumOfBytesToRead; |
|
212 } |
|
213 my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime; |
|
214 if (defined $numOfBytesReadThisTime) |
|
215 { |
|
216 $numOfBytesRead += $numOfBytesReadThisTime; |
|
217 Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)"); |
|
218 if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0) |
|
219 { |
|
220 return; |
|
221 } |
|
222 last; |
|
223 } |
|
224 $numOfBytesToReadThisTime /= 2; |
|
225 Assert($numOfBytesToReadThisTime >0, "reading from file failed"); |
|
226 } |
|
227 } |
|
228 } |
|
229 |
|
230 sub HandleByteOrderMarks |
|
231 { |
|
232 my $outputbyteordermark = shift; |
|
233 my $unicodebyteorder = shift; |
|
234 my $inputencoding = shift; |
|
235 my $outputencoding = shift; |
|
236 my $inputhandle = shift; |
|
237 my $outputhandle = shift; |
|
238 |
|
239 if ($$inputencoding =~ /unicode/i) |
|
240 { |
|
241 my $firstUnicodeCharacter = 0; |
|
242 ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle); |
|
243 my $byteOrderSpecifiedByByteOrderMark = 0; |
|
244 if (length($firstUnicodeCharacter) == 2) |
|
245 { |
|
246 my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter; |
|
247 if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe)) |
|
248 { |
|
249 $byteOrderSpecifiedByByteOrderMark = 2; |
|
250 } |
|
251 elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff)) |
|
252 { |
|
253 $byteOrderSpecifiedByByteOrderMark = 1; |
|
254 } |
|
255 else |
|
256 { |
|
257 my $error = seek $inputhandle, 0, 0; # rewind to start of file |
|
258 Assert ($error == 1, "could not rewind to the start of input file"); |
|
259 } |
|
260 } |
|
261 if ($byteOrderSpecifiedByByteOrderMark!=0) |
|
262 { |
|
263 if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder)) |
|
264 { |
|
265 PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input"); |
|
266 } |
|
267 $$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark; |
|
268 } |
|
269 } |
|
270 if ($outputbyteordermark) |
|
271 { |
|
272 if ($$outputencoding ne "unicode") |
|
273 { |
|
274 PrintWarning("\"-byteordermark\" is only relevant for unicode output"); |
|
275 } |
|
276 else |
|
277 { |
|
278 Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output"); |
|
279 my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe"; |
|
280 WriteToFile(\$firstUnicodeCharacter, $outputhandle); |
|
281 } |
|
282 } |
|
283 } |
|
284 |
|
285 sub WriteToFile |
|
286 { |
|
287 my $buffer = shift; |
|
288 my $outputhandle = shift; |
|
289 |
|
290 print $outputhandle $$buffer; |
|
291 } |
|
292 |
|
293 sub DoConversion |
|
294 { |
|
295 my $unicodebyteorder = shift; |
|
296 my $inputencoding = shift; |
|
297 my $outputencoding = shift; |
|
298 my $inputhandle = shift; |
|
299 my $outputhandle = shift; |
|
300 my $foreignCharacters = shift; |
|
301 my $unicodeCharacters = shift; |
|
302 |
|
303 my $currentBuffer = 0; |
|
304 my @arrayOfBuffers = ('', '', ''); |
|
305 my $largeNumber=1000000; |
|
306 ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle); |
|
307 ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer])); |
|
308 if ($$inputencoding ne $$outputencoding) |
|
309 { |
|
310 if ($$inputencoding !~ /^unicode$/i) |
|
311 { |
|
312 my $nextBuffer = $currentBuffer + 1; |
|
313 OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v'); |
|
314 $currentBuffer = $nextBuffer; |
|
315 } |
|
316 if ($$outputencoding !~ /^unicode$/i) |
|
317 { |
|
318 my $nextBuffer = $currentBuffer + 1; |
|
319 UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v'); |
|
320 $currentBuffer = $nextBuffer; |
|
321 } |
|
322 } |
|
323 ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer])); |
|
324 WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle); |
|
325 } |
|
326 |
|
327 sub ReverseByteOrderIfUnicodeAndBigEndian |
|
328 { |
|
329 my $unicodebyteorder = shift; |
|
330 my $encoding = shift; |
|
331 my $buffer = shift; |
|
332 my $i; |
|
333 |
|
334 if ($$encoding =~ /^unicode$/i) |
|
335 { |
|
336 Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)"); |
|
337 if ($$unicodebyteorder==0) |
|
338 { |
|
339 PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian"); |
|
340 $$unicodebyteorder = 2; |
|
341 } |
|
342 if ($$unicodebyteorder==1) |
|
343 { |
|
344 $$buffer=pack('v*', unpack('n*', $$buffer)); |
|
345 } |
|
346 } |
|
347 } |
|
348 |
|
349 sub FillInHashes |
|
350 { |
|
351 my $foreignCharacters = shift; |
|
352 my $unicodeCharacters = shift; |
|
353 my $encoding = shift; |
|
354 my $replacementCharacter = shift; |
|
355 my $ranges = shift; |
|
356 my $bigEndian = shift; |
|
357 |
|
358 my $endianness = 0; |
|
359 my $replacenum = 0; |
|
360 my $rangenum = 0; |
|
361 my $fileread = 0; |
|
362 my $largenumber = 1000000; |
|
363 |
|
364 my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat'; |
|
365 |
|
366 my $line; |
|
367 |
|
368 if (-e $dataFile) |
|
369 { |
|
370 open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading"); |
|
371 |
|
372 binmode HASH_INPUT; |
|
373 # reading the endianness |
|
374 $fileread = read HASH_INPUT, $endianness, 1; |
|
375 $endianness = unpack "C",$endianness; |
|
376 if ($endianness == 0) |
|
377 { |
|
378 # set the template to a default-> n for the eman time |
|
379 $$bigEndian = 0; |
|
380 } |
|
381 elsif ($endianness == 1) |
|
382 { |
|
383 $$bigEndian = 0; |
|
384 } |
|
385 elsif ($endianness == 2) |
|
386 { |
|
387 $$bigEndian = 1; |
|
388 } |
|
389 else |
|
390 { |
|
391 print "Illegal Endianness specified in the control files"; |
|
392 } |
|
393 #reading the replacement characters |
|
394 $fileread = read HASH_INPUT, $replacenum,1; |
|
395 $replacenum= unpack "C",$replacenum; |
|
396 $fileread = read HASH_INPUT, $$replacementCharacter,$replacenum; |
|
397 # reading the ranges |
|
398 $fileread = read HASH_INPUT, $rangenum, 1; |
|
399 $rangenum = unpack "C",$rangenum; |
|
400 my $i; # loop variable |
|
401 for ($i=0; $i < $rangenum; ++$i) |
|
402 { |
|
403 my $lowerrange = 0; |
|
404 my $upperrange = 0; |
|
405 my $followchar = 0; |
|
406 |
|
407 $fileread = read HASH_INPUT,$lowerrange,1; |
|
408 $lowerrange = unpack "C",$lowerrange; |
|
409 $fileread = read HASH_INPUT,$upperrange,1; |
|
410 $upperrange = unpack "C",$upperrange; |
|
411 $fileread = read HASH_INPUT,$followchar,1; |
|
412 $followchar = unpack "C",$followchar; |
|
413 |
|
414 push @$ranges,[$lowerrange,$upperrange,$followchar]; |
|
415 } |
|
416 my $data = 0; |
|
417 my @unpackeddata = 0; |
|
418 $fileread = read HASH_INPUT, $data, $largenumber; |
|
419 @unpackeddata = unpack "v*",$data; |
|
420 for($i = 0; $i <= $#unpackeddata; $i= $i+2) |
|
421 { |
|
422 $unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1]; |
|
423 $foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i]; |
|
424 } |
|
425 } |
|
426 else |
|
427 { |
|
428 die ("Encoding Format \"$$encoding\" not recognised"); |
|
429 } |
|
430 } |
|
431 |
|
432 sub OtherToUnicode |
|
433 { |
|
434 my $inputencoding = shift; |
|
435 my $unicode = shift; |
|
436 my $other = shift; |
|
437 my $foreignCharacters = shift; |
|
438 my $unicodeCharacters = shift; |
|
439 my $unicodetemplate = shift; |
|
440 my $replacementCharacter = 0; |
|
441 my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd); |
|
442 my @ranges=(); |
|
443 |
|
444 my $otherIndex= 0; |
|
445 my $numOfBytes = length($other); |
|
446 my $key = 0; |
|
447 my $inRange = 0; |
|
448 my $followByte = -1; |
|
449 |
|
450 if ($$inputencoding=~/^utf8$/i) |
|
451 { |
|
452 return &Utf8ToUnicode($unicode, $other, $unicodetemplate); |
|
453 } |
|
454 my $bigEndian; |
|
455 FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian); |
|
456 for (;;) |
|
457 { |
|
458 if ($otherIndex > $numOfBytes -1) |
|
459 { |
|
460 last; |
|
461 } |
|
462 my $frontByte = (unpack("x$otherIndex".'C', $other))[0]; |
|
463 # @ranges is an array of references. Each reference is a reference to an array |
|
464 for ($key = 0; $key <= $#ranges; ++$key) |
|
465 { |
|
466 my $arrayref = $ranges[$key]; |
|
467 if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1])) |
|
468 { |
|
469 $followByte = $arrayref->[2]; |
|
470 $inRange = 1; |
|
471 } |
|
472 } |
|
473 Assert ($inRange != 0, "cannot figure out the Byte size of the character"); |
|
474 my $tempByte = 0; |
|
475 for ($key = 0; $key<= $followByte; ++$key) |
|
476 { |
|
477 if ($bigEndian) |
|
478 { |
|
479 $tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0]; |
|
480 } |
|
481 else |
|
482 { |
|
483 $tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key)); |
|
484 } |
|
485 $otherIndex++; |
|
486 } |
|
487 if (exists $unicodeCharacters->{$tempByte}) |
|
488 { |
|
489 $$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte}; |
|
490 } |
|
491 else |
|
492 { |
|
493 $$unicode .= $unicodeReplacementCharacter; |
|
494 } |
|
495 } |
|
496 } |
|
497 |
|
498 sub UnicodeToOther |
|
499 { |
|
500 my $outputencoding = shift; |
|
501 my $other = shift; |
|
502 my $unicode = shift; |
|
503 my $foreignCharacters = shift; |
|
504 my $unicodeCharacters = shift; |
|
505 my $unicodetemplate = shift; |
|
506 my $replacementCharacter = 0; |
|
507 my @ranges=(); |
|
508 |
|
509 my $unicodeIndex= 0; |
|
510 my $numOfBytes = length($unicode); |
|
511 my @UnicodeUnpacked = (); |
|
512 my $key = 0; |
|
513 |
|
514 if ($$outputencoding=~/^utf8$/i) |
|
515 { |
|
516 return &UnicodeToUtf8($other, $unicode, $unicodetemplate); |
|
517 } |
|
518 my $bigEndian; |
|
519 FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian); |
|
520 my $foreignTemplate=$bigEndian? 'n': 'v'; |
|
521 @UnicodeUnpacked = unpack "$unicodetemplate*", $unicode; |
|
522 foreach $key (@UnicodeUnpacked) |
|
523 { |
|
524 if (!exists($foreignCharacters->{$key})) |
|
525 { |
|
526 $$other .= $replacementCharacter; |
|
527 } |
|
528 else |
|
529 { |
|
530 # This is the WRONG but it will work for the mean time |
|
531 # This will fail if the foreignCharacter has characters that are more than |
|
532 # two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes |
|
533 |
|
534 my $foreignValue = $foreignCharacters->{$key}; |
|
535 if ( $foreignValue <= 255) |
|
536 { |
|
537 $$other .= pack "C" , $foreignValue; |
|
538 } |
|
539 else |
|
540 { |
|
541 $$other .= pack $foreignTemplate, $foreignValue; |
|
542 } |
|
543 } |
|
544 } |
|
545 } |
|
546 |