# Copyright (c) 2000-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
# Initial Contributors:
# Nokia Corporation - initial contribution.
# Contributors:
# Description:
use strict;
use integer;
# Returns the directory part of this script's path (taken from $0),
# including the trailing separator, written with the separator of the
# current OS: back-slashes when $^O matches MSWin32, forward-slashes
# otherwise.
# When $0 has no directory component, "./" (or ".\" on Windows) is returned
# instead of an empty string, so that the result is still a usable @INC
# entry and a usable prefix for relative file names.
sub PerlScriptPath {
    my $perlScriptPath = $0;
    my $os = $^O; # get the OS type

    if ($os =~ /MSWin32/) { # Windows OS
        $perlScriptPath =~ s/\//\\/g;          # replace any forward-slashes with back-slashes
        $perlScriptPath =~ s/(\\?)[^\\]+$/$1/; # get rid of this Perl-script's file-name
        $perlScriptPath = '.\\' if ($perlScriptPath eq '');
    }
    else { # Unix OS
        $perlScriptPath =~ s/\\/\//g;          # replace any back-slashes with forward-slashes
        $perlScriptPath =~ s/(\/?)[^\/]+$/$1/; # get rid of this Perl-script's file-name
        $perlScriptPath = './' if ($perlScriptPath eq '');
    }
    return $perlScriptPath;
}

# "use" statements are executed at compile time, so @INC has to be extended
# at compile time as well for a following "use" to find modules that live
# next to this script. ("use lib PerlScriptPath()" is not used here as
# "use lib" only seems to work with *hard-coded* directory names.)
BEGIN {
    unshift(@INC, PerlScriptPath());
}
use UTF;
# The following numbers are used for byte-orders:
# 0 means unspecified
# 1 means big-endian
# 2 means little-endian
my $versionNumber = 3; # printed by PrintUsage
my $outputByteOrderMark = 0; # non-zero if a byte-order mark is to be written
my $unicodeByteOrder = 0; # 0, 1 or 2 - see the list of byte-orders above
my $inputEncoding = "";
my $outputEncoding = "";
my %foreignCharacters = (); # Hash with the foreign Character code as the value, unicode as key
my %unicodeCharacters = (); # Hash with the Unicode Character code as the value, foreign as key
my $inputFile=\*STDIN;
my $outputFile=\*STDOUT;

# NOTE(review): nothing in the visible code calls ReadParameters (or
# FixParametersToWorkWithWindows98), so the encodings above are still empty
# and the handles are still STDIN/STDOUT at this point - confirm that the
# argument-parsing call has not been lost from this copy of the file.
HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);

# Only close handles that were replaced; the standard streams are left open.
if ($inputFile!=\*STDIN) {
    close($inputFile) or die "closing input-file failed $!";
}
if ($outputFile!=\*STDOUT) {
    close($outputFile) or die "closing output-file failed $!";
}
# Walks the parameter list (passed as an array reference) from its
# second-to-last element down to its first. The visible code tests whether an
# element is exactly "-input" (case-insensitive) and removes the element that
# follows it from the list.
# NOTE(review): this copy of the sub has no braces and the condition below
# ends in a dangling "||", so part of the original is missing. Presumably
# "-output" is tested as well, and the following element is joined onto the
# switch with "=" before being removed (the usage text documents
# "-input=<format>") - confirm against the original file.
sub FixParametersToWorkWithWindows98
my $parameters=shift;
my $i;
for ($i=@$parameters-2; $i>=0; --$i) # iterate backwards as some parameters may be deleted from @$parameters
if (($parameters->[$i]=~/^(-input)$/i) ||
splice(@$parameters, $i+1, 1);
# Prints the tool's version number (the file-level $versionNumber) followed by
# a description of the command-line syntax to the currently selected output
# handle. Takes no parameters.
sub PrintUsage {
    print "\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n";
    print "Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t";
    print "options := [-big|-little][-byteordermark]\n\t";
    print "inputspec := -input=<format> [<input_file>]\n\t";
    print "outputspec := -output=<format> [<output_file>]\n\t";
    print "format := unicode|utf8|big5|gb2312...\n\n";
}
# Terminates the script with "Error: <message>" unless the condition holds.
#   $condition    - value tested for truth
#   $errorMessage - text placed after "Error: " in the die message
# The message has no trailing newline, so Perl appends the file name and line
# number of the die call.
sub Assert {
    my $condition = shift;
    my $errorMessage = shift;
    if (!($condition)) {
        die("Error: $errorMessage");
    }
}
# Writes "Warning: <message>" followed by a newline to STDERR.
#   $warningMessage - text placed after "Warning: "
sub PrintWarning {
    my $warningMessage = shift;
    print STDERR "Warning: $warningMessage\n";
}
# Tests whether the argument at index $$argindex is "-<inputoroutput>=<format>"
# and, if so, records the format and opens the corresponding file.
#   $args          - reference to the array of command-line arguments
#   $argindex      - reference to the index of the argument to examine
#   $inputoroutput - the switch name without "-" and "=" (the visible callers
#                    pass "input" or "output")
#   $encoding      - reference to the scalar that receives <format>; it must
#                    still be "" or Assert dies ("specified more than once")
#   $filehandle    - reference to a scalar holding a file handle; binmode is
#                    applied to the handle it refers to
# NOTE(review): this copy has no braces and is missing statements (nothing
# visible advances $$argindex, assigns INPUT_FILE/OUTPUT_FILE to $$filehandle,
# or separates the two open calls with an "else"). Presumably 1 is returned
# when the argument matched and 0 otherwise - confirm against the original.
sub TryFileParameter
my $args = shift;
my $argindex = shift;
my $inputoroutput = shift;
my $encoding = shift;
my $filehandle = shift;
my $prefix = "-$inputoroutput=";
# everything after the "=" is captured as the encoding name
if ($args->[$$argindex] =~ /^$prefix(.*)/)
Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
$$encoding = $1;
# true when there is no further argument, or when it is another "-" switch
if (($$argindex >= @$args) || ($args->[$$argindex] =~ /^-/))
if ($inputoroutput =~ /input/i)
# NOTE(review): two-argument open with the file name interpolated into the
# mode string; a file name starting with a mode character changes how the
# file is opened - consider the three-argument form.
open(INPUT_FILE,"<$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
open(OUTPUT_FILE,">$args->[$$argindex]") or die "opening $inputoroutput-file failed $!";
binmode $$filehandle;
return 1;
return 0;
# Parses the command-line arguments and stores the results through the
# references it is given.
#   $args                - reference to the array of arguments
#   $outputbyteordermark - reference; set to 1 for "-byteordermark"
#   $unicodebyteorder    - reference; set to 1 for "-big", 2 for "-little"
#   $inputencoding       - reference; passed on to TryFileParameter for "input"
#   $outputencoding      - reference; passed on to TryFileParameter for "output"
#   $inputhandle         - reference; passed on to TryFileParameter for "input"
#   $outputhandle        - reference; passed on to TryFileParameter for "output"
# Dies (via Assert) when a switch is given twice, when an argument is not
# recognised, or when either encoding is still "" afterwards.
# NOTE(review): this copy has no braces; the body of the first "if" (no
# arguments, "?" or "/?") is missing - presumably it prints the usage and
# exits - and there is no "else" in front of the TryFileParameter assertion.
# Confirm against the original file.
sub ReadParameters
my $args = shift;
my $outputbyteordermark = shift;
my $unicodebyteorder = shift;
my $inputencoding = shift;
my $outputencoding = shift;
my $inputhandle = shift;
my $outputhandle = shift;
my $i;
my $range; # not used anywhere in the visible code
if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
for ($i = 0; $i < @$args ; ++$i)
# The switch patterns below are not anchored, so they match anywhere inside
# an argument.
if ( $args->[$i]=~ /-byteordermark/i)
Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
$$outputbyteordermark = 1;
elsif ($args->[$i]=~ /-big/i)
Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
$$unicodebyteorder = 1;
elsif ($args->[$i]=~ /-little/i)
Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
$$unicodebyteorder = 2;
# $i is passed by reference to TryFileParameter together with the argument list
Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
Assert($$inputencoding ne "", "no input encoding is specified");
Assert($$outputencoding ne "", "no output encoding is specified");
# Reads from $inputhandle into the scalar referenced by $buffer.
#   $buffer           - reference to the scalar that receives the bytes
#   $numOfBytesToRead - number of bytes requested
#   $inputhandle      - file handle to read from
# NOTE(review): no braces or loop construct survive in this copy, yet the
# bookkeeping (a running total, a per-attempt size that is halved, "ThisTime"
# names) suggests that the original retried inside a loop - confirm against
# the original file before relying on the control flow shown here.
sub ReadFromFile
my $buffer = shift;
my $numOfBytesToRead = shift;
my $inputhandle = shift;
my $numOfBytesRead = 0;
my $numOfBytesToReadThisTime = $numOfBytesToRead;
my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
# never ask for more than is still outstanding
if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
$numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
# read() is called without an OFFSET argument, so it replaces the contents of
# $$buffer rather than appending to them
my $numOfBytesReadThisTime = read $inputhandle, $$buffer, $numOfBytesToReadThisTime;
# read() returns undef on error
if (defined $numOfBytesReadThisTime)
$numOfBytesRead += $numOfBytesReadThisTime;
# NOTE(review): this compares the running total with the size of the latest
# read; given the message, a comparison with $numOfBytesToRead may have been
# intended - confirm.
Assert($numOfBytesRead <= $numOfBytesReadThisTime, "internal error (read too many bytes)");
if (($numOfBytesRead >= $numOfBytesReadThisTime) || $numOfBytesReadThisTime == 0)
# integer division ("use integer" is in effect for this file)
$numOfBytesToReadThisTime /= 2;
Assert($numOfBytesToReadThisTime >0, "reading from file failed");
# Deals with byte-order marks on both sides of the conversion: inspects the
# first two bytes of unicode input for a byte-order mark, and writes a
# byte-order mark to the output when one was requested.
#   $outputbyteordermark - true if a byte-order mark is to be written
#   $unicodebyteorder    - reference to the byte order (0 unspecified,
#                          1 big-endian, 2 little-endian); overwritten with
#                          the order found in the input's byte-order mark
#   $inputencoding       - reference to the input encoding name
#   $outputencoding      - reference to the output encoding name
#   $inputhandle         - handle the input is read from
#   $outputhandle        - handle the output is written to
# NOTE(review): this copy has no braces and bare "else" lines appear to be
# missing, so it cannot be told from here whether the rewind below happens
# only when no byte-order mark was found, or whether the Assert/WriteToFile
# at the end belong to an "else" of the "ne \"unicode\"" test - confirm
# against the original file.
sub HandleByteOrderMarks
my $outputbyteordermark = shift;
my $unicodebyteorder = shift;
my $inputencoding = shift;
my $outputencoding = shift;
my $inputhandle = shift;
my $outputhandle = shift;
# unanchored, case-insensitive match on the input encoding name
if ($$inputencoding =~ /unicode/i)
my $firstUnicodeCharacter = 0;
ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
my $byteOrderSpecifiedByByteOrderMark = 0;
if (length($firstUnicodeCharacter) == 2)
my @firstUnicodeCharacter = unpack "C*", $firstUnicodeCharacter;
# bytes FF FE: little-endian byte-order mark
if (($firstUnicodeCharacter[0]==0xff) && ($firstUnicodeCharacter[1]==0xfe))
$byteOrderSpecifiedByByteOrderMark = 2;
# bytes FE FF: big-endian byte-order mark
elsif (($firstUnicodeCharacter[0]==0xfe) && ($firstUnicodeCharacter[1]==0xff))
$byteOrderSpecifiedByByteOrderMark = 1;
my $error = seek $inputhandle, 0, 0; # rewind to start of file
Assert ($error == 1, "could not rewind to the start of input file");
# a byte-order mark in the input takes precedence over the command-line setting
if ($byteOrderSpecifiedByByteOrderMark!=0)
if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
$$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
if ($outputbyteordermark)
# exact, case-sensitive comparison (unlike the input test above)
if ($$outputencoding ne "unicode")
PrintWarning("\"-byteordermark\" is only relevant for unicode output");
Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
# FE FF for big-endian (1), FF FE otherwise
my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
WriteToFile(\$firstUnicodeCharacter, $outputhandle);
# Writes the contents of a buffer to a file handle.
#   $buffer       - reference to the scalar holding the bytes to write
#   $outputhandle - handle to write to
# Returns the (true) result of print on success and dies with an "Error: ..."
# message if print reports a failure, so that a failed write is not silently
# ignored.
sub WriteToFile {
    my $buffer = shift;
    my $outputhandle = shift;
    print {$outputhandle} $$buffer
        or die("Error: writing to the output file failed: $!");
}
# Reads the input, converts it from the input encoding to the output encoding
# and writes the result.
#   $unicodebyteorder  - reference to the byte order of unicode text
#   $inputencoding     - reference to the input encoding name
#   $outputencoding    - reference to the output encoding name
#   $inputhandle       - handle the input is read from
#   $outputhandle      - handle the output is written to
#   $foreignCharacters - hash reference handed on to the converters
#   $unicodeCharacters - hash reference handed on to the converters
# Unicode is the pivot format: non-unicode input is first converted to
# unicode, and unicode is then converted to a non-unicode output encoding.
# Each conversion step writes into the next of three buffers. The converters
# are given the pack template 'v', and unicode buffers are byte-swapped on the
# way in and on the way out when the byte order is big-endian.
# NOTE(review): a single ReadFromFile call requesting $largeNumber bytes is
# the only read; how input longer than that is treated depends on
# ReadFromFile - confirm.
sub DoConversion {
    my $unicodebyteorder = shift;
    my $inputencoding = shift;
    my $outputencoding = shift;
    my $inputhandle = shift;
    my $outputhandle = shift;
    my $foreignCharacters = shift;
    my $unicodeCharacters = shift;

    my $currentBuffer = 0;
    my @arrayOfBuffers = ('', '', '');
    my $largeNumber=1000000;

    ReadFromFile(\($arrayOfBuffers[$currentBuffer]), $largeNumber, $inputhandle);
    ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));

    # Identical encoding names need no conversion: the input is copied through.
    if ($$inputencoding ne $$outputencoding) {
        if ($$inputencoding !~ /^unicode$/i) {
            my $nextBuffer = $currentBuffer + 1;
            OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
            $currentBuffer = $nextBuffer;
        }
        if ($$outputencoding !~ /^unicode$/i) {
            my $nextBuffer = $currentBuffer + 1;
            UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
            $currentBuffer = $nextBuffer;
        }
    }

    ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
    WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
}
# Swaps the two bytes of every 16-bit unit in a buffer when the buffer holds
# unicode text and the byte order is big-endian; does nothing for any other
# encoding.
#   $unicodebyteorder - reference to the byte order (0 unspecified,
#                       1 big-endian, 2 little-endian); an unspecified order
#                       is changed to 2 after a warning has been printed
#   $encoding         - reference to the encoding name of the buffer
#   $buffer           - reference to the scalar holding the bytes; modified
#                       in place
# Dies (via Assert) if a unicode buffer has an odd number of bytes.
sub ReverseByteOrderIfUnicodeAndBigEndian {
    my $unicodebyteorder = shift;
    my $encoding = shift;
    my $buffer = shift;

    if ($$encoding =~ /^unicode$/i) {
        Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
        if ($$unicodebyteorder==0) {
            PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
            $$unicodebyteorder = 2;
        }
        if ($$unicodebyteorder==1) {
            # 'n' reads big-endian 16-bit values, 'v' writes them little-endian;
            # applying this twice restores the original bytes
            $$buffer=pack('v*', unpack('n*', $$buffer));
        }
    }
}
# Loads the conversion data for an encoding from the binary file
# "<script directory>charconv\<encoding>.dat".
#   $foreignCharacters    - hash reference (not used in the visible code)
#   $unicodeCharacters    - hash reference (not used in the visible code)
#   $encoding             - reference to the encoding name
#   $replacementCharacter - reference; receives the replacement bytes read
#                           from the file
#   $ranges               - array reference; receives one
#                           [lower, upper, follow] triple per range
#   $bigEndian            - reference; set to 0 or 1 from the file's first byte
# File layout as read below: 1 byte endianness; 1 byte length followed by that
# many replacement bytes; 1 byte range count followed by 3 bytes per range;
# the rest (up to $largenumber bytes) is unpacked as 16-bit little-endian
# values that are walked two at a time.
# NOTE(review): this copy has no braces, bare "else" lines are missing (in
# front of the "Illegal Endianness" print and, presumably, in front of the
# final die, which would belong to the "-e $dataFile" test), and the body of
# the final loop - presumably the code that fills the two hashes - is
# missing. Confirm against the original file.
sub FillInHashes
my $foreignCharacters = shift;
my $unicodeCharacters = shift;
my $encoding = shift;
my $replacementCharacter = shift;
my $ranges = shift;
my $bigEndian = shift;
my $endianness = 0;
my $replacenum = 0;
my $rangenum = 0;
my $fileread = 0;
my $largenumber = 1000000;
# the sub-directory separator is a hard-coded back-slash
my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';
my $line;
if (-e $dataFile)
open (HASH_INPUT, "< $dataFile") or die ("Could not open file for reading");
binmode HASH_INPUT;
# reading the endianness
# (the return values of the read calls are stored in $fileread but never checked)
$fileread = read HASH_INPUT, $endianness, 1;
$endianness = unpack "C",$endianness;
if ($endianness == 0)
# set the template to a default for the mean time
# NOTE(review): the original comment named "n" as the default, but the value
# stored here is 0, the same as for endianness 1 - confirm which is intended
$$bigEndian = 0;
elsif ($endianness == 1)
$$bigEndian = 0;
elsif ($endianness == 2)
$$bigEndian = 1;
print "Illegal Endianness specified in the control files";
#reading the replacement characters
$fileread = read HASH_INPUT, $replacenum,1;
$replacenum= unpack "C",$replacenum;
$fileread = read HASH_INPUT, $$replacementCharacter,$replacenum;
# reading the ranges
$fileread = read HASH_INPUT, $rangenum, 1;
$rangenum = unpack "C",$rangenum;
my $i; # loop variable
for ($i=0; $i < $rangenum; ++$i)
my $lowerrange = 0;
my $upperrange = 0;
my $followchar = 0;
$fileread = read HASH_INPUT,$lowerrange,1;
$lowerrange = unpack "C",$lowerrange;
$fileread = read HASH_INPUT,$upperrange,1;
$upperrange = unpack "C",$upperrange;
$fileread = read HASH_INPUT,$followchar,1;
$followchar = unpack "C",$followchar;
push @$ranges,[$lowerrange,$upperrange,$followchar];
# reading the remaining data as a list of 16-bit little-endian values
my $data = 0;
my @unpackeddata = 0;
$fileread = read HASH_INPUT, $data, $largenumber;
@unpackeddata = unpack "v*",$data;
# the values are visited in pairs
for($i = 0; $i <= $#unpackeddata; $i= $i+2)
die ("Encoding Format \"$$encoding\" not recognised");
# Converts text in a foreign encoding to unicode, appending the result to the
# scalar referenced by $unicode.
#   $inputencoding     - reference to the input encoding name
#   $unicode           - reference to the scalar that receives the unicode text
#   $other             - the foreign text (a plain scalar, not a reference)
#   $foreignCharacters - hash reference, handed on to FillInHashes
#   $unicodeCharacters - hash reference keyed by foreign character code; the
#                        values are packed into the output
#   $unicodetemplate   - pack template used for each unicode character
# "utf8" input is delegated to Utf8ToUnicode (presumably provided by the UTF
# module - confirm). For any other encoding the conversion data is loaded with
# FillInHashes; foreign codes without an entry in the hash are replaced by
# U+FFFD.
# NOTE(review): this copy has no braces and is missing statements: nothing
# visible leaves the "for (;;)" loop, advances $otherIndex, or separates the
# big-endian/little-endian and found/not-found alternatives with "else".
# Confirm against the original file.
sub OtherToUnicode
my $inputencoding = shift;
my $unicode = shift;
my $other = shift;
my $foreignCharacters = shift;
my $unicodeCharacters = shift;
my $unicodetemplate = shift;
my $replacementCharacter = 0;
my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
my @ranges=();
my $otherIndex= 0;
my $numOfBytes = length($other);
my $key = 0;
my $inRange = 0;
my $followByte = -1;
if ($$inputencoding=~/^utf8$/i)
return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
my $bigEndian;
FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
for (;;)
# true once every input byte has been consumed
if ($otherIndex > $numOfBytes -1)
# the byte at offset $otherIndex
my $frontByte = (unpack("x$otherIndex".'C', $other))[0];
# @ranges is an array of references. Each reference is a reference to an array
# of the form [lower bound, upper bound, follow value] (as pushed by FillInHashes)
for ($key = 0; $key <= $#ranges; ++$key)
my $arrayref = $ranges[$key];
if (($frontByte >= $arrayref->[0]) && ($frontByte <= $arrayref->[1]))
$followByte = $arrayref->[2];
$inRange = 1;
Assert ($inRange != 0, "cannot figure out the Byte size of the character");
# combine $followByte+1 bytes into one character code
my $tempByte = 0;
for ($key = 0; $key<= $followByte; ++$key)
if ($bigEndian)
$tempByte = ($tempByte << 8) | (unpack("x$otherIndex".'C', $other))[0];
$tempByte = $tempByte | ((unpack("x$otherIndex".'C', $other))[0] << (8*$key));
if (exists $unicodeCharacters->{$tempByte})
$$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
$$unicode .= $unicodeReplacementCharacter;
# Converts unicode text to a foreign encoding, appending the result to the
# scalar referenced by $other.
#   $outputencoding    - reference to the output encoding name
#   $other             - reference to the scalar that receives the foreign text
#   $unicode           - the unicode text (a plain scalar, not a reference)
#   $foreignCharacters - hash reference keyed by unicode character code; the
#                        values are the foreign character codes
#   $unicodeCharacters - hash reference, handed on to FillInHashes
#   $unicodetemplate   - pack template of one unicode character
# "utf8" output is delegated to UnicodeToUtf8 (presumably provided by the UTF
# module - confirm). For any other encoding the conversion data is loaded with
# FillInHashes; foreign values up to 255 are written as one byte, larger
# values as a 16-bit value in the byte order given by the data file.
# NOTE(review): this copy has no braces and bare "else" lines appear to be
# missing (between the replacement-character case and the lookup, and between
# the one-byte and two-byte pack); the sub may also continue beyond the last
# line shown. Confirm against the original file.
sub UnicodeToOther
my $outputencoding = shift;
my $other = shift;
my $unicode = shift;
my $foreignCharacters = shift;
my $unicodeCharacters = shift;
my $unicodetemplate = shift;
my $replacementCharacter = 0;
my @ranges=();
my $unicodeIndex= 0;
my $numOfBytes = length($unicode);
my @UnicodeUnpacked = ();
my $key = 0;
if ($$outputencoding=~/^utf8$/i)
return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
my $bigEndian;
FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);
# 'n' packs a 16-bit value big-endian, 'v' little-endian
my $foreignTemplate=$bigEndian? 'n': 'v';
@UnicodeUnpacked = unpack "$unicodetemplate*", $unicode;
foreach $key (@UnicodeUnpacked)
# characters without a mapping are replaced
if (!exists($foreignCharacters->{$key}))
$$other .= $replacementCharacter;
# This is WRONG but it will work for the mean time
# This will fail if the foreignCharacter has characters that are more than
# two bytes long ..... But this should work for foreign characters of 1 or 2 Bytes
my $foreignValue = $foreignCharacters->{$key};
if ( $foreignValue <= 255)
$$other .= pack "C" , $foreignValue;
$$other .= pack $foreignTemplate, $foreignValue;