charconvfw/Charconv/ongoing/Group/CHARCONV.PL
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Fri, 16 Apr 2010 16:55:07 +0300
changeset 16 56cd22a7a1cb
parent 0 1fb32624e06b
permissions -rw-r--r--
Revision: 201011 Kit: 201015

#
# Copyright (c) 2000 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
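# Description: Command-line character set conversion tool (converts text between unicode, utf8 and table-driven foreign encodings).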
#

use strict;
use integer;

# Returns the directory part of the path this script was started with ($0),
# using back-slashes as separators and keeping the trailing back-slash.
# Returns an empty string when $0 has no directory component.
sub PerlScriptPath
	{
	(my $scriptDirectory = $0) =~ tr{/}{\\}; # forward-slashes become back-slashes
	$scriptDirectory =~ s/(\\?)[^\\]+$/$1/; # drop the file-name, keeping the separator in front of it
	return $scriptDirectory;
	}
BEGIN
	{
	# Put this script's own directory at the front of the module search path at
	# compile time (presumably so the "use" statements that follow can find
	# modules stored next to the script).
	# can't do "use lib &PerlScriptPath()" here as "use lib" only seems to work with *hard-coded* directory names
	my $scriptDirectory = PerlScriptPath();
	unshift @INC, $scriptDirectory;
	}
use PARSER;
use UTF;

# The following numbers are used for byte-orders:
#     0 means unspecified
#     1 means big-endian
#     2 means little-endian

# Main program: normalise the arguments, parse them, deal with byte-order
# marks, convert the input to the output encoding and close any files opened.

# turn "-input <format>" / "-output <format>" pairs back into "-input=<format>" / "-output=<format>"
FixParametersToWorkWithWindows98(\@ARGV);
my $versionNumber = 3; # printed by PrintUsage
my $outputByteOrderMark = 0; # set to 1 by ReadParameters when "-byteordermark" is given
my $unicodeByteOrder = 0; # 0 = unspecified, 1 = big-endian, 2 = little-endian
my $inputEncoding = ""; 
my $outputEncoding = ""; 
my %foreignCharacters = (); # filled by FillInHashes: key is the unicode character code, value is the foreign character code
my %unicodeCharacters = (); # filled by FillInHashes: key is the foreign character code, value is the unicode character code


# the standard streams are used unless ReadParameters opens a named file
my $inputFile=\*STDIN;
my $outputFile=\*STDOUT;
ReadParameters(\@ARGV,\$outputByteOrderMark,\$unicodeByteOrder,\$inputEncoding,\$outputEncoding,\$inputFile,\$outputFile);
HandleByteOrderMarks($outputByteOrderMark,\$unicodeByteOrder, \$inputEncoding,\$outputEncoding, $inputFile, $outputFile);
DoConversion(\$unicodeByteOrder, \$inputEncoding, \$outputEncoding, $inputFile, $outputFile, \%foreignCharacters, \%unicodeCharacters);
# the references are compared numerically, so these tests are true only when
# the handle is no longer the standard stream it started out as
if ($inputFile!=\*STDIN)
	{
	close($inputFile) or die;
	}
if ($outputFile!=\*STDOUT)
	{
	close($outputFile) or die;
	}

# Re-joins every "-input <format>" / "-output <format>" pair of arguments into
# the single "-input=<format>" / "-output=<format>" argument that the rest of
# the script expects. The switch names are matched case-insensitively.
#   parameter 1: reference to the argument array, which is modified in place
sub FixParametersToWorkWithWindows98
	{
	my ($parameters) = @_;
	# visit the indices from the back so that removing an element never shifts one still to be visited;
	# the last element is skipped because it has nothing following it to absorb
	foreach my $index (reverse(0 .. $#$parameters - 1))
		{
		next unless $parameters->[$index] =~ /^-(?:input|output)$/i;
		my ($format) = splice(@$parameters, $index + 1, 1);
		$parameters->[$index] .= '='.$format;
		}
	}

# Prints the version number (the file-level $versionNumber) and the
# command-line syntax to the currently selected output handle.
sub PrintUsage
	{
	my @usageText =
		(
		"\nVersion $versionNumber\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n",
		"Usage:\n\n\t charconv [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t",
		"options    :=  [-big|-little][-byteordermark]\n\t",
		"inputspec  :=  -input=<format> [<input_file>]\n\t",
		"outputspec :=  -output=<format> [<output_file>]\n\t",
		"format     :=  unicode|utf8|big5|gb2312...\n\n",
		);
	foreach my $piece (@usageText)
		{
		print $piece;
		}
	}

# Terminates the script with "Error: <message>" unless the condition is true.
#   parameter 1: the condition that is expected to hold
#   parameter 2: the message to die with when it does not
sub Assert
	{
	my ($condition, $errorMessage) = @_;
	return if $condition;
	die("Error: $errorMessage");
	}

# Writes "Warning: <message>" followed by a newline to STDERR.
#   parameter 1: the text of the warning
sub PrintWarning
	{
	my ($warningMessage) = @_;
	print STDERR 'Warning: '.$warningMessage."\n";
	}
	

# Tries to interpret the current argument as "-input=<format>" or
# "-output=<format>", optionally followed by a file-name argument.
#   $args          - reference to the argument array
#   $argindex      - reference to the index of the argument to examine; advanced
#                    by one when the following argument is consumed as a file name
#   $inputoroutput - "input" or "output", selecting which switch to look for
#   $encoding      - reference to a scalar that receives <format>
#   $filehandle    - reference to a scalar holding the handle to use; replaced by
#                    a handle on the named file when a file name follows
# Returns 1 if the argument was the requested switch, 0 otherwise.
# Dies if the switch has already been seen or if the file cannot be opened.
sub TryFileParameter
	{
	my $args = shift;
	my $argindex = shift;
	my $inputoroutput = shift;
	my $encoding = shift;
	my $filehandle = shift;
	my $prefix = "-$inputoroutput=";

	# The switch name is matched case-insensitively, like every other option of
	# this script (FixParametersToWorkWithWindows98 accepts e.g. "-INPUT" and
	# turns it into "-INPUT=<format>", which therefore has to be recognised here).
	if ($args->[$$argindex] =~ /^\Q$prefix\E(.*)/i)
		{
		my $format = $1;
		Assert($$encoding eq "", "\"$prefix...\" is specified more than once");
		$$encoding = $format;
		my $fileindex = $$argindex + 1;
		# the next argument is a file name unless it is missing or is another switch
		if (($fileindex < @$args) && ($args->[$fileindex] !~ /^-/))
			{
			my $filename = $args->[$fileindex];
			# three-argument open: the file name is taken literally, so no character
			# in it can be mistaken for part of the open mode
			if ($inputoroutput =~ /input/i)
				{
				open(INPUT_FILE, '<', $filename) or die "opening $inputoroutput-file failed $!";
				$$filehandle=\*INPUT_FILE;
				}
			else
				{
				open(OUTPUT_FILE, '>', $filename) or die "opening $inputoroutput-file failed $!";
				$$filehandle=\*OUTPUT_FILE;
				}
			$$argindex = $fileindex; # tell the caller that the file name has been consumed
			}
		# the data is treated as raw bytes whether it comes from a file or a standard stream
		binmode $$filehandle;
		return 1;
		}
	return 0;
	}

# Parses the command-line arguments.
#   $args                - reference to the argument array
#   $outputbyteordermark - reference to a scalar set to 1 when "-byteordermark" is given
#   $unicodebyteorder    - reference to a scalar set to 1 for "-big" or 2 for "-little"
#   $inputencoding       - reference to a scalar receiving the "-input=" format
#   $outputencoding      - reference to a scalar receiving the "-output=" format
#   $inputhandle         - reference to the scalar holding the input handle
#   $outputhandle        - reference to the scalar holding the output handle
# Prints the usage text and exits when there are no arguments or the first one
# is "?" or "/?". Dies (through Assert) when an option is repeated, an argument
# is not recognised, or either encoding is missing.
sub ReadParameters
	{
	my $args = shift;
	my $outputbyteordermark = shift;
	my $unicodebyteorder = shift;
	my $inputencoding = shift;
	my $outputencoding = shift;
	my $inputhandle = shift;
	my $outputhandle = shift;
	my $i;
	if ((@$args <= 0) || ($args->[0] eq "?") || ($args->[0] eq "/?"))
		{
		PrintUsage();
		exit;
		}

	# $i is handed to TryFileParameter by reference so that it can step over a file-name argument
	for ($i = 0; $i < @$args ; ++$i)
		{
		# The switches are matched against the whole argument: an unanchored match
		# would also swallow any argument that merely contains "-big", "-little" or
		# "-byteordermark" (e.g. "-input=ucs-little") instead of parsing or rejecting it.
		if ($args->[$i] =~ /^-byteordermark$/i)
			{
			Assert(!$$outputbyteordermark, "\"-byteordermark\" is specified more than once");
			$$outputbyteordermark = 1;
			}
		elsif ($args->[$i] =~ /^-big$/i)
			{
			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
			$$unicodebyteorder = 1;
			}
		elsif ($args->[$i] =~ /^-little$/i)
			{
			Assert(($$unicodebyteorder==0),"the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
			$$unicodebyteorder = 2;
			}
		else
			{
			Assert(TryFileParameter($args, \$i, "input",$inputencoding,$inputhandle) ||
				   TryFileParameter($args, \$i, "output",$outputencoding, $outputhandle), "bad parameter \"$args->[$i]\"");
			}
		}
	Assert($$inputencoding ne "", "no input encoding is specified");
	Assert($$outputencoding ne "", "no output encoding is specified");
	}

# Reads up to a given number of bytes from a filehandle.
#   $buffer           - reference to the scalar that receives the data; whatever
#                       it held before is discarded
#   $numOfBytesToRead - the maximum number of bytes wanted
#   $inputhandle      - the handle to read from
# Reading continues until the requested number of bytes has arrived or the end
# of the file is reached, so the buffer is shorter than requested only at end
# of file. A failing read is retried with half the request size; the script
# dies (through Assert) once that size reaches zero.
sub ReadFromFile
	{
	my $buffer = shift;
	my $numOfBytesToRead = shift;
	my $inputhandle = shift;
	my $numOfBytesRead = 0;
	my $numOfBytesToReadThisTime = $numOfBytesToRead;

	$$buffer = '';
	while ($numOfBytesRead < $numOfBytesToRead)
		{
		my $remainingNumOfBytesToRead = $numOfBytesToRead - $numOfBytesRead;
		if ($numOfBytesToReadThisTime > $remainingNumOfBytesToRead)
			{
			$numOfBytesToReadThisTime = $remainingNumOfBytesToRead;
			}
		# the OFFSET argument appends to what has been read so far instead of overwriting it
		my $numOfBytesReadThisTime = read($inputhandle, $$buffer, $numOfBytesToReadThisTime, $numOfBytesRead);
		if (!defined $numOfBytesReadThisTime)
			{
			# int() keeps the halving exact even without "use integer", so it is guaranteed to reach zero
			$numOfBytesToReadThisTime = int($numOfBytesToReadThisTime / 2);
			Assert($numOfBytesToReadThisTime >0, "reading from file failed");
			next;
			}
		if ($numOfBytesReadThisTime == 0)
			{
			last; # end of file
			}
		$numOfBytesRead += $numOfBytesReadThisTime;
		Assert($numOfBytesRead <= $numOfBytesToRead, "internal error (read too many bytes)");
		}
	return;
	}

# Consumes a byte-order mark at the start of unicode input and, when requested,
# writes one at the start of unicode output.
#   $outputbyteordermark - true if "-byteordermark" was given
#   $unicodebyteorder    - reference to the byte order (0 = unspecified,
#                          1 = big-endian, 2 = little-endian); overwritten with
#                          the order found in the input's byte-order mark, if any
#   $inputencoding       - reference to the input format name
#   $outputencoding      - reference to the output format name
#   $inputhandle         - handle to read from
#   $outputhandle        - handle to write to
# Dies (through Assert) if the input cannot be rewound after finding that it
# does not start with a byte-order mark, or if a byte-order mark is requested
# for the output while the byte order is still unspecified.
sub HandleByteOrderMarks
	{
	my $outputbyteordermark = shift;
	my $unicodebyteorder = shift;
	my $inputencoding = shift;
	my $outputencoding = shift;
	my $inputhandle = shift;
	my $outputhandle = shift;

	# "unicode" is recognised exactly as DoConversion and
	# ReverseByteOrderIfUnicodeAndBigEndian recognise it: whole name, any case
	if ($$inputencoding =~ /^unicode$/i)
		{
		my $firstUnicodeCharacter = 0;
		ReadFromFile(\$firstUnicodeCharacter, 2, $inputhandle);
		my $byteOrderSpecifiedByByteOrderMark = 0;
		if (length($firstUnicodeCharacter) == 2)
			{
			if ($firstUnicodeCharacter eq "\xff\xfe")
				{
				$byteOrderSpecifiedByByteOrderMark = 2;
				}
			elsif ($firstUnicodeCharacter eq "\xfe\xff")
				{
				$byteOrderSpecifiedByByteOrderMark = 1;
				}
			else
				{
				# not a byte-order mark, so the two bytes are text and must be read again
				my $rewound = seek $inputhandle, 0, 0; # rewind to start of file
				Assert ($rewound == 1, "could not rewind to the start of input file");
				}
			}
		if ($byteOrderSpecifiedByByteOrderMark!=0)
			{
			if (($$unicodebyteorder!=0) && ($byteOrderSpecifiedByByteOrderMark!=$$unicodebyteorder))
				{
				PrintWarning ("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
				}
			$$unicodebyteorder = $byteOrderSpecifiedByByteOrderMark;
			}
		}
	if ($outputbyteordermark)
		{
		# case-insensitive, so that e.g. "-output=Unicode" gets its byte-order mark
		if ($$outputencoding !~ /^unicode$/i)
			{
			PrintWarning("\"-byteordermark\" is only relevant for unicode output");
			}
		else
			{
			Assert($$unicodebyteorder!=0, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
			my $firstUnicodeCharacter=($$unicodebyteorder==1)? "\xfe\xff": "\xff\xfe";
			WriteToFile(\$firstUnicodeCharacter, $outputhandle);
			}
		}
	}

# Writes the contents of a buffer to a filehandle.
#   parameter 1: reference to the scalar holding the bytes to write
#   parameter 2: the handle to write to
sub WriteToFile
	{
	my ($buffer, $outputhandle) = @_;
	print {$outputhandle} $$buffer;
	}

# Reads the whole input, converts it from the input encoding to the output
# encoding (going through little-endian 16-bit unicode) and writes the result.
#   $unicodebyteorder  - reference to the unicode byte order (0/1/2)
#   $inputencoding     - reference to the input format name
#   $outputencoding    - reference to the output format name
#   $inputhandle       - handle to read from
#   $outputhandle      - handle to write to
#   $foreignCharacters - reference to the unicode-to-foreign hash (filled on demand)
#   $unicodeCharacters - reference to the foreign-to-unicode hash (filled on demand)
sub DoConversion
	{
	my $unicodebyteorder = shift;
	my $inputencoding = shift;
	my $outputencoding = shift;
	my $inputhandle = shift;
	my $outputhandle = shift;
	my $foreignCharacters = shift;
	my $unicodeCharacters = shift;

	my $currentBuffer = 0;
	my @arrayOfBuffers = ('', '', '');
	my $largeNumber=1000000;
	# Read in chunks until a chunk comes back empty (end of file), so that input
	# longer than one chunk is converted in full rather than silently cut short.
	for (;;)
		{
		my $chunk = '';
		ReadFromFile(\$chunk, $largeNumber, $inputhandle);
		if (length($chunk) == 0)
			{
			last;
			}
		$arrayOfBuffers[$currentBuffer] .= $chunk;
		}
	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $inputencoding, \($arrayOfBuffers[$currentBuffer]));
	if ($$inputencoding ne $$outputencoding)
		{
		# each conversion step writes into the next buffer of the array
		if ($$inputencoding !~ /^unicode$/i)
			{
			my $nextBuffer = $currentBuffer + 1;
			OtherToUnicode ($inputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
			$currentBuffer = $nextBuffer;
			}
		if ($$outputencoding !~ /^unicode$/i)
			{
			my $nextBuffer = $currentBuffer + 1;
			UnicodeToOther($outputencoding, \($arrayOfBuffers[$nextBuffer]), ($arrayOfBuffers[$currentBuffer]), $foreignCharacters, $unicodeCharacters, 'v');
			$currentBuffer = $nextBuffer;
			}
		}
	ReverseByteOrderIfUnicodeAndBigEndian($unicodebyteorder, $outputencoding, \($arrayOfBuffers[$currentBuffer]));
	WriteToFile(\($arrayOfBuffers[$currentBuffer]), $outputhandle);
	}

# Swaps the two bytes of every 16-bit unit in a buffer when the buffer holds
# unicode text and the byte order is big-endian. Buffers in any other encoding
# are left untouched.
#   parameter 1: reference to the byte order (0 = unspecified, 1 = big-endian,
#                2 = little-endian); an unspecified order is changed to
#                little-endian, with a warning
#   parameter 2: reference to the format name of the buffer's contents
#   parameter 3: reference to the buffer, which is modified in place
# Dies (through Assert) if a unicode buffer holds an odd number of bytes.
sub ReverseByteOrderIfUnicodeAndBigEndian
	{
	my ($unicodebyteorder, $encoding, $buffer) = @_;

	# only unicode text has a byte order
	return unless ($$encoding =~ /^unicode$/i);

	Assert(length($$buffer)%2==0, "internal error (bad number of bytes in unicode buffer)");
	if ($$unicodebyteorder==0)
		{
		PrintWarning("the byte order of unicode text is unspecified - defaulting to little-endian");
		$$unicodebyteorder = 2;
		}
	return unless ($$unicodebyteorder==1);

	# read as big-endian 16-bit values, write back as little-endian ones
	$$buffer=pack('v*', unpack('n*', $$buffer));
	}

# Loads the conversion table of one encoding from "charconv\<encoding>.dat" in
# the script's directory.
#   $foreignCharacters    - reference to a hash; emptied, then filled with
#                           unicode code => foreign code
#   $unicodeCharacters    - reference to a hash; emptied, then filled with
#                           foreign code => unicode code
#   $encoding             - reference to the format name
#   $replacementCharacter - reference to a scalar receiving the raw bytes of the
#                           encoding's replacement character
#   $ranges               - reference to an array; one [lower, upper, follow]
#                           triple is appended per lead-byte range
#   $bigEndian            - reference to a scalar set to 1 or 0
# Layout of the data file as read here: one byte of endianness, one byte giving
# the length of the replacement character followed by that many bytes, one byte
# giving the number of ranges followed by three bytes per range, then
# little-endian 16-bit values in (foreign code, unicode code) pairs.
# Dies if the data file does not exist or cannot be opened.
sub FillInHashes
	{
	my $foreignCharacters = shift;
	my $unicodeCharacters = shift;
	my $encoding = shift;
	my $replacementCharacter = shift;
	my $ranges = shift;
	my $bigEndian = shift;

	my $endianness = 0;
	my $replacenum = 0;
	my $rangenum = 0;
	my $largenumber = 1000000;

	my $dataFile=&PerlScriptPath()."charconv\\".$$encoding.'.dat';

	if (!(-e $dataFile))
		{
		die ("Encoding Format \"$$encoding\" not recognised");
		}
	# three-argument open: the path is taken literally whatever characters the format name contains
	open (HASH_INPUT, '<', $dataFile) or die ("Could not open file for reading");
	binmode HASH_INPUT;

	# The same two hashes are used for the input and for the output encoding, so
	# the mappings of a previously loaded encoding have to go - otherwise a
	# character missing from this table would still be found through the old one.
	%$foreignCharacters = ();
	%$unicodeCharacters = ();

	# reading the endianness
	read HASH_INPUT, $endianness, 1;
	$endianness = unpack "C",$endianness;
	# NOTE(review): 1 maps to little-endian and 2 to big-endian here, which is the
	# reverse of the numbering used for the command-line byte order - presumably
	# this is the data file's own convention; confirm against the tool that writes it
	if ($endianness == 0)
		{
		# unspecified: default to little-endian for the mean time
		$$bigEndian = 0;
		}
	elsif ($endianness == 1)
		{
		$$bigEndian = 0;
		}
	elsif ($endianness == 2)
		{
		$$bigEndian = 1;
		}
	else
		{
		# reported on STDERR because STDOUT may be carrying the converted text
		print STDERR "Illegal Endianness specified in the control files\n";
		$$bigEndian = 0;
		}
	# reading the replacement characters
	read HASH_INPUT, $replacenum,1;
	$replacenum= unpack "C",$replacenum;
	read HASH_INPUT, $$replacementCharacter,$replacenum;
	# reading the ranges
	read HASH_INPUT, $rangenum, 1;
	$rangenum = unpack "C",$rangenum;
	my $i; # loop variable
	for ($i=0; $i < $rangenum; ++$i)
		{
		my $lowerrange = 0;
		my $upperrange = 0;
		my $followchar = 0;

		read HASH_INPUT,$lowerrange,1;
		$lowerrange = unpack "C",$lowerrange;
		read HASH_INPUT,$upperrange,1;
		$upperrange = unpack "C",$upperrange;
		read HASH_INPUT,$followchar,1;
		$followchar = unpack "C",$followchar;

		push @$ranges,[$lowerrange,$upperrange,$followchar];
		}
	# reading the character pairs
	my $data = '';
	read HASH_INPUT, $data, $largenumber;
	close(HASH_INPUT);
	my @unpackeddata = unpack "v*",$data;
	# only complete pairs are used; a value left over at the end has no partner
	for ($i = 0; $i < $#unpackeddata; $i= $i+2)
		{
		$unicodeCharacters->{$unpackeddata[$i]}=$unpackeddata[$i+1];
		$foreignCharacters->{$unpackeddata[$i+1]}=$unpackeddata[$i];
		}
	}

# Converts text in a foreign encoding to 16-bit unicode.
#   $inputencoding     - reference to the format name of the input
#   $unicode           - reference to the scalar the unicode output is appended to
#   $other             - the foreign text (a string of bytes)
#   $foreignCharacters - reference to the unicode-to-foreign hash (filled here)
#   $unicodeCharacters - reference to the foreign-to-unicode hash (filled here)
#   $unicodetemplate   - pack template for one unicode character ('v' or 'n')
# "utf8" is handed over to Utf8ToUnicode; every other encoding is converted
# with the table loaded by FillInHashes. A character that is not in the table,
# or that is cut short by the end of the input, becomes U+FFFD.
# Dies (through Assert) if a lead byte is not covered by any range of the table.
sub OtherToUnicode
	{
	my $inputencoding = shift;
	my $unicode = shift;
	my $other = shift;
	my $foreignCharacters = shift;
	my $unicodeCharacters = shift;
	my $unicodetemplate = shift;
	my $replacementCharacter = 0; # filled in by FillInHashes but not needed in this direction
	my $unicodeReplacementCharacter = pack($unicodetemplate, 0xfffd);
	my @ranges=();

	if ($$inputencoding=~/^utf8$/i)
		{
		return &Utf8ToUnicode($unicode, $other, $unicodetemplate);
		}
	my $bigEndian;
	FillInHashes($foreignCharacters,$unicodeCharacters, $inputencoding, \$replacementCharacter,\@ranges,\$bigEndian);

	my $numOfBytes = length($other);
	my $otherIndex = 0;
	while ($otherIndex < $numOfBytes)
		{
		my $frontByte = ord(substr($other, $otherIndex, 1));
		# These two are per character: carrying them over from the previous
		# character would let a lead byte outside every range slip past the check below.
		my $inRange = 0;
		my $followByte = -1;
		# @ranges is an array of references. Each reference is a reference to an array
		# holding [lower bound, upper bound, number of bytes following the lead byte].
		# If ranges overlap, the last matching one wins.
		foreach my $range (@ranges)
			{
			if (($frontByte >= $range->[0]) && ($frontByte <= $range->[1]))
				{
				$followByte = $range->[2];
				$inRange = 1;
				}
			}
		Assert ($inRange != 0, "cannot figure out the Byte size of the character");
		my $numOfBytesInCharacter = $followByte + 1;
		if ($otherIndex + $numOfBytesInCharacter > $numOfBytes)
			{
			# the input ends in the middle of a character
			$$unicode .= $unicodeReplacementCharacter;
			last;
			}
		# assemble the character code from its bytes
		my $tempByte = 0;
		my $key;
		for ($key = 0; $key < $numOfBytesInCharacter; ++$key)
			{
			my $byte = ord(substr($other, $otherIndex, 1));
			if ($bigEndian)
				{
				$tempByte = ($tempByte << 8) | $byte;
				}
			else
				{
				$tempByte = $tempByte | ($byte << (8*$key));
				}
			$otherIndex++;
			}
		if (exists $unicodeCharacters->{$tempByte})
			{
			$$unicode .= pack $unicodetemplate , $unicodeCharacters->{$tempByte};
			}
		else
			{
			$$unicode .= $unicodeReplacementCharacter;
			}
		}
	}

# Converts 16-bit unicode text to a foreign encoding.
#   parameter 1: reference to the format name of the output
#   parameter 2: reference to the scalar the foreign output is appended to
#   parameter 3: the unicode text (a string of bytes)
#   parameter 4: reference to the unicode-to-foreign hash (filled here)
#   parameter 5: reference to the foreign-to-unicode hash (filled here)
#   parameter 6: pack template of one unicode character ('v' or 'n')
# "utf8" is handed over to UnicodeToUtf8; every other encoding is converted
# with the table loaded by FillInHashes. A unicode character that is not in the
# table is written as the encoding's replacement character.
sub UnicodeToOther
	{
	my ($outputencoding, $other, $unicode, $foreignCharacters, $unicodeCharacters, $unicodetemplate) = @_;

	if ($$outputencoding=~/^utf8$/i)
		{
		return &UnicodeToUtf8($other, $unicode, $unicodetemplate);
		}

	my $replacementCharacter = 0;
	my @ranges = ();
	my $bigEndian;
	FillInHashes($foreignCharacters,$unicodeCharacters, $outputencoding, \$replacementCharacter,\@ranges,\$bigEndian);

	my $foreignTemplate = 'v';
	if ($bigEndian)
		{
		$foreignTemplate = 'n';
		}
	foreach my $unicodeValue (unpack("$unicodetemplate*", $unicode))
		{
		unless (exists($foreignCharacters->{$unicodeValue}))
			{
			$$other .= $replacementCharacter;
			next;
			}
		# Known limitation: a foreign code is written as one byte when it is at most
		# 255 and as two bytes otherwise, so foreign characters longer than two
		# bytes cannot be produced.
		my $foreignValue = $foreignCharacters->{$unicodeValue};
		my $template = ($foreignValue <= 255)? "C": $foreignTemplate;
		$$other .= pack($template, $foreignValue);
		}
	}