charconvfw/Charconv/ongoing/Group/UTF.PM
changeset 0 1fb32624e06b
child 16 56cd22a7a1cb
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 #
       
     2 # Copyright (c) 2000 Nokia Corporation and/or its subsidiary(-ies).
       
     3 # All rights reserved.
       
     4 # This component and the accompanying materials are made available
       
     5 # under the terms of "Eclipse Public License v1.0"
       
     6 # which accompanies this distribution, and is available
       
     7 # at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 #
       
     9 # Initial Contributors:
       
    10 # Nokia Corporation - initial contribution.
       
    11 #
       
    12 # Contributors:
       
    13 #
       
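     14 # Description: Perl helpers converting between UTF-8 byte strings and packed UTF-16 code units (Utf8ToUnicode, UnicodeToUtf8).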
       
    15 #
       
    16 
       
    17 use strict;
       
    18 use integer;
       
    19 
       
    20 package UTF;
       
    21 require Exporter;
       
    22 @UTF::ISA=qw(Exporter);
       
    23 @UTF::EXPORT=qw(Utf8ToUnicode UnicodeToUtf8);
       
    24 
       
     # Error code returned by Utf8ToUnicode and UnicodeToUtf8 when the input is ill-formed.
     25 my $KErrorIllFormedInput=-1;
       
    26 
       
    27 sub Utf8ToUnicode
       
    28 	{
       
    29 	my $Unicode = shift;  
       
    30 	my $Utf8 = shift;
       
    31 	my $UnicodeTemplate = shift;
       
    32 	my $Utf8Index = 0;
       
    33 	my $UnicodeIndex = 0;
       
    34 	my $numOfBytes = length($Utf8);
       
    35 	my @Utf8Unpacked = unpack "C*",$Utf8;
       
    36 	my @UnicodeUnpacked = (); 
       
    37 
       
    38 	for (;;)
       
    39 		{
       
    40 		if ($Utf8Index > $#Utf8Unpacked)
       
    41 			{
       
    42 			last;
       
    43 			}
       
    44 
       
    45 		my $currentUtf8Byte = $Utf8Unpacked[$Utf8Index];
       
    46 		
       
    47 		if (($currentUtf8Byte&0x80)==0x00)
       
    48 			{
       
    49 			$UnicodeUnpacked[$UnicodeIndex] = $currentUtf8Byte;
       
    50 			}
       
    51 		
       
    52 		elsif (($currentUtf8Byte&0xe0)==0xc0)
       
    53 			{
       
    54 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x1f)<<6);
       
    55 			++$Utf8Index;
       
    56 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
    57 			if (($currentUtf8Byte&0xc0)!=0x80)
       
    58 				{
       
    59 				return $KErrorIllFormedInput;
       
    60 				}
       
    61 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
       
    62 			$UnicodeUnpacked[$UnicodeIndex] = $currentUnicodeCharacter;
       
    63 			}
       
    64 
       
    65 		elsif (($currentUtf8Byte&0xf0)==0xe0)
       
    66 			{
       
    67 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x0f)<<12);
       
    68 			++$Utf8Index;
       
    69 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
    70 			if (($currentUtf8Byte&0xc0)!=0x80)
       
    71 				{
       
    72 				return $KErrorIllFormedInput;
       
    73 				}
       
    74 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x3f)<<6);
       
    75 			++$Utf8Index;
       
    76 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
    77 			if (($currentUtf8Byte&0xc0)!=0x80)
       
    78 				{
       
    79 				return $KErrorIllFormedInput;
       
    80 				}
       
    81 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
       
    82 			$UnicodeUnpacked[$UnicodeIndex] = $currentUnicodeCharacter;
       
    83 			}
       
    84 
       
    85 		elsif (($currentUtf8Byte&0xf8)==0xf0)
       
    86 			{                                         
       
    87 			my $currentUnicodeCharacter=(($currentUtf8Byte&0x07)<<8);
       
    88 			++$Utf8Index;
       
    89 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
    90 			if (($currentUtf8Byte&0xc0)!=0x80)
       
    91 				{
       
    92 				return $KErrorIllFormedInput;
       
    93 				}
       
    94 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x3f)<<2);
       
    95 			if ($currentUnicodeCharacter<0x0040)
       
    96 				{
       
    97 				return $KErrorIllFormedInput;
       
    98 				}
       
    99 			$currentUnicodeCharacter-=0x0040;
       
   100 			if ($currentUnicodeCharacter>=0x0400)
       
   101 				{
       
   102 				return $KErrorIllFormedInput;
       
   103 				}
       
   104 			++$Utf8Index;
       
   105 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
   106 			if (($currentUtf8Byte&0xc0)!=0x80)
       
   107 				{
       
   108 				return $KErrorIllFormedInput;
       
   109 				}
       
   110 			$currentUnicodeCharacter|=(($currentUtf8Byte&0x30)>>4);
       
   111 			$UnicodeUnpacked[$UnicodeIndex] = (0xd800|$currentUnicodeCharacter);
       
   112 			$currentUnicodeCharacter=(($currentUtf8Byte&0x0f)<<6);
       
   113 			++$Utf8Index;
       
   114 			$currentUtf8Byte=$Utf8Unpacked[$Utf8Index];
       
   115 			if (($currentUtf8Byte&0xc0)!=0x80)
       
   116 				{
       
   117 				return $KErrorIllFormedInput;
       
   118 				}
       
   119 			$currentUnicodeCharacter|=($currentUtf8Byte&0x3f);
       
   120 			++$UnicodeIndex;
       
   121 			$UnicodeUnpacked[$UnicodeIndex] = (0xdc00|$currentUnicodeCharacter);
       
   122 			}
       
   123 		else
       
   124 			{
       
   125 			return $KErrorIllFormedInput;
       
   126 			}
       
   127 		++$UnicodeIndex;
       
   128 		++$Utf8Index;
       
   129 		}
       
   130 	$$Unicode = (); 
       
   131 	$$Unicode = pack "$UnicodeTemplate*", @UnicodeUnpacked;
       
   132 	return $UnicodeIndex;  
       
   133 	}
       
   134 
       
   135 sub UnicodeToUtf8
       
   136 	{
       
   137 	my $Utf8 = shift; 
       
   138 	my $Unicode = shift;
       
   139 	my $UnicodeTemplate = shift;
       
   140 	my $Utf8Index = 0;
       
   141 	my $UnicodeIndex = 0;
       
   142 	my $numOfBytes = length($Unicode);
       
   143 	my @UnicodeUnpacked = unpack "$UnicodeTemplate*", $Unicode;
       
   144 	my @Utf8Unpacked = ();
       
   145 	
       
   146 	for (;;)
       
   147 		{
       
   148 		# exit the loop if no more in the UnicodeUnpacked
       
   149 		if ($UnicodeIndex > $#UnicodeUnpacked)
       
   150 			{
       
   151 			last;
       
   152 			}
       
   153 
       
   154 		my $currentUnicodeCharacter=$UnicodeUnpacked[$UnicodeIndex];
       
   155 		if (($currentUnicodeCharacter&0xff80)==0x0000)
       
   156 			{	
       
   157 			$Utf8Unpacked[$Utf8Index]= $currentUnicodeCharacter;
       
   158 			}
       
   159 		elsif (($currentUnicodeCharacter&0xf800)==0x0000)
       
   160 			{
       
   161 
       
   162 			$Utf8Unpacked[$Utf8Index]= (0xc0 | $currentUnicodeCharacter >> 6);
       
   163 			++$Utf8Index;
       
   164 			$Utf8Unpacked[$Utf8Index]= (0x80 | $currentUnicodeCharacter&0x3f);
       
   165 			}
       
   166 		elsif (($currentUnicodeCharacter&0xfc00)==0xd800)
       
   167 			{
       
   168 			$currentUnicodeCharacter+=0x0040;
       
   169 			$Utf8Unpacked[$Utf8Index]= (0xf0|(($currentUnicodeCharacter>>8)&0x07));
       
   170 			++$Utf8Index;
       
   171 			$Utf8Unpacked[$Utf8Index]= (0x80|(($currentUnicodeCharacter>>2)&0x3f));
       
   172 			my $currentUtf8Byte=(0x80|(($currentUnicodeCharacter&0x03)<<4));
       
   173 			++$UnicodeIndex;
       
   174 			$currentUnicodeCharacter=$UnicodeUnpacked[$UnicodeIndex];
       
   175 			if (($currentUnicodeCharacter&0xfc00)!=0xdc00)
       
   176 				{
       
   177 				return $KErrorIllFormedInput;
       
   178 				}
       
   179 			$currentUtf8Byte|=(($currentUnicodeCharacter>>6)&0x0f);
       
   180 			++$Utf8Index;
       
   181 			$Utf8Unpacked[$Utf8Index]= $currentUtf8Byte;
       
   182 			++$Utf8Index;
       
   183 			$Utf8Unpacked[$Utf8Index]= (0x80| ($currentUnicodeCharacter&0x3f));
       
   184 			}
       
   185 		else
       
   186 			{
       
   187 			$Utf8Unpacked[$Utf8Index]= (0xe0|($currentUnicodeCharacter>>12));
       
   188 			++$Utf8Index;
       
   189 			$Utf8Unpacked[$Utf8Index]= (0x80|(($currentUnicodeCharacter>>6)&0x3f));
       
   190 			++$Utf8Index;
       
   191 			$Utf8Unpacked[$Utf8Index]= (0x80| ($currentUnicodeCharacter&0x3f));
       
   192 			}
       
   193 		++$Utf8Index;
       
   194 		++$UnicodeIndex;
       
   195 		}
       
   196 	$$Utf8 = ();	
       
   197 	$$Utf8 = pack "C*", @Utf8Unpacked;
       
   198 	return $Utf8Index; 
       
   199 
       
   200 	}