kernel/eka/euser/unicode/perl/UnicodeMaxDecompose.pl
changeset 0 a41df078684a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kernel/eka/euser/unicode/perl/UnicodeMaxDecompose.pl	Mon Oct 19 15:55:17 2009 +0100
@@ -0,0 +1,163 @@
+#
+# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
+# All rights reserved.
+# This component and the accompanying materials are made available
+# under the terms of the License "Eclipse Public License v1.0"
+# which accompanies this distribution, and is available
+# at the URL "http://www.eclipse.org/legal/epl-v10.html".
+#
+# Initial Contributors:
+# Nokia Corporation - initial contribution.
+#
+# Contributors:
+#
+# Description:
+#
+# UnicodeMaxDecompose.pl
+#
+# Adds maximal decompositions of the character and maximal decompositions of
+# its folded variant to the Unicode data.
+#
+# Added as the fourth and fifth fields after the 'Symbian:' marker in the following format:
+#
+# Symbian:<grapheme-role>;<excluded>;<folded>;<max-decomposition>;<folded-decomposition>
+# where each of <max-decomposition> and <folded-decomposition> are strings
+# of hex numbers separated by spaces, representing the complete decomposition
+# of the character and its folded equivalent respectively.
+#
+# Usage:
+# perl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>
+
+use strict;
+
+if (@ARGV)	# input is read from STDIN only, so any argument is a usage error
+	{
+	print STDERR "Usage:\nperl -w UnicodeMaxDecompose.pl < <output-of-UnicodeAddFolded>\n";
+	exit 1;
+	}
+
+# Decompositions as stated in the input, keyed by code; Decompose removes
+# each entry as it deals with it.
+my %StatedDecomposition = ();
+# Maximal decompositions worked out so far, keyed by code.
+my %CompleteDecomposition = ();
+
+# Expands the stated decomposition of $code to its maximal form, recursing
+# into each element, and stores the result in %CompleteDecomposition.
+# Does nothing if $code has no entry left in %StatedDecomposition.
+sub Decompose
+	{
+	my ($code) = @_;
+	return unless exists $StatedDecomposition{$code};
+	my $pending = delete $StatedDecomposition{$code};
+	my @expanded = ();
+	foreach my $element ( split(' ', $pending) )
+		{
+		# Elements that Perl treats as false are left out
+		next unless $element;
+		Decompose($element);
+		my $replacement = exists($CompleteDecomposition{$element})
+			? $CompleteDecomposition{$element}
+			: $element;
+		push @expanded, $replacement;
+		}
+	$CompleteDecomposition{$code} = join(' ', @expanded);
+	}
+
+# Folded form of each code, the code found on each input line (indexed
+# from zero) and the text of every line exactly as read.
+my %Folded = ();
+my %LineToCode = ();
+my @RawLine = ();
+
+my $lineNo = 0;
+while (my $line = <STDIN>)
+	{
+	chomp $line;
+	$lineNo++;
+	# The negative limit stops split discarding trailing empty fields
+	my @attribute = split(/;/, $line, -1);
+	die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
+		if (scalar(@attribute) == 16);
+	if (scalar(@attribute) == 17)
+		{
+		die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeAddFolded been run?")
+			if ($attribute[15] !~ /^[ \t]*symbian:/i);
+		my $code = $attribute[0];
+		die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
+			unless ($code =~ /^1?[0-9a-fA-F]{4,5}$/ && hex($code) < 0x110000);
+		my $decomposition = $attribute[5];
+		die("Decomposition '$decomposition' at line $lineNo is not a valid Unicode decomposition.")
+			unless $decomposition =~ /^[ \t]*(<.*>[ \t]*[0-9a-fA-F])?[0-9a-fA-F \t]*$/;
+		my $folded = $attribute[16];
+		# Anchored so the whole field is checked, not just an empty prefix
+		die ("'$folded' not a valid string of hex values at line $lineNo.")
+			unless $folded =~ /^[0-9a-fA-F\s]*$/;
+		# Store all decompositions that have no tag and at least one value
+		if ($decomposition =~ /^[ \t]*[0-9a-fA-F]/)
+			{
+			$StatedDecomposition{$code} = $decomposition;
+			}
+		if ($folded =~ /[0-9a-fA-F]/)
+			{
+			$Folded{$code} = $folded;
+			}
+		$LineToCode{$lineNo-1} = $code;
+		}
+	elsif ($line !~ /^[ \t]*$/)
+		{
+		die 'Do not understand line '.$lineNo;
+		}
+	$RawLine[$lineNo-1] = $line;
+	}
+
+# Expand every entry of %StatedDecomposition to its maximal form. The key
+# list is taken up front, so Decompose may delete entries as it goes;
+# codes already expanded through recursion are skipped when their turn
+# comes because they are no longer in the hash.
+Decompose($_) foreach keys %StatedDecomposition;
+
+# Now expand the folded strings as well. An element that has a maximal
+# decomposition is replaced by the elements of that decomposition; any
+# other element is carried over unchanged.
+foreach my $foldedCode (keys %Folded)
+	{
+	my @expansion = ();
+	foreach my $element (split(' ', $Folded{$foldedCode}))
+		{
+		my $substitute = $element;
+		$substitute = $CompleteDecomposition{$element}
+			if exists $CompleteDecomposition{$element};
+		# An element with no decomposition contains no whitespace, so
+		# splitting it simply yields the element itself
+		push @expansion, split(' ', $substitute);
+		}
+	$Folded{$foldedCode} = join(' ', @expansion);
+	}
+
+# Finally write every input line back out. A line that carried a code
+# gets two further fields appended: the maximal decomposition of the
+# character, then the maximal decomposition of its folded form.
+foreach my $index (0 .. $#RawLine)
+	{
+	my @field = ($RawLine[$index]);
+	if (exists $LineToCode{$index})
+		{
+		my $code = $LineToCode{$index};
+		# Empty when no complete decomposition was recorded for the code
+		my $maximal = '';
+		if (exists $CompleteDecomposition{$code})
+			{
+			$maximal = $CompleteDecomposition{$code};
+			}
+		# If there is no folded value, but there is a decomposition
+		# sequence, the character must fold to the decomposition
+		# sequence too.
+		my $foldedMaximal = exists($Folded{$code})
+			? $Folded{$code}
+			: $maximal;
+		push @field, $maximal, $foldedMaximal;
+		}
+	# Lines without a code are printed exactly as they were read
+	print join(';', @field), "\n";
+	}