kernel/eka/euser/unicode/perl/UnicodeAddFolded.pl
author Tom Cosgrove <tom.cosgrove@nokia.com>
Fri, 28 May 2010 16:29:07 +0100
changeset 30 8aab599e3476
parent 0 a41df078684a
permissions -rw-r--r--
Fix for bug 2283 (RVCT 4.0 support is missing from PDK 3.0.h) Have multiple extension sections in the bld.inf, one for each version of the compiler. The RVCT version building the tools will build the runtime libraries for its version, but make sure we extract all the other versions from zip archives. Also add the archive for RVCT4.

# Copyright (c) 2002-2009 Nokia Corporation and/or its subsidiary(-ies).
# All rights reserved.
# This component and the accompanying materials are made available
# under the terms of the License "Eclipse Public License v1.0"
# which accompanies this distribution, and is available
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Adds folding information to Unicode data
# Added as the third field after the 'Symbian:' marker in the following format:
# Symbian:<grapheme-role>;<excluded-from-composition>;<folded-form>
# where <folded-form> is null or a sequence of hex unicode values
# separated by spaces representing the folded form of the character.
# Usage:
# perl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>
# 
#

use strict;

if (scalar(@ARGV) != 1)
	{
	print (STDERR "Usage:\nperl -w UnicodeAddFolded.pl CaseFolding.txt < <output-of-UnicodeCompositionEx>\n");
	exit 1;
	}

open(FOLDING, $ARGV[0]) or die("Could not open file $ARGV[0]\n");

my %Fold = ();
my %MappingLine = ();
my $lineNo = 0;
while (<FOLDING>)
	{
	$lineNo++;
	my ($line, $comment) = split(/#/, $_, 2);
	if ($line =~ /^[ \t]*(1?[0-9a-fA-F]{4,5});[ \t]*([LEICSFT]);[ \t]*([0-9a-fA-F][0-9a-fA-F \t]*);[ \t]*$/)
		{
		my $code = hex($1);
		my $type = $2;
		my $folded = $3;
		# We'll deal with Turkic mappings with our own hack.
		# F = Full mappings (fold is longer than one character)
		# T = I = Turkic mapping
		if ($type !~ /[FTI]/ && $folded !~ /[ \t]/)
			{
			die ("$code has two mappings: lines $MappingLine{$code} and $lineNo.")
				if (exists $Fold{$code});
			$Fold{$code} = $folded;
			$MappingLine{$code} = $lineNo;
			}
		}
	elsif ($line !~ /^[ \t]*$/)
		{
		die ("Did not understand line $lineNo of $ARGV[0]");
		}
	}

close FOLDING;

# Turkic hack:
# Map dotted capital I and dotless small I to lower case i.
# This makes all the 'i's fold the same, which isn't very nice for Turkic
# languages, but it at least gives us behaviour consistent across locales
# which does at least map dotted I, and i to the same value, as well
# as mapping I and dotless i to the same value, and mapping I and i
# to the same value.
$Fold{0x49} = '0069';
$Fold{0x130} = '0069';
$Fold{0x131} = '0069';

$lineNo = 0;
while (my $line = <STDIN>)
	{
	chomp $line;
	$lineNo++;
	# Split into fields: make sure trailing null strings are not
	# deleted by adding a dummy final field
	my @attribute = split(/;/, $line.';dummy');
	# Delete the dummy field
	pop @attribute;
	die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
		if (scalar(@attribute) == 15);
	if (scalar(@attribute) == 16)
		{
		die ("Line $lineNo is missing 'Symbian:' entries. Has UnicodeCompositionEx been run?")
			if ($attribute[15] !~ /^[ \t]*symbian:/i);
		my $code = $attribute[0];
		die("First attribute '$code' not a valid Unicode codepoint at line $lineNo")
			unless $code =~ /^1?[0-9a-fA-F]{4,5}$/;
		$code = hex($code);
		$attribute[16] = exists $Fold{$code}? $Fold{$code} : '';
		print join(';', @attribute);
		}
	elsif ($line !~ /^[ \t]*$/)
		{
		die 'Do not understand line '.$lineNo;
		}
	else
		{
		print $line;
		}
	print "\n";
	}