webengine/osswebengine/WebCore/platform/make-charset-table.pl
changeset 0 dd21522fd290
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/webengine/osswebengine/WebCore/platform/make-charset-table.pl	Mon Mar 30 12:54:55 2009 +0300
@@ -0,0 +1,225 @@
+#!/usr/bin/perl -w
+
+# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1.  Redistributions of source code must retain the above copyright
+#     notice, this list of conditions and the following disclaimer. 
+# 2.  Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution. 
+# 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
+#     its contributors may be used to endorse or promote products derived
+#     from this software without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+use strict;
+
+# Maps each normalized charset name read from the IANA charsets file to a
+# reference to the sorted list of all names (canonical name plus aliases) of
+# its entry; every name of one entry refers to the same array.
+my %aliasesFromCharsetsFile;
+# Names already written to the table; used by emit_line to report duplicates.
+my %namesWritten;
+
+# Accumulated text of the generated table entries.
+my $output = "";
+
+# Set to 1 by error(); when set, the program exits with status 1 instead of
+# printing the table.
+my $error = 0;
+
+# Report a problem on STDERR (a newline is appended to the message) and
+# record that the run has failed by setting $error.
+sub error ($)
+{
+    my ($message) = @_;
+
+    print STDERR $message, "\n";
+    $error = 1;
+}
+
+# Append one table entry that maps $name to the constant $prefix$encoding.
+# A name that has been emitted before is reported through error(), but the
+# entry is appended regardless. $flags is accepted from callers and is not
+# written to the output.
+sub emit_line
+{
+    my ($name, $prefix, $encoding, $flags) = @_;
+
+    if ($namesWritten{$name}) {
+        error "$name shows up twice in output";
+    }
+    $namesWritten{$name} = 1;
+
+    $output .= "        { \"$name\", $prefix$encoding },\n";
+}
+
+# Read a platform encodings file and emit table entries for every encoding
+# that lists IANA names.
+#
+# Parameters:
+#   $filename       - path of the platform encodings file
+#   $PlatformPrefix - text placed directly before each platform encoding name
+#                     in the emitted entries
+#
+# Recognized line forms, after '#' comments and trailing whitespace are removed:
+#   PlatformName[, flags]: FirstName, alias, alias, ...
+#   PlatformName[, flags]
+# Any other non-empty line is reported as a syntax error. Problems in the data
+# are reported through error(); the function dies only when the file cannot be
+# opened.
+sub process_platform_encodings
+{
+    my ($filename, $PlatformPrefix) = @_;
+    my $baseFilename = $filename;
+    $baseFilename =~ s|.*/||;
+
+    my %seenPlatformNames;
+    my %seenIANANames;
+
+    # Three-argument open with a lexical handle: $filename is always treated
+    # as a plain path to read, and a failure reports the reason.
+    open my $platformEncodings, '<', $filename or die "Cannot open $filename: $!\n";
+
+    while (<$platformEncodings>) {
+        chomp;
+        s/\#.*$//;
+        s/\s+$//;
+        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
+            my %aliases;
+
+            # Duplicate detection is keyed on the name together with its flags.
+            my $PlatformNameWithFlags = $PlatformName;
+            if ($flags) {
+                $PlatformNameWithFlags .= ", " . $flags;
+            } else {
+                $flags = "NoEncodingFlags";
+            }
+            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
+            $seenPlatformNames{$PlatformNameWithFlags} = 1;
+
+            # Build the aliases list.
+            # Also check that no two names are part of the same entry in the charsets file.
+            my @IANANames = split ", ", $IANANames;
+            my $firstName = "";
+            my $canonicalFirstName = "";
+            my $prevName = "";
+            for my $name (@IANANames) {
+                if ($firstName eq "") {
+                    # The first accepted name may contain letters of either
+                    # case, digits, '-' and '_'.
+                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
+                        error "$name, in $baseFilename, has illegal characters in it";
+                        next;
+                    }
+                    $firstName = $name;
+                } else {
+                    # Every later name must be lowercase alphanumeric and
+                    # strictly greater (string order) than the previous one.
+                    if ($name !~ /^[a-z0-9]+$/) {
+                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
+                        next;
+                    }
+                    if ($name le $prevName) {
+                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
+                    }
+                    $prevName = $name;
+                }
+
+                # Canonical form: lowercase with '-' and '_' removed.
+                my $canonicalName = lc $name;
+                $canonicalName =~ tr/-_//d;
+
+                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";
+
+                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
+                $seenIANANames{$canonicalName} = 1;
+
+                $aliases{$canonicalName} = 1;
+                next if !$aliasesFromCharsetsFile{$canonicalName};
+                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
+                    $aliases{$alias} = 1;
+                }
+                # Comparing the two references with eq tests whether both
+                # names map to the same list in %aliasesFromCharsetsFile.
+                # The "le" test presumably keeps a pair from being reported
+                # in both orders.
+                # NOTE(review): $otherName is used as written in the file, not
+                # in canonical form, so a first name containing uppercase
+                # letters, '-' or '_' is looked up unnormalized here - confirm
+                # whether that is intended.
+                for my $otherName (@IANANames) {
+                    next if $canonicalName eq $otherName;
+                    if ($aliasesFromCharsetsFile{$otherName}
+                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
+                        && $canonicalName le $otherName) {
+                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
+                    }
+                }
+            }
+
+            # write out: the first name as written, then every collected alias
+            # except the canonical form of the first name.
+            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
+            for my $alias (sort keys %aliases) {
+                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
+            }
+        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
+            # Platform encoding without IANA names: only checked for duplicates.
+            my $PlatformName = $1;
+
+            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
+            $seenPlatformNames{$PlatformName} = 1;
+        } elsif (/./) {
+            # Name the file that was actually read, like the other messages do.
+            error "syntax error in $baseFilename, line $.";
+        }
+    }
+
+    close $platformEncodings;
+}
+
+# Register one entry of the IANA charsets file.
+# Every name of the entry (the canonical name and each alias) is mapped in
+# %aliasesFromCharsetsFile to one shared, sorted list of all of those names.
+# Nothing is recorded when $canonical_name is false (for example undef or "").
+sub process_iana_charset
+{
+    my ($canonical_name, @aliases) = @_;
+
+    return unless $canonical_name;
+
+    # One array reference is shared by all names of the entry.
+    my @all_names = ($canonical_name, @aliases);
+    my $group = [ sort @all_names ];
+
+    $aliasesFromCharsetsFile{$_} = $group for @$group;
+}
+
+# Read the IANA character sets file and pass every "Name:" entry, together
+# with the "Alias:" lines that follow it, to process_iana_charset.
+#
+# Names and aliases are lowercased and stripped of every character other than
+# a-z and 0-9 before use. Repeated names are reported through error(); the
+# function dies only when the file cannot be opened.
+sub process_iana_charsets
+{
+    my ($filename) = @_;
+
+    # Three-argument open with a lexical handle: $filename is always treated
+    # as a plain path to read, and a failure reports the reason.
+    open my $charsets, '<', $filename or die "Cannot open $filename: $!\n";
+
+    # Normalized name or alias => canonical name of the entry it was seen in.
+    my %seen;
+
+    my $canonical_name;
+    my @aliases;
+
+    # Aliases whose reappearance under a different entry is not reported.
+    my %exceptions = ( isoir91 => 1, isoir92 => 1 );
+
+    while (<$charsets>) {
+        chomp;
+        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
+            $new_canonical_name = lc $new_canonical_name;
+            $new_canonical_name =~ tr/a-z0-9//cd;
+
+            error "saw $new_canonical_name twice in character-sets.txt" if $seen{$new_canonical_name};
+            $seen{$new_canonical_name} = $new_canonical_name;
+
+            # A new "Name:" line ends the previous entry; record it first.
+            process_iana_charset($canonical_name, @aliases);
+
+            $canonical_name = $new_canonical_name;
+            @aliases = ();
+        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
+            $new_alias = lc $new_alias;
+            $new_alias =~ tr/a-z0-9//cd;
+
+            # do this after normalizing the alias, sometimes character-sets.txt
+            # has weird escape characters, e.g. \b after None
+            next if $new_alias eq "none";
+
+            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name" if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
+            push @aliases, $new_alias if !$seen{$new_alias};
+            $seen{$new_alias} = $canonical_name;
+        }
+    }
+
+    # Record the last entry of the file.
+    process_iana_charset($canonical_name, @aliases);
+
+    close $charsets;
+}
+
+# Program body
+#
+# Arguments: the IANA charsets file, the platform encodings file, and the
+# prefix placed before each platform encoding name in the generated table.
+# The table is printed to standard output; when any error was reported the
+# program exits with status 1 and prints no table.
+
+# Fail with a usage message instead of an unexplained die from open. Only the
+# two file arguments are checked, so two-argument invocations keep working.
+if (@ARGV < 2) {
+    print STDERR "Usage: $0 <iana-charsets-file> <platform-encodings-file> <platform-prefix>\n";
+    exit 1;
+}
+
+process_iana_charsets($ARGV[0]);
+process_platform_encodings($ARGV[1], $ARGV[2]);
+
+exit 1 if $error;
+
+print <<EOF;
+// File generated by make-charset-table.pl. Do not edit!
+
+#include "config.h"
+#include "CharsetData.h"
+
+namespace WebCore {
+
+    const CharsetEntry CharsetTable[] = {
+$output
+        { 0, 0 }
+    };
+
+}
+EOF