uh_parser/XML/SAX/PurePerl/EncodingDetect.pm
author Simon Howkins <simonh@symbian.org>
Thu, 13 May 2010 16:27:37 +0100
changeset 244 2251fde91223
parent 176 6d3c3db11e72
permissions -rw-r--r--
Changed script to use CSV formatted input, rather than TSV. This means that the script can directly process the CSV downloaded from Bugzilla, without any need to use Excel to convert it.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
176
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     1
# $Id: EncodingDetect.pm,v 1.6 2007/02/07 09:33:50 grant Exp $
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     2
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     3
package XML::SAX::PurePerl; # NB, not ::EncodingDetect!
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     4
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     5
use strict;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     6
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     7
sub encoding_detect {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     8
    my ($parser, $reader) = @_;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     9
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    10
    my $error = "Invalid byte sequence at start of file";
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    11
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    12
    my $data = $reader->data;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    13
    if ($data =~ /^\x00\x00\xFE\xFF/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    14
        # BO-UCS4-be
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    15
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    16
        $reader->set_encoding('UCS-4BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    17
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    18
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    19
    elsif ($data =~ /^\x00\x00\xFF\xFE/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    20
        # BO-UCS-4-2143
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    21
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    22
        $reader->set_encoding('UCS-4-2143');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    23
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    24
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    25
    elsif ($data =~ /^\x00\x00\x00\x3C/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    26
        $reader->set_encoding('UCS-4BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    27
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    28
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    29
    elsif ($data =~ /^\x00\x00\x3C\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    30
        $reader->set_encoding('UCS-4-2143');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    31
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    32
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    33
    elsif ($data =~ /^\x00\x3C\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    34
        $reader->set_encoding('UCS-4-3412');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    35
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    36
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    37
    elsif ($data =~ /^\x00\x3C\x00\x3F/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    38
        $reader->set_encoding('UTF-16BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    39
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    40
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    41
    elsif ($data =~ /^\xFF\xFE\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    42
        # BO-UCS-4LE
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    43
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    44
        $reader->set_encoding('UCS-4LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    45
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    46
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    47
    elsif ($data =~ /^\xFF\xFE/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    48
        $reader->move_along(2);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    49
        $reader->set_encoding('UTF-16LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    50
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    51
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    52
    elsif ($data =~ /^\xFE\xFF\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    53
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    54
        $reader->set_encoding('UCS-4-3412');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    55
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    56
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    57
    elsif ($data =~ /^\xFE\xFF/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    58
        $reader->move_along(2);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    59
        $reader->set_encoding('UTF-16BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    60
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    61
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    62
    elsif ($data =~ /^\xEF\xBB\xBF/) { # UTF-8 BOM
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    63
        $reader->move_along(3);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    64
        $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    65
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    66
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    67
    elsif ($data =~ /^\x3C\x00\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    68
        $reader->set_encoding('UCS-4LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    69
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    70
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    71
    elsif ($data =~ /^\x3C\x00\x3F\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    72
        $reader->set_encoding('UTF-16LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    73
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    74
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    75
    elsif ($data =~ /^\x3C\x3F\x78\x6D/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    76
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    77
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    78
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    79
    elsif ($data =~ /^\x3C\x3F\x78/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    80
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    81
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    82
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    83
    elsif ($data =~ /^\x3C\x3F/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    84
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    85
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    86
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    87
    elsif ($data =~ /^\x3C/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    88
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    89
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    90
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    91
    elsif ($data =~ /^[\x20\x09\x0A\x0D]+\x3C[^\x3F]/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    92
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    93
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    94
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    95
    elsif ($data =~ /^\x4C\x6F\xA7\x94/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    96
        $reader->set_encoding('EBCDIC');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    97
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    98
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    99
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   100
    warn("Unable to recognise encoding of this document");
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   101
    return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   102
}
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   103
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   104
1;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   105