uh_parser/XML/SAX/PurePerl/EncodingDetect.pm
author Dario Sestito <darios@symbian.org>
Wed, 09 Jun 2010 18:41:17 +0100
changeset 255 87a39e76ac33
parent 176 6d3c3db11e72
permissions -rw-r--r--
Add stubs for the most commonly used tools
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
176
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     1
# $Id: EncodingDetect.pm,v 1.6 2007/02/07 09:33:50 grant Exp $
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     2
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     3
package XML::SAX::PurePerl; # NB, not ::EncodingDetect!
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     4
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     5
use strict;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     6
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     7
sub encoding_detect {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     8
    my ($parser, $reader) = @_;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
     9
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    10
    my $error = "Invalid byte sequence at start of file";
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    11
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    12
    my $data = $reader->data;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    13
    if ($data =~ /^\x00\x00\xFE\xFF/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    14
        # BO-UCS4-be
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    15
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    16
        $reader->set_encoding('UCS-4BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    17
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    18
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    19
    elsif ($data =~ /^\x00\x00\xFF\xFE/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    20
        # BO-UCS-4-2143
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    21
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    22
        $reader->set_encoding('UCS-4-2143');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    23
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    24
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    25
    elsif ($data =~ /^\x00\x00\x00\x3C/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    26
        $reader->set_encoding('UCS-4BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    27
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    28
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    29
    elsif ($data =~ /^\x00\x00\x3C\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    30
        $reader->set_encoding('UCS-4-2143');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    31
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    32
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    33
    elsif ($data =~ /^\x00\x3C\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    34
        $reader->set_encoding('UCS-4-3412');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    35
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    36
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    37
    elsif ($data =~ /^\x00\x3C\x00\x3F/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    38
        $reader->set_encoding('UTF-16BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    39
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    40
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    41
    elsif ($data =~ /^\xFF\xFE\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    42
        # BO-UCS-4LE
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    43
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    44
        $reader->set_encoding('UCS-4LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    45
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    46
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    47
    elsif ($data =~ /^\xFF\xFE/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    48
        $reader->move_along(2);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    49
        $reader->set_encoding('UTF-16LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    50
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    51
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    52
    elsif ($data =~ /^\xFE\xFF\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    53
        $reader->move_along(4);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    54
        $reader->set_encoding('UCS-4-3412');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    55
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    56
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    57
    elsif ($data =~ /^\xFE\xFF/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    58
        $reader->move_along(2);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    59
        $reader->set_encoding('UTF-16BE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    60
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    61
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    62
    elsif ($data =~ /^\xEF\xBB\xBF/) { # UTF-8 BOM
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    63
        $reader->move_along(3);
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    64
        $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    65
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    66
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    67
    elsif ($data =~ /^\x3C\x00\x00\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    68
        $reader->set_encoding('UCS-4LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    69
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    70
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    71
    elsif ($data =~ /^\x3C\x00\x3F\x00/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    72
        $reader->set_encoding('UTF-16LE');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    73
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    74
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    75
    elsif ($data =~ /^\x3C\x3F\x78\x6D/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    76
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    77
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    78
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    79
    elsif ($data =~ /^\x3C\x3F\x78/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    80
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    81
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    82
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    83
    elsif ($data =~ /^\x3C\x3F/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    84
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    85
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    86
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    87
    elsif ($data =~ /^\x3C/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    88
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    89
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    90
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    91
    elsif ($data =~ /^[\x20\x09\x0A\x0D]+\x3C[^\x3F]/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    92
        # $reader->set_encoding('UTF-8');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    93
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    94
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    95
    elsif ($data =~ /^\x4C\x6F\xA7\x94/) {
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    96
        $reader->set_encoding('EBCDIC');
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    97
        return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    98
    }
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
    99
    
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   100
    warn("Unable to recognise encoding of this document");
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   101
    return;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   102
}
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   103
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   104
1;
6d3c3db11e72 Add Raptor uh parser
Dario Sestito <darios@symbian.org>
parents:
diff changeset
   105