common/tools/raptor/XML/SAX/PurePerl/EncodingDetect.pm
changeset 906 5239d4d0bed1
parent 905 9ed73a51c728
child 907 bab81256b297
equal deleted inserted replaced
905:9ed73a51c728 906:5239d4d0bed1
     1 # $Id: EncodingDetect.pm,v 1.6 2007/02/07 09:33:50 grant Exp $
       
     2 
       
     3 package XML::SAX::PurePerl; # NB, not ::EncodingDetect!
       
     4 
       
     5 use strict;
       
     6 
       
     # Sniff the character encoding of a document from its leading bytes.
     #
     # The first bytes of $reader->data are compared, in order, against a
     # table of byte-order marks and of encoded "<" / "<?" sequences (the
     # patterns appear to follow the autodetection table in Appendix F of the
     # XML 1.0 recommendation -- confirm against the spec if extending it).
     # On the first match the byte-order mark, if there is one, is skipped
     # with $reader->move_along and the encoding name, if there is one, is
     # passed to $reader->set_encoding.
     #
     # Parameters:
     #   $parser - the invocant; not used by this routine.
     #   $reader - object providing data(), move_along($count) and
     #             set_encoding($name).
     #
     # Returns nothing (bare return).  Emits a warning when no signature
     # matches; the reader is not modified in that case.
     sub encoding_detect {
         my ($parser, $reader) = @_;

         # Treat a reader with no buffered data as an empty document so the
         # pattern matches below never operate on an undefined value.
         my $data = $reader->data;
         $data = '' unless defined $data;

         # Each entry: [ pattern, bytes to skip, encoding name ].
         # Order is significant: four-byte signatures must be tried before
         # the two-byte signatures that are prefixes of them.
         # A skip count of 0 means there is no byte-order mark to step over;
         # an undefined encoding means set_encoding is not called at all.
         my @signatures = (
             # UCS-4 byte-order marks.
             [ qr/^\x00\x00\xFE\xFF/, 4, 'UCS-4BE'    ],
             [ qr/^\x00\x00\xFF\xFE/, 4, 'UCS-4-2143' ],

             # "<" in the various UCS-4 byte orders, no byte-order mark.
             [ qr/^\x00\x00\x00\x3C/, 0, 'UCS-4BE'    ],
             [ qr/^\x00\x00\x3C\x00/, 0, 'UCS-4-2143' ],
             [ qr/^\x00\x3C\x00\x00/, 0, 'UCS-4-3412' ],

             # "<?" as big-endian 16-bit units, no byte-order mark.
             [ qr/^\x00\x3C\x00\x3F/, 0, 'UTF-16BE'   ],

             # FF FE: UCS-4LE mark first, then the shorter UTF-16LE mark.
             [ qr/^\xFF\xFE\x00\x00/, 4, 'UCS-4LE'    ],
             [ qr/^\xFF\xFE/,         2, 'UTF-16LE'   ],

             # FE FF: UCS-4 (3412 order) mark first, then UTF-16BE.
             [ qr/^\xFE\xFF\x00\x00/, 4, 'UCS-4-3412' ],
             [ qr/^\xFE\xFF/,         2, 'UTF-16BE'   ],

             # UTF-8 byte-order mark.
             [ qr/^\xEF\xBB\xBF/,     3, 'UTF-8'      ],

             # "<" / "<?" in little-endian encodings, no byte-order mark.
             [ qr/^\x3C\x00\x00\x00/, 0, 'UCS-4LE'    ],
             [ qr/^\x3C\x00\x3F\x00/, 0, 'UTF-16LE'   ],

             # A plain "<" (this also covers "<?", "<?x" and "<?xm"), or
             # leading whitespace followed by "<" and anything but "?":
             # recognised, but the reader's encoding is left untouched.
             [ qr/^\x3C/,                              0, undef ],
             [ qr/^[\x20\x09\x0A\x0D]+\x3C[^\x3F]/,    0, undef ],

             # "<?xm" in EBCDIC.
             [ qr/^\x4C\x6F\xA7\x94/, 0, 'EBCDIC'     ],
         );

         for my $signature (@signatures) {
             my ($pattern, $skip, $encoding) = @{$signature};
             next unless $data =~ $pattern;

             # Skip the byte-order mark before switching encodings.
             $reader->move_along($skip)       if $skip;
             $reader->set_encoding($encoding) if defined $encoding;
             return;
         }

         warn("Unable to recognise encoding of this document");
         return;
     }
       
   103 
       
   104 1;
       
   105