uh_parser/XML/SAX/PurePerl/DTDDecls.pm
author William Roberts <williamr@symbian.org>
Fri, 27 Aug 2010 15:27:06 +0100
changeset 281 c62bd4f9dbce
parent 176 6d3c3db11e72
permissions -rw-r--r--
Add delete_builds.pl - a utility for making space quickly on build machines This Perl script deletes some directories known to contain very large files first, before deleting the rest of the build which contains millions of small files. Given multiple builds, it will do this breadth first, so that lost of space is released quickly.

# $Id: DTDDecls.pm,v 1.7 2005/10/14 20:31:20 matt Exp $

package XML::SAX::PurePerl;

use strict;
use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar);

sub elementdecl {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(9);
    return 0 unless $data =~ /^<!ELEMENT/;
    $reader->move_along(9);
    
    $self->skip_whitespace($reader) ||
        $self->parser_error("No whitespace after ELEMENT declaration", $reader);
    
    my $name = $self->Name($reader);
    
    $self->skip_whitespace($reader) ||
        $self->parser_error("No whitespace after ELEMENT's name", $reader);
        
    $self->contentspec($reader, $name);
    
    $self->skip_whitespace($reader);
    
    $reader->match('>') or $self->parser_error("Closing angle bracket not found on ELEMENT declaration", $reader);
    
    return 1;
}

sub contentspec {
    my ($self, $reader, $name) = @_;
    
    my $data = $reader->data(5);
    
    my $model;
    if ($data =~ /^EMPTY/) {
        $reader->move_along(5);
        $model = 'EMPTY';
    }
    elsif ($data =~ /^ANY/) {
        $reader->move_along(3);
        $model = 'ANY';
    }
    else {
        $model = $self->Mixed_or_children($reader);
    }

    if ($model) {
        # call SAX callback now.
        $self->element_decl({Name => $name, Model => $model});
        return 1;
    }
    
    $self->parser_error("contentspec not found in ELEMENT declaration", $reader);
}

sub Mixed_or_children {
    my ($self, $reader) = @_;

    my $data = $reader->data(8);
    $data =~ /^\(/ or return; # $self->parser_error("No opening bracket in Mixed or children", $reader);
    
    if ($data =~ /^\(\s*\#PCDATA/) {
        $reader->match('(');
        $self->skip_whitespace($reader);
        $reader->move_along(7);
        my $model = $self->Mixed($reader);
        return $model;
    }

    # not matched - must be Children
    return $self->children($reader);
}

# Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' '*' )
#               | ( '(' S* PCDATA S* ')' )
sub Mixed {
    my ($self, $reader) = @_;

    # Mixed_or_children already matched '(' S* '#PCDATA'

    my $model = '(#PCDATA';
            
    $self->skip_whitespace($reader);

    my %seen;
    
    while (1) {
        last unless $reader->match('|');
        $self->skip_whitespace($reader);

        my $name = $self->Name($reader) || 
            $self->parser_error("No 'Name' after Mixed content '|'", $reader);

        if ($seen{$name}) {
            $self->parser_error("Element '$name' has already appeared in this group", $reader);
        }
        $seen{$name}++;

        $model .= "|$name";
        
        $self->skip_whitespace($reader);
    }
    
    $reader->match(')') || $self->parser_error("no closing bracket on mixed content", $reader);

    $model .= ")";

    if ($reader->match('*')) {
        $model .= "*";
    }
    
    return $model;
}

# [[47]] Children ::= ChoiceOrSeq Cardinality?
# [[48]] Cp ::= ( QName | ChoiceOrSeq ) Cardinality?
#       ChoiceOrSeq ::= '(' S* Cp ( Choice | Seq )? S* ')'
# [[49]] Choice ::= ( S* '|' S* Cp )+
# [[50]] Seq    ::= ( S* ',' S* Cp )+
#        // Children ::= (Choice | Seq) Cardinality?
#        // Cp ::= ( QName | Choice | Seq) Cardinality?
#        // Choice ::= '(' S* Cp ( S* '|' S* Cp )+ S* ')'
#        // Seq    ::= '(' S* Cp ( S* ',' S* Cp )* S* ')'
# [[51]] Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' MixedCardinality )
#                | ( '(' S* PCDATA S* ')' )
#        Cardinality ::= '?' | '+' | '*'
#        MixedCardinality ::= '*'
sub children {
    my ($self, $reader) = @_;
    
    return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
}

sub ChoiceOrSeq {
    my ($self, $reader) = @_;
    
    $reader->match('(') or $self->parser_error("choice/seq contains no opening bracket", $reader);
    
    my $model = '(';
    
    $self->skip_whitespace($reader);

    $model .= $self->Cp($reader);
    
    if (my $choice = $self->Choice($reader)) {
        $model .= $choice;
    }
    else {
        $model .= $self->Seq($reader);
    }

    $self->skip_whitespace($reader);

    $reader->match(')') or $self->parser_error("choice/seq contains no closing bracket", $reader);

    $model .= ')';
    
    return $model;
}

sub Cardinality {
    my ($self, $reader) = @_;
    # cardinality is always optional
    my $data = $reader->data;
    if ($data =~ /^([\?\+\*])/) {
        $reader->move_along(1);
        return $1;
    }
    return '';
}

sub Cp {
    my ($self, $reader) = @_;

    my $model;
    my $name = eval
    {
	if (my $name = $self->Name($reader)) {
	    return $name . $self->Cardinality($reader);
	}
    };
    return $name if defined $name;
    return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader);
}

sub Choice {
    my ($self, $reader) = @_;
    
    my $model = '';
    $self->skip_whitespace($reader);
    
    while ($reader->match('|')) {
        $self->skip_whitespace($reader);
        $model .= '|';
        $model .= $self->Cp($reader);
        $self->skip_whitespace($reader);
    }

    return $model;
}

sub Seq {
    my ($self, $reader) = @_;
    
    my $model = '';
    $self->skip_whitespace($reader);
    
    while ($reader->match(',')) {
        $self->skip_whitespace($reader);
        my $cp = $self->Cp($reader);
        if ($cp) {
            $model .= ',';
            $model .= $cp;
        }
        $self->skip_whitespace($reader);
    }

    return $model;
}

sub AttlistDecl {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(9);
    if ($data =~ /^<!ATTLIST/) {
        # It's an attlist
        
        $reader->move_along(9);
        
        $self->skip_whitespace($reader) || 
            $self->parser_error("No whitespace after ATTLIST declaration", $reader);
        my $name = $self->Name($reader);

        $self->AttDefList($reader, $name);

        $self->skip_whitespace($reader);
        
        $reader->match('>') or $self->parser_error("Closing angle bracket not found on ATTLIST declaration", $reader);
        
        return 1;
    }
    
    return 0;
}

sub AttDefList {
    my ($self, $reader, $name) = @_;

    1 while $self->AttDef($reader, $name);
}

sub AttDef {
    my ($self, $reader, $el_name) = @_;

    $self->skip_whitespace($reader) || return 0;
    my $att_name = $self->Name($reader) || return 0;
    $self->skip_whitespace($reader) || 
        $self->parser_error("No whitespace after Name in attribute definition", $reader);
    my $att_type = $self->AttType($reader);

    $self->skip_whitespace($reader) ||
        $self->parser_error("No whitespace after AttType in attribute definition", $reader);
    my ($mode, $value) = $self->DefaultDecl($reader);
    
    # fire SAX event here!
    $self->attribute_decl({
            eName => $el_name, 
            aName => $att_name, 
            Type => $att_type, 
            Mode => $mode, 
            Value => $value,
            });
    return 1;
}

sub AttType {
    my ($self, $reader) = @_;

    return $self->StringType($reader) ||
            $self->TokenizedType($reader) ||
            $self->EnumeratedType($reader) ||
            $self->parser_error("Can't match AttType", $reader);
}

sub StringType {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(5);
    return unless $data =~ /^CDATA/;
    $reader->move_along(5);
    return 'CDATA';
}

sub TokenizedType {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(8);
    if ($data =~ /^(IDREFS?|ID|ENTITIES|ENTITY|NMTOKENS?)/) {
        $reader->move_along(length($1));
        return $1;
    }
    return;
}

sub EnumeratedType {
    my ($self, $reader) = @_;
    return $self->NotationType($reader) || $self->Enumeration($reader);
}

sub NotationType {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(8);
    return unless $data =~ /^NOTATION/;
    $reader->move_along(8);
    
    $self->skip_whitespace($reader) ||
        $self->parser_error("No whitespace after NOTATION", $reader);
    $reader->match('(') or $self->parser_error("No opening bracket in notation section", $reader);
    
    $self->skip_whitespace($reader);
    my $model = 'NOTATION (';
    my $name = $self->Name($reader) ||
        $self->parser_error("No name in notation section", $reader);
    $model .= $name;
    $self->skip_whitespace($reader);
    $data = $reader->data;
    while ($data =~ /^\|/) {
        $reader->move_along(1);
        $model .= '|';
        $self->skip_whitespace($reader);
        my $name = $self->Name($reader) ||
            $self->parser_error("No name in notation section", $reader);
        $model .= $name;
        $self->skip_whitespace($reader);
        $data = $reader->data;
    }
    $data =~ /^\)/ or $self->parser_error("No closing bracket in notation section", $reader);
    $reader->move_along(1);
    
    $model .= ')';

    return $model;
}

sub Enumeration {
    my ($self, $reader) = @_;
    
    return unless $reader->match('(');
    
    $self->skip_whitespace($reader);
    my $model = '(';
    my $nmtoken = $self->Nmtoken($reader) ||
        $self->parser_error("No Nmtoken in enumerated declaration", $reader);
    $model .= $nmtoken;
    $self->skip_whitespace($reader);
    my $data = $reader->data;
    while ($data =~ /^\|/) {
        $model .= '|';
        $reader->move_along(1);
        $self->skip_whitespace($reader);
        my $nmtoken = $self->Nmtoken($reader) ||
            $self->parser_error("No Nmtoken in enumerated declaration", $reader);
        $model .= $nmtoken;
        $self->skip_whitespace($reader);
        $data = $reader->data;
    }
    $data =~ /^\)/ or $self->parser_error("No closing bracket in enumerated declaration", $reader);
    $reader->move_along(1);
    
    $model .= ')';

    return $model;
}

sub Nmtoken {
    my ($self, $reader) = @_;
    return $self->Name($reader);
}

sub DefaultDecl {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(9);
    if ($data =~ /^(\#REQUIRED|\#IMPLIED)/) {
        $reader->move_along(length($1));
        return $1;
    }
    my $model = '';
    if ($data =~ /^\#FIXED/) {
        $reader->move_along(6);
        $self->skip_whitespace($reader) || $self->parser_error(
                "no whitespace after FIXED specifier", $reader);
        my $value = $self->AttValue($reader);
        return "#FIXED", $value;
    }
    my $value = $self->AttValue($reader);
    return undef, $value;
}

sub EntityDecl {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(8);
    return 0 unless $data =~ /^<!ENTITY/;
    $reader->move_along(8);
    
    $self->skip_whitespace($reader) || $self->parser_error(
        "No whitespace after ENTITY declaration", $reader);
    
    $self->PEDecl($reader) || $self->GEDecl($reader);
    
    $self->skip_whitespace($reader);
    
    $reader->match('>') or $self->parser_error("No closing '>' in entity definition", $reader);
    
    return 1;
}

sub GEDecl {
    my ($self, $reader) = @_;

    my $name = $self->Name($reader) || $self->parser_error("No entity name given", $reader);
    $self->skip_whitespace($reader) || $self->parser_error("No whitespace after entity name", $reader);

    # TODO: ExternalID calls lexhandler method. Wrong place for it.
    my $value;
    if ($value = $self->ExternalID($reader)) {
        $value .= $self->NDataDecl($reader);
    }
    else {
        $value = $self->EntityValue($reader);
    }

    if ($self->{ParseOptions}{entities}{$name}) {
        warn("entity $name already exists\n");
    } else {
        $self->{ParseOptions}{entities}{$name} = 1;
        $self->{ParseOptions}{expanded_entity}{$name} = $value; # ???
    }
    # do callback?
    return 1;
}

sub PEDecl {
    my ($self, $reader) = @_;
    
    return 0 unless $reader->match('%');

    $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity marker", $reader);
    my $name = $self->Name($reader) || $self->parser_error("No parameter entity name given", $reader);
    $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity name", $reader);
    my $value = $self->ExternalID($reader) ||
                $self->EntityValue($reader) ||
                $self->parser_error("PE is not a value or an external resource", $reader);
    # do callback?
    return 1;
}

my $quotre = qr/[^%&\"]/;
my $aposre = qr/[^%&\']/;

sub EntityValue {
    my ($self, $reader) = @_;
    
    my $data = $reader->data;
    my $quote = '"';
    my $re = $quotre;
    if (!$data =~ /^"/) {
        $data =~ /^'/ or $self->parser_error("Not a quote character", $reader);
        $quote = "'";
        $re = $aposre;
    }
    $reader->move_along(1);
    
    my $value = '';
    
    while (1) {
        my $data = $reader->data;

        $self->parser_error("EOF found while reading entity value", $reader)
            unless length($data);
        
        if ($data =~ /^($re+)/) {
            my $match = $1;
            $value .= $match;
            $reader->move_along(length($match));
        }
        elsif ($reader->match('&')) {
            # if it's a char ref, expand now:
            if ($reader->match('#')) {
                my $char;
                my $ref = '';
                if ($reader->match('x')) {
                    my $data = $reader->data;
                    while (1) {
                        $self->parser_error("EOF looking for reference end", $reader)
                            unless length($data);
                        if ($data !~ /^([0-9a-fA-F]*)/) {
                            last;
                        }
                        $ref .= $1;
                        $reader->move_along(length($1));
                        if (length($1) == length($data)) {
                            $data = $reader->data;
                        }
                        else {
                            last;
                        }
                    }
                    $char = chr_ref(hex($ref));
                    $ref = "x$ref";
                }
                else {
                    my $data = $reader->data;
                    while (1) {
                        $self->parser_error("EOF looking for reference end", $reader)
                            unless length($data);
                        if ($data !~ /^([0-9]*)/) {
                            last;
                        }
                        $ref .= $1;
                        $reader->move_along(length($1));
                        if (length($1) == length($data)) {
                            $data = $reader->data;
                        }
                        else {
                            last;
                        }
                    }
                    $char = chr($ref);
                }
                $reader->match(';') ||
                    $self->parser_error("No semi-colon found after character reference", $reader);
                if ($char !~ $SingleChar) { # match a single character
                    $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader);
                }
                $value .= $char;
            }
            else {
                # entity refs in entities get expanded later, so don't parse now.
                $value .= '&';
            }
        }
        elsif ($reader->match('%')) {
            $value .= $self->PEReference($reader);
        }
        elsif ($reader->match($quote)) {
            # end of attrib
            last;
        }
        else {
            $self->parser_error("Invalid character in attribute value: " . substr($reader->data, 0, 1), $reader);
        }
    }
    
    return $value;
}

sub NDataDecl {
    my ($self, $reader) = @_;
    $self->skip_whitespace($reader) || return '';
    my $data = $reader->data(5);
    return '' unless $data =~ /^NDATA/;
    $reader->move_along(5);
    $self->skip_whitespace($reader) || $self->parser_error("No whitespace after NDATA declaration", $reader);
    my $name = $self->Name($reader) || $self->parser_error("NDATA declaration lacks a proper Name", $reader);
    return " NDATA $name";
}

sub NotationDecl {
    my ($self, $reader) = @_;
    
    my $data = $reader->data(10);
    return 0 unless $data =~ /^<!NOTATION/;
    $reader->move_along(10);
    $self->skip_whitespace($reader) ||
        $self->parser_error("No whitespace after NOTATION declaration", $reader);
    $data = $reader->data;
    my $value = '';
    while(1) {
        $self->parser_error("EOF found while looking for end of NotationDecl", $reader)
            unless length($data);
        
        if ($data =~ /^([^>]*)>/) {
            $value .= $1;
            $reader->move_along(length($1) + 1);
            $self->notation_decl({Name => "FIXME", SystemId => "FIXME", PublicId => "FIXME" });
            last;
        }
        else {
            $value .= $data;
            $reader->move_along(length($data));
            $data = $reader->data;
        }
    }
    return 1;
}

1;