WebCore/html/HTMLTokenizer.h
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
       
     3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
       
    15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       
    17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
       
    18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       
    19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       
    20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       
    21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
       
    22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
    23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
    24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
       
    25  */
       
    26 
       
    27 #ifndef HTMLTokenizer_h
       
    28 #define HTMLTokenizer_h
       
    29 
       
    30 #include "AtomicString.h"
       
    31 #include "SegmentedString.h"
       
    32 #include <wtf/Noncopyable.h>
       
    33 #include <wtf/Vector.h>
       
    34 
       
    35 namespace WebCore {
       
    36 
       
    37 class HTMLToken;
       
    38 
       
    39 class HTMLTokenizer : public Noncopyable {
       
    40 public:
       
    41     enum State {
               // States of the tokenizer's state machine.  Apart from
               // ContinueBogusCommentState (see below), the names presumably
               // mirror the tokenizer states of the HTML5 spec -- confirm.
               // No enumerator has an explicit value, so the numeric values
               // follow declaration order; do not reorder casually.
       
    42         DataState,
       
    43         CharacterReferenceInDataState,
       
    44         RCDATAState,
       
    45         CharacterReferenceInRCDATAState,
       
    46         RAWTEXTState,
       
    47         ScriptDataState,
       
    48         PLAINTEXTState,
       
    49         TagOpenState,
       
    50         EndTagOpenState,
       
    51         TagNameState,
       
    52         RCDATALessThanSignState,
       
    53         RCDATAEndTagOpenState,
       
    54         RCDATAEndTagNameState,
       
    55         RAWTEXTLessThanSignState,
       
    56         RAWTEXTEndTagOpenState,
       
    57         RAWTEXTEndTagNameState,
       
    58         ScriptDataLessThanSignState,
       
    59         ScriptDataEndTagOpenState,
       
    60         ScriptDataEndTagNameState,
       
    61         ScriptDataEscapeStartState,
       
    62         ScriptDataEscapeStartDashState,
       
    63         ScriptDataEscapedState,
       
    64         ScriptDataEscapedDashState,
       
    65         ScriptDataEscapedDashDashState,
       
    66         ScriptDataEscapedLessThanSignState,
       
    67         ScriptDataEscapedEndTagOpenState,
       
    68         ScriptDataEscapedEndTagNameState,
       
    69         ScriptDataDoubleEscapeStartState,
       
    70         ScriptDataDoubleEscapedState,
       
    71         ScriptDataDoubleEscapedDashState,
       
    72         ScriptDataDoubleEscapedDashDashState,
       
    73         ScriptDataDoubleEscapedLessThanSignState,
       
    74         ScriptDataDoubleEscapeEndState,
       
    75         BeforeAttributeNameState,
       
    76         AttributeNameState,
       
    77         AfterAttributeNameState,
       
    78         BeforeAttributeValueState,
       
    79         AttributeValueDoubleQuotedState,
       
    80         AttributeValueSingleQuotedState,
       
    81         AttributeValueUnquotedState,
       
    82         CharacterReferenceInAttributeValueState,
       
    83         AfterAttributeValueQuotedState,
       
    84         SelfClosingStartTagState,
       
    85         BogusCommentState,
       
    86         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
       
    87         // it internally to keep track of whether we've started the bogus
       
    88         // comment token yet.
       
    89         ContinueBogusCommentState,
       
    90         MarkupDeclarationOpenState,
       
    91         CommentStartState,
       
    92         CommentStartDashState,
       
    93         CommentState,
       
    94         CommentEndDashState,
       
    95         CommentEndState,
       
    96         CommentEndBangState,
       
    97         CommentEndSpaceState,
       
    98         DOCTYPEState,
       
    99         BeforeDOCTYPENameState,
       
   100         DOCTYPENameState,
       
   101         AfterDOCTYPENameState,
       
   102         AfterDOCTYPEPublicKeywordState,
       
   103         BeforeDOCTYPEPublicIdentifierState,
       
   104         DOCTYPEPublicIdentifierDoubleQuotedState,
       
   105         DOCTYPEPublicIdentifierSingleQuotedState,
       
   106         AfterDOCTYPEPublicIdentifierState,
       
   107         BetweenDOCTYPEPublicAndSystemIdentifiersState,
       
   108         AfterDOCTYPESystemKeywordState,
       
   109         BeforeDOCTYPESystemIdentifierState,
       
   110         DOCTYPESystemIdentifierDoubleQuotedState,
       
   111         DOCTYPESystemIdentifierSingleQuotedState,
       
   112         AfterDOCTYPESystemIdentifierState,
       
   113         BogusDOCTYPEState,
       
   114         CDATASectionState,
       
   115     };
       
   116 
       
   117     HTMLTokenizer();
       
   118     ~HTMLTokenizer();
       
   119 
       
           // Declared here and defined elsewhere; which members it resets is not
           // visible in this header.  Per the nextToken() contract below, calling
           // it releases the caller from re-supplying an in-progress token.
   120     void reset();
       
   121 
       
   122     // This function returns true if it emits a token.  Otherwise, callers
       
   123     // must provide the same (in progress) token on the next call (unless
       
   124     // they call reset() first).
       
   125     bool nextToken(SegmentedString&, HTMLToken&);
       
   126 
       
           // Line number as currently stored in m_lineNumber.
   127     int lineNumber() const { return m_lineNumber; }
       
   128     int columnNumber() const { return 1; } // Always reports column 1.  Matches LegacyHTMLDocumentParser.h behavior.
       
   129 
       
           // Plain accessors for m_state.  setState() assigns the new state
           // directly, without any validation.
   130     State state() const { return m_state; }
       
   131     void setState(State state) { m_state = state; }
       
   132 
       
   133     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
       
   134     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
       
           // This only sets m_skipLeadingNewLineForListing; the code that consumes
           // (and presumably clears) the flag is not in this header.
   135     void skipLeadingNewLineForListing() { m_skipLeadingNewLineForListing = true; }
       
   136 
       
   137 private:
       
   138     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
       
   139     class InputStreamPreprocessor : public Noncopyable {
               // Caches the "next input character" that peek()/advance() read from
               // a SegmentedString and rewrites it on the way: '\r' is reported as
               // '\n' (and a '\n' directly following it is skipped), and '\0' is
               // reported as U+FFFD unless it is the only character left in a
               // closed source.
       
   140     public:
       
               // Starts with '\0' as the cached character and no pending newline skip.
   141         InputStreamPreprocessor()
       
   142             : m_nextInputCharacter('\0')
       
   143             , m_skipNextNewLine(false)
       
   144         {
       
   145         }
       
   146 
       
               // Returns the character cached by the most recent peek()/advance();
               // it does not read from the source again.
   147         UChar nextInputCharacter() const { return m_nextInputCharacter; }
       
   148 
       
   149         // Returns whether we succeeded in peeking at the next character.
       
   150         // The only way we can fail to peek is if there are no more
       
   151         // characters in |source| (after collapsing \r\n, etc).
       
               // On success the (possibly rewritten) character is available from
               // nextInputCharacter().  |lineNumber| is only forwarded to
               // source.advancePastNewline() when a '\n' is skipped.
   152         bool peek(SegmentedString& source, int& lineNumber)
       
   153         {
       
                   // NOTE(review): |source| is dereferenced without an isEmpty()
                   // check; presumably callers only peek at a non-empty source
                   // (advance() below does check) -- confirm.
   154             m_nextInputCharacter = *source;
       
                   // The previous character was '\r' and was already reported as
                   // '\n', so this '\n' is the second half of a "\r\n" pair:
                   // consume it and look at whatever follows.
   155             if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
       
   156                 m_skipNextNewLine = false;
       
   157                 source.advancePastNewline(lineNumber);
       
                       // Nothing follows the skipped '\n'.  m_nextInputCharacter
                       // is left holding '\n' on this path.
   158                 if (source.isEmpty())
       
   159                     return false;
       
   160                 m_nextInputCharacter = *source;
       
   161             }
       
                   // Report '\r' as '\n' and remember to drop a '\n' that
                   // immediately follows it.
   162             if (m_nextInputCharacter == '\r') {
       
   163                 m_nextInputCharacter = '\n';
       
   164                 m_skipNextNewLine = true;
       
   165             } else {
       
   166                 m_skipNextNewLine = false;
       
   167                 // FIXME: The spec indicates that the surrogate pair range as well as
       
   168                 // a number of specific character values are parse errors and should be replaced
       
   169                 // by the replacement character. We suspect this is a problem with the spec as doing
       
   170                 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
       
                       // A '\0' becomes U+FFFD, except when it is the sole
                       // remaining character of a closed source, in which case
                       // it is passed through unchanged.
   171                 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source))
       
   172                     m_nextInputCharacter = 0xFFFD;
       
   173             }
       
   174             return true;
       
   175         }
       
   176 
       
   177         // Returns whether there are more characters in |source| after advancing.
       
               // Consumes the current character, then refreshes the cached
               // character through peek().  Because peek()'s result is returned,
               // false can also mean that only a skipped '\n' remained.
   178         bool advance(SegmentedString& source, int& lineNumber)
       
   179         {
       
   180             source.advance(lineNumber);
       
   181             if (source.isEmpty())
       
   182                 return false;
       
   183             return peek(source, lineNumber);
       
   184         }
       
   185 
       
               // Declared only; the definition and its value are outside this
               // header.  NOTE(review): presumably '\0', given
               // shouldTreatNullAsEndOfFileMarker() below -- confirm.
   186         static const UChar endOfFileMarker;
       
   187 
       
   188     private:
       
               // True when |source| is closed and its length() is exactly 1.
               // peek() calls this while looking at a '\0', so that '\0' is
               // presumably the final character of the input.
   189         bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
       
   190         {
       
   191             return source.isClosed() && source.length() == 1;
       
   192         }
       
   193 
       
   194         // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
       
   195         UChar m_nextInputCharacter;
       
               // Set after a '\r' has been reported as '\n'; makes the next peek()
               // drop a directly following '\n'.
   196         bool m_skipNextNewLine;
       
   197     };
       
   198 
       
   199     inline bool processEntity(SegmentedString&);
           // NOTE(review): the helpers in this section are declared inline but
           // none is defined in this header; presumably they are defined in the
           // implementation file -- confirm.
       
   200 
       
   201     inline void parseError();
       
   202     inline void bufferCharacter(UChar);
       
   203     inline void bufferCodePoint(unsigned);
       
   204 
       
   205     inline bool emitAndResumeIn(SegmentedString&, State);
       
   206     inline bool emitAndReconsumeIn(SegmentedString&, State);
       
   207     inline bool emitEndOfFile(SegmentedString&);
       
   208     inline bool flushEmitAndResumeIn(SegmentedString&, State);
       
   209 
       
   210     // Return whether we need to emit a character token before dealing with
       
   211     // the buffered end tag.
       
   212     inline bool flushBufferedEndTag(SegmentedString&);
       
   213     inline bool temporaryBufferIs(const String&);
       
   214 
       
   215     // Sometimes we speculatively consume input characters and we don't
       
   216     // know whether they represent end tags or RCDATA, etc.  These
       
   217     // functions help manage this state.
       
   218     inline void addToPossibleEndTag(UChar cc);
       
   219     inline void saveEndTagNameIfNeeded();
       
   220     inline bool isAppropriateEndTag();
       
   221 
       
   222     inline bool shouldEmitBufferedCharacterToken(const SegmentedString&);
       
   223 
       
           // Current tokenizer state; read and written through state()/setState().
   224     State m_state;
       
   225 
       
           // NOTE(review): presumably the tag name that isAppropriateEndTag()
           // compares a candidate end tag against -- confirm.
   226     Vector<UChar, 32> m_appropriateEndTagName;
       
   227 
       
   228     // m_token is owned by the caller.  If nextToken is not on the stack,
       
   229     // this member might be pointing to unallocated memory.
       
   230     HTMLToken* m_token;
       
           // Value reported by lineNumber().
   231     int m_lineNumber;
       
   232 
       
           // Set to true by skipLeadingNewLineForListing().
   233     bool m_skipLeadingNewLineForListing;
       
   234 
       
   235     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
       
   236     Vector<UChar, 32> m_temporaryBuffer;
       
   237 
       
   238     // We occasionally want to emit both a character token and an end tag
       
   239     // token (e.g., when lexing script).  We buffer the name of the end tag
       
   240     // token here so we remember it next time we re-enter the tokenizer.
       
   241     Vector<UChar, 32> m_bufferedEndTagName;
       
   242 
       
   243     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
       
   244     UChar m_additionalAllowedCharacter;
       
   245 
       
   246     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
       
   247     InputStreamPreprocessor m_inputStreamPreprocessor;
       
   248 };
       
   249 
       
   250 }
       
   251 
       
   252 #endif