/*
 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
26 |
|
27 #ifndef HTMLTokenizer_h |
|
28 #define HTMLTokenizer_h |
|
29 |
|
30 #include "AtomicString.h" |
|
31 #include "SegmentedString.h" |
|
32 #include <wtf/Noncopyable.h> |
|
33 #include <wtf/Vector.h> |
|
34 |
|
35 namespace WebCore { |
|
36 |
|
37 class HTMLToken; |
|
38 |
|
39 class HTMLTokenizer : public Noncopyable { |
|
40 public: |
|
41 enum State { |
|
42 DataState, |
|
43 CharacterReferenceInDataState, |
|
44 RCDATAState, |
|
45 CharacterReferenceInRCDATAState, |
|
46 RAWTEXTState, |
|
47 ScriptDataState, |
|
48 PLAINTEXTState, |
|
49 TagOpenState, |
|
50 EndTagOpenState, |
|
51 TagNameState, |
|
52 RCDATALessThanSignState, |
|
53 RCDATAEndTagOpenState, |
|
54 RCDATAEndTagNameState, |
|
55 RAWTEXTLessThanSignState, |
|
56 RAWTEXTEndTagOpenState, |
|
57 RAWTEXTEndTagNameState, |
|
58 ScriptDataLessThanSignState, |
|
59 ScriptDataEndTagOpenState, |
|
60 ScriptDataEndTagNameState, |
|
61 ScriptDataEscapeStartState, |
|
62 ScriptDataEscapeStartDashState, |
|
63 ScriptDataEscapedState, |
|
64 ScriptDataEscapedDashState, |
|
65 ScriptDataEscapedDashDashState, |
|
66 ScriptDataEscapedLessThanSignState, |
|
67 ScriptDataEscapedEndTagOpenState, |
|
68 ScriptDataEscapedEndTagNameState, |
|
69 ScriptDataDoubleEscapeStartState, |
|
70 ScriptDataDoubleEscapedState, |
|
71 ScriptDataDoubleEscapedDashState, |
|
72 ScriptDataDoubleEscapedDashDashState, |
|
73 ScriptDataDoubleEscapedLessThanSignState, |
|
74 ScriptDataDoubleEscapeEndState, |
|
75 BeforeAttributeNameState, |
|
76 AttributeNameState, |
|
77 AfterAttributeNameState, |
|
78 BeforeAttributeValueState, |
|
79 AttributeValueDoubleQuotedState, |
|
80 AttributeValueSingleQuotedState, |
|
81 AttributeValueUnquotedState, |
|
82 CharacterReferenceInAttributeValueState, |
|
83 AfterAttributeValueQuotedState, |
|
84 SelfClosingStartTagState, |
|
85 BogusCommentState, |
|
86 // The ContinueBogusCommentState is not in the HTML5 spec, but we use |
|
87 // it internally to keep track of whether we've started the bogus |
|
88 // comment token yet. |
|
89 ContinueBogusCommentState, |
|
90 MarkupDeclarationOpenState, |
|
91 CommentStartState, |
|
92 CommentStartDashState, |
|
93 CommentState, |
|
94 CommentEndDashState, |
|
95 CommentEndState, |
|
96 CommentEndBangState, |
|
97 CommentEndSpaceState, |
|
98 DOCTYPEState, |
|
99 BeforeDOCTYPENameState, |
|
100 DOCTYPENameState, |
|
101 AfterDOCTYPENameState, |
|
102 AfterDOCTYPEPublicKeywordState, |
|
103 BeforeDOCTYPEPublicIdentifierState, |
|
104 DOCTYPEPublicIdentifierDoubleQuotedState, |
|
105 DOCTYPEPublicIdentifierSingleQuotedState, |
|
106 AfterDOCTYPEPublicIdentifierState, |
|
107 BetweenDOCTYPEPublicAndSystemIdentifiersState, |
|
108 AfterDOCTYPESystemKeywordState, |
|
109 BeforeDOCTYPESystemIdentifierState, |
|
110 DOCTYPESystemIdentifierDoubleQuotedState, |
|
111 DOCTYPESystemIdentifierSingleQuotedState, |
|
112 AfterDOCTYPESystemIdentifierState, |
|
113 BogusDOCTYPEState, |
|
114 CDATASectionState, |
|
115 }; |
|
116 |
|
117 HTMLTokenizer(); |
|
118 ~HTMLTokenizer(); |
|
119 |
|
120 void reset(); |
|
121 |
|
122 // This function returns true if it emits a token. Otherwise, callers |
|
123 // must provide the same (in progress) token on the next call (unless |
|
124 // they call reset() first). |
|
125 bool nextToken(SegmentedString&, HTMLToken&); |
|
126 |
|
127 int lineNumber() const { return m_lineNumber; } |
|
128 int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior. |
|
129 |
|
130 State state() const { return m_state; } |
|
131 void setState(State state) { m_state = state; } |
|
132 |
|
133 // Hack to skip leading newline in <pre>/<listing> for authoring ease. |
|
134 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody |
|
135 void skipLeadingNewLineForListing() { m_skipLeadingNewLineForListing = true; } |
|
136 |
|
137 private: |
|
138 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream |
|
139 class InputStreamPreprocessor : public Noncopyable { |
|
140 public: |
|
141 InputStreamPreprocessor() |
|
142 : m_nextInputCharacter('\0') |
|
143 , m_skipNextNewLine(false) |
|
144 { |
|
145 } |
|
146 |
|
147 UChar nextInputCharacter() const { return m_nextInputCharacter; } |
|
148 |
|
149 // Returns whether we succeeded in peeking at the next character. |
|
150 // The only way we can fail to peek is if there are no more |
|
151 // characters in |source| (after collapsing \r\n, etc). |
|
152 bool peek(SegmentedString& source, int& lineNumber) |
|
153 { |
|
154 m_nextInputCharacter = *source; |
|
155 if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { |
|
156 m_skipNextNewLine = false; |
|
157 source.advancePastNewline(lineNumber); |
|
158 if (source.isEmpty()) |
|
159 return false; |
|
160 m_nextInputCharacter = *source; |
|
161 } |
|
162 if (m_nextInputCharacter == '\r') { |
|
163 m_nextInputCharacter = '\n'; |
|
164 m_skipNextNewLine = true; |
|
165 } else { |
|
166 m_skipNextNewLine = false; |
|
167 // FIXME: The spec indicates that the surrogate pair range as well as |
|
168 // a number of specific character values are parse errors and should be replaced |
|
169 // by the replacement character. We suspect this is a problem with the spec as doing |
|
170 // that filtering breaks surrogate pair handling and causes us not to match Minefield. |
|
171 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) |
|
172 m_nextInputCharacter = 0xFFFD; |
|
173 } |
|
174 return true; |
|
175 } |
|
176 |
|
177 // Returns whether there are more characters in |source| after advancing. |
|
178 bool advance(SegmentedString& source, int& lineNumber) |
|
179 { |
|
180 source.advance(lineNumber); |
|
181 if (source.isEmpty()) |
|
182 return false; |
|
183 return peek(source, lineNumber); |
|
184 } |
|
185 |
|
186 static const UChar endOfFileMarker; |
|
187 |
|
188 private: |
|
189 bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const |
|
190 { |
|
191 return source.isClosed() && source.length() == 1; |
|
192 } |
|
193 |
|
194 // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character |
|
195 UChar m_nextInputCharacter; |
|
196 bool m_skipNextNewLine; |
|
197 }; |
|
198 |
|
199 inline bool processEntity(SegmentedString&); |
|
200 |
|
201 inline void parseError(); |
|
202 inline void bufferCharacter(UChar); |
|
203 inline void bufferCodePoint(unsigned); |
|
204 |
|
205 inline bool emitAndResumeIn(SegmentedString&, State); |
|
206 inline bool emitAndReconsumeIn(SegmentedString&, State); |
|
207 inline bool emitEndOfFile(SegmentedString&); |
|
208 inline bool flushEmitAndResumeIn(SegmentedString&, State); |
|
209 |
|
210 // Return whether we need to emit a character token before dealing with |
|
211 // the buffered end tag. |
|
212 inline bool flushBufferedEndTag(SegmentedString&); |
|
213 inline bool temporaryBufferIs(const String&); |
|
214 |
|
215 // Sometimes we speculatively consume input characters and we don't |
|
216 // know whether they represent end tags or RCDATA, etc. These |
|
217 // functions help manage these state. |
|
218 inline void addToPossibleEndTag(UChar cc); |
|
219 inline void saveEndTagNameIfNeeded(); |
|
220 inline bool isAppropriateEndTag(); |
|
221 |
|
222 inline bool shouldEmitBufferedCharacterToken(const SegmentedString&); |
|
223 |
|
224 State m_state; |
|
225 |
|
226 Vector<UChar, 32> m_appropriateEndTagName; |
|
227 |
|
228 // m_token is owned by the caller. If nextToken is not on the stack, |
|
229 // this member might be pointing to unallocated memory. |
|
230 HTMLToken* m_token; |
|
231 int m_lineNumber; |
|
232 |
|
233 bool m_skipLeadingNewLineForListing; |
|
234 |
|
235 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer |
|
236 Vector<UChar, 32> m_temporaryBuffer; |
|
237 |
|
238 // We occationally want to emit both a character token and an end tag |
|
239 // token (e.g., when lexing script). We buffer the name of the end tag |
|
240 // token here so we remember it next time we re-enter the tokenizer. |
|
241 Vector<UChar, 32> m_bufferedEndTagName; |
|
242 |
|
243 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character |
|
244 UChar m_additionalAllowedCharacter; |
|
245 |
|
246 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream |
|
247 InputStreamPreprocessor m_inputStreamPreprocessor; |
|
248 }; |
|
249 |
|
250 } |
|
251 |
|
252 #endif |