|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the QtXmlPatterns module of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 |
|
42 // |
|
43 // W A R N I N G |
|
44 // ------------- |
|
45 // |
|
46 // This file is not part of the Qt API. It exists purely as an |
|
47 // implementation detail. This header file may change from version to |
|
48 // version without notice, or even be removed. |
|
49 // |
|
50 // We mean it. |
|
51 #ifndef Patternist_XQueryTokenizer_H |
|
52 #define Patternist_XQueryTokenizer_H |
|
53 |
|
54 #include <QHash> |
|
55 #include <QSet> |
|
56 #include <QStack> |
|
57 #include <QString> |
|
58 #include <QUrl> |
|
59 |
|
60 #include "qtokenizer_p.h" |
|
61 |
|
62 QT_BEGIN_HEADER |
|
63 |
|
64 QT_BEGIN_NAMESPACE |
|
65 |
|
66 namespace QPatternist |
|
67 { |
|
68 struct TokenMap; |
|
69 |
|
70 /** |
|
71 * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, |
|
72 * and delivers tokens to the Bison generated parser. |
|
73 * |
|
74 * @author Frans Englich <frans.englich@nokia.com> |
|
75 */ |
|
76 class XQueryTokenizer : public Tokenizer |
|
77 { |
|
78 public: |
|
79 /** |
|
80 * Tokenizer states. Organized alphabetically. |
|
81 */ |
|
82 enum State |
|
83 { |
|
84 AfterAxisSeparator, |
|
85 AposAttributeContent, |
|
86 Axis, |
|
87 Default, |
|
88 ElementContent, |
|
89 EndTag, |
|
90 ItemType, |
|
91 KindTest, |
|
92 KindTestForPI, |
|
93 NamespaceDecl, |
|
94 NamespaceKeyword, |
|
95 OccurrenceIndicator, |
|
96 Operator, |
|
97 Pragma, |
|
98 PragmaContent, |
|
99 ProcessingInstructionContent, |
|
100 ProcessingInstructionName, |
|
101 QuotAttributeContent, |
|
102 StartTag, |
|
103 VarName, |
|
104 XMLComment, |
|
105 XMLSpaceDecl, |
|
106 XQueryVersion |
|
107 }; |
|
108 |
|
109 XQueryTokenizer(const QString &query, |
|
110 const QUrl &location, |
|
111 const State startingState = Default); |
|
112 |
|
113 virtual Token nextToken(YYLTYPE *const sourceLocator); |
|
114 virtual int commenceScanOnly(); |
|
115 virtual void resumeTokenizationFrom(const int position); |
|
116 |
|
117 /** |
|
118 * Does nothing. |
|
119 */ |
|
120 virtual void setParserContext(const ParserContext::Ptr &parseInfo); |
|
121 |
|
122 private: |
|
123 |
|
124 /** |
|
125 * Returns the character corresponding to the builtin reference @p |
|
126 * reference. For instance, passing @c gt will give you '>' in return. |
|
127 * |
|
128 * If @p reference is an invalid character reference, a null QChar is |
|
129 * returned. |
|
130 * |
|
131 * @see QChar::isNull() |
|
132 */ |
|
133 QChar charForReference(const QString &reference); |
|
134 |
|
135 inline Token tokenAndChangeState(const TokenType code, |
|
136 const State state, |
|
137 const int advance = 1); |
|
138 inline Token tokenAndChangeState(const TokenType code, |
|
139 const QString &value, |
|
140 const State state); |
|
141 inline Token tokenAndAdvance(const TokenType code, |
|
142 const int advance = 1); |
|
143 QString tokenizeCharacterReference(); |
|
144 |
|
145 inline Token tokenizeStringLiteral(); |
|
146 inline Token tokenizeNumberLiteral(); |
|
147 |
|
148 /** |
|
149 * @returns the character @p length characters from the current |
|
150 * position. |
|
151 */ |
|
152 inline char peekAhead(const int length = 1) const; |
|
153 |
|
154 /** |
|
155 * @returns whether the stream, starting from @p offset from the |
|
156 * current position, matches @p chs. The length of @p chs is @p len. |
|
157 */ |
|
158 inline bool aheadEquals(const char *const chs, |
|
159 const int len, |
|
160 const int offset = 1) const; |
|
161 |
|
162 inline Token tokenizeNCName(); |
|
163 static inline bool isOperatorKeyword(const TokenType); |
|
164 |
|
165 static inline bool isDigit(const char ch); |
|
166 static inline Token error(); |
|
167 inline TokenType consumeWhitespace(); |
|
168 |
|
169 /** |
|
170 * @short Returns the character at the current position, converted to |
|
171 * @c ASCII. |
|
172 * |
|
173 * Equivalent to calling: |
|
174 * |
|
175 * @code |
|
176 * current().toAscii(); |
|
177 * @endcode |
|
178 */ |
|
179 inline char peekCurrent() const; |
|
180 |
|
181 /** |
|
182 * Disregarding encoding conversion, equivalent to calling: |
|
183 * |
|
184 * @code |
|
185 * peekAhead(0); |
|
186 * @endcode |
|
187 */ |
|
188 inline const QChar current() const; |
|
189 |
|
190 /** |
|
191 * @p hadWhitespace is always set to a proper value. |
|
192 * |
|
193 * @returns the length of whitespace scanned before reaching "::", or |
|
194 * -1 if something else was found. |
|
195 */ |
|
196 int peekForColonColon() const; |
|
197 |
|
198 static inline bool isNCNameStart(const QChar ch); |
|
199 static inline bool isNCNameBody(const QChar ch); |
|
200 static inline const TokenMap *lookupKeyword(const QString &keyword); |
|
201 inline void popState(); |
|
202 inline void pushState(const State state); |
|
203 inline State state() const; |
|
204 inline void setState(const State s); |
|
205 static bool isTypeToken(const TokenType t); |
|
206 |
|
207 inline Token tokenizeNCNameOrQName(); |
|
208 /** |
|
209 * Advances m_pos until content is encountered. |
|
210 * |
|
211 * Returned is the length stretching from m_pos when starting, until |
|
212 * @p content is encountered. @p content is not included in the length. |
|
213 */ |
|
214 int scanUntil(const char *const content); |
|
215 |
|
216 /** |
|
217 * Same as calling: |
|
218 * @code |
|
219 * pushState(currentState()); |
|
220 * @endcode |
|
221 */ |
|
222 inline void pushState(); |
|
223 |
|
224 /** |
|
225 * Consumes only whitespace, in the traditional sense. The function exits |
|
226 * if non-whitespace is encountered, such as the start of a comment. |
|
227 * |
|
228 * @returns @c true if the end was reached, otherwise @c false |
|
229 */ |
|
230 inline bool consumeRawWhitespace(); |
|
231 |
|
232 /** |
|
233 * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for |
|
234 * parsing nested comments. |
|
235 * |
|
236 * It is assumed that the start token for the comment, "(:", has |
|
237 * already been parsed. |
|
238 * |
|
239 * Typically, don't call this function, but ignoreWhitespace(). |
|
240 * |
|
241 * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath) |
|
242 * 2.0, 2.6 Comments</a> |
|
243 * @returns |
|
244 * - SUCCESS if everything went ok |
|
245 * - ERROR if there was an error in parsing one or more comments |
|
246 * - END_OF_FILE if the end was reached |
|
247 */ |
|
248 Tokenizer::TokenType consumeComment(); |
|
249 |
|
250 /** |
|
251 * Determines whether @p code is a keyword |
|
252 * that is followed by a second keyword. For instance <tt>declare |
|
253 * function</tt>. |
|
254 */ |
|
255 static inline bool isPhraseKeyword(const TokenType code); |
|
256 |
|
257 /** |
|
258 * A set of indexes into a QString, the one being passed to |
|
259 * normalizeEOL() whose characters shouldn't be normalized. */ |
|
260 typedef QSet<int> CharacterSkips; |
|
261 |
|
262 /** |
|
263 * Returns @p input, normalized according to |
|
264 * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0: |
|
265 * An XML Query Language, A.2.3 End-of-Line Handling</a> |
|
266 */ |
|
267 static QString normalizeEOL(const QString &input, |
|
268 const CharacterSkips &characterSkips); |
|
269 |
|
270 inline bool atEnd() const |
|
271 { |
|
272 return m_pos == m_length; |
|
273 } |
|
274 |
|
275 Token nextToken(); |
|
276 /** |
|
277 * Instead of recognizing and tokenizing embedded expressions in |
|
278 * direct attriute constructors, this function is essentially a mini |
|
279 * recursive-descent parser that has the necessary logic to recognize |
|
280 * embedded expressions and their potentially interfering string literals, in |
|
281 * order to scan to the very end of the attribute value, and return the |
|
282 * whole as a string. |
|
283 * |
|
284 * There is of course syntax errors this function will not detect, but |
|
285 * that is ok since the attributes will be parsed once more. |
|
286 * |
|
287 * An inelegant solution, but which gets the job done. |
|
288 * |
|
289 * @see commenceScanOnly(), resumeTokenizationFrom() |
|
290 */ |
|
291 Token attributeAsRaw(const QChar separator, |
|
292 int &stack, |
|
293 const int startPos, |
|
294 const bool inLiteral, |
|
295 QString &result); |
|
296 |
|
297 const QString m_data; |
|
298 const int m_length; |
|
299 State m_state; |
|
300 QStack<State> m_stateStack; |
|
301 int m_pos; |
|
302 |
|
303 /** |
|
304 * The current line number. |
|
305 * |
|
306 * The line number and column number both starts at 1. |
|
307 */ |
|
308 int m_line; |
|
309 |
|
310 /** |
|
311 * The offset into m_length for where |
|
312 * the current column starts. So m_length - m_columnOffset |
|
313 * is the current column. |
|
314 * |
|
315 * The line number and column number both starts at 1. |
|
316 */ |
|
317 int m_columnOffset; |
|
318 |
|
319 const NamePool::Ptr m_namePool; |
|
320 QStack<Token> m_tokenStack; |
|
321 QHash<QString, QChar> m_charRefs; |
|
322 bool m_scanOnly; |
|
323 |
|
324 Q_DISABLE_COPY(XQueryTokenizer) |
|
325 }; |
|
326 } |
|
327 |
|
328 QT_END_NAMESPACE |
|
329 |
|
330 QT_END_HEADER |
|
331 |
|
332 #endif |