author | Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com> |
Fri, 12 Mar 2010 15:46:37 +0200 | |
branch | RCL_3 |
changeset 5 | d3bac044e0f0 |
parent 4 | 3b1da2848fc7 |
permissions | -rw-r--r-- |
0 | 1 |
/**************************************************************************** |
2 |
** |
|
4
3b1da2848fc7
Revision: 201003
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
0
diff
changeset
|
3 |
** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies). |
0 | 4 |
** All rights reserved. |
5 |
** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 |
** |
|
7 |
** This file is part of the QtXmlPatterns module of the Qt Toolkit. |
|
8 |
** |
|
9 |
** $QT_BEGIN_LICENSE:LGPL$ |
|
10 |
** No Commercial Usage |
|
11 |
** This file contains pre-release code and may not be distributed. |
|
12 |
** You may use this file in accordance with the terms and conditions |
|
13 |
** contained in the Technology Preview License Agreement accompanying |
|
14 |
** this package. |
|
15 |
** |
|
16 |
** GNU Lesser General Public License Usage |
|
17 |
** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 |
** General Public License version 2.1 as published by the Free Software |
|
19 |
** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 |
** packaging of this file. Please review the following information to |
|
21 |
** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 |
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 |
** |
|
24 |
** In addition, as a special exception, Nokia gives you certain additional |
|
25 |
** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 |
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 |
** |
|
28 |
** If you have questions regarding the use of this file, please contact |
|
29 |
** Nokia at qt-info@nokia.com. |
|
30 |
** |
|
31 |
** |
|
32 |
** |
|
33 |
** |
|
34 |
** |
|
35 |
** |
|
36 |
** |
|
37 |
** |
|
38 |
** $QT_END_LICENSE$ |
|
39 |
** |
|
40 |
****************************************************************************/ |
|
41 |
||
42 |
// |
|
43 |
// W A R N I N G |
|
44 |
// ------------- |
|
45 |
// |
|
46 |
// This file is not part of the Qt API. It exists purely as an |
|
47 |
// implementation detail. This header file may change from version to |
|
48 |
// version without notice, or even be removed. |
|
49 |
// |
|
50 |
// We mean it. |
|
51 |
#ifndef Patternist_XQueryTokenizer_H |
|
52 |
#define Patternist_XQueryTokenizer_H |
|
53 |
||
54 |
#include <QHash> |
|
55 |
#include <QSet> |
|
56 |
#include <QStack> |
|
57 |
#include <QString> |
|
58 |
#include <QUrl> |
|
59 |
||
60 |
#include "qtokenizer_p.h" |
|
61 |
||
62 |
QT_BEGIN_HEADER |
|
63 |
||
64 |
QT_BEGIN_NAMESPACE |
|
65 |
||
66 |
namespace QPatternist |
|
67 |
{ |
|
68 |
struct TokenMap; |
|
69 |
||
70 |
/** |
|
71 |
* @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, |
|
72 |
* and delivers tokens to the Bison generated parser. |
|
73 |
* |
|
74 |
* @author Frans Englich <frans.englich@nokia.com> |
|
75 |
*/ |
|
76 |
class XQueryTokenizer : public Tokenizer |
|
77 |
{ |
|
78 |
public: |
|
79 |
/** |
|
80 |
* Tokenizer states. Organized alphabetically. |
|
81 |
*/ |
|
82 |
enum State |
|
83 |
{ |
|
84 |
AfterAxisSeparator, |
|
85 |
AposAttributeContent, |
|
86 |
Axis, |
|
87 |
Default, |
|
88 |
ElementContent, |
|
89 |
EndTag, |
|
90 |
ItemType, |
|
91 |
KindTest, |
|
92 |
KindTestForPI, |
|
93 |
NamespaceDecl, |
|
94 |
NamespaceKeyword, |
|
95 |
OccurrenceIndicator, |
|
96 |
Operator, |
|
97 |
Pragma, |
|
98 |
PragmaContent, |
|
99 |
ProcessingInstructionContent, |
|
100 |
ProcessingInstructionName, |
|
101 |
QuotAttributeContent, |
|
102 |
StartTag, |
|
103 |
VarName, |
|
104 |
XMLComment, |
|
105 |
XMLSpaceDecl, |
|
106 |
XQueryVersion |
|
107 |
}; |
|
108 |
||
109 |
XQueryTokenizer(const QString &query, |
|
110 |
const QUrl &location, |
|
111 |
const State startingState = Default); |
|
112 |
||
113 |
virtual Token nextToken(YYLTYPE *const sourceLocator); |
|
114 |
virtual int commenceScanOnly(); |
|
115 |
virtual void resumeTokenizationFrom(const int position); |
|
116 |
||
117 |
/** |
|
118 |
* Does nothing. |
|
119 |
*/ |
|
120 |
virtual void setParserContext(const ParserContext::Ptr &parseInfo); |
|
121 |
||
122 |
private: |
|
123 |
||
124 |
/** |
|
125 |
* Returns the character corresponding to the builtin reference @p |
|
126 |
* reference. For instance, passing @c gt will give you '>' in return. |
|
127 |
* |
|
128 |
* If @p reference is an invalid character reference, a null QChar is |
|
129 |
* returned. |
|
130 |
* |
|
131 |
* @see QChar::isNull() |
|
132 |
*/ |
|
133 |
QChar charForReference(const QString &reference); |
|
134 |
||
135 |
inline Token tokenAndChangeState(const TokenType code, |
|
136 |
const State state, |
|
137 |
const int advance = 1); |
|
138 |
inline Token tokenAndChangeState(const TokenType code, |
|
139 |
const QString &value, |
|
140 |
const State state); |
|
141 |
inline Token tokenAndAdvance(const TokenType code, |
|
142 |
const int advance = 1); |
|
143 |
QString tokenizeCharacterReference(); |
|
144 |
||
145 |
inline Token tokenizeStringLiteral(); |
|
146 |
inline Token tokenizeNumberLiteral(); |
|
147 |
||
148 |
/** |
|
149 |
* @returns the character @p length characters from the current |
|
150 |
* position. |
|
151 |
*/ |
|
152 |
inline char peekAhead(const int length = 1) const; |
|
153 |
||
154 |
/** |
|
155 |
* @returns whether the stream, starting from @p offset from the |
|
156 |
* current position, matches @p chs. The length of @p chs is @p len. |
|
157 |
*/ |
|
158 |
inline bool aheadEquals(const char *const chs, |
|
159 |
const int len, |
|
160 |
const int offset = 1) const; |
|
161 |
||
162 |
inline Token tokenizeNCName(); |
|
163 |
static inline bool isOperatorKeyword(const TokenType); |
|
164 |
||
165 |
static inline bool isDigit(const char ch); |
|
166 |
static inline Token error(); |
|
167 |
inline TokenType consumeWhitespace(); |
|
168 |
||
169 |
/** |
|
170 |
* @short Returns the character at the current position, converted to |
|
171 |
* @c ASCII. |
|
172 |
* |
|
173 |
* Equivalent to calling: |
|
174 |
* |
|
175 |
* @code |
|
176 |
* current().toAscii(); |
|
177 |
* @endcode |
|
178 |
*/ |
|
179 |
inline char peekCurrent() const; |
|
180 |
||
181 |
/** |
|
182 |
* Disregarding encoding conversion, equivalent to calling: |
|
183 |
* |
|
184 |
* @code |
|
185 |
* peekAhead(0); |
|
186 |
* @endcode |
|
187 |
*/ |
|
188 |
inline const QChar current() const; |
|
189 |
||
190 |
/** |
|
191 |
* @p hadWhitespace is always set to a proper value. |
|
192 |
* |
|
193 |
* @returns the length of whitespace scanned before reaching "::", or |
|
194 |
* -1 if something else was found. |
|
195 |
*/ |
|
196 |
int peekForColonColon() const; |
|
197 |
||
198 |
static inline bool isNCNameStart(const QChar ch); |
|
199 |
static inline bool isNCNameBody(const QChar ch); |
|
200 |
static inline const TokenMap *lookupKeyword(const QString &keyword); |
|
201 |
inline void popState(); |
|
202 |
inline void pushState(const State state); |
|
203 |
inline State state() const; |
|
204 |
inline void setState(const State s); |
|
205 |
static bool isTypeToken(const TokenType t); |
|
206 |
||
207 |
inline Token tokenizeNCNameOrQName(); |
|
208 |
/** |
|
209 |
* Advances m_pos until content is encountered. |
|
210 |
* |
|
211 |
* Returned is the length stretching from m_pos when starting, until |
|
212 |
* @p content is encountered. @p content is not included in the length. |
|
213 |
*/ |
|
214 |
int scanUntil(const char *const content); |
|
215 |
||
216 |
/** |
|
217 |
* Same as calling: |
|
218 |
* @code |
|
219 |
* pushState(currentState()); |
|
220 |
* @endcode |
|
221 |
*/ |
|
222 |
inline void pushState(); |
|
223 |
||
224 |
/** |
|
225 |
* Consumes only whitespace, in the traditional sense. The function exits |
|
226 |
* if non-whitespace is encountered, such as the start of a comment. |
|
227 |
* |
|
228 |
* @returns @c true if the end was reached, otherwise @c false |
|
229 |
*/ |
|
230 |
inline bool consumeRawWhitespace(); |
|
231 |
||
232 |
/** |
|
233 |
* @short Parses comments: <tt>(: comment content :)</tt>. It recurses for |
|
234 |
* parsing nested comments. |
|
235 |
* |
|
236 |
* It is assumed that the start token for the comment, "(:", has |
|
237 |
* already been parsed. |
|
238 |
* |
|
239 |
* Typically, don't call this function, but ignoreWhitespace(). |
|
240 |
* |
|
241 |
* @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath) |
|
242 |
* 2.0, 2.6 Comments</a> |
|
243 |
* @returns |
|
244 |
* - SUCCESS if everything went ok |
|
245 |
* - ERROR if there was an error in parsing one or more comments |
|
246 |
* - END_OF_FILE if the end was reached |
|
247 |
*/ |
|
248 |
Tokenizer::TokenType consumeComment(); |
|
249 |
||
250 |
/** |
|
251 |
* Determines whether @p code is a keyword |
|
252 |
* that is followed by a second keyword. For instance <tt>declare |
|
253 |
* function</tt>. |
|
254 |
*/ |
|
255 |
static inline bool isPhraseKeyword(const TokenType code); |
|
256 |
||
257 |
/** |
|
258 |
* A set of indexes into a QString, the one being passed to |
|
259 |
* normalizeEOL() whose characters shouldn't be normalized. */ |
|
260 |
typedef QSet<int> CharacterSkips; |
|
261 |
||
262 |
/** |
|
263 |
* Returns @p input, normalized according to |
|
264 |
* <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0: |
|
265 |
* An XML Query Language, A.2.3 End-of-Line Handling</a> |
|
266 |
*/ |
|
267 |
static QString normalizeEOL(const QString &input, |
|
268 |
const CharacterSkips &characterSkips); |
|
269 |
||
270 |
inline bool atEnd() const |
|
271 |
{ |
|
272 |
return m_pos == m_length; |
|
273 |
} |
|
274 |
||
275 |
Token nextToken(); |
|
276 |
/** |
|
277 |
* Instead of recognizing and tokenizing embedded expressions in |
|
278 |
* direct attriute constructors, this function is essentially a mini |
|
279 |
* recursive-descent parser that has the necessary logic to recognize |
|
280 |
* embedded expressions and their potentially interfering string literals, in |
|
281 |
* order to scan to the very end of the attribute value, and return the |
|
282 |
* whole as a string. |
|
283 |
* |
|
284 |
* There is of course syntax errors this function will not detect, but |
|
285 |
* that is ok since the attributes will be parsed once more. |
|
286 |
* |
|
287 |
* An inelegant solution, but which gets the job done. |
|
288 |
* |
|
289 |
* @see commenceScanOnly(), resumeTokenizationFrom() |
|
290 |
*/ |
|
291 |
Token attributeAsRaw(const QChar separator, |
|
292 |
int &stack, |
|
293 |
const int startPos, |
|
294 |
const bool inLiteral, |
|
295 |
QString &result); |
|
296 |
||
297 |
const QString m_data; |
|
298 |
const int m_length; |
|
299 |
State m_state; |
|
300 |
QStack<State> m_stateStack; |
|
301 |
int m_pos; |
|
302 |
||
303 |
/** |
|
304 |
* The current line number. |
|
305 |
* |
|
306 |
* The line number and column number both starts at 1. |
|
307 |
*/ |
|
308 |
int m_line; |
|
309 |
||
310 |
/** |
|
311 |
* The offset into m_length for where |
|
312 |
* the current column starts. So m_length - m_columnOffset |
|
313 |
* is the current column. |
|
314 |
* |
|
315 |
* The line number and column number both starts at 1. |
|
316 |
*/ |
|
317 |
int m_columnOffset; |
|
318 |
||
319 |
const NamePool::Ptr m_namePool; |
|
320 |
QStack<Token> m_tokenStack; |
|
321 |
QHash<QString, QChar> m_charRefs; |
|
322 |
bool m_scanOnly; |
|
323 |
||
324 |
Q_DISABLE_COPY(XQueryTokenizer) |
|
325 |
}; |
|
326 |
} |
|
327 |
||
328 |
QT_END_NAMESPACE |
|
329 |
||
330 |
QT_END_HEADER |
|
331 |
||
332 |
#endif |