 * strength language and converting the character stream into
 * language token stream. Note: Regular expression syntax
 * (e.g. "file*.tx?") itself is not supported
 */
70 namespace Lex { |
70 namespace Lex { |
71 |
71 |
72 |
72 typedef const wchar_t* token_type_t; |
73 /** |
73 |
74 * Basic token types |
74 |
75 */ |
75 extern token_type_t TOKEN_UNKNOWN; |
76 enum TokenType { |
76 extern token_type_t TOKEN_EOF; |
77 TOKEN_UNKNOWN = 0, |
77 extern token_type_t TOKEN_WS; |
78 TOKEN_EOF = 1, |
78 extern token_type_t TOKEN_COMMENT; |
79 TOKEN_WS, |
79 extern token_type_t TOKEN_ID; |
80 TOKEN_ID, |
80 extern token_type_t TOKEN_STRLIT; |
81 TOKEN_STRLIT, |
81 extern token_type_t TOKEN_INTLIT; |
82 TOKEN_INTLIT, |
82 extern token_type_t TOKEN_REALLIT; |
83 TOKEN_REALLIT, |
83 extern token_type_t TOKEN_LIT; |
84 TOKEN_LIT, |
|
85 |
|
86 TOKEN_LAST_RESERVED // 8 |
|
87 }; |
|
88 |
84 |
89 class LexException : public ITxtCtxtExc { |
85 class LexException : public ITxtCtxtExc { |
90 public: |
86 public: |
91 LexException(const wchar_t* what, const wchar_t* where); |
87 LexException(const wchar_t* what, const wchar_t* where); |
92 virtual ~LexException(); |
88 virtual ~LexException(); |
 * hazardous, if the original tokenized string is modified or
 * released.
 */
107 class Token { |
103 class Token { |
108 public: |
104 public: |
109 Token(int type, const wchar_t* begin, const wchar_t* end); |
105 Token(token_type_t type, const wchar_t* begin, const wchar_t* end); |
110 Token(); |
106 Token(); |
111 int type() const; |
107 const wchar_t* type() const; |
112 const wchar_t* begin() const; |
108 const wchar_t* begin() const; |
113 const wchar_t* end() const; |
109 const wchar_t* end() const; |
114 int length() const; |
110 int length() const; |
115 std::wstring text() const; |
111 std::wstring text() const; |
116 private: |
112 private: |
117 int type_; |
113 token_type_t type_; |
118 const wchar_t* begin_; |
114 const wchar_t* begin_; |
119 const wchar_t* end_; |
115 const wchar_t* end_; |
120 }; |
116 }; |

/**
219 const wchar_t* end_; |
215 const wchar_t* end_; |
220 }; |
216 }; |
221 |
217 |
222 class SymbolTokenizer : public Tokenizer { |
218 class SymbolTokenizer : public Tokenizer { |
223 public: |
219 public: |
224 SymbolTokenizer(int tokenType, const wchar_t* symbol); |
220 SymbolTokenizer(const wchar_t* tokenType, const wchar_t* symbol); |
225 virtual void reset(); |
221 virtual void reset(); |
226 virtual Token get(); |
222 virtual Token get(); |
227 virtual TokenizerState consume(const wchar_t* cursor); |
223 virtual TokenizerState consume(const wchar_t* cursor); |
228 private: |
224 private: |
229 const wchar_t* begin_; |
225 const wchar_t* begin_; |
230 const wchar_t* end_; |
226 const wchar_t* end_; |
231 int tokenType_; |
227 token_type_t tokenType_; |
232 const wchar_t* symbol_; |
228 const wchar_t* symbol_; |
233 }; |
229 }; |
234 |
230 |
|
231 /** |
|
232 * C style line comment, e.g. // comment |
|
233 */ |
|
234 class LineCommentTokenizer : public Tokenizer { |
|
235 public: |
|
236 LineCommentTokenizer(); |
|
237 virtual void reset(); |
|
238 virtual Token get(); |
|
239 virtual TokenizerState consume(const wchar_t* cursor); |
|
240 private: |
|
241 enum State { |
|
242 READY, |
|
243 SLASH_CONSUMED, |
|
244 COMMENT, |
|
245 FINISHED |
|
246 }; |
|
247 State state_; |
|
248 const wchar_t* begin_; |
|
249 const wchar_t* end_; |
|
250 }; |
|
251 |
|
252 /** |
|
253 * C++ style section comments. Like the one's surrounding this comment |
|
254 */ |
|
255 class SectionCommentTokenizer : public Tokenizer { |
|
256 public: |
|
257 SectionCommentTokenizer(); |
|
258 virtual void reset(); |
|
259 virtual Token get(); |
|
260 virtual TokenizerState consume(const wchar_t* cursor); |
|
261 private: |
|
262 enum State { |
|
263 READY, |
|
264 SLASH_CONSUMED, |
|
265 COMMENT, |
|
266 STAR_CONSUMED, |
|
267 FINISH |
|
268 }; |
|
269 State state_; |
|
270 const wchar_t* begin_; |
|
271 const wchar_t* end_; |
|
272 |
|
273 }; |
|
274 |
/**
 * Tokenizes text by using given tokenizers. Text is consumed
 * until no tokenizer is in hungry state e.g., all tokenizers
 * are either failed or finished. In case a number of
 * tokenizers have finished, the longest token is used. If a
301 */ |
341 */ |
302 virtual Token operator++(int) = 0; |
342 virtual Token operator++(int) = 0; |
303 |
343 |
304 virtual ~TokenIterator(); |
344 virtual ~TokenIterator(); |
305 }; |
345 }; |
|
346 |
|
347 class WhitespaceSplitter : public TokenIterator { |
|
348 public: |
|
349 WhitespaceSplitter(const wchar_t* text); |
|
350 virtual operator bool(); |
|
351 virtual Token operator++(int); |
|
352 public: |
|
353 const wchar_t* begin_; |
|
354 const wchar_t* end_; |
|
355 }; |
306 |
356 |
/**
 * Uses tokenizer for converting given text into token stream
 * and provides means for iterating through the token
 * stream's tokens.
};

328 /** |
378 /** |
329 * Filters out all tokens of type TOKEN_WS |
379 * Filters out all tokens of type TOKEN_WS |
330 */ |
380 */ |
331 class WhiteSpaceFilter : public TokenIterator { |
381 class StdFilter : public TokenIterator { |
332 public: |
382 public: |
333 WhiteSpaceFilter(TokenIterator& tokens); |
383 StdFilter(TokenIterator& tokens); |
334 virtual operator bool(); |
384 virtual operator bool(); |
335 virtual Token operator++(int); |
385 virtual Token operator++(int); |
336 private: |
386 private: |
337 void prepareNext(); |
387 void prepareNext(); |
338 private: // data |
388 private: // data |
423 class Lexer : public Lex::TokenReader { |
473 class Lexer : public Lex::TokenReader { |
424 public: |
474 public: |
425 Lexer(Lex::TokenIterator& tokens); |
475 Lexer(Lex::TokenIterator& tokens); |
426 // throws ParseException instead of LexException on EOF. |
476 // throws ParseException instead of LexException on EOF. |
427 virtual Lex::Token operator++(int); |
477 virtual Lex::Token operator++(int); |
428 Lex::Token eat(int tokenType); |
478 Lex::Token eat(Lex::token_type_t tokenType); |
429 void eatEof(); |
479 void eatEof(); |
430 std::wstring eatId(); |
480 std::wstring eatId(); |
431 std::wstring eatString(); |
481 std::wstring eatString(); |
432 long eatInteger(); |
482 long eatInteger(); |
433 double eatReal(); |
483 double eatReal(); |