25 #include "cpixtools.h" |
25 #include "cpixtools.h" |
26 |
26 |
27 #include <iostream> |
27 #include <iostream> |
28 #include <sstream> |
28 #include <sstream> |
29 #include <stdlib.h> |
29 #include <stdlib.h> |
|
30 #include "wctype.h" |
|
31 |
|
namespace {

    // Builds the human-readable text for a lex/parse error.
    //
    // Scans 'context' (the full input being tokenized) for the error
    // position 'where' and, optionally, the end position 'where2',
    // inserting the literal marker "*here*" at each. Only the line
    // containing the error is kept; if the error is not on the first
    // line, a 1-based " line N" note is added. The result has the form:
    //   "<what> at line N: \n\"<line text with *here* markers>\""
    //
    // 'where2' may be NULL (single-position errors). If 'where' is never
    // reached before the terminating NUL, the marker lands at the end of
    // the input (this is how EOF errors are rendered).
    std::wstring describeException(std::wstring what,
                                   const wchar_t* context,
                                   const wchar_t* where,
                                   const wchar_t* where2)
    {
        std::wstring excerpt;       // text of the line holding the error
        int lineNo = 0;             // newlines seen before the first marker
        bool markerSeen = false;

        while (true) {
            if (context == where) {
                excerpt += L"*here*";
                markerSeen = true;
                if (!where2) {
                    break;
                }
            }
            if (context == where2 || !*context) {
                // Second marker reached, or end of input: stop here.
                excerpt += L"*here*";
                break;
            }
            if (*context == '\n' && !markerSeen) {
                // Still before the error: restart the excerpt on each line.
                ++lineNo;
                excerpt.clear();
            } else {
                excerpt += *context;
            }
            ++context;
        }

        // Complete the excerpt up to the end of the current line.
        while (*context && *context != '\n' && *context != '\r') {
            excerpt += *context;
            ++context;
        }

        std::wostringstream out;
        out << what << L" at";
        if (lineNo) {
            out << L" line " << (lineNo + 1);
        }
        out << L": \n\"" << excerpt << L"\"";
        return out.str();
    }

}
30 |
76 |
31 namespace Cpt { |
77 namespace Cpt { |
32 |
78 |
33 |
79 |
34 namespace Lex { |
80 namespace Lex { |
|
81 |
|
82 token_type_t TOKEN_UNKNOWN = L"unknown"; |
|
83 token_type_t TOKEN_EOF = L"eof"; |
|
84 token_type_t TOKEN_WS = L"whitespace"; |
|
85 token_type_t TOKEN_COMMENT = L"comment"; |
|
86 token_type_t TOKEN_ID = L"identifier"; |
|
87 token_type_t TOKEN_STRLIT = L"string"; |
|
88 token_type_t TOKEN_INTLIT = L"integer"; |
|
89 token_type_t TOKEN_REALLIT = L"real number"; |
|
90 token_type_t TOKEN_LIT = L"literal"; |
35 |
91 |
36 const wchar_t ESCAPE_SYMBOL = '\\'; |
92 const wchar_t ESCAPE_SYMBOL = '\\'; |
37 |
93 |
38 Tokenizer::~Tokenizer() {} |
94 Tokenizer::~Tokenizer() {} |
39 |
95 |
54 } |
110 } |
55 |
111 |
56 const wchar_t* LexException::wWhat() const throw() { |
112 const wchar_t* LexException::wWhat() const throw() { |
57 return wWhat_.c_str(); |
113 return wWhat_.c_str(); |
58 } |
114 } |
59 |
115 |
60 void LexException::setContext(const wchar_t * context) |
116 void LexException::setContext(const wchar_t * context) { |
61 { |
117 wWhat_ = describeException(wWhat_, context, where_, NULL); |
62 // TODO legacy of implementation of obsoleted describe() - |
118 } |
63 // it can be optimized by doind direct substring - concat |
119 |
64 // operations instead of looping through context |
120 Token::Token(const wchar_t* type, const wchar_t* begin, const wchar_t* end) |
65 std::wstring tmp; |
|
66 tmp += wWhat_; |
|
67 tmp += L" at: \""; |
|
68 for (; ; context++) { |
|
69 if (context == where_) { |
|
70 tmp += L"*here*"; |
|
71 } |
|
72 if (!*context) { |
|
73 break; |
|
74 } |
|
75 tmp += *context; |
|
76 } |
|
77 tmp += L"\""; |
|
78 |
|
79 wWhat_ = tmp; |
|
80 } |
|
81 |
|
82 |
|
83 Token::Token(int type, const wchar_t* begin, const wchar_t* end) |
|
84 : type_(type), begin_(begin), end_(end) { |
121 : type_(type), begin_(begin), end_(end) { |
85 } |
122 } |
86 |
123 |
87 Token::Token() |
124 Token::Token() |
88 : type_(0), begin_(0), end_(0) { |
125 : type_(0), begin_(0), end_(0) { |
89 } |
126 } |
90 |
127 |
91 int Token::type() const { return type_; }; |
128 token_type_t Token::type() const { return type_; }; |
92 const wchar_t* Token::begin() const { return begin_; }; |
129 const wchar_t* Token::begin() const { return begin_; }; |
93 const wchar_t* Token::end() const { return end_; }; |
130 const wchar_t* Token::end() const { return end_; }; |
94 int Token::length() const { return end_ - begin_; }; |
131 int Token::length() const { return end_ - begin_; }; |
95 std::wstring Token::text() const { |
132 std::wstring Token::text() const { |
96 std::wstring ret; |
133 std::wstring ret; |
314 return TOKENIZER_HUNGRY; |
351 return TOKENIZER_HUNGRY; |
315 } else { |
352 } else { |
316 return TOKENIZER_FAILED; |
353 return TOKENIZER_FAILED; |
317 } |
354 } |
318 } |
355 } |
|
356 |
|
357 LineCommentTokenizer::LineCommentTokenizer() : state_( READY ) {} |
|
358 |
|
359 void LineCommentTokenizer::reset() { |
|
360 state_ = READY; |
|
361 } |
|
362 Token LineCommentTokenizer::get() { |
|
363 return Token( TOKEN_COMMENT, begin_, end_ ); |
|
364 } |
|
365 |
|
366 TokenizerState LineCommentTokenizer::consume(const wchar_t* cursor) { |
|
367 switch (state_) { |
|
368 case READY: |
|
369 if (*cursor == '/') { |
|
370 begin_ = cursor; |
|
371 state_ = SLASH_CONSUMED; |
|
372 return TOKENIZER_HUNGRY; |
|
373 } |
|
374 break; |
|
375 case SLASH_CONSUMED: |
|
376 if (*cursor == '/') { |
|
377 state_ = COMMENT; |
|
378 return TOKENIZER_HUNGRY; |
|
379 } |
|
380 break; |
|
381 case COMMENT: |
|
382 if (*cursor == '\n' || *cursor == '\r' || *cursor == '\0') { |
|
383 state_ = FINISHED; |
|
384 end_ = cursor; |
|
385 return TOKENIZER_FINISHED; |
|
386 } |
|
387 return TOKENIZER_HUNGRY; |
|
388 } |
|
389 return TOKENIZER_FAILED; |
|
390 } |
|
391 |
|
392 SectionCommentTokenizer::SectionCommentTokenizer() : state_( READY ) {} |
|
393 |
|
394 void SectionCommentTokenizer::reset() { |
|
395 state_ = READY; |
|
396 } |
|
397 Token SectionCommentTokenizer::get() { |
|
398 return Token( TOKEN_COMMENT, begin_, end_ ); |
|
399 } |
|
400 TokenizerState SectionCommentTokenizer::consume(const wchar_t* cursor) { |
|
401 if (*cursor == '\0') return TOKENIZER_FAILED; |
|
402 switch (state_) { |
|
403 case READY: |
|
404 if (*cursor == '/') { |
|
405 begin_ = cursor; |
|
406 state_ = SLASH_CONSUMED; |
|
407 return TOKENIZER_HUNGRY; |
|
408 } |
|
409 break; |
|
410 case SLASH_CONSUMED: |
|
411 if (*cursor == '*') { |
|
412 state_ = COMMENT; |
|
413 return TOKENIZER_HUNGRY; |
|
414 } |
|
415 break; |
|
416 case COMMENT: |
|
417 if (*cursor == '*') { |
|
418 state_ = STAR_CONSUMED; |
|
419 } |
|
420 return TOKENIZER_HUNGRY; |
|
421 case STAR_CONSUMED: |
|
422 if (*cursor == '/') { |
|
423 end_ = cursor+1; |
|
424 return TOKENIZER_FINISHED; |
|
425 } else { |
|
426 if (*cursor != '*') { |
|
427 state_ = COMMENT; |
|
428 } |
|
429 return TOKENIZER_HUNGRY; |
|
430 } |
|
431 } |
|
432 return TOKENIZER_FAILED; |
|
433 } |
319 |
434 |
320 MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) |
435 MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) |
321 : ownTokenizers_(ownTokenizers) |
436 : ownTokenizers_(ownTokenizers) |
322 { |
437 { |
323 int len = 0; while (tokenizers[len]) len++; |
438 int len = 0; while (tokenizers[len]) len++; |
456 { |
571 { |
457 return multiTokenizer_->consume(cursor); |
572 return multiTokenizer_->consume(cursor); |
458 } |
573 } |
459 |
574 |
460 TokenIterator::~TokenIterator() {} |
575 TokenIterator::~TokenIterator() {} |
|
576 |
|
577 WhitespaceSplitter::WhitespaceSplitter(const wchar_t* text) |
|
578 : begin_( text ), end_( 0 ) {} |
|
579 |
|
580 WhitespaceSplitter::operator bool() { |
|
581 if ( !end_ && *begin_ ) { |
|
582 // skip whitespace |
|
583 while (iswspace(*begin_)) begin_++; |
|
584 end_ = begin_; |
|
585 // consume letters |
|
586 while (*end_ && !iswspace(*end_)) end_++; |
|
587 } |
|
588 return *begin_; |
|
589 } |
|
590 |
|
591 Token WhitespaceSplitter::operator++(int) { |
|
592 if (!*this) throw LexException(L"Out of tokens.", begin_); |
|
593 Token ret(TOKEN_UNKNOWN, begin_, end_); |
|
594 begin_ = end_; |
|
595 end_ = 0; |
|
596 return ret; |
|
597 } |
461 |
598 |
462 Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text) |
599 Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text) |
463 : cursor_(text), |
600 : cursor_(text), |
464 tokenizer_(tokenizer), |
601 tokenizer_(tokenizer), |
465 hasNext_(false) |
602 hasNext_(false) |
502 hasNext_ = true; |
639 hasNext_ = true; |
503 } |
640 } |
504 } |
641 } |
505 } |
642 } |
506 |
643 |
507 WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens) |
644 StdFilter::StdFilter(TokenIterator& tokens) |
508 : tokens_(tokens), next_(), hasNext_(false) {} |
645 : tokens_(tokens), next_(), hasNext_(false) {} |
509 |
646 |
510 WhiteSpaceFilter::operator bool() |
647 StdFilter::operator bool() |
511 { |
648 { |
512 prepareNext(); |
649 prepareNext(); |
513 return hasNext_; |
650 return hasNext_; |
514 } |
651 } |
515 |
652 |
516 Token WhiteSpaceFilter::operator++(int) |
653 Token StdFilter::operator++(int) |
517 { |
654 { |
518 prepareNext(); |
655 prepareNext(); |
519 if (!hasNext_) { |
656 if (!hasNext_) { |
520 throw LexException(L"Out of tokens", 0); |
657 throw LexException(L"Out of tokens", 0); |
521 } |
658 } |
522 hasNext_ = false; |
659 hasNext_ = false; |
523 return next_; |
660 return next_; |
524 } |
661 } |
525 void WhiteSpaceFilter::prepareNext() |
662 void StdFilter::prepareNext() |
526 { |
663 { |
527 while (!hasNext_ && tokens_) { |
664 while (!hasNext_ && tokens_) { |
528 next_ = tokens_++; |
665 next_ = tokens_++; |
529 if (next_.type() != TOKEN_WS) { |
666 if (next_.type() != TOKEN_WS |
|
667 && next_.type() != TOKEN_COMMENT) { |
530 hasNext_ = true; |
668 hasNext_ = true; |
531 } |
669 } |
532 } |
670 } |
533 } |
671 } |
|
672 |
534 |
673 |
535 TokenReader::TokenReader(TokenIterator& tokens) |
674 TokenReader::TokenReader(TokenIterator& tokens) |
536 : tokens_(tokens), |
675 : tokens_(tokens), |
537 location_(0), |
676 location_(0), |
538 forward_(), |
677 forward_(), |
611 return wWhat_.c_str(); |
750 return wWhat_.c_str(); |
612 } |
751 } |
613 |
752 |
614 void ParseException::setContext(const wchar_t * context) |
753 void ParseException::setContext(const wchar_t * context) |
615 { |
754 { |
616 // TODO legacy of implementation of obsoleted describe() - |
755 wWhat_ = describeException(wWhat_, context, where_.begin(), where_.end()); |
617 // it can be optimized by doind direct substring - concat |
|
618 // operations instead of looping through context |
|
619 std::wstring tmp; |
|
620 tmp += wWhat_; |
|
621 tmp += L" at: \""; |
|
622 if (where_.type() == Lex::TOKEN_EOF) { |
|
623 tmp += context; |
|
624 tmp += L"*here*"; |
|
625 } else { |
|
626 for (; ; context++) { |
|
627 if (context == where_.begin()) { |
|
628 tmp += L"*here*"; |
|
629 } |
|
630 if (context == where_.end()) { |
|
631 tmp += L"*here*"; |
|
632 } |
|
633 if (!*context) break; |
|
634 tmp += *context; |
|
635 } |
|
636 } |
|
637 tmp += L"\""; |
|
638 |
|
639 wWhat_ = tmp; |
|
640 } |
756 } |
641 |
757 |
642 namespace Lit { |
758 namespace Lit { |
643 |
759 |
644 std::wstring ParseString(const Lex::Token& token) { |
760 std::wstring ParseString(const Lex::Token& token) { |
704 return Lex::TokenReader::operator++(0); |
820 return Lex::TokenReader::operator++(0); |
705 } |
821 } |
706 throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0)); |
822 throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0)); |
707 } |
823 } |
708 |
824 |
709 Lex::Token Lexer::eat(int tokenType) { |
825 Lex::Token Lexer::eat(Lex::token_type_t tokenType) { |
710 Lex::Token token = ((*this)++); |
826 Lex::Token token = ((*this)++); |
711 if (token.type() != tokenType) { |
827 if (token.type() != tokenType) { |
712 std::wostringstream msg; |
828 std::wostringstream msg; |
713 msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type(); |
829 msg<<"Expected "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type(); |
714 throw ParseException(msg.str().c_str(), token); |
830 throw ParseException(msg.str().c_str(), token); |
715 } |
831 } |
716 return token; |
832 return token; |
717 } |
833 } |
718 std::wstring Lexer::eatId() { |
834 std::wstring Lexer::eatId() { |
745 double Lexer::eatReal() { |
861 double Lexer::eatReal() { |
746 return Lit::ParseReal((*this)++); |
862 return Lit::ParseReal((*this)++); |
747 } |
863 } |
748 |
864 |
749 StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) |
865 StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) |
750 : Lexer(ws_), |
866 : Lexer(filter_), |
751 tokens_(tokenizer, text), |
867 tokens_(tokenizer, text), |
752 ws_(tokens_) |
868 filter_(tokens_) |
753 |
869 |
754 {} |
870 {} |
755 |
871 |
756 |
872 |
757 } // Parser |
873 } // Parser |