searchengine/util/cpixtools/src/cpixparsetools.cpp
changeset 10 afe194b6b1cd
parent 0 671dee74050a
equal deleted inserted replaced
9:d575fd691cf9 10:afe194b6b1cd
    25 #include "cpixtools.h"
    25 #include "cpixtools.h"
    26 
    26 
    27 #include <iostream>
    27 #include <iostream>
    28 #include <sstream>
    28 #include <sstream>
    29 #include <stdlib.h>
    29 #include <stdlib.h>
       
    30 #include "wctype.h"
       
    31 
       
    namespace {

        /**
         * Formats an error message that quotes the offending line of the
         * processed text and marks the error position(s) with "*here*".
         *
         * The text is scanned from its start. Every '\n' met before the first
         * marker discards the text collected so far and bumps the line
         * counter, so only the line holding the error is quoted. Once the
         * marker(s) are placed, the rest of that line (up to '\n', '\r' or
         * the terminator) is appended.
         *
         * @param what    message describing the problem; it prefixes the result
         * @param context start of the NUL-terminated text that was processed.
         *                May be NULL, in which case what is returned unchanged.
         * @param where   error position / start of the offending token. If it
         *                is never reached, the marker is put at the end of the
         *                text instead.
         * @param where2  end of the offending token, or NULL when only a
         *                single position is known
         * @return what, followed by " at", an optional " line N" (omitted for
         *         the first line) and the quoted line with its markers
         */
        std::wstring describeException(const std::wstring& what,
                                       const wchar_t* context,
                                       const wchar_t* where,
                                       const wchar_t* where2) {
            // Without the source text there is nothing to quote or to scan.
            if (!context) {
                return what;
            }

            std::wstring line;   // quoted text of the line carrying the marker
            int newlines = 0;    // line breaks seen before the first marker
            bool found = false;  // true once the first marker has been placed

            for (; ; context++) {
                if (context == where) {
                    line += L"*here*";
                    found = true;
                    if (!where2) break;  // single position: done scanning
                }
                if (context == where2) {
                    line += L"*here*";
                    break;
                }
                if (!*context) {
                    // Ran off the text without meeting the closing marker:
                    // mark the end of the text.
                    line += L"*here*";
                    break;
                } else if (*context == L'\n' && !found) {
                    // Still ahead of the error: restart the quoted line.
                    newlines++;
                    line.clear();
                } else {
                    line += *context;
                }
            }

            // Complete the quoted line after the last marker.
            for (; *context && *context != L'\n' && *context != L'\r'; context++) {
                line += *context;
            }

            std::wostringstream tmp;
            tmp << what;
            tmp << L" at";
            if (newlines) {
                tmp << L" line " << (newlines + 1);
            }
            tmp << L": \n\"";
            tmp << line;
            tmp << L"\"";
            return tmp.str();
        }

    }
    30 
    76 
    31 namespace Cpt {
    77 namespace Cpt {
    32 
    78 
    33 
    79 
    34     namespace Lex {
    80     namespace Lex {
       
    81     
       
    // Token type tags of the Cpt::Lex namespace. Each tag is initialised from
    // a descriptive wide string literal; Lexer::eat() streams the expected tag
    // into its "Expected ..." error message and token types are compared
    // with != against these tags.
    // NOTE(review): token_type_t is declared outside this file - presumably a
    // const wchar_t* alias; confirm in the header.
    82 		token_type_t TOKEN_UNKNOWN = L"unknown";
       
    83 		token_type_t TOKEN_EOF = L"eof";
       
    84 		token_type_t TOKEN_WS = L"whitespace"; 
       
    85 		token_type_t TOKEN_COMMENT = L"comment";  
       
    86 		token_type_t TOKEN_ID = L"identifier";	
       
    87 		token_type_t TOKEN_STRLIT = L"string";
       
    88 		token_type_t TOKEN_INTLIT = L"integer";
       
    89 		token_type_t TOKEN_REALLIT = L"real number";
       
    90 		token_type_t TOKEN_LIT = L"literal";
    35 	
    91 	
    // The backslash character.
    // NOTE(review): presumably the escape prefix recognised inside literals;
    // its use is not visible in this excerpt - confirm against the tokenizers.
    36         const wchar_t ESCAPE_SYMBOL = '\\';
    92         const wchar_t ESCAPE_SYMBOL = '\\';
    37 	
    93 	
    // Tokenizer destructor, defined out of line with an empty body.
    38         Tokenizer::~Tokenizer() {}
    94         Tokenizer::~Tokenizer() {}
    39 	
    95 	
    54         }
   110         }
    55 
   111 
    // Returns the wide-character description held in wWhat_.
    // The pointer comes from wWhat_.c_str(), so it stays valid only until
    // wWhat_ is modified (setContext() reassigns it) or destroyed.
    56         const wchar_t* LexException::wWhat() const throw() {
   112         const wchar_t* LexException::wWhat() const throw() {
    57             return wWhat_.c_str();
   113             return wWhat_.c_str();
    58         }
   114         }
    59 
   115         
    // LexException::setContext(context): replaces wWhat_ with the text built
    // by describeException() from the current wWhat_, the given context and
    // where_ as the single marker position (no second marker is passed).
    // Because the current wWhat_ is the input, every call wraps the message
    // once more.
    //
    // Token::Token(type, begin, end) further down stores its three arguments
    // in type_, begin_ and end_ without copying the text between them.
    60         void LexException::setContext(const wchar_t * context)
   116         void LexException::setContext(const wchar_t * context) {
    61         {
   117 			wWhat_ = describeException(wWhat_, context, where_, NULL); 
    62             // TODO legacy of implementation of obsoleted describe() -
   118         }
    63             // it can be optimized by doind direct substring - concat
   119 
    64             // operations instead of looping through context
   120         Token::Token(const wchar_t* type, const wchar_t* begin, const wchar_t* end) 
    65             std::wstring tmp;
       
    66             tmp += wWhat_; 
       
    67             tmp += L" at: \""; 
       
    68             for (; ; context++) {
       
    69                 if (context == where_) {
       
    70                     tmp += L"*here*";
       
    71                 }
       
    72                 if (!*context) {
       
    73                     break; 
       
    74                 }
       
    75                 tmp += *context;
       
    76             }
       
    77             tmp += L"\"";
       
    78 
       
    79             wWhat_ = tmp;
       
    80         }
       
    81 
       
    82 
       
    83         Token::Token(int type, const wchar_t* begin, const wchar_t* end) 
       
    84             : type_(type), begin_(begin), end_(end) {
   121             : type_(type), begin_(begin), end_(end) {
    85         }
   122         }
    86 
   123 
    // Default constructor: type_, begin_ and end_ are all initialised with 0,
    // so length() of such a token is 0.
    87         Token::Token() 
   124         Token::Token() 
    88             : type_(0), begin_(0), end_(0) {
   125             : type_(0), begin_(0), end_(0) {
    89         }
   126         }
    90 		
   127 		
    // Plain accessors for the token's type tag and its [begin_, end_) range.
    // length() is the pointer difference end_ - begin_, i.e. a count of
    // wchar_t elements, converted to int.
    91         int Token::type() const { return type_; }; 
   128         token_type_t Token::type() const { return type_; }; 
    92         const wchar_t* Token::begin() const { return begin_; };
   129         const wchar_t* Token::begin() const { return begin_; };
    93         const wchar_t* Token::end() const { return end_; };
   130         const wchar_t* Token::end() const { return end_; };
    94         int Token::length() const { return end_ - begin_; };
   131         int Token::length() const { return end_ - begin_; };
    95         std::wstring Token::text() const {
   132         std::wstring Token::text() const {
    96             std::wstring ret;
   133             std::wstring ret;
   287                 return TOKENIZER_FINISHED; 
   324                 return TOKENIZER_FINISHED; 
   288             } 
   325             } 
   289             return TOKENIZER_HUNGRY; 
   326             return TOKENIZER_HUNGRY; 
   290         }
   327         }
   291 
   328 
   // Remembers the token type and the symbol text passed by the caller in
   // tokenType_ and symbol_.
   // NOTE(review): whether symbol_ copies the text or only keeps the pointer
   // depends on its declared type, which is not visible here - check the header.
   292         SymbolTokenizer::SymbolTokenizer(int tokenType, const wchar_t* symbol) 
   329         SymbolTokenizer::SymbolTokenizer(token_type_t tokenType, const wchar_t* symbol) 
   293             : tokenType_( tokenType ), 
   330             : tokenType_( tokenType ), 
   294               symbol_( symbol ) 
   331               symbol_( symbol ) 
   295         {
   332         {
   296         }
   333         }
   297 		
   334 		
   314                 return TOKENIZER_HUNGRY; 
   351                 return TOKENIZER_HUNGRY; 
   315             } else {
   352             } else {
   316                 return TOKENIZER_FAILED; 
   353                 return TOKENIZER_FAILED; 
   317             }
   354             }
   318         }
   355         }
       
   356         
       
   // Constructs the tokenizer in the READY state; begin_ and end_ are not
   // initialised here, consume() sets them.
   357         LineCommentTokenizer::LineCommentTokenizer() : state_( READY ) {}
       
   358         
       
   // Puts the state machine back into READY so that a new comment can be
   // recognised. begin_ and end_ are left untouched.
   359         void LineCommentTokenizer::reset() {
       
   360         	state_ = READY; 
       
   361         }
       
   // Returns a TOKEN_COMMENT token spanning [begin_, end_). Both pointers are
   // only set by consume(), end_ only when it reports TOKENIZER_FINISHED.
   362         Token LineCommentTokenizer::get() {
       
   363         	return Token( TOKEN_COMMENT, begin_, end_ ); 
       
   364         }
       
   365         
       
   366         TokenizerState LineCommentTokenizer::consume(const wchar_t* cursor) {
       
   367         	switch (state_) {
       
   368         		case READY: 
       
   369         			if (*cursor == '/') {
       
   370 						begin_ = cursor; 
       
   371 						state_ = SLASH_CONSUMED; 
       
   372 						return TOKENIZER_HUNGRY;
       
   373         			}
       
   374         			break;
       
   375         		case SLASH_CONSUMED:
       
   376 					if (*cursor == '/') {
       
   377 						state_ = COMMENT;
       
   378 						return TOKENIZER_HUNGRY; 
       
   379 					}
       
   380 					break; 
       
   381         		case COMMENT:
       
   382         			if (*cursor == '\n' || *cursor == '\r' || *cursor == '\0') {
       
   383 						state_ = FINISHED; 
       
   384 						end_ = cursor; 
       
   385 						return TOKENIZER_FINISHED;
       
   386         			}
       
   387 					return TOKENIZER_HUNGRY; 
       
   388         	}
       
   389         	return TOKENIZER_FAILED; 
       
   390         }
       
   391 
       
   // Constructs the tokenizer in the READY state; begin_ and end_ are not
   // initialised here, consume() sets them.
   392         SectionCommentTokenizer::SectionCommentTokenizer() : state_( READY ) {}
       
   393            
       
   // Puts the state machine back into READY. consume() never leaves its final
   // state on its own after finishing, so this must be called before the
   // tokenizer is reused. begin_ and end_ are left untouched.
   394         void SectionCommentTokenizer::reset() {
       
   395         	state_ = READY; 
       
   396         }
       
   // Returns a TOKEN_COMMENT token spanning [begin_, end_). consume() sets
   // end_ one past the closing '/', so the token includes both delimiters.
   397         Token SectionCommentTokenizer::get() {
       
   398         	return Token( TOKEN_COMMENT, begin_, end_ );
       
   399         }
       
   400         TokenizerState SectionCommentTokenizer::consume(const wchar_t* cursor) {
       
   401 			if (*cursor == '\0') return TOKENIZER_FAILED;
       
   402         	switch (state_) {
       
   403         		case READY: 
       
   404         			if (*cursor == '/') {
       
   405 						begin_ = cursor; 
       
   406 						state_ = SLASH_CONSUMED; 
       
   407 						return TOKENIZER_HUNGRY;
       
   408         			}
       
   409         			break;
       
   410         		case SLASH_CONSUMED: 
       
   411 					if (*cursor == '*') {
       
   412 						state_ = COMMENT;
       
   413 						return TOKENIZER_HUNGRY; 
       
   414 					}
       
   415 					break; 
       
   416         		case COMMENT:
       
   417         			if (*cursor == '*') {
       
   418 						state_ = STAR_CONSUMED; 
       
   419         			}
       
   420 					return TOKENIZER_HUNGRY; 
       
   421         		case STAR_CONSUMED: 
       
   422         			if (*cursor == '/') {
       
   423 						end_ = cursor+1; 
       
   424 						return TOKENIZER_FINISHED;
       
   425         			} else {
       
   426 						if (*cursor != '*') {
       
   427 							state_ = COMMENT;
       
   428 	        			}
       
   429 						return TOKENIZER_HUNGRY;
       
   430         			}
       
   431         	}
       
   432         	return TOKENIZER_FAILED; 
       
   433         }
   319 		
   434 		
   320         MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) 
   435         MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) 
   321             : ownTokenizers_(ownTokenizers)
   436             : ownTokenizers_(ownTokenizers)
   322         {
   437         {
   323             int len = 0; while (tokenizers[len]) len++; 
   438             int len = 0; while (tokenizers[len]) len++; 
   456         {
   571         {
   457             return multiTokenizer_->consume(cursor);
   572             return multiTokenizer_->consume(cursor);
   458         }
   573         }
   459             
   574             
   // TokenIterator destructor, defined out of line with an empty body.
   460         TokenIterator::~TokenIterator() {}
   575         TokenIterator::~TokenIterator() {}
       
   576         
       
   // Starts splitting at text. Only the pointer is stored (begin_); end_ == 0
   // means that the next token has not been located yet. The text is
   // dereferenced later by operator bool(), so it must be NUL-terminated and
   // stay alive while the splitter is used.
   577         WhitespaceSplitter::WhitespaceSplitter(const wchar_t* text) 
       
   578         : begin_( text ), end_( 0 ) {}
       
   579         
       
   580         WhitespaceSplitter::operator bool() {
       
   581         	if ( !end_ && *begin_ ) {
       
   582 				// skip whitespace
       
   583 				while (iswspace(*begin_)) begin_++;
       
   584 				end_ = begin_;
       
   585 				// consume letters
       
   586 				while (*end_ && !iswspace(*end_)) end_++; 
       
   587         	}
       
   588         	return *begin_; 
       
   589         }
       
   590         
       
   591         Token WhitespaceSplitter::operator++(int) {
       
   592         	if (!*this) throw LexException(L"Out of tokens.", begin_);
       
   593         	Token ret(TOKEN_UNKNOWN, begin_, end_); 
       
   594         	begin_ = end_; 
       
   595         	end_ = 0; 
       
   596         	return ret; 
       
   597         }
   461 
   598 
   462         Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
   599         Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
   463             :	cursor_(text),
   600             :	cursor_(text),
   464                 tokenizer_(tokenizer), 
   601                 tokenizer_(tokenizer), 
   465                 hasNext_(false)
   602                 hasNext_(false)
   502                     hasNext_ = true; 
   639                     hasNext_ = true; 
   503                 }
   640                 }
   504             }
   641             }
   505         }
   642         }
   506 
   643 
   // Wraps the given token source. tokens_ is initialised from the reference,
   // next_ is a default-constructed Token and hasNext_ == false marks that no
   // token is buffered yet.
   507         WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens) 
   644         StdFilter::StdFilter(TokenIterator& tokens) 
   508             :	tokens_(tokens), next_(), hasNext_(false) {}
   645             :	tokens_(tokens), next_(), hasNext_(false) {}
   509 		
   646 		
   // True when another unfiltered token is available. Calls prepareNext(),
   // which may pull tokens from the underlying iterator and buffer one in
   // next_.
   510         WhiteSpaceFilter::operator bool()
   647         StdFilter::operator bool()
   511         {
   648         {
   512             prepareNext();
   649             prepareNext();
   513             return hasNext_; 
   650             return hasNext_; 
   514         }
   651         }
   515 		
   652 		
   // Returns the next unfiltered token and releases the one-token buffer.
   // Throws LexException ("Out of tokens", with a null position) when the
   // underlying iterator holds nothing but filtered tokens or is exhausted.
   516         Token WhiteSpaceFilter::operator++(int)
   653         Token StdFilter::operator++(int)
   517         {
   654         {
   518             prepareNext();
   655             prepareNext();
   519             if (!hasNext_) {
   656             if (!hasNext_) {
   520                 throw LexException(L"Out of tokens", 0); 
   657                 throw LexException(L"Out of tokens", 0); 
   521             }
   658             }
   522             hasNext_ = false;
   659             hasNext_ = false;
   523             return next_;
   660             return next_;
   524         }
   661         }
   // Fills the one-token buffer unless it is already filled: pulls tokens
   // from tokens_ until one is found whose type is neither TOKEN_WS nor
   // TOKEN_COMMENT (the older WhiteSpaceFilter column only skipped TOKEN_WS),
   // or until tokens_ is exhausted. Skipped tokens are discarded.
   525         void WhiteSpaceFilter::prepareNext()
   662         void StdFilter::prepareNext()
   526         {
   663         {
   527             while (!hasNext_ && tokens_) {
   664             while (!hasNext_ && tokens_) {
   528                 next_ = tokens_++;
   665                 next_ = tokens_++;
   529                 if (next_.type() != TOKEN_WS) {
   666                 if (next_.type() != TOKEN_WS 
       
   667                  && next_.type() != TOKEN_COMMENT) {
   530                     hasNext_ = true; 
   668                     hasNext_ = true; 
   531                 }
   669                 }
   532             }
   670             }
   533         }
   671         }
       
   672 
   534 		
   673 		
   535         TokenReader::TokenReader(TokenIterator& tokens) 
   674         TokenReader::TokenReader(TokenIterator& tokens) 
   536             :	tokens_(tokens), 
   675             :	tokens_(tokens), 
   537                 location_(0),
   676                 location_(0),
   538                 forward_(), 
   677                 forward_(), 
   611             return wWhat_.c_str();
   750             return wWhat_.c_str();
   612         }
   751         }
   613 		
   752 		
   // Replaces wWhat_ with the text built by describeException() from the
   // current wWhat_, the given context and the begin/end pointers of the
   // offending token where_ as the two marker positions.
   // This file creates EOF tokens with null begin/end
   // (Lex::Token(Lex::TOKEN_EOF, 0, 0)); for those describeException() never
   // meets a marker position and marks the end of the text instead.
   // Because the current wWhat_ is the input, every call wraps the message
   // once more.
   614         void ParseException::setContext(const wchar_t * context)
   753         void ParseException::setContext(const wchar_t * context)
   615         {
   754         {
   616             // TODO legacy of implementation of obsoleted describe() -
   755 			wWhat_ = describeException(wWhat_, context, where_.begin(), where_.end()); 
   617             // it can be optimized by doind direct substring - concat
       
   618             // operations instead of looping through context
       
   619             std::wstring tmp;
       
   620             tmp += wWhat_; 
       
   621             tmp += L" at: \""; 
       
   622             if (where_.type() == Lex::TOKEN_EOF) {
       
   623                 tmp += context; 
       
   624                 tmp += L"*here*";
       
   625             } else {
       
   626                 for (; ; context++) {
       
   627                     if (context == where_.begin()) {
       
   628                         tmp += L"*here*";
       
   629                     }
       
   630                     if (context == where_.end()) {
       
   631                         tmp += L"*here*";
       
   632                     }
       
   633                     if (!*context) break; 
       
   634                     tmp += *context;
       
   635                 }
       
   636             }
       
   637             tmp += L"\"";
       
   638 
       
   639             wWhat_ = tmp;
       
   640         }
   756         }
   641 		
   757 		
   642         namespace Lit {
   758         namespace Lit {
   643 		
   759 		
   644             std::wstring ParseString(const Lex::Token& token) {
   760             std::wstring ParseString(const Lex::Token& token) {
   704                 return Lex::TokenReader::operator++(0); 
   820                 return Lex::TokenReader::operator++(0); 
   705             }
   821             }
   706             throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));  
   822             throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));  
   707         }
   823         }
   708 
   824 
   // Consumes the next token and returns it when its type equals tokenType.
   // Otherwise throws a ParseException that carries the unexpected token and
   // a message naming the expected type, the token text and the actual type.
   // The token is consumed even when the check fails.
   // NOTE(review): at end of input (*this)++ itself throws - presumably the
   // "Unexpected EOF" ParseException; its body is only partly visible here.
   709         Lex::Token Lexer::eat(int tokenType) {
   825         Lex::Token Lexer::eat(Lex::token_type_t tokenType) {
   710             Lex::Token token = ((*this)++);
   826             Lex::Token token = ((*this)++);
   711             if (token.type() != tokenType) {
   827             if (token.type() != tokenType) {
   712                 std::wostringstream msg; 
   828                 std::wostringstream msg; 
   713                 msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();  
   829                 msg<<"Expected "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();  
   714                 throw ParseException(msg.str().c_str(), token);  
   830                 throw ParseException(msg.str().c_str(), token);  
   715             }
   831             }
   716             return token; 
   832             return token; 
   717         }
   833         }
   718         std::wstring Lexer::eatId() {
   834         std::wstring Lexer::eatId() {
   // Consumes the next token and returns Lit::ParseReal() of it. The token
   // type is not checked here; anything beyond that is up to ParseReal.
   745         double Lexer::eatReal() {
   861         double Lexer::eatReal() {
   746             return Lit::ParseReal((*this)++); 
   862             return Lit::ParseReal((*this)++); 
   747         }
   863         }
   748 
   864 
   // Builds the chain tokenizer + text -> tokens_ -> filter_ (ws_ in the
   // older column) and hands the filter to the Lexer base.
   // The base class is initialised before the members, so Lexer receives a
   // reference to filter_ before filter_ is constructed.
   // NOTE(review): this is only safe if Lexer's constructor merely stores the
   // reference without using it - confirm in Lexer's constructor.
   749         StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) 
   865         StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) 
   750             : Lexer(ws_),
   866             : Lexer(filter_),
   751               tokens_(tokenizer, text), 
   867               tokens_(tokenizer, text), 
   752               ws_(tokens_)
   868               filter_(tokens_)
   753               
   869               
   754         {}
   870         {}
   755 		
   871 		
   756 		
   872 		
   757     } // Parser
   873     } // Parser