/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
/*
* cpixparsetools.cpp
*
* Created on: Apr 14, 2009
* Author: admin
*/
#include "cpixparsetools.h"
#include "cpixtools.h"
#include <iostream>
#include <sstream>
#include <stdlib.h>
namespace Cpt {
namespace Lex {
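// Tokenizers are single-pass state machines driven by consume(): the
// scanner feeds one character at a time (including the terminating
// L'\0'), and consume() answers TOKENIZER_HUNGRY while the token may
// still grow, TOKENIZER_FINISHED once a complete token has been
// recognized, and TOKENIZER_FAILED when the input cannot match.
// After TOKENIZER_FINISHED, get() returns the recognized token and
// reset() prepares the tokenizer for the next scan.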
const wchar_t ESCAPE_SYMBOL = L'\\';
Tokenizer::~Tokenizer() {}
LexException::LexException(const wchar_t* wWhat,
const wchar_t* where)
: wWhat_(wWhat),
where_(where) {
}
LexException::~LexException()
{
}
const wchar_t* LexException::where() const {
return where_;
}
const wchar_t* LexException::wWhat() const throw() {
return wWhat_.c_str();
}
void LexException::setContext(const wchar_t * context)
{
// TODO legacy of the implementation of the obsoleted describe() -
// it could be optimized by doing direct substring/concat
// operations instead of looping through the context
std::wstring tmp;
tmp += wWhat_;
tmp += L" at: \"";
for (; ; context++) {
if (context == where_) {
tmp += L"*here*";
}
if (!*context) {
break;
}
tmp += *context;
}
tmp += L"\"";
wWhat_ = tmp;
}
Token::Token(int type, const wchar_t* begin, const wchar_t* end)
: type_(type), begin_(begin), end_(end) {
}
Token::Token()
: type_(0), begin_(0), end_(0) {
}
int Token::type() const { return type_; }
const wchar_t* Token::begin() const { return begin_; }
const wchar_t* Token::end() const { return end_; }
int Token::length() const { return end_ - begin_; }
std::wstring Token::text() const {
std::wstring ret;
for (const wchar_t* i = begin_; i != end_; i++) {
ret += *i;
}
return ret;
}
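// Recognizes a string literal delimited by the 'citate' (quote)
// character given at construction; a backslash escapes the following
// character. The delimiters are part of the token text.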
StrLitTokenizer::StrLitTokenizer(wchar_t citate)
: citate_(citate)
{
reset();
}
void StrLitTokenizer::reset()
{
escape_ = false;
opened_ = false;
begin_ = 0;
end_ = 0;
}
Token StrLitTokenizer::get()
{
return Token( TOKEN_STRLIT, begin_, end_ );
}
TokenizerState StrLitTokenizer::consume(const wchar_t* cursor)
{
if (!*cursor) return TOKENIZER_FAILED; // always fail on EOF: an unterminated literal never matches
if (!opened_)
{
if (*cursor == citate_)
{
opened_ = true;
begin_ = cursor;
} else {
return TOKENIZER_FAILED;
}
} else if (escape_) {
escape_ = false;
} else {
if (*cursor == citate_) {
end_ = cursor+1;
return TOKENIZER_FINISHED;
} else if (*cursor == ESCAPE_SYMBOL) {
escape_ = true;
}
}
return TOKENIZER_HUNGRY;
}
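// Recognizes an integer literal: an optional leading sign followed by
// decimal digits. NOTE: as written, a lone sign with no digits is
// also accepted; callers relying on stricter syntax must check the
// token text.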
IntLitTokenizer::IntLitTokenizer() {
reset();
}
void IntLitTokenizer::reset() {
begin_ = NULL;
end_ = NULL;
beginning_ = true;
}
Token IntLitTokenizer::get() {
return Token(TOKEN_INTLIT, begin_, end_);
}
TokenizerState IntLitTokenizer::consume(const wchar_t * cursor) {
TokenizerState
rv = TOKENIZER_HUNGRY;
if (beginning_)
{
if (*cursor != L'+'
&& *cursor != L'-'
&& !isdigit(*cursor))
{
rv = TOKENIZER_FAILED;
}
beginning_ = false;
begin_ = cursor;
}
else if (!iswdigit(*cursor))
{
rv = TOKENIZER_FINISHED;
end_ = cursor;
}
return rv;
}
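// Recognizes a real-number literal: an optional sign, decimal digits
// and at most one decimal point; a second '.' ends the token. NOTE:
// as with integers, a lone sign or '.' is also accepted.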
RealLitTokenizer::RealLitTokenizer() {
reset();
}
void RealLitTokenizer::reset() {
begin_ = NULL;
end_ = NULL;
beginning_ = true;
hadDotAlready_ = false;
}
Token RealLitTokenizer::get() {
return Token(TOKEN_REALLIT, begin_, end_);
}
TokenizerState RealLitTokenizer::consume(const wchar_t * cursor) {
TokenizerState
rv = TOKENIZER_HUNGRY;
if (beginning_)
{
if (*cursor != L'+'
&& *cursor != L'-'
&& !isdigit(*cursor)
&& *cursor != L'.')
{
rv = TOKENIZER_FAILED;
}
beginning_ = false;
begin_ = cursor;
}
else if (*cursor == L'.')
{
if (hadDotAlready_)
{
rv = TOKENIZER_FINISHED;
end_ = cursor;
}
hadDotAlready_ = true;
}
else if (!iswdigit(*cursor))
{
rv = TOKENIZER_FINISHED;
end_ = cursor;
}
return rv;
}
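// Recognizes a maximal non-empty run of whitespace characters.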
WhitespaceTokenizer::WhitespaceTokenizer() {
reset();
}
void WhitespaceTokenizer::reset()
{
empty_ = true;
begin_ = 0;
end_ = 0;
}
Token WhitespaceTokenizer::get()
{
return Token( TOKEN_WS, begin_, end_ );
}
TokenizerState WhitespaceTokenizer::consume(const wchar_t* cursor)
{
if (!begin_) begin_ = cursor;
if (iswspace(*cursor))
{
empty_ = false;
} else {
end_ = cursor;
return empty_ ? TOKENIZER_FAILED : TOKENIZER_FINISHED;
}
return TOKENIZER_HUNGRY;
}
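// Recognizes an identifier: an alphabetic first character followed by
// any number of alphanumeric characters.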
IdTokenizer::IdTokenizer()
{
reset();
}
void IdTokenizer::reset()
{
begin_ = 0;
end_ = 0;
}
Token IdTokenizer::get()
{
return Token( TOKEN_ID, begin_, end_ );
}
TokenizerState IdTokenizer::consume(const wchar_t* cursor)
{
if (!begin_) begin_ = cursor;
if (cursor == begin_ && !iswalpha(*cursor)) {
return TOKENIZER_FAILED;
} else if (cursor > begin_ && !isalnum(*cursor)) {
end_ = cursor;
return TOKENIZER_FINISHED;
}
return TOKENIZER_HUNGRY;
}
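// Recognizes one fixed symbol (an operator, keyword etc.) given at
// construction, and labels it with the given token type.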
SymbolTokenizer::SymbolTokenizer(int tokenType, const wchar_t* symbol)
: tokenType_( tokenType ),
symbol_( symbol )
{
reset(); // like the other tokenizers, start from a clean state
}
void SymbolTokenizer::reset() {
begin_ = 0;
end_ = 0;
}
Token SymbolTokenizer::get() {
return Token( tokenType_, begin_, end_ );
}
TokenizerState SymbolTokenizer::consume(const wchar_t* cursor) {
if (!begin_) begin_ = cursor;
if (symbol_[cursor-begin_] == *cursor) {
if (!symbol_[cursor-begin_+1]) {
// we reached end of symbol
end_ = cursor + 1;
return TOKENIZER_FINISHED;
}
return TOKENIZER_HUNGRY;
} else {
return TOKENIZER_FAILED;
}
}
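// Combines alternative tokenizers: every character is offered to each
// sub-tokenizer that is still hungry, and once none is hungry the
// longest finished match wins (see get() below).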
MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers)
: ownTokenizers_(ownTokenizers)
{
// count the entries of the NULL-terminated tokenizer array
int len = 0;
while (tokenizers[len]) len++;
tokenizers_.assign(tokenizers,
tokenizers + len);
states_ = new TokenizerState[len];
reset();
}
MultiTokenizer::~MultiTokenizer()
{
if (ownTokenizers_)
{
typedef std::vector<Tokenizer*>::iterator iterator;
for (iterator i = tokenizers_.begin(); i != tokenizers_.end(); ++i)
{
delete *i;
}
}
delete[] states_;
}
void MultiTokenizer::reset()
{
TokenizerState* s = states_;
running_ = 0;
std::vector<Tokenizer*>::iterator
i = tokenizers_.begin(),
end = tokenizers_.end();
for (; i != end; ++i, ++s) {
(*i)->reset();
(*s) = TOKENIZER_HUNGRY;
running_++;
}
found_ = false;
}
Token MultiTokenizer::get()
{
Token token(TOKEN_UNKNOWN, 0, 0);
TokenizerState* s = states_;
std::vector<Tokenizer*>::iterator
i = tokenizers_.begin(),
end = tokenizers_.end();
for (; i != end; ++i, ++s ) {
if (*s == TOKENIZER_FINISHED) {
Token c = (*i)->get();
if (c.length() > token.length()) {
token = c;
}
}
}
if (token.length() == 0) {
// NOTE: not really a lexical exception, but a logical one
throw LexException(L"Trying to get token without a token ready.", 0);
}
return token;
}
TokenizerState MultiTokenizer::consume(const wchar_t* cursor) {
TokenizerState* s = states_;
std::vector<Tokenizer*>::iterator
i = tokenizers_.begin(),
end = tokenizers_.end();
for (; i != end; ++i, ++s) {
if (*s == TOKENIZER_HUNGRY)
{
*s = (*i)->consume(cursor);
if (*s != TOKENIZER_HUNGRY) running_--;
if (*s == TOKENIZER_FINISHED) {
found_ = true;
}
}
}
if (running_ == 0) {
return found_ ? TOKENIZER_FINISHED : TOKENIZER_FAILED;
}
return TOKENIZER_HUNGRY;
}
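// Recognizes any literal (string, integer or real) by running the
// three literal tokenizers in a MultiTokenizer and re-labeling the
// winning token as TOKEN_LIT.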
LitTokenizer::LitTokenizer(wchar_t citate)
: multiTokenizer_(NULL)
{
using namespace std;
auto_ptr<StrLitTokenizer>
s(new StrLitTokenizer(citate));
auto_ptr<IntLitTokenizer>
i(new IntLitTokenizer);
auto_ptr<RealLitTokenizer>
r(new RealLitTokenizer);
Tokenizer * tokenizers[] = {
s.get(),
i.get(),
r.get(),
NULL
};
multiTokenizer_ = new MultiTokenizer(tokenizers, true);
s.release();
i.release();
r.release();
reset();
}
LitTokenizer::~LitTokenizer()
{
delete multiTokenizer_;
}
void LitTokenizer::reset()
{
multiTokenizer_->reset();
}
Token LitTokenizer::get()
{
Token
subToken = multiTokenizer_->get();
return Token(TOKEN_LIT,
subToken.begin(),
subToken.end());
}
TokenizerState LitTokenizer::consume(const wchar_t * cursor)
{
return multiTokenizer_->consume(cursor);
}
TokenIterator::~TokenIterator() {}
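// Iterates over the tokens of a text buffer: for each token the
// tokenizer is reset and fed characters until it finishes or fails.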
Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
: cursor_(text),
tokenizer_(tokenizer),
hasNext_(false)
{}
Tokens::operator bool() {
prepareNext();
return hasNext_;
}
Token Tokens::operator++(int) {
prepareNext();
if (!hasNext_) {
throw LexException(L"Out of tokens.", cursor_);
}
hasNext_ = false;
// get the token
Token ret = tokenizer_.get();
cursor_ = ret.end();
return ret;
}
void Tokens::prepareNext() {
if (!hasNext_ && *cursor_) {
const wchar_t* begin = cursor_;
tokenizer_.reset();
TokenizerState state = TOKENIZER_HUNGRY;
while (state == TOKENIZER_HUNGRY) {
state = tokenizer_.consume(cursor_);
if (*cursor_) cursor_++; // don't go beyond eof.
}
if (state == TOKENIZER_FAILED) {
std::wostringstream msg;
msg<<L"Unrecognized syntax: '";
for (int i = 0; &begin[i] < cursor_; i++) msg<<begin[i];
msg<<L"'";
throw LexException(msg.str().c_str(), begin);
} else {
// Means that: state == TOKENIZER_FINISHED
hasNext_ = true;
}
}
}
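// A filtering token iterator that silently drops TOKEN_WS tokens.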
WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens)
: tokens_(tokens), next_(), hasNext_(false) {}
WhiteSpaceFilter::operator bool()
{
prepareNext();
return hasNext_;
}
Token WhiteSpaceFilter::operator++(int)
{
prepareNext();
if (!hasNext_) {
throw LexException(L"Out of tokens", 0);
}
hasNext_ = false;
return next_;
}
void WhiteSpaceFilter::prepareNext()
{
while (!hasNext_ && tokens_) {
next_ = tokens_++;
if (next_.type() != TOKEN_WS) {
hasNext_ = true;
}
}
}
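// A token iterator with backtracking support: pushMark() remembers
// the current position, popMark() rewinds to the most recent mark
// (tokens consumed since then are replayed), and clearMark() commits
// the consumed tokens and discards the mark.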
TokenReader::TokenReader(TokenIterator& tokens)
: tokens_(tokens),
location_(0),
forward_(),
backward_(),
marks_()
{}
TokenReader::operator bool() {
return !forward_.empty() || tokens_;
}
Token TokenReader::operator++(int) {
Token token;
if (forward_.size() > 0) {
token = forward_.back();
forward_.pop_back();
} else {
token = tokens_++;
}
if (!marks_.empty()) {
backward_.push_back(token);
}
location_++;
return token;
}
Token TokenReader::peek() {
if (forward_.empty()) {
Token token = (*this)++;
// un-read the token: undo the bookkeeping done by operator++ so
// that the token is not recorded (and later replayed) twice
if (!marks_.empty()) {
backward_.pop_back();
}
location_--;
forward_.push_back(token);
return token;
} else {
return forward_.back();
}
}
void TokenReader::pushMark() {
marks_.push_back(location_);
}
void TokenReader::popMark() {
int mark = marks_.back(); marks_.pop_back();
while (location_ > mark) {
forward_.push_back(backward_.back());
backward_.pop_back();
location_--;
}
}
void TokenReader::clearMark() {
marks_.pop_back();
if (marks_.empty()) {
backward_.clear();
}
}
} // Lex
namespace Parser {
ParseException::ParseException(const wchar_t* wWhat,
const Lex::Token& where)
: wWhat_(wWhat),
where_(where) {
}
Lex::Token ParseException::where() const {
return where_;
}
const wchar_t* ParseException::wWhat() const throw() {
return wWhat_.c_str();
}
void ParseException::setContext(const wchar_t * context)
{
// TODO legacy of the implementation of the obsoleted describe() -
// it could be optimized by doing direct substring/concat
// operations instead of looping through the context
std::wstring tmp;
tmp += wWhat_;
tmp += L" at: \"";
if (where_.type() == Lex::TOKEN_EOF) {
tmp += context;
tmp += L"*here*";
} else {
for (; ; context++) {
if (context == where_.begin()) {
tmp += L"*here*";
}
if (context == where_.end()) {
tmp += L"*here*";
}
if (!*context) break;
tmp += *context;
}
}
tmp += L"\"";
wWhat_ = tmp;
}
namespace Lit {
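// Converters from literal tokens to values; each throws a
// ParseException if given a token of the wrong type.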
std::wstring ParseString(const Lex::Token& token) {
if (token.type() != Lex::TOKEN_STRLIT) {
std::wostringstream msg;
msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
std::wstring ret;
const wchar_t* text = token.begin();
// NOTE: We are assuming that the literal quotation marks are one character wide
for (int i = 1; &text[i] < token.end()-1; i++) { // skip the quote characters at both ends
if (text[i] == Lex::ESCAPE_SYMBOL) {
i++;
switch (text[i]) {
case L'0':
ret += L'\0'; // append the NUL character itself (L"\0" would append nothing)
break;
case L'n':
ret += L'\n';
break;
case L'r':
ret += L'\r';
break;
case L't':
ret += L'\t';
break;
default:
ret += text[i];
}
} else {
ret += text[i];
}
}
return ret;
}
long ParseInteger(const Lex::Token& token) {
if (token.type() != Lex::TOKEN_INTLIT) {
std::wostringstream msg;
msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
wchar_t* end = const_cast<wchar_t*>(token.end()); // wcstol requires a non-const end pointer
return wcstol(token.begin(), &end, 10);
}
double ParseReal(const Lex::Token& token) {
if (token.type() != Lex::TOKEN_REALLIT) {
std::wostringstream msg;
msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
wchar_t* end = const_cast<wchar_t*>(token.end());
return wcstod(token.begin(), &end);
}
}
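// A TokenReader that throws ParseException on unexpected EOF and adds
// eat*() helpers that assert the type of the next token.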
Lexer::Lexer(Lex::TokenIterator& tokens) : Lex::TokenReader(tokens) {
}
Lex::Token Lexer::operator++(int) {
if (*this) {
return Lex::TokenReader::operator++(0);
}
throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));
}
Lex::Token Lexer::eat(int tokenType) {
Lex::Token token = ((*this)++);
if (token.type() != tokenType) {
std::wostringstream msg;
msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
return token;
}
std::wstring Lexer::eatId() {
Lex::Token token = ((*this)++);
if (token.type() != Lex::TOKEN_ID) {
std::wostringstream msg;
msg<<L"Expected identifier instead of token '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
return token.text();
}
void Lexer::eatEof() {
if (*this) {
Lex::Token token = ((*this)++);
std::wostringstream msg;
msg<<L"Expected EOF instead of '"<<token.text()<<"' of type "<<token.type();
throw ParseException(msg.str().c_str(), token);
}
}
std::wstring Lexer::eatString() {
return Lit::ParseString((*this)++);
}
long Lexer::eatInteger() {
return Lit::ParseInteger((*this)++);
}
double Lexer::eatReal() {
return Lit::ParseReal((*this)++);
}
StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text)
: Lexer(ws_), // NOTE: only binds a reference; ws_ itself is initialized below, before any token is read
tokens_(tokenizer, text),
ws_(tokens_)
{}
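// A minimal usage sketch, kept as a comment only; the TOKEN_* constants
// and the class declarations live in cpixparsetools.h:
//
//   using namespace Cpt;
//   Lex::WhitespaceTokenizer ws;
//   Lex::IdTokenizer id;
//   Lex::StrLitTokenizer str(L'"');
//   Lex::IntLitTokenizer num;
//   Lex::Tokenizer* tokenizers[] = { &ws, &id, &str, &num, NULL };
//   Lex::MultiTokenizer tokenizer(tokenizers, false); // false: tokenizers are not owned
//
//   Parser::StdLexer lexer(tokenizer, L"foo \"bar\" 42");
//   std::wstring name = lexer.eatId();      // -> L"foo"
//   std::wstring text = lexer.eatString();  // -> L"bar"
//   long number       = lexer.eatInteger(); // -> 42
//   lexer.eatEof();                         // throws if tokens remain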
} // Parser
} // Cpt