--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/util/cpixtools/src/cpixparsetools.cpp Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,759 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+/*
+ * cpixparsetools.cpp
+ *
+ * Created on: Apr 14, 2009
+ * Author: admin
+ */
+
+#include "cpixparsetools.h"
+#include "cpixtools.h"
+
+#include <iostream>
+#include <sstream>
+#include <stdlib.h>
+
+namespace Cpt {
+
+
+ namespace Lex {
+
+ const wchar_t ESCAPE_SYMBOL = L'\\'; // backslash: escape marker inside string literals
+
+ Tokenizer::~Tokenizer() { /* nothing to release at this level */ }
+
+ // Builds a lexical error: wWhat is copied, where is kept as a raw pointer.
+ LexException::LexException(const wchar_t* wWhat, const wchar_t* where)
+     : wWhat_(wWhat),
+       where_(where)
+ {
+ }
+
+ // Nothing to free beyond what the members release themselves.
+ LexException::~LexException()
+ {
+ }
+
+ // Returns the position pointer given at construction (not owned; may be 0).
+ const wchar_t* LexException::where() const
+ { return where_; }
+
+ // Returns the message text; the buffer belongs to this exception object.
+ const wchar_t* LexException::wWhat() const throw()
+ { return wWhat_.c_str(); }
+
+ // Rewrites the message as  <message> at: "<context>"  and inserts the
+ // marker *here* in front of the character where_ points to, provided
+ // where_ lies within context (the terminating position included).
+ void LexException::setContext(const wchar_t * context)
+ {
+     // TODO legacy of the obsoleted describe() implementation - could be
+     // done with direct substring/concat operations instead of a scan
+     std::wstring annotated(wWhat_);
+     annotated += L" at: \"";
+     const wchar_t* pos = context;
+     while (true) {
+         if (pos == where_) {
+             annotated += L"*here*";
+         }
+         if (*pos == L'\0') break;
+         annotated += *pos;
+         ++pos;
+     }
+     annotated += L"\"";
+     wWhat_.swap(annotated);
+ }
+
+
+ // Token of the given type covering the half-open character range [begin, end).
+ Token::Token(int type, const wchar_t* begin, const wchar_t* end)
+     : type_(type), begin_(begin), end_(end) {}
+
+ // Default token: type 0 with a null, zero-length range.
+ Token::Token()
+     : type_(0), begin_(0), end_(0) {}
+
+ int Token::type() const { return type_; } // token type code
+ const wchar_t* Token::begin() const { return begin_; } // first character of the token
+ const wchar_t* Token::end() const { return end_; } // one past the last character
+ int Token::length() const { return static_cast<int>(end_ - begin_); } // characters in [begin_, end_)
+ // Returns a copy of the characters in [begin_, end_); empty for a default token.
+ std::wstring Token::text() const {
+     if (begin_ == end_) {
+         return std::wstring(); // also covers the null range of Token()
+     }
+     return std::wstring(begin_, end_); // one range construction, no per-character append
+ }
+
+ // citate is the quote character that opens and closes a string literal.
+ StrLitTokenizer::StrLitTokenizer(wchar_t citate)
+     : citate_(citate) {
+     reset(); // start in the "nothing consumed" state
+ }
+
+ // Forgets any partially recognised literal.
+ void StrLitTokenizer::reset() {
+     begin_ = 0;
+     end_ = 0;
+     opened_ = false; // opening quote not seen yet
+     escape_ = false; // not directly after a backslash
+ }
+ // Returns the recognised literal, quotes included, as a TOKEN_STRLIT token.
+ Token StrLitTokenizer::get() {
+     return Token(TOKEN_STRLIT, begin_, end_);
+ }
+ // Feeds one character. FAILED on end of input or when the first character
+ // is not the quote; FINISHED on the closing quote; HUNGRY otherwise.
+ TokenizerState StrLitTokenizer::consume(const wchar_t* cursor)
+ {
+     const wchar_t ch = *cursor;
+     if (ch == L'\0') return TOKENIZER_FAILED; // fail always on EOF
+     if (!opened_) {
+         if (ch != citate_) return TOKENIZER_FAILED;
+         opened_ = true;
+         begin_ = cursor;
+         return TOKENIZER_HUNGRY;
+     }
+     if (escape_) {
+         escape_ = false; // the escaped character is taken verbatim
+         return TOKENIZER_HUNGRY;
+     }
+     if (ch == citate_) {
+         end_ = cursor + 1; // token includes the closing quote
+         return TOKENIZER_FINISHED;
+     }
+     // an unescaped backslash makes the next character literal
+     if (ch == L'\\') escape_ = true;
+     return TOKENIZER_HUNGRY;
+ }
+
+ // Starts out in the reset state.
+ IntLitTokenizer::IntLitTokenizer()
+ { reset(); }
+
+ // Drops the current range and expects the first character next.
+ void IntLitTokenizer::reset() {
+     beginning_ = true;
+     begin_ = end_ = NULL;
+ }
+
+ // Returns the recognised digits (with optional sign) as a TOKEN_INTLIT token.
+ Token IntLitTokenizer::get()
+ { return Token(TOKEN_INTLIT, begin_, end_); }
+
+ // Feeds one character of an integer literal: [+-]? digit+.
+ // FAILED when the first character is neither sign nor digit, or when a sign
+ // is not followed by a digit; FINISHED on the first non-digit after that
+ // (the token ends before it); HUNGRY otherwise.
+ TokenizerState IntLitTokenizer::consume(const wchar_t * cursor) {
+     // Plain range test: the <ctype.h> classifiers are undefined for values
+     // that do not fit unsigned char, which wchar_t input can contain.
+     const wchar_t ch = *cursor;
+     const bool digit = (ch >= L'0' && ch <= L'9');
+
+     if (beginning_) {
+         beginning_ = false;
+         begin_ = cursor;
+         const bool sign = (ch == L'+' || ch == L'-');
+         return (sign || digit) ? TOKENIZER_HUNGRY : TOKENIZER_FAILED;
+     }
+     if (digit) return TOKENIZER_HUNGRY;
+
+     end_ = cursor;
+     // a lone sign such as "+" is not an integer literal
+     const bool signOnly = (end_ - begin_ == 1) && (*begin_ == L'+' || *begin_ == L'-');
+     return signOnly ? TOKENIZER_FAILED : TOKENIZER_FINISHED;
+ }
+
+ // Starts out in the reset state.
+ RealLitTokenizer::RealLitTokenizer()
+ { reset(); }
+
+ // Drops the current range; no character and no decimal point seen yet.
+ void RealLitTokenizer::reset() {
+     hadDotAlready_ = false;
+     beginning_ = true;
+     begin_ = end_ = NULL;
+ }
+
+ // Returns the recognised number text as a TOKEN_REALLIT token.
+ Token RealLitTokenizer::get()
+ { return Token(TOKEN_REALLIT, begin_, end_); }
+
+ // Feeds one character of a real literal: [+-]? digits with at most one '.'.
+ // FAILED when the first character is not a sign, digit or '.', or when the
+ // scanned text holds no digit at all (e.g. "+", "."); FINISHED on a second
+ // '.' or on the first other non-digit (the token ends before it); HUNGRY
+ // otherwise.
+ TokenizerState RealLitTokenizer::consume(const wchar_t * cursor) {
+     // Plain range test: the <ctype.h> classifiers are undefined for values
+     // that do not fit unsigned char, which wchar_t input can contain.
+     const wchar_t ch = *cursor;
+     const bool digit = (ch >= L'0' && ch <= L'9');
+     const bool dot = (ch == L'.');
+
+     if (beginning_) {
+         beginning_ = false;
+         begin_ = cursor;
+         // a leading '.' is the literal's one decimal point (".5.3" stops at ".5")
+         hadDotAlready_ = dot;
+         const bool sign = (ch == L'+' || ch == L'-');
+         return (sign || digit || dot) ? TOKENIZER_HUNGRY : TOKENIZER_FAILED;
+     }
+
+     if (digit) return TOKENIZER_HUNGRY;
+     if (dot && !hadDotAlready_) {
+         hadDotAlready_ = true;
+         return TOKENIZER_HUNGRY;
+     }
+
+     end_ = cursor;
+     // require at least one digit so that "+", "-." or "." are rejected
+     for (const wchar_t* p = begin_; p != end_; ++p) {
+         if (*p >= L'0' && *p <= L'9') return TOKENIZER_FINISHED;
+     }
+     return TOKENIZER_FAILED;
+ }
+
+ // Starts out in the reset state.
+ WhitespaceTokenizer::WhitespaceTokenizer()
+ { reset(); }
+
+ // Drops the current range and marks the whitespace run as empty.
+ void WhitespaceTokenizer::reset() {
+     begin_ = 0;
+     end_ = 0;
+     empty_ = true; // no whitespace character seen yet
+ }
+
+ // Returns the whitespace run as a TOKEN_WS token.
+ Token WhitespaceTokenizer::get() {
+     return Token(TOKEN_WS, begin_, end_);
+ }
+
+ // Feeds one character of a whitespace run: HUNGRY while whitespace continues,
+ // then FINISHED (token ends before cursor) or FAILED when the run is empty.
+ TokenizerState WhitespaceTokenizer::consume(const wchar_t* cursor) {
+     if (!begin_) begin_ = cursor;
+     // by value, as isspace() is undefined for wchar_t values beyond unsigned char
+     const wchar_t ch = *cursor;
+     if (ch == L' ' || (ch >= L'\t' && ch <= L'\r')) { // ' ' \t \n \v \f \r
+         empty_ = false;
+         return TOKENIZER_HUNGRY;
+     }
+     end_ = cursor;
+     return empty_ ? TOKENIZER_FAILED : TOKENIZER_FINISHED;
+ }
+
+ // Starts out in the reset state.
+ IdTokenizer::IdTokenizer() {
+     reset();
+ }
+
+ // Drops the current range; begin_ == 0 tells consume() that the next
+ // character is the first one.
+ void IdTokenizer::reset() {
+     begin_ = end_ = 0;
+ }
+
+
+ // Returns the recognised identifier as a TOKEN_ID token.
+ Token IdTokenizer::get() {
+     return Token(TOKEN_ID, begin_, end_);
+ }
+
+ // Feeds one character of an identifier (ASCII letter, then letters/digits).
+ // Classified by value: isalpha()/isalnum() are undefined beyond unsigned char.
+ TokenizerState IdTokenizer::consume(const wchar_t* cursor) {
+     if (!begin_) begin_ = cursor;
+     const wchar_t ch = *cursor;
+     const bool alpha = (ch >= L'a' && ch <= L'z') || (ch >= L'A' && ch <= L'Z');
+     if (cursor == begin_) return alpha ? TOKENIZER_HUNGRY : TOKENIZER_FAILED; // first character
+     if (alpha || (ch >= L'0' && ch <= L'9')) return TOKENIZER_HUNGRY;
+     end_ = cursor; // the token ends before the first other character
+     return TOKENIZER_FINISHED;
+ }
+
+ SymbolTokenizer::SymbolTokenizer(int tokenType, const wchar_t* symbol)
+     : tokenType_(tokenType), symbol_(symbol) {
+     reset();  // like the other tokenizers, so consume() never reads a garbage begin_
+     end_ = 0; // make sure end_ is defined as well before the first match
+ }
+
+ void SymbolTokenizer::reset() {
+     begin_ = 0; end_ = 0; // clear the whole range, as the other tokenizers do
+ }
+
+ // Returns the matched symbol with the type given at construction.
+ Token SymbolTokenizer::get()
+ { return Token(tokenType_, begin_, end_); }
+
+ // Matches the character at cursor against the symbol character at the same
+ // offset from the token start; FINISHED once the last symbol character matched.
+ TokenizerState SymbolTokenizer::consume(const wchar_t* cursor) {
+     if (!begin_) begin_ = cursor;
+     if (symbol_[cursor - begin_] != *cursor) {
+         return TOKENIZER_FAILED;
+     }
+     if (symbol_[cursor - begin_ + 1]) {
+         return TOKENIZER_HUNGRY; // more symbol characters to match
+     }
+     end_ = cursor + 1; // we reached end of symbol
+     return TOKENIZER_FINISHED;
+ }
+
+ // tokenizers is a NULL-terminated array; its elements are deleted by the destructor iff ownTokenizers.
+ MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers)
+     : ownTokenizers_(ownTokenizers) {
+     Tokenizer** last = tokenizers;
+     while (*last) ++last; // find the terminating NULL
+     tokenizers_.assign(tokenizers, last);
+     states_ = new TokenizerState[last - tokenizers];
+     reset();
+ }
+
+ // Deletes the sub-tokenizers when this object owns them, then releases the
+ // per-tokenizer state array.
+ MultiTokenizer::~MultiTokenizer()
+ {
+     if (ownTokenizers_) {
+         const std::vector<Tokenizer*>::size_type count = tokenizers_.size();
+         for (std::vector<Tokenizer*>::size_type n = 0; n < count; ++n) {
+             delete tokenizers_[n];
+         }
+     }
+     delete[] states_;
+ }
+
+
+ // Resets every sub-tokenizer and marks it HUNGRY; running_ ends up as the
+ // number of sub-tokenizers and found_ is cleared.
+ void MultiTokenizer::reset()
+ {
+     typedef std::vector<Tokenizer*>::size_type index_type;
+     const index_type count = tokenizers_.size();
+
+     running_ = 0;
+     for (index_type n = 0; n < count; ++n) {
+         tokenizers_[n]->reset();
+         states_[n] = TOKENIZER_HUNGRY;
+         ++running_; // one more sub-tokenizer still in the race
+     }
+     found_ = false;
+ }
+
+ // Returns the longest token among the sub-tokenizers that FINISHED; on a tie
+ // the earliest sub-tokenizer wins. Throws LexException when there is none
+ // (or only zero-length ones).
+ Token MultiTokenizer::get()
+ {
+     typedef std::vector<Tokenizer*>::size_type index_type;
+     Token longest(TOKEN_UNKNOWN, 0, 0);
+     for (index_type n = 0; n < tokenizers_.size(); ++n) {
+         if (states_[n] != TOKENIZER_FINISHED) {
+             continue;
+         }
+         const Token candidate = tokenizers_[n]->get();
+         if (candidate.length() > longest.length()) {
+             longest = candidate;
+         }
+     }
+     if (longest.length() == 0) {
+         // NOTE: not really a lexical exception, but logical one
+         throw LexException(L"Trying to get token without a token ready.", 0);
+     }
+     return longest;
+ }
+
+ // Offers the character to every sub-tokenizer that is still HUNGRY. Stays
+ // HUNGRY until none is left; then FINISHED if any of them finished during
+ // this token, FAILED otherwise.
+ TokenizerState MultiTokenizer::consume(const wchar_t* cursor) {
+     typedef std::vector<Tokenizer*>::size_type index_type;
+
+     for (index_type n = 0; n < tokenizers_.size(); ++n) {
+         TokenizerState& state = states_[n];
+         if (state != TOKENIZER_HUNGRY) {
+             continue; // already finished or failed
+         }
+         state = tokenizers_[n]->consume(cursor);
+         if (state == TOKENIZER_HUNGRY) {
+             continue;
+         }
+         --running_; // this one just dropped out
+         if (state == TOKENIZER_FINISHED) found_ = true;
+     }
+     if (running_ != 0) return TOKENIZER_HUNGRY;
+     return found_ ? TOKENIZER_FINISHED : TOKENIZER_FAILED;
+ }
+
+
+ // Combines a string, an integer and a real literal tokenizer (in that
+ // order) into one MultiTokenizer that owns them; citate is the quote
+ // character handed to the string literal tokenizer.
+ LitTokenizer::LitTokenizer(wchar_t citate)
+     : multiTokenizer_(NULL)
+ {
+     // auto_ptr guards the parts until the MultiTokenizer exists
+     std::auto_ptr<StrLitTokenizer> strLit(new StrLitTokenizer(citate));
+     std::auto_ptr<IntLitTokenizer> intLit(new IntLitTokenizer);
+     std::auto_ptr<RealLitTokenizer> realLit(new RealLitTokenizer);
+
+     Tokenizer * parts[] = {
+         strLit.get(),
+         intLit.get(),
+         realLit.get(),
+         NULL
+     };
+
+     multiTokenizer_ = new MultiTokenizer(parts, true);
+
+     // constructed with ownTokenizers == true, so give up our ownership
+     strLit.release();
+     intLit.release();
+     realLit.release();
+
+     reset();
+ }
+
+
+ // Releases the MultiTokenizer created by the constructor.
+ LitTokenizer::~LitTokenizer() {
+     delete multiTokenizer_;
+ }
+
+ // Restarts recognition by resetting the wrapped MultiTokenizer.
+ void LitTokenizer::reset() {
+     multiTokenizer_->reset();
+ }
+
+ // Returns the range reported by the wrapped MultiTokenizer, relabelled as
+ // TOKEN_LIT whatever kind of literal it was.
+ Token LitTokenizer::get()
+ {
+     const Token matched = multiTokenizer_->get();
+     const wchar_t* first = matched.begin();
+     const wchar_t* last = matched.end();
+     return Token(TOKEN_LIT, first, last);
+ }
+
+ // Forwards the character to the wrapped MultiTokenizer and returns its state.
+ TokenizerState LitTokenizer::consume(const wchar_t * cursor) {
+     return multiTokenizer_->consume(cursor);
+ }
+
+ TokenIterator::~TokenIterator() { /* nothing to release at this level */ }
+
+ // Prepares iteration over the tokens that tokenizer recognises in text,
+ // starting at its first character; no token is scanned yet.
+ Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
+     : cursor_(text), tokenizer_(tokenizer), hasNext_(false)
+ {}
+
+ // True when another token is available; scanning it may throw LexException.
+ Tokens::operator bool() {
+     prepareNext();
+     return hasNext_;
+ }
+
+ // Returns the next token and moves the cursor to its end. Throws
+ // LexException("Out of tokens.") when the input is exhausted.
+ Token Tokens::operator++(int) {
+     prepareNext();
+     if (!hasNext_) throw LexException(L"Out of tokens.", cursor_);
+
+     hasNext_ = false; // the prepared token is handed out exactly once
+     const Token result = tokenizer_.get();
+     cursor_ = result.end(); // resume scanning right after this token
+     return result;
+ }
+
+ // Scans the next token unless one is already pending or the text is
+ // exhausted. Throws LexException("Unrecognized syntax: '...'") quoting the
+ // characters scanned so far when the tokenizer fails.
+ void Tokens::prepareNext() {
+     if (hasNext_ || !*cursor_) return;
+     const wchar_t* const start = cursor_;
+     tokenizer_.reset();
+     TokenizerState state;
+     do {
+         state = tokenizer_.consume(cursor_);
+         if (*cursor_) ++cursor_; // don't go beyond eof.
+     } while (state == TOKENIZER_HUNGRY);
+     if (state == TOKENIZER_FAILED) {
+         std::wostringstream msg;
+         msg << L"Unrecognized syntax: '";
+         for (const wchar_t* p = start; p < cursor_; ++p) msg << *p;
+         msg << L"'";
+         throw LexException(msg.str().c_str(), start);
+     }
+     hasNext_ = true; // state == TOKENIZER_FINISHED
+ }
+
+ WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens) // passes on tokens minus TOKEN_WS ones
+     : tokens_(tokens), next_(), hasNext_(false) {}
+
+ // True when a non-whitespace token is pending or could be fetched.
+ WhiteSpaceFilter::operator bool() {
+     prepareNext();
+     return hasNext_;
+ }
+
+ // Returns the next non-whitespace token; throws LexException("Out of tokens") if none is left.
+ Token WhiteSpaceFilter::operator++(int) {
+     prepareNext();
+     if (hasNext_) {
+         hasNext_ = false; // hand the buffered token out once
+         return next_;
+     }
+     throw LexException(L"Out of tokens", 0);
+ }
+ // Pulls tokens from the wrapped iterator until one that is not TOKEN_WS is
+ // buffered in next_, or the iterator runs dry.
+ void WhiteSpaceFilter::prepareNext() {
+     if (hasNext_) return;
+     while (tokens_) {
+         next_ = tokens_++;
+         if (next_.type() != TOKEN_WS) { hasNext_ = true; return; }
+     }
+ }
+
+ // Reads from tokens with support for peeking and for backtracking to marks;
+ // starts at location 0 with empty buffers.
+ TokenReader::TokenReader(TokenIterator& tokens)
+     : tokens_(tokens), location_(0),
+       forward_(), backward_(), marks_()
+ {
+ }
+
+
+ // True when a pushed-back token is waiting or the source still has tokens.
+ TokenReader::operator bool()
+ { return !forward_.empty() || tokens_; }
+
+ // Returns the next token: the most recently pushed-back one if any, else a
+ // fresh one from the source. While marks are set it is also logged in backward_.
+ Token TokenReader::operator++(int) {
+     Token next;
+     if (forward_.empty()) {
+         next = tokens_++;
+     } else {
+         next = forward_.back();
+         forward_.pop_back();
+     }
+     if (!marks_.empty()) backward_.push_back(next);
+     ++location_;
+     return next;
+ }
+
+ // Returns the next token without consuming it, leaving marks and location intact.
+ Token TokenReader::peek() {
+     if (!forward_.empty()) return forward_.back();
+     const Token token = (*this)++;
+     --location_; // undo operator++'s bookkeeping: nothing was consumed ...
+     if (!marks_.empty()) backward_.pop_back(); // ... else popMark() replays it twice
+     forward_.push_back(token);
+     return token;
+ }
+
+ // Remembers the current location so popMark() can return to it.
+ void TokenReader::pushMark()
+ { marks_.push_back(location_); }
+
+ // Removes the newest mark and un-reads every token consumed since it was set.
+ void TokenReader::popMark() {
+     const int target = marks_.back();
+     marks_.pop_back();
+     for (; location_ > target; --location_, backward_.pop_back()) {
+         forward_.push_back(backward_.back());
+     }
+ }
+
+ // Discards the newest mark without rewinding; once no mark is left the
+ // log of consumed tokens is emptied.
+ void TokenReader::clearMark() {
+     marks_.pop_back();
+     if (marks_.empty()) backward_.clear();
+ }
+
+ } // Lex
+
+ namespace Parser {
+
+ // Builds a parse error: wWhat is copied, where is the offending token.
+ ParseException::ParseException(const wchar_t* wWhat, const Lex::Token& where)
+     : wWhat_(wWhat),
+       where_(where)
+ {
+ }
+
+
+ // Returns a copy of the token the error refers to.
+ Lex::Token ParseException::where() const
+ { return where_; }
+
+
+ // Returns the message text; the buffer belongs to this exception object.
+ const wchar_t* ParseException::wWhat() const throw()
+ { return wWhat_.c_str(); }
+
+ // Rewrites the message as  <message> at: "<context>"  with *here* markers:
+ // one after the whole context for an EOF token, otherwise one in front of
+ // the token's first character and one in front of its end position.
+ void ParseException::setContext(const wchar_t * context)
+ {
+     // TODO legacy of the obsoleted describe() implementation - could be
+     // done with direct substring/concat operations instead of a scan
+     std::wstring annotated(wWhat_);
+     annotated += L" at: \"";
+     if (where_.type() == Lex::TOKEN_EOF) {
+         annotated += context;
+         annotated += L"*here*";
+     } else {
+         const wchar_t* const tokenBegin = where_.begin();
+         const wchar_t* const tokenEnd = where_.end();
+         const wchar_t* pos = context;
+         while (true) {
+             if (pos == tokenBegin) annotated += L"*here*";
+             if (pos == tokenEnd) annotated += L"*here*";
+             if (*pos == L'\0') break;
+             annotated += *pos;
+             ++pos;
+         }
+     }
+     annotated += L"\"";
+     wWhat_.swap(annotated);
+ }
+
+ namespace Lit {
+
+ // Decodes a TOKEN_STRLIT token: strips the quote characters and resolves the
+ // escapes for NUL, newline, carriage return and tab; any other escaped character stands for itself.
+ std::wstring ParseString(const Lex::Token& token) {
+     if (token.type() != Lex::TOKEN_STRLIT) {
+         std::wostringstream msg;
+         msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
+         throw ParseException(msg.str().c_str(), token);
+     }
+     std::wstring ret;
+     // NOTE: We are assuming that the literal quotation marks are one character wide
+     const wchar_t* const last = token.end() - 1; // closing quote
+     for (const wchar_t* p = token.begin() + 1; p < last; ++p) {
+         if (*p != Lex::ESCAPE_SYMBOL) {
+             ret += *p;
+             continue;
+         }
+         ++p; // look at the escaped character
+         switch (*p) {
+             case L'0':
+                 // appended as a character: the string literal form is an empty C string and adds nothing
+                 ret += L'\0'; break;
+             case L'n':
+                 ret += L'\n'; break;
+             case L'r':
+                 ret += L'\r'; break;
+             case L't':
+                 ret += L'\t'; break;
+             default:
+                 ret += *p; break;
+         }
+     }
+     return ret;
+ }
+
+ // Converts a TOKEN_INTLIT token with wcstol (base 10); throws ParseException for other types.
+ long ParseInteger(const Lex::Token& token) {
+     if (token.type() == Lex::TOKEN_INTLIT) {
+         return wcstol(token.begin(), NULL, 10); // end pointer not needed
+     }
+     std::wostringstream msg;
+     msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
+     throw ParseException(msg.str().c_str(), token);
+ }
+ // Converts a TOKEN_REALLIT token to double; throws ParseException for other types.
+ double ParseReal(const Lex::Token& token) {
+     if (token.type() != Lex::TOKEN_REALLIT) {
+         std::wostringstream msg;
+         msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type();
+         throw ParseException(msg.str().c_str(), token);
+     }
+     return wcstod(token.text().c_str(), NULL); // on a copy: wcstod must not read past the token
+ }
+ }
+
+
+ Lexer::Lexer(Lex::TokenIterator& tokens) // reads its tokens from the given iterator
+     : Lex::TokenReader(tokens) {}
+
+ // Like TokenReader::operator++, but a read at the end of input throws
+ // ParseException("Unexpected EOF") carrying a TOKEN_EOF token.
+ Lex::Token Lexer::operator++(int) {
+     if (!*this) throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));
+     return Lex::TokenReader::operator++(0);
+ }
+
+ // Consumes the next token and returns it; throws ParseException unless it
+ // has type tokenType.
+ Lex::Token Lexer::eat(int tokenType) {
+     const Lex::Token token = (*this)++;
+     if (token.type() == tokenType) return token;
+     std::wostringstream msg;
+     msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();
+     throw ParseException(msg.str().c_str(), token);
+ }
+ // Consumes the next token and returns its text; throws ParseException
+ // unless it is a TOKEN_ID token.
+ std::wstring Lexer::eatId() {
+     const Lex::Token token = (*this)++;
+     if (token.type() == Lex::TOKEN_ID) return token.text();
+     std::wostringstream msg;
+     msg<<L"Expected identifier instead of token '"<<token.text()<<"' of type "<<token.type();
+     throw ParseException(msg.str().c_str(), token);
+ }
+
+ // Does nothing at the end of input; otherwise consumes a token and throws ParseException naming it.
+ void Lexer::eatEof() {
+     if (!*this) return;
+     const Lex::Token token = (*this)++;
+     std::wostringstream msg;
+     msg<<L"Expected EOF instead of '"<<token.text()<<"' of type "<<token.type();
+     throw ParseException(msg.str().c_str(), token);
+ }
+
+ // Consumes the next token and decodes it with Lit::ParseString.
+ std::wstring Lexer::eatString()
+ { return Lit::ParseString((*this)++); }
+
+ // Consumes the next token and converts it with Lit::ParseInteger.
+ long Lexer::eatInteger()
+ { return Lit::ParseInteger((*this)++); }
+
+ // Consumes the next token and converts it with Lit::ParseReal.
+ double Lexer::eatReal()
+ { return Lit::ParseReal((*this)++); }
+
+ // Chain: tokens_ reads text through tokenizer, ws_ reads tokens_, and the
+ // Lexer base reads ws_. The base is initialised first, so it is handed ws_
+ // before ws_ is constructed and must only store the reference.
+ StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text)
+     : Lexer(ws_), tokens_(tokenizer, text), ws_(tokens_)
+ {}
+
+
+ } // Parser
+} // Cpt
+