searchengine/util/cpixtools/src/cpixparsetools.cpp
changeset 0 671dee74050a
child 8 6547bf8ca13a
equal deleted inserted replaced
-1:000000000000 0:671dee74050a
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 /*
       
    18  * cpixparsetools.cpp
       
    19  *
       
    20  *  Created on: Apr 14, 2009
       
    21  *      Author: admin
       
    22  */
       
    23 
       
#include "cpixparsetools.h"

#include "cpixtools.h"

#include <iostream>
#include <sstream>
#include <stdlib.h>
#include <wctype.h>
       
    30 
       
    31 namespace Cpt {
       
    32 
       
    33 
       
    34     namespace Lex {
       
    35 	
       
        // Escape character recognized inside string literals (used by
        // StrLitTokenizer::consume and Lit::ParseString).
        const wchar_t ESCAPE_SYMBOL = '\\';

        // Out-of-line destructor anchoring the abstract Tokenizer interface.
        Tokenizer::~Tokenizer() {}
       
    39 	
       
    40         LexException::LexException(const wchar_t* wWhat, 
       
    41                                    const wchar_t* where) 
       
    42             : wWhat_(wWhat), 
       
    43               where_(where) {
       
    44             ;
       
    45         }
       
    46 
       
    47         LexException::~LexException() 
       
    48         {
       
    49             ;
       
    50         }
       
    51 
       
    52         const wchar_t* LexException::where() const {
       
    53             return where_;
       
    54         }
       
    55 
       
    56         const wchar_t* LexException::wWhat() const throw() {
       
    57             return wWhat_.c_str();
       
    58         }
       
    59 
       
    60         void LexException::setContext(const wchar_t * context)
       
    61         {
       
    62             // TODO legacy of implementation of obsoleted describe() -
       
    63             // it can be optimized by doind direct substring - concat
       
    64             // operations instead of looping through context
       
    65             std::wstring tmp;
       
    66             tmp += wWhat_; 
       
    67             tmp += L" at: \""; 
       
    68             for (; ; context++) {
       
    69                 if (context == where_) {
       
    70                     tmp += L"*here*";
       
    71                 }
       
    72                 if (!*context) {
       
    73                     break; 
       
    74                 }
       
    75                 tmp += *context;
       
    76             }
       
    77             tmp += L"\"";
       
    78 
       
    79             wWhat_ = tmp;
       
    80         }
       
    81 
       
    82 
       
    83         Token::Token(int type, const wchar_t* begin, const wchar_t* end) 
       
    84             : type_(type), begin_(begin), end_(end) {
       
    85         }
       
    86 
       
    87         Token::Token() 
       
    88             : type_(0), begin_(0), end_(0) {
       
    89         }
       
    90 		
       
    91         int Token::type() const { return type_; }; 
       
    92         const wchar_t* Token::begin() const { return begin_; };
       
    93         const wchar_t* Token::end() const { return end_; };
       
    94         int Token::length() const { return end_ - begin_; };
       
    95         std::wstring Token::text() const {
       
    96             std::wstring ret;
       
    97             for (const wchar_t* i = begin_; i != end_; i++) {
       
    98                 ret += *i; 
       
    99             }
       
   100             return ret; 
       
   101         }
       
   102 
       
   103         StrLitTokenizer::StrLitTokenizer(wchar_t citate) 
       
   104             : 	citate_(citate)
       
   105         {	
       
   106             reset(); 
       
   107         }
       
   108 						
       
   109         void StrLitTokenizer::reset() 
       
   110         { 
       
   111             escape_ = false, 
       
   112                 opened_ = false, 
       
   113                 begin_ = 0;
       
   114             end_ = 0; 
       
   115         }
       
   116         Token StrLitTokenizer::get() 
       
   117         { 
       
   118             return Token( TOKEN_STRLIT, begin_, end_ ); 
       
   119         }
       
   120         TokenizerState StrLitTokenizer::consume(const wchar_t* cursor) 
       
   121         {
       
   122             if (!*cursor) return TOKENIZER_FAILED; // fail always on EOF
       
   123             if (!opened_) 
       
   124                 {
       
   125                     if (*cursor == citate_) 
       
   126                         {
       
   127                             opened_ = true;
       
   128                             begin_ = cursor; 
       
   129                         } else {
       
   130                         return TOKENIZER_FAILED; 
       
   131                     }
       
   132                 } else if (escape_)  {
       
   133                 escape_ = false;
       
   134             } else {
       
   135                 if (*cursor == citate_) {
       
   136                     end_ = cursor+1; 
       
   137                     return TOKENIZER_FINISHED;
       
   138                 } else if (*cursor == '\\') {
       
   139                     escape_ = true;
       
   140                 }
       
   141             } 
       
   142             return TOKENIZER_HUNGRY; 
       
   143         }
       
   144 		
       
   145         IntLitTokenizer::IntLitTokenizer() {
       
   146             reset();
       
   147         }
       
   148 
       
   149         void IntLitTokenizer::reset() {
       
   150             begin_ = NULL;
       
   151             end_ = NULL;
       
   152             beginning_ = true;
       
   153         }
       
   154 
       
   155         Token IntLitTokenizer::get() {
       
   156             return Token(TOKEN_INTLIT, begin_, end_);
       
   157         }
       
   158 
       
   159         TokenizerState IntLitTokenizer::consume(const wchar_t * cursor) {
       
   160             TokenizerState
       
   161                 rv = TOKENIZER_HUNGRY;
       
   162 
       
   163             if (beginning_)
       
   164                 {
       
   165                     if (*cursor != L'+'
       
   166                         && *cursor != L'-'
       
   167                         && !isdigit(*cursor))
       
   168                         {
       
   169                             rv = TOKENIZER_FAILED;
       
   170                         }
       
   171                     beginning_ = false;
       
   172                     begin_ = cursor;
       
   173                 }
       
   174             else if (!isdigit(*cursor))
       
   175                 {
       
   176                     rv = TOKENIZER_FINISHED;
       
   177                     end_ = cursor;
       
   178                 }
       
   179 
       
   180             return rv;
       
   181         }
       
   182 
       
   183         RealLitTokenizer::RealLitTokenizer() {
       
   184             reset();
       
   185         }
       
   186 
       
   187         void RealLitTokenizer::reset() {
       
   188             begin_ = NULL;
       
   189             end_ = NULL;
       
   190             beginning_ = true;
       
   191             hadDotAlready_ = false;
       
   192         }
       
   193 
       
   194         Token RealLitTokenizer::get() {
       
   195             return Token(TOKEN_REALLIT, begin_, end_);
       
   196         }
       
   197 
       
   198         TokenizerState RealLitTokenizer::consume(const wchar_t * cursor) {
       
   199             TokenizerState
       
   200                 rv = TOKENIZER_HUNGRY;
       
   201 
       
   202             if (beginning_)
       
   203                 {
       
   204                     if (*cursor != L'+'
       
   205                         && *cursor != L'-'
       
   206                         && !isdigit(*cursor)
       
   207                         && *cursor != L'.')
       
   208                         {
       
   209                             rv = TOKENIZER_FAILED;
       
   210                         }
       
   211                     beginning_ = false;
       
   212                     begin_ = cursor;
       
   213                 }
       
   214             else if (*cursor == L'.')
       
   215                 {
       
   216                     if (hadDotAlready_)
       
   217                         {
       
   218                             rv = TOKENIZER_FINISHED;
       
   219                             end_ = cursor;
       
   220                         }
       
   221 
       
   222                     hadDotAlready_ = true;
       
   223                 }
       
   224             else if (!isdigit(*cursor))
       
   225                 {
       
   226                     rv = TOKENIZER_FINISHED;
       
   227                     end_ = cursor;
       
   228                 }
       
   229 
       
   230             return rv;
       
   231         }
       
   232 
       
   233         WhitespaceTokenizer::WhitespaceTokenizer() { 
       
   234             reset(); 
       
   235         }
       
   236 
       
   237         void WhitespaceTokenizer::reset() 
       
   238         { 
       
   239             empty_ = true; 
       
   240             begin_ = 0;
       
   241             end_ = 0; 
       
   242         }
       
   243 		
       
   244         Token WhitespaceTokenizer::get() 
       
   245         {
       
   246             return Token( TOKEN_WS, begin_, end_ );
       
   247         }
       
   248 		
       
   249         TokenizerState WhitespaceTokenizer::consume(const wchar_t* cursor) 
       
   250         {
       
   251             if (!begin_) begin_ = cursor; 
       
   252 			
       
   253             if (isspace(*cursor))  
       
   254                 {
       
   255                     empty_ = false;
       
   256                 } else {
       
   257                 end_ = cursor; 
       
   258                 return empty_ ? TOKENIZER_FAILED : TOKENIZER_FINISHED; 
       
   259             }
       
   260             return TOKENIZER_HUNGRY;  
       
   261         }
       
   262 		
       
   263         IdTokenizer::IdTokenizer() 
       
   264         { 
       
   265             reset();
       
   266         }
       
   267 		
       
   268         void IdTokenizer::reset() 
       
   269         {
       
   270             begin_ = 0; 
       
   271             end_ = 0;  
       
   272         }
       
   273 		
       
   274 		
       
   275         Token IdTokenizer::get() 
       
   276         {
       
   277             return Token( TOKEN_ID, begin_, end_ );
       
   278         }
       
   279 		
       
   280         TokenizerState IdTokenizer::consume(const wchar_t* cursor) 
       
   281         {
       
   282             if (!begin_) begin_ = cursor; 
       
   283             if (cursor == begin_ && !isalpha(*cursor)) {
       
   284                 return TOKENIZER_FAILED;
       
   285             } else if (cursor > begin_ && !isalnum(*cursor)) {  
       
   286                 end_ = cursor;
       
   287                 return TOKENIZER_FINISHED; 
       
   288             } 
       
   289             return TOKENIZER_HUNGRY; 
       
   290         }
       
   291 
       
   292         SymbolTokenizer::SymbolTokenizer(int tokenType, const wchar_t* symbol) 
       
   293             : tokenType_( tokenType ), 
       
   294               symbol_( symbol ) 
       
   295         {
       
   296         }
       
   297 		
       
   298         void SymbolTokenizer::reset() {
       
   299             begin_ = 0; 
       
   300         }
       
   301 		
       
   302         Token SymbolTokenizer::get() {
       
   303             return Token( tokenType_, begin_, end_ );
       
   304         }
       
   305 		
       
   306         TokenizerState SymbolTokenizer::consume(const wchar_t* cursor) {
       
   307             if (!begin_) begin_ = cursor; 
       
   308             if (symbol_[cursor-begin_] == *cursor) {
       
   309                 if (!symbol_[cursor-begin_+1]) {
       
   310                     // we reached end of symbol
       
   311                     end_ = cursor + 1; 
       
   312                     return TOKENIZER_FINISHED;
       
   313                 } 
       
   314                 return TOKENIZER_HUNGRY; 
       
   315             } else {
       
   316                 return TOKENIZER_FAILED; 
       
   317             }
       
   318         }
       
   319 		
       
   320         MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) 
       
   321             : ownTokenizers_(ownTokenizers)
       
   322         {
       
   323             int len = 0; while (tokenizers[len]) len++; 
       
   324             tokenizers_.assign(tokenizers,
       
   325                                tokenizers + len);
       
   326             states_ = new TokenizerState[len]; 
       
   327             reset(); 
       
   328         }
       
   329 
       
   330         MultiTokenizer::~MultiTokenizer()
       
   331         {
       
   332             if (ownTokenizers_) 
       
   333                 {
       
   334                     typedef std::vector<Tokenizer*>::iterator iterator; 
       
   335                     for (iterator i = tokenizers_.begin(); i != tokenizers_.end(); ) 
       
   336                         {
       
   337                             delete *(i++); 
       
   338                         }
       
   339                 }
       
   340             delete[] states_; 
       
   341         }
       
   342 		
       
   343 
       
   344         void MultiTokenizer::reset() 
       
   345         {
       
   346             TokenizerState* s = states_;
       
   347             running_ = 0; 
       
   348             std::vector<Tokenizer*>::iterator
       
   349                 i = tokenizers_.begin(),
       
   350                 end = tokenizers_.end();
       
   351 
       
   352             for (; i != end; ++i, ++s) {
       
   353                 (*i)->reset();
       
   354                 (*s) = TOKENIZER_HUNGRY;
       
   355                 running_++; 
       
   356             }
       
   357             found_ = false;
       
   358         }
       
   359 		 
       
   360         Token MultiTokenizer::get() 
       
   361         {
       
   362             Token token(TOKEN_UNKNOWN, 0, 0); 
       
   363             TokenizerState* s = states_;
       
   364             std::vector<Tokenizer*>::iterator
       
   365                 i = tokenizers_.begin(),
       
   366                 end = tokenizers_.end();
       
   367 
       
   368             for (; i != end; ++i, ++s ) {
       
   369                 if (*s == TOKENIZER_FINISHED) {
       
   370                     Token c = (*i)->get(); 
       
   371                     if (c.length() > token.length()) {
       
   372                         token = c; 
       
   373                     }
       
   374                 }
       
   375             }
       
   376             if (token.length() == 0) {
       
   377                 // NOTE: not really a lexical exception, but logical one
       
   378                 throw LexException(L"Trying to get token without a token ready.", 0); 
       
   379             }
       
   380             return token;
       
   381         }
       
   382 
       
   383         TokenizerState MultiTokenizer::consume(const wchar_t* cursor) {
       
   384             TokenizerState* s = states_;
       
   385             std::vector<Tokenizer*>::iterator
       
   386                 i = tokenizers_.begin(),
       
   387                 end = tokenizers_.end();
       
   388 
       
   389             for (; i != end; ++i, ++s) {
       
   390                 if (*s == TOKENIZER_HUNGRY) 
       
   391                     {
       
   392                         *s = (*i)->consume(cursor);
       
   393                         if (*s != TOKENIZER_HUNGRY) running_--; 
       
   394                         if (*s == TOKENIZER_FINISHED) {
       
   395                             found_ = true; 
       
   396                         }
       
   397                     }
       
   398             }
       
   399             if (running_ == 0) {
       
   400                 return found_ ? TOKENIZER_FINISHED : TOKENIZER_FAILED; 
       
   401             }
       
   402             return TOKENIZER_HUNGRY;
       
   403         }
       
   404 		
       
   405 
       
        // Composite tokenizer recognizing any literal: string, integer or
        // real. Delegates to a MultiTokenizer over the three sub-lexers.
        LitTokenizer::LitTokenizer(wchar_t citate)
            : multiTokenizer_(NULL)
        {
            using namespace std;

            // The sub-tokenizers are held in auto_ptrs so they get freed
            // if any of the allocations below throws.
            auto_ptr<StrLitTokenizer>
                s(new StrLitTokenizer(citate));
            auto_ptr<IntLitTokenizer>
                i(new IntLitTokenizer);
            auto_ptr<RealLitTokenizer>
                r(new RealLitTokenizer);

            // NULL-terminated array as required by MultiTokenizer.
            Tokenizer * tokenizers[] = {
                s.get(),
                i.get(),
                r.get(),
                NULL
            };

            // MultiTokenizer takes ownership (second argument true) ...
            multiTokenizer_ = new MultiTokenizer(tokenizers, true);

            // ... so only release the auto_ptrs once that ctor has
            // succeeded; releasing earlier would leak on a throw.
            s.release();
            i.release();
            r.release();

            reset();
        }
       
   433 
       
   434 
       
   435         LitTokenizer::~LitTokenizer()
       
   436         {
       
   437             delete multiTokenizer_;
       
   438         }
       
   439 
       
   440         void LitTokenizer::reset()
       
   441         {
       
   442             multiTokenizer_->reset();
       
   443         }
       
   444 
       
   445         Token LitTokenizer::get()
       
   446         {
       
   447             Token
       
   448                 subToken = multiTokenizer_->get();
       
   449 
       
   450             return Token(TOKEN_LIT,
       
   451                          subToken.begin(),
       
   452                          subToken.end());
       
   453         }
       
   454 
       
   455         TokenizerState LitTokenizer::consume(const wchar_t * cursor)
       
   456         {
       
   457             return multiTokenizer_->consume(cursor);
       
   458         }
       
   459             
       
        // Out-of-line destructor anchoring the TokenIterator interface.
        TokenIterator::~TokenIterator() {}

        // Iterator producing tokens by repeatedly running 'tokenizer' over
        // 'text'. Neither argument is owned; both must outlive this object.
        Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
            :	cursor_(text),
                tokenizer_(tokenizer), 
                hasNext_(false)
        {}
       
   467 		
       
   468         Tokens::operator bool() {
       
   469             prepareNext(); 
       
   470             return hasNext_;
       
   471         } 
       
   472 		
       
   473         Token Tokens::operator++(int) {
       
   474             prepareNext();
       
   475             if (!hasNext_) {
       
   476                 throw LexException(L"Out of tokens.", cursor_);
       
   477             }
       
   478             hasNext_ = false;
       
   479             // get the token
       
   480             Token ret = tokenizer_.get();
       
   481             cursor_ = ret.end();
       
   482             return ret;
       
   483         }
       
   484 				
       
        // Run the tokenizer from cursor_ until it settles, caching the
        // result in hasNext_. Throws on unrecognized input.
        // NOTE(review): termination relies on every tokenizer settling
        // (finished/failed) when fed the terminating NUL — a tokenizer
        // that stays hungry at EOF would loop forever here; the bundled
        // tokenizers all settle. Confirm for any new Tokenizer subclass.
        void Tokens::prepareNext() {
            if (!hasNext_ && *cursor_) {
                const wchar_t* begin = cursor_; 
                tokenizer_.reset(); 
                TokenizerState state = TOKENIZER_HUNGRY;
                while (state == TOKENIZER_HUNGRY) {
                    state = tokenizer_.consume(cursor_);
                    if (*cursor_) cursor_++; // don't go beyond eof. 
                }
                if (state == TOKENIZER_FAILED) {
                    // Report the stretch of text that could not be lexed.
                    std::wostringstream msg; 
                    msg<<L"Unrecognized syntax: '";
                    for (int i = 0; &begin[i] < cursor_; i++) msg<<begin[i];
                    msg<<L"'";
                    throw LexException(msg.str().c_str(), begin); 
                } else { 
                    // Means that: state == TOKENIZER_FINISHED
                    hasNext_ = true; 
                }
            }
        }
       
   506 
       
   507         WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens) 
       
   508             :	tokens_(tokens), next_(), hasNext_(false) {}
       
   509 		
       
   510         WhiteSpaceFilter::operator bool()
       
   511         {
       
   512             prepareNext();
       
   513             return hasNext_; 
       
   514         }
       
   515 		
       
   516         Token WhiteSpaceFilter::operator++(int)
       
   517         {
       
   518             prepareNext();
       
   519             if (!hasNext_) {
       
   520                 throw LexException(L"Out of tokens", 0); 
       
   521             }
       
   522             hasNext_ = false;
       
   523             return next_;
       
   524         }
       
   525         void WhiteSpaceFilter::prepareNext()
       
   526         {
       
   527             while (!hasNext_ && tokens_) {
       
   528                 next_ = tokens_++;
       
   529                 if (next_.type() != TOKEN_WS) {
       
   530                     hasNext_ = true; 
       
   531                 }
       
   532             }
       
   533         }
       
   534 		
       
   535         TokenReader::TokenReader(TokenIterator& tokens) 
       
   536             :	tokens_(tokens), 
       
   537                 location_(0),
       
   538                 forward_(), 
       
   539                 backward_(), 
       
   540                 marks_()
       
   541         {}
       
   542 		
       
   543 		
       
   544         TokenReader::operator bool() {
       
   545             return !forward_.empty() || tokens_; 
       
   546         }
       
   547 		
       
   548         Token TokenReader::operator++(int) {
       
   549             Token token; 
       
   550             if (forward_.size() > 0) {
       
   551                 token = forward_.back();
       
   552                 forward_.pop_back(); 
       
   553             } else {
       
   554                 token = tokens_++; 
       
   555             }
       
   556             if (!marks_.empty()) {
       
   557                 backward_.push_back(token);  
       
   558             }
       
   559             location_++; 
       
   560             return token; 
       
   561         }
       
   562 
       
   563         Token TokenReader::peek() {
       
   564             if (forward_.empty()) {
       
   565                 Token token = (*this)++;
       
   566                 forward_.push_back(token); 
       
   567                 return token; 
       
   568             } else {
       
   569                 return forward_.back(); 
       
   570             }
       
   571         }
       
   572 
       
   573         void TokenReader::pushMark() {
       
   574             marks_.push_back(location_); 
       
   575         }
       
   576 		
       
   577         void TokenReader::popMark() {
       
   578             int mark = marks_.back(); marks_.pop_back();
       
   579             while (location_ > mark) {
       
   580                 forward_.push_back(backward_.back()); 
       
   581                 backward_.pop_back();
       
   582                 location_--;
       
   583             }
       
   584         }
       
   585 		
       
   586         void TokenReader::clearMark() {
       
   587             marks_.back(); marks_.pop_back();
       
   588             if (marks_.empty()) {
       
   589                 backward_.clear(); 
       
   590             }
       
   591         }
       
   592 		
       
   593     } // Lex 
       
   594 	
       
   595     namespace Parser {
       
   596 	
       
   597         ParseException::ParseException(const wchar_t* wWhat, 
       
   598                                        const Lex::Token& where) 
       
   599             : wWhat_(wWhat), 
       
   600               where_(where) {
       
   601             ;
       
   602         }
       
   603 		
       
   604 
       
   605         Lex::Token ParseException::where() const {
       
   606             return where_;
       
   607         }
       
   608 
       
   609 
       
   610         const wchar_t* ParseException::wWhat() const throw() {
       
   611             return wWhat_.c_str();
       
   612         }
       
   613 		
       
        // Rewrites the stored message to embed the full input text with
        // "*here*" markers around the offending token (one marker before
        // its first character, one after its last). An EOF token has no
        // position, so the marker simply goes at the end of the text.
        void ParseException::setContext(const wchar_t * context)
        {
            // TODO legacy of implementation of obsoleted describe() -
            // it can be optimized by doind direct substring - concat
            // operations instead of looping through context
            std::wstring tmp;
            tmp += wWhat_; 
            tmp += L" at: \""; 
            if (where_.type() == Lex::TOKEN_EOF) {
                tmp += context; 
                tmp += L"*here*";
            } else {
                for (; ; context++) {
                    // Marker before the token's first character.
                    if (context == where_.begin()) {
                        tmp += L"*here*";
                    }
                    // Marker after the token's last character.
                    if (context == where_.end()) {
                        tmp += L"*here*";
                    }
                    if (!*context) break; 
                    tmp += *context;
                }
            }
            tmp += L"\"";

            wWhat_ = tmp;
        }
       
   641 		
       
   642         namespace Lit {
       
   643 		
       
   644             std::wstring ParseString(const Lex::Token& token) {
       
   645                 if (token.type() != Lex::TOKEN_STRLIT) {
       
   646                     std::wostringstream msg; 
       
   647                     msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type(); 
       
   648                     throw ParseException(msg.str().c_str(), token);  
       
   649                 }
       
   650                 std::wstring ret; 
       
   651                 const wchar_t* text = token.begin(); 
       
   652                 // NOTE: We are assuming that the literal sitation marks are one character wide
       
   653                 for (int i = 1; &text[i] < token.end()-1; i++) {// skip first and last characters
       
   654                     if (text[i] == Lex::ESCAPE_SYMBOL) {
       
   655                         i++; 
       
   656                         switch (text[i]) {
       
   657                         case '0':
       
   658                             ret += L"\0";
       
   659                             break;
       
   660                         case 'n':
       
   661                             ret += L"\n";
       
   662                             break;
       
   663                         case 'r':
       
   664                             ret += L"\r";
       
   665                             break;
       
   666                         case 't':
       
   667                             ret += L"\t";
       
   668                             break;
       
   669                         default: 
       
   670                             ret += text[i]; 
       
   671                         }
       
   672                     } else {
       
   673                         ret += text[i];
       
   674                     }
       
   675                 }
       
   676                 return ret; 
       
   677             }
       
   678             long ParseInteger(const Lex::Token& token) {
       
   679                 if (token.type() != Lex::TOKEN_INTLIT) {
       
   680                     std::wostringstream msg; 
       
   681                     msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type(); 
       
   682                     throw ParseException(msg.str().c_str(), token);  
       
   683                 }
       
   684                 wchar_t* end = const_cast<wchar_t*>(token.end());
       
   685                 return wcstol(token.begin(), &end, 10);
       
   686             }
       
   687             double ParseReal(const Lex::Token& token) {
       
   688                 if (token.type() != Lex::TOKEN_REALLIT) {
       
   689                     std::wostringstream msg; 
       
   690                     msg<<L"Expected literal instead of token '"<<token.text()<<"' of type "<<token.type(); 
       
   691                     throw ParseException(msg.str().c_str(), token);  
       
   692                 }
       
   693                 wchar_t* end = const_cast<wchar_t*>(token.end());
       
   694                 return wcstod(token.begin(), &end);
       
   695             }
       
   696         }
       
   697 		
       
   698 
       
   699         Lexer::Lexer(Lex::TokenIterator& tokens) : Lex::TokenReader(tokens) {
       
   700         }
       
   701 	
       
   702         Lex::Token Lexer::operator++(int) {
       
   703             if (*this) {
       
   704                 return Lex::TokenReader::operator++(0); 
       
   705             }
       
   706             throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));  
       
   707         }
       
   708 
       
   709         Lex::Token Lexer::eat(int tokenType) {
       
   710             Lex::Token token = ((*this)++);
       
   711             if (token.type() != tokenType) {
       
   712                 std::wostringstream msg; 
       
   713                 msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();  
       
   714                 throw ParseException(msg.str().c_str(), token);  
       
   715             }
       
   716             return token; 
       
   717         }
       
   718         std::wstring Lexer::eatId() {
       
   719             Lex::Token token = ((*this)++);
       
   720             if (token.type() != Lex::TOKEN_ID) {
       
   721                 std::wostringstream msg; 
       
   722                 msg<<L"Expected identifier instead of token '"<<token.text()<<"' of type "<<token.type(); 
       
   723                 throw ParseException(msg.str().c_str(), token);  
       
   724             }
       
   725             return token.text(); 
       
   726         }
       
   727 
       
   728         void Lexer::eatEof() {
       
   729             if (*this) {
       
   730                 Lex::Token token = ((*this)++);
       
   731                 std::wostringstream msg; 
       
   732                 msg<<L"Expected EOF instead of '"<<token.text()<<"' of type "<<token.type(); 
       
   733                 throw ParseException(msg.str().c_str(), token);  
       
   734             }
       
   735         }
       
   736 
       
   737         std::wstring Lexer::eatString() {
       
   738             return Lit::ParseString((*this)++); 
       
   739         }
       
   740 
       
   741         long Lexer::eatInteger() {
       
   742             return Lit::ParseInteger((*this)++); 
       
   743         }
       
   744 
       
   745         double Lexer::eatReal() {
       
   746             return Lit::ParseReal((*this)++); 
       
   747         }
       
   748 
       
        // Convenience lexer: tokenizes 'text', filters whitespace, and
        // exposes the result through the Lexer interface.
        // NOTE(review): 'ws_' is passed to the Lexer base before ws_ itself
        // is constructed. This is well-defined only because TokenReader's
        // constructor merely stores the reference without using it; members
        // then initialize in declaration order (tokens_, then ws_) —
        // fragile, so keep the member declaration order in the header.
        StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) 
            : Lexer(ws_),
              tokens_(tokenizer, text), 
              ws_(tokens_)
              
        {}
       
   755 		
       
   756 		
       
   757     } // Parser
       
   758 } // Cpt
       
   759