searchengine/util/cpixtools/src/cpixparsetools.cpp
changeset 0 671dee74050a
child 8 6547bf8ca13a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/util/cpixtools/src/cpixparsetools.cpp	Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,759 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+/*
+ * cpixparsetools.cpp
+ *
+ *  Created on: Apr 14, 2009
+ *      Author: admin
+ */
+
+#include "cpixparsetools.h"
+#include "cpixtools.h"
+
+#include <iostream>
+#include <sstream>
+#include <stdlib.h>
+#include <wctype.h> // for iswdigit, iswspace, iswalpha, iswalnum
+
+namespace Cpt {
+
+
+    namespace Lex {
+	
+        const wchar_t ESCAPE_SYMBOL = L'\\';
+	
+        Tokenizer::~Tokenizer() {}
+	
+        LexException::LexException(const wchar_t* wWhat, 
+                                   const wchar_t* where) 
+            : wWhat_(wWhat), 
+              where_(where) {
+            ;
+        }
+
+        LexException::~LexException() 
+        {
+            ;
+        }
+
+        const wchar_t* LexException::where() const {
+            return where_;
+        }
+
+        const wchar_t* LexException::wWhat() const throw() {
+            return wWhat_.c_str();
+        }
+
+        void LexException::setContext(const wchar_t * context)
+        {
+            // TODO legacy of the obsoleted describe() implementation -
+            // this could be optimized by doing direct substring/concat
+            // operations instead of looping through the context
+            std::wstring tmp;
+            tmp += wWhat_; 
+            tmp += L" at: \""; 
+            for (; ; context++) {
+                if (context == where_) {
+                    tmp += L"*here*";
+                }
+                if (!*context) {
+                    break; 
+                }
+                tmp += *context;
+            }
+            tmp += L"\"";
+
+            wWhat_ = tmp;
+        }
+
+
+        Token::Token(int type, const wchar_t* begin, const wchar_t* end) 
+            : type_(type), begin_(begin), end_(end) {
+        }
+
+        Token::Token() 
+            : type_(0), begin_(0), end_(0) {
+        }
+		
+        int Token::type() const { return type_; }
+        const wchar_t* Token::begin() const { return begin_; }
+        const wchar_t* Token::end() const { return end_; }
+        int Token::length() const { return end_ - begin_; }
+        std::wstring Token::text() const {
+            return std::wstring(begin_, end_);
+        }
+
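+        /*
+         * The tokenizer implementations below follow a common incremental
+         * protocol: consume() is fed one character at a time and answers
+         * TOKENIZER_HUNGRY (feed more input), TOKENIZER_FINISHED (a token
+         * is ready and can be fetched with get()) or TOKENIZER_FAILED
+         * (this tokenizer cannot match the input). reset() re-arms the
+         * tokenizer for recognizing the next token.
+         */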
+        StrLitTokenizer::StrLitTokenizer(wchar_t citate) 
+            : 	citate_(citate)
+        {	
+            reset(); 
+        }
+						
+        void StrLitTokenizer::reset() 
+        { 
+            escape_ = false;
+            opened_ = false;
+            begin_ = 0;
+            end_ = 0; 
+        }
+        Token StrLitTokenizer::get() 
+        { 
+            return Token( TOKEN_STRLIT, begin_, end_ ); 
+        }
+        TokenizerState StrLitTokenizer::consume(const wchar_t* cursor) 
+        {
+            if (!*cursor) return TOKENIZER_FAILED; // fail always on EOF
+            if (!opened_) {
+                if (*cursor != citate_) {
+                    return TOKENIZER_FAILED; 
+                }
+                opened_ = true;
+                begin_ = cursor; 
+            } else if (escape_) {
+                escape_ = false;
+            } else if (*cursor == citate_) {
+                end_ = cursor + 1; 
+                return TOKENIZER_FINISHED;
+            } else if (*cursor == ESCAPE_SYMBOL) {
+                escape_ = true;
+            }
+            return TOKENIZER_HUNGRY; 
+        }
+		
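+        // Matches an optional sign (+/-) followed by decimal digits;
+        // finishes on the first non-digit, which is excluded from the token.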
+        IntLitTokenizer::IntLitTokenizer() {
+            reset();
+        }
+
+        void IntLitTokenizer::reset() {
+            begin_ = NULL;
+            end_ = NULL;
+            beginning_ = true;
+        }
+
+        Token IntLitTokenizer::get() {
+            return Token(TOKEN_INTLIT, begin_, end_);
+        }
+
+        TokenizerState IntLitTokenizer::consume(const wchar_t * cursor) {
+            TokenizerState
+                rv = TOKENIZER_HUNGRY;
+
+            if (beginning_)
+                {
+                    if (*cursor != L'+'
+                        && *cursor != L'-'
+                        && !iswdigit(*cursor))
+                        {
+                            rv = TOKENIZER_FAILED;
+                        }
+                    beginning_ = false;
+                    begin_ = cursor;
+                }
+            else if (!iswdigit(*cursor))
+                {
+                    rv = TOKENIZER_FINISHED;
+                    end_ = cursor;
+                }
+
+            return rv;
+        }
+
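+        // Like IntLitTokenizer, but additionally accepts a single decimal
+        // point; a second '.' or any other non-digit terminates the token.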
+        RealLitTokenizer::RealLitTokenizer() {
+            reset();
+        }
+
+        void RealLitTokenizer::reset() {
+            begin_ = NULL;
+            end_ = NULL;
+            beginning_ = true;
+            hadDotAlready_ = false;
+        }
+
+        Token RealLitTokenizer::get() {
+            return Token(TOKEN_REALLIT, begin_, end_);
+        }
+
+        TokenizerState RealLitTokenizer::consume(const wchar_t * cursor) {
+            TokenizerState
+                rv = TOKENIZER_HUNGRY;
+
+            if (beginning_)
+                {
+                    if (*cursor != L'+'
+                        && *cursor != L'-'
+                        && !iswdigit(*cursor)
+                        && *cursor != L'.')
+                        {
+                            rv = TOKENIZER_FAILED;
+                        }
+                    beginning_ = false;
+                    begin_ = cursor;
+                }
+            else if (*cursor == L'.')
+                {
+                    if (hadDotAlready_)
+                        {
+                            rv = TOKENIZER_FINISHED;
+                            end_ = cursor;
+                        }
+
+                    hadDotAlready_ = true;
+                }
+            else if (!iswdigit(*cursor))
+                {
+                    rv = TOKENIZER_FINISHED;
+                    end_ = cursor;
+                }
+
+            return rv;
+        }
+
+        WhitespaceTokenizer::WhitespaceTokenizer() { 
+            reset(); 
+        }
+
+        void WhitespaceTokenizer::reset() 
+        { 
+            empty_ = true; 
+            begin_ = 0;
+            end_ = 0; 
+        }
+		
+        Token WhitespaceTokenizer::get() 
+        {
+            return Token( TOKEN_WS, begin_, end_ );
+        }
+		
+        TokenizerState WhitespaceTokenizer::consume(const wchar_t* cursor) 
+        {
+            if (!begin_) begin_ = cursor; 
+
+            if (iswspace(*cursor)) {
+                empty_ = false;
+            } else {
+                end_ = cursor; 
+                return empty_ ? TOKENIZER_FAILED : TOKENIZER_FINISHED; 
+            }
+            return TOKENIZER_HUNGRY;  
+        }
+		
+        IdTokenizer::IdTokenizer() 
+        { 
+            reset();
+        }
+		
+        void IdTokenizer::reset() 
+        {
+            begin_ = 0; 
+            end_ = 0;  
+        }
+		
+		
+        Token IdTokenizer::get() 
+        {
+            return Token( TOKEN_ID, begin_, end_ );
+        }
+		
+        TokenizerState IdTokenizer::consume(const wchar_t* cursor) 
+        {
+            if (!begin_) begin_ = cursor; 
+            if (cursor == begin_ && !iswalpha(*cursor)) {
+                return TOKENIZER_FAILED;
+            } else if (cursor > begin_ && !iswalnum(*cursor)) {  
+                end_ = cursor;
+                return TOKENIZER_FINISHED; 
+            } 
+            return TOKENIZER_HUNGRY; 
+        }
+
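+        // Matches one fixed symbol string (e.g. an operator or keyword),
+        // reporting it with the token type supplied by the caller.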
+        SymbolTokenizer::SymbolTokenizer(int tokenType, const wchar_t* symbol) 
+            : tokenType_( tokenType ), 
+              symbol_( symbol ) 
+        {
+            reset(); // make sure begin_ is initialized even before an explicit reset()
+        }
+		
+        void SymbolTokenizer::reset() {
+            begin_ = 0; 
+        }
+		
+        Token SymbolTokenizer::get() {
+            return Token( tokenType_, begin_, end_ );
+        }
+		
+        TokenizerState SymbolTokenizer::consume(const wchar_t* cursor) {
+            if (!begin_) begin_ = cursor; 
+            if (symbol_[cursor-begin_] == *cursor) {
+                if (!symbol_[cursor-begin_+1]) {
+                    // we reached end of symbol
+                    end_ = cursor + 1; 
+                    return TOKENIZER_FINISHED;
+                } 
+                return TOKENIZER_HUNGRY; 
+            } else {
+                return TOKENIZER_FAILED; 
+            }
+        }
+		
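+        /*
+         * Runs all sub-tokenizers in parallel on the same input. Consuming
+         * fails only once every sub-tokenizer has failed; get() returns the
+         * longest of the finished matches (maximal munch), so e.g. an
+         * identifier tokenizer can coexist with fixed-keyword tokenizers.
+         */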
+        MultiTokenizer::MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers) 
+            : ownTokenizers_(ownTokenizers)
+        {
+            int len = 0; while (tokenizers[len]) len++; 
+            tokenizers_.assign(tokenizers,
+                               tokenizers + len);
+            states_ = new TokenizerState[len]; 
+            reset(); 
+        }
+
+        MultiTokenizer::~MultiTokenizer()
+        {
+            if (ownTokenizers_) 
+                {
+                    typedef std::vector<Tokenizer*>::iterator iterator; 
+                    for (iterator i = tokenizers_.begin(); i != tokenizers_.end(); ) 
+                        {
+                            delete *(i++); 
+                        }
+                }
+            delete[] states_; 
+        }
+		
+
+        void MultiTokenizer::reset() 
+        {
+            TokenizerState* s = states_;
+            running_ = 0; 
+            std::vector<Tokenizer*>::iterator
+                i = tokenizers_.begin(),
+                end = tokenizers_.end();
+
+            for (; i != end; ++i, ++s) {
+                (*i)->reset();
+                (*s) = TOKENIZER_HUNGRY;
+                running_++; 
+            }
+            found_ = false;
+        }
+		 
+        Token MultiTokenizer::get() 
+        {
+            Token token(TOKEN_UNKNOWN, 0, 0); 
+            TokenizerState* s = states_;
+            std::vector<Tokenizer*>::iterator
+                i = tokenizers_.begin(),
+                end = tokenizers_.end();
+
+            for (; i != end; ++i, ++s ) {
+                if (*s == TOKENIZER_FINISHED) {
+                    Token c = (*i)->get(); 
+                    if (c.length() > token.length()) {
+                        token = c; 
+                    }
+                }
+            }
+            if (token.length() == 0) {
+                // NOTE: not really a lexical exception, but a logical one
+                throw LexException(L"Trying to get a token when none is ready.", 0); 
+            }
+            return token;
+        }
+
+        TokenizerState MultiTokenizer::consume(const wchar_t* cursor) {
+            TokenizerState* s = states_;
+            std::vector<Tokenizer*>::iterator
+                i = tokenizers_.begin(),
+                end = tokenizers_.end();
+
+            for (; i != end; ++i, ++s) {
+                if (*s == TOKENIZER_HUNGRY) 
+                    {
+                        *s = (*i)->consume(cursor);
+                        if (*s != TOKENIZER_HUNGRY) running_--; 
+                        if (*s == TOKENIZER_FINISHED) {
+                            found_ = true; 
+                        }
+                    }
+            }
+            if (running_ == 0) {
+                return found_ ? TOKENIZER_FINISHED : TOKENIZER_FAILED; 
+            }
+            return TOKENIZER_HUNGRY;
+        }
+		
+
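+        // Convenience tokenizer accepting any literal (string, integer or
+        // real); the winning sub-token is relabeled with type TOKEN_LIT.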
+        LitTokenizer::LitTokenizer(wchar_t citate)
+            : multiTokenizer_(NULL)
+        {
+            using namespace std;
+
+            auto_ptr<StrLitTokenizer>
+                s(new StrLitTokenizer(citate));
+            auto_ptr<IntLitTokenizer>
+                i(new IntLitTokenizer);
+            auto_ptr<RealLitTokenizer>
+                r(new RealLitTokenizer);
+
+            Tokenizer * tokenizers[] = {
+                s.get(),
+                i.get(),
+                r.get(),
+                NULL
+            };
+
+            multiTokenizer_ = new MultiTokenizer(tokenizers, true);
+                
+            s.release();
+            i.release();
+            r.release();
+
+            reset();
+        }
+
+
+        LitTokenizer::~LitTokenizer()
+        {
+            delete multiTokenizer_;
+        }
+
+        void LitTokenizer::reset()
+        {
+            multiTokenizer_->reset();
+        }
+
+        Token LitTokenizer::get()
+        {
+            Token
+                subToken = multiTokenizer_->get();
+
+            return Token(TOKEN_LIT,
+                         subToken.begin(),
+                         subToken.end());
+        }
+
+        TokenizerState LitTokenizer::consume(const wchar_t * cursor)
+        {
+            return multiTokenizer_->consume(cursor);
+        }
+            
+        TokenIterator::~TokenIterator() {}
+
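+        /*
+         * Adapts a Tokenizer into a TokenIterator over a text buffer:
+         * characters are pushed into the tokenizer until it finishes or
+         * fails, and unrecognized input raises a LexException.
+         */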
+        Tokens::Tokens(Tokenizer& tokenizer, const wchar_t* text)
+            :	cursor_(text),
+                tokenizer_(tokenizer), 
+                hasNext_(false)
+        {}
+		
+        Tokens::operator bool() {
+            prepareNext(); 
+            return hasNext_;
+        } 
+		
+        Token Tokens::operator++(int) {
+            prepareNext();
+            if (!hasNext_) {
+                throw LexException(L"Out of tokens.", cursor_);
+            }
+            hasNext_ = false;
+            // get the token
+            Token ret = tokenizer_.get();
+            cursor_ = ret.end();
+            return ret;
+        }
+				
+        void Tokens::prepareNext() {
+            if (!hasNext_ && *cursor_) {
+                const wchar_t* begin = cursor_; 
+                tokenizer_.reset(); 
+                TokenizerState state = TOKENIZER_HUNGRY;
+                while (state == TOKENIZER_HUNGRY) {
+                    state = tokenizer_.consume(cursor_);
+                    if (*cursor_) cursor_++; // don't go beyond eof. 
+                }
+                if (state == TOKENIZER_FAILED) {
+                    std::wostringstream msg; 
+                    msg<<L"Unrecognized syntax: '";
+                    for (int i = 0; &begin[i] < cursor_; i++) msg<<begin[i];
+                    msg<<L"'";
+                    throw LexException(msg.str().c_str(), begin); 
+                } else { 
+                    // Means that: state == TOKENIZER_FINISHED
+                    hasNext_ = true; 
+                }
+            }
+        }
+
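+        // TokenIterator decorator that silently discards TOKEN_WS tokens.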
+        WhiteSpaceFilter::WhiteSpaceFilter(TokenIterator& tokens) 
+            :	tokens_(tokens), next_(), hasNext_(false) {}
+		
+        WhiteSpaceFilter::operator bool()
+        {
+            prepareNext();
+            return hasNext_; 
+        }
+		
+        Token WhiteSpaceFilter::operator++(int)
+        {
+            prepareNext();
+            if (!hasNext_) {
+                throw LexException(L"Out of tokens", 0); 
+            }
+            hasNext_ = false;
+            return next_;
+        }
+        void WhiteSpaceFilter::prepareNext()
+        {
+            while (!hasNext_ && tokens_) {
+                next_ = tokens_++;
+                if (next_.type() != TOKEN_WS) {
+                    hasNext_ = true; 
+                }
+            }
+        }
+		
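+        /*
+         * TokenIterator with backtracking: pushMark() records the current
+         * position, popMark() rewinds all tokens read since the matching
+         * mark (they will be replayed), and clearMark() commits, dropping
+         * the replay buffer once no marks remain.
+         */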
+        TokenReader::TokenReader(TokenIterator& tokens) 
+            :	tokens_(tokens), 
+                location_(0),
+                forward_(), 
+                backward_(), 
+                marks_()
+        {}
+		
+		
+        TokenReader::operator bool() {
+            return !forward_.empty() || tokens_; 
+        }
+		
+        Token TokenReader::operator++(int) {
+            Token token; 
+            if (forward_.size() > 0) {
+                token = forward_.back();
+                forward_.pop_back(); 
+            } else {
+                token = tokens_++; 
+            }
+            if (!marks_.empty()) {
+                backward_.push_back(token);  
+            }
+            location_++; 
+            return token; 
+        }
+
+        Token TokenReader::peek() {
+            if (forward_.empty()) {
+                Token token = (*this)++;
+                forward_.push_back(token); 
+                return token; 
+            } else {
+                return forward_.back(); 
+            }
+        }
+
+        void TokenReader::pushMark() {
+            marks_.push_back(location_); 
+        }
+		
+        void TokenReader::popMark() {
+            int mark = marks_.back(); marks_.pop_back();
+            while (location_ > mark) {
+                forward_.push_back(backward_.back()); 
+                backward_.pop_back();
+                location_--;
+            }
+        }
+		
+        void TokenReader::clearMark() {
+            marks_.pop_back();
+            if (marks_.empty()) {
+                backward_.clear(); 
+            }
+        }
+		
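+        /*
+         * Illustrative usage sketch (not from the original sources; assumes
+         * the declarations in cpixparsetools.h):
+         *
+         *   WhitespaceTokenizer ws;
+         *   IdTokenizer id;
+         *   Tokenizer* ts[] = { &ws, &id, NULL };
+         *   MultiTokenizer multi(ts, false); // does not own ws/id
+         *   Tokens tokens(multi, L"hello world");
+         *   WhiteSpaceFilter filtered(tokens);
+         *   while (filtered) {
+         *       Token t = filtered++;  // yields L"hello", then L"world"
+         *   }
+         */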
+    } // Lex 
+	
+    namespace Parser {
+	
+        ParseException::ParseException(const wchar_t* wWhat, 
+                                       const Lex::Token& where) 
+            : wWhat_(wWhat), 
+              where_(where) {
+            ;
+        }
+		
+
+        Lex::Token ParseException::where() const {
+            return where_;
+        }
+
+
+        const wchar_t* ParseException::wWhat() const throw() {
+            return wWhat_.c_str();
+        }
+		
+        void ParseException::setContext(const wchar_t * context)
+        {
+            // TODO legacy of the obsoleted describe() implementation -
+            // this could be optimized by doing direct substring/concat
+            // operations instead of looping through the context
+            std::wstring tmp;
+            tmp += wWhat_; 
+            tmp += L" at: \""; 
+            if (where_.type() == Lex::TOKEN_EOF) {
+                tmp += context; 
+                tmp += L"*here*";
+            } else {
+                for (; ; context++) {
+                    if (context == where_.begin()) {
+                        tmp += L"*here*";
+                    }
+                    if (context == where_.end()) {
+                        tmp += L"*here*";
+                    }
+                    if (!*context) break; 
+                    tmp += *context;
+                }
+            }
+            tmp += L"\"";
+
+            wWhat_ = tmp;
+        }
+		
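+        /*
+         * Helpers converting literal tokens into values. ParseString strips
+         * the quotation marks and decodes backslash escapes (\0 \n \r \t);
+         * any other escaped character stands for itself.
+         */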
+        namespace Lit {
+		
+            std::wstring ParseString(const Lex::Token& token) {
+                if (token.type() != Lex::TOKEN_STRLIT) {
+                    std::wostringstream msg; 
+                    msg<<L"Expected string literal instead of token '"<<token.text()<<L"' of type "<<token.type(); 
+                    throw ParseException(msg.str().c_str(), token);  
+                }
+                std::wstring ret; 
+                const wchar_t* text = token.begin(); 
+                // NOTE: We are assuming that the literal quotation marks are one character wide
+                for (int i = 1; &text[i] < token.end()-1; i++) {// skip first and last characters
+                    if (text[i] == Lex::ESCAPE_SYMBOL) {
+                        i++; 
+                        switch (text[i]) {
+                        case L'0':
+                            ret += L'\0'; // L"\0" would append nothing; append the NUL character itself
+                            break;
+                        case L'n':
+                            ret += L'\n';
+                            break;
+                        case L'r':
+                            ret += L'\r';
+                            break;
+                        case L't':
+                            ret += L'\t';
+                            break;
+                        default: 
+                            ret += text[i]; 
+                        }
+                    } else {
+                        ret += text[i];
+                    }
+                }
+                return ret; 
+            }
+            long ParseInteger(const Lex::Token& token) {
+                if (token.type() != Lex::TOKEN_INTLIT) {
+                    std::wostringstream msg; 
+                    msg<<L"Expected integer literal instead of token '"<<token.text()<<L"' of type "<<token.type(); 
+                    throw ParseException(msg.str().c_str(), token);  
+                }
+                wchar_t* end = const_cast<wchar_t*>(token.end());
+                return wcstol(token.begin(), &end, 10);
+            }
+            double ParseReal(const Lex::Token& token) {
+                if (token.type() != Lex::TOKEN_REALLIT) {
+                    std::wostringstream msg; 
+                    msg<<L"Expected real literal instead of token '"<<token.text()<<L"' of type "<<token.type(); 
+                    throw ParseException(msg.str().c_str(), token);  
+                }
+                wchar_t* end = const_cast<wchar_t*>(token.end());
+                return wcstod(token.begin(), &end);
+            }
+        }
+		
+
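+        /*
+         * Lexer layers convenience 'eat' operations over TokenReader: each
+         * eatXxx() consumes one token and throws ParseException if it does
+         * not have the expected type.
+         */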
+        Lexer::Lexer(Lex::TokenIterator& tokens) : Lex::TokenReader(tokens) {
+        }
+	
+        Lex::Token Lexer::operator++(int) {
+            if (*this) {
+                return Lex::TokenReader::operator++(0); 
+            }
+            throw ParseException(L"Unexpected EOF", Lex::Token(Lex::TOKEN_EOF, 0, 0));  
+        }
+
+        Lex::Token Lexer::eat(int tokenType) {
+            Lex::Token token = ((*this)++);
+            if (token.type() != tokenType) {
+                std::wostringstream msg; 
+                msg<<"Expected token of type "<<tokenType<<" instead of token '"<<token.text()<<"' of type "<<token.type();  
+                throw ParseException(msg.str().c_str(), token);  
+            }
+            return token; 
+        }
+        std::wstring Lexer::eatId() {
+            Lex::Token token = ((*this)++);
+            if (token.type() != Lex::TOKEN_ID) {
+                std::wostringstream msg; 
+                msg<<L"Expected identifier instead of token '"<<token.text()<<"' of type "<<token.type(); 
+                throw ParseException(msg.str().c_str(), token);  
+            }
+            return token.text(); 
+        }
+
+        void Lexer::eatEof() {
+            if (*this) {
+                Lex::Token token = ((*this)++);
+                std::wostringstream msg; 
+                msg<<L"Expected EOF instead of '"<<token.text()<<"' of type "<<token.type(); 
+                throw ParseException(msg.str().c_str(), token);  
+            }
+        }
+
+        std::wstring Lexer::eatString() {
+            return Lit::ParseString((*this)++); 
+        }
+
+        long Lexer::eatInteger() {
+            return Lit::ParseInteger((*this)++); 
+        }
+
+        double Lexer::eatReal() {
+            return Lit::ParseReal((*this)++); 
+        }
+
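+        /*
+         * StdLexer wires up the full pipeline: tokens_ produces raw tokens
+         * from the given tokenizer, ws_ (per the header, a whitespace
+         * filter) discards TOKEN_WS, and the Lexer base consumes the
+         * filtered stream. Note the initialization order: the Lexer base
+         * receives a reference to ws_ before ws_ is constructed, which is
+         * safe only because the base merely stores the reference.
+         *
+         * Illustrative usage sketch (not from the original sources):
+         *
+         *   Lex::WhitespaceTokenizer ws;
+         *   Lex::IdTokenizer id;
+         *   Lex::Tokenizer* ts[] = { &ws, &id, NULL };
+         *   Lex::MultiTokenizer multi(ts, false);
+         *   StdLexer lexer(multi, L"foo bar");
+         *   std::wstring a = lexer.eatId(); // L"foo"
+         *   std::wstring b = lexer.eatId(); // L"bar"
+         *   lexer.eatEof();
+         */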
+        StdLexer::StdLexer(Lex::Tokenizer& tokenizer, const wchar_t* text) 
+            : Lexer(ws_),
+              tokens_(tokenizer, text), 
+              ws_(tokens_)
+              
+        {}
+		
+		
+    } // Parser
+} // Cpt
+