searchengine/util/cpixtools/inc/public/cpixparsetools.h
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Tue, 06 Jul 2010 15:30:04 +0300
changeset 10 afe194b6b1cd
parent 0 671dee74050a
permissions -rw-r--r--
Revision: 201025 Kit: 2010127

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/
/*
 * cpixparsetools.h
 *
 *  Created on: Apr 14, 2009
 *      Author: admin
 */

#ifndef CPIXPARSETOOLS_H_
#define CPIXPARSETOOLS_H_

#include <string>
#include <wchar.h>
#include <ctype.h>
#include <exception>
#include <vector>
#include <memory>

namespace Cpt {


    /**
     * Exception class that can gather some contextual information as
     * it ascends in the call stack.
     */
    class ITxtCtxtExc
    {
    public:
        /**
         * Gives the description of the exception as a wide character
         * string. Never throws.
         */
        virtual const wchar_t * wWhat() const throw () = 0;

        /**
         * Attaches contextual information to the exception. After an
         * invocation of setContext, the string returned by wWhat()
         * should be far more informative.
         *
         * @param context the contextual information to add
         */
        virtual void setContext(const wchar_t * context) = 0;


        /**
         * Virtual, so that implementations can be destroyed through
         * this interface. Does nothing by itself.
         */
        virtual ~ITxtCtxtExc() {}
    };



    /**
     * Tools for lexical analysis, that is, for parsing languages of
     * regular expression strength and converting the character
     * stream into a stream of language tokens.  Note: the regular
     * expression syntax itself (e.g. "file*.tx?") is not supported.
     */
    namespace Lex {
    
		/**
		 * Type used for identifying the kind of a token. A token type
		 * is a pointer to a wide character string.
		 * NOTE(review): presumably token types are compared by pointer
		 * identity against the constants below - confirm in the .cpp.
		 */
		typedef const wchar_t* token_type_t; 


		// Token type constants. They are only declared here; the
		// definitions live outside this header.
		extern token_type_t TOKEN_UNKNOWN;
		extern token_type_t TOKEN_EOF;
		extern token_type_t TOKEN_WS; 	// tokens of this type are dropped by StdFilter
		extern token_type_t TOKEN_COMMENT;  
		extern token_type_t TOKEN_ID;	
		extern token_type_t TOKEN_STRLIT;
		extern token_type_t TOKEN_INTLIT;
		extern token_type_t TOKEN_REALLIT;
		extern token_type_t TOKEN_LIT;	// type of the tokens returned by LitTokenizer
		
        /**
         * Exception type of the Lex namespace. Implements the
         * ITxtCtxtExc interface, so contextual information can be
         * attached to it with setContext().
         */
        class LexException : public ITxtCtxtExc {
        public: 
            /**
             * @param what description of the error
             * @param where pointer associated with the error; kept as
             * a plain pointer (see where_), so the pointed data is
             * not copied. NOTE(review): presumably the location in
             * the tokenized text where the error occurred - confirm.
             */
            LexException(const wchar_t* what, const wchar_t* where);
            virtual ~LexException(); 
            /** Returns the 'where' pointer. */
            const wchar_t * where() const;
            /** Returns the description of the error. Never throws. */
            virtual const wchar_t * wWhat() const throw ();
            /** Attaches contextual information to the description. */
            virtual void setContext(const wchar_t * context);
        private:
            std::wstring wWhat_; 
            const wchar_t* where_;
        };
	
        /**
         * A token produced by a tokenizer. Contains pointers into the
         * original data string instead of a copy of the text. NOTE:
         * this object becomes hazardous, if the original tokenized
         * string is modified or released.
         */
        class Token {
        public: 
            /**
             * @param type type of the token (e.g. one of the TOKEN_*
             * constants)
             * @param begin pointer to the beginning of the token in
             * the original text
             * @param end pointer to the end of the token in the
             * original text. NOTE(review): presumably one past the
             * last character of the token - confirm in the .cpp.
             */
            Token(token_type_t type, const wchar_t* begin, const wchar_t* end);
            /**
             * NOTE(review): the values a default constructed token
             * holds are not visible in this header.
             */
            Token();
            /** Type of the token (same type as token_type_t). */
            const wchar_t* type() const;
            /** Beginning of the token in the original text. */
            const wchar_t* begin() const;
            /** End of the token in the original text. */
            const wchar_t* end() const;
            /** Length of the token. */
            int length() const;
            /** Text of the token as a string object. */
            std::wstring text() const; 
        private: 
            token_type_t type_;
            const wchar_t* begin_;
            const wchar_t* end_;
        };
		
        /**
         * Describes the state of a tokenizer. This is what
         * Tokenizer::consume() returns.
         */
        enum TokenizerState { 
            TOKENIZER_HUNGRY, 	///< ok, but no proper token has been formed yet.
            TOKENIZER_FINISHED, ///< has a ready token
            TOKENIZER_FAILED 	///< failed
        };
		
        /**
         * Interface of the tokenizers. Provides a mechanism for
         * extracting tokens of nearly regular expression strength.
         * Usage: call reset(), feed characters with consume() until
         * it returns something else than TOKENIZER_HUNGRY, and on
         * TOKENIZER_FINISHED fetch the token with get().
         */
        class Tokenizer {
        public: 
            virtual ~Tokenizer(); 
            /**
             * Prepares the tokenizer for consuming a new token.
             */
            virtual void reset() = 0;
            /**
             * Returns the token. MUST be called only if the tokenizer
             * has returned TOKENIZER_FINISHED from the last call to
             * consume and reset has not been called since.
             */
            virtual Token get() = 0; 
				
            /**
             * Consumes a character and returns the state of the
             * tokenizer. If the tokenizer returns TOKENIZER_FINISHED,
             * the read token can be requested by using get().
             *
             * @param cursor pointer to the character to consume
             */
            virtual TokenizerState consume(const wchar_t* cursor) = 0; 
        };
		
        /**
         * Tokenizer for string literals. See Tokenizer for the
         * contract of reset(), get() and consume().
         */
        class StrLitTokenizer : public Tokenizer {
        public: 
            /**
             * @param citate string literals are tokenized according
             * to this character. NOTE(review): presumably the quote
             * character that opens and closes a literal - confirm.
             */
            StrLitTokenizer(wchar_t citate); 
            virtual void reset();
            virtual Token get(); 
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            const wchar_t* begin_; 
            const wchar_t* end_; 
            wchar_t citate_;
            // NOTE(review): presumably true once the opening citate
            // character has been consumed - confirm.
            bool opened_; 
            // NOTE(review): presumably true if the previously consumed
            // character was an escape character - confirm.
            bool escape_;
        };
	
        /**
         * Tokenizer for integer literals. See Tokenizer for the
         * contract of reset(), get() and consume().
         * NOTE(review): the accepted syntax (e.g. whether a sign is
         * allowed) is defined in the implementation, not here.
         */
        class IntLitTokenizer : public Tokenizer {
        public:
            IntLitTokenizer();
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t * cursor);
                
        private:
            const wchar_t * begin_;
            const wchar_t * end_;
            // NOTE(review): presumably true until the first character
            // of the literal has been consumed - confirm.
            bool            beginning_;
        };


        /**
         * Tokenizer for real number literals. See Tokenizer for the
         * contract of reset(), get() and consume().
         * NOTE(review): the accepted syntax is defined in the
         * implementation, not here.
         */
        class RealLitTokenizer : public Tokenizer {
        public:
            RealLitTokenizer();
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t * cursor);
                
        private:
            const wchar_t * begin_;
            const wchar_t * end_;
            // NOTE(review): presumably true until the first character
            // of the literal has been consumed - confirm.
            bool            beginning_;
            // NOTE(review): presumably set once the decimal dot has
            // been consumed, so that a second dot can be rejected -
            // confirm.
            bool            hadDotAlready_;
        };

        /**
         * Tokenizer for whitespace. See Tokenizer for the contract of
         * reset(), get() and consume().
         * NOTE(review): presumably the produced tokens are of type
         * TOKEN_WS - confirm in the .cpp.
         */
        class WhitespaceTokenizer : public Tokenizer {
        public: 
            WhitespaceTokenizer();
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            const wchar_t* begin_; 
            const wchar_t* end_; 
            // NOTE(review): presumably true as long as no whitespace
            // character has been consumed - confirm.
            bool empty_;
        };
	
        /**
         * Tokenizer for identifiers. See Tokenizer for the contract
         * of reset(), get() and consume().
         * NOTE(review): which characters form an identifier is
         * defined in the implementation, not here.
         */
        class IdTokenizer : public Tokenizer {
        public: 
            IdTokenizer(); 
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            const wchar_t* begin_; 
            const wchar_t* end_; 
        };
	
        /**
         * Tokenizer for one fixed symbol. See Tokenizer for the
         * contract of reset(), get() and consume().
         */
        class SymbolTokenizer : public Tokenizer {
        public: 
            /**
             * @param tokenType token type associated with the symbol
             * (const wchar_t* is the same type as token_type_t)
             * @param symbol the symbol to recognize. Kept as a plain
             * pointer (see symbol_), so the string is not copied and
             * has to stay alive as long as the tokenizer is used.
             */
            SymbolTokenizer(const wchar_t* tokenType, const wchar_t* symbol);
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            const wchar_t* begin_;
            const wchar_t* end_; 
            token_type_t tokenType_; 
            const wchar_t* symbol_;
        };
 
        /**
         * Tokenizer for C++ style line comments, e.g. // comment
         * See Tokenizer for the contract of reset(), get() and
         * consume().
         */
        class LineCommentTokenizer : public Tokenizer {
        public: 
        	LineCommentTokenizer();
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            // States of the comment recognition. NOTE(review): the
            // transitions are implemented in the .cpp; the meanings
            // below are inferred from the names - confirm.
            enum State {
				READY,           // nothing consumed yet
				SLASH_CONSUMED,  // first slash consumed
				COMMENT,         // inside the comment
				FINISHED         // comment has ended
            };
        	State state_;
        	const wchar_t* begin_; 
        	const wchar_t* end_;
        };

        /**
         * Tokenizer for C style section comments, like the one
         * surrounding this text. See Tokenizer for the contract of
         * reset(), get() and consume().
         */
        class SectionCommentTokenizer : public Tokenizer {
        public: 
        	SectionCommentTokenizer();
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            // States of the comment recognition. NOTE(review): the
            // transitions are implemented in the .cpp; the meanings
            // below are inferred from the names - confirm.
            enum State {
				READY,           // nothing consumed yet
				SLASH_CONSUMED,  // opening slash consumed
				COMMENT,         // inside the comment
				STAR_CONSUMED,   // star consumed inside the comment
				FINISH           // comment has ended
            };
        	State state_;
        	const wchar_t* begin_; 
        	const wchar_t* end_;
        	
        };

        /**
         * Tokenizes text by using given tokenizers. Text is consumed
         * until no tokenizer is in hungry state e.g., all tokenizers
         * are either failed or finished. In case a number of
         * tokenizers have finished, the longest token is used. If a
         * number of tokens have the same length, the order is
         * determined based on order of the tokenizers array. The
         * tokenizers first in the list have highest priority.
         */
        class MultiTokenizer : public Tokenizer {
        public: 
            /**
             * @param tokenizers zero-terminated array of tokenizers.
             * The array itself need not be alive later, but the
             * contained tokenizers do.
             * @param ownTokenizers with the default value (false)
             * nothing passes ownership. NOTE(review): presumably with
             * true the contained tokenizers are owned and destroyed
             * by this object - confirm in the implementation.
             */
            MultiTokenizer(Tokenizer** tokenizers, bool ownTokenizers = false);
            virtual ~MultiTokenizer(); 
            // See Tokenizer for the contract of these three.
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t* cursor);
        private:
            // Not copyable (declared, never defined). The class has a
            // user-declared destructor and holds the raw states_
            // pointer, which the constructor arguments cannot supply,
            // so an implicit memberwise copy would leave two objects
            // sharing it (and, with ownTokenizers_, the tokenizers).
            MultiTokenizer(const MultiTokenizer&);
            MultiTokenizer& operator=(const MultiTokenizer&);
        private:
            int running_;
            bool found_; 
            TokenizerState* states_;
            std::vector<Tokenizer*> tokenizers_;
            bool ownTokenizers_;
        };

        /**
         * Tokenizes whatever literals: string, real or int. The token
         * type of the tokens returned by this class are TOKEN_LIT
         * (you don't know if they are str, int or real).
         */
        class LitTokenizer : public Tokenizer {
        public:
            /**
             * Strings literals will be tokenized according to citate
             * (see StrLitTokenizer constructor).
             */
            LitTokenizer(wchar_t citate);
            virtual ~LitTokenizer();
            // See Tokenizer for the contract of these three.
            virtual void reset();
            virtual Token get();
            virtual TokenizerState consume(const wchar_t * cursor);

        private:
            // Not copyable (declared, never defined). The class has a
            // user-declared destructor and holds a raw pointer to a
            // MultiTokenizer that is not passed in by the caller, so
            // an implicit memberwise copy would leave two objects
            // sharing the same MultiTokenizer.
            LitTokenizer(const LitTokenizer&);
            LitTokenizer& operator=(const LitTokenizer&);

        private:
            MultiTokenizer   * multiTokenizer_;
        };

        /**
         * Not-so-C++ style iterator for iterating through tokens.
         * When having a variable i of type TokenIterator, the check
         * if (i) can be used to see whether there are more tokens,
         * and i++ returns the next token.
         */
        class TokenIterator {
        public:
            /**
             * Returns true, if token iterator contains more tokens
             */
            virtual operator bool() = 0;
				
            /**
             * Returns the next token in the stream.  FIXME should be
             * operator of form ++i. Current syntax breaks C++
             * standard
             */
            virtual Token operator++(int) = 0;
				
            virtual ~TokenIterator(); 
        };
        
        /**
         * Token iterator over a text. See TokenIterator for the
         * contract of the operators.
         * NOTE(review): presumably the text is split into tokens at
         * whitespace, as the name says - confirm in the .cpp.
         */
        class WhitespaceSplitter : public TokenIterator {
        public:
        	WhitespaceSplitter(const wchar_t* text);
            virtual operator bool();
            virtual Token operator++(int);
        // NOTE(review): every other class in this header keeps its
        // data members private; 'public' here may be unintentional.
        // Left as is, because callers may depend on it.
        public: 
            const wchar_t* begin_;
            const wchar_t* end_;
        };
		
        /**
         * Uses tokenizer for converting given text into token stream
         * and provides means for iterating through the token
         * stream's tokens.
         */
        class Tokens : public TokenIterator {
        public:
            /**
             * @param tokenizer the tokenizer to use. Only a reference
             * is kept (see tokenizer_), so it has to outlive this
             * object.
             * @param text the text to tokenize. Tokens point into the
             * original text (see Token), so it has to stay alive and
             * unmodified while the tokens are used.
             */
            Tokens(Tokenizer& tokenizer, const wchar_t* text);
            virtual operator bool();
            virtual Token operator++(int);
        private:
            void prepareNext();
        private: // data
            /**
             * Current location in the text
             */
            const wchar_t* cursor_; 
            Tokenizer& tokenizer_;
            bool hasNext_; 
        };
		
        /**
         * Token iterator wrapping another token iterator. Filters
         * out all tokens of type TOKEN_WS
         */
        class StdFilter : public TokenIterator {
        public:
            /**
             * @param tokens the token iterator to filter. Only a
             * reference is kept (see tokens_), so it has to outlive
             * this object.
             */
        	StdFilter(TokenIterator& tokens);
            virtual operator bool();
            virtual Token operator++(int);
        private:
            void prepareNext();
        private: // data
            TokenIterator& tokens_; 
            Token next_;  
            bool hasNext_;
        };
 
        /**
         * Provides mechanism for marking a location in the token
         * stream and returning to it. Essential tool for parsing
         * context free grammar.
         */
        class TokenReader : public TokenIterator {
        public: 
            /**
             * @param tokens the token iterator to read from. Only a
             * reference is kept (see tokens_), so it has to outlive
             * this object.
             */
            TokenReader(TokenIterator& tokens); 
            virtual operator bool();
            virtual Token operator++(int);
            /**
             * NOTE(review): presumably returns the next token without
             * moving the position - confirm in the .cpp.
             */
            Token peek(); 
            /** Stores this position. */
            void pushMark();
            /** Pops position and returns reader to the point it marks. */
            void popMark();
            /**
             * Pops position and frees it; may also free tokens in
             * backward buffer.
             */
            void clearMark();
        private: 
            TokenIterator& tokens_;
            int location_;
            std::vector<Token> forward_; // buffer for tokens front of this location
            std::vector<Token> backward_; // buffer for tokens behind this location
            std::vector<int> marks_; 
        };

        /**
         * Marks the position in the token stream and returns the
         * token stream position, when going out of scope unless
         * release is called.
         */
        class TokenPositionSentry {
        public:
            /**
             * Marks the current position of the reader with
             * pushMark().
             *
             * @param reader the reader to guard. Only a reference is
             * kept, so the reader has to outlive the sentry.
             */
            inline TokenPositionSentry(TokenReader& reader) 
                : released_(false), reader_(reader) { // declaration order
                reader_.pushMark(); 
            }
            /**
             * Returns the reader to the marked position with
             * popMark(), unless release() has been called; in that
             * case the mark is just discarded with clearMark().
             */
            inline ~TokenPositionSentry() {
                if (released_) {
                    reader_.clearMark(); 
                } else {
                    reader_.popMark(); 
                }
            }
            /**
             * Makes the reader stay where it is, when the sentry goes
             * out of scope.
             */
            inline void release() { released_ = true; } 
        private:
            // Not copyable (declared, never defined): the constructor
            // pushes exactly one mark and the destructor pops exactly
            // one, so a copy would pop a mark that it never pushed.
            TokenPositionSentry(const TokenPositionSentry&);
            TokenPositionSentry& operator=(const TokenPositionSentry&);
        private:
            bool released_; 
            TokenReader& reader_; 
        };


    } // Lex 
	
    /**
     * Tools for parsing context-free grammar
     */
    namespace Parser {

        /**
         * Informs of a parsing error. Implements the ITxtCtxtExc
         * interface, so contextual information can be attached to it
         * with setContext().
         */
        class ParseException : public ITxtCtxtExc {
        public: 
            /**
             * @param wWhat description of the error
             * @param where token associated with the error; a copy
             * of it is stored (see where_). Note that a token only
             * points into the tokenized text (see Lex::Token).
             */
            ParseException(const wchar_t* wWhat, const Lex::Token& where);
            /** Returns the token associated with the error. */
            Lex::Token where() const; 
            /** Returns the description of the error. Never throws. */
            virtual const wchar_t * wWhat() const throw ();
            /** Attaches contextual information to the description. */
            virtual void setContext(const wchar_t * context);
				
        private:
            std::wstring wWhat_; 
            Lex::Token where_;
        };
		
        /**
         * Functions converting a literal token into a value of the
         * corresponding C++ type.
         * NOTE(review): the behavior on malformed literals is not
         * visible in this header - presumably a ParseException is
         * thrown; confirm in the .cpp.
         */
        namespace Lit {
            std::wstring ParseString(const Lex::Token& token) ;  
            long ParseInteger(const Lex::Token& token);
            double ParseReal(const Lex::Token& token);
        }
		
        /**
         * Utility, which is used for parsing. A TokenReader that
         * throws parse exceptions, when underlying assumptions don't
         * hold.
         */
        class Lexer : public Lex::TokenReader {
        public: 
            /**
             * @param tokens the token iterator to read from (passed
             * on to / used as by Lex::TokenReader)
             */
            Lexer(Lex::TokenIterator& tokens); 
            // throws ParseException instead of LexException on EOF. 
            virtual Lex::Token operator++(int);
            // The eat* functions consume from the token stream.
            // NOTE(review): presumably each one reads the next token
            // and throws ParseException, if the token is not of the
            // expected type - confirm in the .cpp.
            Lex::Token eat(Lex::token_type_t tokenType);
            void eatEof();
            std::wstring eatId();
            std::wstring eatString();
            long eatInteger();
            double eatReal();
        };
	
        /*
         * Lexer, which creates the token stream iterator and the
         * whitespace filter by itself and holds them as members.
         */ 
        class StdLexer : public Lexer {
        public: 
            /**
             * @param tokens the tokenizer (despite the name, this is
             * a Lex::Tokenizer and not a token iterator)
             * @param text the text to tokenize
             */
            StdLexer(Lex::Tokenizer& tokens, const wchar_t* text); 
        private: 
            // NOTE(review): the Lexer base class is constructed
            // before these members. Presumably the base is given a
            // reference to filter_, which is fine only as long as the
            // base constructor does nothing but store the reference -
            // confirm in the .cpp.
            Lex::Tokens tokens_; 
            Lex::StdFilter filter_;
        };
		
    } // Parser
} // Cpt

#endif /* CPIXPARSETOOLS_H_ */