searchengine/util/cpixtools/inc/public/cpixparsetools.h
changeset 10:afe194b6b1cd
parent 0:671dee74050a

diff -r d575fd691cf9 -r afe194b6b1cd searchengine/util/cpixtools/inc/public/cpixparsetools.h
--- a/searchengine/util/cpixtools/inc/public/cpixparsetools.h	(9:d575fd691cf9)
+++ b/searchengine/util/cpixtools/inc/public/cpixparsetools.h	(10:afe194b6b1cd)
@@ -66,27 +66,23 @@
      * strength language and converting the character stream into
      * language token stream.  Note: Regular expression syntax
      * (e.g. "file*.tx?") itself is not supported.
      */
     namespace Lex {
 
-
-        /**
-         * Basic token types
-         */
-        enum TokenType {
-            TOKEN_UNKNOWN = 0,
-            TOKEN_EOF = 1, 
-            TOKEN_WS,  
-            TOKEN_ID, 
-            TOKEN_STRLIT,
-            TOKEN_INTLIT,
-            TOKEN_REALLIT,
-            TOKEN_LIT,
-            
-            TOKEN_LAST_RESERVED // 8
-        };
+        typedef const wchar_t* token_type_t;
+
+
+        extern token_type_t TOKEN_UNKNOWN;
+        extern token_type_t TOKEN_EOF;
+        extern token_type_t TOKEN_WS;
+        extern token_type_t TOKEN_COMMENT;
+        extern token_type_t TOKEN_ID;
+        extern token_type_t TOKEN_STRLIT;
+        extern token_type_t TOKEN_INTLIT;
+        extern token_type_t TOKEN_REALLIT;
+        extern token_type_t TOKEN_LIT;
 
         class LexException : public ITxtCtxtExc {
         public: 
             LexException(const wchar_t* what, const wchar_t* where);
             virtual ~LexException(); 
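
Client impact of the hunk above: token types change from enum values to exported wide-string constants (token_type_t). A minimal sketch of what calling code looks like after the change, assuming the exported constants are compared by pointer identity (nothing in the hunk shows a deep string compare); classify() is a hypothetical helper:

    #include "cpixparsetools.h"

    void classify(const Cpt::Lex::Token& t) {
        using namespace Cpt::Lex;
        if (t.type() == TOKEN_ID) {
            // identifier: the comparison reads the same as with the old
            // int types, so most call sites should recompile unchanged
        } else if (t.type() == TOKEN_COMMENT) {
            // comments are now first-class tokens (new in this changeset)
        }
    }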
@@ -104,19 +100,19 @@
          * hazardous, if the original tokenized string is modified or
          * released.
          */
         class Token {
         public: 
-            Token(int type, const wchar_t* begin, const wchar_t* end);
+            Token(token_type_t type, const wchar_t* begin, const wchar_t* end);
             Token();
-            int type() const;
+            const wchar_t* type() const;
             const wchar_t* begin() const;
             const wchar_t* end() const;
             int length() const;
             std::wstring text() const; 
         private: 
-            int type_;
+            token_type_t type_;
             const wchar_t* begin_;
             const wchar_t* end_;
         };
 
         /**
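
A Token stores raw begin/end pointers into the tokenized string, which is what the "hazardous, if the original tokenized string is modified or released" warning refers to. A small sketch of the intended usage:

    const wchar_t* src = L"ident rest";
    Cpt::Lex::Token t(Cpt::Lex::TOKEN_ID, src, src + 5);
    int n = t.length();            // 5, i.e. end() - begin()
    std::wstring copy = t.text();  // L"ident" - an owned copy, still safe
                                   // after `src` itself goes away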
@@ -219,16 +215,16 @@
             const wchar_t* end_; 
         };
 
         class SymbolTokenizer : public Tokenizer {
         public: 
-            SymbolTokenizer(int tokenType, const wchar_t* symbol);
+            SymbolTokenizer(const wchar_t* tokenType, const wchar_t* symbol);
             virtual void reset();
             virtual Token get();
             virtual TokenizerState consume(const wchar_t* cursor);
         private:
             const wchar_t* begin_;
             const wchar_t* end_; 
-            int tokenType_; 
+            token_type_t tokenType_; 
             const wchar_t* symbol_;
         };
 
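
Because token_type_t is just const wchar_t*, client code can mint its own token types as string constants and hand them to SymbolTokenizer. A hypothetical example (TOKEN_LPAREN is not part of the header):

    Cpt::Lex::token_type_t TOKEN_LPAREN = L"lparen";
    Cpt::Lex::SymbolTokenizer lparen(TOKEN_LPAREN, L"(");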
       
@@ -234,0 +231,21 @@
+        /**
+         * C style line comment, e.g. // comment
+         */
+        class LineCommentTokenizer : public Tokenizer {
+        public: 
+            LineCommentTokenizer();
+            virtual void reset();
+            virtual Token get();
+            virtual TokenizerState consume(const wchar_t* cursor);
+        private:
+            enum State {
+                READY,
+                SLASH_CONSUMED, 
+                COMMENT,
+                FINISHED
+            };
+            State state_;
+            const wchar_t* begin_; 
+            const wchar_t* end_;
+        };
+
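
The State enum spells out the recognizer for the new line-comment tokenizer. Assumed state walk for the input L"// x\n" (inferred from the enumerator names, not stated in the diff):

    // READY          - nothing consumed yet
    // SLASH_CONSUMED - after the first '/'
    // COMMENT        - after the second '/'; consume up to the line end
    // FINISHED       - at the newline; get() can hand out the comment token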
       
@@ -234,0 +252,23 @@
+        /**
+         * C++ style section comments, like the ones surrounding this comment
+         */
+        class SectionCommentTokenizer : public Tokenizer {
+        public: 
+            SectionCommentTokenizer();
+            virtual void reset();
+            virtual Token get();
+            virtual TokenizerState consume(const wchar_t* cursor);
+        private:
+            enum State {
+                READY,
+                SLASH_CONSUMED, 
+                COMMENT, 
+                STAR_CONSUMED, 
+                FINISH
+            };
+            State state_;
+            const wchar_t* begin_; 
+            const wchar_t* end_;
+
+        };
+
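
Same pattern for the section-comment recognizer, with one extra state for the closing delimiter. Assumed walk for L"/* a*b */", again inferred from the enumerator names only:

    // READY -> SLASH_CONSUMED ('/') -> COMMENT ('*')
    // ... body; a '*' moves to STAR_CONSUMED ...
    // STAR_CONSUMED + '/'   -> FINISH (comment complete)
    // STAR_CONSUMED + other -> presumably back to COMMENT, so that the
    //                          '*' in "a*b" does not end the comment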
@@ -235,5 +275,5 @@
         /**
          * Tokenizes text by using given tokenizers. Text is consumed
          * until no tokenizer is in hungry state, i.e. until all tokenizers
          * have either failed or finished. In case a number of
          * tokenizers have finished, the longest token is used. If a
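
The comment above describes longest-match composition (the class it documents is cut off by this hunk). With two SymbolTokenizers registered, an input such as L"<=" finishes both, and the rule says the longer token wins. Hypothetical setup (TOKEN_LT/TOKEN_LE are client-defined, not from the header):

    Cpt::Lex::token_type_t TOKEN_LT = L"lt";
    Cpt::Lex::token_type_t TOKEN_LE = L"le";
    Cpt::Lex::SymbolTokenizer lt(TOKEN_LT, L"<");
    Cpt::Lex::SymbolTokenizer le(TOKEN_LE, L"<=");
    // Composing these via the tokenizer documented above on input L"<="
    // should emit one TOKEN_LE token, not TOKEN_LT followed by garbage.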
@@ -301,5 +341,5 @@
              */
             virtual Token operator++(int) = 0;
 
             virtual ~TokenIterator(); 
         };
       
@@ -305,0 +346,10 @@
+
+        class WhitespaceSplitter : public TokenIterator {
+        public:
+            WhitespaceSplitter(const wchar_t* text);
+            virtual operator bool();
+            virtual Token operator++(int);
+        public: 
+            const wchar_t* begin_;
+            const wchar_t* end_;
+        };
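
WhitespaceSplitter is a TokenIterator that needs no tokenizer setup at all. Given the declared operator bool and operator++(int), usage presumably looks like this:

    Cpt::Lex::WhitespaceSplitter words(L"alpha  beta\tgamma");
    while (words) {                       // operator bool: more tokens?
        Cpt::Lex::Token word = words++;   // post-increment yields the next word
        std::wstring w = word.text();     // L"alpha", L"beta", L"gamma"
    }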
@@ -306,5 +356,5 @@
 
         /**
          * Uses tokenizer for converting given text into token stream
          * and provides means for iterating through the token
          * stream's tokens.
@@ -326,13 +376,13 @@
         };
 
         /**
          * Filters out all tokens of type TOKEN_WS
          */
-        class WhiteSpaceFilter : public TokenIterator {
+        class StdFilter : public TokenIterator {
         public:
-            WhiteSpaceFilter(TokenIterator& tokens);
+            StdFilter(TokenIterator& tokens);
             virtual operator bool();
             virtual Token operator++(int);
         private:
             void prepareNext();
         private: // data
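
The whitespace filter is renamed to StdFilter while its doc comment still mentions only TOKEN_WS; given that this changeset also introduces TOKEN_COMMENT and two comment tokenizers, the new name plausibly covers comment filtering as well, though the hunk does not show it. As a TokenIterator decorator it would be used like this:

    // Sketch: wrap any TokenIterator so that filtered-out token types
    // (at least TOKEN_WS) never reach the consumer.
    void consumeSignificant(Cpt::Lex::TokenIterator& raw) {
        Cpt::Lex::StdFilter filtered(raw);
        while (filtered) {
            Cpt::Lex::Token t = filtered++;  // never TOKEN_WS
        }
    }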
@@ -423,11 +473,11 @@
         class Lexer : public Lex::TokenReader {
         public: 
             Lexer(Lex::TokenIterator& tokens); 
             // throws ParseException instead of LexException on EOF. 
             virtual Lex::Token operator++(int);
-            Lex::Token eat(int tokenType);
+            Lex::Token eat(Lex::token_type_t tokenType);
             void eatEof();
             std::wstring eatId();
             std::wstring eatString();
             long eatInteger();
             double eatReal();
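
Each eat* helper advances the stream and validates the token in one step, throwing ParseException (rather than LexException) on a mismatch or EOF. A hypothetical parse of `name = 42` under an assumed grammar; TOKEN_EQ is a client-defined symbol type, and the Cpt::Parser namespace is inferred from the closing-brace comments at the end of the file:

    Cpt::Lex::token_type_t TOKEN_EQ = L"eq";  // hypothetical

    void parseAssignment(Cpt::Parser::Lexer& lexer) {
        std::wstring name = lexer.eatId();  // identifier or ParseException
        lexer.eat(TOKEN_EQ);                // '=' - typed eat, new signature
        long value = lexer.eatInteger();    // integer literal
        lexer.eatEof();                     // input must be exhausted
    }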
@@ -439,11 +489,11 @@
         class StdLexer : public Lexer {
         public: 
             StdLexer(Lex::Tokenizer& tokens, const wchar_t* text); 
         private: 
             Lex::Tokens tokens_; 
-            Lex::WhiteSpaceFilter ws_;
+            Lex::StdFilter filter_;
         };
 
     } // Parser
 } // Cpt
 
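
End-to-end, StdLexer owns the Lex::Tokens stream and routes it through the renamed Lex::StdFilter member, so a driver only supplies a tokenizer and the text. A minimal sketch reusing the hypothetical TOKEN_LPAREN tokenizer from above:

    Cpt::Lex::token_type_t TOKEN_LPAREN = L"lparen";       // hypothetical
    Cpt::Lex::SymbolTokenizer lparen(TOKEN_LPAREN, L"(");
    Cpt::Parser::StdLexer lexer(lparen, L"(");
    lexer.eat(TOKEN_LPAREN);   // tokenize, filter, and check in one go
    lexer.eatEof();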