searchengine/util/cpixtools/inc/public/cpixparsetools.h
changeset 8 6547bf8ca13a
parent 0 671dee74050a
--- a/searchengine/util/cpixtools/inc/public/cpixparsetools.h	Fri Jun 11 14:43:47 2010 +0300
+++ b/searchengine/util/cpixtools/inc/public/cpixparsetools.h	Mon Jun 28 10:34:53 2010 +0530
@@ -68,23 +68,19 @@
      * (e.g. "file*.tx?") itself is not supported)
      */
     namespace Lex {
+    
+		typedef const wchar_t* token_type_t; // token types are represented as wide-string pointers, not integers
 
 
-        /**
-         * Basic token types
-         */
-        enum TokenType {
-            TOKEN_UNKNOWN = 0,
-            TOKEN_EOF = 1, 
-            TOKEN_WS,  
-            TOKEN_ID, 
-            TOKEN_STRLIT,
-            TOKEN_INTLIT,
-            TOKEN_REALLIT,
-            TOKEN_LIT,
-			
-            TOKEN_LAST_RESERVED // 8
-        };
+		extern token_type_t TOKEN_UNKNOWN; // predefined token types: declared here only, definitions live elsewhere
+		extern token_type_t TOKEN_EOF; // NOTE(review): the pointers themselves are not const, so these globals are assignable
+		extern token_type_t TOKEN_WS; 
+		extern token_type_t TOKEN_COMMENT;  // presumably produced by the comment tokenizers - confirm
+		extern token_type_t TOKEN_ID;	
+		extern token_type_t TOKEN_STRLIT;
+		extern token_type_t TOKEN_INTLIT;
+		extern token_type_t TOKEN_REALLIT;
+		extern token_type_t TOKEN_LIT;
 		
         class LexException : public ITxtCtxtExc {
         public: 
@@ -106,15 +102,15 @@
          */
         class Token {
         public: 
-            Token(int type, const wchar_t* begin, const wchar_t* end);
+            Token(token_type_t type, const wchar_t* begin, const wchar_t* end); // type tag plus the begin/end pointers of the token text
             Token();
-            int type() const;
+            const wchar_t* type() const; // NOTE(review): same type as token_type_t; spelling it token_type_t would match the constructor and type_
             const wchar_t* begin() const;
             const wchar_t* end() const;
             int length() const;
             std::wstring text() const; 
         private: 
-            int type_;
+            token_type_t type_; // type tag of this token
             const wchar_t* begin_;
             const wchar_t* end_;
         };
@@ -221,17 +217,61 @@
 	
         class SymbolTokenizer : public Tokenizer {
         public: 
-            SymbolTokenizer(int tokenType, const wchar_t* symbol);
+            SymbolTokenizer(const wchar_t* tokenType, const wchar_t* symbol); // NOTE(review): tokenType has the same type as token_type_t; using that name would match tokenType_
             virtual void reset();
             virtual Token get();
             virtual TokenizerState consume(const wchar_t* cursor);
         private:
             const wchar_t* begin_;
             const wchar_t* end_; 
-            int tokenType_; 
+            token_type_t tokenType_; // presumably the type given to tokens this tokenizer produces - confirm
             const wchar_t* symbol_;
         };
-	
+ 
+        /**
+         * Tokenizer for C++ style line comments, e.g. // comment
+         */
+        class LineCommentTokenizer : public Tokenizer {
+        public: 
+        	LineCommentTokenizer();
+            virtual void reset();
+            virtual Token get();
+            virtual TokenizerState consume(const wchar_t* cursor);
+        private:
+            enum State { // recognition states; meanings below are inferred from the names - confirm against the implementation
+				READY, // presumably: nothing consumed yet
+				SLASH_CONSUMED, // presumably: first '/' seen
+				COMMENT, // presumably: inside the comment text
+				FINISHED // presumably: comment fully consumed
+            };
+        	State state_; // current recognition state
+        	const wchar_t* begin_; 
+        	const wchar_t* end_;
+        };
+
+        /**
+         * Tokenizer for C style section (block) comments, like the one surrounding this text
+         */
+        class SectionCommentTokenizer : public Tokenizer {
+        public: 
+        	SectionCommentTokenizer();
+            virtual void reset();
+            virtual Token get();
+            virtual TokenizerState consume(const wchar_t* cursor);
+        private:
+            enum State { // recognition states; meanings below are inferred from the names - confirm against the implementation
+				READY, // presumably: nothing consumed yet
+				SLASH_CONSUMED, // presumably: opening '/' seen
+				COMMENT, // presumably: inside the comment body
+				STAR_CONSUMED, // presumably: '*' seen inside the body, may start the terminator
+				FINISH // NOTE(review): LineCommentTokenizer names its final state FINISHED - consider aligning
+            };
+        	State state_; // current recognition state
+        	const wchar_t* begin_; 
+        	const wchar_t* end_;
+        	
+        };
+
         /**
          * Tokenizes text by using given tokenizers. Text is consumed
          * until no tokenizer is in hungry state e.g., all tokenizers
@@ -303,6 +343,16 @@
 				
             virtual ~TokenIterator(); 
         };
+        
+        class WhitespaceSplitter : public TokenIterator { // TokenIterator built directly from raw text; presumably yields whitespace-separated pieces - confirm
+        public:
+        	WhitespaceSplitter(const wchar_t* text); // NOTE(review): single-argument constructor is not explicit, so const wchar_t* converts implicitly
+            virtual operator bool();
+            virtual Token operator++(int);
+        public: // NOTE(review): data members are public here, unlike the private members of Token and the tokenizer classes - confirm this is intended
+            const wchar_t* begin_;
+            const wchar_t* end_;
+        };
 		
         /**
          * Uses tokenizer for converting given text into token stream
@@ -328,9 +378,9 @@
         /**
          * Filters out all tokens of type TOKEN_WS
          */
-        class WhiteSpaceFilter : public TokenIterator {
+        class StdFilter : public TokenIterator {
         public:
-            WhiteSpaceFilter(TokenIterator& tokens);
+        	StdFilter(TokenIterator& tokens);
             virtual operator bool();
             virtual Token operator++(int);
         private:
@@ -425,7 +475,7 @@
             Lexer(Lex::TokenIterator& tokens); 
             // throws ParseException instead of LexException on EOF. 
             virtual Lex::Token operator++(int);
-            Lex::Token eat(int tokenType);
+            Lex::Token eat(Lex::token_type_t tokenType);
             void eatEof();
             std::wstring eatId();
             std::wstring eatString();
@@ -441,7 +491,7 @@
             StdLexer(Lex::Tokenizer& tokens, const wchar_t* text); 
         private: 
             Lex::Tokens tokens_; 
-            Lex::WhiteSpaceFilter ws_;
+            Lex::StdFilter filter_;
         };
 		
     } // Parser