searchengine/util/tsrc/cpixtoolsunittest/src/parseunittest.cpp
changeset 8 6547bf8ca13a
parent 0 671dee74050a
equal deleted inserted replaced
7:a5fbfefd615f 8:6547bf8ca13a
    22 
    22 
    23 using namespace Cpt::Lex; 
    23 using namespace Cpt::Lex; 
    24 using namespace Cpt::Parser; 
    24 using namespace Cpt::Parser; 
    25 using namespace std; 
    25 using namespace std; 
    26 
    26 
    27 enum TokenType {
    27 const wchar_t* TOKEN_LEFT_BRACKET = L"left bracket";
    28 	TOKEN_LEFT_BRACKET = Cpt::Lex::TOKEN_LAST_RESERVED,  // 8
    28 const wchar_t* TOKEN_RIGHT_BRACKET = L"right bracket"; 
    29 	TOKEN_RIGHT_BRACKET, 
    29 const wchar_t* TOKEN_COMMA = L"comma";
    30 	TOKEN_COMMA, // 10
    30 const wchar_t* TOKEN_PIPE = L"pipe";
    31 	TOKEN_PIPE,
    31 const wchar_t* TOKEN_SWITCH = L"switch";
    32 	TOKEN_SWITCH,
    32 const wchar_t* TOKEN_CASE = L"case";
    33 	TOKEN_CASE,
    33 const wchar_t* TOKEN_DEFAULT = L"default";
    34 	TOKEN_DEFAULT,
    34 const wchar_t* TOKEN_LEFT_BRACE = L"left brace";
    35 	TOKEN_LEFT_BRACE, // 15
    35 const wchar_t* TOKEN_RIGHT_BRACE = L"right brace";
    36 	TOKEN_RIGHT_BRACE,
    36 const wchar_t* TOKEN_COLON = L"colon";
    37 	TOKEN_COLON,
    37 const wchar_t* TOKEN_TERMINATOR = L"terminator";
    38 	TOKEN_TERMINATOR
       
    39 };
       
    40 
    38 
    41 void PrintToken(Cpt::Lex::Token token) {
    39 void PrintToken(Cpt::Lex::Token token) {
    42 	switch (token.type()) {
    40 	wcout<<token.type()<<L"('"<<token.text()<<L"')";  
    43 		case TOKEN_WS: wcout<<L"space"; break; 
    41 }
    44 		case TOKEN_ID: wcout<<"id"; break;
    42 
    45 		case TOKEN_LIT: wcout<<"lit"; break;
    43 void TestTokenization(Itk::TestMgr  * testMgr,
    46 		case TOKEN_STRLIT: wcout<<"str-lit"; break;
       
    47 		case TOKEN_REALLIT: wcout<<"real-lit"; break;
       
    48 		case TOKEN_INTLIT: wcout<<"int-lit"; break;
       
    49 		case TOKEN_LEFT_BRACKET: wcout<<"lbr"; break;
       
    50 		case TOKEN_RIGHT_BRACKET: wcout<<"rbr"; break;
       
    51 		case TOKEN_COMMA: wcout<<"comma"; break;
       
    52 		case TOKEN_PIPE: wcout<<"pipe"; break;
       
    53 		case TOKEN_SWITCH : wcout<<"sw"; break;
       
    54 		case TOKEN_CASE : wcout<<"case"; break;
       
    55 		case TOKEN_DEFAULT : wcout<<"default"; break;
       
    56 		case TOKEN_LEFT_BRACE : wcout<<"lbc"; break;
       
    57 		case TOKEN_RIGHT_BRACE : wcout<<"rbc"; break;
       
    58 		case TOKEN_COLON : wcout<<"cl"; break;
       
    59 		case TOKEN_TERMINATOR : wcout<<"tr"; break;
       
    60 
       
    61 		default: wcout<<"unknown"; break;
       
    62 	}
       
    63 	wcout<<L"('"<<token.text()<<L"')";  
       
    64 }
       
    65 
       
    66 void TestTokenization(Itk::TestMgr  * ,
       
    67                       const wchar_t * inputStr)
    44                       const wchar_t * inputStr)
    68 {
    45 {
    69 	WhitespaceTokenizer ws; 
    46 	WhitespaceTokenizer ws; 
       
    47 	LineCommentTokenizer line; 
       
    48 	SectionCommentTokenizer section; 
    70 	IdTokenizer ids; 
    49 	IdTokenizer ids; 
    71         IntLitTokenizer ints;
    50         IntLitTokenizer ints;
    72         RealLitTokenizer reals;
    51         RealLitTokenizer reals;
    73 	LitTokenizer lits('\''); 
    52 	LitTokenizer lits('\''); 
    74 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"("); 
    53 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"("); 
    82         // (to check if those types are recognized correctly). So
    61         // (to check if those types are recognized correctly). So
    83         // basically, in test cases, lit will mean string literals,
    62         // basically, in test cases, lit will mean string literals,
    84         // and int-lit, real-lit will mean integer and real literals,
    63         // and int-lit, real-lit will mean integer and real literals,
    85         // respectively.
    64         // respectively.
    86 	Tokenizer* tokenizers[] = {
    65 	Tokenizer* tokenizers[] = {
    87 		&ws, &lb, &rb, &cm, &pp, &ids, &ints, &reals, &lits, 0
    66 		&ws, &line, &section, &lb, &rb, &cm, &pp, &ids, &ints, &reals, &lits, 0
    88 	};
    67 	};
    89 	MultiTokenizer tokenizer(tokenizers);
    68 	MultiTokenizer tokenizer(tokenizers);
    90 	
    69 	
    91 	Tokens 
    70 	Tokens 
    92             source(tokenizer, 
    71             source(tokenizer, 
    93                    inputStr);
    72                    inputStr);
    94 	WhiteSpaceFilter tokens(source); 
    73 	StdFilter tokens(source); 
    95 	
    74 	
    96 	while (tokens) PrintToken(tokens++); 
    75 	while (tokens) PrintToken(tokens++); 
    97 	cout<<endl;
    76 	cout<<endl;
    98 }
    77 }
    99 
    78 
   105 }
    84 }
   106 
    85 
   107 void TestTokenization2(Itk::TestMgr * testMgr)
    86 void TestTokenization2(Itk::TestMgr * testMgr)
   108 {
    87 {
   109     TestTokenization(testMgr,
    88     TestTokenization(testMgr,
   110                      L"'foo' 0 1 -2 'bar' +234 -34");
    89                      L"'foo' 0 1 -2 'bar' +234 -34 // side note");
   111 }
    90 }
   112 
    91 
   113 
    92 
   114 void TestTokenization3(Itk::TestMgr * testMgr)
    93 void TestTokenization3(Itk::TestMgr * testMgr)
   115 {
    94 {
   119 
    98 
   120 
    99 
   121 void TestTokenization4(Itk::TestMgr * testMgr)
   100 void TestTokenization4(Itk::TestMgr * testMgr)
   122 {
   101 {
   123     TestTokenization(testMgr,
   102     TestTokenization(testMgr,
   124                      L"'\\' ''\\\\' '\\a' '\\\n'");
   103                      L"'\\' ''\\\\' '\\a' '\\\n' // comment\n /*foobar*/");
   125 }
   104 }
   126 
   105 
   127 
   106 
   128 void TestTokenization5(Itk::TestMgr * )
   107 void TestTokenization5(Itk::TestMgr * testMgr)
   129 {
   108 {
   130     WhitespaceTokenizer 
   109     WhitespaceTokenizer 
   131         ws; 
   110         ws; 
   132     IdTokenizer 
   111     IdTokenizer 
   133         ids; 
   112         ids; 
   134     SymbolTokenizer 
   113     SymbolTokenizer 
   135         for_(0xf00, L"for"); 
   114         for_(L"for", L"for"); 
   136     SymbolTokenizer 
   115     SymbolTokenizer 
   137         if_(0xbeef, L"if"); 
   116         if_(L"if", L"if"); 
   138     Tokenizer* tokenizers[] = {
   117     Tokenizer* tokenizers[] = {
   139         &ws, &for_, &if_, &ids, 0
   118         &ws, &for_, &if_, &ids, 0
   140     };
   119     };
   141 
   120 
   142     MultiTokenizer 
   121     MultiTokenizer 
   143         tokenizer(tokenizers);
   122         tokenizer(tokenizers);
   144 
   123 
   145     Tokens 
   124     Tokens 
   146         source(tokenizer, 
   125         source(tokenizer, 
   147                L"fo for fore forth ofor oforo i if ifdom ifer fif fifi forfi fifor"); // test escape in literals
   126                L"fo for fore forth ofor oforo i if ifdom ifer fif fifi forfi fifor"); // test escape in literals
   148     WhiteSpaceFilter 
   127     StdFilter 
   149         tokens(source); 
   128         tokens(source); 
   150 
   129 
   151     while (tokens) PrintToken(tokens++); 
   130     while (tokens) PrintToken(tokens++); 
   152     cout<<endl;
   131     cout<<endl;
   153 }
   132 }
   154 
   133 
   155 void TestTokenizationErrors(Itk::TestMgr* ) 
   134 void TestTokenization6(Itk::TestMgr * testMgr)
       
   135 {
       
   136     WhitespaceTokenizer 
       
   137         ws; 
       
   138     LineCommentTokenizer 
       
   139         line; 
       
   140     SectionCommentTokenizer 
       
   141         section; 
       
   142     IdTokenizer 
       
   143         ids; 
       
   144     IntLitTokenizer 
       
   145         intLit; 
       
   146     RealLitTokenizer 
       
   147         realLit; 
       
   148     SymbolTokenizer 
       
   149         div(L"slash", L"/"); 
       
   150     SymbolTokenizer 
       
   151         mul(L"star", L"*");
       
   152     SymbolTokenizer 
       
   153         plus(L"plus", L"+");
       
   154     SymbolTokenizer 
       
   155         minus(L"minus", L"-");
       
   156     SymbolTokenizer 
       
   157         equal(L"equals", L"=");
       
   158     
       
   159     Tokenizer* tokenizers[] = {
       
   160         &ws, &line, &section, &ids, &intLit, &realLit, &div, &mul, &plus, &minus, &equal, 0
       
   161     };
       
   162 
       
   163     MultiTokenizer 
       
   164         tokenizer(tokenizers);
       
   165     
       
   166     const wchar_t* text = 
       
   167     	L"4 + 6 = 2 * 5\n"
       
   168         L"6 / 2 = 1*3 // true\n"
       
   169 		L"3 / x /*important thingie*/ = 2 * y\n"
       
   170 		L"6 / x * / * / /* non sense / * / */ // zap"
       
   171 		L"//\n"
       
   172 		L"//"; 
       
   173 
       
   174     {
       
   175 		cout<<"With whitespaces & comments visible"<<endl;
       
   176 		Tokens 
       
   177 			tokens(tokenizer, text);
       
   178 	
       
   179 		while (tokens) PrintToken(tokens++); 
       
   180 		cout<<endl;
       
   181     }
       
   182 
       
   183     {
       
   184 		cout<<"With whitespaces & comments filtered"<<endl;
       
   185 		Tokens 
       
   186 			source(tokenizer, text);
       
   187 		
       
   188 		StdFilter tokens(source); 
       
   189 	
       
   190 		while (tokens) PrintToken(tokens++); 
       
   191 		cout<<endl;
       
   192     }
       
   193 
       
   194 }
       
   195 
       
   196 void TestTokenizationErrors(Itk::TestMgr* mgr) 
   156 {
   197 {
   157 	WhitespaceTokenizer ws; 
   198 	WhitespaceTokenizer ws; 
   158 	IdTokenizer ids; 
   199 	IdTokenizer ids; 
   159 	LitTokenizer lits('\''); 
   200 	LitTokenizer lits('\''); 
   160 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"("); 
   201 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"("); 
   169 	{
   210 	{
   170 		Tokens tokens(tokenizer, text = L"stdtokens>lowercase>stopwords('a', 'an','the)>stem('en')");
   211 		Tokens tokens(tokenizer, text = L"stdtokens>lowercase>stopwords('a', 'an','the)>stem('en')");
   171 		try {
   212 		try {
   172 			while (tokens) PrintToken(tokens++); 
   213 			while (tokens) PrintToken(tokens++); 
   173 		} catch (LexException& exc) {
   214 		} catch (LexException& exc) {
   174                     /* OBS
   215 			exc.setContext(text);
   175 			wcout<<endl<<L"LexException: "<<exc.describe(text)<<endl; 
   216 			wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl; 
   176                     */
       
   177                     exc.setContext(text);
       
   178                     wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl; 
       
   179 		} catch (exception& exc) {
   217 		} catch (exception& exc) {
   180 			cout<<endl<<"Exception: "<<exc.what()<<endl; 
   218 			cout<<endl<<"Exception: "<<exc.what()<<endl; 
   181 		}
   219 		}
   182 	}
   220 	}
   183 	{
   221 	{
   184 		Tokens tokens(tokenizer, text = L"fas-324we?`213ff3*21(+");
   222 		Tokens tokens(tokenizer, text = L"fas-324we?`213ff3*21(+");
   185 		try {
   223 		try {
   186 			while (tokens) PrintToken(tokens++); 
   224 			while (tokens) PrintToken(tokens++); 
   187 		} catch (LexException& exc) {
   225 		} catch (LexException& exc) {
   188                     /* OBS
   226 			exc.setContext(text);
   189 			wcout<<endl<<L"LexException: "<<exc.describe(text)<<endl; 
   227 			wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl; 
   190                     */
       
   191                     exc.setContext(text);
       
   192                     wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl; 
       
   193 		} catch (exception& exc) {
   228 		} catch (exception& exc) {
   194 			cout<<endl<<"Exception: "<<exc.what()<<endl; 
   229 			cout<<endl<<"Exception: "<<exc.what()<<endl; 
   195 		}
   230 		}
   196 	}
   231 	}
   197 }
   232 }
   198 
   233 
       
   234 void TestWhitespaceSplitter(Itk::TestMgr* mgr) 
       
   235 {
       
   236 	{
       
   237 		WhitespaceSplitter tokens(L"foobar foo bar foo\tbar _*4 4bar foo*bar foo\nbar foo\rbar foo\0bar");
       
   238 		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
       
   239 		printf("\n");
       
   240 	}
       
   241 	
       
   242 	{
       
   243 		WhitespaceSplitter tokens(L"foobar");
       
   244 		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
       
   245 		printf("\n");
       
   246 	}
       
   247 
       
   248 	{
       
   249 		WhitespaceSplitter tokens(L"   foobar  \r\n");
       
   250 		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
       
   251 		printf("\n");
       
   252 	}
       
   253 
       
   254 	{
       
   255 		WhitespaceSplitter tokens(L"   ");
       
   256 		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
       
   257 		printf("\n");
       
   258 	}
       
   259 
       
   260 	{
       
   261 		WhitespaceSplitter tokens(L"");
       
   262 		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
       
   263 		printf("\n");
       
   264 	}
       
   265 
       
   266 }
       
   267 
   199 Itk::TesterBase * CreateParsingTests()
   268 Itk::TesterBase * CreateParsingTests()
   200 {
   269 {
   201     using namespace Itk;
   270     using namespace Itk;
   202 
   271 
   203     SuiteTester
   272     SuiteTester
   204         * parsingTests = new SuiteTester("parsing");
   273         * parsingTests = new SuiteTester("parsing");
   205    
   274    
   206 
       
   207     parsingTests->add("tokenization1",
   275     parsingTests->add("tokenization1",
   208                       TestTokenization1,
   276                       TestTokenization1,
   209                       "tokenization1");
   277                       "tokenization1");
   210 
   278 
   211     parsingTests->add("tokenization2",
   279     parsingTests->add("tokenization2",
   221                       "tokenization4");
   289                       "tokenization4");
   222 
   290 
   223     parsingTests->add("tokenization5",
   291     parsingTests->add("tokenization5",
   224                       TestTokenization5,
   292                       TestTokenization5,
   225                       "tokenization5");
   293                       "tokenization5");
       
   294     
       
   295     parsingTests->add("tokenization6",
       
   296                       TestTokenization6,
       
   297                       "tokenization6");
   226 
   298 
   227     parsingTests->add("syntaxerrors",
   299     parsingTests->add("syntaxerrors",
   228                       TestTokenizationErrors,
   300                       TestTokenizationErrors,
   229                       "syntaxerrors");
   301                       "syntaxerrors");
   230 	    
   302 
       
   303     parsingTests->add("whitespace",
       
   304 					  TestWhitespaceSplitter,
       
   305                       "whitespace");
       
   306 
   231     return parsingTests;
   307     return parsingTests;
   232 }
   308 }
   233 
   309 
   234 
   310