FCL/sf/mw/searchsrv: comparison searchengine/util/tsrc/cpixtoolsunittest/src/parseunittest.cpp

equal deleted inserted replaced

-:a5fbfefd615f
+:6547bf8ca13a
 using namespace Cpt::Lex;
 using namespace Cpt::Parser;
 using namespace std;
-enum TokenType {
+const wchar_t* TOKEN_LEFT_BRACKET = L"left bracket";
-	TOKEN_LEFT_BRACKET = Cpt::Lex::TOKEN_LAST_RESERVED,  // 8
+const wchar_t* TOKEN_RIGHT_BRACKET = L"right bracket";
-	TOKEN_RIGHT_BRACKET,
+const wchar_t* TOKEN_COMMA = L"comma";
-	TOKEN_COMMA, // 10
+const wchar_t* TOKEN_PIPE = L"pipe";
-	TOKEN_PIPE,
+const wchar_t* TOKEN_SWITCH = L"switch";
-	TOKEN_SWITCH,
+const wchar_t* TOKEN_CASE = L"case";
-	TOKEN_CASE,
+const wchar_t* TOKEN_DEFAULT = L"default";
-	TOKEN_DEFAULT,
+const wchar_t* TOKEN_LEFT_BRACE = L"left brace";
-	TOKEN_LEFT_BRACE, // 15
+const wchar_t* TOKEN_RIGHT_BRACE = L"right brace";
-	TOKEN_RIGHT_BRACE,
+const wchar_t* TOKEN_COLON = L"colon";
-	TOKEN_COLON,
+const wchar_t* TOKEN_TERMINATOR = L"terminator";
-	TOKEN_TERMINATOR
-};
 void PrintToken(Cpt::Lex::Token token) {
-	switch (token.type()) {
+	wcout<<token.type()<<L"('"<<token.text()<<L"')";
-		case TOKEN_WS: wcout<<L"space"; break;
+}
-		case TOKEN_ID: wcout<<"id"; break;
-		case TOKEN_LIT: wcout<<"lit"; break;
+void TestTokenization(Itk::TestMgr  * testMgr,
-		case TOKEN_STRLIT: wcout<<"str-lit"; break;
-		case TOKEN_REALLIT: wcout<<"real-lit"; break;
-		case TOKEN_INTLIT: wcout<<"int-lit"; break;
-		case TOKEN_LEFT_BRACKET: wcout<<"lbr"; break;
-		case TOKEN_RIGHT_BRACKET: wcout<<"rbr"; break;
-		case TOKEN_COMMA: wcout<<"comma"; break;
-		case TOKEN_PIPE: wcout<<"pipe"; break;
-		case TOKEN_SWITCH : wcout<<"sw"; break;
-		case TOKEN_CASE : wcout<<"case"; break;
-		case TOKEN_DEFAULT : wcout<<"default"; break;
-		case TOKEN_LEFT_BRACE : wcout<<"lbc"; break;
-		case TOKEN_RIGHT_BRACE : wcout<<"rbc"; break;
-		case TOKEN_COLON : wcout<<"cl"; break;
-		case TOKEN_TERMINATOR : wcout<<"tr"; break;
-		default: wcout<<"unknown"; break;
-	}
-	wcout<<L"('"<<token.text()<<L"')";
-}
-void TestTokenization(Itk::TestMgr  * ,
 const wchar_t * inputStr)
 {
 	WhitespaceTokenizer ws;
+	LineCommentTokenizer line;
+	SectionCommentTokenizer section;
 	IdTokenizer ids;
 IntLitTokenizer ints;
 RealLitTokenizer reals;
 	LitTokenizer lits('\'');
 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"(");
 // (to check if those types are recognized correctly). So
 // basically, in test cases, lit will mean string literals,
 // and int-lit, real-lit will mean integer and real literals,
 // respectively.
 	Tokenizer* tokenizers[] = {
-		&ws, &lb, &rb, &cm, &pp, &ids, &ints, &reals, &lits, 0
+		&ws, &line, &section, &lb, &rb, &cm, &pp, &ids, &ints, &reals, &lits, 0
 	};
 	MultiTokenizer tokenizer(tokenizers);
 	Tokens
 source(tokenizer,
 inputStr);
-	WhiteSpaceFilter tokens(source);
+	StdFilter tokens(source);
 	while (tokens) PrintToken(tokens++);
 	cout<<endl;
 }
 }
 void TestTokenization2(Itk::TestMgr * testMgr) // Feeds quoted literals and signed/unsigned integers to TestTokenization.
 {
 TestTokenization(testMgr,
-L"'foo' 0 1 -2 'bar' +234 -34");
+L"'foo' 0 1 -2 'bar' +234 -34 // side note"); // new input ends with a trailing "//" line comment
 }
 void TestTokenization3(Itk::TestMgr * testMgr)
 {
 void TestTokenization4(Itk::TestMgr * testMgr) // Feeds quoted literals containing backslash sequences to TestTokenization.
 {
 TestTokenization(testMgr,
-L"'\\' ''\\\\' '\\a' '\\\n'");
+L"'\\' ''\\\\' '\\a' '\\\n' // comment\n /*foobar*/"); // new input appends a "//" comment, a newline and a "/* */" comment
 }
-void TestTokenization5(Itk::TestMgr * )
+void TestTokenization5(Itk::TestMgr * testMgr) // Tokenizes words built around "for"/"if" with two symbol tokenizers plus an id tokenizer and prints the filtered tokens; testMgr is not used in the body.
 {
 WhitespaceTokenizer
 ws;
 IdTokenizer
 ids;
 SymbolTokenizer
-for_(0xf00, L"for");
+for_(L"for", L"for"); // first argument (token type) is now the string L"for" instead of the number 0xf00
 SymbolTokenizer
-if_(0xbeef, L"if");
+if_(L"if", L"if"); // likewise: token type L"if" instead of 0xbeef
 Tokenizer* tokenizers[] = {
 &ws, &for_, &if_, &ids, 0
 };
 MultiTokenizer
 tokenizer(tokenizers);
 Tokens
 source(tokenizer,
 L"fo for fore forth ofor oforo i if ifdom ifer fif fifi forfi fifor"); // NOTE(review): input has "for"/"if" as whole words and as prefix/suffix/infix of longer words; it contains no literals or escapes
-WhiteSpaceFilter
+StdFilter
 tokens(source);
 while (tokens) PrintToken(tokens++); // print every token that passes the filter
 cout<<endl;
 }
-void TestTokenizationErrors(Itk::TestMgr* )
+void TestTokenization6(Itk::TestMgr * testMgr) // Tokenizes arithmetic-like text mixing "/" and "*" symbols with "//" and "/* */" comments; prints the tokens once straight from Tokens and once through StdFilter. testMgr is not used in the body.
+{
+WhitespaceTokenizer
+ws;
+LineCommentTokenizer
+line;
+SectionCommentTokenizer
+section;
+IdTokenizer
+ids;
+IntLitTokenizer
+intLit;
+RealLitTokenizer
+realLit;
+SymbolTokenizer
+div(L"slash", L"/"); // first argument is the token type name, second the symbol text
+SymbolTokenizer
+mul(L"star", L"*");
+SymbolTokenizer
+plus(L"plus", L"+");
+SymbolTokenizer
+minus(L"minus", L"-");
+SymbolTokenizer
+equal(L"equals", L"=");
+Tokenizer* tokenizers[] = {
+&ws, &line, &section, &ids, &intLit, &realLit, &div, &mul, &plus, &minus, &equal, 0 // presumably the trailing 0 terminates the list for MultiTokenizer — confirm
+};
+MultiTokenizer
+tokenizer(tokenizers);
+const wchar_t* text =
+	L"4 + 6 = 2 * 5\n"
+L"6 / 2 = 1*3 // true\n"
+		L"3 / x /*important thingie*/ = 2 * y\n"
+		L"6 / x * / * / /* non sense / * / */ // zap" // NOTE(review): no "\n" here, so the next literal's "//" is appended directly after "zap" — confirm this is intended
+		L"//\n"
+		L"//"; // text ends with a "//" that has no trailing newline
+{ // pass 1: tokens taken directly from Tokens, no filter applied
+		cout<<"With whitespaces & comments visible"<<endl;
+		Tokens
+			tokens(tokenizer, text);
+		while (tokens) PrintToken(tokens++);
+		cout<<endl;
+}
+{ // pass 2: same text, token stream wrapped in StdFilter
+		cout<<"With whitespaces & comments filtered"<<endl;
+		Tokens
+			source(tokenizer, text);
+		StdFilter tokens(source);
+		while (tokens) PrintToken(tokens++);
+		cout<<endl;
+}
+}
+void TestTokenizationErrors(Itk::TestMgr* mgr)
 {
 	WhitespaceTokenizer ws;
 	IdTokenizer ids;
 	LitTokenizer lits('\'');
 	SymbolTokenizer lb(TOKEN_LEFT_BRACKET, L"(");
 	{
 		Tokens tokens(tokenizer, text = L"stdtokens>lowercase>stopwords('a', 'an','the)>stem('en')");
 		try {
 			while (tokens) PrintToken(tokens++);
 		} catch (LexException& exc) {
-/* OBS
+			exc.setContext(text);
-			wcout<<endl<<L"LexException: "<<exc.describe(text)<<endl;
+			wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl;
-*/
-exc.setContext(text);
-wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl;
 		} catch (exception& exc) {
 			cout<<endl<<"Exception: "<<exc.what()<<endl;
 		}
 	}
 	{
 		Tokens tokens(tokenizer, text = L"fas-324we?`213ff3*21(+");
 		try {
 			while (tokens) PrintToken(tokens++);
 		} catch (LexException& exc) {
-/* OBS
+			exc.setContext(text);
-			wcout<<endl<<L"LexException: "<<exc.describe(text)<<endl;
+			wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl;
-*/
-exc.setContext(text);
-wcout<<endl<<L"LexException: "<<exc.wWhat()<<endl;
 		} catch (exception& exc) {
 			cout<<endl<<"Exception: "<<exc.what()<<endl;
 		}
 	}
 }
+void TestWhitespaceSplitter(Itk::TestMgr* mgr) // Runs WhitespaceSplitter over five inputs and prints each token's text in double quotes, one output line per input; mgr is not used in the body.
+{
+	{
+		WhitespaceSplitter tokens(L"foobar foo bar foo\tbar _*4 4bar foo*bar foo\nbar foo\rbar foo\0bar"); // mixes space, \t, \n and \r; NOTE(review): the literal has an embedded \0 — if the constructor takes a NUL-terminated string, the final "bar" is never seen; confirm
+		while (tokens) printf(" \"%S\"", tokens++.text().c_str()); // assumes printf's %S accepts the wide string returned by text().c_str() on the target platform — TODO confirm
+		printf("\n");
+	}
+	{
+		WhitespaceSplitter tokens(L"foobar"); // single word, no whitespace
+		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
+		printf("\n");
+	}
+	{
+		WhitespaceSplitter tokens(L"   foobar  \r\n"); // one word with leading and trailing whitespace
+		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
+		printf("\n");
+	}
+	{
+		WhitespaceSplitter tokens(L"   "); // whitespace only
+		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
+		printf("\n");
+	}
+	{
+		WhitespaceSplitter tokens(L""); // empty input
+		while (tokens) printf(" \"%S\"", tokens++.text().c_str());
+		printf("\n");
+	}
+}
 Itk::TesterBase * CreateParsingTests()
 {
 using namespace Itk;
 SuiteTester
 * parsingTests = new SuiteTester("parsing");
 parsingTests->add("tokenization1",
 TestTokenization1,
 "tokenization1");
 parsingTests->add("tokenization2",
 "tokenization4");
 parsingTests->add("tokenization5",
 TestTokenization5,
 "tokenization5");
+parsingTests->add("tokenization6",
+TestTokenization6,
+"tokenization6");
 parsingTests->add("syntaxerrors",
 TestTokenizationErrors,
 "syntaxerrors");
+parsingTests->add("whitespace",
+					  TestWhitespaceSplitter,
+"whitespace");
 return parsingTests;
 }

changeset 8	6547bf8ca13a
parent 0	671dee74050a