searchengine/oss/loc/analysisunittest/src/tinyanalysis.cpp
changeset 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysisunittest/src/tinyanalysis.cpp	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,206 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+
+#include "tinyanalysis.h"
+#include "tinyanalysis.inl"
+#include "tinyunicode.h"
+
+#include "itk.h"
+
+#include <iostream>
+
+#include "CLucene.h"
+
+#include "wchar.h"
+
+#include "analysisunittest.h"
+#include "evaluationtool.h"
+
+using namespace evaluationtool; 
+
+template <class T>
+void TestLetters(Itk::TestMgr* testMgr, T text) {
+	using namespace analysis::tiny;
+	
+	CustomTokenizer<T> letters(iswalpha);
+	RelaxedTokenizer<T> tokens(letters);
+	
+	Token<T> t; 
+	while (t = tokens.consume(text)) {
+        wchar_t buf[256];
+        t.utf16(buf);
+        wprintf(L"\"%S\" ", buf);
+	}
+	wprintf(L"\n");
+}
+
+int isnotspace(int c) {
+    return !iswspace(c);
+}
+
+template <class T>
+void TestNGram(Itk::TestMgr* testMgr, T text) {
+	using namespace analysis::tiny;
+	
+	NGramTokenizer<T> ngram(2, isnotspace);
+	RelaxedTokenizer<T> tokens(ngram);
+	
+	Token<T> t; 
+	while (t = tokens.consume(text)) {
+        wchar_t buf[256];
+        t.utf16(buf);
+        wprintf(L"\"%S\" ", buf);
+	}
+    wprintf(L"\n");
+}
+
+void TinyWcharTest(Itk::TestMgr* testMgr) {
+	TestLetters(testMgr, L"foo bar foobar foo*bar foo_bar");
+	TestNGram(testMgr, L"foo bar foobar foo*bar foo_bar");
+}
+
+void TinyReaderTest(Itk::TestMgr* testMgr) {
+	{
+		lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar");
+		analysis::tiny::cl::ReaderBuffer<8> buf(reader);
+		TestLetters(testMgr, buf.begin());
+	}
+	{
+		lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar");
+		analysis::tiny::cl::ReaderBuffer<8> buf(reader);
+		TestNGram(testMgr, buf.begin());
+	}
+}
+
+void TinyChinaTest(Itk::TestMgr* testMgr) {
+    using namespace analysis::tiny;
+    Corpus corpus(CHINESE_PRC_TEXTCORPUS); 
+    typedef cl::ReaderBuffer<64> buffer;
+
+    for (int i = 0; i < corpus.size(); i++) {
+        lucene::util::StringReader reader(corpus[i]);
+        buffer buf(reader);
+        TestNGram( testMgr, Utf16Iterator<buffer::iterator>( buf.begin() ) );
+    }
+}
+
+void TinyUtf16Test(Itk::TestMgr* testMgr) {
+    using namespace analysis::tiny;
+    Corpus corpus(CHINESE_PRC_TEXTCORPUS); 
+    typedef cl::ReaderBuffer<512> buffer;
+    typedef Utf16Iterator<buffer::iterator> u16iter;
+
+    for (int i = 0; i < corpus.size(); i++) {
+        {
+            lucene::util::StringReader reader(corpus[i]);
+            buffer buf(reader);
+            {
+                u16iter i( buf.begin() );
+                for (; *i; ++i) {
+                    int c = *i;
+                    wcout<<(void*)c<<L" ";
+                }
+            }
+        }
+        wcout<<endl;
+        {
+            wchar_t c[512];
+            {
+                lucene::util::StringReader reader(corpus[i]);
+                buffer buf(reader);
+    
+                buffer::iterator j = buf.begin();
+                {
+                    int i;
+                    for (i = 0; *j; i++, ++j) {
+                        c[i] = *j;
+                    }
+                    c[i] = '\0';
+                }
+            }
+            lucene::util::StringReader reader(corpus[i]);
+            buffer buf(reader);
+            u16iter i( buf.begin() );
+            wchar_t b[512];
+            wcout<<flush;
+            Utf16Writer<wchar_t*>(b)<<i<<L'\0';
+            wprintf(L"%S\n", b);
+            fflush(stdout);
+            for (int k = 0; c[k] || b[k]; k++) {
+                if (c[k] != b[k]) {
+                    wcout<<"x";
+                } else {
+                    wcout<<".";
+                }
+            }
+        }
+
+        wcout<<endl;
+    }
+}
+
+void TinyJamuTest(Itk::TestMgr* testMgr) {
+    using namespace analysis::tiny;
+    Corpus corpus(KOREAN_TEXTCORPUS);
+    
+    typedef cl::ReaderBuffer<512> buffer;
+    typedef Utf16Iterator<buffer::iterator> u16iter;
+    typedef JamuIterator<u16iter> iter;
+    
+    for (int line = 0; line < corpus.size(); line++) {
+        lucene::util::StringReader reader(corpus[line]);
+        buffer buf(reader);
+        iter i(u16iter(buf.begin()));
+
+        printf("%S\n", utf16str(i).c_str());
+    }
+}
+
+
+void TinyHangulTest(Itk::TestMgr* testMgr) {
+    using namespace analysis::tiny;
+    Corpus corpus(KOREAN_TEXTCORPUS);
+    
+    typedef cl::ReaderBuffer<512> buffer;
+    typedef Utf16Iterator<buffer::iterator> u16iter;
+    typedef HangulIterator<u16iter> iter;
+    
+    for (int line = 0; line < corpus.size(); line++) {
+        lucene::util::StringReader reader(corpus[line]);
+        buffer buf(reader);
+        iter i(u16iter(buf.begin()));
+
+        printf("%S\n", utf16str(i).c_str());
+    }
+}
+Itk::TesterBase * CreateTinyAnalysisUnitTest() 
+{
+	using namespace Itk;
+	
+	SuiteTester
+		* testSuite = 
+			new SuiteTester( "tiny" );
+	
+	testSuite->add( "wchar", TinyWcharTest, "wchar" );
+	testSuite->add( "reader", TinyReaderTest, "reader" );
+    testSuite->add( "cn", TinyChinaTest, "cn" );
+    testSuite->add( "utf16", TinyUtf16Test, "utf16" );
+    testSuite->add( "jamu", TinyJamuTest, "jamu" );
+    testSuite->add( "hangul", TinyHangulTest, "hangul" );
+    
+	return testSuite;
+}