searchengine/oss/loc/analysisunittest/src/tinyanalysis.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: Unit tests for the tiny analysis tokenizers and the UTF-16, Jamu and Hangul iterators.
*
*/

#include "tinyanalysis.h"
#include "tinyanalysis.inl"
#include "tinyunicode.h"

#include "itk.h"

#include <iostream>

#include "CLucene.h"

#include "wchar.h"

#include "analysisunittest.h"
#include "evaluationtool.h"

using namespace evaluationtool; 

/**
 * Tokenizes text with a CustomTokenizer built on iswalpha (wrapped in a
 * RelaxedTokenizer) and prints every token it yields, double-quoted and
 * space-separated, followed by a newline.
 *
 * @param testMgr  test manager handle; not used by this helper
 * @param text     character source handed to the tokenizer; T is whatever
 *                 iterator/pointer type the caller passes (taken by value)
 */
template <class T>
void TestLetters(Itk::TestMgr* testMgr, T text) {
	using namespace analysis::tiny;
	
	CustomTokenizer<T> letters(iswalpha);
	RelaxedTokenizer<T> tokens(letters);
	
	Token<T> t; 
	// The assignment is intentional: consume() hands back the next token and
	// the loop ends once the returned token evaluates to false.
	while ((t = tokens.consume(text))) {
		// NOTE(review): utf16() is given no capacity, so this assumes a
		// token never exceeds 255 characters plus terminator - confirm.
		wchar_t buf[256];
		t.utf16(buf);
		// %ls is the standard wide-string conversion; %S is an extension
		// whose meaning differs between C libraries.
		wprintf(L"\"%ls\" ", buf);
	}
	wprintf(L"\n");
}

/**
 * Character predicate: the logical negation of iswspace().
 * TestNGram passes it to NGramTokenizer as its character class.
 *
 * @param c  character code to classify
 * @return   1 when c is not a wide whitespace character, 0 when it is
 */
int isnotspace(int c) {
    if (iswspace(c)) {
        return 0;
    }
    return 1;
}

/**
 * Tokenizes text with an NGramTokenizer constructed with (2, isnotspace),
 * wrapped in a RelaxedTokenizer, and prints every token it yields,
 * double-quoted and space-separated, followed by a newline.
 *
 * @param testMgr  test manager handle; not used by this helper
 * @param text     character source handed to the tokenizer; T is whatever
 *                 iterator/pointer type the caller passes (taken by value)
 */
template <class T>
void TestNGram(Itk::TestMgr* testMgr, T text) {
	using namespace analysis::tiny;
	
	// NOTE(review): the first argument is presumably the n-gram length
	// (bigrams) - confirm against NGramTokenizer.
	NGramTokenizer<T> ngram(2, isnotspace);
	RelaxedTokenizer<T> tokens(ngram);
	
	Token<T> t; 
	// The assignment is intentional: consume() hands back the next token and
	// the loop ends once the returned token evaluates to false.
	while ((t = tokens.consume(text))) {
		// NOTE(review): utf16() is given no capacity, so this assumes a
		// token never exceeds 255 characters plus terminator - confirm.
		wchar_t buf[256];
		t.utf16(buf);
		// %ls is the standard wide-string conversion; %S is an extension
		// whose meaning differs between C libraries.
		wprintf(L"\"%ls\" ", buf);
	}
	wprintf(L"\n");
}

/**
 * Runs the letter tokenizer and the n-gram tokenizer directly over a plain
 * wide-character string literal.
 *
 * @param testMgr  test manager handle, forwarded to the helpers
 */
void TinyWcharTest(Itk::TestMgr* testMgr) {
	// Same sample for both helpers; each receives its own copy of the pointer.
	const wchar_t* const sample = L"foo bar foobar foo*bar foo_bar";

	TestLetters(testMgr, sample);
	TestNGram(testMgr, sample);
}

/**
 * Runs the letter tokenizer and the n-gram tokenizer over text that is read
 * through a CLucene StringReader and a ReaderBuffer<8>.
 *
 * @param testMgr  test manager handle, forwarded to the helpers
 */
void TinyReaderTest(Itk::TestMgr* testMgr) {
	// NOTE(review): the template argument 8 is presumably a buffer size far
	// smaller than the sample, forcing refills mid-token - confirm.
	typedef analysis::tiny::cl::ReaderBuffer<8> SmallBuffer;

	const wchar_t* const sample = L"foo bar foobar foo*bar foo_bar";

	// Each pass gets a fresh reader and buffer in its own scope.
	{
		lucene::util::StringReader input(sample);
		SmallBuffer chunked(input);
		TestLetters(testMgr, chunked.begin());
	}
	{
		lucene::util::StringReader input(sample);
		SmallBuffer chunked(input);
		TestNGram(testMgr, chunked.begin());
	}
}

/**
 * Feeds every entry of the CHINESE_PRC_TEXTCORPUS corpus through TestNGram,
 * reading each entry via StringReader -> ReaderBuffer<64> -> Utf16Iterator.
 *
 * @param testMgr  test manager handle, forwarded to TestNGram
 */
void TinyChinaTest(Itk::TestMgr* testMgr) {
    using namespace analysis::tiny;
    typedef cl::ReaderBuffer<64> buffer;
    typedef Utf16Iterator<buffer::iterator> u16iter;

    Corpus corpus(CHINESE_PRC_TEXTCORPUS);

    int line = 0;
    while (line < corpus.size()) {
        lucene::util::StringReader reader(corpus[line]);
        buffer buf(reader);

        // Wrap the raw buffer iterator before handing it to the tokenizer.
        u16iter start( buf.begin() );
        TestNGram( testMgr, start );

        ++line;
    }
}

/**
 * Exercises Utf16Iterator and Utf16Writer on every entry of the
 * CHINESE_PRC_TEXTCORPUS corpus. For each entry it
 *   1. prints every value the Utf16Iterator yields (via a void* cast),
 *   2. copies the raw buffer contents into one array, writes the iterator
 *      back out through Utf16Writer into a second array and prints it,
 *   3. prints one mark per position: "." where the two arrays agree and
 *      "x" where they differ.
 *
 * @param testMgr  test manager handle; not used by this test
 */
void TinyUtf16Test(Itk::TestMgr* testMgr) {
    using namespace analysis::tiny;
    Corpus corpus(CHINESE_PRC_TEXTCORPUS); 
    typedef cl::ReaderBuffer<512> buffer;
    typedef Utf16Iterator<buffer::iterator> u16iter;

    // Capacity of the scratch arrays in wchar_t units, terminator included.
    const int KMaxChars = 512;

    for (int line = 0; line < corpus.size(); line++) {
        // Step 1: dump the values produced by the UTF-16 iterator.
        {
            lucene::util::StringReader reader(corpus[line]);
            buffer buf(reader);
            {
                u16iter decoded( buf.begin() );
                for (; *decoded; ++decoded) {
                    int c = *decoded;
                    // Kept as a pointer print so the output format is unchanged.
                    wcout<<(void*)c<<L" ";
                }
            }
        }
        wcout<<endl;
        {
            // Step 2a: raw copy of the buffer contents.
            wchar_t raw[KMaxChars];
            {
                lucene::util::StringReader reader(corpus[line]);
                buffer buf(reader);
    
                buffer::iterator src = buf.begin();
                int n = 0;
                // Stop one short of the capacity so the terminator always
                // fits, even for an entry longer than the array.
                for (; n < KMaxChars - 1 && *src; n++, ++src) {
                    raw[n] = *src;
                }
                raw[n] = L'\0';
            }

            // Step 2b: iterator -> writer round trip.
            lucene::util::StringReader reader(corpus[line]);
            buffer buf(reader);
            u16iter decoded( buf.begin() );
            // Zero-filled so the comparison below never reads indeterminate
            // values.
            // NOTE(review): Utf16Writer is given no capacity; an entry longer
            // than the array would still overflow it - confirm corpus sizes.
            wchar_t rewritten[KMaxChars] = { 0 };
            wcout<<flush;
            Utf16Writer<wchar_t*>(rewritten)<<decoded<<L'\0';
            // %ls is the standard wide-string conversion; %S is an extension
            // whose meaning differs between C libraries.
            wprintf(L"%ls\n", rewritten);
            fflush(stdout);

            // Step 3: position-by-position comparison. Walk only while both
            // strings have characters left, so neither array is read past
            // its own terminator.
            int k = 0;
            for (; raw[k] && rewritten[k]; k++) {
                if (raw[k] != rewritten[k]) {
                    wcout<<"x";
                } else {
                    wcout<<".";
                }
            }
            // Whatever remains of the longer string has no counterpart in the
            // other one: every such position is a mismatch.
            const wchar_t* tail = raw[k] ? raw + k : rewritten + k;
            for (; *tail; ++tail) {
                wcout<<"x";
            }
        }

        wcout<<endl;
    }
}

/**
 * Prints every entry of the KOREAN_TEXTCORPUS corpus after passing it through
 * StringReader -> ReaderBuffer<512> -> Utf16Iterator -> JamuIterator and
 * collecting the result with utf16str().
 *
 * @param testMgr  test manager handle; not used by this test
 */
void TinyJamuTest(Itk::TestMgr* testMgr) {
    using namespace analysis::tiny;
    Corpus corpus(KOREAN_TEXTCORPUS);
    
    typedef cl::ReaderBuffer<512> buffer;
    typedef Utf16Iterator<buffer::iterator> u16iter;
    typedef JamuIterator<u16iter> iter;
    
    for (int line = 0; line < corpus.size(); line++) {
        lucene::util::StringReader reader(corpus[line]);
        buffer buf(reader);
        iter i(u16iter(buf.begin()));

        // Wide output, like the other tests in this file: byte-oriented
        // printf must not be mixed with wprintf/wcout on the same stdout.
        // NOTE(review): assumes utf16str() yields a wchar_t string, as the
        // previous %S conversion already did - confirm.
        wprintf(L"%ls\n", utf16str(i).c_str());
    }
}


/**
 * Prints every entry of the KOREAN_TEXTCORPUS corpus after passing it through
 * StringReader -> ReaderBuffer<512> -> Utf16Iterator -> HangulIterator and
 * collecting the result with utf16str().
 *
 * @param testMgr  test manager handle; not used by this test
 */
void TinyHangulTest(Itk::TestMgr* testMgr) {
    using namespace analysis::tiny;
    Corpus corpus(KOREAN_TEXTCORPUS);
    
    typedef cl::ReaderBuffer<512> buffer;
    typedef Utf16Iterator<buffer::iterator> u16iter;
    typedef HangulIterator<u16iter> iter;
    
    for (int line = 0; line < corpus.size(); line++) {
        lucene::util::StringReader reader(corpus[line]);
        buffer buf(reader);
        iter i(u16iter(buf.begin()));

        // Wide output, like the other tests in this file: byte-oriented
        // printf must not be mixed with wprintf/wcout on the same stdout.
        // NOTE(review): assumes utf16str() yields a wchar_t string, as the
        // previous %S conversion already did - confirm.
        wprintf(L"%ls\n", utf16str(i).c_str());
    }
}
/**
 * Builds the "tiny" test suite and registers the six tests defined in this
 * file with it.
 *
 * @return the newly allocated suite; it is returned as a raw pointer and is
 *         not freed here
 */
Itk::TesterBase * CreateTinyAnalysisUnitTest() 
{
	using namespace Itk;

	// One row per test; each label is passed as both the first and the
	// third argument of add().
	struct Entry {
		const char* label;
		void (*run)(Itk::TestMgr*);
	};
	const Entry entries[] = {
		{ "wchar",  TinyWcharTest  },
		{ "reader", TinyReaderTest },
		{ "cn",     TinyChinaTest  },
		{ "utf16",  TinyUtf16Test  },
		{ "jamu",   TinyJamuTest   },
		{ "hangul", TinyHangulTest },
	};

	SuiteTester* suite = new SuiteTester( "tiny" );

	// Registration order matches the table order.
	for (unsigned idx = 0; idx < sizeof(entries) / sizeof(entries[0]); ++idx) {
		suite->add( entries[idx].label, entries[idx].run, entries[idx].label );
	}

	return suite;
}