searchengine/oss/loc/analysisunittest/src/tinyanalysis.cpp
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
     14 * Description: Unit tests for the tiny analysis tokenizers and the UTF-16 / Jamu / Hangul iterators.
       
    15 *
       
    16 */
       
    17 
       
    18 #include "tinyanalysis.h"
       
    19 #include "tinyanalysis.inl"
       
    20 #include "tinyunicode.h"
       
    21 
       
    22 #include "itk.h"
       
    23 
       
    24 #include <iostream>
       
    25 
       
    26 #include "CLucene.h"
       
    27 
       
    28 #include "wchar.h"
       
    29 
       
    30 #include "analysisunittest.h"
       
    31 #include "evaluationtool.h"
       
    32 
       
    33 using namespace evaluationtool; 
       
    34 
       
    35 template <class T>
       
    36 void TestLetters(Itk::TestMgr* testMgr, T text) {
       
    37 	using namespace analysis::tiny;
       
    38 	
       
    39 	CustomTokenizer<T> letters(iswalpha);
       
    40 	RelaxedTokenizer<T> tokens(letters);
       
    41 	
       
    42 	Token<T> t; 
       
    43 	while (t = tokens.consume(text)) {
       
    44         wchar_t buf[256];
       
    45         t.utf16(buf);
       
    46         wprintf(L"\"%S\" ", buf);
       
    47 	}
       
    48 	wprintf(L"\n");
       
    49 }
       
    50 
       
    51 int isnotspace(int c) {
       
    52     return !iswspace(c);
       
    53 }
       
    54 
       
    55 template <class T>
       
    56 void TestNGram(Itk::TestMgr* testMgr, T text) {
       
    57 	using namespace analysis::tiny;
       
    58 	
       
    59 	NGramTokenizer<T> ngram(2, isnotspace);
       
    60 	RelaxedTokenizer<T> tokens(ngram);
       
    61 	
       
    62 	Token<T> t; 
       
    63 	while (t = tokens.consume(text)) {
       
    64         wchar_t buf[256];
       
    65         t.utf16(buf);
       
    66         wprintf(L"\"%S\" ", buf);
       
    67 	}
       
    68     wprintf(L"\n");
       
    69 }
       
    70 
       
    71 void TinyWcharTest(Itk::TestMgr* testMgr) {
       
    72 	TestLetters(testMgr, L"foo bar foobar foo*bar foo_bar");
       
    73 	TestNGram(testMgr, L"foo bar foobar foo*bar foo_bar");
       
    74 }
       
    75 
       
    76 void TinyReaderTest(Itk::TestMgr* testMgr) {
       
    77 	{
       
    78 		lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar");
       
    79 		analysis::tiny::cl::ReaderBuffer<8> buf(reader);
       
    80 		TestLetters(testMgr, buf.begin());
       
    81 	}
       
    82 	{
       
    83 		lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar");
       
    84 		analysis::tiny::cl::ReaderBuffer<8> buf(reader);
       
    85 		TestNGram(testMgr, buf.begin());
       
    86 	}
       
    87 }
       
    88 
       
    89 void TinyChinaTest(Itk::TestMgr* testMgr) {
       
    90     using namespace analysis::tiny;
       
    91     Corpus corpus(CHINESE_PRC_TEXTCORPUS); 
       
    92     typedef cl::ReaderBuffer<64> buffer;
       
    93 
       
    94     for (int i = 0; i < corpus.size(); i++) {
       
    95         lucene::util::StringReader reader(corpus[i]);
       
    96         buffer buf(reader);
       
    97         TestNGram( testMgr, Utf16Iterator<buffer::iterator>( buf.begin() ) );
       
    98     }
       
    99 }
       
   100 
       
   101 void TinyUtf16Test(Itk::TestMgr* testMgr) {
       
   102     using namespace analysis::tiny;
       
   103     Corpus corpus(CHINESE_PRC_TEXTCORPUS); 
       
   104     typedef cl::ReaderBuffer<512> buffer;
       
   105     typedef Utf16Iterator<buffer::iterator> u16iter;
       
   106 
       
   107     for (int i = 0; i < corpus.size(); i++) {
       
   108         {
       
   109             lucene::util::StringReader reader(corpus[i]);
       
   110             buffer buf(reader);
       
   111             {
       
   112                 u16iter i( buf.begin() );
       
   113                 for (; *i; ++i) {
       
   114                     int c = *i;
       
   115                     wcout<<(void*)c<<L" ";
       
   116                 }
       
   117             }
       
   118         }
       
   119         wcout<<endl;
       
   120         {
       
   121             wchar_t c[512];
       
   122             {
       
   123                 lucene::util::StringReader reader(corpus[i]);
       
   124                 buffer buf(reader);
       
   125     
       
   126                 buffer::iterator j = buf.begin();
       
   127                 {
       
   128                     int i;
       
   129                     for (i = 0; *j; i++, ++j) {
       
   130                         c[i] = *j;
       
   131                     }
       
   132                     c[i] = '\0';
       
   133                 }
       
   134             }
       
   135             lucene::util::StringReader reader(corpus[i]);
       
   136             buffer buf(reader);
       
   137             u16iter i( buf.begin() );
       
   138             wchar_t b[512];
       
   139             wcout<<flush;
       
   140             Utf16Writer<wchar_t*>(b)<<i<<L'\0';
       
   141             wprintf(L"%S\n", b);
       
   142             fflush(stdout);
       
   143             for (int k = 0; c[k] || b[k]; k++) {
       
   144                 if (c[k] != b[k]) {
       
   145                     wcout<<"x";
       
   146                 } else {
       
   147                     wcout<<".";
       
   148                 }
       
   149             }
       
   150         }
       
   151 
       
   152         wcout<<endl;
       
   153     }
       
   154 }
       
   155 
       
   156 void TinyJamuTest(Itk::TestMgr* testMgr) {
       
   157     using namespace analysis::tiny;
       
   158     Corpus corpus(KOREAN_TEXTCORPUS);
       
   159     
       
   160     typedef cl::ReaderBuffer<512> buffer;
       
   161     typedef Utf16Iterator<buffer::iterator> u16iter;
       
   162     typedef JamuIterator<u16iter> iter;
       
   163     
       
   164     for (int line = 0; line < corpus.size(); line++) {
       
   165         lucene::util::StringReader reader(corpus[line]);
       
   166         buffer buf(reader);
       
   167         iter i(u16iter(buf.begin()));
       
   168 
       
   169         printf("%S\n", utf16str(i).c_str());
       
   170     }
       
   171 }
       
   172 
       
   173 
       
   174 void TinyHangulTest(Itk::TestMgr* testMgr) {
       
   175     using namespace analysis::tiny;
       
   176     Corpus corpus(KOREAN_TEXTCORPUS);
       
   177     
       
   178     typedef cl::ReaderBuffer<512> buffer;
       
   179     typedef Utf16Iterator<buffer::iterator> u16iter;
       
   180     typedef HangulIterator<u16iter> iter;
       
   181     
       
   182     for (int line = 0; line < corpus.size(); line++) {
       
   183         lucene::util::StringReader reader(corpus[line]);
       
   184         buffer buf(reader);
       
   185         iter i(u16iter(buf.begin()));
       
   186 
       
   187         printf("%S\n", utf16str(i).c_str());
       
   188     }
       
   189 }
       
   190 Itk::TesterBase * CreateTinyAnalysisUnitTest() 
       
   191 {
       
   192 	using namespace Itk;
       
   193 	
       
   194 	SuiteTester
       
   195 		* testSuite = 
       
   196 			new SuiteTester( "tiny" );
       
   197 	
       
   198 	testSuite->add( "wchar", TinyWcharTest, "wchar" );
       
   199 	testSuite->add( "reader", TinyReaderTest, "reader" );
       
   200     testSuite->add( "cn", TinyChinaTest, "cn" );
       
   201     testSuite->add( "utf16", TinyUtf16Test, "utf16" );
       
   202     testSuite->add( "jamu", TinyJamuTest, "jamu" );
       
   203     testSuite->add( "hangul", TinyHangulTest, "hangul" );
       
   204     
       
   205 	return testSuite;
       
   206 }