|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #include "tinyanalysis.h" |
|
19 #include "tinyanalysis.inl" |
|
20 #include "tinyunicode.h" |
|
21 |
|
22 #include "itk.h" |
|
23 |
|
24 #include <iostream> |
|
25 |
|
26 #include "CLucene.h" |
|
27 |
|
28 #include "wchar.h" |
|
29 |
|
30 #include "analysisunittest.h" |
|
31 #include "evaluationtool.h" |
|
32 |
|
33 using namespace evaluationtool; |
|
34 |
|
35 template <class T> |
|
36 void TestLetters(Itk::TestMgr* testMgr, T text) { |
|
37 using namespace analysis::tiny; |
|
38 |
|
39 CustomTokenizer<T> letters(iswalpha); |
|
40 RelaxedTokenizer<T> tokens(letters); |
|
41 |
|
42 Token<T> t; |
|
43 while (t = tokens.consume(text)) { |
|
44 wchar_t buf[256]; |
|
45 t.utf16(buf); |
|
46 wprintf(L"\"%S\" ", buf); |
|
47 } |
|
48 wprintf(L"\n"); |
|
49 } |
|
50 |
|
/**
 * Character predicate that is the logical negation of iswspace().
 *
 * @param c  Character code to classify.
 * @return   1 when iswspace(c) reports that c is not white space, 0 otherwise.
 */
int isnotspace(int c) {
    if (iswspace(c)) {
        return 0;
    }
    return 1;
}
|
54 |
|
55 template <class T> |
|
56 void TestNGram(Itk::TestMgr* testMgr, T text) { |
|
57 using namespace analysis::tiny; |
|
58 |
|
59 NGramTokenizer<T> ngram(2, isnotspace); |
|
60 RelaxedTokenizer<T> tokens(ngram); |
|
61 |
|
62 Token<T> t; |
|
63 while (t = tokens.consume(text)) { |
|
64 wchar_t buf[256]; |
|
65 t.utf16(buf); |
|
66 wprintf(L"\"%S\" ", buf); |
|
67 } |
|
68 wprintf(L"\n"); |
|
69 } |
|
70 |
|
71 void TinyWcharTest(Itk::TestMgr* testMgr) { |
|
72 TestLetters(testMgr, L"foo bar foobar foo*bar foo_bar"); |
|
73 TestNGram(testMgr, L"foo bar foobar foo*bar foo_bar"); |
|
74 } |
|
75 |
|
76 void TinyReaderTest(Itk::TestMgr* testMgr) { |
|
77 { |
|
78 lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar"); |
|
79 analysis::tiny::cl::ReaderBuffer<8> buf(reader); |
|
80 TestLetters(testMgr, buf.begin()); |
|
81 } |
|
82 { |
|
83 lucene::util::StringReader reader(L"foo bar foobar foo*bar foo_bar"); |
|
84 analysis::tiny::cl::ReaderBuffer<8> buf(reader); |
|
85 TestNGram(testMgr, buf.begin()); |
|
86 } |
|
87 } |
|
88 |
|
89 void TinyChinaTest(Itk::TestMgr* testMgr) { |
|
90 using namespace analysis::tiny; |
|
91 Corpus corpus(CHINESE_PRC_TEXTCORPUS); |
|
92 typedef cl::ReaderBuffer<64> buffer; |
|
93 |
|
94 for (int i = 0; i < corpus.size(); i++) { |
|
95 lucene::util::StringReader reader(corpus[i]); |
|
96 buffer buf(reader); |
|
97 TestNGram( testMgr, Utf16Iterator<buffer::iterator>( buf.begin() ) ); |
|
98 } |
|
99 } |
|
100 |
|
101 void TinyUtf16Test(Itk::TestMgr* testMgr) { |
|
102 using namespace analysis::tiny; |
|
103 Corpus corpus(CHINESE_PRC_TEXTCORPUS); |
|
104 typedef cl::ReaderBuffer<512> buffer; |
|
105 typedef Utf16Iterator<buffer::iterator> u16iter; |
|
106 |
|
107 for (int i = 0; i < corpus.size(); i++) { |
|
108 { |
|
109 lucene::util::StringReader reader(corpus[i]); |
|
110 buffer buf(reader); |
|
111 { |
|
112 u16iter i( buf.begin() ); |
|
113 for (; *i; ++i) { |
|
114 int c = *i; |
|
115 wcout<<(void*)c<<L" "; |
|
116 } |
|
117 } |
|
118 } |
|
119 wcout<<endl; |
|
120 { |
|
121 wchar_t c[512]; |
|
122 { |
|
123 lucene::util::StringReader reader(corpus[i]); |
|
124 buffer buf(reader); |
|
125 |
|
126 buffer::iterator j = buf.begin(); |
|
127 { |
|
128 int i; |
|
129 for (i = 0; *j; i++, ++j) { |
|
130 c[i] = *j; |
|
131 } |
|
132 c[i] = '\0'; |
|
133 } |
|
134 } |
|
135 lucene::util::StringReader reader(corpus[i]); |
|
136 buffer buf(reader); |
|
137 u16iter i( buf.begin() ); |
|
138 wchar_t b[512]; |
|
139 wcout<<flush; |
|
140 Utf16Writer<wchar_t*>(b)<<i<<L'\0'; |
|
141 wprintf(L"%S\n", b); |
|
142 fflush(stdout); |
|
143 for (int k = 0; c[k] || b[k]; k++) { |
|
144 if (c[k] != b[k]) { |
|
145 wcout<<"x"; |
|
146 } else { |
|
147 wcout<<"."; |
|
148 } |
|
149 } |
|
150 } |
|
151 |
|
152 wcout<<endl; |
|
153 } |
|
154 } |
|
155 |
|
156 void TinyJamuTest(Itk::TestMgr* testMgr) { |
|
157 using namespace analysis::tiny; |
|
158 Corpus corpus(KOREAN_TEXTCORPUS); |
|
159 |
|
160 typedef cl::ReaderBuffer<512> buffer; |
|
161 typedef Utf16Iterator<buffer::iterator> u16iter; |
|
162 typedef JamuIterator<u16iter> iter; |
|
163 |
|
164 for (int line = 0; line < corpus.size(); line++) { |
|
165 lucene::util::StringReader reader(corpus[line]); |
|
166 buffer buf(reader); |
|
167 iter i(u16iter(buf.begin())); |
|
168 |
|
169 printf("%S\n", utf16str(i).c_str()); |
|
170 } |
|
171 } |
|
172 |
|
173 |
|
174 void TinyHangulTest(Itk::TestMgr* testMgr) { |
|
175 using namespace analysis::tiny; |
|
176 Corpus corpus(KOREAN_TEXTCORPUS); |
|
177 |
|
178 typedef cl::ReaderBuffer<512> buffer; |
|
179 typedef Utf16Iterator<buffer::iterator> u16iter; |
|
180 typedef HangulIterator<u16iter> iter; |
|
181 |
|
182 for (int line = 0; line < corpus.size(); line++) { |
|
183 lucene::util::StringReader reader(corpus[line]); |
|
184 buffer buf(reader); |
|
185 iter i(u16iter(buf.begin())); |
|
186 |
|
187 printf("%S\n", utf16str(i).c_str()); |
|
188 } |
|
189 } |
|
190 Itk::TesterBase * CreateTinyAnalysisUnitTest() |
|
191 { |
|
192 using namespace Itk; |
|
193 |
|
194 SuiteTester |
|
195 * testSuite = |
|
196 new SuiteTester( "tiny" ); |
|
197 |
|
198 testSuite->add( "wchar", TinyWcharTest, "wchar" ); |
|
199 testSuite->add( "reader", TinyReaderTest, "reader" ); |
|
200 testSuite->add( "cn", TinyChinaTest, "cn" ); |
|
201 testSuite->add( "utf16", TinyUtf16Test, "utf16" ); |
|
202 testSuite->add( "jamu", TinyJamuTest, "jamu" ); |
|
203 testSuite->add( "hangul", TinyHangulTest, "hangul" ); |
|
204 |
|
205 return testSuite; |
|
206 } |