author | Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com> |
Tue, 06 Jul 2010 15:30:04 +0300 | |
changeset 10 | afe194b6b1cd |
permissions | -rw-r--r-- |
10
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
1 |
/* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
2 |
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
3 |
* All rights reserved. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
4 |
* This component and the accompanying materials are made available |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
5 |
* under the terms of "Eclipse Public License v1.0" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
6 |
* which accompanies this distribution, and is available |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
7 |
* at the URL "http://www.eclipse.org/legal/epl-v10.html". |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
8 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
9 |
* Initial Contributors: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
10 |
* Nokia Corporation - initial contribution. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
11 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
12 |
* Contributors: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
13 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
14 |
* Description: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
15 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
16 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
17 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
18 |
#ifndef KOREANANALYZER_H_ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
19 |
#define KOREANANALYZER_H_ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
20 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
21 |
#include "Clucene.h" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
22 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
23 |
#include "ngram.h" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
24 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
25 |
#include "tinyanalysis.h" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
26 |
#include "tinyutf16.h" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
27 |
#include "tinyunicode.h" |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
28 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
29 |
namespace analysis |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
30 |
{ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
31 |
// Forward declarations |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
32 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
33 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
34 |
* Special Korean analyzer that is designed so, that Cpix can |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
35 |
* update the result list, when each individual Jamu character is |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
36 |
* entered. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
37 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
38 |
* The analyzer tries to first convert given character stream into |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
39 |
* a form, where all Jamu characters are composed into Hangul form. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
40 |
* This means, that character sequences of form LV and LVT are |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
41 |
* eliminated and replaced with hangul syllables (L is leading Jamu |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
42 |
* consonant, V is for vocal and T is for trailing consonant). |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
43 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
44 |
* The idea behind the analyzer is that it produces up to 3 alternative |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
45 |
* tokens for each hangul syllabic. All of these alternatives are returned |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
46 |
* to be located at the same position. Let's have some Hangul syllabic H1 |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
47 |
* consisting of Jamu characters so that H1=J1J2J3. If H2=J1J2, then first |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
48 |
* returned token is H1, second token is H2 and third token is J1. This |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
49 |
* means, that when user enters H1, H2 or J1, the term H1 will be found. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
50 |
* Also, if user enters J1J2J3 or J1J2, term will be found, because |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
51 |
* J1J2J3 is automatically turned to H1 and J1J2 is turned to H2. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
52 |
* |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
53 |
* NOTE: This analyzer MUST NOT be used, when searching, because |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
54 |
* CLuceneQueryParser will break, when it faces tokens with zero |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
55 |
* increment. Use KoreanQueryAnalyzer for searching material indexed |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
56 |
* with this analyzer. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
57 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
58 |
class KoreanTokenizer : public lucene::analysis::Tokenizer { |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
59 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
60 |
public: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
61 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
62 |
/** Used to read from buffer */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
63 |
typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
64 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
65 |
/** Turns utf16 code points into unicode */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
66 |
typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
67 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
68 |
/** Turns Jamu alphabets into Hangul syllables */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
69 |
typedef tiny::HangulIterator<utf16_iterator> iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
70 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
71 |
KoreanTokenizer(lucene::util::Reader* reader); |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
72 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
73 |
virtual bool next(lucene::analysis::Token* token); |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
74 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
75 |
private: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
76 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
77 |
/** Jamu form of last consumed hangul syllable */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
78 |
wchar_t jamu_[4]; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
79 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
80 |
/** offsets of last consumed hangul syllable */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
81 |
int begin_, end_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
82 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
83 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
84 |
* The amount of jamu characters left in buffer. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
85 |
* If this is non-zero, hangul syllable is being processed. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
86 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
87 |
int state_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
88 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
89 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
90 |
* Tiny CJK tokenizer is used to construct 1-grams out of |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
91 |
* chinese and japanese characters and to turn latin script |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
92 |
* into terms. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
93 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
94 |
TinyCjkTokenizer<iterator> t_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
95 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
96 |
/** 512 byte buffer for storing characters read with reader */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
97 |
tiny::cl::ReaderBuffer<512> in_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
98 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
99 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
100 |
* Reads utf16 from in_ buffer, turns it into unicode and |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
101 |
* then composes jamu alphabets into hangul syllables. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
102 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
103 |
iterator i_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
104 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
105 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
106 |
}; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
107 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
108 |
/** Korean tokenizer plus lowercase filter */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
109 |
typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
110 |
KoreanAnalyzer; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
111 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
112 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
113 |
* Turns Jamu characters into Hangul syllables and generates 1-grams for |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
114 |
* all Chinese, Korean and Japanese text. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
115 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
116 |
class KoreanQueryTokenizer : public lucene::analysis::Tokenizer { |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
117 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
118 |
public: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
119 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
120 |
/** Used to read from buffer */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
121 |
typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
122 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
123 |
/** Turns utf16 code points into unicode */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
124 |
typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
125 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
126 |
/** Turns Hangul syllables into Jamu alphabets */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
127 |
typedef tiny::HangulIterator<utf16_iterator> iterator; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
128 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
129 |
public: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
130 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
131 |
KoreanQueryTokenizer( lucene::util::Reader* reader ); |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
132 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
133 |
virtual bool next( lucene::analysis::Token* token ); |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
134 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
135 |
private: |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
136 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
137 |
/** Buffer for storing characters read with reader */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
138 |
TinyCjkTokenizer<iterator> t_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
139 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
140 |
/** Buffer for storing characters read with reader */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
141 |
tiny::cl::ReaderBuffer<512> in_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
142 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
143 |
/** |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
144 |
* Reads utf16 from in_ buffer, turns it into unicode and |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
145 |
* then composes jamu alphabets into hangul syllables. |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
146 |
*/ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
147 |
iterator i_; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
148 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
149 |
}; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
150 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
151 |
/** Korean query analyzer plus lowercase filter */ |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
152 |
typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
153 |
KoreanQueryAnalyzer; |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
154 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
155 |
} |
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
156 |
|
afe194b6b1cd
Revision: 201025
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
157 |
#endif /* KOREANANALYZER_H_ */ |