/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/

// Include guard; closed by the #endif at the end of this header.
#ifndef KOREANANALYZER_H_
#define KOREANANALYZER_H_

#include "Clucene.h"

#include "ngram.h"

#include "tinyanalysis.h"
#include "tinyutf16.h"
#include "tinyunicode.h"

namespace analysis
|
|
30 |
{
|
|
31 |
// Forward declarations
|
|
32 |
|
|
33 |
/**
|
|
34 |
* Special Korean analyzer that is designed so, that Cpix can
|
|
35 |
* update the result list, when each individual Jamu character is
|
|
36 |
* entered.
|
|
37 |
*
|
|
38 |
* The analyzer tries to first convert given character stream into
|
|
39 |
* a form, where all Jamu characters are composed into Hangul form.
|
|
40 |
* This means, that character sequences of form LV and LVT are
|
|
41 |
* eliminated and replaced with hangul syllables (L is leading Jamu
|
|
42 |
* consonant, V is for vocal and T is for trailing consonant).
|
|
43 |
*
|
|
44 |
* The idea behind the analyzer is that it produces up to 3 alternative
|
|
45 |
* tokens for each hangul syllabic. All of these alternatives are returned
|
|
46 |
* to be located at the same position. Let's have some Hangul syllabic H1
|
|
47 |
* consisting of Jamu characters so that H1=J1J2J3. If H2=J1J2, then first
|
|
48 |
* returned token is H1, second token is H2 and third token is J1. This
|
|
49 |
* means, that when user enters H1, H2 or J1, the term H1 will be found.
|
|
50 |
* Also, if user enters J1J2J3 or J1J2, term will be found, because
|
|
51 |
* J1J2J3 is automatically turned to H1 and J1J2 is turned to H2.
|
|
52 |
*
|
|
53 |
* NOTE: This analyzer MUST NOT be used, when searching, because
|
|
54 |
* CLuceneQueryParser will break, when it faces tokens with zero
|
|
55 |
* increment. Use KoreanQueryAnalyzer for searching material indexed
|
|
56 |
* with this analyzer.
|
|
57 |
*/
|
|
58 |
class KoreanTokenizer : public lucene::analysis::Tokenizer {
|
|
59 |
|
|
60 |
public:
|
|
61 |
|
|
62 |
/** Used to read from buffer */
|
|
63 |
typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
|
|
64 |
|
|
65 |
/** Turns utf16 code points into unicode */
|
|
66 |
typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
|
|
67 |
|
|
68 |
/** Turns Jamu alphabets into Hangul syllables */
|
|
69 |
typedef tiny::HangulIterator<utf16_iterator> iterator;
|
|
70 |
|
|
71 |
KoreanTokenizer(lucene::util::Reader* reader);
|
|
72 |
|
|
73 |
virtual bool next(lucene::analysis::Token* token);
|
|
74 |
|
|
75 |
private:
|
|
76 |
|
|
77 |
/** Jamu form of last consumed hangul syllable */
|
|
78 |
wchar_t jamu_[4];
|
|
79 |
|
|
80 |
/** offsets of last consumed hangul syllable */
|
|
81 |
int begin_, end_;
|
|
82 |
|
|
83 |
/**
|
|
84 |
* The amount of jamu characters left in buffer.
|
|
85 |
* If this is non-zero, hangul syllable is being processed.
|
|
86 |
*/
|
|
87 |
int state_;
|
|
88 |
|
|
89 |
/**
|
|
90 |
* Tiny CJK tokenizer is used to construct 1-grams out of
|
|
91 |
* chinese and japanese characters and to turn latin script
|
|
92 |
* into terms.
|
|
93 |
*/
|
|
94 |
TinyCjkTokenizer<iterator> t_;
|
|
95 |
|
|
96 |
/** 512 byte buffer for storing characters read with reader */
|
|
97 |
tiny::cl::ReaderBuffer<512> in_;
|
|
98 |
|
|
99 |
/**
|
|
100 |
* Reads utf16 from in_ buffer, turns it into unicode and
|
|
101 |
* then composes jamu alphabets into hangul syllables.
|
|
102 |
*/
|
|
103 |
iterator i_;
|
|
104 |
|
|
105 |
|
|
106 |
};
|
|
107 |
|
|
108 |
/** Korean tokenizer plus lowercase filter */
|
|
109 |
typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter>
|
|
110 |
KoreanAnalyzer;
|
|
111 |
|
|
112 |
/**
|
|
113 |
* Turns Jamu characters into Hangul syllables and generates 1-grams for
|
|
114 |
* all Chinese, Korean and Japanese text.
|
|
115 |
*/
|
|
116 |
class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
|
|
117 |
|
|
118 |
public:
|
|
119 |
|
|
120 |
/** Used to read from buffer */
|
|
121 |
typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
|
|
122 |
|
|
123 |
/** Turns utf16 code points into unicode */
|
|
124 |
typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
|
|
125 |
|
|
126 |
/** Turns Hangul syllables into Jamu alphabets */
|
|
127 |
typedef tiny::HangulIterator<utf16_iterator> iterator;
|
|
128 |
|
|
129 |
public:
|
|
130 |
|
|
131 |
KoreanQueryTokenizer( lucene::util::Reader* reader );
|
|
132 |
|
|
133 |
virtual bool next( lucene::analysis::Token* token );
|
|
134 |
|
|
135 |
private:
|
|
136 |
|
|
137 |
/** Buffer for storing characters read with reader */
|
|
138 |
TinyCjkTokenizer<iterator> t_;
|
|
139 |
|
|
140 |
/** Buffer for storing characters read with reader */
|
|
141 |
tiny::cl::ReaderBuffer<512> in_;
|
|
142 |
|
|
143 |
/**
|
|
144 |
* Reads utf16 from in_ buffer, turns it into unicode and
|
|
145 |
* then composes jamu alphabets into hangul syllables.
|
|
146 |
*/
|
|
147 |
iterator i_;
|
|
148 |
|
|
149 |
};
|
|
150 |
|
|
151 |
/** Korean query analyzer plus lowercase filter */
|
|
152 |
typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter>
|
|
153 |
KoreanQueryAnalyzer;
|
|
154 |
|
|
155 |
}

#endif /* KOREANANALYZER_H_ */