|
1 /* |
|
2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 #include "CLucene/StdHeader.h" |
|
19 #include "CJKAnalyzer.h" |
|
20 |
|
21 CL_NS_DEF2(analysis,cjk) |
|
22 CL_NS_USE(analysis) |
|
23 CL_NS_USE(util) |
|
24 |
|
25 |
|
26 const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single"); |
|
27 const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double"); |
|
28 |
|
29 CJKTokenizer::CJKTokenizer(Reader* in): |
|
30 Tokenizer(in) |
|
31 { |
|
32 tokenType = Token::defaultType; |
|
33 offset = 0; |
|
34 bufferIndex = 0; |
|
35 dataLen = 0; |
|
36 preIsTokened = false; |
|
37 ignoreSurrogates = true; |
|
38 } |
|
39 |
|
40 bool CJKTokenizer::next(Token* token){ |
|
41 while (true) { |
|
42 /** how many character(s) has been stored in buffer */ |
|
43 int32_t length = 0; |
|
44 |
|
45 /** the position used to create Token */ |
|
46 int32_t start = offset; |
|
47 |
|
48 while (true) { |
|
49 /** current character */ |
|
50 clunichar c; |
|
51 int charlen = 1; |
|
52 |
|
53 offset++; |
|
54 |
|
55 if (bufferIndex >= dataLen) { |
|
56 dataLen = input->read(ioBuffer); |
|
57 bufferIndex = 0; |
|
58 } |
|
59 |
|
60 if (dataLen == -1) { |
|
61 if (length > 0) { |
|
62 if (preIsTokened == true) { |
|
63 length = 0; |
|
64 preIsTokened = false; |
|
65 } else { |
|
66 offset--; |
|
67 } |
|
68 break; |
|
69 } else { |
|
70 offset--; |
|
71 return false; |
|
72 } |
|
73 } else { |
|
74 //get current character |
|
75 c = ioBuffer[bufferIndex++]; |
|
76 } |
|
77 |
|
78 //to support surrogates, we'll need to convert the incoming utf16 into |
|
79 //ucs4(c variable). however, gunichartables doesn't seem to classify |
|
80 //any of the surrogates as alpha, so they are skipped anyway... |
|
81 //so for now we just convert to ucs4 so that we dont corrupt the input. |
|
82 if ( c >= 0xd800 || c <= 0xdfff ){ |
|
83 clunichar c2 = ioBuffer[bufferIndex]; |
|
84 if ( c2 >= 0xdc00 && c2 <= 0xdfff ){ |
|
85 bufferIndex++; |
|
86 offset++; |
|
87 charlen=2; |
|
88 |
|
89 c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) << 0)) + 0x00010000L; |
|
90 } |
|
91 } |
|
92 |
|
93 //if the current character is ASCII or Extend ASCII |
|
94 if ((c <= 0xFF) //is BASIC_LATIN |
|
95 || (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned... |
|
96 ) { |
|
97 if (c >= 0xFF00) { |
|
98 //todo: test this... only happens on platforms where char is signed, i think... |
|
99 /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */ |
|
100 c -= 0xFEE0; |
|
101 } |
|
102 |
|
103 // if the current character is a letter or "_" "+" "#" |
|
104 if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) { |
|
105 if (length == 0) { |
|
106 // "javaC1C2C3C4linux" <br> |
|
107 // ^--: the current character begin to token the ASCII |
|
108 // letter |
|
109 start = offset - 1; |
|
110 } else if (tokenType == tokenTypeDouble) { |
|
111 // "javaC1C2C3C4linux" <br> |
|
112 // ^--: the previous non-ASCII |
|
113 // : the current character |
|
114 offset-=charlen; |
|
115 bufferIndex-=charlen; |
|
116 tokenType = tokenTypeSingle; |
|
117 |
|
118 if (preIsTokened == true) { |
|
119 // there is only one non-ASCII has been stored |
|
120 length = 0; |
|
121 preIsTokened = false; |
|
122 |
|
123 break; |
|
124 } else { |
|
125 break; |
|
126 } |
|
127 } |
|
128 |
|
129 // store the LowerCase(c) in the buffer |
|
130 buffer[length++] = _totlower((TCHAR)c); |
|
131 tokenType = tokenTypeSingle; |
|
132 |
|
133 // break the procedure if buffer overflowed! |
|
134 if (length == LUCENE_MAX_WORD_LEN) { |
|
135 break; |
|
136 } |
|
137 } else if (length > 0) { |
|
138 if (preIsTokened == true) { |
|
139 length = 0; |
|
140 preIsTokened = false; |
|
141 } else { |
|
142 break; |
|
143 } |
|
144 } |
|
145 } else { |
|
146 // non-ASCII letter, eg."C1C2C3C4" |
|
147 if ( _istalpha(c) || (!ignoreSurrogates && c >= 0x10000) ) { |
|
148 if (length == 0) { |
|
149 start = offset - 1; |
|
150 |
|
151 if ( c < 0x00010000L ) |
|
152 buffer[length++] = (TCHAR)c; |
|
153 else{ |
|
154 clunichar ucs4 = c - 0x00010000L; |
|
155 buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800; |
|
156 buffer[length++] = (TCHAR)((ucs4 >> 0) & 0x3ff) | 0xdc00; |
|
157 } |
|
158 |
|
159 tokenType = tokenTypeDouble; |
|
160 } else { |
|
161 if (tokenType == tokenTypeSingle) { |
|
162 offset-=charlen; |
|
163 bufferIndex-=charlen; |
|
164 |
|
165 //return the previous ASCII characters |
|
166 break; |
|
167 } else { |
|
168 if ( c < 0x00010000L ) |
|
169 buffer[length++] = (TCHAR)c; |
|
170 else{ |
|
171 clunichar ucs4 = c - 0x00010000L; |
|
172 buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800; |
|
173 buffer[length++] = (TCHAR)((ucs4 >> 0) & 0x3ff) | 0xdc00; |
|
174 } |
|
175 tokenType = tokenTypeDouble; |
|
176 |
|
177 if (length >= 2) { |
|
178 offset-=charlen; |
|
179 bufferIndex-=charlen; |
|
180 preIsTokened = true; |
|
181 |
|
182 break; |
|
183 } |
|
184 } |
|
185 } |
|
186 } else if (length > 0) { |
|
187 if (preIsTokened == true) { |
|
188 // empty the buffer |
|
189 length = 0; |
|
190 preIsTokened = false; |
|
191 } else { |
|
192 break; |
|
193 } |
|
194 } |
|
195 } |
|
196 } |
|
197 if (length > 0) { |
|
198 buffer[length]='\0'; |
|
199 token->set(buffer,start, start+length, tokenType); |
|
200 return true; |
|
201 } else if (dataLen == -1) { |
|
202 offset--; |
|
203 return false; |
|
204 } |
|
205 } |
|
206 } |
|
207 |
|
208 TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) { |
|
209 return new CJKTokenizer(reader); |
|
210 } |
|
211 |
|
212 |
|
213 CL_NS_END2 |