|
1 /** |
|
2 * Copyright 2002-2004 The Apache Software Foundation |
|
3 * |
|
4 * Licensed under the Apache License, Version 2.0 (the "License"); |
|
5 * you may not use this file except in compliance with the License. |
|
6 * You may obtain a copy of the License at |
|
7 * |
|
8 * http://www.apache.org/licenses/LICENSE-2.0 |
|
9 * |
|
10 * Unless required by applicable law or agreed to in writing, software |
|
11 * distributed under the License is distributed on an "AS IS" BASIS, |
|
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
13 * See the License for the specific language governing permissions and |
|
14 * limitations under the License. |
|
15 */ |
|
16 #ifndef _lucene_analysis_cjk_cjkanalyzer_ |
|
17 #define _lucene_analysis_cjk_cjkanalyzer_ |
|
18 |
|
19 #if defined(_LUCENE_PRAGMA_ONCE) |
|
20 # pragma once |
|
21 #endif |
|
22 |
|
23 #include "CLucene/analysis/AnalysisHeader.h" |
|
24 |
|
25 CL_NS_DEF2(analysis,cjk) |
|
26 |
|
27 /** |
|
28 * CJKTokenizer was modified from StopTokenizer which does a decent job for |
|
29 * most European languages. It performs other token methods for double-byte |
|
30 * Characters: the token will return at each two charactors with overlap match.<br> |
|
31 * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it |
|
32 * also need filter filter zero length token ""<br> |
|
33 * for Digit: digit, '+', '#' will token as letter<br> |
|
34 * for more info on Asia language(Chinese Japanese Korean) text segmentation: |
|
35 * please search <a |
|
36 * href="http://www.google.com/search?q=word+chinese+segment">google</a> |
|
37 * |
|
38 * @author Che, Dong |
|
39 */ |
|
40 class CJKTokenizer: public CL_NS(analysis)::Tokenizer { |
|
41 private: |
|
42 /** word offset, used to imply which character(in ) is parsed */ |
|
43 int32_t offset; |
|
44 |
|
45 /** the index used only for ioBuffer */ |
|
46 int32_t bufferIndex; |
|
47 |
|
48 /** data length */ |
|
49 int32_t dataLen; |
|
50 |
|
51 /** |
|
52 * character buffer, store the characters which are used to compose <br> |
|
53 * the returned Token |
|
54 */ |
|
55 TCHAR buffer[LUCENE_MAX_WORD_LEN]; |
|
56 |
|
57 /** |
|
58 * I/O buffer, used to store the content of the input(one of the <br> |
|
59 * members of Tokenizer) |
|
60 */ |
|
61 const TCHAR* ioBuffer; |
|
62 |
|
63 /** word type: single=>ASCII double=>non-ASCII word=>default */ |
|
64 const TCHAR* tokenType; |
|
65 |
|
66 static const TCHAR* tokenTypeSingle; |
|
67 static const TCHAR* tokenTypeDouble; |
|
68 |
|
69 /** |
|
70 * tag: previous character is a cached double-byte character "C1C2C3C4" |
|
71 * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened) |
|
72 * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4" |
|
73 */ |
|
74 bool preIsTokened; |
|
75 |
|
76 |
|
77 bool ignoreSurrogates; |
|
78 |
|
79 public: |
|
80 /** |
|
81 * Construct a token stream processing the given input. |
|
82 * |
|
83 * @param in I/O reader |
|
84 */ |
|
85 CJKTokenizer(CL_NS(util)::Reader* in); |
|
86 |
|
87 /** |
|
88 * Returns the next token in the stream, or null at EOS. |
|
89 * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html |
|
90 * for detail. |
|
91 * |
|
92 * @return Token |
|
93 * |
|
94 * @throws java.io.IOException - throw IOException when read error <br> |
|
95 * hanppened in the InputStream |
|
96 * |
|
97 */ |
|
98 bool next(CL_NS(analysis)::Token* token); |
|
99 |
|
100 bool getIgnoreSurrogates(){ return ignoreSurrogates; }; |
|
101 void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; }; |
|
102 }; |
|
103 |
|
104 class CJKAnalyzer : public lucene::analysis::Analyzer { |
|
105 |
|
106 public: |
|
107 |
|
108 virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); |
|
109 |
|
110 }; |
|
111 |
|
112 CL_NS_END2 |
|
113 #endif |