24
|
1 |
/**
|
|
2 |
* Copyright 2002-2004 The Apache Software Foundation
|
|
3 |
*
|
|
4 |
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
5 |
* you may not use this file except in compliance with the License.
|
|
6 |
* You may obtain a copy of the License at
|
|
7 |
*
|
|
8 |
* http://www.apache.org/licenses/LICENSE-2.0
|
|
9 |
*
|
|
10 |
* Unless required by applicable law or agreed to in writing, software
|
|
11 |
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
12 |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13 |
* See the License for the specific language governing permissions and
|
|
14 |
* limitations under the License.
|
|
15 |
*/
|
|
16 |
#ifndef _lucene_analysis_cjk_cjkanalyzer_
|
|
17 |
#define _lucene_analysis_cjk_cjkanalyzer_
|
|
18 |
|
|
19 |
#if defined(_LUCENE_PRAGMA_ONCE)
|
|
20 |
# pragma once
|
|
21 |
#endif
|
|
22 |
|
|
23 |
#include "CLucene/analysis/AnalysisHeader.h"
|
|
24 |
|
|
25 |
CL_NS_DEF2(analysis,cjk)
|
|
26 |
|
|
27 |
/**
|
|
28 |
* CJKTokenizer was modified from StopTokenizer which does a decent job for
|
|
29 |
* most European languages. It performs other token methods for double-byte
|
|
30 |
* characters: a token is returned for every two characters, with overlapping matches.<br>
|
|
31 |
* Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4"; it
|
|
32 |
* also needs to filter out the zero-length token ""<br>
|
|
33 |
* For digits: a digit, '+' and '#' are tokenized as letters<br>
|
|
34 |
* For more info on Asian language (Chinese, Japanese, Korean) text segmentation,
|
|
35 |
* please search <a
|
|
36 |
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
|
|
37 |
*
|
|
38 |
* @author Che, Dong
|
|
39 |
*/
|
|
40 |
class CJKTokenizer: public CL_NS(analysis)::Tokenizer {
private:
    /** Word offset; indicates which character is currently being parsed. */
    int32_t offset;

    /** Index used only for ioBuffer. */
    int32_t bufferIndex;

    /** Data length. */
    int32_t dataLen;

    /**
     * Character buffer; stores the characters that are used to compose
     * the returned Token.
     */
    TCHAR buffer[LUCENE_MAX_WORD_LEN];

    /**
     * I/O buffer, used to store the content of the input (one of the
     * members of Tokenizer).
     */
    const TCHAR* ioBuffer;

    /** Word type: single => ASCII, double => non-ASCII, word => default. */
    const TCHAR* tokenType;

    static const TCHAR* tokenTypeSingle;
    static const TCHAR* tokenTypeDouble;

    /**
     * Tag: the previous character is a cached double-byte character.
     * "C1C2C3C4" ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
     * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     */
    bool preIsTokened;

    /**
     * Flag read and written through getIgnoreSurrogates() /
     * setIgnoreSurrogates(). Its effect on tokenization is defined in the
     * implementation file, not in this header.
     */
    bool ignoreSurrogates;

public:
    /**
     * Construct a token stream processing the given input.
     *
     * @param in I/O reader
     */
    CJKTokenizer(CL_NS(util)::Reader* in);

    /**
     * Produces the next token in the stream.
     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
     * for detail.
     *
     * @param token Token object passed in by the caller; presumably filled
     *              with the next token's data -- confirm in the implementation.
     * @return bool; presumably true when a token was produced and false at
     *         end of stream (the original Java version returned null at
     *         EOS) -- confirm in the implementation.
     *
     * NOTE(review): the original documentation (inherited from the Java
     * source) mentions an IOException when a read error happens in the
     * input stream; how read errors surface in this C++ port is not
     * visible from this header.
     */
    bool next(CL_NS(analysis)::Token* token);

    /** Returns the current value of the ignoreSurrogates flag. */
    bool getIgnoreSurrogates() const { return ignoreSurrogates; }

    /** Sets the ignoreSurrogates flag to the given value. */
    void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; }
};
|
|
103 |
|
|
104 |
class CJKAnalyzer : public lucene::analysis::Analyzer {
|
|
105 |
|
|
106 |
public:
|
|
107 |
|
|
108 |
virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
|
|
109 |
|
|
110 |
};
|
|
111 |
|
|
112 |
CL_NS_END2
|
|
113 |
#endif
|