diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/inc/public/cjkanalyzer.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/cjkanalyzer.h	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,113 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _lucene_analysis_cjk_cjkanalyzer_
+#define _lucene_analysis_cjk_cjkanalyzer_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF2(analysis,cjk)
+
+/**
+ * CJKTokenizer was modified from StopTokenizer, which does a decent job for
+ * most European languages. It performs different token handling for double-byte
+ * characters: a token is returned for every two characters, with overlapping matches.
+ * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4";
+ * the resulting zero-length token "" also needs to be filtered out.
+ * for digits: a digit, '+' or '#' is tokenized as a letter
+ * for more info on Asian-language (Chinese, Japanese, Korean) text segmentation,
+ * please search Google
+ *
+ * @author Che, Dong
+ */
+class CJKTokenizer: public CL_NS(analysis)::Tokenizer {
+private:
+    /** word offset, used to indicate which character of the input is being parsed */
+    int32_t offset;
+
+    /** the index used only for ioBuffer */
+    int32_t bufferIndex;
+
+    /** data length */
+    int32_t dataLen;
+
+    /**
+     * character buffer, stores the characters that are used to compose
+     * the returned Token
+     */
+    TCHAR buffer[LUCENE_MAX_WORD_LEN];
+
+    /**
+     * I/O buffer, used to store the content of the input (one of the
+     * members of Tokenizer)
+     */
+    const TCHAR* ioBuffer;
+
+    /** word type: single=>ASCII, double=>non-ASCII, word=>default */
+    const TCHAR* tokenType;
+
+    static const TCHAR* tokenTypeSingle;
+    static const TCHAR* tokenTypeDouble;
+
+    /**
+     * flag: the previous character is a cached double-byte character. "C1C2C3C4"
+     * ----(set C1 isTokened) C1C2 "C2C3C4" ----(set C2 isTokened)
+     * C1C2 C2C3 "C3C4" ----(set C3 isTokened) "C1C2 C2C3 C3C4"
+     */
+    bool preIsTokened;
+
+
+    bool ignoreSurrogates;
+
+public:
+    /**
+     * Construct a token stream processing the given input.
+     *
+     * @param in I/O reader
+     */
+    CJKTokenizer(CL_NS(util)::Reader* in);
+
+    /**
+     * Puts the next token in the stream into the given Token; returns false at end of stream.
+     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
+     * for details.
+     *
+     * @return true if a token was produced, false at end of stream
+     *
+     * @throws IOException when a read error
+     * happens in the input stream
+     *
+     */
+    bool next(CL_NS(analysis)::Token* token);
+
+    bool getIgnoreSurrogates(){ return ignoreSurrogates; };
+    void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; };
+};
+
+class CJKAnalyzer : public lucene::analysis::Analyzer {
+
+public:
+
+    virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+
+};
+
+CL_NS_END2
+#endif
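For reference, the overlapping-bigram behaviour documented in the CJKTokenizer class comment ("java C1C2C3C4" -> "java" "C1C2" "C2C3" "C3C4") can be illustrated with a small standalone sketch. This is not the CLucene implementation and uses none of its types; it only mimics the documented rule (ASCII runs become single tokens, non-ASCII runs become overlapping two-character tokens) and omits the digit/'+'/'#' handling and the surrogate handling controlled by setIgnoreSurrogates.

// Standalone sketch (not CLucene code): mimics the documented CJK
// overlapping-bigram segmentation of CJKTokenizer.
#include <cwctype>
#include <iostream>
#include <string>
#include <vector>

static bool isAsciiChar(wchar_t c) { return c < 0x80; }

static std::vector<std::wstring> segmentCJK(const std::wstring& text) {
    std::vector<std::wstring> tokens;
    size_t i = 0;
    while (i < text.size()) {
        if (iswspace(static_cast<wint_t>(text[i]))) { ++i; continue; }
        if (isAsciiChar(text[i])) {
            // Emit a whole ASCII run ("java") as one token.
            size_t start = i;
            while (i < text.size() && isAsciiChar(text[i]) &&
                   !iswspace(static_cast<wint_t>(text[i]))) ++i;
            tokens.push_back(text.substr(start, i - start));
        } else {
            // Emit overlapping bigrams for a non-ASCII run:
            // C1C2C3C4 -> C1C2, C2C3, C3C4.
            size_t start = i;
            while (i < text.size() && !isAsciiChar(text[i])) ++i;
            size_t len = i - start;
            if (len == 1) {
                tokens.push_back(text.substr(start, 1));
            } else {
                for (size_t k = 0; k + 1 < len; ++k)
                    tokens.push_back(text.substr(start + k, 2));
            }
        }
    }
    return tokens;
}

int main() {
    // "java" plus four CJK characters -> 4 tokens:
    // "java" and three overlapping bigrams.
    std::vector<std::wstring> tokens = segmentCJK(L"java \u4E2D\u6587\u5206\u8BCD");
    std::wcout << tokens.size() << L" tokens" << std::endl;  // prints: 4 tokens
    return 0;
}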