searchengine/oss/loc/analysis/inc/public/cjkanalyzer.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /**
       
     2  * Copyright 2002-2004 The Apache Software Foundation
       
     3  *
       
     4  * Licensed under the Apache License, Version 2.0 (the "License");
       
     5  * you may not use this file except in compliance with the License.
       
     6  * You may obtain a copy of the License at
       
     7  *
       
     8  *     http://www.apache.org/licenses/LICENSE-2.0
       
     9  *
       
    10  * Unless required by applicable law or agreed to in writing, software
       
    11  * distributed under the License is distributed on an "AS IS" BASIS,
       
    12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    13  * See the License for the specific language governing permissions and
       
    14  * limitations under the License.
       
    15  */
       
    16 #ifndef _lucene_analysis_cjk_cjkanalyzer_
       
    17 #define _lucene_analysis_cjk_cjkanalyzer_
       
    18 
       
    19 #if defined(_LUCENE_PRAGMA_ONCE)
       
    20 # pragma once
       
    21 #endif
       
    22 
       
    23 #include "CLucene/analysis/AnalysisHeader.h"
       
    24 
       
    25 CL_NS_DEF2(analysis,cjk)
       
    26 
       
    27 /**
       
    28  * CJKTokenizer was modified from StopTokenizer which does a decent job for
       
    29  * most European languages. It performs other token methods for double-byte
       
    30  * Characters: the token will return at each two charactors with overlap match.<br>
       
    31  * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
       
    32  * also need filter filter zero length token ""<br>
       
    33  * for Digit: digit, '+', '#' will token as letter<br>
       
    34  * for more info on Asia language(Chinese Japanese Korean) text segmentation:
       
    35  * please search  <a
       
    36  * href="http://www.google.com/search?q=word+chinese+segment">google</a>
       
    37  *
       
    38  * @author Che, Dong
       
    39  */
       
    40 class CJKTokenizer: public CL_NS(analysis)::Tokenizer {
       
    41 private:
       
    42 	/** word offset, used to imply which character(in ) is parsed */
       
    43     int32_t offset;
       
    44 
       
    45     /** the index used only for ioBuffer */
       
    46     int32_t bufferIndex;
       
    47 
       
    48     /** data length */
       
    49     int32_t dataLen;
       
    50 
       
    51     /**
       
    52      * character buffer, store the characters which are used to compose <br>
       
    53      * the returned Token
       
    54      */
       
    55     TCHAR buffer[LUCENE_MAX_WORD_LEN];
       
    56 
       
    57     /**
       
    58      * I/O buffer, used to store the content of the input(one of the <br>
       
    59      * members of Tokenizer)
       
    60      */
       
    61     const TCHAR* ioBuffer;
       
    62 
       
    63     /** word type: single=>ASCII  double=>non-ASCII word=>default */
       
    64     const TCHAR* tokenType;
       
    65 
       
    66 	static const TCHAR* tokenTypeSingle;
       
    67 	static const TCHAR* tokenTypeDouble;
       
    68 
       
    69     /**
       
    70      * tag: previous character is a cached double-byte character  "C1C2C3C4"
       
    71      * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
       
    72      * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
       
    73      */
       
    74     bool preIsTokened;
       
    75 
       
    76 
       
    77 	bool ignoreSurrogates;
       
    78 
       
    79 public:
       
    80     /**
       
    81      * Construct a token stream processing the given input.
       
    82      *
       
    83      * @param in I/O reader
       
    84      */
       
    85 	CJKTokenizer(CL_NS(util)::Reader* in);
       
    86 
       
    87 	/**
       
    88      * Returns the next token in the stream, or null at EOS.
       
    89      * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
       
    90      * for detail.
       
    91      *
       
    92      * @return Token
       
    93      *
       
    94      * @throws java.io.IOException - throw IOException when read error <br>
       
    95      *         hanppened in the InputStream
       
    96      *
       
    97      */
       
    98 	bool next(CL_NS(analysis)::Token* token);
       
    99 
       
   100 	bool getIgnoreSurrogates(){ return ignoreSurrogates; };
       
   101 	void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; };
       
   102 };
       
   103 
       
   104 class CJKAnalyzer : public lucene::analysis::Analyzer {
       
   105 
       
   106 public: 
       
   107 
       
   108 	virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
       
   109 	
       
   110 };
       
   111 
       
   112 CL_NS_END2
       
   113 #endif