searchengine/oss/loc/analysis/inc/public/cjkanalyzer.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/**
hgs
parents:
diff changeset
     2
 * Copyright 2002-2004 The Apache Software Foundation
hgs
parents:
diff changeset
     3
 *
hgs
parents:
diff changeset
     4
 * Licensed under the Apache License, Version 2.0 (the "License");
hgs
parents:
diff changeset
     5
 * you may not use this file except in compliance with the License.
hgs
parents:
diff changeset
     6
 * You may obtain a copy of the License at
hgs
parents:
diff changeset
     7
 *
hgs
parents:
diff changeset
     8
 *     http://www.apache.org/licenses/LICENSE-2.0
hgs
parents:
diff changeset
     9
 *
hgs
parents:
diff changeset
    10
 * Unless required by applicable law or agreed to in writing, software
hgs
parents:
diff changeset
    11
 * distributed under the License is distributed on an "AS IS" BASIS,
hgs
parents:
diff changeset
    12
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
hgs
parents:
diff changeset
    13
 * See the License for the specific language governing permissions and
hgs
parents:
diff changeset
    14
 * limitations under the License.
hgs
parents:
diff changeset
    15
 */
hgs
parents:
diff changeset
    16
#ifndef _lucene_analysis_cjk_cjkanalyzer_
hgs
parents:
diff changeset
    17
#define _lucene_analysis_cjk_cjkanalyzer_
hgs
parents:
diff changeset
    18
hgs
parents:
diff changeset
    19
#if defined(_LUCENE_PRAGMA_ONCE)
hgs
parents:
diff changeset
    20
# pragma once
hgs
parents:
diff changeset
    21
#endif
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
#include "CLucene/analysis/AnalysisHeader.h"
hgs
parents:
diff changeset
    24
hgs
parents:
diff changeset
    25
CL_NS_DEF2(analysis,cjk)
hgs
parents:
diff changeset
    26
hgs
parents:
diff changeset
    27
/**
hgs
parents:
diff changeset
    28
 * CJKTokenizer was modified from StopTokenizer which does a decent job for
hgs
parents:
diff changeset
    29
 * most European languages. It performs other token methods for double-byte
hgs
parents:
diff changeset
    30
 * Characters: a token is returned for every two characters, with overlapping matches.<br>
hgs
parents:
diff changeset
    31
 * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4"; it
hgs
parents:
diff changeset
    32
 * also needs to filter out the zero-length token ""<br>
hgs
parents:
diff changeset
    33
 * For digits: a digit, '+' and '#' are tokenized as letters.<br>
hgs
parents:
diff changeset
    34
 * for more info on Asia language(Chinese Japanese Korean) text segmentation:
hgs
parents:
diff changeset
    35
 * please search  <a
hgs
parents:
diff changeset
    36
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
hgs
parents:
diff changeset
    37
 *
hgs
parents:
diff changeset
    38
 * @author Che, Dong
hgs
parents:
diff changeset
    39
 */
hgs
parents:
diff changeset
    40
class CJKTokenizer: public CL_NS(analysis)::Tokenizer {
private:
    /**
     * Word offset.
     * NOTE(review): the original comment ("used to imply which character(in )
     * is parsed") lost a word; presumably this is the position, within the
     * input, of the character currently being parsed -- confirm against the
     * implementation file.
     */
    int32_t offset;

    /** Index used only for ioBuffer. */
    int32_t bufferIndex;

    /**
     * Data length.
     * NOTE(review): presumably the number of valid characters currently
     * available in ioBuffer -- confirm against the implementation file.
     */
    int32_t dataLen;

    /**
     * Character buffer storing the characters that are used to compose the
     * returned Token. Its capacity is LUCENE_MAX_WORD_LEN characters.
     */
    TCHAR buffer[LUCENE_MAX_WORD_LEN];

    /**
     * I/O buffer, used to store the content of the input (the input being
     * one of the members of Tokenizer).
     */
    const TCHAR* ioBuffer;

    /** Word type: single => ASCII, double => non-ASCII, word => default. */
    const TCHAR* tokenType;

    /**
     * Token type constants, defined outside this header.
     * NOTE(review): presumably the "single" and "double" values that
     * tokenType can point at -- confirm against the implementation file.
     */
    static const TCHAR* tokenTypeSingle;
    static const TCHAR* tokenTypeDouble;

    /**
     * Tag: the previous character is a cached double-byte character.
     * Illustration for the input "C1C2C3C4":
     *   ----(set the C1 isTokened) C1C2 "C2C3C4"
     *   ----(set the C2 isTokened) C1C2 C2C3 "C3C4"
     *   ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     */
    bool preIsTokened;

    /**
     * Flag exposed through getIgnoreSurrogates() / setIgnoreSurrogates().
     * NOTE(review): presumably tells next() to skip UTF-16 surrogate code
     * units; its initial value is set outside this header -- confirm.
     */
    bool ignoreSurrogates;

public:
    /**
     * Construct a token stream processing the given input.
     *
     * @param in I/O reader
     */
    CJKTokenizer(CL_NS(util)::Reader* in);

    /**
     * Advances to the next token in the stream.
     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
     * for detail.
     *
     * @param token Token object supplied by the caller.
     *        NOTE(review): presumably filled in with the next token's data
     *        -- confirm against the implementation file.
     * @return NOTE(review): presumably true when a token was produced and
     *         false at end of stream (the original Java-derived comment said
     *         "null at EOS", which cannot apply to a bool) -- confirm.
     *
     * NOTE(review): the original comment documented java.io.IOException, a
     * leftover from the Java version; how read errors on the input are
     * reported is not visible in this header.
     */
    bool next(CL_NS(analysis)::Token* token);

    /** @return the current value of the ignoreSurrogates flag. */
    bool getIgnoreSurrogates() const { return ignoreSurrogates; }

    /**
     * Sets the ignoreSurrogates flag.
     *
     * @param ignoreSurrogates new value of the flag
     */
    void setIgnoreSurrogates(bool ignoreSurrogates) { this->ignoreSurrogates = ignoreSurrogates; }
};
hgs
parents:
diff changeset
   103
hgs
parents:
diff changeset
   104
class CJKAnalyzer : public lucene::analysis::Analyzer {
hgs
parents:
diff changeset
   105
hgs
parents:
diff changeset
   106
public: 
hgs
parents:
diff changeset
   107
hgs
parents:
diff changeset
   108
	virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
hgs
parents:
diff changeset
   109
	
hgs
parents:
diff changeset
   110
};
hgs
parents:
diff changeset
   111
hgs
parents:
diff changeset
   112
CL_NS_END2
hgs
parents:
diff changeset
   113
#endif