searchengine/oss/loc/analysis/inc/public/ngram.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
#ifndef NGRAM_H_
hgs
parents:
diff changeset
    18
#define NGRAM_H_
hgs
parents:
diff changeset
    19
hgs
parents:
diff changeset
    20
#include "tinyanalysis.h"
hgs
parents:
diff changeset
    21
#include "tinyunicode.h"
hgs
parents:
diff changeset
    22
#include "clutil.h"
hgs
parents:
diff changeset
    23
hgs
parents:
diff changeset
    24
namespace analysis {
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
	/**
	 * Tests whether a character is a non-cjk letter.
	 *
	 * The result is an int that is used as a truth value; the function is
	 * handed to tiny::CustomTokenizer as a character predicate by
	 * TinyCjkTokenizer.
	 *
	 * @param c the character to classify (presumably a decoded unicode
	 *          code point - confirm against the definition)
	 * @return non-zero (true), if the character is a non-cjk letter;
	 *         presumably zero otherwise
	 */
	int IsNonCjk(int c);
hgs
parents:
diff changeset
    30
    
hgs
parents:
diff changeset
    31
    /**
hgs
parents:
diff changeset
    32
     * TinyCjkTokenizer. Contains tiny analysis classes, that are 
hgs
parents:
diff changeset
    33
     * used to turn Chinese, Korean and Japanese into 1-grams, while
hgs
parents:
diff changeset
    34
     * using letter analyzer for other kinds of text (western, cyrillic,
hgs
parents:
diff changeset
    35
     * etc.)   
hgs
parents:
diff changeset
    36
     * 
hgs
parents:
diff changeset
    37
     * @tparam I the iterator, that is used to read characters
hgs
parents:
diff changeset
    38
     */
hgs
parents:
diff changeset
    39
    template<typename I>
hgs
parents:
diff changeset
    40
    struct TinyCjkTokenizer {
hgs
parents:
diff changeset
    41
    
hgs
parents:
diff changeset
    42
    	/** Deals with cjk */
hgs
parents:
diff changeset
    43
        tiny::NGramTokenizer<I> cjk_;
hgs
parents:
diff changeset
    44
    	/** Letter tokenizer for space separated language */
hgs
parents:
diff changeset
    45
        tiny::CustomTokenizer<I> noncjk_; 
hgs
parents:
diff changeset
    46
    	/** Combines cjk with noncjk */
hgs
parents:
diff changeset
    47
        tiny::PairTokenizer<I> pair_;
hgs
parents:
diff changeset
    48
        /** Moves forward, if tokenization fails */
hgs
parents:
diff changeset
    49
        tiny::RelaxedTokenizer<I> t_;
hgs
parents:
diff changeset
    50
        
hgs
parents:
diff changeset
    51
        /** 
hgs
parents:
diff changeset
    52
         * Constructs the tiny cjk tokenizer with given ngram size
hgs
parents:
diff changeset
    53
         *
hgs
parents:
diff changeset
    54
         * @param ngramsize cjk text is treated with n-gram analyzer of this size   
hgs
parents:
diff changeset
    55
         */
hgs
parents:
diff changeset
    56
        TinyCjkTokenizer(int ngramsize) 
hgs
parents:
diff changeset
    57
        : cjk_(ngramsize, &unicode::IsCjk),
hgs
parents:
diff changeset
    58
          noncjk_(&IsNonCjk),
hgs
parents:
diff changeset
    59
          pair_(cjk_, noncjk_),
hgs
parents:
diff changeset
    60
          t_(pair_) {}          
hgs
parents:
diff changeset
    61
        
hgs
parents:
diff changeset
    62
        /**
hgs
parents:
diff changeset
    63
         * Consumes a token from given iterator. Returns n-grams
hgs
parents:
diff changeset
    64
         * for cjk text, letter tokenized words for non-cjk text. 
hgs
parents:
diff changeset
    65
         * Always returns something unless EOS has been reached.
hgs
parents:
diff changeset
    66
         */
hgs
parents:
diff changeset
    67
        inline tiny::Token<I> consume(I& i) {
hgs
parents:
diff changeset
    68
            return t_.consume(i);
hgs
parents:
diff changeset
    69
        }
hgs
parents:
diff changeset
    70
    };
hgs
parents:
diff changeset
    71
hgs
parents:
diff changeset
    72
    
hgs
parents:
diff changeset
    73
    /**
hgs
parents:
diff changeset
    74
     * Constructs n-grams of Chinese, Korean and Japanese text. Uses
hgs
parents:
diff changeset
    75
     * letter tokenization for other kinds of texts. 
hgs
parents:
diff changeset
    76
     */
hgs
parents:
diff changeset
    77
	class CjkNGramTokenizer : public lucene::analysis::Tokenizer {
hgs
parents:
diff changeset
    78
	
hgs
parents:
diff changeset
    79
		public: 
hgs
parents:
diff changeset
    80
	
hgs
parents:
diff changeset
    81
			/** Reads from buffer */
hgs
parents:
diff changeset
    82
			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
hgs
parents:
diff changeset
    83
hgs
parents:
diff changeset
    84
			/** Turns utf16 to unicode */
hgs
parents:
diff changeset
    85
	        typedef tiny::Utf16Iterator<buffer_iterator> iterator;
hgs
parents:
diff changeset
    86
hgs
parents:
diff changeset
    87
		public:
hgs
parents:
diff changeset
    88
	
hgs
parents:
diff changeset
    89
			CjkNGramTokenizer( lucene::util::Reader* reader, int gramSize );
hgs
parents:
diff changeset
    90
			
hgs
parents:
diff changeset
    91
			virtual bool next( lucene::analysis::Token* token );			
hgs
parents:
diff changeset
    92
	
hgs
parents:
diff changeset
    93
		private:
hgs
parents:
diff changeset
    94
hgs
parents:
diff changeset
    95
			/** The tokenizer */
hgs
parents:
diff changeset
    96
            TinyCjkTokenizer<iterator> t_;
hgs
parents:
diff changeset
    97
hgs
parents:
diff changeset
    98
            /** Buffer */
hgs
parents:
diff changeset
    99
			tiny::cl::ReaderBuffer<512> in_;
hgs
parents:
diff changeset
   100
hgs
parents:
diff changeset
   101
			/** Reads utf16 from buffer and transforms it to unicode*/
hgs
parents:
diff changeset
   102
			iterator i_; 
hgs
parents:
diff changeset
   103
			
hgs
parents:
diff changeset
   104
	};
hgs
parents:
diff changeset
   105
	
hgs
parents:
diff changeset
   106
	/**
hgs
parents:
diff changeset
   107
	 * The great difference of this class compared to CJK ngram, 
hgs
parents:
diff changeset
   108
	 * that it decomposes Hangul syllables into Hangul Jamu letters.
hgs
parents:
diff changeset
   109
	 * 
hgs
parents:
diff changeset
   110
	 * This analyzer appeared to have bad performance in testing. 
hgs
parents:
diff changeset
   111
	 */
hgs
parents:
diff changeset
   112
	class JamuNGramTokenizer : public lucene::analysis::Tokenizer {
hgs
parents:
diff changeset
   113
   
hgs
parents:
diff changeset
   114
       public:
hgs
parents:
diff changeset
   115
    
hgs
parents:
diff changeset
   116
           typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
hgs
parents:
diff changeset
   117
hgs
parents:
diff changeset
   118
           typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
hgs
parents:
diff changeset
   119
            
hgs
parents:
diff changeset
   120
           typedef tiny::JamuIterator<utf16_iterator> iterator;
hgs
parents:
diff changeset
   121
            
hgs
parents:
diff changeset
   122
       public:
hgs
parents:
diff changeset
   123
    
hgs
parents:
diff changeset
   124
           JamuNGramTokenizer( lucene::util::Reader* reader, int gramSize );
hgs
parents:
diff changeset
   125
            
hgs
parents:
diff changeset
   126
           virtual bool next( lucene::analysis::Token* token );            
hgs
parents:
diff changeset
   127
    
hgs
parents:
diff changeset
   128
       private:
hgs
parents:
diff changeset
   129
            
hgs
parents:
diff changeset
   130
           TinyCjkTokenizer<iterator> t_;
hgs
parents:
diff changeset
   131
       
hgs
parents:
diff changeset
   132
           tiny::cl::ReaderBuffer<512> in_;
hgs
parents:
diff changeset
   133
            
hgs
parents:
diff changeset
   134
           iterator i_; 
hgs
parents:
diff changeset
   135
           
hgs
parents:
diff changeset
   136
    };
hgs
parents:
diff changeset
   137
	
hgs
parents:
diff changeset
   138
	// Analyzers using the tokenizers
hgs
parents:
diff changeset
   139
	//   * Provided mainly for testing
hgs
parents:
diff changeset
   140
	//
hgs
parents:
diff changeset
   141
	
hgs
parents:
diff changeset
   142
	/** CjkNGramTokenizer plus lowercase filter */
hgs
parents:
diff changeset
   143
    typedef TemplateAnalyzer1A1F<CjkNGramTokenizer, int, lucene::analysis::LowerCaseFilter> 
hgs
parents:
diff changeset
   144
        CjkNGramAnalyzer;
hgs
parents:
diff changeset
   145
    
hgs
parents:
diff changeset
   146
	/** JamuNGramTokenizer plus lowercase filter */
hgs
parents:
diff changeset
   147
    typedef TemplateAnalyzer1A1F<JamuNGramTokenizer, int, lucene::analysis::LowerCaseFilter> 
hgs
parents:
diff changeset
   148
        JamuNGramAnalyzer;
hgs
parents:
diff changeset
   149
hgs
parents:
diff changeset
   150
}
hgs
parents:
diff changeset
   151
hgs
parents:
diff changeset
   152
hgs
parents:
diff changeset
   153
#endif /* NGRAM_H_ */