searchengine/oss/loc/analysis/inc/public/ngram.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
     14 * Description: N-gram tokenizers and analyzers for Chinese, Japanese and Korean text.
       
    15 *
       
    16 */
       
    17 #ifndef NGRAM_H_
       
    18 #define NGRAM_H_
       
    19 
       
    20 #include "tinyanalysis.h"
       
    21 #include "tinyunicode.h"
       
    22 #include "clutil.h"
       
    23 
       
    24 namespace analysis {
       
    25 
       
     26 	/**
       
     27 	 * Returns true (as an int) if the character is a letter that is not CJK.
       
     28 	 * @param c the character to test; presumably a Unicode code point - confirm. */
       
     29     int IsNonCjk(int c);
       
    30     
       
     31     /**
       
     32      * TinyCjkTokenizer. Bundles the tiny analysis tokenizers that are
       
     33      * used to turn Chinese, Korean and Japanese text into n-grams of a
       
     34      * configurable size, while a letter tokenizer handles other kinds of
       
     35      * text (western, cyrillic, etc.).
       
     36      * The members are chained in the constructor: t_ is built from pair_, which is built from cjk_ and noncjk_.
       
     37      * @tparam I the iterator type that is used to read characters
       
     38      */
       
     39     template<typename I>
       
     40     struct TinyCjkTokenizer {
       
     41     
       
     42     	/** N-gram tokenizer for CJK text; constructed with the n-gram size and unicode::IsCjk */
       
     43         tiny::NGramTokenizer<I> cjk_;
       
     44     	/** Letter tokenizer for space separated languages; constructed with IsNonCjk */
       
     45         tiny::CustomTokenizer<I> noncjk_; 
       
     46     	/** Combines cjk_ with noncjk_. NOTE(review): built from both, so keep it declared after them */
       
     47         tiny::PairTokenizer<I> pair_;
       
     48         /** Built from pair_; per the original author, moves forward if tokenization fails */
       
     49         tiny::RelaxedTokenizer<I> t_;
       
     50         
       
     51         /** 
       
     52          * Constructs the tiny cjk tokenizer with the given n-gram size.
       
     53          * The initializer list follows the member declaration order (cjk_, noncjk_, pair_, t_).
       
     54          * @param ngramsize cjk text is treated with an n-gram tokenizer of this size
       
     55          */
       
     56         TinyCjkTokenizer(int ngramsize) 
       
     57         : cjk_(ngramsize, &unicode::IsCjk),
       
     58           noncjk_(&IsNonCjk),
       
     59           pair_(cjk_, noncjk_),
       
     60           t_(pair_) {}          
       
     61         
       
     62         /**
       
     63          * Consumes a token from the given iterator by delegating to t_. Returns
       
     64          * n-grams for cjk text and letter tokenized words for non-cjk text.
       
     65          * Always returns something unless EOS has been reached.
       
     66          * @param i iterator to read from (non-const reference; presumably advanced past the token - confirm) */
       
     67         inline tiny::Token<I> consume(I& i) {
       
     68             return t_.consume(i);
       
     69         }
       
     70     };
       
    71 
       
    72     
       
     73     /**
       
     74      * Lucene tokenizer that constructs n-grams of Chinese, Korean and
       
     75      * Japanese text and uses letter tokenization for other kinds of text.
       
     76      */
       
     77 	class CjkNGramTokenizer : public lucene::analysis::Tokenizer {
       
     78 	
       
     79 		public: 
       
     80 	
       
     81 			/** Iterator type of the reader buffer; reads from the buffer */
       
     82 			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
       
     83 
       
     84 			/** Iterator layered on buffer_iterator; turns utf16 into unicode */
       
     85 	        typedef tiny::Utf16Iterator<buffer_iterator> iterator;
       
     86 
       
     87 		public:
       
     88 			/** @param reader the text source; @param gramSize n-gram size (presumably forwarded to t_ - the definition is not in this header) */
       
     89 			CjkNGramTokenizer( lucene::util::Reader* reader, int gramSize );
       
     90 			/** Fills in the given token; the bool presumably tells whether a token was produced - confirm in the .cpp */
       
     91 			virtual bool next( lucene::analysis::Token* token );			
       
     92 	
       
     93 		private:
       
     94 
       
     95 			/** The cjk / non-cjk tokenizer, instantiated for this class's iterator type */
       
     96             TinyCjkTokenizer<iterator> t_;
       
     97 
       
     98             /** Buffer that buffer_iterator iterates over; presumably wraps the reader given to the constructor */
       
     99 			tiny::cl::ReaderBuffer<512> in_;
       
    100 
       
    101 			/** Reads utf16 from the buffer and transforms it to unicode. NOTE(review): declared after in_, presumably initialized from it - keep the order */
       
    102 			iterator i_; 
       
    103 			
       
    104 	};
       
   105 	
       
    106 	/**
       
    107 	 * The great difference of this class compared to CjkNGramTokenizer is
       
    108 	 * that it decomposes Hangul syllables into Hangul Jamo letters: its iterator
       
    109 	 * is a tiny::JamuIterator (the identifiers keep the original "Jamu" spelling).
       
    110 	 * This analyzer appeared to have bad performance in testing. 
       
    111 	 */
       
    112 	class JamuNGramTokenizer : public lucene::analysis::Tokenizer {
       
    113    
       
    114        public:
       
    115            /** Iterator type of the reader buffer */
       
    116            typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
       
    117            /** Utf16 iterator layered on buffer_iterator (same typedef as CjkNGramTokenizer::iterator) */
       
    118            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
       
    119            /** JamuIterator layered on utf16_iterator; this is the iterator the tokenizer reads from */
       
    120            typedef tiny::JamuIterator<utf16_iterator> iterator;
       
    121             
       
    122        public:
       
    123            /** @param reader the text source; @param gramSize n-gram size (presumably forwarded to t_ - the definition is not in this header) */
       
    124            JamuNGramTokenizer( lucene::util::Reader* reader, int gramSize );
       
    125            /** Fills in the given token; the bool presumably tells whether a token was produced - confirm in the .cpp */
       
    126            virtual bool next( lucene::analysis::Token* token );            
       
    127     
       
    128        private:
       
    129            /** The cjk / non-cjk tokenizer, instantiated for the JamuIterator based iterator type */
       
    130            TinyCjkTokenizer<iterator> t_;
       
    131            /** Buffer that buffer_iterator iterates over; presumably wraps the reader given to the constructor */
       
    132            tiny::cl::ReaderBuffer<512> in_;
       
    133            /** Iterator instance. NOTE(review): declared after in_, presumably initialized from it - keep the order */
       
    134            iterator i_; 
       
    135            
       
    136     };
       
   137 	
       
   138 	// Analyzers using the tokenizers
       
   139 	//   * Provided mainly for testing
       
   140 	//
       
   141 	
       
    142 	/** CjkNGramTokenizer plus lucene::analysis::LowerCaseFilter; the int argument presumably is the type of the tokenizer's gramSize parameter - confirm against TemplateAnalyzer1A1F */
       
    143     typedef TemplateAnalyzer1A1F<CjkNGramTokenizer, int, lucene::analysis::LowerCaseFilter> 
       
    144         CjkNGramAnalyzer;
       
   145     
       
    146 	/** JamuNGramTokenizer plus lucene::analysis::LowerCaseFilter; the int argument presumably is the type of the tokenizer's gramSize parameter - confirm against TemplateAnalyzer1A1F */
       
    147     typedef TemplateAnalyzer1A1F<JamuNGramTokenizer, int, lucene::analysis::LowerCaseFilter> 
       
    148         JamuNGramAnalyzer;
       
   149 
       
   150 }
       
   151 
       
   152 
       
   153 #endif /* NGRAM_H_ */