searchengine/oss/loc/analysis/inc/public/koreananalyzer.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 #ifndef KOREANANALYZER_H_
       
    19 #define KOREANANALYZER_H_
       
    20 
       
    21 #include "Clucene.h"
       
    22 
       
    23 #include "ngram.h"
       
    24 
       
    25 #include "tinyanalysis.h"
       
    26 #include "tinyutf16.h"
       
    27 #include "tinyunicode.h"
       
    28 
       
    29 namespace analysis 
       
    30 {
       
    31     // Forward declarations
       
    32 
       
    33 	/**
       
     34 	 * Special Korean analyzer designed so that Cpix can update the
       
     35 	 * result list as each individual Jamo character is
       
     36 	 * entered.
       
    37 	 * 
       
     38 	 * The analyzer first tries to convert the given character stream into 
       
     39 	 * a form where all Jamo characters are composed into Hangul form.
       
     40 	 * This means that character sequences of the form LV and LVT are 
       
     41 	 * eliminated and replaced with Hangul syllables (L is a leading Jamo
       
     42 	 * consonant, V is a vowel and T is a trailing consonant).  
       
    43 	 * 
       
     44 	 * The idea behind the analyzer is that it produces up to 3 alternative 
       
     45 	 * tokens for each Hangul syllable. All of these alternatives are returned
       
     46 	 * at the same position. Take some Hangul syllable H1
       
     47 	 * consisting of Jamo characters so that H1=J1J2J3. If H2=J1J2, then the first
       
     48 	 * returned token is H1, the second token is H2 and the third token is J1. This 
       
     49 	 * means that when the user enters H1, H2 or J1, the term H1 will be found. 
       
     50 	 * Also, if the user enters J1J2J3 or J1J2, the term will be found, because
       
     51 	 * J1J2J3 is automatically turned into H1 and J1J2 is turned into H2.
       
    52 	 * 
       
     53 	 * NOTE: This analyzer MUST NOT be used when searching, because 
       
     54 	 * CLuceneQueryParser will break when it encounters tokens with a zero 
       
     55 	 * position increment. Use KoreanQueryAnalyzer for searching material indexed
       
     56 	 * with this analyzer. 
       
    57 	 */
       
     58     class KoreanTokenizer : public lucene::analysis::Tokenizer {
       
     59    
       
     60         public:
       
     61             
       
     62 			/** Iterator type used to read characters from the in_ reader buffer */
       
     63             typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
       
     64 
       
     65             /** Decodes the UTF-16 input read through buffer_iterator into Unicode characters */
       
     66             typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
       
     67             
       
     68             /** Turns Jamo alphabets into Hangul syllables */
       
     69             typedef tiny::HangulIterator<utf16_iterator> iterator;
       
     70             /** Creates a tokenizer that reads its input from the given reader. */
       
     71             KoreanTokenizer(lucene::util::Reader* reader);
       
     72             /** Fetches the next token into 'token'. NOTE(review): the bool presumably tells whether a token was produced - confirm against the implementation. */
       
     73             virtual bool next(lucene::analysis::Token* token);
       
     74 
       
     75         private:
       
     76             
       
     77             /** Jamo form of the last consumed Hangul syllable */
       
     78             wchar_t jamu_[4];
       
     79             
       
     80             /** Begin and end offsets of the last consumed Hangul syllable */
       
     81             int begin_, end_;
       
     82             
       
     83             /**
       
     84              * The amount of Jamo characters left in buffer. 
       
     85              * If this is non-zero, a Hangul syllable is being processed. 
       
     86              */
       
     87             int state_;
       
     88             /* NOTE: members are initialized in declaration order (t_, in_, i_); do not reorder without checking the constructor. */
       
     89             /**
       
     90              * Tiny CJK tokenizer is used to construct 1-grams out of 
       
     91              * Chinese and Japanese characters and to turn Latin script
       
     92              * into terms.
       
     93              */
       
     94             TinyCjkTokenizer<iterator> t_;
       
     95             
       
     96             /** Buffer (ReaderBuffer<512>) for storing characters read with reader */
       
     97             tiny::cl::ReaderBuffer<512> in_;
       
     98             
       
     99             /** 
       
    100              * Reads UTF-16 from the in_ buffer, turns it into Unicode and 
       
    101              * then composes Jamo alphabets into Hangul syllables. 
       
    102              */
       
    103             iterator i_;
       
    104             
       
    105     
       
    106     };
       
   107     
       
    108     /** Korean tokenizer plus lowercase filter; meant for indexing, use KoreanQueryAnalyzer when searching */
       
    109     typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> 
       
    110         KoreanAnalyzer;
       
   111     
       
   112     /**
       
    113      * Turns Jamo characters into Hangul syllables and generates 1-grams for
       
   114      * all Chinese, Korean and Japanese text. 
       
   115      */
       
    116     class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
       
    117    
       
    118 		public:
       
    119     
       
    120 		    /** Iterator type used to read characters from the in_ reader buffer */
       
    121 			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
       
    122 
       
    123 			/** Decodes the UTF-16 input read through buffer_iterator into Unicode characters */
       
    124             typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
       
    125             
       
    126             /** Turns Jamo alphabets into Hangul syllables */
       
    127             typedef tiny::HangulIterator<utf16_iterator> iterator;
       
    128             
       
    129 		public:
       
    130        /** Creates a tokenizer that reads its input from the given reader. */
       
    131             KoreanQueryTokenizer( lucene::util::Reader* reader );
       
    132             /** Fetches the next token into 'token'. NOTE(review): the bool presumably tells whether a token was produced - confirm against the implementation. */
       
    133             virtual bool next( lucene::analysis::Token* token );            
       
    134     
       
    135 		private:
       
    136 
       
    137             /** Tiny CJK tokenizer that consumes the composed character stream; per the class description it generates the 1-grams */
       
    138             TinyCjkTokenizer<iterator> t_;
       
    139        
       
    140             /** Buffer for storing characters read with reader */
       
    141             tiny::cl::ReaderBuffer<512> in_;
       
    142             
       
    143             /** 
       
    144              * Reads UTF-16 from the in_ buffer, turns it into Unicode and 
       
    145              * then composes Jamo alphabets into Hangul syllables. 
       
    146              */
       
    147             iterator i_; 
       
    148            
       
    149     };
       
   150 
       
    151     /** Korean query tokenizer plus lowercase filter; use it for searching material indexed with KoreanAnalyzer */
       
    152     typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> 
       
    153         KoreanQueryAnalyzer;
       
   154 
       
   155 }
       
   156 
       
   157 #endif /* KOREANANALYZER_H_ */