searchengine/oss/loc/analysis/inc/public/koreananalyzer.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
#ifndef KOREANANALYZER_H_
hgs
parents:
diff changeset
    19
#define KOREANANALYZER_H_
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
#include "Clucene.h"
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
#include "ngram.h"
hgs
parents:
diff changeset
    24
hgs
parents:
diff changeset
    25
#include "tinyanalysis.h"
hgs
parents:
diff changeset
    26
#include "tinyutf16.h"
hgs
parents:
diff changeset
    27
#include "tinyunicode.h"
hgs
parents:
diff changeset
    28
hgs
parents:
diff changeset
    29
namespace analysis 
hgs
parents:
diff changeset
    30
{
hgs
parents:
diff changeset
    31
    // Forward declarations
hgs
parents:
diff changeset
    32
hgs
parents:
diff changeset
    33
	/**
hgs
parents:
diff changeset
    34
	 * Special Korean analyzer that is designed so that Cpix can
hgs
parents:
diff changeset
    35
	 * update the result list when each individual Jamo character is
hgs
parents:
diff changeset
    36
	 * entered.
hgs
parents:
diff changeset
    37
	 * 
hgs
parents:
diff changeset
    38
	 * The analyzer tries to first convert given character stream into 
hgs
parents:
diff changeset
    39
	 * a form where all Jamo characters are composed into Hangul syllables.
hgs
parents:
diff changeset
    40
	 * This means that character sequences of the form LV and LVT are
hgs
parents:
diff changeset
    41
	 * eliminated and replaced with Hangul syllables (L is a leading Jamo
hgs
parents:
diff changeset
    42
	 * consonant, V is a vowel and T is a trailing consonant).
hgs
parents:
diff changeset
    43
	 * 
hgs
parents:
diff changeset
    44
	 * The idea behind the analyzer is that it produces up to 3 alternative 
hgs
parents:
diff changeset
    45
	 * tokens for each Hangul syllable. All of these alternatives are returned
hgs
parents:
diff changeset
    46
	 * to be located at the same position. Let's take some Hangul syllable H1
hgs
parents:
diff changeset
    47
	 * consisting of Jamo characters so that H1=J1J2J3. If H2=J1J2, then the first
hgs
parents:
diff changeset
    48
	 * returned token is H1, second token is H2 and third token is J1. This 
hgs
parents:
diff changeset
    49
	 * means that when the user enters H1, H2 or J1, the term H1 will be found.
hgs
parents:
diff changeset
    50
	 * Also, if the user enters J1J2J3 or J1J2, the term will be found, because
hgs
parents:
diff changeset
    51
	 * J1J2J3 is automatically turned to H1 and J1J2 is turned to H2.
hgs
parents:
diff changeset
    52
	 * 
hgs
parents:
diff changeset
    53
	 * NOTE: This analyzer MUST NOT be used when searching, because
hgs
parents:
diff changeset
    54
	 * CLuceneQueryParser will break when it faces tokens with zero
hgs
parents:
diff changeset
    55
	 * position increment. Use KoreanQueryAnalyzer for searching material indexed
hgs
parents:
diff changeset
    56
	 * with this analyzer. 
hgs
parents:
diff changeset
    57
	 */
hgs
parents:
diff changeset
    58
    class KoreanTokenizer : public lucene::analysis::Tokenizer {
hgs
parents:
diff changeset
    59
   
hgs
parents:
diff changeset
    60
        public:
hgs
parents:
diff changeset
    61
            
hgs
parents:
diff changeset
    62
			/** Used to read from buffer */
hgs
parents:
diff changeset
    63
            typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
hgs
parents:
diff changeset
    64
hgs
parents:
diff changeset
    65
            /** Turns UTF-16 code units into Unicode code points */
hgs
parents:
diff changeset
    66
            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
hgs
parents:
diff changeset
    67
            
hgs
parents:
diff changeset
    68
            /** Composes Jamo letters into Hangul syllables */
hgs
parents:
diff changeset
    69
            typedef tiny::HangulIterator<utf16_iterator> iterator;
hgs
parents:
diff changeset
    70
            
hgs
parents:
diff changeset
    71
            KoreanTokenizer(lucene::util::Reader* reader);
hgs
parents:
diff changeset
    72
            
hgs
parents:
diff changeset
    73
            virtual bool next(lucene::analysis::Token* token);
hgs
parents:
diff changeset
    74
hgs
parents:
diff changeset
    75
        private:
hgs
parents:
diff changeset
    76
            
hgs
parents:
diff changeset
    77
            /** Jamo form of the last consumed Hangul syllable */
hgs
parents:
diff changeset
    78
            wchar_t jamu_[4];
hgs
parents:
diff changeset
    79
            
hgs
parents:
diff changeset
    80
            /** offsets of last consumed hangul syllable  */
hgs
parents:
diff changeset
    81
            int begin_, end_;
hgs
parents:
diff changeset
    82
            
hgs
parents:
diff changeset
    83
            /**
hgs
parents:
diff changeset
    84
             * The number of Jamo characters left in buffer.
hgs
parents:
diff changeset
    85
             * If this is non-zero, a Hangul syllable is being processed.
hgs
parents:
diff changeset
    86
             */
hgs
parents:
diff changeset
    87
            int state_;
hgs
parents:
diff changeset
    88
            
hgs
parents:
diff changeset
    89
            /**
hgs
parents:
diff changeset
    90
             * Tiny CJK tokenizer is used to construct 1-grams out of 
hgs
parents:
diff changeset
    91
             * Chinese and Japanese characters and to turn Latin script
hgs
parents:
diff changeset
    92
             * into terms.
hgs
parents:
diff changeset
    93
             */
hgs
parents:
diff changeset
    94
            TinyCjkTokenizer<iterator> t_;
hgs
parents:
diff changeset
    95
            
hgs
parents:
diff changeset
    96
            /** 512 byte buffer for storing characters read with reader */
hgs
parents:
diff changeset
    97
            tiny::cl::ReaderBuffer<512> in_;
hgs
parents:
diff changeset
    98
            
hgs
parents:
diff changeset
    99
            /** 
hgs
parents:
diff changeset
   100
             * Reads utf16 from in_ buffer, turns it into unicode and 
hgs
parents:
diff changeset
   101
             * then composes Jamo letters into Hangul syllables.
hgs
parents:
diff changeset
   102
             */
hgs
parents:
diff changeset
   103
            iterator i_;
hgs
parents:
diff changeset
   104
            
hgs
parents:
diff changeset
   105
    
hgs
parents:
diff changeset
   106
    };
hgs
parents:
diff changeset
   107
    
hgs
parents:
diff changeset
   108
    /** Korean tokenizer plus lowercase filter */
hgs
parents:
diff changeset
   109
    typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> 
hgs
parents:
diff changeset
   110
        KoreanAnalyzer;
hgs
parents:
diff changeset
   111
    
hgs
parents:
diff changeset
   112
    /**
hgs
parents:
diff changeset
   113
     * Turns Jamo characters into Hangul syllables and generates 1-grams for
hgs
parents:
diff changeset
   114
     * all Chinese, Korean and Japanese text. 
hgs
parents:
diff changeset
   115
     */
hgs
parents:
diff changeset
   116
    class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
hgs
parents:
diff changeset
   117
   
hgs
parents:
diff changeset
   118
		public:
hgs
parents:
diff changeset
   119
    
hgs
parents:
diff changeset
   120
		    /** Used to read from buffer */
hgs
parents:
diff changeset
   121
			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
hgs
parents:
diff changeset
   122
hgs
parents:
diff changeset
   123
			/** Turns UTF-16 code units into Unicode code points */
hgs
parents:
diff changeset
   124
            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
hgs
parents:
diff changeset
   125
            
hgs
parents:
diff changeset
   126
            /** Composes Jamo letters into Hangul syllables */
hgs
parents:
diff changeset
   127
            typedef tiny::HangulIterator<utf16_iterator> iterator;
hgs
parents:
diff changeset
   128
            
hgs
parents:
diff changeset
   129
		public:
hgs
parents:
diff changeset
   130
       
hgs
parents:
diff changeset
   131
            KoreanQueryTokenizer( lucene::util::Reader* reader );
hgs
parents:
diff changeset
   132
            
hgs
parents:
diff changeset
   133
            virtual bool next( lucene::analysis::Token* token );            
hgs
parents:
diff changeset
   134
    
hgs
parents:
diff changeset
   135
		private:
hgs
parents:
diff changeset
   136
hgs
parents:
diff changeset
   137
            /** Tiny CJK tokenizer used to construct 1-grams out of CJK characters */
hgs
parents:
diff changeset
   138
            TinyCjkTokenizer<iterator> t_;
hgs
parents:
diff changeset
   139
       
hgs
parents:
diff changeset
   140
            /** Buffer for storing characters read with reader */
hgs
parents:
diff changeset
   141
            tiny::cl::ReaderBuffer<512> in_;
hgs
parents:
diff changeset
   142
            
hgs
parents:
diff changeset
   143
            /** 
hgs
parents:
diff changeset
   144
             * Reads utf16 from in_ buffer, turns it into unicode and 
hgs
parents:
diff changeset
   145
             * then composes Jamo letters into Hangul syllables.
hgs
parents:
diff changeset
   146
             */
hgs
parents:
diff changeset
   147
            iterator i_; 
hgs
parents:
diff changeset
   148
           
hgs
parents:
diff changeset
   149
    };
hgs
parents:
diff changeset
   150
hgs
parents:
diff changeset
   151
    /** Korean query tokenizer plus lowercase filter */
hgs
parents:
diff changeset
   152
    typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> 
hgs
parents:
diff changeset
   153
        KoreanQueryAnalyzer;
hgs
parents:
diff changeset
   154
hgs
parents:
diff changeset
   155
}
hgs
parents:
diff changeset
   156
hgs
parents:
diff changeset
   157
#endif /* KOREANANALYZER_H_ */