searchengine/oss/loc/analysis/inc/public/koreananalyzer.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#ifndef KOREANANALYZER_H_
#define KOREANANALYZER_H_

#include "Clucene.h"

#include "ngram.h"

#include "tinyanalysis.h"
#include "tinyutf16.h"
#include "tinyunicode.h"

namespace analysis 
{
    // Forward declarations

	/**
	 * Special Korean analyzer that is designed so, that Cpix can 
	 * update the result list, when each individual Jamu character is
	 * entered.
	 * 
	 * The analyzer tries to first convert given character stream into 
	 * a form, where all Jamu characters are composed into Hangul form.
	 * This means, that character sequences of form LV and LVT are 
	 * eliminated and replaced with hangul syllables (L is leading Jamu
	 * consonant, V is for vocal and T is for trailing consonant).  
	 * 
	 * The idea behind the analyzer is that it produces up to 3 alternative 
	 * tokens for each hangul syllabic. All of these alternatives are returned
	 * to be located at the same position. Let's have some Hangul syllabic H1
	 * consisting of Jamu characters so that H1=J1J2J3. If H2=J1J2, then first
	 * returned token is H1, second token is H2 and third token is J1. This 
	 * means, that when user enters H1, H2 or J1, the term H1 will be found. 
	 * Also, if user enters J1J2J3 or J1J2, term will be found, because
	 * J1J2J3 is automatically turned to H1 and J1J2 is turned to H2.
	 * 
	 * NOTE: This analyzer MUST NOT be used, when searching, because 
	 * CLuceneQueryParser will break, when it faces tokens with zero 
	 * increment. Use KoreanQueryAnalyzer for searching material indexed
	 * with this analyzer. 
	 */
    class KoreanTokenizer : public lucene::analysis::Tokenizer {
   
        public:
            
			/** Used to read from buffer */
            typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;

            /** Turns utf16 code points into unicode */
            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
            
            /** Turns Jamu alphabets into Hangul syllables */
            typedef tiny::HangulIterator<utf16_iterator> iterator;
            
            KoreanTokenizer(lucene::util::Reader* reader);
            
            virtual bool next(lucene::analysis::Token* token);

        private:
            
            /** Jamu form of last consumed hangul syllable */
            wchar_t jamu_[4];
            
            /** offsets of last consumed hangul syllable  */
            int begin_, end_;
            
            /**
             * The amount of jamu characters left in buffer. 
             * If this is non-zero, hangul syllable is being processed. 
             */
            int state_;
            
            /**
             * Tiny CJK tokenizer is used to construct 1-grams out of 
             * chinese and japanese characters and to turn latin script
             * into terms.
             */
            TinyCjkTokenizer<iterator> t_;
            
            /** 512 byte buffer for storing characters read with reader */
            tiny::cl::ReaderBuffer<512> in_;
            
            /** 
             * Reads utf16 from in_ buffer, turns it into unicode and 
             * then composes jamu alphabets into hangul syllables. 
             */
            iterator i_;
            
    
    };
    
    /** Korean tokenizer plus lowercase filter */
    typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> 
        KoreanAnalyzer;
    
    /**
     * Turns Jamu characters into Hangul syllables and generates 1-grams for
     * all Chinese, Korean and Japanese text. 
     */
    class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
   
		public:
    
		    /** Used to read from buffer */
			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;

			/** Turns utf16 code points into unicode */
            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
            
            /** Turns Hangul syllables into Jamu alphabets */
            typedef tiny::HangulIterator<utf16_iterator> iterator;
            
		public:
       
            KoreanQueryTokenizer( lucene::util::Reader* reader );
            
            virtual bool next( lucene::analysis::Token* token );            
    
		private:

            /** Buffer for storing characters read with reader */
            TinyCjkTokenizer<iterator> t_;
       
            /** Buffer for storing characters read with reader */
            tiny::cl::ReaderBuffer<512> in_;
            
            /** 
             * Reads utf16 from in_ buffer, turns it into unicode and 
             * then composes jamu alphabets into hangul syllables. 
             */
            iterator i_; 
           
    };

    /** Korean query analyzer plus lowercase filter */
    typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> 
        KoreanQueryAnalyzer;

}

#endif /* KOREANANALYZER_H_ */