searchengine/oss/loc/analysis/src/koreananalyzer.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: Implementation of the KoreanTokenizer and KoreanQueryTokenizer classes.
*
*/

#include "koreananalyzer.h"
#include "tinyanalysis.inl"

namespace analysis {

    /**
     * Builds a tokenizer that reads its text from the supplied reader.
     *
     * Both token offsets and the pending-prefix counter start at zero. The
     * input buffer in_ is constructed from *reader and the cursor i_ is
     * positioned at in_.begin().
     *
     * NOTE(review): reader is dereferenced unconditionally, so callers must
     * pass a non-null pointer.
     *
     * @param reader source of the characters to tokenize
     */
    KoreanTokenizer::KoreanTokenizer(lucene::util::Reader* reader)
        : begin_( 0 ),
          end_( 0 ),
          state_( 0 ),
          t_( 1 ),  // NOTE(review): meaning of the argument 1 depends on t_'s type - confirm in the header
          in_( *reader ),
          i_( iterator( utf16_iterator( in_.begin() ) ) )
    {
    }

    /**
     * Writes the next token into *token.
     *
     * Two modes, selected by state_:
     *  - state_ != 0: a Hangul syllable was returned earlier and shortened
     *    forms of it are still pending. jamu_ is cut at index state_, state_
     *    is decremented, and the character produced by ComposeJamu(jamu_) is
     *    returned with the syllable's offsets and a position increment of 0.
     *  - state_ == 0: the input is scanned from the cursor i_ until *i_
     *    evaluates to zero. A Hangul syllable is decomposed into jamu_, is
     *    returned as a one-character token, and schedules
     *    wcslen(jamu_) - 1 follow-up tokens. Any other character is offered
     *    to t_.consume(); when that yields a token it is copied out, and
     *    otherwise the cursor advances by one.
     *
     * NOTE(review): presumably the follow-up tokens are the jamo prefixes of
     * the syllable, emitted so prefix searches match - confirm against
     * ComposeJamu/DecomposeHangul.
     *
     * @param token receives the text and offsets of the token
     * @return true if a token was written, false once the input is exhausted
     */
    bool KoreanTokenizer::next(lucene::analysis::Token* token) {
        using namespace unicode;
        using namespace tiny;

        // Pending shortened forms of the last syllable take priority over
        // reading further input.
        if ( state_ ) {
            jamu_[state_] = '\0';   // drop the trailing jamo
            --state_;
            const wchar_t prefix[] = { ComposeJamu(jamu_), '\0' };
            token->set( prefix, begin_, end_ );
            token->setPositionIncrement(0);   // same position as the full syllable
            return true;
        }

        for ( ; *i_; ++i_ ) {
            if ( !IsHangulSyllable( *i_ ) ) {
                // Not Hangul: let the generic tokenizer try from here.
                Token<iterator> plain = t_.consume( i_ );
                if ( plain ) {
                    plain.copyTo(token);
                    return true;
                }
                continue;   // nothing produced; step over this character
            }

            // Remember the decomposition so later calls can emit the
            // shortened forms.
            DecomposeHangul( IteratorOutput<wchar_t*>(jamu_), *i_ );
            state_ = wcslen(jamu_)-1;

            wchar_t syllable[] = {*i_, '\0'};
            begin_ = i_;
            end_ = ++i_;   // the syllable occupies exactly one cursor step
            token->set( syllable, begin_, end_ );
            return true;
        }

        return false;
    }
    
	
	/**
	 * Builds a query tokenizer that reads its text from the supplied reader.
	 *
	 * The reader is handed to the lucene::analysis::Tokenizer base class and
	 * is also used to construct the input buffer in_; the cursor i_ is
	 * created from a buffer_iterator over in_.
	 *
	 * NOTE(review): reader is dereferenced unconditionally, so callers must
	 * pass a non-null pointer.
	 *
	 * @param reader source of the characters to tokenize
	 */
	KoreanQueryTokenizer::KoreanQueryTokenizer( lucene::util::Reader* reader )
	    : lucene::analysis::Tokenizer( reader ),
	      t_( 1 ),  // NOTE(review): meaning of the argument 1 depends on t_'s type - confirm in the header
	      in_( *reader ),
	      i_( utf16_iterator( buffer_iterator( in_ ) ) )
	{
	}
	
    /**
     * Writes the next token into *token.
     *
     * The work is delegated to t_.consume() at the cursor i_. Unlike
     * KoreanTokenizer::next, no Hangul-specific handling is done here.
     *
     * @param token receives the token when one is produced
     * @return true if a token was written, false if t_.consume() yielded none
     */
    bool KoreanQueryTokenizer::next( lucene::analysis::Token* token ) {
        using namespace tiny;

        Token<iterator> found = t_.consume(i_);
        if ( !found ) {
            return false;
        }

        found.copyTo( token );
        return true;
    }
}