searchengine/oss/loc/analysis/src/koreananalyzer.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
#include "koreananalyzer.h"
hgs
parents:
diff changeset
    19
#include "tinyanalysis.inl"
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
namespace analysis {
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
    /**
     * Builds a tokenizer over the text supplied by `reader`.
     *
     * The recorded offsets (begin_, end_) and the pending-prefix counter
     * (state_) start at zero, the fallback tokenizer t_ is constructed with
     * the argument 1, in_ is constructed from the dereferenced reader and
     * i_ is positioned at in_.begin().
     *
     * @param reader text source; it is dereferenced unconditionally, so it
     *               must not be null.
     *
     * NOTE(review): unlike KoreanQueryTokenizer's constructor, the reader is
     * not forwarded to a base-class initializer here, and jamu_ is not
     * mentioned in the initializer list -- confirm both are intentional.
     */
    KoreanTokenizer::KoreanTokenizer(lucene::util::Reader* reader)
        : begin_( 0 )
        , end_( 0 )
        , state_( 0 )
        , t_( 1 )
        , in_( *reader )
        , i_( iterator( utf16_iterator( in_.begin() ) ) )
    {
    }
hgs
parents:
diff changeset
    30
hgs
parents:
diff changeset
    31
    /**
     * Produces the next token from the input.
     *
     * Two modes, selected by state_:
     *  - state_ != 0: a Hangul syllable was emitted earlier and shorter
     *    prefixes of its jamo sequence are still pending. jamu_ is cut at
     *    index state_, state_ is decremented, and the remaining jamo are
     *    recomposed with ComposeJamu. The resulting token reuses the stored
     *    begin_/end_ offsets and gets a position increment of 0.
     *  - state_ == 0: the input is scanned for as long as *i_ is non-zero.
     *    A Hangul syllable is decomposed into jamu_, emitted as a
     *    one-character token, and i_ is advanced past it. Anything else is
     *    handed to t_.consume(); when that yields no token, i_ is advanced
     *    by one and scanning continues.
     *
     * @param token output token, filled in when true is returned.
     * @return true when a token was written, false when *i_ evaluates to
     *         zero before any token could be produced.
     *
     * NOTE(review): the wcslen() call assumes DecomposeHangul leaves jamu_
     * null-terminated -- confirm against its implementation.
     */
    bool KoreanTokenizer::next(lucene::analysis::Token* token) {
        using namespace unicode;
        using namespace tiny;

        // Pending prefixes of the last syllable are served before reading on.
        if ( state_ ) {
            jamu_[state_] = '\0';   // drop the trailing jamo
            --state_;
            const wchar_t prefix[] = { ComposeJamu(jamu_), '\0' };
            token->set( prefix, begin_, end_ );
            token->setPositionIncrement(0);
            return true;
        }

        for ( ; *i_; ++i_ ) {
            if ( !IsHangulSyllable( *i_ ) ) {
                Token<iterator> plain = t_.consume( i_ );
                if ( plain ) {
                    plain.copyTo(token);
                    return true;
                }
                continue;   // nothing produced here: skip one position and retry
            }

            // Keep the jamo so that later calls can emit the shorter prefixes.
            DecomposeHangul( IteratorOutput<wchar_t*>(jamu_), *i_ );
            state_ = wcslen(jamu_)-1;

            // The text must be captured before i_ moves past the syllable.
            wchar_t syllable[] = {*i_, '\0'};
            begin_ = i_;
            end_ = ++i_;
            token->set( syllable, begin_, end_ );
            return true;
        }

        return false;
    }
hgs
parents:
diff changeset
    63
    
hgs
parents:
diff changeset
    64
	
hgs
parents:
diff changeset
    65
	/**
	 * Builds a query-side tokenizer over the text supplied by `reader`.
	 *
	 * The reader is forwarded to the lucene::analysis::Tokenizer base, the
	 * tokenizer t_ is constructed with the argument 1, in_ is constructed
	 * from the dereferenced reader and i_ is created from a buffer_iterator
	 * over in_.
	 *
	 * @param reader text source; it is dereferenced unconditionally, so it
	 *               must not be null.
	 */
	KoreanQueryTokenizer::KoreanQueryTokenizer( lucene::util::Reader* reader )
		: lucene::analysis::Tokenizer( reader )
		, t_( 1 )
		, in_( *reader )
		, i_( utf16_iterator( buffer_iterator( in_ ) ) )
	{
	}
hgs
parents:
diff changeset
    70
	
hgs
parents:
diff changeset
    71
    /**
     * Produces the next query token by delegating to t_.consume( i_ ).
     *
     * @param token output token, filled in through copyTo() when true is
     *              returned.
     * @return true when consume() yielded a token, false otherwise.
     *
     * NOTE(review): unlike KoreanTokenizer::next, a failed consume() is not
     * retried after advancing i_ -- confirm that consume() itself skips
     * non-token characters, otherwise tokenization stops at the first one.
     */
    bool KoreanQueryTokenizer::next( lucene::analysis::Token* token ) {
        using namespace tiny;

        Token<iterator> found = t_.consume( i_ );
        if ( !found ) {
            return false;
        }

        found.copyTo( token );
        return true;
    }
hgs
parents:
diff changeset
    81
}