searchengine/oss/loc/analysis/src/koreananalyzer.cpp
changeset 24 65456528cac2
       
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: Korean tokenizers for the analysis library.
*
*/

#include "koreananalyzer.h"
#include "tinyanalysis.inl"

namespace analysis {

    // Construct a tokenizer over the reader: wrap the input in a UTF-16
    // aware iterator and clear the pending-jamu state.
    KoreanTokenizer::KoreanTokenizer(lucene::util::Reader* reader) :
        begin_(0),
        end_(0),
        state_(0),
        t_(1),
        in_(*reader),
        i_(iterator(utf16_iterator(in_.begin()))) {}

    bool KoreanTokenizer::next(lucene::analysis::Token* token) {
        using namespace unicode;
        using namespace tiny;

        if ( state_ ) {
            // Jamu are pending from a previously decomposed syllable: drop
            // the last jamu, recompose the remaining prefix into a single
            // character and emit it at the same position (increment 0).
            jamu_[state_--] = '\0';
            const wchar_t buf[] = { ComposeJamu(jamu_), '\0' };
            token->set( buf, begin_, end_ );
            token->setPositionIncrement(0);
            return true;
        } else {
            while ( *i_ ) {
                if ( IsHangulSyllable( *i_ ) ) {
                    // Decompose the syllable into its jamu and emit the
                    // syllable itself; later calls emit its recomposed
                    // prefixes via the branch above.
                    DecomposeHangul( IteratorOutput<wchar_t*>(jamu_), *i_ );
                    state_ = wcslen(jamu_) - 1;
                    const wchar_t buf[] = { *i_, '\0' };
                    begin_ = i_;
                    end_ = ++i_;
                    token->set( buf, begin_, end_ );
                    return true;
                } else {
                    // Non-Hangul input is handled by the embedded tokenizer.
                    Token<iterator> t = t_.consume( i_ );
                    if ( t ) {
                        t.copyTo( token );
                        return true;
                    }
                }
                ++i_;
            }
            return false;
        }
    }

    // KoreanQueryTokenizer tokenizes query text with the embedded tokenizer
    // only; it does not expand Hangul syllables into jamu prefixes.
    KoreanQueryTokenizer::KoreanQueryTokenizer( lucene::util::Reader* reader )
    : lucene::analysis::Tokenizer( reader ),
      t_( 1 ),
      in_( *reader ),
      i_( utf16_iterator( buffer_iterator( in_ ) ) ) {}

    bool KoreanQueryTokenizer::next( lucene::analysis::Token* token ) {
        using namespace tiny;

        Token<iterator> t = t_.consume( i_ );
        if ( t ) {
            t.copyTo( token );
            return true;
        }
        return false;
    }
}
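
The jamu handling above relies on IsHangulSyllable, DecomposeHangul and ComposeJamu from tinyanalysis.inl, which is not part of this changeset. For reference, the sketch below is a minimal, self-contained illustration of the standard algorithmic mapping between precomposed Hangul syllables and conjoining jamo (Unicode Standard, section 3.12), together with the prefix-emission order that KoreanTokenizer::next() produces; the names decomposeHangul/composeHangul and the std::vector-based interface are illustrative assumptions, not the actual tinyanalysis.inl API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hangul syllable composition constants (Unicode Standard, section 3.12).
constexpr char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
constexpr int LCount = 19, VCount = 21, TCount = 28;
constexpr int NCount = VCount * TCount;   // 588 syllables per leading consonant
constexpr int SCount = LCount * NCount;   // 11172 precomposed syllables

bool isHangulSyllable(char32_t c) {
    return c >= SBase && c < SBase + SCount;
}

// Decompose a precomposed syllable into two or three conjoining jamo (L, V[, T]).
std::vector<char32_t> decomposeHangul(char32_t s) {
    const int sIndex = static_cast<int>(s - SBase);
    std::vector<char32_t> jamo;
    jamo.push_back(LBase + sIndex / NCount);             // leading consonant
    jamo.push_back(VBase + (sIndex % NCount) / TCount);  // vowel
    if (sIndex % TCount != 0)
        jamo.push_back(TBase + sIndex % TCount);         // trailing consonant, if any
    return jamo;
}

// Compose an L+V(+T) jamo sequence back into one syllable; a lone jamo is
// returned unchanged.
char32_t composeHangul(const std::vector<char32_t>& jamo) {
    if (jamo.size() < 2)
        return jamo.empty() ? 0 : jamo[0];
    const int l = static_cast<int>(jamo[0] - LBase);
    const int v = static_cast<int>(jamo[1] - VBase);
    const int t = jamo.size() > 2 ? static_cast<int>(jamo[2] - TBase) : 0;
    return SBase + (l * VCount + v) * TCount + t;
}

int main() {
    const char32_t han = 0xD55C;  // U+D55C HANGUL SYLLABLE HAN
    const std::vector<char32_t> jamo = decomposeHangul(han);  // U+1112 U+1161 U+11AB

    // Print the syllable followed by its shorter recomposed prefixes, the
    // same order in which KoreanTokenizer::next() emits tokens (the extra
    // prefixes carry position increment 0 there).
    for (std::size_t n = jamo.size(); n >= 1; --n) {
        const std::vector<char32_t> prefix(jamo.begin(), jamo.begin() + n);
        std::cout << "U+" << std::hex << std::uppercase
                  << static_cast<std::uint32_t>(composeHangul(prefix)) << "\n";
    }
    // Expected output: U+D55C, U+D558, U+1112.
}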