searchengine/oss/loc/analysis/src/ngram.cpp
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 #include "ngram.h"
       
    20 #include "tinyunicode.h"
       
    21 #include "tinyanalysis.inl"
       
    22 
       
    23 
       
    24 namespace analysis {
       
    25 
       
    26 	using namespace unicode; 
       
    27 
       
    28 	int IsNonCjk(int c) {
       
    29 		return iswalnum(c) && !IsCjk(c); 
       
    30 	}
       
    31 
       
    32 	CjkNGramTokenizer::CjkNGramTokenizer( 
       
    33 			lucene::util::Reader* reader, 
       
    34 			int gramSize ) 
       
    35 		: lucene::analysis::Tokenizer(reader),
       
    36           t_( gramSize ),
       
    37 		  in_( *reader ),
       
    38 	      i_( buffer_iterator( in_ ) ){
       
    39 	}
       
    40 		
       
    41 	bool CjkNGramTokenizer::next( lucene::analysis::Token* token ) {
       
    42 		using namespace tiny;
       
    43 
       
    44 		Token<iterator> t = t_.consume(i_);
       
    45         if ( t ) {
       
    46             t.copyTo( token );
       
    47             return true; 
       
    48         } 
       
    49 		return false; 
       
    50 	}
       
    51 	
       
    52 	JamuNGramTokenizer::JamuNGramTokenizer( lucene::util::Reader* reader, 
       
    53                                             int gramSize ) 
       
    54     : lucene::analysis::Tokenizer( reader ),
       
    55        t_( gramSize ),
       
    56        in_( *reader ), 
       
    57        i_( utf16_iterator( buffer_iterator( in_ ) ) ) {}
       
    58 	
       
    59     bool JamuNGramTokenizer::next( lucene::analysis::Token* token ) {
       
    60         using namespace tiny;
       
    61         
       
    62         Token<iterator> t = t_.consume(i_);
       
    63         if ( t ) {
       
    64             t.copyTo( token );
       
    65             return true; 
       
    66         } 
       
    67         return false; 
       
    68     }
       
    69 }