24
|
1 |
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
|
|
17 |
|
|
18 |
|
|
19 |
#include "ngram.h"
|
|
20 |
#include "tinyunicode.h"
|
|
21 |
#include "tinyanalysis.inl"
|
|
22 |
|
|
23 |
|
|
24 |
namespace analysis {
|
|
25 |
|
|
26 |
using namespace unicode;
|
|
27 |
|
|
28 |
int IsNonCjk(int c) {
|
|
29 |
return iswalnum(c) && !IsCjk(c);
|
|
30 |
}
|
|
31 |
|
|
32 |
/**
 * Builds a tokenizer that reads its input from the given reader.
 *
 * @param reader   input source; handed to the lucene::analysis::Tokenizer
 *                 base and also dereferenced to initialize in_, so it
 *                 must not be null.
 * @param gramSize value forwarded to the underlying tokenizer t_.
 *
 * NOTE(review): i_ is built from in_. Members are initialized in class
 * declaration order, not in the order written here, so this relies on
 * in_ being declared before i_ -- confirm in ngram.h.
 */
CjkNGramTokenizer::CjkNGramTokenizer(
        lucene::util::Reader* reader,
        int gramSize )
    : lucene::analysis::Tokenizer(reader),
      t_( gramSize ),
      in_( *reader ),
      i_( buffer_iterator( in_ ) ){
}
|
|
40 |
|
|
41 |
/**
 * Produces the next token from the input.
 *
 * Asks t_ to consume from the iterator i_. When the resulting token
 * tests true, it is copied into the caller's token.
 *
 * @param token destination passed to copyTo() for the produced token.
 * @return true when a token was produced and copied into token;
 *         false when t_.consume() yielded a token that tests false.
 */
bool CjkNGramTokenizer::next( lucene::analysis::Token* token ) {
    using namespace tiny;

    Token<iterator> t = t_.consume(i_);
    if ( t ) {
        t.copyTo( token );
        return true;
    }
    return false;
}
|
|
51 |
|
|
52 |
/**
 * Builds a tokenizer that reads its input from the given reader.
 *
 * @param reader   input source; handed to the lucene::analysis::Tokenizer
 *                 base and also dereferenced to initialize in_, so it
 *                 must not be null.
 * @param gramSize value forwarded to the underlying tokenizer t_.
 *
 * The iterator i_ is a utf16_iterator wrapped around a buffer_iterator
 * over in_.
 *
 * NOTE(review): members are initialized in class declaration order, not
 * in the order written here, so this relies on in_ being declared
 * before i_ -- confirm in ngram.h.
 */
JamuNGramTokenizer::JamuNGramTokenizer( lucene::util::Reader* reader,
                                        int gramSize )
    : lucene::analysis::Tokenizer( reader ),
      t_( gramSize ),
      in_( *reader ),
      i_( utf16_iterator( buffer_iterator( in_ ) ) ) {}
|
|
58 |
|
|
59 |
/**
 * Produces the next token from the input.
 *
 * Asks t_ to consume from the iterator i_. When the resulting token
 * tests true, it is copied into the caller's token.
 *
 * @param token destination passed to copyTo() for the produced token.
 * @return true when a token was produced and copied into token;
 *         false when t_.consume() yielded a token that tests false.
 */
bool JamuNGramTokenizer::next( lucene::analysis::Token* token ) {
    using namespace tiny;

    Token<iterator> t = t_.consume(i_);
    if ( t ) {
        t.copyTo( token );
        return true;
    }
    return false;
}
|
|
69 |
}
|