diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/inc/public/ngram.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/searchengine/oss/loc/analysis/inc/public/ngram.h Fri Oct 15 12:09:28 2010 +0530 @@ -0,0 +1,153 @@ +/* +* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ +#ifndef NGRAM_H_ +#define NGRAM_H_ + +#include "tinyanalysis.h" +#include "tinyunicode.h" +#include "clutil.h" + +namespace analysis { + + /** + * Returns true, if the character is non-cjk letter + */ + int IsNonCjk(int c); + + /** + * TinyCjkTokenizer. Contains tiny analysis classes, that are + * used to turn Chinese, Korean and Japanese into 1-grams, while + * using letter analyzer for other kinds of text (western, cyrillic, + * etc.) + * + * @tparam I the iterator, that is used to read characters + */ + template + struct TinyCjkTokenizer { + + /** Deals with cjk */ + tiny::NGramTokenizer cjk_; + /** Letter tokenizer for space separated language */ + tiny::CustomTokenizer noncjk_; + /** Combines cjk with noncjk */ + tiny::PairTokenizer pair_; + /** Moves forward, if tokenization fails */ + tiny::RelaxedTokenizer t_; + + /** + * Constructs the tiny cjk tokenizer with given ngram size + * + * @param ngramsize cjk text is treated with n-gram analyzer of this size + */ + TinyCjkTokenizer(int ngramsize) + : cjk_(ngramsize, &unicode::IsCjk), + noncjk_(&IsNonCjk), + pair_(cjk_, noncjk_), + t_(pair_) {} + + /** + * Consumes a token from given iterator. Returns n-grams + * for cjk text, letter tokenized words for non-cjk text. + * Always returns something unless EOS has been reached. + */ + inline tiny::Token consume(I& i) { + return t_.consume(i); + } + }; + + + /** + * Constructs n-grams of Chinese, Korean and Japanese text. Uses + * letter tokenization for other kinds of texts. + */ + class CjkNGramTokenizer : public lucene::analysis::Tokenizer { + + public: + + /** Reads from buffer */ + typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; + + /** Turns utf16 to unicode */ + typedef tiny::Utf16Iterator iterator; + + public: + + CjkNGramTokenizer( lucene::util::Reader* reader, int gramSize ); + + virtual bool next( lucene::analysis::Token* token ); + + private: + + /** The tokenizer */ + TinyCjkTokenizer t_; + + /** Buffer */ + tiny::cl::ReaderBuffer<512> in_; + + /** Reads utf16 from buffer and transforms it to unicode*/ + iterator i_; + + }; + + /** + * The great difference of this class compared to CJK ngram, + * that it decomposes Hangul syllables into Hangul Jamu letters. + * + * This analyzer appeared to have bad performance in testing. + */ + class JamuNGramTokenizer : public lucene::analysis::Tokenizer { + + public: + + typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; + + typedef tiny::Utf16Iterator utf16_iterator; + + typedef tiny::JamuIterator iterator; + + public: + + JamuNGramTokenizer( lucene::util::Reader* reader, int gramSize ); + + virtual bool next( lucene::analysis::Token* token ); + + private: + + TinyCjkTokenizer t_; + + tiny::cl::ReaderBuffer<512> in_; + + iterator i_; + + }; + + // Analyzers using the tokenizers + // * Provided mainly for testing + // + + /** CjkNGramTokenizer plus lowercase filter */ + typedef TemplateAnalyzer1A1F + CjkNGramAnalyzer; + + /** JamuNGramTokenizer plus lowercase filter */ + typedef TemplateAnalyzer1A1F + JamuNGramAnalyzer; + +} + + +#endif /* NGRAM_H_ */