diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/inc/public/koreananalyzer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/searchengine/oss/loc/analysis/inc/public/koreananalyzer.h Fri Oct 15 12:09:28 2010 +0530 @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + +#ifndef KOREANANALYZER_H_ +#define KOREANANALYZER_H_ + +#include "Clucene.h" + +#include "ngram.h" + +#include "tinyanalysis.h" +#include "tinyutf16.h" +#include "tinyunicode.h" + +namespace analysis +{ + // Forward declarations + + /** + * Special Korean analyzer designed so that Cpix can + * update the result list as each individual Jamo character is + * entered. + * + * The analyzer first tries to convert the given character stream into + * a form where all Jamo characters are composed into Hangul form. + * This means that character sequences of the form LV and LVT are + * eliminated and replaced with Hangul syllables (L is the leading Jamo + * consonant, V is the vowel and T is the trailing consonant). + * + * The idea behind the analyzer is that it produces up to 3 alternative + * tokens for each Hangul syllable. All of these alternatives are returned + * as located at the same position. Let H1 be some Hangul syllable + * consisting of Jamo characters so that H1=J1J2J3. If H2=J1J2, then the first + * returned token is H1, the second token is H2 and the third token is J1. This + * means that when the user enters H1, H2 or J1, the term H1 will be found.
+ * Also, if user enters J1J2J3 or J1J2, term will be found, because + * J1J2J3 is automatically turned to H1 and J1J2 is turned to H2. + * + * NOTE: This analyzer MUST NOT be used, when searching, because + * CLuceneQueryParser will break, when it faces tokens with zero + * increment. Use KoreanQueryAnalyzer for searching material indexed + * with this analyzer. + */ + class KoreanTokenizer : public lucene::analysis::Tokenizer { + + public: + + /** Iterator type used to read from the ReaderBuffer<512> input buffer */ + typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; + + /** Decodes UTF-16 input into Unicode characters */ + typedef tiny::Utf16Iterator utf16_iterator; + + /** Composes Jamo characters into Hangul syllables */ + typedef tiny::HangulIterator iterator; + /** Creates a tokenizer that reads its input from reader */ + KoreanTokenizer(lucene::util::Reader* reader); + /** Produces the next token into token. NOTE(review): presumably returns false once input is exhausted - confirm against the implementation */ + virtual bool next(lucene::analysis::Token* token); + + private: + + /** Jamo form of the last consumed Hangul syllable */ + wchar_t jamu_[4]; + + /** Offsets of the last consumed Hangul syllable */ + int begin_, end_; + + /** + * The number of Jamo characters left in the buffer. + * If this is non-zero, a Hangul syllable is being processed. + */ + int state_; + + /** + * Tiny CJK tokenizer, used to construct 1-grams out of + * Chinese and Japanese characters and to turn Latin script + * into terms. + */ + TinyCjkTokenizer t_; + + /** Buffer (size parameter 512) for storing characters read with reader. NOTE(review): the original comment said 512 bytes; the element size is not visible here - confirm */ + tiny::cl::ReaderBuffer<512> in_; + + /** + * Reads UTF-16 from the in_ buffer, turns it into Unicode and + * then composes Jamo characters into Hangul syllables. + */ + iterator i_; + + + }; + + /** Korean tokenizer plus lowercase filter. NOTE(review): no template arguments are visible on TemplateAnalyzer1F in this copy - confirm against the original header */ + typedef TemplateAnalyzer1F + KoreanAnalyzer; + + /** + * Turns Jamo characters into Hangul syllables and generates 1-grams for + * all Chinese, Korean and Japanese text. Exposed for searching through KoreanQueryAnalyzer.
+ */ + class KoreanQueryTokenizer : public lucene::analysis::Tokenizer { + + public: + + /** Iterator type used to read from the ReaderBuffer<512> input buffer */ + typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator; + + /** Decodes UTF-16 input into Unicode characters */ + typedef tiny::Utf16Iterator utf16_iterator; + + /** Composes Jamo characters into Hangul syllables (same iterator type as in KoreanTokenizer) */ + typedef tiny::HangulIterator iterator; + + public: + /** Creates a query tokenizer that reads its input from reader */ + KoreanQueryTokenizer( lucene::util::Reader* reader ); + /** Produces the next token into token. NOTE(review): presumably returns false once input is exhausted - confirm against the implementation */ + virtual bool next( lucene::analysis::Token* token ); + + private: + + /** Tiny CJK tokenizer; presumably produces the 1-grams described in the class comment - TODO confirm */ + TinyCjkTokenizer t_; + + /** Buffer for storing characters read with reader */ + tiny::cl::ReaderBuffer<512> in_; + + /** + * Reads UTF-16 from the in_ buffer, turns it into Unicode and + * then composes Jamo characters into Hangul syllables. + */ + iterator i_; + + }; + + /** Korean query tokenizer plus lowercase filter. NOTE(review): no template arguments are visible on TemplateAnalyzer1F in this copy - confirm against the original header */ + typedef TemplateAnalyzer1F + KoreanQueryAnalyzer; + +} + +#endif /* KOREANANALYZER_H_ */