FCL/sf/mw/searchsrv: comparison searchengine/oss/loc/analysis/inc/public/koreananalyzer.h

equal deleted inserted replaced

-:d4d56f5e7c55
+:65456528cac2
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+#ifndef KOREANANALYZER_H_
+#define KOREANANALYZER_H_
+#include "Clucene.h"
+#include "ngram.h"
+#include "tinyanalysis.h"
+#include "tinyutf16.h"
+#include "tinyunicode.h"
+namespace analysis
+{
+// Forward declarations
+	/**
+	 * Special Korean analyzer that is designed so that Cpix can
+	 * update the result list when each individual Jamo character is
+	 * entered.
+	 *
+	 * The analyzer first tries to convert the given character stream into
+	 * a form where all Jamo characters are composed into Hangul form.
+	 * This means that character sequences of the form LV and LVT are
+	 * eliminated and replaced with Hangul syllables (L is a leading Jamo
+	 * consonant, V is a vowel and T is a trailing consonant).
+	 *
+	 * The idea behind the analyzer is that it produces up to 3 alternative
+	 * tokens for each Hangul syllable. All of these alternatives are returned
+	 * at the same position. Let H1 be a Hangul syllable consisting of
+	 * Jamo characters so that H1=J1J2J3, and let H2=J1J2. Then the first
+	 * returned token is H1, the second token is H2 and the third token is J1.
+	 * This means that when the user enters H1, H2 or J1, the term H1 will be
+	 * found. Also, if the user enters J1J2J3 or J1J2, the term will be found,
+	 * because J1J2J3 is automatically turned into H1 and J1J2 is turned into H2.
+	 *
+	 * NOTE: This analyzer MUST NOT be used when searching, because
+	 * CLuceneQueryParser will break when it faces tokens with a zero
+	 * position increment. Use KoreanQueryAnalyzer for searching material
+	 * indexed with this analyzer.
+	 */
+class KoreanTokenizer : public lucene::analysis::Tokenizer {
+public:
+			/** Iterator type of the reader buffer; used to read characters from it */
+typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+/** Wraps buffer_iterator; turns UTF-16 code units into Unicode code points */
+typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+/** Wraps utf16_iterator; turns Jamo alphabets into Hangul syllables */
+typedef tiny::HangulIterator<utf16_iterator> iterator;
+KoreanTokenizer(lucene::util::Reader* reader); /**< Creates the tokenizer;
+* reader supplies the characters that are tokenized.
+* NOTE(review): ownership of reader is not visible here - confirm. */
+virtual bool next(lucene::analysis::Token* token); /**< Produces the next
+* token into the given Token object.
+* NOTE(review): presumably returns false once the input is exhausted,
+* following the lucene Tokenizer convention - confirm in the .cpp. */
+private:
+/**
+* Jamo form of last consumed Hangul syllable.
+* NOTE(review): 4 slots - presumably up to three Jamo characters
+* plus a terminator; confirm against the implementation.
+*/
+wchar_t jamu_[4];
+/** Offsets of last consumed Hangul syllable */
+int begin_, end_;
+/**
+* The amount of Jamo characters left in buffer.
+* If this is non-zero, a Hangul syllable is being processed.
+*/
+int state_;
+/**
+* Tiny CJK tokenizer is used to construct 1-grams out of
+* Chinese and Japanese characters and to turn Latin script
+* into terms.
+* NOTE(review): t_ is declared before in_ and i_, so it is
+* constructed before them; confirm that the constructor does not
+* rely on in_ or i_ while initializing t_.
+*/
+TinyCjkTokenizer<iterator> t_;
+/**
+* Buffer for storing characters read with reader.
+* NOTE(review): the original comment called this a "512 byte buffer";
+* confirm whether the 512 template argument counts bytes or characters.
+*/
+tiny::cl::ReaderBuffer<512> in_;
+/**
+* Reads UTF-16 from the in_ buffer, turns it into Unicode and
+* then composes Jamo alphabets into Hangul syllables.
+*/
+iterator i_;
+};
+/**
+* Korean tokenizer plus lowercase filter.
+* This is the analyzer meant for indexing; use KoreanQueryAnalyzer
+* when searching material indexed with it.
+*/
+typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter>
+KoreanAnalyzer;
+/**
+* Query-side tokenizer: turns Jamo characters into Hangul syllables and
+* generates 1-grams for all Chinese, Korean and Japanese text.
+*/
+class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
+		public:
+		    /** Iterator type of the reader buffer; used to read characters from it */
+			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+			/** Wraps buffer_iterator; turns UTF-16 code units into Unicode code points */
+typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+/**
+* Wraps utf16_iterator; turns Jamo alphabets into Hangul syllables
+* (same tiny::HangulIterator type that KoreanTokenizer uses).
+*/
+typedef tiny::HangulIterator<utf16_iterator> iterator;
+		public:
+KoreanQueryTokenizer( lucene::util::Reader* reader ); /**< Creates the tokenizer;
+* reader supplies the characters that are tokenized.
+* NOTE(review): ownership of reader is not visible here - confirm. */
+virtual bool next( lucene::analysis::Token* token ); /**< Produces the next
+* token into the given Token object.
+* NOTE(review): presumably returns false once the input is exhausted,
+* following the lucene Tokenizer convention - confirm in the .cpp. */
+		private:
+/**
+* Tiny CJK tokenizer that consumes the composed character stream.
+* NOTE(review): presumably this is what produces the 1-grams mentioned
+* in the class description - confirm in the .cpp. It is declared before
+* in_ and i_, so it is constructed before them; confirm that the
+* constructor does not rely on in_ or i_ while initializing t_.
+*/
+TinyCjkTokenizer<iterator> t_;
+/** Buffer for storing characters read with reader */
+tiny::cl::ReaderBuffer<512> in_;
+/**
+* Reads UTF-16 from the in_ buffer, turns it into Unicode and
+* then composes Jamo alphabets into Hangul syllables.
+*/
+iterator i_;
+};
+/**
+* Korean query tokenizer plus lowercase filter.
+* This is the analyzer to use when searching material that was indexed
+* with KoreanAnalyzer.
+*/
+typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter>
+KoreanQueryAnalyzer;
+}
+#endif /* KOREANANALYZER_H_ */