--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/koreananalyzer.h Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,157 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+#ifndef KOREANANALYZER_H_
+#define KOREANANALYZER_H_
+
+#include "Clucene.h"
+
+#include "ngram.h"
+
+#include "tinyanalysis.h"
+#include "tinyutf16.h"
+#include "tinyunicode.h"
+
+namespace analysis
+{
+ // Forward declarations
+
+ /**
+ * Special Korean analyzer that is designed so, that Cpix can
+ * update the result list, when each individual Jamu character is
+ * entered.
+ *
+ * The analyzer tries to first convert given character stream into
+ * a form, where all Jamu characters are composed into Hangul form.
+ * This means, that character sequences of form LV and LVT are
+ * eliminated and replaced with hangul syllables (L is leading Jamu
+ * consonant, V is for vocal and T is for trailing consonant).
+ *
+ * The idea behind the analyzer is that it produces up to 3 alternative
+ * tokens for each hangul syllabic. All of these alternatives are returned
+ * to be located at the same position. Let's have some Hangul syllabic H1
+ * consisting of Jamu characters so that H1=J1J2J3. If H2=J1J2, then first
+ * returned token is H1, second token is H2 and third token is J1. This
+ * means, that when user enters H1, H2 or J1, the term H1 will be found.
+ * Also, if user enters J1J2J3 or J1J2, term will be found, because
+ * J1J2J3 is automatically turned to H1 and J1J2 is turned to H2.
+ *
+ * NOTE: This analyzer MUST NOT be used, when searching, because
+ * CLuceneQueryParser will break, when it faces tokens with zero
+ * increment. Use KoreanQueryAnalyzer for searching material indexed
+ * with this analyzer.
+ */
+ class KoreanTokenizer : public lucene::analysis::Tokenizer {
+
+ public:
+
+ /** Used to read from buffer */
+ typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+
+ /** Turns utf16 code points into unicode */
+ typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+
+ /** Turns Jamu alphabets into Hangul syllables */
+ typedef tiny::HangulIterator<utf16_iterator> iterator;
+
+ KoreanTokenizer(lucene::util::Reader* reader);
+
+ virtual bool next(lucene::analysis::Token* token);
+
+ private:
+
+ /** Jamu form of last consumed hangul syllable */
+ wchar_t jamu_[4];
+
+ /** offsets of last consumed hangul syllable */
+ int begin_, end_;
+
+ /**
+ * The amount of jamu characters left in buffer.
+ * If this is non-zero, hangul syllable is being processed.
+ */
+ int state_;
+
+ /**
+ * Tiny CJK tokenizer is used to construct 1-grams out of
+ * chinese and japanese characters and to turn latin script
+ * into terms.
+ */
+ TinyCjkTokenizer<iterator> t_;
+
+ /** 512 byte buffer for storing characters read with reader */
+ tiny::cl::ReaderBuffer<512> in_;
+
+ /**
+ * Reads utf16 from in_ buffer, turns it into unicode and
+ * then composes jamu alphabets into hangul syllables.
+ */
+ iterator i_;
+
+
+ };
+
+ /** Korean tokenizer plus lowercase filter */
+ typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter>
+ KoreanAnalyzer;
+
+ /**
+ * Turns Jamu characters into Hangul syllables and generates 1-grams for
+ * all Chinese, Korean and Japanese text.
+ */
+ class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
+
+ public:
+
+ /** Used to read from buffer */
+ typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+
+ /** Turns utf16 code points into unicode */
+ typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+
+ /** Turns Hangul syllables into Jamu alphabets */
+ typedef tiny::HangulIterator<utf16_iterator> iterator;
+
+ public:
+
+ KoreanQueryTokenizer( lucene::util::Reader* reader );
+
+ virtual bool next( lucene::analysis::Token* token );
+
+ private:
+
+ /** Buffer for storing characters read with reader */
+ TinyCjkTokenizer<iterator> t_;
+
+ /** Buffer for storing characters read with reader */
+ tiny::cl::ReaderBuffer<512> in_;
+
+ /**
+ * Reads utf16 from in_ buffer, turns it into unicode and
+ * then composes jamu alphabets into hangul syllables.
+ */
+ iterator i_;
+
+ };
+
+ /** Korean query analyzer plus lowercase filter */
+ typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter>
+ KoreanQueryAnalyzer;
+
+}
+
+#endif /* KOREANANALYZER_H_ */