searchengine/oss/loc/analysis/inc/public/koreananalyzer.h
changeset 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/koreananalyzer.h	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,157 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Korean tokenizers and analyzers for indexing and for querying.
+*
+*/
+
+#ifndef KOREANANALYZER_H_
+#define KOREANANALYZER_H_
+
+#include "Clucene.h"
+
+#include "ngram.h"
+
+#include "tinyanalysis.h"
+#include "tinyutf16.h"
+#include "tinyunicode.h"
+
+namespace analysis 
+{
+    // Forward declarations
+
+	/**
+	 * Special Korean tokenizer (wrapped as KoreanAnalyzer) that is designed
+	 * so that Cpix can update the result list as each individual Jamo
+	 * character is entered.
+	 * 
+	 * The tokenizer first tries to convert the given character stream into 
+	 * a form where all Jamo characters are composed into Hangul syllables.
+	 * This means that character sequences of the form LV and LVT are 
+	 * eliminated and replaced with Hangul syllables (L is a leading Jamo
+	 * consonant, V is a vowel and T is a trailing consonant).  
+	 * 
+	 * The idea is that it produces up to 3 alternative tokens for each 
+	 * Hangul syllable. All of these alternatives are returned at the same
+	 * position. Let H1 be a Hangul syllable consisting of the Jamo 
+	 * characters J1J2J3, i.e. H1=J1J2J3, and let H2=J1J2. Then the first
+	 * returned token is H1, the second is H2 and the third is J1. This 
+	 * means that when the user enters H1, H2 or J1, the term H1 is found. 
+	 * The term is also found if the user enters J1J2J3 or J1J2, because
+	 * J1J2J3 is automatically turned into H1 and J1J2 into H2.
+	 * 
+	 * NOTE: This tokenizer MUST NOT be used when searching, because 
+	 * CLuceneQueryParser breaks when it encounters tokens with a zero 
+	 * position increment. Use KoreanQueryAnalyzer for searching material
+	 * indexed with this tokenizer. 
+	 */
+    class KoreanTokenizer : public lucene::analysis::Tokenizer {
+   
+        public:
+            
+            /** Iterator used to read characters from the reader buffer */
+            typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+
+            /** Decodes the buffered UTF-16 into Unicode characters */
+            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+            
+            /** Composes Jamo characters into Hangul syllables */
+            typedef tiny::HangulIterator<utf16_iterator> iterator;
+            /** Creates a tokenizer that takes its input from the given reader */
+            KoreanTokenizer(lucene::util::Reader* reader);
+            /** Fills 'token' with the next token; presumably false at end - confirm */
+            virtual bool next(lucene::analysis::Token* token);
+
+        private:
+            
+            /** Jamo form of the last consumed Hangul syllable */
+            wchar_t jamu_[4];
+            
+            /** Offsets of the last consumed Hangul syllable */
+            int begin_, end_;
+            
+            /**
+             * The number of Jamo characters left in the jamu_ buffer. 
+             * If this is non-zero, a Hangul syllable is being processed. 
+             */
+            int state_;
+            
+            /**
+             * Tiny CJK tokenizer; used to construct 1-grams out of Chinese
+             * and Japanese characters and to turn Latin script into terms.
+             * NOTE(review): declared before in_ and i_, so built first - confirm.
+             */
+            TinyCjkTokenizer<iterator> t_;
+            /** Buffer for storing characters read with the reader.
+             *  NOTE(review): whether 512 counts bytes or characters is unconfirmed. */
+            tiny::cl::ReaderBuffer<512> in_;
+            
+            /** 
+             * Reads UTF-16 from the in_ buffer, turns it into Unicode and 
+             * then composes Jamo characters into Hangul syllables. 
+             */
+            iterator i_;
+            
+    
+    };
+    
+    /** Index-time Korean analyzer: KoreanTokenizer plus lowercase filter */
+    typedef TemplateAnalyzer1F<KoreanTokenizer, lucene::analysis::LowerCaseFilter> 
+        KoreanAnalyzer;
+    
+    /**
+     * Query-time tokenizer: turns Jamo characters into Hangul syllables and
+     * generates 1-grams for all Chinese, Korean and Japanese text. 
+     */
+    class KoreanQueryTokenizer : public lucene::analysis::Tokenizer {
+   
+		public:
+    
+            /** Iterator used to read characters from the reader buffer */
+			typedef tiny::cl::ReaderBuffer<512>::iterator buffer_iterator;
+
+            /** Decodes the buffered UTF-16 into Unicode characters */
+            typedef tiny::Utf16Iterator<buffer_iterator> utf16_iterator;
+            
+            /** Composes Jamo characters into Hangul syllables */
+            typedef tiny::HangulIterator<utf16_iterator> iterator;
+            
+		public:
+            /** Creates a tokenizer that takes its input from the given reader */
+            KoreanQueryTokenizer( lucene::util::Reader* reader );
+            /** Fills 'token' with the next token; presumably false at end - confirm */
+            virtual bool next( lucene::analysis::Token* token );            
+    
+		private:
+
+            /** Tiny CJK tokenizer over 'iterator'; presumably produces the 1-grams */
+            TinyCjkTokenizer<iterator> t_;
+       
+            /** Buffer for storing characters read with the reader */
+            tiny::cl::ReaderBuffer<512> in_;
+            
+            /** 
+             * Reads UTF-16 from the in_ buffer, turns it into Unicode and 
+             * then composes Jamo characters into Hangul syllables. 
+             */
+            iterator i_; 
+           
+    };
+
+    /** Query-time Korean analyzer: KoreanQueryTokenizer plus lowercase filter */
+    typedef TemplateAnalyzer1F<KoreanQueryTokenizer, lucene::analysis::LowerCaseFilter> 
+        KoreanQueryAnalyzer;
+
+}
+
+#endif /* KOREANANALYZER_H_ */