searchengine/oss/loc/analysis/inc/public/tinyunicode.h
changeset 10 afe194b6b1cd
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/tinyunicode.h	Tue Jul 06 15:30:04 2010 +0300
@@ -0,0 +1,213 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+#ifndef UNICODEINFO_H_
+#define UNICODEINFO_H_
+
+#include "tinyiterator.h"
+
+namespace analysis {
+
+	/**
+	 * Unicode-related helpers needed by the rest of the
+	 * analysis package.
+	 */
+	namespace unicode {
+	
+		/**
+		 * Tests whether character c is a Hangul character, that is
+		 * either a Hangul Jamo or a Hangul Syllable.
+		 * Only declared here; the definition is not part of this header.
+		 *
+		 * @param c character to classify
+		 * @return true, if c is either Hangul Jamo or Syllable
+		 */
+        int IsHangul(int c);
+        
+        /**
+         * Tests whether character c is a Hangul Syllable.
+         *
+         * @param c character to classify
+         * @return true, if c is Hangul Syllable
+         */
+        int IsHangulSyllable(int c);
+        
+        /**
+         * Tests whether character c is a Hangul Jamo.
+         *
+         * @param c character to classify
+         * @return true, if c is Hangul Jamo
+         */
+        int IsHangulJamo(int c);
+        
+        /** 
+         * Tests whether character c belongs to the Chinese,
+         * Japanese or Korean writing systems.
+         *
+         * @param c character to classify
+         * @return true, if c is of either Chinese, Japanese or
+         *         Korean writing systems
+         */
+        int IsCjk(int c);
+        
+        /**
+         * Tests whether character c is in the Thai Unicode block.
+         *
+         * @param c character to classify
+         * @return true, if c is in the Thai Unicode block
+         */
+        int IsThai(int c);
+        
+        namespace hangul {
+        
+            /**
+             * Constants of the arithmetic Hangul syllable composition and
+             * decomposition. A precomposed syllable is addressed as
+             * SyllableBase + (leading * VowelCount + vowel) * TrailingCount
+             *              + trailing
+             */
+        
+            /** First Hangul Syllable code */
+            static const int SyllableBase = 0xAC00;
+            
+            // Jamo alphabets
+            /** First leading Jamo consonant */
+            static const int LeadingBase = 0x1100;
+            
+            /** First Jamo vowel */
+            static const int VowelBase = 0x1161;
+            
+            /** 
+             * Base of the trailing Jamo consonants. This is one below 
+             * the first real trailing consonant (0x11A8), because 
+             * trailing index 0 stands for 'no trailing consonant'.
+             */
+            static const int TrailingBase = 0x11A7;
+            
+            /** Leading consonants count */
+            static const int LeadingCount = 19;
+            
+            /** Vowel count */
+            static const int VowelCount = 21;
+            
+            /** 
+             * Trailing consonant count, including the 'no trailing 
+             * consonant' case at index 0
+             */
+            static const int TrailingCount = 28;
+            
+            /** 
+             * Amount of syllables that share the same leading consonant,
+             * i.e. all vowel / trailing consonant combinations 
+             */
+            static const int LvSyllableCount = VowelCount * TrailingCount; // 588
+            
+            /**
+             * Amount of all precomposed Hangul syllables. Every leading
+             * consonant contributes LvSyllableCount syllables, which
+             * covers the range 0xAC00 - 0xD7A3.
+             */
+            static const int SyllableCount = LeadingCount * LvSyllableCount; // 11172 
+        }
+        
+        /**
+         * Splits a Hangul syllable into its Jamo letters and writes
+         * them to the output, followed by a terminating '\0'.
+         * A character outside of the Hangul syllable range is written
+         * to the output unchanged, again followed by '\0'.
+         *
+         * @param out sink that receives the characters via operator<<
+         * @param c   character to decompose
+         */
+        template<typename Output>
+        void DecomposeHangul(Output out, int c) {
+            using namespace hangul;
+            const int syllable = c - SyllableBase;
+            if (0 <= syllable && syllable < SyllableCount) {
+                const int trailing = syllable % TrailingCount;
+                // Leading consonant
+                out << (LeadingBase + syllable / LvSyllableCount);
+                // Vowel
+                out << (VowelBase + (syllable % LvSyllableCount) / TrailingCount);
+                // Trailing consonant is written only for a non-zero index
+                if (trailing) {
+                    out << (TrailingBase + trailing);
+                }
+                // Terminator
+                out << '\0';
+            } else {
+                // Not a Hangul syllable, pass it through as it is
+                out << c << '\0';
+            }
+        }
+        
+        /**
+         * Composes encountered Jamo letters into a Hangul syllable.
+         * Moves given iterator over the consumed characters.
+         *
+         * The first character is always consumed. If it is a leading
+         * consonant directly followed by a vowel, the vowel is consumed
+         * too and both are composed into a syllable. A trailing
+         * consonant following the vowel is consumed and merged into
+         * the syllable as well.
+         *
+         * The iterator is dereferenced beyond the first character
+         * without any end check, whenever the first character is a
+         * leading consonant. The caller has to provide a sequence that
+         * is terminated by a non-Jamo character (presumably '\0').
+         *
+         * @param i iterator at the character to consume, advanced over
+         *          everything that was consumed
+         * @return the composed syllable, or the first character 
+         *         unchanged, if nothing could be composed
+         */
+        template<typename Iterator> 
+        int ConsumeComposedJamu(Iterator& i) {
+            using namespace hangul;
+            int c = *i; ++i;
+            int lindex = c - LeadingBase;
+            if (0 <= lindex && lindex < LeadingCount) {
+                int vindex = *i - VowelBase;
+                if (0 <= vindex && vindex < VowelCount) {
+                    ++i;
+                    c = (SyllableBase + (lindex * VowelCount + vindex) * TrailingCount);
+                    int tindex = *i - TrailingBase;
+                    // Index 0 is TrailingBase itself, which is not a 
+                    // trailing consonant and must not be swallowed
+                    if (0 < tindex && tindex < TrailingCount) {
+                        ++i;
+                        c += tindex;
+                    } 
+                }
+            }
+            return c;
+        }
+        
+        /**
+         * Composes the Jamo letters found at the given position into 
+         * a Hangul syllable. The iterator is taken by value, so the 
+         * iterator of the caller is not moved.
+         *
+         * @return the result of ConsumeComposedJamu for the position
+         */
+        template<typename Iterator> 
+        inline int ComposeJamu(Iterator i) {
+            const int composed = ConsumeComposedJamu(i);
+            return composed;
+        }
+	}
+	
+	namespace tiny { // tiny analysis
+	
+        using namespace analysis::unicode::hangul;
+        using namespace analysis::unicode;
+	
+        /**
+         * Iterator adapter, which composes the Hangul Jamo characters
+         * read from the wrapped iterator into Hangul syllables.
+         */
+        template <typename Iterator> 
+        struct HangulIterator {
+            public:
+                /** Constructs iterator with default state and offset 0 */
+                HangulIterator() : i_(), c_(), offset_(0) {}
+
+                /** Wraps given iterator and reads the first character */
+                HangulIterator(Iterator i) : i_(i) {
+                    operator++(); // populate c_ and offset_
+                }
+
+                /**
+                 * Stores the wrapped iterator converted to int as the
+                 * offset, and then consumes the next, possibly 
+                 * composed, character
+                 */
+                HangulIterator& operator++() {
+                    offset_ = i_;
+                    c_ = ConsumeComposedJamu(i_);
+                    return *this;
+                }
+
+                /** Returns the current character */
+                int operator*() {
+                    return c_;
+                }
+
+                /** Returns the offset stored for the current character */
+                operator int() {
+                    return offset_;
+                }
+            private:
+                Iterator i_;   // wrapped iterator, already past c_
+                int c_;        // current character
+                int offset_;   // i_ as int, before c_ was consumed
+        };
+        
+        /**
+         * Iterator adapter, which decomposes the Hangul syllables read
+         * from the wrapped iterator into Hangul Jamo characters. Other
+         * characters are passed through as they are.
+         *
+         * Each source character is decomposed into a small buffer, 
+         * which is terminated with '\0'. The buffer is then walked 
+         * through, before the next source character is read.
+         */
+        template <typename Iterator> 
+        struct JamuIterator {
+            public:
+                /** 
+                 * Constructs iterator with default state, offset 0 and 
+                 * an empty buffer, so that dereferencing returns '\0'
+                 */
+                JamuIterator() : i_(), b_(0), offset_(0) { 
+                    ClearBuffer();
+                }
+
+                /** Wraps given iterator and decomposes first character */
+                JamuIterator(Iterator i) : i_(i), b_(0), offset_(0) {
+                    // Empty buffer forces operator++ to refill it
+                    ClearBuffer();
+                    ++(*this); // populate buffer
+                }
+
+                /** Returns the current character from the buffer */
+                int operator*() {
+                    return buf_[b_];
+                }
+
+                /**
+                 * Moves to the next buffered character. If the buffer
+                 * is exhausted, the next source character is read and 
+                 * decomposed into the buffer.
+                 *
+                 * NOTE(review): offset is updated on every step, also 
+                 * when only moving inside the buffer. In that case it
+                 * refers to the position after the decomposed source 
+                 * character - confirm that this is intended.
+                 */
+                JamuIterator& operator++() {
+                    offset_ = i_;
+                    if (!buf_[++b_]) {
+                        b_ = 0; // reset buf
+                        tiny::IteratorOutput<int*> out(buf_);
+                        DecomposeHangul(out, *i_); ++i_;
+                    }
+                    return *this;
+                }
+
+                /** Returns the offset stored by the latest increment */
+                operator int() {
+                    return offset_;
+                }
+            private:
+                /** Fills the entire buffer with terminators */
+                void ClearBuffer() {
+                    buf_[0] = buf_[1] = buf_[2] = buf_[3] = '\0';
+                }
+
+                Iterator i_;   // wrapped iterator
+                int buf_[4];   // up to 3 characters and the terminator
+                int b_;        // index of current character in buf_
+                int offset_;   // i_ as int, stored by operator++
+        };
+	}
+}
+
+
+#endif /* UNICODEINFO_H_ */