searchengine/oss/loc/analysis/inc/public/tinyunicode.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
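* Description: Unicode helpers for the analysis package - script classification and Hangul syllable/Jamo composition and decomposition.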
*
*/
#ifndef UNICODEINFO_H_
#define UNICODEINFO_H_

#include "tinyiterator.h"

namespace analysis {

	/**
	 * The package contains various unicode related functionality as
	 * needed by the remaining analysis package
	 */
	namespace unicode {
	
		/**
		 * Returns true if character c is either a Hangul Jamo or a
		 * Hangul Syllable. Note that the result is declared as int,
		 * not bool. Only the declaration is visible in this header.
		 */
        int IsHangul(int c);

        /**
         * Returns true if character c is a Hangul Syllable.
         * The result is declared as int, not bool.
         */
        int IsHangulSyllable(int c);

        /**
         * Returns true if character c is a Hangul Jamo.
         * The result is declared as int, not bool.
         */
        int IsHangulJamo(int c);

        /**
         * Returns true if character c belongs to one of the Chinese,
         * Japanese or Korean writing systems.
         * The result is declared as int, not bool.
         */
        int IsCjk(int c);

        /**
         * Returns true if character c is in the Thai Unicode block.
         * The result is declared as int, not bool.
         */
        int IsThai(int c);
        
        /**
         * Constants of the arithmetic Hangul syllable composition and
         * decomposition scheme: every precomposed syllable is addressed as
         * SyllableBase + (leading * VowelCount + vowel) * TrailingCount + trailing.
         */
        namespace hangul {

            /** Code point of the first precomposed Hangul syllable (U+AC00) */
            static const int SyllableBase = 0xAC00;

            // Jamo alphabet

            /** Code point of the first leading Jamo consonant (U+1100) */
            static const int LeadingBase = 0x1100;

            /** Code point of the first Jamo vowel (U+1161) */
            static const int VowelBase = 0x1161;

            /**
             * Code point immediately before the first trailing Jamo
             * consonant (U+11A8). Trailing offsets are counted from this
             * base so that offset 0 can mean "no trailing consonant".
             */
            static const int TrailingBase = 0x11A7;

            /** Number of leading consonants */
            static const int LeadingCount = 19;

            /** Number of vowels */
            static const int VowelCount = 21;

            /**
             * Number of trailing consonant slots. This includes the
             * "no trailing consonant" slot at offset 0.
             */
            static const int TrailingCount = 28;

            /**
             * Number of precomposed syllables that share one leading
             * consonant, i.e. every vowel combined with every trailing slot.
             */
            static const int LvSyllableCount = VowelCount * TrailingCount; // 588

            /**
             * Total number of precomposed Hangul syllables
             * (U+AC00..U+D7A3). This is one LvSyllableCount sized group per
             * leading consonant; multiplying by TrailingCount instead would
             * yield 16464 and extend the range beyond the syllable block.
             */
            static const int SyllableCount = LeadingCount * LvSyllableCount; // 11172
        }
        
        /**
         * Decomposes hangul syllable into jamu alphabets
         */
        template<typename Output>
        void DecomposeHangul(Output out, int c) {
            using namespace hangul;
            int sindex = c - SyllableBase;
            if (sindex < 0 || sindex >= SyllableCount) {
                out<<c<<'\0';
            } else {
                // Leading
                out<<(LeadingBase + sindex / LvSyllableCount);
                // Vocal
                out<<(VowelBase + (sindex % LvSyllableCount) / TrailingCount);
                // Trailing (voluntary)
                int toffset = sindex % TrailingCount;
                if (toffset) out<<(TrailingBase + toffset);
                // Finish
                out<<'\0';
            }
        }
        
        /**
         * Reads one character from i and, if it starts a Jamo sequence,
         * composes that sequence into a single Hangul syllable.
         *
         * A leading consonant followed by a vowel is composed into the
         * matching syllable; a trailing consonant that follows is folded
         * into the same syllable. A leading consonant that is not followed
         * by a vowel, and any other character, is returned unchanged.
         *
         * The iterator is advanced past exactly the characters that were
         * consumed. Note that the character after a leading consonant
         * (and after a vowel) is inspected, so the sequence must remain
         * readable one position beyond the consumed characters.
         *
         * @param i iterator over the source characters; advanced in place
         * @return the composed syllable, or the character read if nothing
         *         was composed
         */
        template<typename Iterator> 
        int ConsumeComposedJamu(Iterator& i) {
            using namespace hangul;
            int c = *i; ++i;
            int lindex = c - LeadingBase;
            if (0 <= lindex && lindex < LeadingCount) {
                int vindex = *i - VowelBase;
                if (0 <= vindex && vindex < VowelCount) {
                    ++i;
                    c = (SyllableBase + (lindex * VowelCount + vindex) * TrailingCount);
                    int tindex = *i - TrailingBase;
                    // tindex 0 is TrailingBase itself, which is the
                    // "no trailing consonant" slot and not a trailing
                    // consonant. It must not be consumed, otherwise the
                    // character would be dropped without affecting c.
                    if (0 < tindex && tindex < TrailingCount) {
                        ++i;
                        c += tindex;
                    }
                }
            }
            return c;
        }
        
        /**
         * Returns the character found at i, with a Jamo sequence starting
         * there composed into a single Hangul syllable.
         *
         * The iterator is received by value, so composition runs on a copy
         * and the caller's iterator is left where it was.
         */
        template<typename Iterator> 
        inline int ComposeJamu(Iterator i) {
            const int composed = ConsumeComposedJamu(i);
            return composed;
        }
	}
	
	namespace tiny { // tiny analysis
	
        using namespace analysis::unicode::hangul;
        using namespace analysis::unicode;
	
        /**
         * Iterator adapter that presents the characters of an underlying
         * iterator with Hangul Jamo sequences composed into Hangul
         * syllables. Characters are loaded through ConsumeComposedJamu.
         */
        template <typename Iterator> 
        struct HangulIterator {
            public:
                /** Creates an adapter over a default-constructed source */
                HangulIterator() : source_(), current_(), start_(0) {}

                /** Wraps i and immediately loads the first character */
                HangulIterator(Iterator i) : source_(i) {
                    // The increment fills in current_ and start_
                    this->operator++();
                }

                /** Returns the most recently loaded character */
                int operator*() {
                    return current_;
                }

                /**
                 * Returns the int conversion of the source iterator as it
                 * was just before the current character was read
                 * (presumably its offset in the text).
                 */
                operator int() {
                    return start_;
                }

                /** Loads the next (possibly composed) character */
                HangulIterator& operator++() {
                    // Remember the position first; the read below moves
                    // the source iterator forward
                    start_ = source_;
                    current_ = ConsumeComposedJamu(source_);
                    return *this;
                }
            private:
                Iterator source_; // underlying character source
                int current_;     // character returned by operator*
                int start_;       // source position of current_
        };
        
        /**
         * Iterator adapter that presents the characters of an underlying
         * iterator with Hangul syllables decomposed into Hangul Jamo
         * characters. Each source character is expanded by DecomposeHangul
         * into a small '\0' terminated buffer, which is then handed out
         * one character at a time.
         */
        template <typename Iterator> 
        struct JamuIterator {
            public:
                /**
                 * Creates an adapter over a default-constructed source.
                 * The buffer is zero filled so that operator* yields 0
                 * instead of reading an uninitialized value.
                 */
                JamuIterator() : i_(), b_(0), offset_(0) {
                    buf_[0] = buf_[1] = buf_[2] = buf_[3] = 0;
                }

                /** Wraps i and immediately loads the first character */
                JamuIterator(Iterator i) : i_(i), b_(0) {
                    // A terminator at index 1 makes the increment below
                    // see an exhausted buffer and refill it from i_
                    buf_[1] = '\0';
                    ++(*this); // populate buffer
                }

                /** Returns the current character of the buffer */
                int operator*() {
                    return buf_[b_];
                }

                /**
                 * Steps to the next buffered character. Once the
                 * terminator is reached, the next source character is
                 * decomposed into the buffer.
                 */
                JamuIterator& operator++() {
                    // NOTE(review): offset_ is refreshed on every
                    // increment, also when only stepping inside the
                    // buffer. At that point i_ has already moved past the
                    // decomposed source character, so the second and
                    // third Jamo of a syllable report the position after
                    // the syllable - confirm that this is intended.
                    offset_ = i_;
                    if (!buf_[++b_]) {
                        b_ = 0; // reset buf
                        tiny::IteratorOutput<int*> out(buf_);
                        DecomposeHangul(out, *i_); ++i_;
                    }
                    return *this;
                }

                /**
                 * Returns the int conversion of the source iterator as
                 * recorded by the last increment (presumably an offset
                 * in the text).
                 */
                operator int() {
                    return offset_;
                }
            private:
                Iterator i_;  // underlying character source
                int buf_[4];  // up to three Jamo plus the '\0' terminator
                int b_;       // index of the current character in buf_
                int offset_;  // source position recorded by operator++

        };
	}
}


#endif /* UNICODEINFO_H_ */