searchengine/oss/loc/analysis/inc/public/tinyunicode.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 #ifndef UNICODEINFO_H_
       
    18 #define UNICODEINFO_H_
       
    19 
       
    20 #include "tinyiterator.h"
       
    21 
       
    22 namespace analysis {
       
    23 
       
    24 	/**
       
    25 	 * The package contains various unicode related functionality as
       
    26 	 * needed by the remaining analysis package
       
    27 	 */
       
    28 	namespace unicode {
       
    29 	
       
    30 		/** Returns true, if character c is either Hangul Jamo or Syllable */
       
    31         int IsHangul(int c);
       
    32         
       
    33         /** Returns true, if character c is Hangul Syllable */
       
    34         int IsHangulSyllable(int c);
       
    35         
       
    36         /** Returns true, if character c is Hangul Jamo */
       
    37         int IsHangulJamo(int c);
       
    38         
       
    39         /** 
       
    40          * Returns true, if character is of either Chinese, 
       
    41          * Japanese or Korean writing systems
       
    42          */
       
    43         int IsCjk(int c);
       
    44         
       
    45         /**
       
     46          * Returns true, if the character is in the Thai unicode block
       
    47          */
       
    48         int IsThai(int c);
       
    49         
       
    50         namespace hangul {
       
    51         
       
    52 			/* First Hangul Syllable code */
       
    53             static const int SyllableBase = 0xAC00;
       
    54             
       
    55             // Jamu Alphabets
       
    56             /** First leading Jamu consonant */
       
    57             static const int LeadingBase = 0x1100;
       
    58             
       
    59             /** First vowel  */
       
    60             static const int VowelBase = 0x1161;
       
    61             
       
    62             /** First trailing Jamu consonant */
       
    63             static const int TrailingBase = 0x11A7;
       
    64             
       
    65             /** Leading consonants count */
       
    66             static const int LeadingCount = 19;
       
    67             
       
    68             /** Vowel count */
       
    69             static const int VowelCount = 21;
       
    70             
       
    71             /** Trailing consonant count */
       
    72             static const int TrailingCount = 28;
       
    73             
       
    74             /** 
       
    75              * Amount of syllables that are composed of a leading 
       
    76              * consonant and a vowel 
       
    77              */
       
    78             static const int LvSyllableCount = VowelCount * TrailingCount; // 588
       
    79             
       
    80             /**
       
    81              * Amount of syllables that are composed of a leading
       
    82              * consonant, a vowel and a trailing consonant 
       
    83              */
       
    84             static const int SyllableCount = LvSyllableCount * TrailingCount; // 11172 
       
    85         }
       
    86         
       
    87         /**
       
    88          * Decomposes hangul syllable into jamu alphabets
       
    89          */
       
    90         template<typename Output>
       
    91         void DecomposeHangul(Output out, int c) {
       
    92             using namespace hangul;
       
    93             int sindex = c - SyllableBase;
       
    94             if (sindex < 0 || sindex >= SyllableCount) {
       
    95                 out<<c<<'\0';
       
    96             } else {
       
    97                 // Leading
       
    98                 out<<(LeadingBase + sindex / LvSyllableCount);
       
    99                 // Vocal
       
   100                 out<<(VowelBase + (sindex % LvSyllableCount) / TrailingCount);
       
   101                 // Trailing (voluntary)
       
   102                 int toffset = sindex % TrailingCount;
       
   103                 if (toffset) out<<(TrailingBase + toffset);
       
   104                 // Finish
       
   105                 out<<'\0';
       
   106             }
       
   107         }
       
   108         
       
   109         /**
       
   110          * Composes encountered jamu alphabets into hangul syllable.
       
   111          * Moves given iterator over the consumed unicode character.
       
   112          */
       
   113         template<typename Iterator> 
       
   114         int ConsumeComposedJamu(Iterator& i) {
       
   115             using namespace hangul;
       
   116             int c = *i; ++i;
       
   117             int lindex = c - LeadingBase;
       
   118             if (0 <= lindex && lindex < LeadingCount) {
       
   119                 int vindex = *i - VowelBase;
       
   120                 if (0 <= vindex && vindex < VowelCount) {
       
   121                     ++i;
       
   122                     int tindex = *i - TrailingBase;
       
   123                     c = (SyllableBase + (lindex * VowelCount + vindex) * TrailingCount);
       
   124                     if (0 <= tindex && tindex < TrailingCount) {
       
   125                         ++i;
       
   126                         c += tindex;
       
   127                     } 
       
   128                 }
       
   129             }
       
   130             return c;
       
   131         }
       
   132         
       
   133         /**
       
   134          * Composes encountered jamu alphabets into hangul syllable.
       
   135          */
       
   136         template<typename Iterator> 
       
   137         inline int ComposeJamu(Iterator i) {
       
   138         	return ConsumeComposedJamu(i); 
       
   139         }        
       
   140 	}
       
   141 	
       
   142 	namespace tiny { // tiny analysis
       
   143 	
       
   144         using namespace analysis::unicode::hangul;
       
   145         using namespace analysis::unicode;
       
   146 	
       
   147         /**
       
   148          * Composes encountered Hangul Jamu characters into 
       
   149          * Hangul syllables.  
       
   150          */
       
   151         template <typename Iterator> 
       
   152         struct HangulIterator {
       
   153             public:
       
   154                 HangulIterator() : i_(), c_(), offset_(0) {}
       
   155                 HangulIterator(Iterator i) : i_(i) {
       
   156                     ++(*this); // populate c_
       
   157                 }
       
   158                 int operator*() {
       
   159                     return c_;
       
   160                 }
       
   161                 operator int() {
       
   162                     return offset_;
       
   163                 }
       
   164                 HangulIterator& operator++() {
       
   165                     offset_ = i_;
       
   166                     c_ = ConsumeComposedJamu(i_);
       
   167                     return *this;
       
   168                 }
       
   169             private:
       
   170                 Iterator i_;
       
   171                 int c_;
       
   172                 int offset_;
       
   173         };
       
   174         
       
   175         /**
       
   176          * Decomposes encountered Hangul syllables into 
       
   177          * Hangul Jamu characters  
       
   178          */
       
   179         template <typename Iterator> 
       
   180         struct JamuIterator {
       
   181             public:
       
   182                 JamuIterator() : i_(), b_(0), offset_(0) { buf_[0];}
       
   183                 JamuIterator(Iterator i) : i_(i), b_(0) {
       
   184                     buf_[1] = '\0';
       
   185                     ++(*this); // populate buffer
       
   186                 }
       
   187                 int operator*() {
       
   188                     return buf_[b_];
       
   189                 }
       
   190                 JamuIterator& operator++() {
       
   191                     offset_ = i_;
       
   192                     if (!buf_[++b_]) {
       
   193                         b_ = 0; // reset buf
       
   194                         tiny::IteratorOutput<int*> out(buf_);
       
   195                         DecomposeHangul(out, *i_); ++i_;
       
   196                     }
       
   197                     return *this;
       
   198                 }
       
   199                 operator int() {
       
   200                     return offset_;
       
   201                 }
       
   202             private:
       
   203                 Iterator i_;
       
   204                 int buf_[4];
       
   205                 int b_;
       
   206                 int offset_;
       
   207 
       
   208         };
       
   209 	}
       
   210 }
       
   211 
       
   212 
       
   213 #endif /* UNICODEINFO_H_ */