searchengine/oss/loc/analysis/inc/public/tinyutf16.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
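    14 * Description: Helpers for writing Unicode characters as UTF-16 code units and for reading UTF-16 code units back as Unicode characters.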
       
    15 *
       
    16 */
       
    17 #ifndef TINYUTF16_H_
       
    18 #define TINYUTF16_H_
       
    19 
       
    20 #include "tinyiterator.h"
       
    21 
       
    22 namespace analysis {
       
    23 
       
    24     namespace tiny {
       
    25 
       
    /**
     * Encodes one Unicode code point as UTF-16 and writes the resulting
     * code unit(s) to the output stream.
     *
     * Code points of 0x10000 and above are written as a surrogate pair:
     * a high surrogate (base 0xD800) carrying the upper 10 bits of
     * (c - 0x10000), followed by a low surrogate (base 0xDC00) carrying the
     * lower 10 bits. Smaller values are written unchanged as one code unit.
     *
     * @param out  any object that accepts a wchar_t through operator<<
     * @param c    code point to encode; the value is not validated, and only
     *             the low 20 bits of (c - 0x10000) end up in a surrogate pair
     */
    template <typename Stream>
    void utf16put(Stream& out, int c) {
        if ( c >= 0x00010000L ) {
            c -= 0x00010000L;
            out << static_cast<wchar_t>(0xd800 + ((c >> 10) & 0x03ffL));
            // The trailing unit has to come from the low-surrogate range,
            // otherwise a decoder cannot recognise the two units as a pair.
            out << static_cast<wchar_t>(0xdc00 + (c & 0x03ffL));
        } else {
            out << static_cast<wchar_t>(c);
        }
    }
       
    40 
       
    /**
     * Output adapter that accepts Unicode code points and forwards them,
     * encoded as UTF-16 code units by utf16put(), to a wrapped output object.
     *
     * The wrapped Output is held by value (a copy of the constructor
     * argument) and must accept a wchar_t through operator<<.
     */
    template <typename Output>
    struct Utf16Output {
        public:
            /** Wraps a copy of the given output. */
            Utf16Output(const Output& out) : out_(out) {}

            /** Writes a single code point as UTF-16. */
            inline Utf16Output& operator<<(int c) {
                utf16put(out_, c);
                return *this;
            }

            /** Writes a single character; it is treated as a code point. */
            inline Utf16Output& operator<<(wchar_t c) {
                return (*this) << static_cast<int>(c);
            }

            /**
             * Writes exactly `length` code points read from `source`.
             *
             * @param source  iterator or pointer; dereferencing it must give
             *                a value convertible to int
             * @param length  number of elements to read and write
             */
            template <typename I>
            Utf16Output& write(I source, int length) {
                for (int i = 0; i < length; ++i, ++source) {
                    // Stream the element, not the iterator: passing the
                    // iterator itself would select the zero-terminated
                    // overload below and emit the whole sequence each time.
                    (*this) << static_cast<int>(*source);
                }
                return *this;
            }

            /**
             * Writes code points read from `source` until an element that
             * evaluates to zero is reached; that terminator is not written.
             */
            template <typename I>
            Utf16Output& operator<<(I source) {
                for (; *source; ++source) {
                    // The cast pins the single-code-point overload whatever
                    // the element type of the source is.
                    (*this) << static_cast<int>(*source);
                }
                return *this;
            }
        private:
            Output out_; // wrapped sink receiving the UTF-16 code units
    };
       
    74         
       
    75         /**
       
    76          * Writes unicode characters into the given iterator as utf16 codes
       
    77          */
       
    78         template <typename Iterator> 
       
    79         struct Utf16Writer : public Utf16Output<IteratorOutput<Iterator> > {
       
    80             public:
       
    81                 Utf16Writer(Iterator i) : Utf16Output<IteratorOutput<Iterator> >(IteratorOutput<Iterator>(i)) {}
       
    82             };
       
    83                 
       
    /**
     * Counts the UTF-16 code units needed to encode the zero-terminated
     * sequence of Unicode characters read from the iterator. Characters of
     * 0x10000 and above count as two units, all others as one; the
     * terminating zero is not counted.
     */
    template<typename Iterator>
    int utf16size(Iterator i) {
        int units = 0;
        while (*i) {
            if (*i >= 0x10000) {
                units += 2;
            } else {
                units += 1;
            }
            ++i;
        }
        return units;
    }
       
    96         
       
    /**
     * Reads UTF-16 code units from an underlying iterator and yields
     * Unicode characters.
     *
     * The current character is decoded ahead of time and cached, so
     * operator*() only returns the cached value. A high surrogate
     * (0xD800..0xDBFF) directly followed by a low surrogate (0xDC00..0xDFFF)
     * is combined into one character of 0x10000 or above. An unpaired
     * surrogate is returned as-is and the unit after it is left in place, so
     * neither a regular character nor the terminating zero is skipped.
     *
     * Converting the object to int gives the int value of the underlying
     * iterator as it was just before the current character was read; the
     * Iterator type therefore has to be convertible to int.
     */
    template <typename Iterator>
    struct Utf16Iterator {
        public:
            /** Starts reading at `i` and decodes the first character. */
            Utf16Iterator(Iterator i) : i_(i), c_(0), offset_(0) {
                operator++(); // cache first character
            }

            /** Creates an iterator whose current character is zero. */
            Utf16Iterator() : i_(), c_(0), offset_(0) {}

            /** Returns the cached current character. */
            inline int operator*() const {
                return c_;
            }

            /** Decodes the next character from the underlying iterator. */
            Utf16Iterator& operator++() {
                offset_ = i_;
                c_ = *i_; ++i_;
                if ( c_ >= 0xd800 && c_ <= 0xdbff ) {
                    // Only look at the next unit here; it is consumed just
                    // when it really completes the pair, so a lone surrogate
                    // cannot swallow the following unit or the terminator.
                    const int c2 = *i_;
                    if ( c2 >= 0xdc00 && c2 <= 0xdfff ) {
                        ++i_;
                        c_ = (((c_ & 0x03ffL) << 10) | (c2 & 0x03ffL)) + 0x00010000L;
                    }
                }
                return *this;
            }

            /** Position of the current character in the underlying input. */
            operator int() const { return offset_; }
        private:
            Iterator i_; // underlying iterator, positioned after the current character
            int c_; // current character, decoded and cached
            int offset_; // int value of i_ before the current character was read
    };
       
   128         
       
   /**
    * Builds a std::wstring from the zero-terminated sequence of Unicode
    * characters read from the iterator; each character is written with
    * utf16put(), the terminating zero is not included.
    */
   template<class Iterator>
   std::wstring utf16str(Iterator i) {
       std::wostringstream encoded;
       for (; *i; ++i) {
           utf16put(encoded, *i);
       }
       return encoded.str();
   }
       
   141         
       
   142     }
       
   143 }
       
   144 
       
   145 #endif /* TINYUTF16_H_ */