searchengine/oss/loc/analysis/inc/public/tinyutf16.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
#ifndef TINYUTF16_H_
hgs
parents:
diff changeset
    18
#define TINYUTF16_H_
hgs
parents:
diff changeset
    19
hgs
parents:
diff changeset
    20
#include "tinyiterator.h"
hgs
parents:
diff changeset
    21
hgs
parents:
diff changeset
    22
namespace analysis {
hgs
parents:
diff changeset
    23
hgs
parents:
diff changeset
    24
    namespace tiny {
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
		/**
		 * Encodes the given unicode character as utf16 and stores the
		 * resulting utf16 code units in the output stream.
		 *
		 * Characters below 0x10000 are written as a single code unit.
		 * Characters at or above 0x10000 are written as a surrogate pair:
		 * a high surrogate (0xD800 + upper 10 bits) followed by a low
		 * surrogate (0xDC00 + lower 10 bits) of (c - 0x10000).
		 *
		 * @param out any object accepting wchar_t values through operator<<
		 * @param c   unicode character to encode; no range validation is
		 *            performed on it
		 */
        template <typename Stream>
        void utf16put(Stream& out, int c) {
            if ( c >= 0x00010000L ) {
                c -= 0x00010000L;
                // High (lead) surrogate carries the upper 10 bits.
                out<<static_cast<wchar_t>(0xd800 + ((c >> 10) & 0x03ffL));
                // Low (trail) surrogate carries the lower 10 bits; its base
                // is 0xDC00, not 0xD800.
                out<<static_cast<wchar_t>(0xdc00 + (c & 0x03ffL));
            } else {
                out<<static_cast<wchar_t>(c);
            }
        }
hgs
parents:
diff changeset
    40
hgs
parents:
diff changeset
    41
        /**
         * Adapter that accepts unicode characters and forwards them,
         * encoded as utf16 code units, to the wrapped output by calling
         * utf16put().
         *
         * The output is held by value: the constructor stores a copy of
         * its argument and all writes go to that copy.
         */
        template <typename Output> 
        struct Utf16Output {
            public:
                /** Stores a copy of the given output. */
                Utf16Output(const Output& out) : out_(out) {}

                /** Encodes one unicode character as utf16 into the output. */
                inline Utf16Output& operator<<(int c) {
                    utf16put(out_, c);
                    return *this;
                }

                /** Same as operator<<(int); c is converted to int first. */
                inline Utf16Output& operator<<(wchar_t c) {
                    return (*this)<<static_cast<int>(c);
                }

                /**
                 * Writes exactly length characters read from source.
                 * Zero values are written like any other character.
                 *
                 * @param source iterator positioned at the first character
                 * @param length number of characters to write
                 */
                template <typename I>
                Utf16Output& write(I source, int length) {
                    for (int i = 0; i < length; ++i, ++source) {
                        // Dereference here: streaming the iterator itself
                        // would pick the sequence overload below and emit
                        // the whole zero-terminated tail on every step.
                        (*this)<<*source;
                    }
                    return *this;
                }

                /**
                 * Writes the characters read from source until a zero
                 * value is reached; the zero itself is not written.
                 */
                template <typename I>
                Utf16Output& operator<<(I source) {
                     for (;*source; ++source) {
                         (*this)<<*source;
                     }
                     return *this;
                 }
            private:
                Output out_; // copy of the output given to the constructor
        };
hgs
parents:
diff changeset
    74
        
hgs
parents:
diff changeset
    75
        /**
hgs
parents:
diff changeset
    76
         * Writes unicode characters into the given iterator as utf16 codes
hgs
parents:
diff changeset
    77
         */
hgs
parents:
diff changeset
    78
        template <typename Iterator> 
hgs
parents:
diff changeset
    79
        struct Utf16Writer : public Utf16Output<IteratorOutput<Iterator> > {
hgs
parents:
diff changeset
    80
            public:
hgs
parents:
diff changeset
    81
                Utf16Writer(Iterator i) : Utf16Output<IteratorOutput<Iterator> >(IteratorOutput<Iterator>(i)) {}
hgs
parents:
diff changeset
    82
            };
hgs
parents:
diff changeset
    83
                
hgs
parents:
diff changeset
    84
        /**
         * Counts how many utf16 code units are needed to encode the
         * zero-terminated sequence of unicode characters read from i.
         * Characters at or above 0x10000 count as two units, all others
         * as one; the terminating zero is not counted.
         */
        template<typename Iterator>
        int utf16size(Iterator i) {
            int units = 0;
            while (*i) {
                if (*i >= 0x10000) {
                    units += 2;
                } else {
                    units += 1;
                }
                ++i;
            }
            return units;
        }
hgs
parents:
diff changeset
    96
        
hgs
parents:
diff changeset
    97
        /**
         * Reads utf16 code units from the given iterator and presents
         * them as unicode characters. A high surrogate (0xD800..0xDBFF)
         * immediately followed by a low surrogate (0xDC00..0xDFFF) is
         * combined into one character at or above 0x10000. Any other
         * code unit, including an unpaired surrogate, is passed through
         * unchanged and consumes exactly one unit.
         *
         * As used here, Iterator must support operator* (value
         * convertible to int), pre-increment, and conversion to int; the
         * converted value is what operator int() of this class reports.
         */
        template <typename Iterator> 
        struct Utf16Iterator {
            public:
                /** Starts reading at i and decodes the first character. */
                Utf16Iterator(Iterator i) : i_(i), c_(0), offset_(0) { 
                    operator++(); // cache first character
                }
                /** Default state: default-constructed iterator, character 0. */
                Utf16Iterator() : i_(), c_(0), offset_(0) {}

                /** Returns the currently cached unicode character. */
                inline int operator*() const {
                    return c_;
                }

                /** Decodes the next character and caches it. */
                Utf16Iterator& operator++() {
                    // Remember where the character about to be read starts.
                    offset_ = i_;
                    c_ = *i_; ++i_;
                    // Only a high surrogate can start a pair.
                    if ( c_ >= 0xd800 && c_ <= 0xdbff ) {
                        // Peek at the next unit; consume it only when it
                        // really is the low half. Otherwise it is left for
                        // the next call, so neither a following character
                        // nor the zero terminator gets swallowed.
                        const int c2 = *i_;
                        if ( c2 >= 0xdc00 && c2 <= 0xdfff ) {
                            ++i_;
                            c_ = (((c_ & 0x03ffL) << 10) | (c2 & 0x03ffL)) + 0x00010000L;
                        }
                    }
                    return *this;
                }

                /**
                 * Returns the underlying iterator's int value as it was
                 * just before the current character was read.
                 */
                operator int() const {return offset_;}
            private:
                Iterator i_; // positioned after the cached character
                int c_; // current unicode character, cached
                int offset_; // int value of i_ before c_ was read
        };
hgs
parents:
diff changeset
   128
        
hgs
parents:
diff changeset
   129
        /**
         * Builds a wstring from the zero-terminated sequence of unicode
         * characters read from i. Each character is encoded with
         * utf16put(); the terminating zero is not included.
         */
        template<class Iterator> 
        std::wstring utf16str(Iterator i) {
            std::wostringstream buffer;
            for (; *i; ++i) {
                utf16put(buffer, *i);
            }
            return buffer.str();
        }
hgs
parents:
diff changeset
   141
        
hgs
parents:
diff changeset
   142
    }
hgs
parents:
diff changeset
   143
}
hgs
parents:
diff changeset
   144
hgs
parents:
diff changeset
   145
#endif /* TINYUTF16_H_ */