searchengine/oss/loc/analysis/inc/public/tinyutf16.h
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: Helpers for writing and reading Unicode characters as UTF-16.
*
*/
#ifndef TINYUTF16_H_
#define TINYUTF16_H_

#include "tinyiterator.h"

namespace analysis {

    namespace tiny {

		/**
		 * Encodes one Unicode character as UTF-16 and appends the resulting
		 * code unit(s) to the output stream.
		 *
		 * Characters at or above 0x10000 are written as a surrogate pair:
		 * a high (lead) surrogate in 0xd800..0xdbff followed by a low (trail)
		 * surrogate in 0xdc00..0xdfff. Every other value is written as a
		 * single wchar_t. The value is not validated in any other way.
		 *
		 * @param out any object that supports `out << wchar_t`
		 * @param c   the Unicode character to encode
		 */
        template <typename Stream>
        void utf16put(Stream& out, int c) {
            if ( c >= 0x00010000L ) {
                // Supplementary plane: split the 20 bits that remain after
                // removing the 0x10000 bias into two 10-bit halves.
                c -= 0x00010000L;
                // Upper 10 bits go into the high (lead) surrogate.
                out<<static_cast<wchar_t>(0xd800 + ((c >> 10) & 0x03ffL));
                // Lower 10 bits go into the low (trail) surrogate, whose
                // base is 0xdc00 (not 0xd800).
                out<<static_cast<wchar_t>(0xdc00 + (c & 0x03ffL));
            } else {
                out<<static_cast<wchar_t>(c);
            }
        }

        /**
         * Accepts Unicode characters and forwards them, encoded as UTF-16
         * code units by utf16put, to a wrapped output.
         *
         * The wrapped output is held by value: the constructor stores a copy
         * of its argument and all writes go to that copy.
         * NOTE(review): Output is presumably a lightweight handle (e.g. an
         * iterator wrapper) so that writing to the copy is observable by the
         * caller - confirm against the Output types actually used.
         */
        template <typename Output> 
        struct Utf16Output {
            public:
                /** Wraps a copy of the given output. */
                Utf16Output(const Output& out) : out_(out) {}

                /** Writes a single Unicode character. */
                inline Utf16Output& operator<<(int c) {
                    utf16put(out_, c);
                    return *this;
                }

                /** Writes a single character given as wchar_t. */
                inline Utf16Output& operator<<(wchar_t c) {
                    return (*this)<<(int)c;
                }

                /**
                 * Writes exactly `length` characters, read one by one from
                 * `source`. Unlike the iterator overload of operator<<, a zero
                 * character does not end the output.
                 */
                template <typename I>
                Utf16Output& write(I source, int length) {
                    for (int i = 0; i < length; ++i, ++source) {
                        // Stream the character the iterator points at. Streaming
                        // the iterator itself would select the templated
                        // operator<< below and emit the whole zero-terminated
                        // remainder on every pass.
                        (*this)<<*source;
                    }
                    return *this;
                }

                /**
                 * Writes every character read from `source` up to, but not
                 * including, the first zero character.
                 */
                template <typename I>
                Utf16Output& operator<<(I source) {
                    for (;*source; ++source) {
                        (*this)<<*source;
                    }
                    return *this;
                }
            private:
                Output out_; // copy of the output handed to the constructor
        };
        
        /**
         * Utf16Output specialised for an iterator destination: the iterator
         * handed to the constructor is wrapped in an IteratorOutput, and that
         * wrapper becomes the output of the Utf16Output base.
         * IteratorOutput is declared outside this file.
         */
        template <typename Iterator> 
        struct Utf16Writer : public Utf16Output<IteratorOutput<Iterator> > {
            private:
                typedef IteratorOutput<Iterator> SinkType;
                typedef Utf16Output<SinkType>    BaseType;
            public:
                Utf16Writer(Iterator i) : BaseType(SinkType(i)) {}
        };
                
        /**
         * Counts the UTF-16 code units needed to encode the characters read
         * from the iterator, up to (not including) the first zero character.
         * A character at or above 0x10000 counts as two units, any other
         * character as one.
         */
        template<typename Iterator>
        int utf16size(Iterator i) {
            int units = 0;
            while (*i) {
                if (*i >= 0x10000) {
                    units += 2; // needs a surrogate pair
                } else {
                    units += 1;
                }
                ++i;
            }
            return units;
        }
        
        /**
         * Reads UTF-16 code units from a wrapped iterator and presents them
         * as Unicode characters.
         *
         * A high (lead) surrogate in 0xd800..0xdbff that is immediately
         * followed by a low (trail) surrogate in 0xdc00..0xdfff is combined
         * into one character at or above 0x10000. Any other unit, including
         * an unpaired surrogate, is returned unchanged and the unit after it
         * is left in place for the next step.
         *
         * The wrapped Iterator must support `*it`, `++it` and conversion to
         * int. NOTE(review): that int is presumably the wrapped iterator's
         * position in its input - confirm against the iterator types used.
         */
        template <typename Iterator> 
        struct Utf16Iterator {
            public:
                /** Wraps the iterator and decodes the first character. */
                Utf16Iterator(Iterator i) : i_(i), c_(0), offset_(0) { 
                    operator++(); // cache first character
                }
                /** Creates an iterator whose current character is zero. */
                Utf16Iterator() : i_(), c_(0), offset_(0) {}

                /** Returns the current (already decoded) character. */
                inline int operator*() const {
                    return c_;
                }

                /** Decodes the next character from the wrapped iterator. */
                Utf16Iterator& operator++() {
                    // Remember where the character about to be read begins.
                    offset_ = i_;
                    c_ = *i_; ++i_;
                    // Only a lead surrogate can start a pair.
                    if ( c_ >= 0xd800 && c_ <= 0xdbff ) {
                        // Peek at the following unit; consume it only when it
                        // really is a trail surrogate, so that an unpaired
                        // lead never swallows the next character or the
                        // terminating zero.
                        const int c2 = *i_;
                        if ( c2 >= 0xdc00 && c2 <= 0xdfff ) {
                            ++i_;
                            c_ = (((c_ & 0x03ffL) << 10) | (c2 & 0x03ffL)) + 0x00010000L;
                        }
                    }
                    return *this;
                }

                /**
                 * Returns the wrapped iterator's int value as it was just
                 * before the current character was read.
                 */
                operator int() const {return offset_;}
            private:
                Iterator i_; // wrapped iterator, already past the current character
                int c_; // current character, cached
                int offset_; // int value of i_ taken before reading c_
        };
        
        /**
         * Builds a std::wstring holding the UTF-16 encoding of the characters
         * read from the iterator, up to (not including) the first zero
         * character. Each character is encoded with utf16put.
         * NOTE(review): std::wstring / std::wostringstream are not included
         * by this header directly; presumably they arrive via tinyiterator.h
         * - confirm.
         */
        template<class Iterator> 
        std::wstring utf16str(Iterator i) {
            std::wostringstream encoded;
            for (; *i; ++i) {
                utf16put(encoded, *i);
            }
            return encoded.str();
        }
        
    }
}

#endif /* TINYUTF16_H_ */