searchengine/oss/loc/analysis/inc/public/tinyutf16.h
changeset 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/tinyutf16.h	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,145 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: 
+*
+*/
+#ifndef TINYUTF16_H_
+#define TINYUTF16_H_
+
+#include "tinyiterator.h"
+
+namespace analysis {
+
+    namespace tiny {
+
+		/**
+		 * Translates given unicode character as utf16 and 
+		 * stores utf16 codes in the output stream
+		 */
+        template <typename Stream>
+        void utf16put(Stream& out, int c) {
+            if ( c >= 0x00010000L ) {
+                c -= 0x00010000L;
+                out<<(wchar_t)(0xd800 + ((c >> 10) & 0x03ffL));
+                out<<(wchar_t)(0xd800 + (c & 0x03ffL));
+            } else {
+                out<<(wchar_t)(c);
+            }
+        }
+
+        /**
+         * Writes unicode characters into the output 
+         * stream as utf16 codes. 
+         */
+        template <typename Output> 
+        struct Utf16Output {
+            public:
+                Utf16Output(const Output& out) : out_(out) {};
+                inline Utf16Output& operator<<(int c) {
+                    utf16put(out_, c);
+                    return *this;
+                }
+
+                inline Utf16Output& operator<<(wchar_t c) {
+                    return (*this)<<(int)c;
+                }
+                template <typename I>
+                Utf16Output& write(I source, int length) {
+                    for (int i = 0; i < length; i++) { 
+                        (*this)<<source; ++source;
+                    }
+                    return *this;
+                }
+                template <typename I>
+                Utf16Output& operator<<(I source) {
+                     for (;*source; ++source) {
+                         (*this)<<*source;
+                     }
+                     return *this;
+                 }
+            private:
+                Output out_;
+        };
+        
+        /**
+         * Writes unicode characters into the given iterator as utf16 codes
+         */
+        template <typename Iterator> 
+        struct Utf16Writer : public Utf16Output<IteratorOutput<Iterator> > {
+            public:
+                Utf16Writer(Iterator i) : Utf16Output<IteratorOutput<Iterator> >(IteratorOutput<Iterator>(i)) {}
+            };
+                
+        /** 
+         * Calculates the size of all characters with the iterator as utf16 
+         * code points
+         */
+        template<typename Iterator>
+        int utf16size(Iterator i) {
+            int rv = 0;
+            for (;*i; ++i) {
+                rv += (*i >= 0x10000 ? 2 : 1);
+            }
+            return rv;
+        }
+        
+        /**
+         * Reads utf16 code points from given iterator and translates them 
+         * as unicode characters.  
+         */
+        template <typename Iterator> 
+        struct Utf16Iterator {
+            public:
+                Utf16Iterator(Iterator i) : i_(i) { 
+                    operator++(); // cache first character
+                }
+                Utf16Iterator() : i_(), c_(0), offset_(0) {}
+                inline int operator*() const {
+                    return c_;
+                };  
+                Utf16Iterator& operator++() {
+                    offset_ = i_;
+                    c_ = *i_; ++i_;
+                    if ( c_ >= 0xd800 && c_ <= 0xdfff ) {
+                        int c2 = *i_; ++i_;
+                        if ( c2 >= 0xdc00 && c2 <= 0xdfff ){
+                            c_ = (((c_ & 0x03ffL) << 10) | ((c2 & 0x03ffL) << 0)) + 0x00010000L;
+                        }
+                    }
+                    return *this;
+                }
+                operator int() {return offset_;}
+            private:
+                Iterator i_;
+                int c_; // current utf cached
+                int offset_; // characters read
+        };
+        
+        /**
+         * Copies the iterator content into a wstring
+         */
+        template<class Iterator> 
+        std::wstring utf16str(Iterator i) {
+            std::wostringstream ret;
+            while (*i) {
+                utf16put(ret, *i);
+                ++i;
+            }
+            return ret.str();
+        }
+        
+    }
+}
+
+#endif /* TINYUTF16_H_ */