searchengine/oss/loc/analysis/inc/public/tinyiterator.h
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Tue, 06 Jul 2010 15:30:04 +0300
changeset 10 afe194b6b1cd
permissions -rw-r--r--
Revision: 201025 Kit: 2010127

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#ifndef TINYITERATOR_H_
#define TINYITERATOR_H_

#include <exception>

namespace lucene {
    namespace util {
        class Reader; 
    }
}
namespace analysis {

	 /**
	  * This package provides basic 
	  */
     namespace tiny {
      
        /*
         *
         * Meta code for describing the iterator concept used here.
         * Used to iterate character streams. 
         * Follows closely the STL forward iterator.
         * Note: Comparisons of form x < y may not work properly.
         * Note: x - y will not provide distance in characters, 
         * but instead offset distance in the original text.  
         * 
         * Only operators of form ++i is provided. --i is not provided
         * because complications with utf16 or unicode decomposition/ 
         * composition. i++ would lead to innecessary code.
         * 
         
        concept Iterator {
           
		   // Accessor to the iterator character
           wchar_t operator*();     // 16 bit unicode
           OR
           int operator*();         // 32 bit unicode
       
           // Next location
		   Iterator& operator++();
		   
		   // Returns offset in the original text. Note that one character 
		   // may be transformed into a number of characters. This means 
		   // that comparisons of form (int)i < (int)j are unreliable and
		   // should not be used. Also lengths i - j are unreliable. With korean 
		   // i - j may be produce length 2, but iterating for (;i<j;++i); may
		   // iterate throught e.g. 6 characters. Or, with 32 bit unicode,
		   // length 2 may contain only one character. 
           operator int(); 
        
        };
         
         */
     
		
        /**
         * Iterates throught some T that provides array/pointer like interface
         */
        template <typename T>
        struct ArrayIterator {
            public:
                inline ArrayIterator(T& array, int i) : array_(&array), i_(i) {}
                inline ArrayIterator(T& array) : array_(&array), i_(0) {}
                inline ArrayIterator() : array_(0), i_(0) {}
                
                inline wchar_t operator*() const { return (*array_)[i_]; };  
                inline ArrayIterator<T>& operator++() { i_++; return *this; };  
                inline operator int() { return i_; }
            private: 
                T* array_;
                int i_;
                
        };

        /**
         * Iterates from starting position to up to length characters. 
         */
        template <typename Iterator>
        struct RangeIterator {
            public:
                RangeIterator(Iterator& begin, int length) : i_(begin), left_(length) {}
                inline int operator*() { return left_ ? *i_: '\0'; }
                inline RangeIterator& operator++() { 
                    if (left_ ) { 
                        ++i_; left_--;
                    }
                    return *this;
                }
                inline operator int() { return i_; }
            private: 
                Iterator i_;
                int left_; 
        };

        /**
         * Turns iterator into a C++ stream. Allows out<<'c'<<'\0'; 
         * kind of syntax to be used with iterators.  
         */
        template <typename Iterator>
        struct IteratorOutput {
            public:
                IteratorOutput(Iterator i) : i_(i) {}
                template <typename T>
                inline IteratorOutput& operator<<(T t) {
                    *i_= t;
                    ++i_;
                    return *this;
                }
            private:
                Iterator i_;
        };
        
        /**
         * CLucene IO support
         */
        namespace cl {
        
			/**
			 * Informs that caller has attempted to read a location
			 * from the reader source, that is no more stored in the 
			 * buffer.
			 */
			class TooOldIndexException : public std::exception {
            public: 
                const char* what() const;
            };
            
            /**
             * Provides buffer & array like interface to be used with
             * CLucene readers. If reader r provides access to file X, 
             * and we have buf(r), we can sort of 'random access' file 
             * X with buf[0], buf[X], buf[Z+3] syntaxes. Still, the buffer
             * is of limited size. There is always the most recent location
             * L that is read. Trying to access buf[L-SIZE-1] will raise 
             * exception, where SIZE is the buffer size. 
             */
            template<int SIZE>
            class ReaderBuffer {
                public:
					/** Iterator for iterating the underlying source */
                    typedef ArrayIterator<ReaderBuffer> iterator;
                    /* Constructs buffer for a reader reading some source. */
                    ReaderBuffer(lucene::util::Reader& reader);
                    /** Returns character at location i  */
                    wchar_t operator[](int i);
                    /** Returns iterator pointing to location i */
                    inline iterator at(int i);
                    /** Returns iterator pointing to the beginning of character source */
                    inline iterator begin();
                private:
                    /** Rotating buffer. */
                    wchar_t buf_[SIZE]; 
                    /** How many characters have been read from reader */
                    int read_; 
                    /** Points to the next character to be overwritten in buffer */
                    int cut_; 
                    /** Index of oldest character inside the original source */
                    int offset_; 
                    /** Reader reading original source */
                    lucene::util::Reader& reader_;
            };
            
        }

    }

}

#endif /* TINYITERATOR_H_ */