
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#ifndef TINYANALYSIS_H_
#define TINYANALYSIS_H_

#include <string>
#include <sstream>

#include "tinyutf16.h"
#include "wctype.h"

/*
 * This file contains template based tokenization utilities. The 
 * rationales for this package are the following: 
 * 
 *    * More flexibility was needed for various CJK analyzers.
 *     
 *       -> CLucene tokenizers are difficult to make work 
 *          together well. For example, in practice you cannot use a 
 *          generic n-gram tokenizer for CJK and the standard 
 *          tokenizer for non-CJK text; this cannot be done in CLucene 
 *          without making it a very, very heavy operation. 
 *          
 *    * More flexibility was needed at the character reading level.
 *    
 *        * When dealing with Chinese and Japanese, it is possible to 
 *          encounter unicode code points that don't fit in a single 
 *          16 bit character. For this reason, CJK text should be read 
 *          in unicode code point mode instead of as individual 16 bit 
 *          code units.
 *           
 *        * Also, Korean has an alphabetic writing form (Hangul Jamo) 
 *          and a syllabic writing form (Hangul Syllables). The same 
 *          text can be expressed in either of these forms. For good 
 *          behavior (and some UX reasons), it was necessary to convert 
 *          all encountered text into one of these forms, so that text 
 *          written in Jamo could be found with Hangul Syllables and 
 *          vice versa.
 * 
 * This package fulfills both of these requirements in a very 
 * efficient way. Tokenizers can be easily combined to form a sort of 
 * 'aggregated tokenizer'. This kind of combination is supported by 
 * design and is done with the PairTokenizer class. 
 *  
 * The ability to switch the way text is read on the fly is supported 
 * by having the reading done by rather abstract iterators. 
 * 
 * Performance is taken into account by having heavily used iterators
 * resolved at compile time by making the iterator a template 
 * parameter. Inlining is used a lot, but perhaps the biggest 
 * optimization of all is that instead of extracted tokens holding 
 * their string contents, tokenizers simply hold references (in the 
 * form of an iterator) into the original character buffer. So there 
 * is no heap usage, no look-ups and no string copying.
 * 
 * NOTE: Iterators may be surprisingly big objects. While a wchar_t*
 * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>>
 * is already 24 bytes. This size could be reduced to 8 bytes, but
 * doing so would have performance implications. So copying iterators
 * may be expensive. 
 * 
 * The design shown here is actually very nice: flexible, simple, 
 * fast, and it uses very little memory. The same design could be 
 * used e.g. for lexical analysis code. 
 */

namespace lucene {
    namespace analysis {
        class Token;
    }
}

namespace analysis {

	
    namespace tiny {

		/**
		 * A Token is an object that identifies a sequence of characters in
		 * the original text stream. It holds an iterator to the beginning 
		 * of the token and the token's length. The length is always the 
		 * number of unicode characters in the token. 
		 */
		template <typename Iterator>
		struct Token {
			
			typedef RangeIterator<Iterator> iter;
			
			Token() : begin_(), length_() {}
			Token(Iterator& begin, int length) : begin_(begin), length_(length) {}
			
			/** Length in unicode characters */
            inline int length() { return length_; }
            
            /** Gives iterator, that iterates over this token's characters */
            iter iterator() {
				return iter(begin_, length_); 
			}
            /** Informs whether this token is non-empty */
			operator bool() {
				return length_;
			}
			/** Text size in 16 bit codewords */
			int utf16size() {
				return analysis::tiny::utf16size(iterator()); 
			}
			/** Copy text as 16 bit codewords */
            void utf16(wchar_t* buf) {
                Utf16Writer<wchar_t*>(buf)<<iterator()<<L'\0';
            }
			/** Copy text as 16 bit codewords, returned as a wstring */
            std::wstring utf16() {
                return utf16str(iterator());
            }
            /** Copies this token's content into the given CLucene token. */
            void copyTo(lucene::analysis::Token* token);
		private: 
			Iterator begin_;
			int length_; 
		};
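
		/*
		 * A minimal sketch of reading a token's text back out. The raw
		 * wchar_t* iterator and the sample text are illustrative choices,
		 * not requirements of this API:
		 *
		 *   const wchar_t* p = L"hello";
		 *   Token<const wchar_t*> t(p, 5);   // token over "hello"
		 *   int n = t.utf16size();           // 5 (may exceed length() for
		 *                                    //    non-BMP text)
		 *   wchar_t buf[6];
		 *   t.utf16(buf);                    // null-terminated copy
		 *   std::wstring s = t.utf16();      // s == L"hello"
		 */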

		typedef int (*Acceptor)(int c); 
		
		/** Skips all characters that are accepted by the acceptor */
		template <class Iterator, typename Acceptor> 
		inline int skip(Iterator& i, Acceptor accept) {
			int ret = 0; 
			while ( *i && accept( *i ) ) { ++i; ret++; }
			return ret; 
		}

		/** Skips all characters that are not accepted by the acceptor */
		template <class Iterator, typename Acceptor> 
		inline int skipbut(Iterator& i, Acceptor accept) {
			int ret = 0; 
			while ( *i && !accept( *i ) ) { ++i; ret++; }
			return ret; 
		}

		/** Consumes a token consisting of all characters accepted by the acceptor */
		template <class Iterator, typename Acceptor> 
		Token<Iterator> consume(Iterator& i, Acceptor accept) {
			Iterator begin = i;
			return Token<Iterator>( begin,  skip(i, accept) ); 
		}
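
		/*
		 * A small sketch of the free functions above, using a plain
		 * wchar_t* as the iterator (an illustrative choice):
		 *
		 *   const wchar_t* p = L"foo bar";
		 *   Token<const wchar_t*> word = consume(p, &iswalpha); // "foo"
		 *   skipbut(p, &iswalpha);  // skip the separator; p now at "bar"
		 */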
		
		/** Abstract base class for tokenizers */
        template <class Iterator>
		class Tokenizer {
            public:
                virtual ~Tokenizer() {}
                virtual void reset() {}
                virtual Token<Iterator> consume(Iterator& i) = 0;
		};

		/** Consumes characters as accepted by the given acceptor */
		template <class Iterator>
		class CustomTokenizer : public Tokenizer<Iterator> {
			public:
				CustomTokenizer(Acceptor accept) : accept_(accept) {}
				Token<Iterator> consume(Iterator& i) {
					return ::analysis::tiny::consume(i, accept_);  
				}
			private: 
				Acceptor accept_; 
		};
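
		/*
		 * For example, a whole-word tokenizer (the iterator type and
		 * sample text are illustrative):
		 *
		 *   CustomTokenizer<const wchar_t*> words(&iswalpha);
		 *   const wchar_t* p = L"ab12";
		 *   Token<const wchar_t*> t = words.consume(p); // "ab"; p left at '1'
		 */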
		
		/** 
		 * NGram tokenizer. Tokenizes n-grams from any character sequence 
		 * accepted by the acceptor. This class maintains internal state. 
		 * It consumes either full-sized n-grams, or the entire word if 
		 * the word is smaller than the defined n-gram size. 
		 */
		template <class Iterator>
		class NGramTokenizer : public Tokenizer<Iterator> {
			public:
				NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
				NGramTokenizer(int size) : size_(size), accept_(&iswalpha), continue_(false) {}
				void reset() { continue_ = false; }
				Token<Iterator> consume(Iterator& i) {
					if ( *i ) {
						Iterator end = i;
						int l = 0;
						while (l < size_ && *end && accept_( *end )) { l++; ++end; }
						if (l == size_ || (!continue_ && l)) {
							// properly sized token or whole word
							Token<Iterator> t(i, l);
							continue_ = true; 
							++i;
							return t;
						} 
					}
					continue_ = false;
					return Token<Iterator>(i, 0); 
				}
			private: 
				int size_; 
				Acceptor accept_;
				bool continue_;
		};
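
		/*
		 * A short walk-through of the stateful consumption, assuming a
		 * plain wchar_t* iterator:
		 *
		 *   NGramTokenizer<const wchar_t*> bigrams(2); // iswalpha acceptor
		 *   const wchar_t* p = L"abc";
		 *   Token<const wchar_t*> t1 = bigrams.consume(p); // "ab"
		 *   Token<const wchar_t*> t2 = bigrams.consume(p); // "bc"
		 *   Token<const wchar_t*> t3 = bigrams.consume(p); // empty: word
		 *                                                  // exhausted
		 */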

		/**
		 * Tokenizer that ALWAYS returns a token, unless EOS is 
		 * reached. If the tokenizer given to this tokenizer fails, 
		 * the relaxed tokenizer just moves one position further and 
		 * tries again. 
		 */
	    template <typename I> 
	    class RelaxedTokenizer : public Tokenizer<I> {
            public: 
				/** Uses given tokenizer to extract tokens.  */
                RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
                void reset() {t_.reset();}
                /** 
                 * Always returns a token. If tokenization fails,
                 * moves forward a character and tries again. 
                 */
                Token<I> consume(I& i) {
                    Token<I> t;
                    while (*i && !t) {
                        t = t_.consume(i);
                        if (!t) {
                            ++i; t_.reset();
                        }
                    }
                    return t;
                }
            private: 
                Tokenizer<I>& t_;
	    };
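
	    /*
	     * For example, wrapping a word tokenizer so that punctuation is
	     * skipped instead of ending tokenization (illustrative):
	     *
	     *   CustomTokenizer<const wchar_t*> words(&iswalpha);
	     *   RelaxedTokenizer<const wchar_t*> relaxed(words);
	     *   const wchar_t* p = L"!!foo";
	     *   Token<const wchar_t*> t = relaxed.consume(p); // "foo"
	     */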
	    
	    /**
	     * Tries to tokenize with the first tokenizer; if it fails, 
	     * the first tokenizer is reset and the second tokenizer is 
	     * tried.  
	     */
		template <typename I> 
		class PairTokenizer : public Tokenizer<I>{
		    public:
		        PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
		        void reset() {
                    t1_.reset();
                    t2_.reset();
                }
	            /**
	             * Attempts to tokenize with the first tokenizer, then 
	             * with the second. If both tokenizers fail, an empty 
	             * token is returned. 
	             */
	            Token<I> consume(I& i) {
                    Token<I> t( t1_.consume( i ) );
                    if ( !t ) {
                        t1_.reset(); 
                        t = t2_.consume( i );
                    }   
                    return t;
                }
		    private:
                Tokenizer<I>& t1_; 
                Tokenizer<I>& t2_;
		};
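
		/*
		 * A sketch of the 'aggregated tokenizer' combination described at
		 * the top of this file: n-grams for CJK runs, whole words for
		 * everything else. The isCjk acceptor is a hypothetical
		 * placeholder; a real one would test the relevant unicode ranges:
		 *
		 *   // static int isCjk(int c) { ... }
		 *   NGramTokenizer<const wchar_t*>   cjk(2, &isCjk);
		 *   CustomTokenizer<const wchar_t*>  words(&iswalpha);
		 *   PairTokenizer<const wchar_t*>    pair(cjk, words);
		 *   RelaxedTokenizer<const wchar_t*> tokens(pair);
		 *
		 *   const wchar_t* p = ...; // input text
		 *   for (Token<const wchar_t*> t = tokens.consume(p); t;
		 *        t = tokens.consume(p)) {
		 *       // t is the next token; t.utf16() copies its text out
		 *   }
		 */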

	}
    
}

#endif /* TINYANALYSIS_H_ */