searchengine/oss/loc/analysis/inc/public/tinyanalysis.h
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 #ifndef TINYANALYSIS_H_
       
    19 #define TINYANALYSIS_H_
       
    20 
       
    21 #include <string>
       
    22 #include <sstream>
       
    23 
       
    24 #include "tinyutf16.h"
       
    25 #include "wctype.h"
       
    26 
       
    27 /*
       
    28  * This file contains template based tokenization utilities. There 
       
    29  * are following rationales for this package: 
       
    30  * 
       
    31  *    * More flexibility was needed for various CJK analyzers.
       
    32  *     
       
 *       -> CLucene tokenizers are difficult to make work 
 *          together well. For example in practice you cannot use a 
 *          generic n-gram tokenizer for CJK and the standard tokenizer
 *          for non-CJK. This cannot be done in CLucene without
 *          making it a very, very heavy operation. 
       
    38  *          
       
    39  *    * More flexibility was needed on the character reading level.
       
    40  *    
       
 *        * It is possible to encounter Unicode characters that don't 
 *          fit in a single 16 bit code unit when dealing with Chinese 
 *          and Japanese. For this reason, CJK text should be read as 
 *          full Unicode code points instead of individual 16 bit code units.
       
    45  *           
       
 *        * Also with Korean, there is an alphabetic (Hangul Jamo) and 
 *          a syllabic writing form (Hangul Syllables). The same text can be 
 *          expressed in either of these forms. For good behavior (and 
 *          some UX reasons), it was necessary to convert all encountered 
 *          text into one of these forms, so that text written in Jamo 
 *          could be found with Hangul Syllables and vice versa.
       
    52  * 
       
    53  * This package fulfills both of these requirements in a very speed
       
    54  * efficient way. Tokenizers can be easily combined to form a sort of 
       
    55  * 'aggregated tokenizer'. This kind of combination is supported by design
       
    56  * and done with PairTokenizer class. 
       
    57  *  
       
    58  * The ability to switch the way text is read on fly is supported by
       
    59  * having the reading done by rather abstract iterators. 
       
    60  * 
       
 * Performance is taken into account by having heavily used iterators
 * resolved at compile time by making the iterator type a template
 * parameter. Lots of inlines are used, but perhaps the biggest
 * optimization of all is that instead of extracted tokens holding the
 * string inside, tokens simply hold references (in the form of an
 * iterator) into the original character 
 * buffer. So there is no heap usage, look-ups or string copying.
       
    67  * 
       
    68  * NOTE: Iterators may be surprisingly big objects. While wchar_t*
       
    69  * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>>
       
    70  * is already 24 bytes. This size could be reduced to 8 bytes, but
       
    71  * it would bring performance implications. So copying of iterators
       
    72  * may be expensive. 
       
    73  * 
       
    74  * The design shown in here is actually very nice, flexible, simplistic, 
       
    75  * fast and uses very little memory. The same design could be used
       
    76  * e.g. for lexical analysis code. 
       
    77  */
       
    78 
       
    79 namespace lucene {
       
    80     namespace analysis {
       
    81         class Token;
       
    82     }
       
    83 }
       
    84 
       
    85 namespace analysis {
       
    86 
       
    87 	
       
    88     namespace tiny {
       
    89 
       
    90 		/**
       
    91 		 * Token is object, which identifies some sequence of characters in
       
    92 		 * the original text stream. Holds iterator to the beginning of the 
       
    93 		 * token and holds information of the tokens length. The length 
       
    94 		 * is always the amount of unicode characters in the token. 
       
    95 		 */
       
    96 		template <typename Iterator>
       
    97 		struct Token {
       
    98 			
       
    99 			typedef RangeIterator<Iterator> iter;
       
   100 			
       
   101 			Token() : begin_(), length_() {}
       
   102 			Token(Iterator& begin, int length) : begin_(begin), length_(length) {}
       
   103 			
       
   104 			/** Length in unicode characters */
       
   105             inline int length() { return length_; };
       
   106             
       
   107             /** Gives iterator, that iterates over this token's characters */
       
   108             iter iterator() {
       
   109 				return iter(begin_, length_); 
       
   110 			}
       
   111             /** Informs, whether this token is non-empty */
       
   112 			operator bool() {
       
   113 				return length_;
       
   114 			}
       
   115 			/** Text size in 16 bit codewords */
       
   116 			int utf16size() {
       
   117 				return analysis::tiny::utf16size(iterator()); 
       
   118 			}
       
   119 			/** Copy text as 16 bit codewords */
       
   120             void utf16(wchar_t* buf) {
       
   121                 Utf16Writer<wchar_t*>(buf)<<iterator()<<L'\0';
       
   122             }
       
   123 			/** Copy text as 16 bit codewords */
       
   124             std::wstring utf16() {
       
   125                 return utf16str(iterator());
       
   126             }
       
   127             /** Copy this token content to the Clucene token.*/
       
   128             void copyTo(lucene::analysis::Token* token);
       
   129 		private: 
       
   130 			Iterator begin_;
       
   131 			int length_; 
       
   132 		};
       
   133 
       
   134 		typedef int (*Acceptor)(int c); 
       
   135 		
       
   136 		/** Skips all characters, that are accepted by the acceptor */
       
   137 		template <class Iterator, typename Acceptor> 
       
   138 		inline int skip(Iterator& i, Acceptor accept) {
       
   139 			int ret = 0; 
       
   140 			while ( *i && accept( *i ) ) { ++i; ret++; }
       
   141 			return ret; 
       
   142 		}
       
   143 
       
   144 		/** Skips all characters, that are not accepted by the acceptor */
       
   145 		template <class Iterator, typename Acceptor> 
       
   146 		inline int skipbut(Iterator& i, Acceptor accept) {
       
   147 			int ret = 0; 
       
   148 			while ( *i && !accept( *i ) ) { ++i; ret++; }
       
   149 			return ret; 
       
   150 		}
       
   151 
       
   152 		/** Consumes a token consisting of all characters accepted by the acceptor */
       
   153 		template <class Iterator, typename Acceptor> 
       
   154 		Token<Iterator> consume(Iterator& i, Acceptor accept) {
       
   155 			Iterator begin = i;
       
   156 			return Token<Iterator>( begin,  skip(i, accept) ); 
       
   157 		}
       
   158 		
       
   159 		/** Abstract base class for tokenizers */
       
   160         template <class Iterator>
       
   161 		class Tokenizer {
       
   162             public:
       
   163                 virtual void reset() {};
       
   164                 virtual Token<Iterator> consume(Iterator& i) = 0;
       
   165 		};
       
   166 
       
   167 		/** Consumes as accepted by the acceptor */
       
   168 		template <class Iterator>
       
   169 		class CustomTokenizer : public Tokenizer<Iterator> {
       
   170 			public:
       
   171 				CustomTokenizer(Acceptor accept) : accept_(accept) {}
       
   172 				Token<Iterator> consume(Iterator& i) {
       
   173 					return ::analysis::tiny::consume(i, accept_);  
       
   174 				}
       
   175 			private: 
       
   176 				Acceptor accept_; 
       
   177 		};
       
   178 		
       
   179 		/** 
       
   180 		 * NGram tokenizer. Tokenizers NGram from any character sequence accepted 
       
   181 		 * by acceptor. This class maintains internal state. It consumes either
       
   182 		 * fully sized ngrams or entire word, if the word is smaller than defined
       
   183 		 * ngram size. 
       
   184 		 */
       
   185 		template <class Iterator>
       
   186 		class NGramTokenizer : public Tokenizer<Iterator> {
       
   187 			public:
       
   188 				NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
       
   189 				NGramTokenizer(int size) : size_(size), accept_(&iswalpha) {}
       
   190 				void reset() { continue_ = false; }
       
   191 				Token<Iterator> consume(Iterator& i) {
       
   192 					if ( *i ) {
       
   193 						Iterator end = i;
       
   194 						int l = 0;
       
   195 						while (l < size_ && *end && accept_( *end )) { l++; ++end; }
       
   196 						if (l == size_ || (!continue_ && l)) {
       
   197 							// properly sized token or whole word
       
   198 							Token<Iterator> t(i, l);
       
   199 							continue_ = true; 
       
   200 							++i;
       
   201 							return t;
       
   202 						} 
       
   203 					}
       
   204 					continue_ = false;
       
   205 					return Token<Iterator>(i, 0); 
       
   206 				}
       
   207 			private: 
       
   208 				int size_; 
       
   209 				Acceptor accept_;
       
   210 				bool continue_;
       
   211 		};
       
   212 
       
   213 		/**
       
   214 		 * Tokenizer, that returns ALWAYS a token, unless EOS is 
       
   215 		 * reached. If the tokenizer given to this tokenizer fails, 
       
   216 		 * relaxed tokenizer just moves one position further and 
       
   217 		 * tries again. 
       
   218 		 */
       
   219 	    template <typename I> 
       
   220 	    class RelaxedTokenizer : public Tokenizer<I> {
       
   221             public: 
       
   222 				/** Uses given tokenizer to extract tokens.  */
       
   223                 RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
       
   224                 void reset() {t_.reset();}
       
   225                 /** 
       
   226                  * Always returns a token. If tokenization fails,
       
   227                  * moves forward a character and tries again. 
       
   228                  */
       
   229                 Token<I> consume(I& i) {
       
   230                     Token<I> t;
       
   231                     while (*i && !t) {
       
   232                         t = t_.consume(i);
       
   233                         if (!t) {
       
   234                             ++i; t_.reset();
       
   235                         }
       
   236                     }
       
   237                     return t;
       
   238                 }
       
   239             private: 
       
   240                 Tokenizer<I>& t_;
       
   241 	    };
       
   242 	    
       
   243 	    /**
       
   244 	     * Tries to first tokenize with the first tokenizer, but if it 
       
   245 	     * fails, the second tokenizer is tried. If first tokenizer fails, 
       
   246 	     * it is reset.  
       
   247 	     */
       
   248 		template <typename I> 
       
   249 		class PairTokenizer : public Tokenizer<I>{
       
   250 		    public:
       
   251 		        PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
       
   252 		        void reset() {
       
   253                     t1_.reset();
       
   254                     t2_.reset();
       
   255                 }
       
   256 	            /**
       
   257 	             * Attempts to tokenizer with first tokenizer, then 
       
   258 	             * with second. If both tokenizers fail, empty 
       
   259 	             * token is returned. 
       
   260 	             */
       
   261 	 		    Token<I> consume(I& i) {
       
   262                     Token<I> t( t1_.consume( i ) );
       
   263                     if ( !t ) {
       
   264                         t1_.reset(); 
       
   265                         t = t2_.consume( i );
       
   266                     }   
       
   267                     return t;
       
   268                 }
       
   269 		    private:
       
   270                 Tokenizer<I>& t1_; 
       
   271                 Tokenizer<I>& t2_;
       
   272 		};
       
   273 
       
   274 	}
       
   275     
       
   276 }
       
   277 
       
#endif /* TINYANALYSIS_H_ */