searchengine/oss/loc/analysis/inc/public/tinyanalysis.h
/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/

#ifndef TINYANALYSIS_H_
#define TINYANALYSIS_H_

#include <string>
#include <sstream>

#include "tinyutf16.h"
#include "wctype.h"

/*
 * This file contains template based tokenization utilities. The
 * rationale for this package is as follows:
 *
 *    * More flexibility was needed for the various CJK analyzers.
 *
 *       -> CLucene tokenizers are difficult to make work together
 *          well. For example, in practice you cannot use a generic
 *          n-gram tokenizer for CJK text and the standard tokenizer
 *          for non-CJK text. This cannot be done in CLucene without
 *          making it a very, very heavy operation.
 *
 *    * More flexibility was needed on the character reading level.
 *
 *        * When dealing with Chinese and Japanese it is possible to
 *          encounter Unicode code points that don't fit into a single
 *          16 bit character. For this reason, CJK text should be read
 *          as Unicode code points instead of as individual 16 bit
 *          code units.
 *
 *        * Also, Korean has an alphabetic writing form (Hangul Jamo)
 *          and a syllabic writing form (Hangul Syllables). The same
 *          text can be expressed in either of these forms. For good
 *          behavior (and some UX reasons), it was necessary to convert
 *          all encountered text into one of these forms, so that text
 *          written in Jamo could be found with Hangul Syllables and
 *          vice versa.
 *
 * This package fulfills both of these requirements in a very efficient
 * way. Tokenizers can easily be combined to form a sort of 'aggregated
 * tokenizer'. This kind of combination is supported by design and is
 * done with the PairTokenizer class.
 *
 * The ability to switch the way text is read on the fly is supported
 * by having the reading done through rather abstract iterators.
 *
 * Performance is taken into account by having heavily used iterators
 * resolved at compile time, by making the iterator a template
 * parameter. A lot of inlining is used, but perhaps the biggest
 * optimization of all is that instead of extracted tokens holding the
 * string contents, they simply hold references (in the form of an
 * iterator) into the original character buffer. So there is no heap
 * usage, no look-ups and no string copying.
 *
 * NOTE: Iterators may be surprisingly big objects. While a wchar_t*
 * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>>
 * is already 24 bytes. This size could be reduced to 8 bytes, but that
 * would have performance implications. So copying iterators may be
 * expensive.
 *
 * The design shown here is flexible, simple, fast and uses very little
 * memory. The same design could be used e.g. for lexical analysis
 * code.
 */
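
/*
 * Illustrative usage sketch, assuming a plain wchar_t* works as the
 * Iterator type; isCjk is a hypothetical acceptor used for this example
 * only. It shows one way the classes below might be combined into an
 * 'aggregated tokenizer' that bigrams CJK runs and consumes other words
 * whole.
 *
 *   int isCjk(int c);                               // hypothetical acceptor
 *   NGramTokenizer<wchar_t*>   cjk(2, &isCjk);      // bigrams for CJK runs
 *   CustomTokenizer<wchar_t*>  words(&iswalpha);    // whole words otherwise
 *   PairTokenizer<wchar_t*>    pair(cjk, words);    // try CJK first, then words
 *   RelaxedTokenizer<wchar_t*> relaxed(pair);       // step over anything else
 *
 *   wchar_t buf[] = L"...";
 *   wchar_t* cursor = buf;
 *   while (Token<wchar_t*> t = relaxed.consume(cursor)) {
 *       // t.utf16() materializes the token text as a std::wstring
 *   }
 */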

namespace lucene {
    namespace analysis {
        class Token;
    }
}

namespace analysis {

    namespace tiny {

        /**
         * A Token identifies some sequence of characters in the original
         * text stream. It holds an iterator to the beginning of the token
         * and the token's length. The length is always the number of
         * Unicode characters in the token.
         */
        template <typename Iterator>
        struct Token {

            typedef RangeIterator<Iterator> iter;

            Token() : begin_(), length_() {}
            Token(Iterator& begin, int length) : begin_(begin), length_(length) {}

            /** Length in Unicode characters */
            inline int length() { return length_; }

            /** Returns an iterator that iterates over this token's characters */
            iter iterator() {
                return iter(begin_, length_);
            }
            /** Informs whether this token is non-empty */
            operator bool() {
                return length_;
            }
            /** Text size in 16 bit code units */
            int utf16size() {
                return analysis::tiny::utf16size(iterator());
            }
            /** Copies the text as 16 bit code units */
            void utf16(wchar_t* buf) {
                Utf16Writer<wchar_t*>(buf)<<iterator()<<L'\0';
            }
            /** Copies the text as 16 bit code units */
            std::wstring utf16() {
                return utf16str(iterator());
            }
            /** Copies this token's content to the given CLucene token. */
            void copyTo(lucene::analysis::Token* token);
        private:
            Iterator begin_;
            int length_;
        };
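
        /*
         * Illustrative sketch, assuming a plain wchar_t* is used as the
         * Iterator type: a Token is just a position plus a length, and the
         * text is only copied out when explicitly asked for.
         *
         *   wchar_t buf[] = L"hello world";
         *   wchar_t* p = buf;
         *   Token<wchar_t*> t(p, 5);      // refers to "hello"; nothing is copied
         *   std::wstring s = t.utf16();   // the 5 characters are copied out here
         *   int units = t.utf16size();    // 5 UTF-16 code units in this case
         */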

        typedef int (*Acceptor)(int c);

        /** Skips all characters that are accepted by the acceptor */
        template <class Iterator, typename Acceptor>
        inline int skip(Iterator& i, Acceptor accept) {
            int ret = 0;
            while ( *i && accept( *i ) ) { ++i; ret++; }
            return ret;
        }

        /** Skips all characters that are not accepted by the acceptor */
        template <class Iterator, typename Acceptor>
        inline int skipbut(Iterator& i, Acceptor accept) {
            int ret = 0;
            while ( *i && !accept( *i ) ) { ++i; ret++; }
            return ret;
        }

        /** Consumes a token consisting of all characters accepted by the acceptor */
        template <class Iterator, typename Acceptor>
        Token<Iterator> consume(Iterator& i, Acceptor accept) {
            Iterator begin = i;
            return Token<Iterator>( begin, skip(i, accept) );
        }
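
        /*
         * Illustrative sketch, assuming a plain wchar_t* as the iterator: the
         * helpers above advance the iterator in place, so alternating
         * skipbut/consume walks a buffer word by word.
         *
         *   wchar_t buf[] = L"  foo, bar";
         *   wchar_t* p = buf;
         *   skipbut(p, &iswalpha);                        // skips the two spaces
         *   Token<wchar_t*> foo = consume(p, &iswalpha);  // "foo", p now at ','
         *   skipbut(p, &iswalpha);                        // skips ", "
         *   Token<wchar_t*> bar = consume(p, &iswalpha);  // "bar", p now at '\0'
         */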

        /** Abstract base class for tokenizers */
        template <class Iterator>
        class Tokenizer {
            public:
                virtual void reset() {}
                virtual Token<Iterator> consume(Iterator& i) = 0;
        };

        /** Consumes characters accepted by the acceptor */
        template <class Iterator>
        class CustomTokenizer : public Tokenizer<Iterator> {
            public:
                CustomTokenizer(Acceptor accept) : accept_(accept) {}
                Token<Iterator> consume(Iterator& i) {
                    return ::analysis::tiny::consume(i, accept_);
                }
            private:
                Acceptor accept_;
        };

        /**
         * N-gram tokenizer. Tokenizes n-grams from any character sequence
         * accepted by the acceptor. This class maintains internal state. It
         * consumes either a full-sized n-gram or an entire word, if the word
         * is shorter than the defined n-gram size.
         */
        template <class Iterator>
        class NGramTokenizer : public Tokenizer<Iterator> {
            public:
                NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
                NGramTokenizer(int size) : size_(size), accept_(&iswalpha), continue_(false) {}
                void reset() { continue_ = false; }
                Token<Iterator> consume(Iterator& i) {
                    if ( *i ) {
                        Iterator end = i;
                        int l = 0;
                        while (l < size_ && *end && accept_( *end )) { l++; ++end; }
                        if (l == size_ || (!continue_ && l)) {
                            // properly sized token or whole word
                            Token<Iterator> t(i, l);
                            continue_ = true;
                            ++i;
                            return t;
                        }
                    }
                    continue_ = false;
                    return Token<Iterator>(i, 0);
                }
            private:
                int size_;
                Acceptor accept_;
                bool continue_;
        };
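
        /*
         * Illustrative sketch, assuming a plain wchar_t* as the iterator: the
         * continue_ flag makes successive consume() calls on the same word
         * produce overlapping n-grams, advancing only one character per
         * full-sized n-gram.
         *
         *   wchar_t buf[] = L"abcd";
         *   wchar_t* p = buf;
         *   NGramTokenizer<wchar_t*> bigrams(2);
         *   bigrams.consume(p);   // "ab", p now at "bcd"
         *   bigrams.consume(p);   // "bc", p now at "cd"
         *   bigrams.consume(p);   // "cd", p now at "d"
         *   bigrams.consume(p);   // empty token: "d" is shorter than the n-gram
         *                         // size and continue_ is still set; the flag is
         *                         // cleared instead
         *   bigrams.consume(p);   // "d", now treated as a whole (short) word
         */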

        /**
         * A tokenizer that ALWAYS returns a token, unless the end of the
         * stream is reached. If the tokenizer given to this tokenizer fails,
         * the relaxed tokenizer just moves one position further and tries
         * again.
         */
        template <typename I>
        class RelaxedTokenizer : public Tokenizer<I> {
            public:
                /** Uses the given tokenizer to extract tokens. */
                RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
                void reset() { t_.reset(); }
                /**
                 * Always returns a token. If tokenization fails,
                 * moves forward a character and tries again.
                 */
                Token<I> consume(I& i) {
                    Token<I> t;
                    while (*i && !t) {
                        t = t_.consume(i);
                        if (!t) {
                            ++i; t_.reset();
                        }
                    }
                    return t;
                }
            private:
                Tokenizer<I>& t_;
        };
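
        /*
         * Illustrative sketch, assuming a plain wchar_t* as the iterator:
         * wrapping a word tokenizer in RelaxedTokenizer makes it step over
         * characters that the underlying tokenizer does not accept instead
         * of returning empty tokens.
         *
         *   wchar_t buf[] = L"--foo--";
         *   wchar_t* p = buf;
         *   CustomTokenizer<wchar_t*> words(&iswalpha);
         *   RelaxedTokenizer<wchar_t*> relaxed(words);
         *   relaxed.consume(p);   // "foo": the two leading '-' are stepped over
         *   relaxed.consume(p);   // empty token: only '-' remains before EOS
         */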

        /**
         * Tries first to tokenize with the first tokenizer; if that fails,
         * the second tokenizer is tried. If the first tokenizer fails,
         * it is reset.
         */
        template <typename I>
        class PairTokenizer : public Tokenizer<I> {
            public:
                PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
                void reset() {
                    t1_.reset();
                    t2_.reset();
                }
                /**
                 * Attempts to tokenize with the first tokenizer, then
                 * with the second. If both tokenizers fail, an empty
                 * token is returned.
                 */
                Token<I> consume(I& i) {
                    Token<I> t( t1_.consume( i ) );
                    if ( !t ) {
                        t1_.reset();
                        t = t2_.consume( i );
                    }
                    return t;
                }
            private:
                Tokenizer<I>& t1_;
                Tokenizer<I>& t2_;
        };
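
        /*
         * Illustrative sketch, assuming a plain wchar_t* as the iterator: a
         * PairTokenizer falls back to its second tokenizer whenever the first
         * one produces an empty token at the current position.
         *
         *   wchar_t buf[] = L"abc123";
         *   wchar_t* p = buf;
         *   CustomTokenizer<wchar_t*> letters(&iswalpha);
         *   CustomTokenizer<wchar_t*> digits(&iswdigit);
         *   PairTokenizer<wchar_t*> pair(letters, digits);
         *   pair.consume(p);   // "abc" from the first tokenizer
         *   pair.consume(p);   // "123" from the second, after the first fails
         */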

    }

}

#endif /* TINYANALYSIS_H_ */