diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/inc/public/tinyanalysis.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/tinyanalysis.h	Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,278 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+#ifndef TINYANALYSIS_H_
+#define TINYANALYSIS_H_
+
+#include <wchar.h>
+#include <wctype.h>
+
+#include "tinyutf16.h"
+#include "wctype.h"
+
+/*
+ * This file contains template-based tokenization utilities. The
+ * rationales for this package are as follows:
+ *
+ *   * More flexibility was needed for various CJK analyzers.
+ *
+ *     -> CLucene tokenizers are difficult to make work together
+ *        well. For example, in practice you cannot use a generic
+ *        n-gram tokenizer for CJK and a standard tokenizer for
+ *        non-CJK. This cannot be done in CLucene without making it
+ *        a very, very heavy operation.
+ *
+ *   * More flexibility was needed on the character reading level.
+ *
+ *     * When dealing with Chinese and Japanese, it is possible to
+ *       encounter Unicode characters that don't fit in 16-bit code
+ *       units. For this reason, CJK text should be read as Unicode
+ *       characters instead of as individual 16-bit code units.
+ *
+ *     * Also, Korean has an alphabetic writing form (Hangul Jamo)
+ *       and a syllabic writing form (Hangul Syllables). The same
+ *       text can be expressed in either of these forms. For good
+ *       behavior (and some UX reasons), it was necessary to convert
+ *       all encountered text into one of these forms, so that text
+ *       written in Jamo could be found with Hangul Syllables and
+ *       vice versa.
+ *
+ * This package fulfills both of these requirements in a very
+ * efficient way. Tokenizers can easily be combined to form a sort of
+ * 'aggregated tokenizer'. This kind of combination is supported by
+ * design and is done with the PairTokenizer class.
+ *
+ * The ability to switch the way text is read on the fly is supported
+ * by having the reading done through abstract iterators.
+ *
+ * Performance is taken into account by resolving heavily used
+ * iterators at compile time, by making the iterator a template
+ * parameter. Lots of inlining is used, but perhaps the biggest
+ * optimization of all is that instead of extracted tokens holding a
+ * copy of the string, tokens simply hold a reference (in the form of
+ * an iterator) into the original character buffer. So there is no
+ * heap usage, no look-ups and no string copying.
+ *
+ * NOTE: Iterators may be surprisingly big objects. While a wchar_t*
+ * is only 4 bytes, a HangulIterator wrapping two levels of nested
+ * iterators, for example, is already 24 bytes. This size could be
+ * reduced to 8 bytes, but that would have performance implications.
+ * So copying iterators may be expensive.
+ *
+ * The design shown here is flexible, simple, fast and uses very
+ * little memory. The same design could be used e.g. for lexical
+ * analysis code.
+ */
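+
+/*
+ * A sketch of the 'aggregated tokenizer' idea described above (an
+ * illustration only, not part of the original API: 'IsCjk' is a
+ * hypothetical acceptor functor, and a plain 'const wchar_t*' is
+ * assumed to serve as the iterator type; anything supporting unary
+ * '*' and prefix '++' does):
+ *
+ *   // bigrams for CJK runs, whole words for alphabetic runs
+ *   NGramTokenizer<const wchar_t*, IsCjk> cjk(2, IsCjk());
+ *   CustomTokenizer<const wchar_t*, int (*)(wint_t)> latin(&iswalpha);
+ *   PairTokenizer<const wchar_t*> pair(cjk, latin);   // try CJK first, then latin
+ *   RelaxedTokenizer<const wchar_t*> tokens(pair);    // skip anything else
+ *
+ * All classes used here are defined below; extracted tokens only
+ * reference the original character buffer.
+ */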
+
+namespace lucene {
+    namespace analysis {
+        class Token;
+    }
+}
+
+namespace analysis {
+
+    namespace tiny {
+
+        /**
+         * A token is an object which identifies some sequence of
+         * characters in the original text stream. It holds an
+         * iterator to the beginning of the token and the token's
+         * length. The length is always the number of Unicode
+         * characters in the token.
+         */
+        template <typename Iterator>
+        struct Token {
+
+            typedef RangeIterator<Iterator> iter;
+
+            Token() : begin_(), length_() {}
+            Token(Iterator& begin, int length) : begin_(begin), length_(length) {}
+
+            /** Length in unicode characters */
+            inline int length() { return length_; }
+
+            /** Gives an iterator that iterates over this token's characters */
+            iter iterator() {
+                return iter(begin_, length_);
+            }
+            /** Informs whether this token is non-empty */
+            operator bool() {
+                return length_;
+            }
+            /** Text size in 16 bit codewords */
+            int utf16size() {
+                return analysis::tiny::utf16size(iterator());
+            }
+            /** Copy text as 16 bit codewords */
+            void utf16(wchar_t* buf) {
+                Utf16Writer(buf)<<iterator();
+            }
+
+            Iterator begin_;
+            int length_;
+        };
+
+        /** Skips all characters that are accepted by the acceptor */
+        template <typename Iterator, typename Acceptor>
+        inline int skip(Iterator& i, Acceptor accept) {
+            int ret = 0;
+            while ( *i && accept( *i ) ) { ++i; ret++; }
+            return ret;
+        }
+
+        /** Skips all characters that are not accepted by the acceptor */
+        template <typename Iterator, typename Acceptor>
+        inline int skipbut(Iterator& i, Acceptor accept) {
+            int ret = 0;
+            while ( *i && !accept( *i ) ) { ++i; ret++; }
+            return ret;
+        }
+
+        /** Consumes a token consisting of all characters accepted by the acceptor */
+        template <typename Iterator, typename Acceptor>
+        Token<Iterator> consume(Iterator& i, Acceptor accept) {
+            Iterator begin = i;
+            return Token<Iterator>( begin, skip(i, accept) );
+        }
+
+        /** Abstract base class for tokenizers */
+        template <typename Iterator>
+        class Tokenizer {
+        public:
+            virtual ~Tokenizer() {}
+            virtual void reset() {}
+            virtual Token<Iterator> consume(Iterator& i) = 0;
+        };
+
+        /** Consumes characters as accepted by the acceptor */
+        template <typename Iterator, typename Acceptor>
+        class CustomTokenizer : public Tokenizer<Iterator> {
+        public:
+            CustomTokenizer(Acceptor accept) : accept_(accept) {}
+            Token<Iterator> consume(Iterator& i) {
+                return ::analysis::tiny::consume(i, accept_);
+            }
+        private:
+            Acceptor accept_;
+        };
+
+        /**
+         * N-gram tokenizer. Tokenizes n-grams from any character
+         * sequence accepted by the acceptor. This class maintains
+         * internal state. It consumes either fully sized n-grams, or
+         * an entire word if the word is smaller than the defined
+         * n-gram size.
+         */
+        template <typename Iterator, typename Acceptor>
+        class NGramTokenizer : public Tokenizer<Iterator> {
+        public:
+            NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
+            NGramTokenizer(int size) : size_(size), accept_(&iswalpha), continue_(false) {}
+            void reset() { continue_ = false; }
+            Token<Iterator> consume(Iterator& i) {
+                if ( *i ) {
+                    Iterator end = i;
+                    int l = 0;
+                    while (l < size_ && *end && accept_( *end )) { l++; ++end; }
+                    if (l == size_ || (!continue_ && l)) {
+                        // properly sized token or whole word
+                        Token<Iterator> t(i, l);
+                        continue_ = true;
+                        ++i;
+                        return t;
+                    }
+                }
+                continue_ = false;
+                return Token<Iterator>(i, 0);
+            }
+        private:
+            int size_;
+            Acceptor accept_;
+            bool continue_;
+        };
+
+        /**
+         * A tokenizer that ALWAYS returns a token, unless EOS is
+         * reached. If the tokenizer given to this tokenizer fails,
+         * the relaxed tokenizer just moves one position further and
+         * tries again.
+         */
+        template <typename I>
+        class RelaxedTokenizer : public Tokenizer<I> {
+        public:
+            /** Uses the given tokenizer to extract tokens. */
+            RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
+            void reset() { t_.reset(); }
+            /**
+             * Always returns a token. If tokenization fails,
+             * moves forward a character and tries again.
+             */
+            Token<I> consume(I& i) {
+                Token<I> t;
+                while (*i && !t) {
+                    t = t_.consume(i);
+                    if (!t) {
+                        ++i; t_.reset();
+                    }
+                }
+                return t;
+            }
+        private:
+            Tokenizer<I>& t_;
+        };
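+
+        /*
+         * Sketch of a typical extraction loop (illustration only;
+         * again assumes 'const wchar_t*' as the iterator type):
+         *
+         *   const wchar_t* i = L"foo bar";
+         *   CustomTokenizer<const wchar_t*, int (*)(wint_t)> words(&iswalpha);
+         *   RelaxedTokenizer<const wchar_t*> relaxed(words);
+         *   Token<const wchar_t*> t;
+         *   while ((t = relaxed.consume(i))) {
+         *       // yields "foo", then "bar"; each token merely points
+         *       // into the original buffer (no copies are made)
+         *   }
+         */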
+
+        /**
+         * Tries first to tokenize with the first tokenizer; if it
+         * fails, the second tokenizer is tried. If the first
+         * tokenizer fails, it is reset.
+         */
+        template <typename I>
+        class PairTokenizer : public Tokenizer<I> {
+        public:
+            PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
+            void reset() {
+                t1_.reset();
+                t2_.reset();
+            }
+            /**
+             * Attempts to tokenize with the first tokenizer, then
+             * with the second. If both tokenizers fail, an empty
+             * token is returned.
+             */
+            Token<I> consume(I& i) {
+                Token<I> t( t1_.consume( i ) );
+                if ( !t ) {
+                    t1_.reset();
+                    t = t2_.consume( i );
+                }
+                return t;
+            }
+        private:
+            Tokenizer<I>& t1_;
+            Tokenizer<I>& t2_;
+        };
+
+    }
+
+}
+
+#endif /* TINYANALYSIS_H_ */
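+
+/*
+ * The free functions (skip, skipbut, consume) are also usable on
+ * their own (a sketch; iswdigit comes from wctype.h and
+ * 'const wchar_t*' is once more assumed as the iterator type):
+ *
+ *   const wchar_t* i = L"  42abc";
+ *   analysis::tiny::skipbut(i, &iswdigit);   // skip past the non-digits
+ *   analysis::tiny::Token<const wchar_t*> num =
+ *       analysis::tiny::consume(i, &iswdigit);   // "42", i now at 'a'
+ */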