--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/tinyanalysis.h Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,278 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description:
+*
+*/
+
+#ifndef TINYANALYSIS_H_
+#define TINYANALYSIS_H_
+
+#include <string>
+#include <sstream>
+
+#include "tinyutf16.h"
+#include "wctype.h"
+
+/*
+ * This file contains template based tokenization utilities. There
+ * are following rationales for this package:
+ *
+ * * More flexibility was needed for various CJK analyzers.
+ *
+ *     -> CLucene tokenizers are difficult to make work
+ *        together well. For example, in practice you cannot use a
+ *        generic n-gram tokenizer for CJK and the standard
+ *        tokenizer for non-CJK. This cannot be done in CLucene
+ *        without making it a very, very heavy operation.
+ *
+ * * More flexibility was needed on the character reading level.
+ *
+ *  * It is possible to encounter Unicode code points that don't
+ *    fit in 16 bit characters, when dealing with Chinese and
+ *    Japanese. For this reason, reading CJK should be done in
+ *    unicode mode instead of reading individual 16 bit codepoints.
+ *
+ *  * Also with Korean, there is an alphabetic form (Hangul Jamo)
+ *    and a syllabic writing form (Hangul Syllables). The same text
+ *    can be expressed in either of these forms. For good behavior
+ *    (and some UX reasons), it was necessary to convert all
+ *    encountered text into one of these forms, so that text written
+ *    in Jamo could be found with Hangul Syllables and vice versa.
+ *
+ * This package fulfills both of these requirements in a very speed
+ * efficient way. Tokenizers can be easily combined to form a sort of
+ * 'aggregated tokenizer'. This kind of combination is supported by design
+ * and done with PairTokenizer class.
+ *
+ * The ability to switch the way text is read on fly is supported by
+ * having the reading done by rather abstract iterators.
+ *
+ * Performance is taken into account by resolving heavily used
+ * iterator types at compile time, by making the iterator a template
+ * parameter. Lots of inlines are used, but perhaps the biggest
+ * optimization of all is that instead of extracted tokens holding
+ * the string inside, tokenizers simply hold references (in the form
+ * of an iterator) into the original character buffer. So there is no
+ * heap usage, no look-ups and no string copying.
+ *
+ * NOTE: Iterators may be surprisingly big objects. While wchar_t*
+ * is only 4 bytes, e.g. HangulIterator<Utf16Iterator<ReaderBuffer<N>>>
+ * is already 24 bytes. This size could be reduced to 8 bytes, but
+ * it would bring performance implications. So copying of iterators
+ * may be expensive.
+ *
+ * The design shown in here is actually very nice, flexible, simplistic,
+ * fast and uses very little memory. The same design could be used
+ * e.g. for lexical analysis code.
+ */
+
+// Forward declaration of the CLucene token type that Token::copyTo()
+// targets, so this header does not have to include CLucene headers.
+namespace lucene {
+    namespace analysis {
+        class Token;
+    }
+}
+
+namespace analysis {
+
+
+ namespace tiny {
+
+        /**
+         * Token identifies a sequence of characters in the original text
+         * stream. It holds an iterator to the beginning of the token and
+         * the token's length, always counted in unicode characters.
+         * Because only a reference into the original buffer is kept,
+         * extracting a token performs no heap allocation or string copy.
+         */
+        template <typename Iterator>
+        struct Token {
+
+            typedef RangeIterator<Iterator> iter;
+
+            /** Constructs an empty (zero length) token */
+            Token() : begin_(), length_() {}
+            /** Constructs a token of given length starting at begin */
+            Token(Iterator& begin, int length) : begin_(begin), length_(length) {}
+
+            /** Length in unicode characters */
+            inline int length() const { return length_; }
+
+            /** Gives iterator, that iterates over this token's characters */
+            iter iterator() {
+                return iter(begin_, length_);
+            }
+            /** Informs, whether this token is non-empty */
+            operator bool() const {
+                return length_ != 0;
+            }
+            /** Text size in 16 bit codewords */
+            int utf16size() {
+                return analysis::tiny::utf16size(iterator());
+            }
+            /** Copy text as 16 bit codewords. buf must have room for
+             *  utf16size() + 1 codewords; a terminating L'\0' is written. */
+            void utf16(wchar_t* buf) {
+                Utf16Writer<wchar_t*>(buf)<<iterator()<<L'\0';
+            }
+            /** Copy text as 16 bit codewords */
+            std::wstring utf16() {
+                return utf16str(iterator());
+            }
+            /** Copy this token content to the Clucene token.*/
+            void copyTo(lucene::analysis::Token* token);
+        private:
+            Iterator begin_;
+            int length_;
+        };
+
+ typedef int (*Acceptor)(int c);
+
+        /** Advances i over every leading character the acceptor accepts;
+         *  returns the number of characters skipped. Stops at EOS (0). */
+        template <class Iterator, typename Acceptor>
+        inline int skip(Iterator& i, Acceptor accept) {
+            int skipped = 0;
+            for (; *i && accept( *i ); ++i) {
+                ++skipped;
+            }
+            return skipped;
+        }
+
+        /** Advances i over every leading character the acceptor rejects;
+         *  returns the number of characters passed over. Stops at EOS. */
+        template <class Iterator, typename Acceptor>
+        inline int skipbut(Iterator& i, Acceptor accept) {
+            int passed = 0;
+            for (; *i; ++i, ++passed) {
+                if (accept( *i )) break;
+            }
+            return passed;
+        }
+
+        /** Extracts a token formed by the longest prefix of characters the
+         *  acceptor accepts; i is left pointing just past the token. */
+        template <class Iterator, typename Acceptor>
+        Token<Iterator> consume(Iterator& i, Acceptor accept) {
+            Iterator start = i;                 // remember token beginning
+            int len = skip(i, accept);          // advance i, count chars
+            return Token<Iterator>( start, len );
+        }
+
+        /** Abstract base class for tokenizers */
+        template <class Iterator>
+        class Tokenizer {
+        public:
+            // Virtual destructor: subclasses are used through Tokenizer<I>&
+            // (see RelaxedTokenizer/PairTokenizer); without it, deleting a
+            // tokenizer through a base pointer is undefined behavior.
+            virtual ~Tokenizer() {}
+            /** Resets internal tokenization state. Default: stateless. */
+            virtual void reset() {}
+            /** Extracts the next token at i; empty token on failure. */
+            virtual Token<Iterator> consume(Iterator& i) = 0;
+        };
+
+        /** Stateless tokenizer that extracts runs of characters matching a
+         *  caller-supplied acceptor. */
+        template <class Iterator>
+        class CustomTokenizer : public Tokenizer<Iterator> {
+        public:
+            CustomTokenizer(Acceptor accept) : acceptor_(accept) {}
+            /** Consumes the longest accepted prefix at i. */
+            Token<Iterator> consume(Iterator& i) {
+                // Fully qualified so the call resolves to the free function,
+                // not recursively to this member.
+                return ::analysis::tiny::consume(i, acceptor_);
+            }
+        private:
+            Acceptor acceptor_;
+        };
+
+        /**
+         * NGram tokenizer. Tokenizes NGrams from any character sequence
+         * accepted by the acceptor. This class maintains internal state
+         * between calls: it consumes either fully sized ngrams, or the
+         * entire word if the word is smaller than the defined ngram size.
+         */
+        template <class Iterator>
+        class NGramTokenizer : public Tokenizer<Iterator> {
+        public:
+            NGramTokenizer(int size, Acceptor accept) : size_(size), accept_(accept), continue_(false) {}
+            // BUG FIX: this constructor left continue_ uninitialized, so the
+            // first consume() read an indeterminate value and could wrongly
+            // reject a short whole word.
+            NGramTokenizer(int size) : size_(size), accept_(&iswalpha), continue_(false) {}
+            /** Forgets the word currently being ngram-tokenized. */
+            void reset() { continue_ = false; }
+            /** Returns the next ngram (or short whole word) at i, or an
+             *  empty token when no further ngram can be formed here. */
+            Token<Iterator> consume(Iterator& i) {
+                if ( *i ) {
+                    Iterator end = i;
+                    int l = 0;
+                    // Measure up to size_ consecutive accepted characters.
+                    while (l < size_ && *end && accept_( *end )) { l++; ++end; }
+                    if (l == size_ || (!continue_ && l)) {
+                        // properly sized token or whole word
+                        Token<Iterator> t(i, l);
+                        continue_ = true;
+                        ++i; // next ngram starts one character further
+                        return t;
+                    }
+                }
+                continue_ = false;
+                return Token<Iterator>(i, 0);
+            }
+        private:
+            int size_;       // ngram length in unicode characters
+            Acceptor accept_;
+            bool continue_;  // true while in the middle of a word
+        };
+
+        /**
+         * Tokenizer that ALWAYS returns a token unless EOS is reached.
+         * Whenever the wrapped tokenizer fails, this tokenizer advances a
+         * single position, resets the wrapped tokenizer and retries.
+         */
+        template <typename I>
+        class RelaxedTokenizer : public Tokenizer<I> {
+        public:
+            /** Uses given tokenizer to extract tokens. */
+            RelaxedTokenizer(Tokenizer<I>& t) : t_(t) {}
+            void reset() { t_.reset(); }
+            /**
+             * Always returns a token. If tokenization fails,
+             * moves forward a character and tries again.
+             */
+            Token<I> consume(I& i) {
+                Token<I> tok;
+                for (;;) {
+                    if ( !*i ) break;      // end of stream: give up
+                    tok = t_.consume(i);
+                    if ( tok ) break;      // got a token
+                    ++i;                   // step past the offending char
+                    t_.reset();
+                }
+                return tok;
+            }
+        private:
+            Tokenizer<I>& t_;
+        };
+
+        /**
+         * Aggregates two tokenizers: tokenization is attempted with the
+         * first one; if that fails, the first tokenizer is reset and the
+         * second one is tried instead.
+         */
+        template <typename I>
+        class PairTokenizer : public Tokenizer<I>{
+        public:
+            PairTokenizer(Tokenizer<I>& t1, Tokenizer<I>& t2) : t1_(t1), t2_(t2) {}
+            void reset() {
+                t1_.reset();
+                t2_.reset();
+            }
+            /**
+             * Attempts to tokenize with the first tokenizer, then with
+             * the second. If both tokenizers fail, an empty token is
+             * returned.
+             */
+            Token<I> consume(I& i) {
+                Token<I> tok = t1_.consume( i );
+                if ( tok ) {
+                    return tok;
+                }
+                t1_.reset();
+                return t2_.consume( i );
+            }
+        private:
+            Tokenizer<I>& t1_;
+            Tokenizer<I>& t2_;
+        };
+
+ }
+
+}
+
+#endif /* TINYANALYSIS_H_ */