diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp --- a/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp Tue Oct 05 13:15:12 2010 +0530 +++ b/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp Fri Oct 15 12:09:28 2010 +0530 @@ -42,6 +42,9 @@ (ch>=0xf900 && ch<=0xfaff) || \ (ch>=0xac00 && ch<=0xd7af) ) //korean + // CHANGED Thai support -> + #define _THAI ( ch >= 0x0E00 && ch <= 0x0Eff ) /* NOTE(review): the Unicode Thai block is U+0E00..U+0E7F; U+0E80..U+0EFF is Lao, so this range also accepts Lao - confirm that is intended */ + // CHANGED #define DASH (ch == '-') #define NEGATIVE_SIGN_ DASH @@ -62,13 +65,19 @@ /* otherMatches is a condition (possibly compound) under which a character ** that's not an ALNUM or UNDERSCORE can be considered not to break the ** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */ - #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE) + #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM /*|| UNDERSCORE*/) /* underscore is no longer part of the word-continuation condition */ /* ** Consume CJK characters */ #define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK) + // CHANGED Thai support -> + /** + * Consume Thai characters (those for which _THAI holds) + */ + #define CONSUME_THAI _CONSUME_AS_LONG_AS(_THAI) + // CHANGED /* It is considered that "nothing of value" has been read if: ** a) The "read head" hasn't moved since specialCharPos was established. @@ -121,6 +130,7 @@ t->setType(tokenImage[tokenCode]); sb->getBuffer(); //null terminates the buffer t->resetTermTextLen(); + t->setPositionIncrement(1); /* explicitly set the position increment to 1 before reporting success */ return true; } @@ -131,9 +141,14 @@ if ( ch == 0 || ch == -1 ){ continue; + // CHANGED Thai Support -> + } else if ( _THAI ) { /* ch falls in the _THAI range: try to read a Thai token */ + if ( ReadThai(ch,t) ) + return true; /* NOTE(review): tokenStart is not assigned in this branch, unlike the ALPHA branch below - confirm token offsets are still correct */ + // CHANGED } else if (SPACE) { continue; - } else if (ALPHA || UNDERSCORE) { + } else if (ALPHA) { tokenStart = rdPos; return ReadAlphaNum(ch,t); } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) { @@ -225,6 +240,9 @@ SUCCESSFULLY_EXTRACTED_NUMBER: TCHAR rightmost = RIGHTMOST(str); /* Don't including a trailing decimal point. 
*/ + if(ALPHA){ /* when ch is ALPHA, run CONSUME_WORD before the trailing-dot check. NOTE(review): rightmost was sampled before this consume, so if CONSUME_WORD appends to str the check that follows sees a stale character - confirm */ + CONSUME_WORD; + } if (rightmost == '.') { SHAVE_RIGHTMOST(str); unReadChar(); @@ -249,9 +267,9 @@ CONSUME_WORD; if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character? switch(ch) { /* What follows the first alphanum segment? */ - case '.': - str.appendChar('.'); - return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t); +// case '.': +// str.appendChar('.'); +// return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t); case '\'': str.appendChar('\''); return ReadApostrophe(&str,t); @@ -280,6 +298,19 @@ return setToken(t,&str,CL_NS2(analysis,standard)::CJK); } + // CHANGED Thai Support: ReadThai reads one token of type Thai whose first character is prev and returns the result of setToken + bool StandardTokenizer::ReadThai(const TCHAR prev, Token* t) { + t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word + StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText + if ( str.len < LUCENE_MAX_WORD_LEN ){ /* only append while str is below the maximum word length */ + str.appendChar(prev); + int ch = prev; /* _THAI is written in terms of a variable named ch, so seed it with the first character */ + + CONSUME_THAI; + } + return setToken(t,&str,CL_NS2(analysis,standard)::Thai); /* hand the buffer to setToken with the Thai token type */ + } + // CHANGED bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) { const int32_t specialCharPos = rdPos; @@ -399,7 +430,7 @@ - return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM); + return setToken(t,&str,forcedType); /* previously hard-coded ALPHANUM; the forcedType value is now passed through */ // return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN // ? forcedType : CL_NS2(analysis,standard)::HOST); }