--- a/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp Tue Oct 05 13:15:12 2010 +0530
+++ b/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp Fri Oct 15 12:09:28 2010 +0530
@@ -42,6 +42,9 @@
(ch>=0xf900 && ch<=0xfaff) || \
(ch>=0xac00 && ch<=0xd7af) ) //korean
+ // CHANGED Thai support -> _THAI tests the in-scope variable `ch` against U+0E00..U+0EFF. NOTE(review): U+0E80..U+0EFF is the Unicode Lao block, so Lao text also matches - confirm this is intended.
+ #define _THAI ( ch >= 0x0E00 && ch <= 0x0Eff )
+ // CHANGED
#define DASH (ch == '-')
#define NEGATIVE_SIGN_ DASH
@@ -62,13 +65,19 @@
/* otherMatches is a condition (possibly compound) under which a character
** that's not an ALNUM or UNDERSCORE can be considered not to break the
** span. Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
- #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+ #define CONSUME_WORD _CONSUME_AS_LONG_AS(ALNUM /*|| UNDERSCORE*/) // UNDERSCORE deliberately dropped from the condition: only ALNUM is passed, so '_' no longer extends the span unless ALNUM itself matches it
/*
** Consume CJK characters
*/
#define CONSUME_CJK _CONSUME_AS_LONG_AS(_CJK)
+ // CHANGED Thai support ->
+ /**
+ * Consume Thai: passes _THAI as the condition to _CONSUME_AS_LONG_AS, the same way CONSUME_CJK passes _CJK. _THAI reads a variable named `ch`, so one must be in scope at the expansion site.
+ */
+ #define CONSUME_THAI _CONSUME_AS_LONG_AS(_THAI)
+ // CHANGED
/* It is considered that "nothing of value" has been read if:
** a) The "read head" hasn't moved since specialCharPos was established.
@@ -121,6 +130,7 @@
t->setType(tokenImage[tokenCode]);
sb->getBuffer(); //null terminates the buffer
t->resetTermTextLen();
+ t->setPositionIncrement(1);
return true;
}
@@ -131,9 +141,14 @@
if ( ch == 0 || ch == -1 ){
continue;
+ // CHANGED Thai Support ->
+ } else if ( _THAI ) {
+ if ( ReadThai(ch,t) )
+ return true;
+ // CHANGED
} else if (SPACE) {
continue;
- } else if (ALPHA || UNDERSCORE) {
+ } else if (ALPHA) {
tokenStart = rdPos;
return ReadAlphaNum(ch,t);
} else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
@@ -225,6 +240,9 @@
SUCCESSFULLY_EXTRACTED_NUMBER:
TCHAR rightmost = RIGHTMOST(str);
/* Don't including a trailing decimal point. */
+ if(ALPHA){
+ CONSUME_WORD;
+ }
if (rightmost == '.') {
SHAVE_RIGHTMOST(str);
unReadChar();
@@ -249,9 +267,9 @@
CONSUME_WORD;
if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
switch(ch) { /* What follows the first alphanum segment? */
- case '.':
- str.appendChar('.');
- return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
+// case '.':
+// str.appendChar('.');
+// return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
case '\'':
str.appendChar('\'');
return ReadApostrophe(&str,t);
@@ -280,6 +298,19 @@
return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
}
+ // CHANGED Thai Support: ReadThai(prev, t) appends `prev` plus whatever CONSUME_THAI collects to t's term text, then returns setToken's result with type Thai.
+ bool StandardTokenizer::ReadThai(const TCHAR prev, Token* t) {
+ t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+ StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+ if ( str.len < LUCENE_MAX_WORD_LEN ){ // skip reading entirely when the buffer is already at the word-length cap
+ str.appendChar(prev); // prev is the character the caller (next()) already read and matched against _THAI
+ int ch = prev; // must be named `ch`: the _THAI macro inside CONSUME_THAI reads a variable with exactly this name
+
+ CONSUME_THAI; // expands to _CONSUME_AS_LONG_AS(_THAI); that macro is defined outside this hunk and presumably also uses `str` - TODO confirm
+ }
+ return setToken(t,&str,CL_NS2(analysis,standard)::Thai); // setToken is called even when the guard above was skipped
+ }
+ // CHANGED
bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
const int32_t specialCharPos = rdPos;
@@ -399,7 +430,7 @@
- return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+ return setToken(t,&str,forcedType);
// return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
// ? forcedType : CL_NS2(analysis,standard)::HOST);
}