searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp
changeset 24 65456528cac2
parent 2 6c1a2771f4b7
--- a/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp	Tue Oct 05 13:15:12 2010 +0530
+++ b/searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp	Fri Oct 15 12:09:28 2010 +0530
@@ -42,6 +42,9 @@
   						   (ch>=0xf900 && ch<=0xfaff) || \
   						   (ch>=0xac00 && ch<=0xd7af) ) //korean
 
+  // CHANGED Thai support -> _THAI tests the local variable 'ch' at the point of use against 0x0E00..0x0EFF.
+  #define _THAI        ( ch >= 0x0E00 && ch <= 0x0Eff ) // NOTE(review): Unicode Thai is U+0E00..U+0E7F; U+0E80..U+0EFF is the Lao block -- confirm Lao is meant to match
+  // CHANGED
   
   #define DASH          (ch == '-')
   #define NEGATIVE_SIGN_ DASH
@@ -62,13 +65,19 @@
   /* otherMatches is a condition (possibly compound) under which a character
   ** that's not an ALNUM or UNDERSCORE can be considered not to break the
   ** span.  Callers should pass false if only ALNUM/UNDERSCORE are acceptable. */
-  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM || UNDERSCORE)
+  #define CONSUME_WORD                  _CONSUME_AS_LONG_AS(ALNUM /*|| UNDERSCORE*/) // underscore no longer continues a word; only ALNUM does
   
   /*
   ** Consume CJK characters
   */
   #define CONSUME_CJK                   _CONSUME_AS_LONG_AS(_CJK)
 
+  // CHANGED Thai support -> 
+  /**
+   * Consume Thai: expands to _CONSUME_AS_LONG_AS with _THAI as the condition, so a local 'ch' must be in scope where it is used.
+   */
+  #define CONSUME_THAI 				    _CONSUME_AS_LONG_AS(_THAI) 
+  // CHANGED
 
   /* It is considered that "nothing of value" has been read if:
   ** a) The "read head" hasn't moved since specialCharPos was established.
@@ -121,6 +130,7 @@
 	t->setType(tokenImage[tokenCode]);
 	sb->getBuffer(); //null terminates the buffer
 	t->resetTermTextLen();
+	t->setPositionIncrement(1);
 	return true;
   }
 
@@ -131,9 +141,14 @@
 
 	  if ( ch == 0 || ch == -1 ){
 		continue;
+	  // CHANGED Thai Support -> 
+	  } else if ( _THAI ) {
+		  if ( ReadThai(ch,t) ) 
+			  return true; 
+	  // CHANGED
 	  } else if (SPACE) {
         continue;
-      } else if (ALPHA || UNDERSCORE) {
+      } else if (ALPHA) {
         tokenStart = rdPos;
         return ReadAlphaNum(ch,t);
       } else if (DIGIT || NEGATIVE_SIGN_ || DECIMAL) {
@@ -225,6 +240,9 @@
     SUCCESSFULLY_EXTRACTED_NUMBER:
     TCHAR rightmost = RIGHTMOST(str);
     /* Don't including a trailing decimal point. */
+    if(ALPHA){
+        CONSUME_WORD;
+    }
     if (rightmost == '.') {
       SHAVE_RIGHTMOST(str);
       unReadChar();
@@ -249,9 +267,9 @@
 		CONSUME_WORD;
 		if (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) { //still have space for 1 more character?
 			switch(ch) { /* What follows the first alphanum segment? */
-				case '.':
-					str.appendChar('.');
-					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
+//				case '.':
+//					str.appendChar('.');
+//					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
 				case '\'':
 					str.appendChar('\'');
 					return ReadApostrophe(&str,t);
@@ -280,6 +298,19 @@
 	return setToken(t,&str,CL_NS2(analysis,standard)::CJK);
   }
   
+  // CHANGED Thai Support: ReadThai starts a token with 'prev', runs CONSUME_THAI, and returns setToken()'s result with token type Thai.
+  bool StandardTokenizer::ReadThai(const TCHAR prev, Token* t) {
+    t->growBuffer(LUCENE_MAX_WORD_LEN+1);//make sure token can hold the next word
+    StringBuffer str(t->_termText,t->bufferLength(),true); //use stringbuffer to read data onto the termText
+	if ( str.len < LUCENE_MAX_WORD_LEN ){ // append only while the buffer is below the maximum word length
+		str.appendChar(prev); // 'prev' is the character the caller already read and matched against _THAI
+		int ch = prev; // the _THAI condition inside CONSUME_THAI refers to a local named 'ch'
+
+		CONSUME_THAI; // expands to _CONSUME_AS_LONG_AS(_THAI); that macro's body is outside this hunk
+	}
+	return setToken(t,&str,CL_NS2(analysis,standard)::Thai); // hands the collected text to setToken tagged as Thai
+  }
+  // CHANGED
 
   bool StandardTokenizer::ReadDotted(StringBuffer* _str, TokenTypes forcedType, Token* t) {
     const int32_t specialCharPos = rdPos;
@@ -399,7 +430,7 @@
 
      
     
-  return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
+  return setToken(t,&str,forcedType);
 //	return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
 //			? forcedType : CL_NS2(analysis,standard)::HOST);
   }