searchengine/oss/cl/clucene/src/clucene/analysis/standard/standardtokenizer.cpp
changeset 2 6c1a2771f4b7
parent 0 671dee74050a
child 10 afe194b6b1cd
child 24 65456528cac2
equal deleted inserted replaced
1:6f2c1c46032b 2:6c1a2771f4b7
   253 					str.appendChar('.');
   253 					str.appendChar('.');
   254 					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
   254 					return ReadDotted(&str, CL_NS2(analysis,standard)::UNKNOWN,t);
   255 				case '\'':
   255 				case '\'':
   256 					str.appendChar('\'');
   256 					str.appendChar('\'');
   257 					return ReadApostrophe(&str,t);
   257 					return ReadApostrophe(&str,t);
   258 				case '@':
   258 //				case '@':
   259 					str.appendChar('@');
   259 //					str.appendChar('@');
   260 					return ReadAt(&str,t);
   260 //					return ReadAt(&str,t);
   261 				case '&':
   261 				case '&':
   262 					str.appendChar('&');
   262 					str.appendChar('&');
   263 					return ReadCompany(&str,t);
   263 					return ReadCompany(&str,t);
   264 				/* default: fall through to end of this function. */
   264 				/* default: fall through to end of this function. */
   265 			}
   265 			}
   302       }
   302       }
   303       while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
   303       while (!EOS && str.len < LUCENE_MAX_WORD_LEN-1 ) {
   304         ch = readChar();
   304         ch = readChar();
   305         const bool dot = ch == '.';
   305         const bool dot = ch == '.';
   306         const bool dash = ch == '-';
   306         const bool dash = ch == '-';
       
   307         //const bool at = ch == '@';
   307 
   308 
   308         if (!(ALNUM || UNDERSCORE || dot || dash)) {
   309         if (!(ALNUM || UNDERSCORE || dot || dash)) {
   309           break;
   310           break;
   310         }
   311         }
   311         /* Multiple dots or dashes in succession end the token.
   312         /* Multiple dots or dashes in succession end the token.
   320           if (!prevWasDot) {
   321           if (!prevWasDot) {
   321             SHAVE_RIGHTMOST(str);
   322             SHAVE_RIGHTMOST(str);
   322           }
   323           }
   323           break;
   324           break;
   324         }
   325         }
       
   326         
   325 
   327 
   326         str.appendChar(ch);
   328         str.appendChar(ch);
       
   329             
       
   330     
   327 
   331 
   328         prevWasDot = dot;
   332         prevWasDot = dot;
   329         prevWasDash = dash;
   333         prevWasDash = dash;
   330       }
   334       }
   331     }
   335     }
   377         }
   381         }
   378       }
   382       }
   379     }
   383     }
   380     } /* End block-guard of strBuf */
   384     } /* End block-guard of strBuf */
   381 
   385 
   382     if (!EOS) {
   386 
   383       if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
   387 
   384         str.appendChar('@');
   388 //    if (!EOS) {
   385         return ReadAt(&str,t);
   389 //      if (ch == '@' && str.len < LUCENE_MAX_WORD_LEN-1) {
   386       } else {
   390 //        str.appendChar('@');
   387         unReadChar();
   391 //        return ReadAt(&str,t);
   388       }
   392 //      } else {
   389     }
   393 //        unReadChar();
   390 
   394 //      } 
   391 	return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
   395     
   392 			? forcedType : CL_NS2(analysis,standard)::HOST);
   396       if (!EOS) {
       
   397             unReadChar();
       
   398           }
       
   399 
       
   400      
       
   401     
       
   402   return setToken(t,&str,CL_NS2(analysis,standard)::ALPHANUM);
       
   403 //	return setToken(t,&str,CL_NS2(analysis,standard)::UNKNOWN
       
   404 //			? forcedType : CL_NS2(analysis,standard)::HOST);
   393   }
   405   }
   394 
   406 
   395   bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
   407   bool StandardTokenizer::ReadApostrophe(StringBuffer* _str, Token* t) {
   396     StringBuffer& str=*_str;
   408     StringBuffer& str=*_str;
   397 
   409