searchengine/oss/cl/clucene/src/clucene/highlighter/TokenSources.cpp
changeset 7 a5fbfefd615f
child 10 afe194b6b1cd
equal deleted inserted replaced
3:ae3f1779f6da 7:a5fbfefd615f
       
     1 #include "CLucene/StdHeader.h"
       
     2 #include "TokenSources.h"
       
     3 
       
     4 #include "CLucene/util/VoidList.h"
       
     5 
       
     6 CL_NS_DEF2(search,highlight)
       
     7 CL_NS_USE(analysis)
       
     8 CL_NS_USE(index)
       
     9 CL_NS_USE(util)
       
    10 
       
// Default constructor — no instance state is initialized here; all the
// stream-building functionality lives in the member functions below.
TokenSources::TokenSources(void)
{
}
       
    14 
       
// Destructor — nothing to release; this translation unit shows no owned
// resources (TODO confirm against the header that no members were added).
TokenSources::~TokenSources(void)
{
}
       
    18 
       
    19 TokenStream* TokenSources::getAnyTokenStream(IndexReader* reader,int32_t docId, TCHAR* field, Analyzer* analyzer)
       
    20 {
       
    21 	TokenStream* ts=NULL;
       
    22 
       
    23 	TermFreqVector* tfv=reader->getTermFreqVector(docId,field);
       
    24 	if(tfv!=NULL)
       
    25 	{
       
    26 //		todo: this is actually very dodgy... we try casting
       
    27 //		to TermPositionVector, we take the token stream
       
    28 //		only if the cast works... should have a way of
       
    29 //		knowing what type this is
       
    30 		TermPositionVector* tmp = NULL;
       
    31 		try{
       
    32              tmp = dynamic_cast<TermPositionVector *> (tfv);
       
    33 		}catch(...){
       
    34 			//ignore
       
    35 		}
       
    36 		if ( tmp != NULL )
       
    37 		    ts=getTokenStream(tmp);
       
    38 	}
       
    39 	//No token info stored so fall back to analyzing raw content
       
    40 	if(ts==NULL)
       
    41 	{
       
    42 		ts=getTokenStream(reader,docId,field,analyzer);
       
    43 	}
       
    44 	return ts;
       
    45 }
       
    46 
       
    47 
       
    48 TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv)
       
    49 {
       
    50     //assumes the worst and makes no assumptions about token position sequences.
       
    51     return getTokenStream(tpv,false);   
       
    52 }
       
    53 
       
    54 TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous)
       
    55 {
       
    56     //an object used to iterate across an array of tokens
       
    57     /*class StoredTokenStream extends TokenStream
       
    58     {
       
    59         Token tokens[];
       
    60         int32_t currentToken=0;
       
    61         StoredTokenStream(Token tokens[])
       
    62         {
       
    63             this.tokens=tokens;
       
    64         }
       
    65         public Token next()
       
    66         {
       
    67             if(currentToken>=tokens.length)
       
    68             {
       
    69                 return NULL;
       
    70             }
       
    71             return tokens[currentToken++];
       
    72         }            
       
    73     }     */   
       
    74     //code to reconstruct the original sequence of Tokens
       
    75     const TCHAR** terms=tpv->getTerms();          
       
    76     const int32_t* freq= (int32_t *)tpv->getTermFrequencies();
       
    77 	int32_t freqLen = tpv->size();
       
    78 
       
    79     size_t totalTokens=0;
       
    80 	{
       
    81 		for (int32_t t = 0; t < freqLen; t++)
       
    82 			totalTokens+=freq[t];
       
    83 	}
       
    84 
       
    85     Token** tokensInOriginalOrder=NULL;
       
    86 	CLSetList<Token*,Token::OrderCompare>* unsortedTokens = NULL;
       
    87     for (int32_t t = 0; t < freqLen; t++)
       
    88     {
       
    89         TermVectorOffsetInfo** offsets=(TermVectorOffsetInfo**)tpv->getOffsets(t);
       
    90         if(offsets==NULL)
       
    91             return NULL;
       
    92         
       
    93         int32_t* pos=NULL;
       
    94 		int32_t posLen=0;
       
    95         if(tokenPositionsGuaranteedContiguous)
       
    96         {
       
    97             //try get the token position info to speed up assembly of tokens into sorted sequence
       
    98             pos=(int32_t *)tpv->getTermPositions(t);
       
    99 			posLen=1;//todo
       
   100         }
       
   101 
       
   102 		if ( tokensInOriginalOrder != NULL )
       
   103 			tokensInOriginalOrder = _CL_NEWARRAY(Token*, totalTokens+1);
       
   104 
       
   105         if(pos==NULL)
       
   106         {	
       
   107             //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later
       
   108             if(unsortedTokens==NULL)
       
   109                 unsortedTokens=_CLNEW CLSetList<Token*,Token::OrderCompare>(false);
       
   110             for (int32_t tp=0; offsets[tp]!=NULL; tp++)
       
   111             {
       
   112                 unsortedTokens->insert(_CLNEW Token(terms[t],
       
   113                     offsets[tp]->getStartOffset(),
       
   114                     offsets[tp]->getEndOffset()));
       
   115             }
       
   116         }
       
   117         else
       
   118         {
       
   119             //We have positions stored and a guarantee that the token position information is contiguous
       
   120             
       
   121             // This may be fast BUT wont work if Tokenizers used which create >1 token in same position or
       
   122             // creates jumps in position numbers - this code would fail under those circumstances
       
   123             
       
   124             //tokens stored with positions - can use this to index straight into sorted array
       
   125             for (int32_t tp = 0; tp < posLen; tp++)
       
   126             {
       
   127                 if ( tokensInOriginalOrder )
       
   128                     {
       
   129                     tokensInOriginalOrder[pos[tp]]=_CLNEW Token(terms[t],
       
   130                         offsets[tp]->getStartOffset(),
       
   131                         offsets[tp]->getEndOffset());
       
   132                     }
       
   133             }                
       
   134         }
       
   135     }
       
   136     //If the field has been stored without position data we must perform a sort        
       
   137     if(unsortedTokens!=NULL)
       
   138     {
       
   139 		if ( totalTokens<unsortedTokens->size() ){
       
   140 			_CLDELETE_ARRAY(tokensInOriginalOrder);
       
   141 			tokensInOriginalOrder = _CL_NEWARRAY(Token*,unsortedTokens->size()+1);
       
   142 		}
       
   143 		//the list has already sorted our items //todo:check that this is true...
       
   144 		if ( tokensInOriginalOrder )
       
   145 		   unsortedTokens->toArray(tokensInOriginalOrder);
       
   146 		
       
   147 		return _CLNEW StoredTokenStream(tokensInOriginalOrder,unsortedTokens->size());
       
   148     }else
       
   149 		return _CLNEW StoredTokenStream(tokensInOriginalOrder,totalTokens);
       
   150 }
       
   151 
       
   152 TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field)
       
   153 {
       
   154 	TermFreqVector* tfv=reader->getTermFreqVector(docId,field);
       
   155 	if(tfv==NULL)
       
   156 	{
       
   157 		TCHAR buf[250];
       
   158 		_sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId);
       
   159 		_CLTHROWT(CL_ERR_IllegalArgument,buf);
       
   160 		return NULL;
       
   161 	}
       
   162 
       
   163 	//todo:bad way of doing this...
       
   164 	TermPositionVector* tmp = NULL;
       
   165 	try{
       
   166 		tmp = dynamic_cast<TermPositionVector *> (tfv); //check to see if tfv is a Tpv
       
   167 	}catch(...){}
       
   168 	TokenStream* stream = NULL;
       
   169 	if ( tmp != NULL ){
       
   170 		TermPositionVector* tpv = dynamic_cast<TermPositionVector *> (reader->getTermFreqVector(docId,field));
       
   171 		if ( tpv )
       
   172 		    stream = getTokenStream(tpv);  
       
   173 	    //return getTokenStream(tpv);	        
       
   174 	}else{
       
   175 		TCHAR buf[250];
       
   176 		_sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId);
       
   177 		_CLTHROWT(CL_ERR_IllegalArgument,buf);
       
   178 		//return NULL;
       
   179 	}
       
   180 	return stream;
       
   181 }
       
   182 
       
   183 //convenience method
       
   184 TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field,Analyzer* analyzer)
       
   185 {
       
   186 	CL_NS(document)::Document* doc=reader->document(docId);
       
   187 	const TCHAR* contents=doc->get(field);
       
   188 	if(contents==NULL)
       
   189 	{
       
   190 		TCHAR buf[250];
       
   191 		_sntprintf(buf,250,_T("Field %s in document #%d is not stored and cannot be analyzed"),field,docId);
       
   192 		_CLTHROWT(CL_ERR_IllegalArgument,buf);
       
   193 		return NULL;
       
   194 	}
       
   195     return analyzer->tokenStream(field,_CLNEW StringReader(contents));
       
   196 }
       
   197 
       
   198 TokenSources::StoredTokenStream::StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len)
       
   199 {
       
   200 	currentToken = 0;
       
   201     this->tokens=tokens;
       
   202 	this->length = len;
       
   203 }
       
   204 bool TokenSources::StoredTokenStream::next(CL_NS(analysis)::Token* token)
       
   205 {
       
   206     if(currentToken>=length)
       
   207     {
       
   208         return false;
       
   209     }
       
   210 	Token* t = tokens[currentToken++];
       
   211 
       
   212 	token->set(t->termText(),t->startOffset(),t->endOffset(),t->type());;
       
   213     return true;
       
   214 }
       
// Nothing to release on close: this stream only iterates an in-memory array.
// NOTE(review): the token array itself is never freed in this file — confirm
// who owns the Tokens handed to the constructor.
void TokenSources::StoredTokenStream::close(){
	
}
       
   218 
       
   219 CL_NS_END2