FCL/sf/mw/searchsrv: comparison searchengine/oss/cl/clucene/src/clucene/highlighter/TokenSources.cpp

equal deleted inserted replaced

-:ae3f1779f6da
+:a5fbfefd615f
+#include "CLucene/StdHeader.h"
+#include "TokenSources.h"
+#include "CLucene/util/VoidList.h"
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(analysis)
+CL_NS_USE(index)
+CL_NS_USE(util)
+/** Constructs a TokenSources object; no initialization work is performed here. */
+TokenSources::TokenSources()
+{
+}
+/** Destroys a TokenSources object; no cleanup work is performed here. */
+TokenSources::~TokenSources()
+{
+}
+/**
+ * Obtains a token stream for a document field from whichever source is available.
+ *
+ * The term vector of the field is tried first: if one exists and it is a
+ * TermPositionVector, the stream is rebuilt from it. When that yields nothing,
+ * the stored field content is re-analyzed with the supplied analyzer instead.
+ *
+ * @param reader   index reader used to look up the term vector / document
+ * @param docId    number of the document
+ * @param field    name of the field
+ * @param analyzer analyzer used for the fallback path
+ * @return the token stream produced by one of the two paths
+ */
+TokenStream* TokenSources::getAnyTokenStream(IndexReader* reader,int32_t docId, TCHAR* field, Analyzer* analyzer)
+{
+	TermFreqVector* vector = reader->getTermFreqVector(docId, field);
+	if (vector != NULL)
+	{
+		// todo: dodgy - the only way we have of finding out whether the vector
+		// carries token information is to attempt the downcast.
+		TermPositionVector* positional = NULL;
+		try {
+			positional = dynamic_cast<TermPositionVector *>(vector);
+		} catch (...) {
+			// a failing cast is handled like a vector without token info
+		}
+
+		if (positional != NULL)
+		{
+			TokenStream* fromVector = getTokenStream(positional);
+			if (fromVector != NULL)
+				return fromVector;
+		}
+	}
+
+	// No token info stored, so fall back to analyzing the raw content.
+	return getTokenStream(reader, docId, field, analyzer);
+}
+/**
+ * Rebuilds a token stream from a term position vector.
+ *
+ * Delegates to the two-argument overload with the contiguity flag switched
+ * off, i.e. nothing is assumed about the stored token position sequence.
+ */
+TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv)
+{
+	const bool positionsGuaranteedContiguous = false; // assume the worst
+	return getTokenStream(tpv, positionsGuaranteedContiguous);
+}
+/**
+ * Reconstructs the original sequence of tokens of a field from its term vector.
+ *
+ * Two assembly strategies are used:
+ *  - If tokenPositionsGuaranteedContiguous is true and getTermPositions()
+ *    returns data for a term, each token is written straight into the slot
+ *    given by its position. This does not work for tokenizers that emit more
+ *    than one token per position or that leave gaps in the position numbers;
+ *    out-of-range positions are skipped and empty slots are squeezed out
+ *    before the stream is built.
+ *  - Otherwise tokens are collected in a list ordered by Token::OrderCompare
+ *    and copied out of that list afterwards.
+ *
+ * @param tpv term vector supplying terms, frequencies, offsets and positions
+ * @param tokenPositionsGuaranteedContiguous enables the position-indexed strategy
+ * @return a newly allocated StoredTokenStream, or NULL when getOffsets()
+ *         returns NULL for any term
+ */
+TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous)
+{
+	const TCHAR** terms=tpv->getTerms();
+	const int32_t* freq= (int32_t *)tpv->getTermFrequencies();
+	const int32_t freqLen = tpv->size();
+
+	// Count the tokens and, in the same pass, reject vectors with missing
+	// offsets *before* anything is allocated, so the NULL return cannot leak
+	// tokens that were already created for earlier terms.
+	size_t totalTokens=0;
+	for (int32_t t = 0; t < freqLen; t++)
+	{
+		if (tpv->getOffsets(t) == NULL)
+			return NULL;
+		totalTokens+=freq[t];
+	}
+
+	Token** tokensInOriginalOrder=NULL;
+	CLSetList<Token*,Token::OrderCompare>* unsortedTokens = NULL;
+	for (int32_t t = 0; t < freqLen; t++)
+	{
+		TermVectorOffsetInfo** offsets=(TermVectorOffsetInfo**)tpv->getOffsets(t);
+		if (offsets == NULL)
+			continue; // already checked above; skip defensively rather than leak
+
+		const int32_t* pos=NULL;
+		if (tokenPositionsGuaranteedContiguous)
+		{
+			// try to get the token position info to speed up assembly of
+			// the tokens into a sorted sequence
+			pos=(int32_t *)tpv->getTermPositions(t);
+		}
+
+		if (pos==NULL)
+		{
+			// tokens NOT stored with positions or not guaranteed contiguous -
+			// must add to the list and sort later
+			if (unsortedTokens==NULL)
+				unsortedTokens=_CLNEW CLSetList<Token*,Token::OrderCompare>(false);
+			for (int32_t tp=0; offsets[tp]!=NULL; tp++)
+			{
+				unsortedTokens->insert(_CLNEW Token(terms[t],
+					offsets[tp]->getStartOffset(),
+					offsets[tp]->getEndOffset()));
+			}
+		}
+		else
+		{
+			// Allocate the slot array on first use (the original test was
+			// inverted and therefore never allocated it) and clear it so that
+			// unused slots can be recognised later.
+			if (tokensInOriginalOrder == NULL)
+			{
+				tokensInOriginalOrder = _CL_NEWARRAY(Token*, totalTokens+1);
+				for (size_t i = 0; i <= totalTokens; i++)
+					tokensInOriginalOrder[i] = NULL;
+			}
+
+			// One position per occurrence is expected, so the term frequency
+			// bounds the loop; the NULL terminator of the offsets array is
+			// honoured as well.
+			for (int32_t tp = 0; tp < freq[t] && offsets[tp]!=NULL; tp++)
+			{
+				const int32_t slot = pos[tp];
+				if (slot < 0 || (size_t)slot >= totalTokens)
+					continue; // position outside the contiguous range - cannot be placed
+
+				// Two tokens at the same position: the later one wins, the
+				// earlier one is released instead of being leaked.
+				if (tokensInOriginalOrder[slot] != NULL)
+				{
+					_CLDELETE(tokensInOriginalOrder[slot]);
+				}
+				tokensInOriginalOrder[slot]=_CLNEW Token(terms[t],
+					offsets[tp]->getStartOffset(),
+					offsets[tp]->getEndOffset());
+			}
+		}
+	}
+
+	// If the field has been stored without position data the ordered list
+	// holds the tokens and is copied out into a fresh array.
+	if (unsortedTokens!=NULL)
+	{
+		const size_t sortedCount = unsortedTokens->size();
+
+		// The list content replaces whatever the position-indexed path may
+		// have produced, so release that first.
+		if (tokensInOriginalOrder != NULL)
+		{
+			for (size_t i = 0; i < totalTokens; i++)
+			{
+				if (tokensInOriginalOrder[i] != NULL)
+				{
+					_CLDELETE(tokensInOriginalOrder[i]);
+				}
+			}
+			_CLDELETE_ARRAY(tokensInOriginalOrder);
+		}
+
+		tokensInOriginalOrder = _CL_NEWARRAY(Token*,sortedCount+1);
+		//the list has already sorted our items //todo:check that this is true...
+		unsortedTokens->toArray(tokensInOriginalOrder);
+		tokensInOriginalOrder[sortedCount] = NULL;
+
+		// The list was created with `false` and the tokens now live in the
+		// array, so only the container itself is released here.
+		_CLDELETE(unsortedTokens);
+
+		return _CLNEW StoredTokenStream(tokensInOriginalOrder,sortedCount);
+	}
+
+	// Position-indexed path: squeeze out slots that never received a token so
+	// the stream length matches the number of tokens actually stored.
+	size_t storedCount = 0;
+	if (tokensInOriginalOrder != NULL)
+	{
+		for (size_t i = 0; i < totalTokens; i++)
+		{
+			if (tokensInOriginalOrder[i] != NULL)
+				tokensInOriginalOrder[storedCount++] = tokensInOriginalOrder[i];
+		}
+		for (size_t i = storedCount; i <= totalTokens; i++)
+			tokensInOriginalOrder[i] = NULL;
+	}
+	return _CLNEW StoredTokenStream(tokensInOriginalOrder,storedCount);
+}
+/**
+ * Builds a token stream for a document field purely from its term vector.
+ *
+ * @param reader index reader used to look up the term vector
+ * @param docId  number of the document
+ * @param field  name of the field
+ * @return the stream rebuilt from the field's TermPositionVector
+ * @throws CL_ERR_IllegalArgument when the field has no term vector or the
+ *         vector is not a TermPositionVector
+ */
+TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field)
+{
+	TermFreqVector* tfv=reader->getTermFreqVector(docId,field);
+
+	//todo:bad way of doing this... the downcast is the only check we have
+	//to see whether tfv is a TermPositionVector
+	TermPositionVector* tpv = NULL;
+	if (tfv != NULL)
+	{
+		try{
+			tpv = dynamic_cast<TermPositionVector *> (tfv);
+		}catch(...){
+			// handled below like a vector without position data
+		}
+	}
+
+	if (tpv == NULL)
+	{
+		const size_t bufLen = 250;
+		TCHAR buf[bufLen];
+		_sntprintf(buf,bufLen,_T("%s in doc #%d does not have any term position data stored"),field,docId);
+		buf[bufLen-1] = 0; // make sure the message is terminated even if it was truncated
+		_CLTHROWT(CL_ERR_IllegalArgument,buf);
+	}
+
+	// Reuse the vector that was just cast instead of asking the reader for
+	// the same term vector a second time.
+	return getTokenStream(tpv);
+}
+/**
+ * Convenience method: builds a token stream by running the analyzer over the
+ * stored content of a document field.
+ *
+ * @param reader   index reader used to load the document
+ * @param docId    number of the document
+ * @param field    name of the field whose stored value is analyzed
+ * @param analyzer analyzer that produces the token stream
+ * @return the stream returned by analyzer->tokenStream()
+ * @throws CL_ERR_IllegalArgument when the document has no value for the field
+ */
+TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field,Analyzer* analyzer)
+{
+	// NOTE(review): doc is never released in this function - confirm who owns
+	// the object returned by reader->document() and how long `contents` must
+	// stay valid for the StringReader.
+	CL_NS(document)::Document* doc=reader->document(docId);
+	const TCHAR* contents=doc->get(field);
+	if (contents==NULL)
+	{
+		const size_t bufLen = 250;
+		TCHAR buf[bufLen];
+		_sntprintf(buf,bufLen,_T("Field %s in document #%d is not stored and cannot be analyzed"),field,docId);
+		buf[bufLen-1] = 0; // make sure the message is terminated even if it was truncated
+		_CLTHROWT(CL_ERR_IllegalArgument,buf);
+	}
+	return analyzer->tokenStream(field,_CLNEW StringReader(contents));
+}
+/**
+ * Creates a stream that iterates over an array of tokens.
+ *
+ * @param tokens array of tokens to hand out, in order
+ * @param len    number of entries next() may return
+ */
+TokenSources::StoredTokenStream::StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len)
+{
+	this->tokens = tokens;
+	this->length = len;
+	this->currentToken = 0; // iteration starts at the first entry
+}
+/**
+ * Copies the next stored token into the caller's token.
+ *
+ * @param token receives term text, start/end offset and type of the next entry
+ * @return true when a token was copied; false when the stream is exhausted,
+ *         was built without a token array, or the next slot is empty
+ */
+bool TokenSources::StoredTokenStream::next(CL_NS(analysis)::Token* token)
+{
+	// The stream can be constructed with a NULL array and a non-zero length,
+	// so the array itself has to be checked as well as the index.
+	if (tokens == NULL || currentToken >= length)
+		return false;
+
+	Token* stored = tokens[currentToken];
+	if (stored == NULL)
+		return false; // empty slot: treat as end of stream instead of dereferencing it
+
+	++currentToken;
+	token->set(stored->termText(),stored->startOffset(),stored->endOffset(),stored->type());
+	return true;
+}
+/** Closes the stream; this implementation does nothing. */
+void TokenSources::StoredTokenStream::close()
+{
+}
+CL_NS_END2

changeset 7	a5fbfefd615f
child 10	afe194b6b1cd