#include "CLucene/StdHeader.h"
#include "Highlighter.h"
CL_NS_DEF2(search,highlight)
CL_NS_USE(analysis)
CL_NS_USE(util)
/**
 * Priority queue of TextFragment pointers used to pick the best-scoring
 * fragments of a document.
 */
class FragmentQueue : public CL_NS(util)::PriorityQueue<TextFragment*, CL_NS(util)::Deletor::Object<TextFragment> >
{
public:
    /**
     * @param size capacity handed to the base queue's initialize()
     */
    FragmentQueue(int32_t size)
    {
        // NOTE(review): the second argument presumably asks the base queue to
        // delete the fragments it still holds - confirm against PriorityQueue.
        initialize(size, true);
    }
protected:
    /**
     * Ordering used by the queue: a fragment is "less" when its score is
     * lower; for equal scores the fragment with the higher fragment number
     * is the lesser one.
     */
    bool lessThan(TextFragment * fragA, TextFragment * fragB)
    {
        if (fragA->getScore() != fragB->getScore())
            return fragA->getScore() < fragB->getScore();
        // identical scores: fall back to the fragment numbers
        return fragA->getFragNum() > fragB->getFragNum();
    }
};
/**
 * Builds a highlighter that uses a SimpleFragmenter, a SimpleHTMLFormatter
 * and a DefaultEncoder, all created here and flagged for deletion by this
 * object.
 *
 * @param fragmentScorer scorer used to rate tokens and fragments; it is
 *        stored but not flagged for deletion
 */
Highlighter::Highlighter(HighlightScorer * fragmentScorer):
    delete_formatter(true),
    delete_encoder(true),
    delete_textFragmenter(true),
    delete_fragmentScorer(false)
{
    // borrowed collaborator
    _fragmentScorer = fragmentScorer;
    maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;

    // defaults created (and later deleted) by the highlighter itself
    _textFragmenter = _CLNEW SimpleFragmenter();
    _formatter = _CLNEW SimpleHTMLFormatter();
    _encoder = _CLNEW DefaultEncoder();
}
/**
 * Builds a highlighter with a caller-supplied formatter. A SimpleFragmenter
 * and a DefaultEncoder are created here and flagged for deletion by this
 * object.
 *
 * @param formatter      formatter used to mark up matching terms; stored but
 *                       not flagged for deletion
 * @param fragmentScorer scorer used to rate tokens and fragments; stored but
 *                       not flagged for deletion
 */
Highlighter::Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer):
    delete_formatter(false),
    delete_encoder(true),
    delete_textFragmenter(true),
    delete_fragmentScorer(false)
{
    // borrowed collaborators
    _formatter = formatter;
    _fragmentScorer = fragmentScorer;
    maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;

    // defaults created (and later deleted) by the highlighter itself
    _textFragmenter = _CLNEW SimpleFragmenter();
    _encoder = _CLNEW DefaultEncoder();
}
/**
 * Builds a highlighter with a caller-supplied formatter and encoder. Only
 * the SimpleFragmenter is created here and flagged for deletion by this
 * object.
 *
 * @param formatter      formatter used to mark up matching terms; stored but
 *                       not flagged for deletion
 * @param encoder        encoder applied to text before it is written out;
 *                       stored but not flagged for deletion
 * @param fragmentScorer scorer used to rate tokens and fragments; stored but
 *                       not flagged for deletion
 */
Highlighter::Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer):
    delete_formatter(false),
    // The encoder is supplied by the caller, exactly like the formatter and
    // the scorer, so it must not be deleted by the destructor: doing so
    // risks a double free (or deleting a stack object) on the caller's side.
    delete_encoder(false),
    delete_textFragmenter(true),
    delete_fragmentScorer(false)
{
    maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
    _textFragmenter = _CLNEW SimpleFragmenter();
    _fragmentScorer = fragmentScorer;
    _formatter = formatter;
    _encoder = encoder;
}
/**
 * Releases each collaborator whose delete_* flag is set; collaborators whose
 * flag is clear are left untouched.
 */
Highlighter::~Highlighter()
{
    if (delete_textFragmenter) {
        _CLDELETE(_textFragmenter);
    }
    if (delete_fragmentScorer) {
        _CLDELETE(_fragmentScorer);
    }
    if (delete_formatter) {
        _CLDELETE(_formatter);
    }
    if (delete_encoder) {
        _CLDELETE(_encoder);
    }
}
/**
 * Highlights chosen terms in a text and returns the single best fragment.
 *
 * @param tokenStream stream of tokens identified in the text
 * @param text        text to highlight terms in
 *
 * @return a newly duplicated copy of the first fragment produced by
 *         getBestFragments(tokenStream, text, 1), or NULL when that call
 *         yields no fragment
 */
TCHAR* Highlighter::getBestFragment(TokenStream * tokenStream, const TCHAR* text)
{
    TCHAR** fragments = getBestFragments(tokenStream, text, 1);

    // copy the winner (if any) before the whole result array is released
    TCHAR* best = NULL;
    if (fragments[0] != NULL) {
        best = stringDuplicate(fragments[0]);
    }

    _CLDELETE_CARRAY_ALL(fragments);
    return best;
}
/**
* Highlights chosen terms in a text, extracting the most relevant section.
* This is a convenience method that tokenizes <code>text</code> with the
* given analyzer and then calls
* {@link #getBestFragment(TokenStream, const TCHAR*)}
*
* @param analyzer the analyzer that will be used to split <code>text</code>
* into chunks
* @param fieldName Name of field used to influence analyzer's tokenization policy
* @param text text to highlight terms in
*
* @return highlighted text fragment or NULL if no terms found
*/
TCHAR* Highlighter::getBestFragment(Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text)
{
    // NOTE(review): assumes the token stream does not take ownership of the
    // reader it is given, so the reader can live on the stack - confirm
    // against Tokenizer before relying on this.
    StringReader reader(text);
    TokenStream* tokenStream = analyzer->tokenStream(fieldName, &reader);
    TCHAR* result = getBestFragment(tokenStream, text);
    // the stream was created for this call only; release it instead of
    // leaking it
    _CLDELETE(tokenStream);
    return result;
}
/**
 * Highlights chosen terms in a text and returns the text of the best
 * fragments (contiguous fragments are merged).
 *
 * @param tokenStream     stream of tokens identified in the text
 * @param text            text to highlight terms in
 * @param maxNumFragments maximum number of fragments; values below 1 are
 *                        raised to 1
 *
 * @return a NULL-terminated array holding the text of every returned
 *         fragment whose score is greater than zero. The array is empty
 *         (first element NULL) when nothing scored or when fragment
 *         extraction failed. Callers in this file release it with
 *         _CLDELETE_CARRAY_ALL.
 */
TCHAR** Highlighter::getBestFragments(
    TokenStream * tokenStream,
    const TCHAR* text,
    int32_t maxNumFragments)
{
    maxNumFragments = max((int32_t)1, maxNumFragments); //sanity check
    StringBuffer buffer;
    TextFragment** frags = getBestTextFragments(&buffer,tokenStream,text, true,maxNumFragments);

    //Get text
    CL_NS(util)::StringArray fragTexts;
    // getBestTextFragments returns NULL when it caught an exception; treat
    // that as "no fragments" instead of dereferencing a NULL array
    if (frags != NULL)
    {
        for (uint32_t i=0; frags[i]!=NULL; i++)
        {
            TextFragment* f = frags[i];
            if (f->getScore() > 0)
            {
                fragTexts.push_back(f->toString(&buffer));
            }
            _CLDELETE(f);
        }
        _CLDELETE_ARRAY(frags);
    }

    TCHAR** ret = _CL_NEWARRAY(TCHAR*,fragTexts.size()+1);
    fragTexts.toArray(ret);
    return ret;
}
/**
 * Highlights chosen terms in a text and joins the best fragments into one
 * string.
 *
 * @param tokenStream     stream of tokens identified in the text
 * @param text            text to highlight terms in
 * @param maxNumFragments maximum number of fragments to extract
 * @param separator       text placed between two consecutive fragments
 *
 * @return the fragments joined by <code>separator</code>, as produced by
 *         StringBuffer::toString()
 */
TCHAR* Highlighter::getBestFragments(
    TokenStream * tokenStream,
    const TCHAR* text,
    int32_t maxNumFragments,
    const TCHAR* separator)
{
    TCHAR** sections = getBestFragments(tokenStream, text, maxNumFragments);

    StringBuffer joined;
    for (TCHAR** section = sections; *section != NULL; ++section)
    {
        // every fragment except the first is preceded by the separator
        if (section != sections)
            joined.append(separator);
        joined.append(*section);
    }

    _CLDELETE_CARRAY_ALL(sections);
    return joined.toString();
}
TextFragment** Highlighter::getBestTextFragments(
StringBuffer* writeTo,
TokenStream * tokenStream,
const TCHAR* text,
bool mergeContiguousFragments,
int32_t maxNumFragments)
{
CLArrayList<TextFragment*> docFrags(false);
TextFragment* currentFrag = _CLNEW TextFragment(writeTo->length(), docFrags.size());
_fragmentScorer->startFragment(currentFrag);
docFrags.push_back(currentFrag);
FragmentQueue fragQueue(maxNumFragments);
try
{
int32_t startOffset;
int32_t endOffset;
int32_t lastEndOffset = 0;
int32_t highlightedfrags = 0;
_textFragmenter->start(text);
TCHAR substringBuffer[LUCENE_MAX_WORD_LEN];
TokenGroup* tokenGroup=_CLNEW TokenGroup();
TCHAR buffer[LUCENE_MAX_FIELD_LEN+1];
Token token;
while ( tokenStream->next(&token) )
{
if((tokenGroup->getNumTokens()>0)&&(tokenGroup->isDistinct(&token))){
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup->getStartOffset();
endOffset = tokenGroup->getEndOffset();
_tcsncpy(substringBuffer,text+startOffset,endOffset-startOffset);
substringBuffer[endOffset-startOffset]=_T('\0');
TCHAR* encoded = _encoder->encodeText(substringBuffer);
const TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
_CLDELETE_CARRAY(encoded);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset){
int len = startOffset-lastEndOffset;
if ( len > LUCENE_MAX_FIELD_LEN )
len = LUCENE_MAX_FIELD_LEN;
_tcsncpy(buffer,text+lastEndOffset,len);
buffer[len]=_T('\0');
TCHAR* encoded = _encoder->encodeText(buffer);
writeTo->append(encoded);
_CLDELETE_CARRAY(encoded);
}
writeTo->append(markedUpText);
lastEndOffset=endOffset;
tokenGroup->clear();
_CLDELETE_CARRAY(markedUpText);
//check if current token marks the start of a new fragment
if (_textFragmenter->isNewFragment(&token))
{
float_t score = _fragmentScorer->getFragmentScore();
if(score > 0) highlightedfrags++;
currentFrag->setScore(score);
//record stats for a new fragment
currentFrag->setTextEndPos( writeTo->length() );
currentFrag =_CLNEW TextFragment(writeTo->length(), docFrags.size());
_fragmentScorer->startFragment(currentFrag);
docFrags.push_back(currentFrag);
}
}
// does query contain current token?
float_t score=_fragmentScorer->getTokenScore(&token);
//TCHAR* highlightedTerm = _formatter->highlightTerm(&substringBuffer, token->termText(), score, startOffset);
//newText->append(highlightedTerm);
//_CLDELETE_CARRAY(highlightedTerm);
//_CLDELETE(token);
tokenGroup->addToken(&token,_fragmentScorer->getTokenScore(&token));
if(lastEndOffset>maxDocBytesToAnalyze || highlightedfrags>MAX_FRAGMENTS_TO_HIGHLIGHT)
{
break;
}
}
currentFrag->setScore(_fragmentScorer->getFragmentScore());
if(tokenGroup->getNumTokens()>0)
{
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup->getStartOffset();
endOffset = tokenGroup->getEndOffset();
_tcsncpy(substringBuffer,text+startOffset,endOffset-startOffset);
substringBuffer[endOffset-startOffset]=_T('\0');
TCHAR* encoded = _encoder->encodeText(substringBuffer);
const TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
_CLDELETE_CARRAY(encoded);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset){
int len = startOffset-lastEndOffset;
if ( len > LUCENE_MAX_FIELD_LEN )
len = LUCENE_MAX_FIELD_LEN;
_tcsncpy(buffer,text+lastEndOffset,len);
buffer[len]=_T('\0');
TCHAR* encoded = _encoder->encodeText(buffer);
writeTo->append(encoded);
_CLDELETE_CARRAY(encoded);
}
writeTo->append(markedUpText);
lastEndOffset=endOffset;
_CLDELETE_CARRAY(markedUpText);
}
// append text after end of last token
//if (lastEndOffset < (int32_t)_tcslen(text))
//newText->append(text+lastEndOffset);
currentFrag->setTextEndPos(writeTo->length());
//sort the most relevant sections of the text
while (docFrags.size() > 0) {
//for (TextFragmentList::iterator i = docFrags.begin(); i != docFrags.end(); i++)
//{
currentFrag = (TextFragment*) docFrags[0];
docFrags.remove(0);
//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*if (currentFrag->getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
_CLLDELETE(fragQueue.pop()); // remove lowest in hit queue
minScore = ((TextFragment *) fragQueue.top())->getScore(); // reset minScore
}
} else {
_CLDELETE(currentFrag);
}*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
if ( !fragQueue.insert(currentFrag) )
_CLDELETE(currentFrag);
//todo: check this
}
//return the most relevant fragments
int32_t fragsLen = fragQueue.size();
TextFragment** frags = _CL_NEWARRAY(TextFragment*,fragsLen+1);
for ( int32_t i=0;i<fragsLen;i++ )
frags[i] = fragQueue.pop();
frags[fragsLen]=NULL;
//merge any contiguous fragments to improve readability
if(mergeContiguousFragments)
{
_mergeContiguousFragments(frags,fragsLen);
CLArrayList<TextFragment*> fragTexts;
for (int32_t i = 0; i < fragsLen; i++)
{
TextFragment* tf = frags[i];
if ((tf != NULL) && (tf->getScore() > 0))
fragTexts.push_back(tf);
else
_CLDELETE(tf);
}
_CLDELETE_ARRAY(frags);
frags = _CL_NEWARRAY(TextFragment*,fragTexts.size()+1);
fragTexts.toArray(frags);
}
_CLDELETE(tokenGroup);
if (tokenStream)
{
try
{
tokenStream->close();
}
catch (...)
{
}
}
//_CLDELETE(newText);
return frags;
}
catch(...){
if (tokenStream)
{
try
{
tokenStream->close();
}
catch (...)
{
}
}
return NULL;
}
}
/**
 * Repeatedly merges pairs of fragments where one follows the other (as
 * reported by TextFragment::follows) until a full pass finds nothing left
 * to merge.
 *
 * For every merged pair the earlier fragment absorbs the later one via
 * merge() and is stored in the slot of whichever fragment had the higher
 * score before the merge; the other slot is set to NULL and the later
 * fragment is deleted.
 *
 * @param frag     array of fragments; entries may be NULL and are skipped.
 *                 Nothing is done when the first entry is NULL.
 * @param fragsLen number of entries of <code>frag</code> to examine
 */
void Highlighter::_mergeContiguousFragments(TextFragment** frag, int32_t fragsLen)
{
    if (frag[0] == NULL)
        return;

    bool mergedThisPass;
    do
    {
        mergedThisPass = false;

        //for each fragment, scan other frags looking for contiguous blocks
        for (int32_t i = 0; i < fragsLen; i++)
        {
            if (frag[i] == NULL)
                continue;

            for (int32_t x = 0; x < fragsLen; x++)
            {
                //bug 1072183. don't try and merge with self
                if (x == i || frag[x] == NULL)
                    continue;
                // slot i may have been emptied by a merge earlier in this scan
                if (frag[i] == NULL)
                    break;

                // work out which of the two comes first in the text
                int32_t earlierIdx;
                int32_t laterIdx;
                if (frag[i]->follows(frag[x]))
                {
                    earlierIdx = x;
                    laterIdx = i;
                }
                else if (frag[x]->follows(frag[i]))
                {
                    earlierIdx = i;
                    laterIdx = x;
                }
                else
                {
                    continue; // not contiguous, nothing to merge
                }

                TextFragment* earlier = frag[earlierIdx];
                TextFragment* later = frag[laterIdx];

                // scores are compared before merge() is called
                const bool earlierScoresHigher = earlier->getScore() > later->getScore();
                const int32_t keepIdx = earlierScoresHigher ? earlierIdx : laterIdx;
                const int32_t dropIdx = earlierScoresHigher ? laterIdx : earlierIdx;

                earlier->merge(later);
                frag[dropIdx] = NULL;
                mergedThisPass = true;
                frag[keepIdx] = earlier;
                _CLDELETE(later);
            }
        }
    }
    while (mergedThisPass);
}
CL_NS_END2