--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/highlighter/Highlighter.cpp Fri Jun 11 14:43:47 2010 +0300
@@ -0,0 +1,439 @@
+#include "CLucene/StdHeader.h"
+#include "Highlighter.h"
+
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+
+ /**
+  * Priority queue ranking TextFragments: a higher score wins; when scores
+  * tie, the fragment occurring earlier in the document (lower fragNum) is
+  * preferred. initialize(size, true) — second arg presumably requests
+  * owned/pre-populated storage per CLucene's PriorityQueue contract.
+  */
+ class FragmentQueue : public CL_NS(util)::PriorityQueue<TextFragment*, CL_NS(util)::Deletor::Object<TextFragment> >
+ {
+ public:
+     FragmentQueue(int32_t size)
+     {
+         initialize(size, true);
+     }
+
+ protected:
+     bool lessThan(TextFragment * fragA, TextFragment * fragB)
+     {
+         // Primary key: score. Secondary key: earlier fragments rank higher.
+         if (fragA->getScore() != fragB->getScore())
+             return fragA->getScore() < fragB->getScore();
+         return fragA->getFragNum() > fragB->getFragNum();
+     }
+ };
+
+
+ /**
+  * Convenience constructor: the caller keeps ownership of the scorer while
+  * every other collaborator is a default instance created — and eventually
+  * deleted — by this Highlighter.
+  */
+ Highlighter::Highlighter(HighlightScorer * fragmentScorer):
+     delete_formatter(true),
+     delete_encoder(true),
+     delete_textFragmenter(true),
+     delete_fragmentScorer(false)
+ {
+     maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+     _fragmentScorer = fragmentScorer;
+     _formatter = _CLNEW SimpleHTMLFormatter();
+     _encoder = _CLNEW DefaultEncoder();
+     _textFragmenter = _CLNEW SimpleFragmenter();
+ }
+
+ /**
+  * Constructor taking a caller-owned formatter and scorer; the fragmenter
+  * and encoder are defaults owned (and later deleted) by this instance.
+  */
+ Highlighter::Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer):
+     delete_formatter(false),
+     delete_encoder(true),
+     delete_textFragmenter(true),
+     delete_fragmentScorer(false)
+ {
+     maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+     _formatter = formatter;
+     _fragmentScorer = fragmentScorer;
+     _encoder = _CLNEW DefaultEncoder();
+     _textFragmenter = _CLNEW SimpleFragmenter();
+ }
+
+ /**
+  * Constructor taking caller-owned formatter, encoder and scorer; only the
+  * default SimpleFragmenter created here is owned and deleted by this
+  * instance.
+  */
+ Highlighter::Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer):
+     delete_formatter(false),
+     // FIX: the encoder is supplied by the caller, exactly like the formatter,
+     // so this Highlighter must not delete it. The previous value (true) made
+     // ~Highlighter destroy the caller's encoder — a double-delete waiting to
+     // happen. Mirrors delete_formatter(false) in the two-argument constructor.
+     delete_encoder(false),
+     delete_textFragmenter(true),
+     delete_fragmentScorer(false)
+ {
+     maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+     _textFragmenter = _CLNEW SimpleFragmenter();
+     _fragmentScorer = fragmentScorer;
+     _formatter = formatter;
+     _encoder = encoder;
+ }
+
+ /**
+  * Releases only the collaborators this instance created itself; the
+  * delete_* flags record which ones were caller-supplied and must survive.
+  */
+ Highlighter::~Highlighter()
+ {
+     if (delete_formatter)
+         _CLDELETE(_formatter);
+     if (delete_encoder)
+         _CLDELETE(_encoder);
+     if (delete_textFragmenter)
+         _CLDELETE(_textFragmenter);
+     if (delete_fragmentScorer)
+         _CLDELETE(_fragmentScorer);
+ }
+
+ /**
+  * Returns the single best highlighted fragment of text, or NULL when no
+  * fragment was produced. The caller owns the returned string.
+  */
+ TCHAR* Highlighter::getBestFragment(TokenStream * tokenStream, const TCHAR* text)
+ {
+     // Ask for exactly one fragment, copy it out, then free the array.
+     TCHAR** fragments = getBestFragments(tokenStream, text, 1);
+     TCHAR* best = 0;
+     if (fragments[0] != NULL)
+         best = stringDuplicate(fragments[0]);
+     _CLDELETE_CARRAY_ALL(fragments);
+     return best;
+ }
+
+ /**
+ * Highlights chosen terms in a text, extracting the most relevant section.
+ * This is a convenience method that calls
+ * {@link #getBestFragment(TokenStream, const TCHAR*)}
+ *
+ * @param analyzer the analyzer that will be used to split <code>text</code>
+ * into chunks
+ * @param text text to highlight terms in
+ * @param fieldName Name of field used to influence analyzer's tokenization policy
+ *
+ * @return highlighted text fragment or NULL if no terms found
+ */
+ TCHAR* Highlighter::getBestFragment(Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text)
+ {
+     TokenStream* tokenStream = analyzer->tokenStream(fieldName, _CLNEW StringReader(text));
+     TCHAR* result = getBestFragment(tokenStream, text);
+     // FIX: the stream was created for this call only. getBestTextFragments
+     // closes it in its _CLFINALLY block but never frees it, so the stream
+     // (and whatever it owns) leaked on every invocation.
+     // NOTE(review): assumes the tokenizer takes ownership of the StringReader
+     // passed to tokenStream() — confirm against the Analyzer implementations
+     // used with this class before relying on this in other call sites.
+     _CLDELETE(tokenStream);
+     return result;
+ }
+
+ /**
+  * Returns up to maxNumFragments highlighted fragments as a NULL-terminated
+  * array of strings; only fragments with a positive score are kept. The
+  * caller owns both the array and its strings.
+  */
+ TCHAR** Highlighter::getBestFragments(
+     TokenStream * tokenStream,
+     const TCHAR* text,
+     int32_t maxNumFragments)
+ {
+     //sanity check: always request at least one fragment
+     if (maxNumFragments < 1)
+         maxNumFragments = 1;
+
+     StringBuffer buffer;
+     TextFragment** frags = getBestTextFragments(&buffer, tokenStream, text, true, maxNumFragments);
+
+     // Render every positively scored fragment to text, releasing each
+     // fragment object as we go.
+     CL_NS(util)::StringArray fragTexts;
+     for (uint32_t idx = 0; frags[idx] != NULL; ++idx)
+     {
+         TextFragment* frag = frags[idx];
+         if (frag->getScore() > 0)
+             fragTexts.push_back(frag->toString(&buffer));
+         _CLDELETE(frag);
+     }
+     _CLDELETE_ARRAY(frags);
+
+     // Transfer the strings into a NULL-terminated C array for the caller.
+     TCHAR** ret = _CL_NEWARRAY(TCHAR*, fragTexts.size()+1);
+     fragTexts.toArray(ret);
+     return ret;
+ }
+
+ /**
+  * Convenience overload: joins the best fragments into one caller-owned
+  * string, inserting separator between consecutive fragments.
+  */
+ TCHAR* Highlighter::getBestFragments(
+     TokenStream * tokenStream,
+     const TCHAR* text,
+     int32_t maxNumFragments,
+     const TCHAR* separator)
+ {
+     TCHAR** sections = getBestFragments(tokenStream, text, maxNumFragments);
+
+     StringBuffer joined;
+     for (int32_t idx = 0; sections[idx] != NULL; ++idx)
+     {
+         // Separator goes before every fragment except the first.
+         if (idx > 0)
+             joined.append(separator);
+         joined.append(sections[idx]);
+     }
+
+     _CLDELETE_CARRAY_ALL(sections);
+     return joined.toString();
+ }
+
+ /**
+  * Core highlighting routine: walks the token stream, writes the encoded
+  * and marked-up text into writeTo, partitions it into fragments via the
+  * fragmenter, and returns the maxNumFragments best fragments as a
+  * NULL-terminated, caller-owned array (fragments hold offsets into writeTo).
+  *
+  * @param writeTo buffer that receives the full marked-up text
+  * @param tokenStream token source; closed (but not deleted) on exit
+  * @param text the original text being highlighted
+  * @param mergeContiguousFragments when true, adjacent fragments are merged
+  *        and zero-score fragments are dropped from the result
+  * @param maxNumFragments upper bound on fragments returned
+  */
+ TextFragment** Highlighter::getBestTextFragments(
+     StringBuffer* writeTo,
+     TokenStream * tokenStream,
+     const TCHAR* text,
+     bool mergeContiguousFragments,
+     int32_t maxNumFragments)
+ {
+     // The list does not own its elements (false): every fragment is either
+     // handed to fragQueue or deleted explicitly below.
+     CLArrayList<TextFragment*> docFrags(false);
+     TextFragment* currentFrag = _CLNEW TextFragment(writeTo->length(), docFrags.size());
+     _fragmentScorer->startFragment(currentFrag);
+     docFrags.push_back(currentFrag);
+
+     // Bounded queue keeping only the maxNumFragments best-scoring fragments.
+     FragmentQueue fragQueue(maxNumFragments);
+
+     try
+     {
+         int32_t startOffset;
+         int32_t endOffset;
+         int32_t lastEndOffset = 0;
+         _textFragmenter->start(text);
+         TCHAR substringBuffer[LUCENE_MAX_WORD_LEN];
+
+         // NOTE(review): if an exception escapes below, tokenGroup leaks —
+         // the _CLFINALLY block only closes the stream. Consider hoisting the
+         // allocation out of the try, or freeing it in the finally clause.
+         TokenGroup* tokenGroup=_CLNEW TokenGroup();
+
+         TCHAR buffer[LUCENE_MAX_FIELD_LEN+1];
+         Token token;
+         while ( tokenStream->next(&token) )
+         {
+             if((tokenGroup->getNumTokens()>0)&&(tokenGroup->isDistinct(&token))){
+                 //the current token is distinct from previous tokens -
+                 // markup the cached token group info
+                 startOffset = tokenGroup->getStartOffset();
+                 endOffset = tokenGroup->getEndOffset();
+
+                 // NOTE(review): endOffset-startOffset is copied unchecked into
+                 // a LUCENE_MAX_WORD_LEN buffer; the whitespace copy further
+                 // down clamps its length, this one does not. Confirm a
+                 // TokenGroup can never span more TCHARs than the buffer holds.
+                 _tcsncpy(substringBuffer,text+startOffset,endOffset-startOffset);
+                 substringBuffer[endOffset-startOffset]=_T('\0');
+
+                 // Encode (e.g. HTML-escape) the raw group text, then let the
+                 // formatter wrap it in highlight markup.
+                 TCHAR* encoded = _encoder->encodeText(substringBuffer);
+                 const TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
+                 _CLDELETE_CARRAY(encoded);
+
+                 //store any whitespace etc from between this and last group
+                 if (startOffset > lastEndOffset){
+                     int len = startOffset-lastEndOffset;
+                     if ( len > LUCENE_MAX_FIELD_LEN )
+                         len = LUCENE_MAX_FIELD_LEN;
+                     _tcsncpy(buffer,text+lastEndOffset,len);
+                     buffer[len]=_T('\0');
+
+                     TCHAR* encoded = _encoder->encodeText(buffer);
+                     writeTo->append(encoded);
+                     _CLDELETE_CARRAY(encoded);
+                 }
+                 writeTo->append(markedUpText);
+                 lastEndOffset=endOffset;
+                 tokenGroup->clear();
+                 _CLDELETE_CARRAY(markedUpText);
+
+                 //check if current token marks the start of a new fragment
+                 if (_textFragmenter->isNewFragment(&token))
+                 {
+                     currentFrag->setScore(_fragmentScorer->getFragmentScore());
+                     //record stats for a new fragment
+                     currentFrag->setTextEndPos( writeTo->length() );
+                     currentFrag =_CLNEW TextFragment(writeTo->length(), docFrags.size());
+                     _fragmentScorer->startFragment(currentFrag);
+                     docFrags.push_back(currentFrag);
+                 }
+             }
+
+             // does query contain current token?
+             // NOTE(review): getTokenScore is called again in addToken below
+             // and this first result is unused — confirm the scorer tolerates
+             // the duplicate call (it may skew per-fragment scores otherwise).
+             float_t score=_fragmentScorer->getTokenScore(&token);
+             //TCHAR* highlightedTerm = _formatter->highlightTerm(&substringBuffer, token->termText(), score, startOffset);
+             //newText->append(highlightedTerm);
+             //_CLDELETE_CARRAY(highlightedTerm);
+             //_CLDELETE(token);
+
+             tokenGroup->addToken(&token,_fragmentScorer->getTokenScore(&token));
+
+             // Stop analyzing once enough of the document has been consumed.
+             if(lastEndOffset>maxDocBytesToAnalyze)
+             {
+                 break;
+             }
+         }
+         currentFrag->setScore(_fragmentScorer->getFragmentScore());
+
+         if(tokenGroup->getNumTokens()>0)
+         {
+             //flush the accumulated text (same code as in above loop)
+             startOffset = tokenGroup->getStartOffset();
+             endOffset = tokenGroup->getEndOffset();
+
+             // NOTE(review): same unchecked copy as above — see earlier note.
+             _tcsncpy(substringBuffer,text+startOffset,endOffset-startOffset);
+             substringBuffer[endOffset-startOffset]=_T('\0');
+
+             TCHAR* encoded = _encoder->encodeText(substringBuffer);
+             const TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
+             _CLDELETE_CARRAY(encoded);
+
+             //store any whitespace etc from between this and last group
+             if (startOffset > lastEndOffset){
+                 int len = startOffset-lastEndOffset;
+                 if ( len > LUCENE_MAX_FIELD_LEN )
+                     len = LUCENE_MAX_FIELD_LEN;
+                 _tcsncpy(buffer,text+lastEndOffset,len);
+                 buffer[len]=_T('\0');
+
+                 TCHAR* encoded = _encoder->encodeText(buffer);
+                 writeTo->append(encoded);
+                 _CLDELETE_CARRAY(encoded);
+             }
+             writeTo->append(markedUpText);
+             lastEndOffset=endOffset;
+
+             _CLDELETE_CARRAY(markedUpText);
+         }
+
+         // append text after end of last token
+         //if (lastEndOffset < (int32_t)_tcslen(text))
+         //newText->append(text+lastEndOffset);
+
+         currentFrag->setTextEndPos(writeTo->length());
+
+         //sort the most relevant sections of the text
+         // Drain docFrags into the bounded queue; fragments the queue rejects
+         // (score too low to make the top maxNumFragments) are freed here.
+         while (docFrags.size() > 0) {
+             //for (TextFragmentList::iterator i = docFrags.begin(); i != docFrags.end(); i++)
+             //{
+             currentFrag = (TextFragment*) docFrags[0];
+             docFrags.remove(0);
+
+             //If you are running with a version of Lucene before 11th Sept 03
+             // you do not have PriorityQueue.insert() - so uncomment the code below
+
+             /*if (currentFrag->getScore() >= minScore)
+             {
+                 fragQueue.put(currentFrag);
+                 if (fragQueue.size() > maxNumFragments)
+                 { // if hit queue overfull
+                     _CLLDELETE(fragQueue.pop()); // remove lowest in hit queue
+                     minScore = ((TextFragment *) fragQueue.top())->getScore(); // reset minScore
+                 }
+
+
+             } else {
+                 _CLDELETE(currentFrag);
+             }*/
+
+             //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+             //fix to PriorityQueue. The correct method to use here is the new "insert" method
+             // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+             if ( !fragQueue.insert(currentFrag) )
+                 _CLDELETE(currentFrag);
+
+             //todo: check this
+         }
+
+         //return the most relevant fragments
+         // NOTE(review): pop() yields the lowest-ranked fragment first, so the
+         // array is filled worst-first here; Java Lucene fills it from the end
+         // (best-first). Harmless for maxNumFragments==1 — confirm callers of
+         // the multi-fragment path expect this order.
+         int32_t fragsLen = fragQueue.size();
+         TextFragment** frags = _CL_NEWARRAY(TextFragment*,fragsLen+1);
+         for ( int32_t i=0;i<fragsLen;i++ )
+             frags[i] = fragQueue.pop();
+         frags[fragsLen]=NULL;
+
+         //merge any contiguous fragments to improve readability
+         if(mergeContiguousFragments)
+         {
+             _mergeContiguousFragments(frags,fragsLen);
+             // Keep only surviving, positively scored fragments; free the rest
+             // and rebuild the NULL-terminated result array.
+             CLArrayList<TextFragment*> fragTexts;
+             for (int32_t i = 0; i < fragsLen; i++)
+             {
+                 TextFragment* tf = frags[i];
+                 if ((tf != NULL) && (tf->getScore() > 0))
+                     fragTexts.push_back(tf);
+                 else
+                     _CLDELETE(tf);
+             }
+             _CLDELETE_ARRAY(frags);
+             frags = _CL_NEWARRAY(TextFragment*,fragTexts.size()+1);
+             fragTexts.toArray(frags);
+         }
+
+         _CLDELETE(tokenGroup);
+         //_CLDELETE(newText);
+         return frags;
+
+     }
+     // Always close the stream, even on exceptions; any error raised by
+     // close() itself is deliberately swallowed (best-effort cleanup).
+     _CLFINALLY(
+         if (tokenStream)
+         {
+             try
+             {
+                 tokenStream->close();
+             }
+             catch (...)
+             {
+             }
+         }
+     )
+ }
+
+
+ /**
+  * Merges adjacent text fragments in place to improve readability.
+  * Repeatedly scans all pairs; whenever one fragment directly follows
+  * another, the pair is merged into frag1 (the earlier one), the merged
+  * fragment is stored at the better-scoring pair member's slot, the other
+  * slot is set to NULL, and frag2 is freed. Iterates until a full pass
+  * performs no merge. Callers must therefore expect NULL holes in frag[].
+  *
+  * @param frag     array of fragment pointers; entries may be NULLed out
+  * @param fragsLen number of entries in frag
+  *
+  * NOTE(review): the frag[0] != NULL guard skips merging entirely whenever
+  * the first slot is empty, even if later slots hold fragments — confirm
+  * callers never pass such an array.
+  */
+ void Highlighter::_mergeContiguousFragments(TextFragment** frag, int32_t fragsLen)
+ {
+     bool mergingStillBeingDone;
+     if ( frag[0] != NULL )
+         do
+         {
+             mergingStillBeingDone = false; //initialise loop control flag
+             //for each fragment, scan other frags looking for contiguous blocks
+             for (int32_t i=0; i<fragsLen; i++)
+             {
+                 if (frag[i] == NULL)
+                 {
+                     continue;
+                 }
+                 //merge any contiguous blocks
+                 for (int32_t x=0; x<fragsLen; x++)
+                 {
+                     if ( x==i )
+                         continue; //bug 1072183. don't try and merge with self
+
+                     if (frag[x] == NULL)
+                         continue;
+                     // frag[i] may have been NULLed by a merge in this pass.
+                     if (frag[i] == NULL)
+                         break;
+
+                     TextFragment * frag1 = NULL;
+                     TextFragment * frag2 = NULL;
+                     int32_t frag1Num = 0;
+                     int32_t frag2Num = 0;
+                     int32_t bestScoringFragNum;
+                     int32_t worstScoringFragNum;
+                     //if blocks are contiguous....
+                     // frag1 is always the earlier fragment, frag2 the one that
+                     // immediately follows it.
+                     if (frag[i]->follows(frag[x]))
+                     {
+                         frag1 = frag[x];
+                         frag1Num = x;
+                         frag2 = frag[i];
+                         frag2Num = i;
+                     }
+                     else if (frag[x]->follows(frag[i]))
+                     {
+                         frag1 = frag[i];
+                         frag1Num = i;
+                         frag2 = frag[x];
+                         frag2Num = x;
+                     }
+                     //merging required..
+                     if (frag1 != NULL)
+                     {
+                         // The merged fragment inherits the slot of whichever
+                         // member scored higher, so ranking order is preserved.
+                         if (frag1->getScore() > frag2->getScore())
+                         {
+                             bestScoringFragNum = frag1Num;
+                             worstScoringFragNum = frag2Num;
+                         }
+                         else
+                         {
+                             bestScoringFragNum = frag2Num;
+                             worstScoringFragNum = frag1Num;
+                         }
+                         frag1->merge(frag2);
+                         frag[worstScoringFragNum]= NULL;
+                         mergingStillBeingDone = true;
+                         frag[bestScoringFragNum]=frag1;
+                         // frag2's content now lives in frag1; free the husk.
+                         _CLDELETE(frag2);
+                     }
+                 }
+             }
+         }
+         while (mergingStillBeingDone);
+ }
+
+
+
+CL_NS_END2