/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef _lucene_search_highlight_highlighter_
#define _lucene_search_highlight_highlighter_
#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif
#include "CLucene/util/StringBuffer.h"
#include "CLucene/util/PriorityQueue.h"
#include "CLucene/util/VoidList.h"
#include "CLucene/highlighter/Formatter.h"
#include "CLucene/highlighter/Encoder.h"
#include "CLucene/highlighter/SimpleHTMLFormatter.h"
#include "CLucene/highlighter/Fragmenter.h"
#include "CLucene/highlighter/HighlightScorer.h"
#include "CLucene/highlighter/SimpleFragmenter.h"
#include "CLucene/highlighter/TextFragment.h"
CL_NS_DEF2(search,highlight)
/**
* Class used to markup highlighted terms found in the best sections of a
* text, using configurable {@link Fragmenter}, {@link HighlightScorer}, {@link Formatter},
* {@link Encoder} and tokenizers.
*/
class Highlighter :LUCENE_BASE
{
private:
	/** Upper bound on the number of bytes tokenized per document (see the accessors below). */
	int32_t maxDocBytesToAnalyze;

	// Each collaborator is stored together with a delete_* flag. The setters in
	// this header release the currently held object only while its flag is set,
	// and then clear the flag.
	// NOTE(review): the constructors and destructor are defined outside this
	// header; presumably they initialise and honour these flags - confirm there.
	Formatter * _formatter;
	bool delete_formatter;

	Encoder* _encoder;
	bool delete_encoder;

	Fragmenter * _textFragmenter;
	bool delete_textFragmenter;

	HighlightScorer * _fragmentScorer;
	bool delete_fragmentScorer;

	/** Improves readability of a score-sorted list of TextFragments by merging any fragments
	 * that were contiguous in the original text into one larger fragment with the correct order.
	 * This will leave a "null" in the array entry for the lesser scored fragment.
	 *
	 * @param frag An array of document fragments in descending score
	 * @param fragsLen Number of entries in <code>frag</code>
	 */
	void _mergeContiguousFragments(TextFragment** frag, int32_t fragsLen);

public:
	LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024);
	LUCENE_STATIC_CONSTANT(int32_t, MAX_FRAGMENTS_TO_HIGHLIGHT=2);

	/**
	 * Constructs a Highlighter object with the provided scorer. The HighlightScorer object is owned
	 * by the Highlighter object, and it will be freed in the destructor.
	 */
	Highlighter(HighlightScorer * fragmentScorer);
	Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer);
	Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer);

	/**
	 * Destructor for Highlighter. It deletes the owned HighlightScorer, formatter and textFragmenter.
	 */
	~Highlighter();

	/**
	 * Highlights chosen terms in a text, extracting the most relevant section.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragment with the highest score
	 * is returned
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
	 * This is typically produced by an analyzer re-parsing a document's
	 * text. Some work may be done on retrieving TokenStreams more efficiently
	 * by adding support for storing original text position data in the Lucene
	 * index but this support is not currently available (as of Lucene 1.4 rc2).
	 * @param text text to highlight terms in
	 *
	 * @return highlighted text fragment or null if no terms found
	 */
	TCHAR* getBestFragment(CL_NS(analysis)::TokenStream * tokenStream, const TCHAR* text);

	/**
	 * Highlights chosen terms in a text, extracting the most relevant section.
	 * This is a convenience method that calls
	 * {@link #getBestFragment(TokenStream, const TCHAR*)}
	 *
	 * @param analyzer the analyzer that will be used to split <code>text</code>
	 * into chunks
	 * @param fieldName Name of field used to influence analyzer's tokenization policy
	 * @param text text to highlight terms in
	 *
	 * @return highlighted text fragment or null if no terms found
	 */
	TCHAR* getBestFragment(CL_NS(analysis)::Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text);

	/**
	 * Highlights chosen terms in a text, extracting the most relevant sections.
	 * This is a convenience method that calls
	 * {@link #getBestFragments(TokenStream, const TCHAR*, int)}
	 *
	 * @param analyzer the analyzer that will be used to split <code>text</code>
	 * into chunks
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments.
	 *
	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
	 */
	TCHAR** getBestFragments(
		CL_NS(analysis)::Analyzer* analyzer,
		const TCHAR* text,
		int32_t maxNumFragments);

	/**
	 * Highlights chosen terms in a text, extracting the most relevant sections.
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned as an array of strings in order of score (contiguous fragments are merged into
	 * one in their original order to improve readability)
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments.
	 *
	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
	 */
	TCHAR** getBestFragments(
		CL_NS(analysis)::TokenStream * tokenStream,
		const TCHAR* text,
		int32_t maxNumFragments);

	/**
	 * Low level api to get the most relevant (formatted) sections of the document.
	 * This method has been made public to allow visibility of score information held in TextFragment objects.
	 * Thanks to Jason Calabrese for help in redefining the interface.
	 *
	 * @param writeTo buffer supplied by the caller; presumably receives the
	 * formatted text the returned fragments refer to - TODO confirm against the implementation
	 * @param tokenStream a stream of tokens identified in the text parameter
	 * @param text text to highlight terms in
	 * @param mergeContiguousFragments presumably selects whether fragments that were
	 * adjacent in the original text are merged - TODO confirm against the implementation
	 * @param maxNumFragments the maximum number of fragments.
	 */
	TextFragment** getBestTextFragments(
		CL_NS(util)::StringBuffer* writeTo,
		CL_NS(analysis)::TokenStream * tokenStream,
		const TCHAR* text,
		bool mergeContiguousFragments,
		int32_t maxNumFragments);

	/**
	 * Highlights terms in the text, extracting the most relevant sections
	 * and concatenating the chosen fragments with a separator (typically "...").
	 * The document text is analysed in chunks to record hit statistics
	 * across the document. After accumulating stats, the fragments with the highest scores
	 * are returned in order as "separator" delimited strings.
	 *
	 * @param tokenStream a stream of tokens identified in the text parameter
	 * @param text text to highlight terms in
	 * @param maxNumFragments the maximum number of fragments.
	 * @param separator the separator used to intersperse the document fragments (typically "...")
	 *
	 * @return highlighted text
	 */
	TCHAR* getBestFragments(
		CL_NS(analysis)::TokenStream * tokenStream,
		const TCHAR* text,
		int32_t maxNumFragments,
		const TCHAR* separator);

	/**
	 * @return the maximum number of bytes to be tokenized per doc
	 */
	int32_t getMaxDocBytesToAnalyze()
	{
		return maxDocBytesToAnalyze;
	}

	/**
	 * @param byteCount the maximum number of bytes to be tokenized per doc
	 * (This can improve performance with large documents)
	 */
	void setMaxDocBytesToAnalyze(int32_t byteCount)
	{
		maxDocBytesToAnalyze = byteCount;
	}

	/**
	 * @return the fragmenter currently held by this Highlighter
	 */
	Fragmenter * getTextFragmenter()
	{
		return _textFragmenter;
	}

	/**
	 * Replaces the fragmenter. If the current fragmenter is flagged for deletion it
	 * is released first; the flag is then cleared, so this setter will not release
	 * the newly supplied object on a later call.
	 *
	 * @param fragmenter the fragmenter to use from now on
	 */
	void setTextFragmenter(Fragmenter * fragmenter)
	{
		// Re-setting the object already held must not release it: we would
		// end up storing a pointer to a destroyed object.
		if ( fragmenter == _textFragmenter )
			return;

		if ( delete_textFragmenter ){
			_CLDELETE(_textFragmenter);
			delete_textFragmenter = false;
		}
		_textFragmenter = fragmenter;
	}

	/**
	 * @return Object used to score each text fragment
	 */
	HighlightScorer * getFragmentScorer()
	{
		return _fragmentScorer;
	}

	/**
	 * Replaces the fragment scorer. If the current scorer is flagged for deletion it
	 * is released first; the flag is then cleared, so this setter will not release
	 * the newly supplied object on a later call.
	 *
	 * @param scorer the scorer to use from now on
	 */
	void setFragmentScorer(HighlightScorer * scorer)
	{
		// Re-setting the object already held must not release it.
		if ( scorer == _fragmentScorer )
			return;

		if ( delete_fragmentScorer ){
			// Release the scorer we hold, never the incoming argument.
			_CLDELETE(_fragmentScorer);
			delete_fragmentScorer = false;
		}
		_fragmentScorer = scorer;
	}

	/**
	 * @return the encoder currently held by this Highlighter
	 */
	Encoder* getEncoder()
	{
		return _encoder;
	}

	/**
	 * Replaces the encoder. If the current encoder is flagged for deletion it
	 * is released first; the flag is then cleared, so this setter will not release
	 * the newly supplied object on a later call.
	 *
	 * @param encoder the encoder to use from now on
	 */
	void setEncoder(Encoder* encoder)
	{
		// Re-setting the object already held must not release it.
		if ( encoder == _encoder )
			return;

		if ( delete_encoder ){
			// Release the encoder we hold, never the incoming argument.
			_CLDELETE(_encoder);
			delete_encoder = false;
		}
		_encoder = encoder;
	}
};
CL_NS_END2
#endif