FCL/sf/mw/searchsrv: comparison searchengine/oss/cl/clucene/src/clucene/highlighter/Highlighter.h

equal deleted inserted replaced

-:ae3f1779f6da
+:a5fbfefd615f
+/**
+* Copyright 2002-2004 The Apache Software Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+#ifndef _lucene_search_highlight_highlighter_
+#define _lucene_search_highlight_highlighter_
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/PriorityQueue.h"
+#include "CLucene/util/VoidList.h"
+#include "CLucene/highlighter/Formatter.h"
+#include "CLucene/highlighter/Encoder.h"
+#include "CLucene/highlighter/SimpleHTMLFormatter.h"
+#include "CLucene/highlighter/Fragmenter.h"
+#include "CLucene/highlighter/HighlightScorer.h"
+#include "CLucene/highlighter/SimpleFragmenter.h"
+#include "CLucene/highlighter/TextFragment.h"
+CL_NS_DEF2(search,highlight)
+/**
+* Class used to markup highlighted terms found in the best sections of a
+* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+* and tokenizers.
+* {@link Encoder} and tokenizers.
+*/
+class Highlighter :LUCENE_BASE
+{
+private:
+	int32_t maxDocBytesToAnalyze;
+	Formatter * _formatter;
+	bool delete_formatter;
+	Encoder* _encoder;
+	bool delete_encoder;
+	Fragmenter * _textFragmenter;
+	bool delete_textFragmenter;
+	HighlightScorer * _fragmentScorer;
+	bool delete_fragmentScorer;
+	/** Improves readability of a score-sorted list of TextFragments by merging any fragments
+	 * that were contiguous in the original text into one larger fragment with the correct order.
+	 * This will leave a "null" in the array entry for the lesser scored fragment.
+	 *
+	 * @param frag An array of document fragments in descending score
+	 */
+	void _mergeContiguousFragments(TextFragment** frag, int32_t fragsLen);
+public:
+	LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024);
+	/**
+	 * Constructs a Highlighter object with the provided scorer. The HighlightScorer object is owned
+	 * by the Highlighter object, and it will freed in the destructor.
+	 */
+	Highlighter(HighlightScorer * fragmentScorer);
+	Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer);
+	Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer);
+	/**
+	 * Destructor for Highlighter. It deletes the owned HighlightScorer, formatter and textFragmenter.
+	 */
+	~Highlighter();
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant section.
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragment with the highest score
+	 * is returned
+	 *
+	 * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
+	 * This is typically produced by an analyzer re-parsing a document's
+	 * text. Some work may be done on retrieving TokenStreams more efficently
+	 * by adding support for storing original text position data in the Lucene
+	 * index but this support is not currently available (as of Lucene 1.4 rc2).
+	 * @param text text to highlight terms in
+	 *
+	 * @return highlighted text fragment or null if no terms found
+	 */
+	TCHAR* getBestFragment(CL_NS(analysis)::TokenStream * tokenStream, const TCHAR* text);
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant section.
+	 * This is a convenience method that calls
+	 * {@link #getBestFragment(TokenStream, const TCHAR*)}
+	 *
+	 * @param analyzer   the analyzer that will be used to split <code>text</code>
+	 * into chunks
+	 * @param text text to highlight terms in
+	 * @param fieldName Name of field used to influence analyzer's tokenization policy
+	 *
+	 * @return highlighted text fragment or null if no terms found
+	 */
+	TCHAR* getBestFragment(CL_NS(analysis)::Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text);
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant sections.
+	 * This is a convenience method that calls
+	 * {@link #getBestFragments(TokenStream, const TCHAR*, int)}
+	 *
+	 * @param analyzer   the analyzer that will be used to split <code>text</code>
+	 * into chunks
+	 * @param text        	text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 *
+	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 */
+	TCHAR** getBestFragments(
+		CL_NS(analysis)::Analyzer* analyzer,
+		const TCHAR* text,
+		int32_t maxNumFragments);
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant sections.
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragments with the highest scores
+	 * are returned as an array of strings in order of score (contiguous fragments are merged into
+	 * one in their original order to improve readability)
+	 *
+	 * @param text        	text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 *
+	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 */
+	 TCHAR** getBestFragments(
+		CL_NS(analysis)::TokenStream * tokenStream,
+		const TCHAR* text,
+		int32_t maxNumFragments);
+	/**
+* Low level api to get the most relevant (formatted) sections of the document.
+	* This method has been made public to allow visibility of score information held in TextFragment objects.
+	* Thanks to Jason Calabrese for help in redefining the interface.
+* @param tokenStream
+* @param text
+* @param maxNumFragments
+* @param mergeContiguousFragments
+*/
+	TextFragment** getBestTextFragments(
+		CL_NS(util)::StringBuffer* writeTo,
+		CL_NS(analysis)::TokenStream * tokenStream,
+		const TCHAR* text,
+		bool mergeContiguousFragments,
+		int32_t maxNumFragments);
+	/**
+	 * Highlights terms in the  text , extracting the most relevant sections
+	 * and concatenating the chosen fragments with a separator (typically "...").
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragments with the highest scores
+	 * are returned in order as "separator" delimited strings.
+	 *
+	 * @param text        text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 * @param separator  the separator used to intersperse the document fragments (typically "...")
+	 *
+	 * @return highlighted text
+	 */
+	TCHAR* getBestFragments(
+		CL_NS(analysis)::TokenStream * tokenStream,
+		const TCHAR* text,
+		int32_t maxNumFragments,
+		const TCHAR* separator);
+	/**
+	 * @return the maximum number of bytes to be tokenized per doc
+	 */
+	int32_t getMaxDocBytesToAnalyze()
+	{
+		return maxDocBytesToAnalyze;
+	}
+	/**
+	 * @param byteCount the maximum number of bytes to be tokenized per doc
+	 * (This can improve performance with large documents)
+	 */
+	void setMaxDocBytesToAnalyze(int32_t byteCount)
+	{
+		maxDocBytesToAnalyze = byteCount;
+	}
+	/**
+	 */
+	Fragmenter * getTextFragmenter()
+	{
+		return _textFragmenter;
+	}
+	/**
+	 * @param fragmenter
+	 */
+	void setTextFragmenter(Fragmenter * fragmenter)
+	{
+		if ( delete_textFragmenter ){
+			_CLDELETE(_textFragmenter);
+			delete_textFragmenter = false;
+		}
+		_textFragmenter = fragmenter;
+	}
+	/**
+	 * @return Object used to score each text fragment
+	 */
+	HighlightScorer * getFragmentScorer()
+	{
+		return _fragmentScorer;
+	}
+	/**
+	 * @param HighlightScorer
+	 */
+	void setFragmentScorer(HighlightScorer * scorer)
+	{
+		if ( delete_fragmentScorer ){
+			delete_fragmentScorer = false;
+			_CLDELETE(scorer);
+		}
+		_fragmentScorer = scorer;
+	}
+Encoder* getEncoder()
+{
+return _encoder;
+}
+void setEncoder(Encoder* encoder)
+{
+		if ( delete_encoder ){
+			_CLDELETE(encoder);
+			delete_encoder = false;
+		}
+this->_encoder = encoder;
+}
+};
+CL_NS_END2
+#endif

changeset 7	a5fbfefd615f
child 21	2c484ac32ef0