searchengine/oss/cl/clucene/src/clucene/highlighter/Highlighter.h
changeset 7 a5fbfefd615f
child 21 2c484ac32ef0
equal deleted inserted replaced
3:ae3f1779f6da 7:a5fbfefd615f
       
     1 /**
       
     2  * Copyright 2002-2004 The Apache Software Foundation
       
     3  *
       
     4  * Licensed under the Apache License, Version 2.0 (the "License");
       
     5  * you may not use this file except in compliance with the License.
       
     6  * You may obtain a copy of the License at
       
     7  *
       
     8  *     http://www.apache.org/licenses/LICENSE-2.0
       
     9  *
       
    10  * Unless required by applicable law or agreed to in writing, software
       
    11  * distributed under the License is distributed on an "AS IS" BASIS,
       
    12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    13  * See the License for the specific language governing permissions and
       
    14  * limitations under the License.
       
    15  */
       
    16 
       
    17 #ifndef _lucene_search_highlight_highlighter_
       
    18 #define _lucene_search_highlight_highlighter_
       
    19 
       
    20 #if defined(_LUCENE_PRAGMA_ONCE)
       
    21 # pragma once
       
    22 #endif
       
    23 
       
    24 #include "CLucene/util/StringBuffer.h"
       
    25 #include "CLucene/util/PriorityQueue.h"
       
    26 #include "CLucene/util/VoidList.h"
       
    27 #include "CLucene/highlighter/Formatter.h"
       
    28 #include "CLucene/highlighter/Encoder.h"
       
    29 #include "CLucene/highlighter/SimpleHTMLFormatter.h"
       
    30 #include "CLucene/highlighter/Fragmenter.h"
       
    31 #include "CLucene/highlighter/HighlightScorer.h"
       
    32 #include "CLucene/highlighter/SimpleFragmenter.h"
       
    33 #include "CLucene/highlighter/TextFragment.h"
       
    34 
       
    35 CL_NS_DEF2(search,highlight)
       
    36 
       
    37 /**
       
    38 * Class used to markup highlighted terms found in the best sections of a
       
     39 * text, using a configurable {@link Fragmenter},
       
     40 * {@link HighlightScorer}, {@link Formatter},
       
     41 * {@link Encoder} and tokenizers.
       
    42 */
       
    43 class Highlighter :LUCENE_BASE
       
    44 {
       
    45 private:
       
    46 	int32_t maxDocBytesToAnalyze;
       
    47 
       
    48 	Formatter * _formatter;
       
    49 	bool delete_formatter;
       
    50 	
       
    51 	Encoder* _encoder;
       
    52 	bool delete_encoder;
       
    53 
       
    54 	Fragmenter * _textFragmenter;
       
    55 	bool delete_textFragmenter;
       
    56 
       
    57 	HighlightScorer * _fragmentScorer;
       
    58 	bool delete_fragmentScorer;
       
    59 
       
    60 	/** Improves readability of a score-sorted list of TextFragments by merging any fragments 
       
    61 	 * that were contiguous in the original text into one larger fragment with the correct order.
       
    62 	 * This will leave a "null" in the array entry for the lesser scored fragment. 
       
    63 	 * 
       
    64 	 * @param frag An array of document fragments in descending score
       
    65 	 */
       
    66 	void _mergeContiguousFragments(TextFragment** frag, int32_t fragsLen);
       
    67 	
       
    68 public:
       
    69 	LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024);
       
    70 
       
    71 	/**
       
    72 	 * Constructs a Highlighter object with the provided scorer. The HighlightScorer object is owned
       
    73 	 * by the Highlighter object, and it will freed in the destructor.
       
    74 	 */
       
    75 	Highlighter(HighlightScorer * fragmentScorer);
       
    76 
       
    77 	Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer);
       
    78 
       
    79 	Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer);
       
    80 
       
    81 
       
    82 	/**
       
    83 	 * Destructor for Highlighter. It deletes the owned HighlightScorer, formatter and textFragmenter.
       
    84 	 */
       
    85 	~Highlighter();
       
    86 
       
    87 	/**
       
    88 	 * Highlights chosen terms in a text, extracting the most relevant section.
       
    89 	 * The document text is analysed in chunks to record hit statistics
       
    90 	 * across the document. After accumulating stats, the fragment with the highest score
       
    91 	 * is returned
       
    92 	 *
       
    93 	 * @param tokenStream   a stream of tokens identified in the text parameter, including offset information. 
       
    94 	 * This is typically produced by an analyzer re-parsing a document's 
       
    95 	 * text. Some work may be done on retrieving TokenStreams more efficently 
       
    96 	 * by adding support for storing original text position data in the Lucene
       
    97 	 * index but this support is not currently available (as of Lucene 1.4 rc2).  
       
    98 	 * @param text text to highlight terms in
       
    99 	 *
       
   100 	 * @return highlighted text fragment or null if no terms found
       
   101 	 */
       
   102 	TCHAR* getBestFragment(CL_NS(analysis)::TokenStream * tokenStream, const TCHAR* text);
       
   103 
       
   104 	/**
       
   105 	 * Highlights chosen terms in a text, extracting the most relevant section.
       
   106 	 * This is a convenience method that calls
       
   107 	 * {@link #getBestFragment(TokenStream, const TCHAR*)}
       
   108 	 *
       
   109 	 * @param analyzer   the analyzer that will be used to split <code>text</code>
       
   110 	 * into chunks  
       
   111 	 * @param text text to highlight terms in
       
   112 	 * @param fieldName Name of field used to influence analyzer's tokenization policy 
       
   113 	 *
       
   114 	 * @return highlighted text fragment or null if no terms found
       
   115 	 */
       
   116 	TCHAR* getBestFragment(CL_NS(analysis)::Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text);
       
   117 
       
   118 	/**
       
   119 	 * Highlights chosen terms in a text, extracting the most relevant sections.
       
   120 	 * This is a convenience method that calls
       
   121 	 * {@link #getBestFragments(TokenStream, const TCHAR*, int)}
       
   122 	 *
       
   123 	 * @param analyzer   the analyzer that will be used to split <code>text</code>
       
   124 	 * into chunks  
       
   125 	 * @param text        	text to highlight terms in
       
   126 	 * @param maxNumFragments  the maximum number of fragments.
       
   127 	 *
       
   128 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
       
   129 	 */
       
   130 	TCHAR** getBestFragments(
       
   131 		CL_NS(analysis)::Analyzer* analyzer,	
       
   132 		const TCHAR* text,
       
   133 		int32_t maxNumFragments);
       
   134 
       
   135 	/**
       
   136 	 * Highlights chosen terms in a text, extracting the most relevant sections.
       
   137 	 * The document text is analysed in chunks to record hit statistics
       
   138 	 * across the document. After accumulating stats, the fragments with the highest scores
       
   139 	 * are returned as an array of strings in order of score (contiguous fragments are merged into 
       
   140 	 * one in their original order to improve readability)
       
   141 	 *
       
   142 	 * @param text        	text to highlight terms in
       
   143 	 * @param maxNumFragments  the maximum number of fragments.
       
   144 	 *
       
   145 	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
       
   146 	 */
       
   147 	 TCHAR** getBestFragments(
       
   148 		CL_NS(analysis)::TokenStream * tokenStream,	
       
   149 		const TCHAR* text,
       
   150 		int32_t maxNumFragments);
       
   151 
       
   152 	/**
       
   153     * Low level api to get the most relevant (formatted) sections of the document.
       
   154   	* This method has been made public to allow visibility of score information held in TextFragment objects.
       
   155   	* Thanks to Jason Calabrese for help in redefining the interface.
       
   156     * @param tokenStream
       
   157     * @param text
       
   158     * @param maxNumFragments
       
   159     * @param mergeContiguousFragments
       
   160     */
       
   161 	TextFragment** getBestTextFragments(
       
   162 		CL_NS(util)::StringBuffer* writeTo,
       
   163 		CL_NS(analysis)::TokenStream * tokenStream,	
       
   164 		const TCHAR* text,
       
   165 		bool mergeContiguousFragments,
       
   166 		int32_t maxNumFragments);
       
   167 
       
   168 	/**
       
   169 	 * Highlights terms in the  text , extracting the most relevant sections
       
   170 	 * and concatenating the chosen fragments with a separator (typically "...").
       
   171 	 * The document text is analysed in chunks to record hit statistics
       
   172 	 * across the document. After accumulating stats, the fragments with the highest scores
       
   173 	 * are returned in order as "separator" delimited strings.
       
   174 	 *
       
   175 	 * @param text        text to highlight terms in
       
   176 	 * @param maxNumFragments  the maximum number of fragments.
       
   177 	 * @param separator  the separator used to intersperse the document fragments (typically "...")
       
   178 	 *
       
   179 	 * @return highlighted text
       
   180 	 */
       
   181 	TCHAR* getBestFragments(
       
   182 		CL_NS(analysis)::TokenStream * tokenStream,	
       
   183 		const TCHAR* text,
       
   184 		int32_t maxNumFragments,
       
   185 		const TCHAR* separator);
       
   186 
       
   187 	/**
       
   188 	 * @return the maximum number of bytes to be tokenized per doc 
       
   189 	 */
       
   190 	int32_t getMaxDocBytesToAnalyze()
       
   191 	{
       
   192 		return maxDocBytesToAnalyze;
       
   193 	}
       
   194 
       
   195 	/**
       
   196 	 * @param byteCount the maximum number of bytes to be tokenized per doc
       
   197 	 * (This can improve performance with large documents)
       
   198 	 */
       
   199 	void setMaxDocBytesToAnalyze(int32_t byteCount)
       
   200 	{
       
   201 		maxDocBytesToAnalyze = byteCount;
       
   202 	}
       
   203 
       
   204 	/**
       
   205 	 */
       
   206 	Fragmenter * getTextFragmenter()
       
   207 	{
       
   208 		return _textFragmenter;
       
   209 	}
       
   210 
       
   211 	/**
       
   212 	 * @param fragmenter
       
   213 	 */
       
   214 	void setTextFragmenter(Fragmenter * fragmenter)
       
   215 	{
       
   216 		if ( delete_textFragmenter ){
       
   217 			_CLDELETE(_textFragmenter);
       
   218 			delete_textFragmenter = false;
       
   219 		}
       
   220 		_textFragmenter = fragmenter;
       
   221 	}
       
   222 
       
   223 	/**
       
   224 	 * @return Object used to score each text fragment 
       
   225 	 */
       
   226 	HighlightScorer * getFragmentScorer()
       
   227 	{
       
   228 		return _fragmentScorer;
       
   229 	}
       
   230 
       
   231 
       
   232 	/**
       
   233 	 * @param HighlightScorer
       
   234 	 */
       
   235 	void setFragmentScorer(HighlightScorer * scorer)
       
   236 	{
       
   237 		if ( delete_fragmentScorer ){
       
   238 			delete_fragmentScorer = false;
       
   239 			_CLDELETE(scorer);
       
   240 		}
       
   241 		_fragmentScorer = scorer;
       
   242 	}
       
   243 
       
   244 	
       
   245     Encoder* getEncoder()
       
   246     {
       
   247         return _encoder;
       
   248     }
       
   249     void setEncoder(Encoder* encoder)
       
   250     {
       
   251 		if ( delete_encoder ){
       
   252 			_CLDELETE(encoder);
       
   253 			delete_encoder = false;
       
   254 		}
       
   255         this->_encoder = encoder;
       
   256     }
       
   257 
       
   258 
       
   259 };
       
   260 
       
   261 
       
   262 CL_NS_END2
       
   263 
       
   264 #endif
       
   265