// Source: searchengine/oss/cl/clucene/src/clucene/highlighter/TokenSources.h
// Mercurial changeset 7:a5fbfefd615f (replaces parent 3:ae3f1779f6da)
       
/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
       
    16 
       
#ifndef _lucene_search_highlight_tokensources_
#define _lucene_search_highlight_tokensources_

#if defined(_LUCENE_PRAGMA_ONCE)
# pragma once
#endif

#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/index/IndexReader.h"
#include "CLucene/index/TermVector.h"
       
    29 CL_NS_DEF2(search,highlight)
       
    30 
       
    31 class TokenSources: LUCENE_BASE
       
    32 {
       
    33 	//an object used to iterate across an array of tokens
       
    34 	class StoredTokenStream:public CL_NS(analysis)::TokenStream
       
    35     {
       
    36 	public:
       
    37         CL_NS(analysis)::Token** tokens;
       
    38 		size_t length;
       
    39         int32_t currentToken;
       
    40         StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len);
       
    41 		bool next(CL_NS(analysis)::Token* token);
       
    42 		void close();
       
    43     };
       
    44 public:
       
    45 	TokenSources(void);
       
    46 	~TokenSources(void);
       
    47 
       
    48 	/**
       
    49      * A convenience method that tries a number of approaches to getting a token stream.
       
    50      * The cost of finding there are no termVectors in the index is minimal (1000 invocations still 
       
    51      * registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
       
    52      * @param reader
       
    53      * @param docId
       
    54      * @param field
       
    55      * @param analyzer
       
    56      * @return null if field not stored correctly 
       
    57      * @throws IOException
       
    58      */
       
    59 	static CL_NS(analysis)::TokenStream* getAnyTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field, CL_NS(analysis)::Analyzer* analyzer);
       
    60     
       
    61     static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv);
       
    62 
       
    63     /**
       
    64      * Low level api.
       
    65      * Returns a token stream or null if no offset info available in index.
       
    66      * This can be used to feed the highlighter with a pre-parsed token stream 
       
    67      * 
       
    68      * In my tests the speeds to recreate 1000 token streams using this method are:
       
    69      * - with TermVector offset only data stored - 420  milliseconds 
       
    70      * - with TermVector offset AND position data stored - 271 milliseconds
       
    71      *  (nb timings for TermVector with position data are based on a tokenizer with contiguous
       
    72      *  positions - no overlaps or gaps)
       
    73      * The cost of not using TermPositionVector to store
       
    74      * pre-parsed content and using an analyzer to re-parse the original content: 
       
    75      * - reanalyzing the original content - 980 milliseconds
       
    76      * 
       
    77      * The re-analyze timings will typically vary depending on -
       
    78      * 	1) The complexity of the analyzer code (timings above were using a 
       
    79      * 	   stemmer/lowercaser/stopword combo)
       
    80      *  2) The  number of other fields (Lucene reads ALL fields off the disk 
       
    81      *     when accessing just one document field - can cost dear!)
       
    82      *  3) Use of compression on field storage - could be faster cos of compression (less disk IO)
       
    83      *     or slower (more CPU burn) depending on the content.
       
    84      *
       
    85      * @param tpv
       
    86      * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
       
    87      * to eek out the last drops of performance, set to true. If in doubt, set to false.
       
    88      */
       
    89     static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous);
       
    90 
       
    91 	static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field);
       
    92 
       
    93     //convenience method
       
    94 	static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field,CL_NS(analysis)::Analyzer* analyzer);
       
    95 };
       
    96 
       
    97 CL_NS_END2
       
    98 #endif