searchengine/oss/cl/clucene/src/clucene/highlighter/TokenSources.h
changeset 7 a5fbfefd615f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/highlighter/TokenSources.h	Fri Jun 11 14:43:47 2010 +0300
@@ -0,0 +1,98 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_tokensources_
+#define _lucene_search_highlight_tokensources_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/TermVector.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF2(search,highlight)
+
+class TokenSources: LUCENE_BASE
+{
+	//an object used to iterate across an array of tokens
+	class StoredTokenStream:public CL_NS(analysis)::TokenStream
+    {
+	public:
+        CL_NS(analysis)::Token** tokens;
+		size_t length;
+        int32_t currentToken;
+        StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len);
+		bool next(CL_NS(analysis)::Token* token);
+		void close();
+    };
+public:
+	TokenSources(void);
+	~TokenSources(void);
+
+	/**
+     * A convenience method that tries a number of approaches to getting a token stream.
+     * The cost of finding there are no termVectors in the index is minimal (1000 invocations still 
+     * registers 0 ms). So this "lazy" (flexible?) approach to coding is probably acceptable
+     * @param reader
+     * @param docId
+     * @param field
+     * @param analyzer
+     * @return null if field not stored correctly 
+     * @throws IOException
+     */
+	static CL_NS(analysis)::TokenStream* getAnyTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field, CL_NS(analysis)::Analyzer* analyzer);
+    
+    static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv);
+
+    /**
+     * Low level api.
+     * Returns a token stream or null if no offset info available in index.
+     * This can be used to feed the highlighter with a pre-parsed token stream 
+     * 
+     * In my tests the speeds to recreate 1000 token streams using this method are:
+     * - with TermVector offset only data stored - 420  milliseconds 
+     * - with TermVector offset AND position data stored - 271 milliseconds
+     *  (nb timings for TermVector with position data are based on a tokenizer with contiguous
+     *  positions - no overlaps or gaps)
+     * The cost of not using TermPositionVector to store
+     * pre-parsed content and using an analyzer to re-parse the original content: 
+     * - reanalyzing the original content - 980 milliseconds
+     * 
+     * The re-analyze timings will typically vary depending on -
+     * 	1) The complexity of the analyzer code (timings above were using a 
+     * 	   stemmer/lowercaser/stopword combo)
+     *  2) The  number of other fields (Lucene reads ALL fields off the disk 
+     *     when accessing just one document field - can cost dear!)
+     *  3) Use of compression on field storage - could be faster cos of compression (less disk IO)
+     *     or slower (more CPU burn) depending on the content.
+     *
+     * @param tpv
+     * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
+     * to eek out the last drops of performance, set to true. If in doubt, set to false.
+     */
+    static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous);
+
+	static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field);
+
+    //convenience method
+	static CL_NS(analysis)::TokenStream* getTokenStream(CL_NS(index)::IndexReader* reader,int32_t docId, TCHAR* field,CL_NS(analysis)::Analyzer* analyzer);
+};
+
+CL_NS_END2
+#endif