searchengine/oss/cl/clucene/src/clucene/index/indexreader.h
changeset 0 671dee74050a
child 24 65456528cac2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/index/indexreader.h	Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,483 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_index_IndexReader_
+#define _lucene_index_IndexReader_
+
+#if defined(_LUCENE_PRAGMA_ONCE)
+# pragma once
+#endif
+
+#include "clucene/store/directory.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/Lock.h"
+#include "clucene/document/document.h"
+#include "CLucene/index/TermVector.h"
+#include "CLucene/index/SegmentInfos.h"
+#include "clucene/index/terms.h"
+
+
+CL_NS_DEF(index)
+
+
+/** IndexReader is an abstract class, providing an interface for accessing an
+ index.  Search of an index is done entirely through this abstract interface,
+ so that any subclass which implements it is searchable.
+
+ <p> Concrete subclasses of IndexReader are usually constructed with a call to
+ one of the static <code>open()</code> methods, e.g. {@link #open(String)}.
+
+ <p> For efficiency, in this API documents are often referred to via
+ <i>document numbers</i>, non-negative integers which each name a unique
+ document in the index.  These document numbers are ephemeral--they may change
+ as documents are added to and deleted from an index.  Clients should thus not
+ rely on a given document having the same number between sessions.
+ 
+ <p> An IndexReader can be opened on a directory for which an IndexWriter is
+ opened already, but it cannot be used to delete documents from the index then.
+*/
+class IndexReader :LUCENE_BASE{
+public:
+	//Callback for classes that need to know if IndexReader is closing.
+	typedef void (*CloseCallback)(IndexReader*, void*);
+
+	class CloseCallbackCompare:public CL_NS(util)::Compare::_base{
+	public:
+		bool operator()( CloseCallback t1, CloseCallback t2 ) const{
+			return t1 > t2;
+		}
+		static void doDelete(CloseCallback ){
+		}
+	};
+	
+	
+	enum FieldOption {
+		// all fields
+		ALL = 1,
+		// all indexed fields
+		INDEXED = 2,
+		// all fields which are not indexed
+		UNINDEXED = 4,
+		// all fields which are indexed with termvectors enables
+		INDEXED_WITH_TERMVECTOR = 8,
+		// all fields which are indexed but don't have termvectors enabled
+		INDEXED_NO_TERMVECTOR = 16,
+		// all fields where termvectors are enabled. Please note that only standard termvector fields are returned
+		TERMVECTOR = 32,
+		// all field with termvectors wiht positions enabled
+		TERMVECTOR_WITH_POSITION = 64,
+		// all fields where termvectors with offset position are set
+		TERMVECTOR_WITH_OFFSET = 128,
+		// all fields where termvectors with offset and position values set
+		TERMVECTOR_WITH_POSITION_OFFSET = 256
+	};
+
+
+private:
+	CL_NS(store)::LuceneLock* writeLock;
+
+    bool directoryOwner;
+    bool stale;
+    bool hasChanges;
+    bool closeDirectory;
+    
+    SegmentInfos* segmentInfos;
+
+    CL_NS(store)::Directory* directory;
+	typedef CL_NS(util)::CLSet<CloseCallback, void*, 
+		CloseCallbackCompare,
+		CloseCallbackCompare> CloseCallbackMap;
+	CloseCallbackMap closeCallbacks;
+	
+    /** Internal use. Implements commit */
+    virtual void doCommit() = 0;
+  
+    /**
+	* Tries to acquire the WriteLock on this directory.
+	* this method is only valid if this IndexReader is directory owner.
+	* 
+	* @throws IOException If WriteLock cannot be acquired.
+	*/
+    void aquireWriteLock();
+protected:
+    /**
+    * Constructor used if IndexReader is not owner of its directory. 
+    * This is used for IndexReaders that are used within other IndexReaders that take care or locking directories.
+    * 
+    * @param directory Directory where IndexReader files reside.
+    */
+    IndexReader(CL_NS(store)::Directory* dir);
+
+    /**
+    * Constructor used if IndexReader is owner of its directory.
+    * If IndexReader is owner of its directory, it locks its directory in case of write operations.
+    * 
+    * @param directory Directory where IndexReader files reside.
+    * @param segmentInfos Used for write-l
+    * @param closeDirectory
+    */
+	IndexReader(CL_NS(store)::Directory* directory, SegmentInfos* segmentInfos, bool closeDirectory);
+	
+
+	/// Implements close. 
+	virtual void doClose() = 0;
+
+    /** Implements setNorm in subclass.*/
+    virtual void doSetNorm(int32_t doc, const TCHAR* field, uint8_t value) = 0;
+      
+    /** Implements actual undeleteAll() in subclass. */
+    virtual void doUndeleteAll() = 0;
+
+
+    /** Implements deletion of the document numbered <code>docNum</code>.
+	* Applications should call {@link #deleteDocument(int32_t)} or {@link #deleteDocuments(Term*)}.
+	*/
+    virtual void doDelete(const int32_t docNum) = 0;
+
+public:
+
+    // CPIXASYNC DEFINE_MUTEX(THIS_LOCK)
+    DEFINE_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)
+	
+	///Do not access this directly, only public so that MultiReader can access it
+	virtual void commit();
+	
+	
+	/** Undeletes all documents currently marked as deleted in this index.*/
+	void undeleteAll();
+
+	/**
+	* Get a list of unique field names that exist in this index and have the specified
+	* field option information.
+	* @param fldOption specifies which field option should be available for the returned fields
+	* @return Collection of Strings indicating the names of the fields.
+	* @see IndexReader.FieldOption
+	*/
+	virtual void getFieldNames(FieldOption fldOption, CL_NS(util)::StringArrayWithDeletor& retarray) = 0;
+	
+	_CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames();
+	_CL_DEPRECATED( getFieldNames(FieldOption, StringArrayWithDeletor&) ) virtual TCHAR** getFieldNames(bool indexed);
+
+	/** Returns the byte-encoded normalization factor for the named field of
+	* every document.  This is used by the search code to score documents.
+	*
+	* The number of bytes returned is the size of the IndexReader->maxDoc()
+	* MEMORY: The values are cached, so don't delete the returned byte array.
+	* @see Field#setBoost(float_t)
+	*/
+	virtual uint8_t* norms(const TCHAR* field) = 0;
+	
+	
+	/** Reads the byte-encoded normalization factor for the named field of every
+	*  document.  This is used by the search code to score documents.
+	*
+	* @see Field#setBoost(float_t)
+	*/
+	virtual void norms(const TCHAR* field, uint8_t* bytes) = 0;
+
+    /** Expert: Resets the normalization factor for the named field of the named
+    * document.
+    *
+    * @see #norms(TCHAR*)
+    * @see Similarity#decodeNorm(uint8_t)
+    */
+    void setNorm(int32_t doc, const TCHAR* field, float_t value);
+  
+    /** Expert: Resets the normalization factor for the named field of the named
+    * document.  The norm represents the product of the field's {@link
+    * Field#setBoost(float_t) boost} and its {@link Similarity#lengthNorm(TCHAR*,
+    * int32_t) length normalization}.  Thus, to preserve the length normalization
+    * values when resetting this, one should base the new value upon the old.
+    *
+    * @see #norms(TCHAR*)
+    * @see Similarity#decodeNorm(uint8_t)
+    */
+    void setNorm(int32_t doc, const TCHAR* field, uint8_t value);
+
+	/// Release the write lock, if needed. 
+    virtual ~IndexReader();
+
+	/// Returns an IndexReader reading the index in an FSDirectory in the named path. 
+	static IndexReader* open(const char* path);
+
+	/// Returns an IndexReader reading the index in the given Directory. 
+	static IndexReader* open( CL_NS(store)::Directory* directory, bool closeDirectory=false);
+
+	/** 
+	* Returns the time the index in the named directory was last modified.
+	* Do not use this to check whether the reader is still up-to-date, use
+	* {@link #isCurrent()} instead. 
+	*/
+	static uint64_t lastModified(const char* directory);
+
+	/** 
+	* Returns the time the index in the named directory was last modified. 
+	* Do not use this to check whether the reader is still up-to-date, use
+	* {@link #isCurrent()} instead. 
+	*/
+	static uint64_t lastModified(const CL_NS(store)::Directory* directory);
+
+	
+	/**
+	* Reads version number from segments files. The version number is
+	* initialized with a timestamp and then increased by one for each change of
+	* the index.
+	* 
+	* @param directory where the index resides.
+	* @return version number.
+	* @throws IOException if segments file cannot be read
+	*/
+	static int64_t getCurrentVersion(CL_NS(store)::Directory* directory);
+	
+	/**
+   * Reads version number from segments files. The version number is
+   * initialized with a timestamp and then increased by one for each change of
+   * the index.
+   * 
+   * @param directory where the index resides.
+   * @return version number.
+   * @throws IOException if segments file cannot be read
+   */
+	static int64_t getCurrentVersion(const char* directory);
+	
+	/**
+	* Version number when this IndexReader was opened.
+	*/
+	int64_t getVersion();
+	
+	/**
+	* Check whether this IndexReader still works on a current version of the index.
+	* If this is not the case you will need to re-open the IndexReader to
+	* make sure you see the latest changes made to the index.
+	* 
+	* @throws IOException
+	*/
+	bool isCurrent();
+
+
+	/**
+	*  Return an array of term frequency vectors for the specified document.
+	*  The array contains a vector for each vectorized field in the document.
+	*  Each vector contains terms and frequencies for all terms in a given vectorized field.
+	*  If no such fields existed, the method returns null. The term vectors that are
+	* returned my either be of type TermFreqVector or of type TermPositionsVector if
+	* positions or offsets have been stored.
+	* 
+	* @param docNumber document for which term frequency vectors are returned
+	* @return array of term frequency vectors. May be null if no term vectors have been
+	*  stored for the specified document.
+	* @throws IOException if index cannot be accessed
+	* @see org.apache.lucene.document.Field.TermVector
+	*/
+	virtual bool getTermFreqVectors(int32_t docNumber, Array<TermFreqVector*>& result) =0;
+	
+	/**
+	*  Return a term frequency vector for the specified document and field. The
+	*  returned vector contains terms and frequencies for the terms in
+	*  the specified field of this document, if the field had the storeTermVector
+	*  flag set. If termvectors had been stored with positions or offsets, a 
+	*  TermPositionsVector is returned.
+	* 
+	* @param docNumber document for which the term frequency vector is returned
+	* @param field field for which the term frequency vector is returned.
+	* @return term frequency vector May be null if field does not exist in the specified
+	* document or term vector was not stored.
+	* @throws IOException if index cannot be accessed
+	* @see org.apache.lucene.document.Field.TermVector
+	*/
+	virtual TermFreqVector* getTermFreqVector(int32_t docNumber, const TCHAR* field) = 0;
+	
+	/**
+	* Returns <code>true</code> if an index exists at the specified directory.
+	* If the directory does not exist or if there is no index in it.
+	* @param  directory the directory to check for an index
+	* @return <code>true</code> if an index exists; <code>false</code> otherwise
+	*/
+	static bool indexExists(const char* directory);
+
+    /**
+	* Returns <code>true</code> if an index exists at the specified directory.
+	* If the directory does not exist or if there is no index in it.
+	* @param  directory the directory to check for an index
+	* @return <code>true</code> if an index exists; <code>false</code> otherwise
+	* @throws IOException if there is a problem with accessing the index
+	*/
+	static bool indexExists(const CL_NS(store)::Directory* directory);
+
+	/** Returns the number of documents in this index. */
+  	virtual int32_t numDocs() = 0;
+
+	/** Returns one greater than the largest possible document number.
+	* This may be used to, e.g., determine how big to allocate an array which
+	* will have an element for every document number in an index.
+	*/
+	virtual int32_t maxDoc() const = 0;
+
+	/** Gets the stored fields of the <code>n</code><sup>th</sup>
+   	 * <code>Document</code> in this index. 
+   	 * The fields are not cleared before retrieving the document, so the
+   	 * object should be new or just cleared.
+   	 */ 
+	virtual bool document(int32_t n, CL_NS(document)::Document*) =0;
+	
+	_CL_DEPRECATED( document(i, document) ) CL_NS(document)::Document* document(const int32_t n);
+
+	/** Returns true if document <i>n</i> has been deleted */
+  	virtual bool isDeleted(const int32_t n) = 0;
+
+	/** Returns true if any documents have been deleted */
+	virtual bool hasDeletions() const = 0;
+
+	/** Returns true if there are norms stored for this field. */
+	virtual bool hasNorms(const TCHAR* field);
+
+	/** Returns an enumeration of all the terms in the index.
+	* The enumeration is ordered by Term.compareTo().  Each term
+	* is greater than all that precede it in the enumeration.
+	* @memory Caller must clean up
+	*/
+	virtual TermEnum* terms() const =0;
+
+	/** Returns an enumeration of all terms after a given term.
+	* The enumeration is ordered by Term.compareTo().  Each term
+	* is greater than all that precede it in the enumeration.
+	* @memory Caller must clean up
+	*/
+	virtual TermEnum* terms(const Term* t) const = 0;
+
+	/** Returns the number of documents containing the term <code>t</code>. */
+	virtual int32_t docFreq(const Term* t) const = 0;
+
+	/* Returns an unpositioned TermPositions enumerator.
+	 * @memory Caller must clean up
+	 */
+	virtual TermPositions* termPositions() const = 0;
+	
+    /** Returns an enumeration of all the documents which contain
+	* <code>term</code>.  For each document, in addition to the document number
+	* and frequency of the term in that document, a list of all of the ordinal
+	* positions of the term in the document is available.  Thus, this method
+	* implements the mapping:
+	*
+	* <p><ul>
+	* Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
+	* &lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
+	* pos<sub>freq-1</sub>&gt;
+	* &gt;<sup>*</sup>
+	* </ul>
+	* <p> This positional information faciliates phrase and proximity searching.
+	* <p>The enumeration is ordered by document number.  Each document number is
+	* greater than all that precede it in the enumeration.
+	 * @memory Caller must clean up
+	*/
+	TermPositions* termPositions(Term* term) const;
+
+	/** Returns an unpositioned {@link TermDocs} enumerator. 
+	 * @memory Caller must clean up
+	 */
+	virtual TermDocs* termDocs() const = 0;
+
+	/** Returns an enumeration of all the documents which contain
+	* <code>term</code>. For each document, the document number, the frequency of
+	* the term in that document is also provided, for use in search scoring.
+	* Thus, this method implements the mapping:
+	* <p><ul>Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup></ul>
+	* <p>The enumeration is ordered by document number.  Each document number
+	* is greater than all that precede it in the enumeration.
+	 * @memory Caller must clean up
+	*/
+	TermDocs* termDocs(Term* term) const;
+
+	/** Deletes the document numbered <code>docNum</code>.  Once a document is
+	* deleted it will not appear in TermDocs or TermPostitions enumerations.
+	* Attempts to read its field with the {@link #document}
+	* method will result in an error.  The presence of this document may still be
+	* reflected in the {@link #docFreq} statistic, though
+	* this will be corrected eventually as the index is further modified.
+	*/
+	void deleteDocument(const int32_t docNum);
+
+	///@deprecated. Use deleteDocument instead.
+	_CL_DEPRECATED( deleteDocument ) void deleteDoc(const int32_t docNum){ deleteDocument(docNum); }
+
+	/** Deletes all documents containing <code>term</code>.
+	* This is useful if one uses a document field to hold a unique ID string for
+	* the document.  Then to delete such a document, one merely constructs a
+	* term with the appropriate field and the unique ID string as its text and
+	* passes it to this method.
+	* See {@link #deleteDocument(int)} for information about when this deletion will 
+	* become effective.
+	* @return the number of documents deleted
+	*/
+	int32_t deleteDocuments(Term* term);
+
+	///@deprecated. Use deleteDocuments instead.
+	_CL_DEPRECATED( deleteDocuments ) int32_t deleteTerm(Term* term){ return deleteDocuments(term); }
+
+	/** 
+	* Closes files associated with this index and also saves any new deletions to disk.
+    * No other methods should be called after this has been called.
+    */
+	void close();
+
+	///Checks if the index in the named directory is currently locked.       
+	static bool isLocked(CL_NS(store)::Directory* directory);
+	
+	///Checks if the index in the named directory is currently locked.       
+	static bool isLocked(const char* directory);
+
+
+	///Forcibly unlocks the index in the named directory.
+	///Caution: this should only be used by failure recovery code,
+	///when it is known that no other process nor thread is in fact
+	///currently accessing this index.
+	static void unlock(CL_NS(store)::Directory* directory);
+	static void unlock(const char* path);
+
+	 /** Returns the directory this index resides in. */
+	CL_NS(store)::Directory* getDirectory() { return directory; }
+
+	/** Returns true if the file is a lucene filename (based on extension or filename) */
+	static bool isLuceneFile(const char* filename);
+
+	/**
+	* For classes that need to know when the IndexReader closes (such as caches, etc),
+	* should pass their callback function to this.
+	*/
+	void addCloseCallback(CloseCallback callback, void* parameter);
+
+protected:
+	class LockWith:public CL_NS(store)::LuceneLockWith<IndexReader*>{
+	public:
+		CL_NS(store)::Directory* directory;
+		IndexReader* indexReader;
+
+		//Constructor	
+		LockWith(CL_NS(store)::LuceneLock* lock, CL_NS(store)::Directory* dir);
+
+		//Reads the segmentinfo file and depending on the number of segments found
+		//it returns a MultiReader or a SegmentReader
+		IndexReader* doBody();
+
+	};
+	friend class IndexReader::LockWith;
+
+    class CommitLockWith:public CL_NS(store)::LuceneLockWith<void>{
+    private:
+	    IndexReader* reader;
+	public:
+			
+	    //Constructor	
+	    CommitLockWith( CL_NS(store)::LuceneLock* lock, IndexReader* r );
+	    void doBody();
+	};
+    friend class IndexReader::CommitLockWith;
+};
+
+CL_NS_END
+#endif
+
+