searchengine/oss/cl/clucene/src/clucene/index/indexreader.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
parent 0 671dee74050a
permissions -rw-r--r--
201041

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "clucene/stdheader.h"
#include "IndexReader.h"
#include "IndexWriter.h"

#include "clucene/store/directory.h"
#include "CLucene/store/FSDirectory.h"
#include "CLucene/store/Lock.h"
#include "clucene/document/document.h"
#include "clucene/search/similarity.h"
#include "SegmentInfos.h"
#include "MultiReader.h"
#include "Terms.h"

CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_DEF(index)

  IndexReader::IndexReader(Directory* dir){
  //Constructor.
  //Func - Creates an IndexReader on top of dir without any segment information
  //Pre  - true
  //Post - The instance holds dir (taken through _CL_POINTER), writeLock and
  //       segmentInfos are NULL and every state flag is false

      directory = _CL_POINTER(dir);
      segmentInfos = NULL;
      writeLock = NULL;

      closeDirectory = false;
      directoryOwner = false;
      hasChanges = false;
      stale = false;
  }

   IndexReader::IndexReader(Directory* directory, SegmentInfos* segmentInfos, bool closeDirectory) {
    //Constructor.
    //Func - Creates an IndexReader that keeps the supplied SegmentInfos
    //Pre  - true
    //Post - The instance holds directory (taken through _CL_POINTER) and
    //       segmentInfos, directoryOwner is true, closeDirectory has the
    //       value passed in, writeLock is NULL and stale/hasChanges are false
    this->segmentInfos = segmentInfos;
    this->directory = _CL_POINTER(directory);
    this->closeDirectory = closeDirectory;
    this->directoryOwner = true;
    this->writeLock = NULL;
    this->hasChanges = false;
    this->stale = false;
  }

  IndexReader::~IndexReader(){
  //Func - Destructor
  //       Releases and deletes the writeLock when one is held, deletes the
  //       segmentInfos and gives up this reader's reference to the directory
  //Pre  - true
  //Post - The instance has been destroyed; a writeLock that existed has been released
      if (NULL != writeLock) {
          //a write lock is still held: release it before it is deleted
          writeLock->release();
          _CLDELETE(writeLock);
      }

      _CLDELETE(segmentInfos);
      _CLDECDELETE(directory);
  }

  IndexReader* IndexReader::open(const char* path){
  //Func - Static method.
  //       Opens an IndexReader on the index stored in the FSDirectory for path.
  //       The reader is opened with closeDirectory set to true.
  //Pre  - path != NULL and names the location of the index to read
  //Post - An IndexReader reading the index located at path has been returned

      CND_PRECONDITION(path != NULL, "path is NULL");

      Directory* fsDir = FSDirectory::getDirectory(path,false);
      IndexReader* const opened = open(fsDir,true);

      //getDirectory handed us a reference of our own on top of the one the
      //reader took, so that extra reference is dropped again here
      _CLDECDELETE(fsDir);

      return opened;
  }

  IndexReader* IndexReader::open( Directory* directory, bool closeDirectory){
  //Func - Static method.
  //       Returns an IndexReader reading the index held by directory. The reader
  //       is produced by an IndexReader::LockWith built around the directory's
  //       "commit.lock".
  //Pre  - directory represents a directory
  //       closeDirectory is stored in the closeDirectory flag of the returned reader
  //Post - An IndexReader has been returned that reads the index located at directory

	  // in- & inter-process sync
      // CPIXASYNC SCOPED_LOCK_MUTEX(directory->THIS_LOCK)
      SCOPED_LOCK_CRUCIAL_MUTEX(directory->Directory_THIS_LOCK)

      IndexReader* ret = NULL;

	  LuceneLock* lock = directory->makeLock("commit.lock");

	  //Instantiate an IndexReader::LockWith which can produce an IndexReader
      IndexReader::LockWith with(lock,directory);

	  try{
		//Create an IndexReader reading the index
		ret = with.runAndReturn();
	  }_CLFINALLY(
        _CLDELETE( lock );
	  );

      //Validate the result BEFORE it is dereferenced; the check used to come
      //after the member access and therefore could never catch a NULL reader
      CND_CONDITION(ret != NULL,"ret is NULL");

      ret->closeDirectory = closeDirectory;

      return ret;
  }
  
  CL_NS(document)::Document* IndexReader::document(const int32_t n){
    //Allocates a Document and asks document(n, doc) to fill it. When that call
    //reports failure the Document is disposed of with _CLDELETE and the
    //pointer left behind by _CLDELETE is what gets returned
    //(presumably NULL - confirm against the _CLDELETE macro).
    CL_NS(document)::Document* doc = _CLNEW CL_NS(document)::Document;
    const bool loaded = document(n,doc);
    if (!loaded){
        _CLDELETE(doc);
    }
    return doc;
  }

  IndexReader* IndexReader::LockWith::doBody() {
  //Func - Reads the segment information of directory and builds the matching reader
  //Pre  - directory != NULL
  //Post - With exactly one segment a SegmentReader for that segment is returned.
  //       Otherwise a MultiReader is returned: it receives a NULL reader array
  //       when there are no segments, or a NULL-terminated array holding one
  //       SegmentReader per segment when there are several.

	   CND_PRECONDITION(directory != NULL, "directory is NULL");

	   SegmentInfos* segInfos = _CLNEW SegmentInfos;
	   try{
			//Load the segments file found in directory
			segInfos->read(directory);
	   }catch(...){
			//do not leak the SegmentInfos when reading fails
			_CLDELETE(segInfos);
			throw;
	   }

	   //Note carried over from the original author: with at least one segment
	   //the last SegmentReader closes the directory when it is closed itself
	   //(see SegmentReader::doClose); without segments nobody would do so,
	   //which is why the close has to happen explicitly in that case.

	   const uint32_t segCount = segInfos->size();

	   //A single segment means the index is optimized
	   if (segCount == 1)
			return _CLNEW SegmentReader(segInfos, segInfos->info(0));

	   //Zero or several segments: hand everything to a MultiReader
	   IndexReader** subReaders = NULL;
	   if (segCount > 0){
			//one slot per segment plus the terminating NULL entry
			subReaders = _CL_NEWARRAY(IndexReader*,segCount+1);
			uint32_t idx = 0;
			while (idx < segCount){
				subReaders[idx] = _CLNEW SegmentReader(segInfos->info(idx));
				++idx;
			}
			subReaders[segCount] = NULL;
	   }

	   return _CLNEW MultiReader(directory, segInfos, subReaders);
  }

  uint64_t IndexReader::lastModified(const char* directory) {
  //Func - Static method
  //       Reports the modification time of the "segments" file in the named directory
  //Pre  - directory != NULL and contains the path name of the directory to check
  //Post - The value of FSDirectory::fileModified for "segments" has been returned

      CND_PRECONDITION(directory != NULL, "directory is NULL");

      const uint64_t modified = FSDirectory::fileModified(directory,"segments");
      return modified;
  }

  int64_t IndexReader::getCurrentVersion(Directory* directory) {
  //Func - Reads the current version of the index in directory while trying to
  //       hold the commit lock
  //Post - The value of SegmentInfos::readCurrentVersion(directory) has been
  //       returned; the commit lock has been released (if obtained) and deleted

      // CPIXASYNC SCOPED_LOCK_MUTEX(directory->THIS_LOCK)                 // in- & inter-process sync
      SCOPED_LOCK_CRUCIAL_MUTEX(directory->Directory_THIS_LOCK)                 // in- & inter-process sync
	int64_t version = 0;
	bool haveLock = false;
	LuceneLock* lock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);
	try {
		haveLock = lock->obtain(IndexWriter::COMMIT_LOCK_TIMEOUT);
		version = SegmentInfos::readCurrentVersion(directory);
	}_CLFINALLY(
		if (haveLock)
			lock->release();
		_CLDELETE(lock);
	)
	return version;
  }


   int64_t IndexReader::getCurrentVersion(const char* directory){
      //Convenience overload: obtains the FSDirectory for the given path,
      //queries its version through the Directory* overload, then closes the
      //directory and drops the reference obtained from getDirectory.
      Directory* fsDir = FSDirectory::getDirectory(directory, false);
      const int64_t currentVersion = getCurrentVersion(fsDir);

      fsDir->close();
      _CLDECDELETE(fsDir);

      return currentVersion;
   }
    int64_t IndexReader::getVersion() {
		//Returns the version recorded in the SegmentInfos held by this reader
		const int64_t heldVersion = segmentInfos->getVersion();
		return heldVersion;
	}
	
	bool IndexReader::isCurrent() {
            // CPIXASYNC SCOPED_LOCK_MUTEX(directory->THIS_LOCK)                 // in- & inter-process sync
            SCOPED_LOCK_CRUCIAL_MUTEX(directory->Directory_THIS_LOCK)                 // in- & inter-process sync
		LuceneLock* commitLock = directory->makeLock(IndexWriter::COMMIT_LOCK_NAME);
		bool locked=false;
		bool ret = false;
		try {
			locked=commitLock->obtain(IndexWriter::COMMIT_LOCK_TIMEOUT);
			ret = SegmentInfos::readCurrentVersion(directory) == segmentInfos->getVersion();
		} _CLFINALLY(
			if (locked) {
				commitLock->release();
			}
		)
		return ret;
	}

  uint64_t IndexReader::lastModified(const Directory* directory) {
  //Func - Static method
  //       Reports the modification time of the "segments" file in directory
  //Pre  - directory contains a valid reference
  //Post - The value of directory->fileModified("segments") has been returned

      const uint64_t modified = directory->fileModified("segments");
      return modified;
  }


  bool IndexReader::indexExists(const char* directory){
  //Func - Static method
  //       Checks if an index exists in the named directory by testing for
  //       "<directory>/segments" with Misc::dir_Exists
  //Pre  - directory != NULL
  //Post - The result of Misc::dir_Exists for the segments path has been returned.
  //       A directory name too long for the buffer is truncated to
  //       CL_MAX_PATH-10 characters before "/segments" is appended.

       CND_PRECONDITION(directory != NULL, "directory is NULL");

	   //Create a buffer of length CL_MAX_PATH
       char f[CL_MAX_PATH];
	   //Copy the directory string to the buffer. leave room for /segments
       strncpy(f,directory,CL_MAX_PATH-10);
	   //strncpy does not terminate the buffer when the source fills all
	   //CL_MAX_PATH-10 characters; terminate explicitly so strcat below
	   //always starts from a valid string and stays inside the buffer
       f[CL_MAX_PATH-10] = '\0';
	   //Cat the name of the segments to buffer
       strcat(f, "/segments");
	   //Check if the segments file exists
       return Misc::dir_Exists(f);
  }
    

  void IndexReader::setNorm(int32_t doc, const TCHAR* field, uint8_t value){
  //Func - Stores the encoded norm value for field of document doc through doSetNorm
  //Post - hasChanges is true. When directoryOwner is set, aquireWriteLock()
  //       has been called before the norm was changed.
      // CPIXASYNC SCOPED_LOCK_MUTEX(THIS_LOCK)
      SCOPED_LOCK_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)
    if(directoryOwner){
      aquireWriteLock();
    }

    doSetNorm(doc, field, value);
    hasChanges = true;
  }

 void IndexReader::aquireWriteLock() {
    if (stale)
      _CLTHROWA(CL_ERR_IO,"IndexReader out of date and no longer valid for delete, undelete, or setNorm operations");

    if (writeLock == NULL) {
      LuceneLock* writeLock = directory->makeLock("write.lock");
      if (!writeLock->obtain(IndexWriter::WRITE_LOCK_TIMEOUT)) // obtain write lock
       _CLTHROWA(CL_ERR_IO,"Index locked for write"); // + writeLock
      this->writeLock = writeLock;

      // we have to check whether index has changed since this reader was opened.
      // if so, this reader is no longer valid for deletion
      if (SegmentInfos::readCurrentVersion(directory) > segmentInfos->getVersion()) {
        stale = true;
        this->writeLock->release();
        _CLDELETE(this->writeLock);
        _CLTHROWA(CL_ERR_IO,"IndexReader out of date and no longer valid for delete, undelete, or setNorm operations");
      }
    }
  }
  

  void IndexReader::setNorm(int32_t doc, const TCHAR* field, float_t value){
     //Encodes value with Similarity::encodeNorm and forwards the encoded byte
     //to the uint8_t overload
     const uint8_t encoded = CL_NS(search)::Similarity::encodeNorm(value);
     setNorm(doc, field, encoded);
  }
  bool IndexReader::indexExists(){
  //Func - Checks if an index exists in the directory
  //Pre  - directory is a valid reference
  //Post - Returns true if an index exists at the specified directory->
  //       If the directory does not exist or if there is no index in it.
  //       false is returned. Added for usecases like connecting device in mass storage mode 
        return this->indexExists(this->directory);  
      }
  bool IndexReader::indexExists(const Directory* directory){
  //Func - Static method
  //       Checks if an index exists in the directory by looking for its
  //       "segments" file
  //Pre  - directory is a valid reference
  //Post - The result of directory->fileExists("segments") has been returned

      const bool found = directory->fileExists("segments");
      return found;
  }

  TermDocs* IndexReader::termDocs(Term* term) const {
  //Func - Returns an enumeration of all the documents which contain term.
  //       For every document the document number and the frequency of the term
  //       in that document are provided, for use in search scoring. The method
  //       thus implements the mapping:
  //
  //       Term => <docNum, freq>*
  //
  //       The enumeration is ordered by document number; each document number
  //       is greater than all that precede it in the enumeration.
  //Pre  - term != NULL
  //Post - A TermDocs obtained from termDocs() and positioned on term with
  //       seek(term) has been returned

      CND_PRECONDITION(term != NULL, "term is NULL");

      //Obtain an unpositioned enumeration and position it on term
      TermDocs* enumeration = termDocs();
      enumeration->seek(term);

      return enumeration;
  }

  TermPositions* IndexReader::termPositions(Term* term) const{
  //Func - Returns an enumeration of all the documents which contain term. For
  //       each document, in addition to the document number and the frequency
  //       of the term in that document, a list of all of the ordinal positions
  //       of the term in the document is available. The method thus
  //       implements the mapping:
  //
  //       Term => <docNum, freq,<pos 1, pos 2, ...pos freq-1>>*
  //
  //       This positional information facilitates phrase and proximity
  //       searching. The enumeration is ordered by document number; each
  //       document number is greater than all that precede it.
  //Pre  - term != NULL
  //Post - A TermPositions obtained from termPositions() and positioned on term
  //       with seek(term) has been returned

      CND_PRECONDITION(term != NULL, "term is NULL");

      //Obtain an unpositioned enumeration and position it on term
      TermPositions* enumeration = termPositions();
      enumeration->seek(term);

      return enumeration;
  }

  void IndexReader::deleteDocument(const int32_t docNum) {
  //Func - Deletes the document numbered docNum. Once a document is deleted it
  //       will not appear in TermDocs or TermPositions enumerations, and
  //       attempts to read its fields with the document method will result in
  //       an error. The document may still be reflected in the docFreq
  //       statistic until the index is modified further.
  //Pre  - docNum >= 0
  //Post - doDelete(docNum) has been called and hasChanges is true. When
  //       directoryOwner is set, aquireWriteLock() ran first and may have
  //       thrown because the index is locked or the reader is out of date.

      // CPIXASYNC SCOPED_LOCK_MUTEX(THIS_LOCK)
      SCOPED_LOCK_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)

      CND_PRECONDITION(docNum >= 0, "docNum is negative");

      if (directoryOwner){
          aquireWriteLock();
      }

      //Mark the document identified by docNum as deleted
      doDelete(docNum);
      hasChanges = true;
  }

  /**
   * Commit changes resulting from delete, undeleteAll, or setNorm operations.
   *
   * Nothing is written when hasChanges is false. When directoryOwner is set,
   * the commit runs through IndexReader::CommitLockWith built around the
   * directory's "commit.lock", and a held write lock is released and deleted
   * afterwards. Otherwise doCommit() is called directly. hasChanges is
   * cleared once the commit has completed without an exception.
   *
   * @throws IOException
   */
   void IndexReader::commit(){
       // CPIXASYNC SCOPED_LOCK_MUTEX(THIS_LOCK)
       SCOPED_LOCK_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)
    if(hasChanges){
      if(directoryOwner){
        {
            // CPIXASYNC SCOPED_LOCK_MUTEX(directory->THIS_LOCK)      // in- & inter-process sync
            SCOPED_LOCK_CRUCIAL_MUTEX(directory->Directory_THIS_LOCK)      // in- & inter-process sync

	        LuceneLock* commitLock = directory->makeLock("commit.lock");
	        IndexReader::CommitLockWith cl(commitLock,this);
	        try{
	            cl.run();
	        }_CLFINALLY(
	            //free the lock object on the error path as well; it used
	            //to leak whenever run() threw
	            _CLDELETE(commitLock);
	        );

	    }
        if (writeLock != NULL) {
          writeLock->release();  // release write lock
          _CLDELETE(writeLock);
        }
      }else
        doCommit();
    }
    hasChanges = false;
  }


  void IndexReader::undeleteAll(){
  //Func - Reverts the deletions of this reader through doUndeleteAll
  //Post - hasChanges is true. When directoryOwner is set, aquireWriteLock()
  //       has been called before doUndeleteAll.
      // CPIXASYNC SCOPED_LOCK_MUTEX(THIS_LOCK)
      SCOPED_LOCK_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)
    if(directoryOwner){
      aquireWriteLock();
    }

    doUndeleteAll();
    hasChanges = true;
  }

  int32_t IndexReader::deleteDocuments(Term* term) {
  //Func - Deletes all documents containing term. This is useful if one uses a
  //       document field to hold a unique ID string for the document.  Then to delete such
  //       a document, one merely constructs a term with the appropriate field and the unique
  //       ID string as its text and passes it to this method.
  //Pre  - term != NULL
  //Post - All documents containing term have been deleted. The number of deleted documents
  //       has been returned. The enumeration used to find the documents has been
  //       closed and deleted, also when deleting a document throws.

      CND_PRECONDITION(term != NULL, "term is NULL");

	  //Search for the documents contain term
      TermDocs* docs = termDocs(term);

	  //Check if documents have been found
	  if ( docs == NULL ){
          return 0;
	  }

	  //initialize
	  int32_t Counter = 0;
      try {
		  //iterate through the found documents
          while (docs->next()) {
			  //Delete the document
              deleteDocument(docs->doc());
              ++Counter;
          }
      }_CLFINALLY(
		  //Close the enumeration and free it. The delete used to sit after
		  //this block and was skipped whenever an exception was rethrown.
          docs->close();
          _CLDELETE( docs );
          );

	//Return the number of deleted documents
    return Counter;
  }
  
	TCHAR** IndexReader::getFieldNames(){
		//Collects the names reported for IndexReader::ALL and hands them back
		//as a NULL-terminated array. setDoDelete(false) is called on the
		//collecting container before the strings are copied over, so the
		//returned array refers to the very same string pointers.
		CL_NS(util)::StringArrayWithDeletor names;
		getFieldNames(IndexReader::ALL, names);
		names.setDoDelete(false);

		TCHAR** result = _CL_NEWARRAY(TCHAR*,names.size()+1);
		int count = 0;
		for ( CL_NS(util)::StringArrayWithDeletor::iterator it = names.begin();
		      it != names.end(); ++it ){
			result[count] = *it;
			++count;
		}
		result[count] = NULL;
		return result;
	}
	TCHAR** IndexReader::getFieldNames(bool indexed){
		//Collects the names reported for IndexReader::INDEXED (indexed == true)
		//or IndexReader::UNINDEXED (indexed == false) and hands them back as a
		//NULL-terminated array. setDoDelete(false) is called on the collecting
		//container before the strings are copied over, so the returned array
		//refers to the very same string pointers.
		CL_NS(util)::StringArrayWithDeletor names;
		getFieldNames(indexed?IndexReader::INDEXED:IndexReader::UNINDEXED, names);
		names.setDoDelete(false);

		TCHAR** result = _CL_NEWARRAY(TCHAR*,names.size()+1);
		int count = 0;
		for ( CL_NS(util)::StringArrayWithDeletor::iterator it = names.begin();
		      it != names.end(); ++it ){
			result[count] = *it;
			++count;
		}
		result[count] = NULL;
		return result;
	}
  

  void IndexReader::close() {
  //Func - Closes files associated with this index and also saves any new deletions to disk.
  //       No other methods should be called after this has been called.
  //Pre  - true
  //Post - All files associated with this index have been deleted and new deletions have been 
  //       saved to disk
      // CPIXASYNC SCOPED_LOCK_MUTEX(THIS_LOCK)
      SCOPED_LOCK_CRUCIAL_MUTEX(IndexReader_THIS_LOCK)

	CloseCallbackMap::iterator iter = closeCallbacks.begin();
	for ( ;iter!=closeCallbacks.end();iter++){
		CloseCallback callback = *iter->first;
		callback(this,iter->second);
	}
	
    commit();
    doClose();

	if(closeDirectory){
      directory->close();
	  _CLDECDELETE(directory);
	}
  }
   
  bool IndexReader::isLocked(Directory* directory) {
  //Func - Static method
  //       Checks if the index in the directory is currently locked.
  //Pre  - directory is a valid reference to a directory to check for a lock
  //Post - Returns true if the "write.lock" or the "commit.lock" of the
  //       directory reports isLocked(), otherwise false

     LuceneLock* writeLk = directory->makeLock("write.lock");
     LuceneLock* commitLk = directory->makeLock("commit.lock");

     //the commit lock is only consulted when the write lock is not locked
     bool locked = true;
     if (!writeLk->isLocked())
         locked = commitLk->isLocked();

     _CLDELETE(writeLk);
     _CLDELETE(commitLk);
     return locked;
  }

  bool IndexReader::isLocked(const char* directory) {
  //Func - Static method
  //       Checks if the index in the named directory is currently locked.
  //Pre  - directory != NULL and contains the directory to check for a lock
  //Post - Returns the result of isLocked(Directory*) for the FSDirectory of
  //       the named directory; that directory has been closed again

      CND_PRECONDITION(directory != NULL, "directory is NULL");

      //A "<directory>/write.lock" path used to be assembled in a fixed-size
      //buffer here. It was never used and the strncpy/strcat pair could
      //overrun the buffer for long paths, so it has been removed.

      Directory* dir = FSDirectory::getDirectory(directory,false);
      bool ret = isLocked(dir);
	  dir->close();
      _CLDECDELETE(dir);

	  return ret;
  }
  
/** Returns true if norms(field) yields a non-NULL result for this field. */
bool IndexReader::hasNorms(const TCHAR* field) {
	// Backward compatible implementation; per the original note
	// SegmentReader has an efficient implementation of its own.
	const bool present = ( NULL != norms(field) );
	return present;
}

void IndexReader::unlock(const char* path){
	FSDirectory* dir = FSDirectory::getDirectory(path,false);
	unlock(dir);
	dir->close();
	_CLDECDELETE(dir);
}
  void IndexReader::unlock(Directory* directory){
  //Func - Static method
  //       Forcibly unlocks the index in the directory by releasing its
  //       "write.lock" and its "commit.lock".
  //       Caution: this should only be used by failure recovery code,
  //       when it is known that no other process nor thread is in fact
  //       currently accessing this index.
  //Pre  - directory is a valid reference to a directory
  //Post - release() has been called on both locks of the directory

      LuceneLock* writeLk = directory->makeLock("write.lock");
      writeLk->release();
      _CLDELETE(writeLk);

      LuceneLock* commitLk = directory->makeLock("commit.lock");
      commitLk->release();
      _CLDELETE(commitLk);
  }

// Tells whether filename looks like a file belonging to an index.
// Returns false for NULL and for names shorter than 6 characters. Otherwise
// returns true when the text from the last '.' onwards (or the whole name when
// it has no '.') equals one of the known extensions, when the whole name is
// "segments", "segments.new" or "deletable", or when the text from the last
// '.' starts with ".f" immediately followed by a digit.
bool IndexReader::isLuceneFile(const char* filename){
	if ( filename == NULL )
		return false;

	const size_t nameLen = strlen(filename);
	if ( nameLen < 6 )
		return false;

	// Walk back from the terminator to the last '.'; stops at the first
	// character when the name has no '.' at all.
	const char* suffix = filename + nameLen;
	while ( suffix != filename && *suffix != '.' )
		--suffix;

	static const char* const knownSuffixes[] = {
		".cfs", ".fnm", ".fdx", ".fdt", ".tii", ".tis", ".frq",
		".prx", ".del", ".tvx", ".tvd", ".tvf", ".tvp"
	};
	const size_t suffixCount = sizeof(knownSuffixes)/sizeof(knownSuffixes[0]);
	for ( size_t i = 0; i < suffixCount; ++i ){
		if ( strcmp(suffix, knownSuffixes[i]) == 0 )
			return true;
	}

	static const char* const knownNames[] = {
		"segments", "segments.new", "deletable"
	};
	const size_t nameCount = sizeof(knownNames)/sizeof(knownNames[0]);
	for ( size_t i = 0; i < nameCount; ++i ){
		if ( strcmp(filename, knownNames[i]) == 0 )
			return true;
	}

	// ".f" followed by a digit
	if ( strncmp(suffix,".f",2) == 0 ){
		const char marker = suffix[2];
		if ( marker && _istdigit(marker) )
			return true;
	}

	return false;
}

	void IndexReader::addCloseCallback(CloseCallback callback, void* parameter){
		//Registers callback together with parameter in closeCallbacks;
		//close() invokes every registered callback with this reader and
		//the parameter stored alongside it.
		this->closeCallbacks.put(callback, parameter);
	}


	//Constructor - passes lock and IndexWriter::COMMIT_LOCK_TIMEOUT on to the
	//LuceneLockWith<IndexReader*> base and keeps dir for use by doBody
    IndexReader::LockWith::LockWith(CL_NS(store)::LuceneLock* lock, CL_NS(store)::Directory* dir):
		CL_NS(store)::LuceneLockWith<IndexReader*>(lock,IndexWriter::COMMIT_LOCK_TIMEOUT)
	{
		directory = dir;
	}
	//Constructor - passes lock and IndexWriter::COMMIT_LOCK_TIMEOUT on to the
	//LuceneLockWith<void> base and keeps r as the reader that doBody commits
	IndexReader::CommitLockWith::CommitLockWith( CL_NS(store)::LuceneLock* lock, IndexReader* r ):
		CL_NS(store)::LuceneLockWith<void>(lock,IndexWriter::COMMIT_LOCK_TIMEOUT),
		reader(r)
	{
	}
	void IndexReader::CommitLockWith::doBody(){
		reader->doCommit();
		reader->segmentInfos->write(reader->getDirectory());
	}

CL_NS_END