searchengine/oss/cl/clucene/src/clucene/search/phrasequery.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Mon, 03 May 2010 13:33:22 +0300
changeset 1 6f2c1c46032b
parent 0 671dee74050a
child 2 6c1a2771f4b7
permissions -rw-r--r--
Revision: 201015 Kit: 201018

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "clucene/stdheader.h"
#include "phrasequery.h"

#include "searchheader.h"
#include "scorer.h"
#include "booleanquery.h"
#include "termquery.h"

#include "clucene/index/term.h"
#include "clucene/index/terms.h"
#include "clucene/index/indexreader.h"

#include "clucene/util/stringbuffer.h"
#include "clucene/util/voidlist.h"
#include "clucene/util/arrays.h"

#include "exactphrasescorer.h"
#include "sloppyphrasescorer.h"

CL_NS_USE(index)
CL_NS_USE(util)
CL_NS_DEF(search)

  PhraseQuery::PhraseQuery():
	terms(false)
  {
  //Func - Default constructor
  //Pre  - true
  //Post - An empty PhraseQuery exists: it has no field yet and a slop of 0

      //No term has been added so far, so the phrase is not tied to a field
      field = NULL;

      //scorer() picks the exact phrase scorer when the slop is 0
      slop  = 0;
  }
  PhraseQuery::PhraseQuery(const PhraseQuery& clone):
	Query(clone), terms(false)
  {
  //Func - Copy constructor
  //Pre  - clone is a valid PhraseQuery
  //Post - This instance has the slop, field, positions and terms of clone

      field = clone.field;
      slop  = clone.slop;

      //Duplicate the relative positions in their original order
      const int32_t positionCount = clone.positions.size();
      for ( int32_t p = 0; p < positionCount; ++p ){
          this->positions.push_back( clone.positions[p] );
      }

      //Store every term of clone after passing it through _CL_POINTER,
      //the same way add() stores a term
      const int32_t termCount = clone.terms.size();
      for ( int32_t t = 0; t < termCount; ++t ){
          this->terms.push_back( _CL_POINTER(clone.terms[t]) );
      }
  }
  Query* PhraseQuery::clone() const{
  //Func - Creates a copy of this query by means of the copy constructor
  //Pre  - true
  //Post - A PhraseQuery allocated with _CLNEW has been returned
      PhraseQuery* copy = _CLNEW PhraseQuery(*this);
      return copy;
  }
  bool PhraseQuery::equals(CL_NS(search)::Query *other) const{
  //Func - Compares this query with other
  //Pre  - other != NULL
  //Post - true is returned when other is a PhraseQuery with the same boost,
  //       the same slop, equal terms and equal positions; otherwise false

      //Anything that is not a PhraseQuery can never be equal
      if ( !other->instanceOf(PhraseQuery::getClassName()) )
          return false;

      PhraseQuery* that = (PhraseQuery*)other;

      //Cheap scalar comparisons first
      if ( !(this->getBoost() == that->getBoost()) )
          return false;
      if ( !(this->slop == that->slop) )
          return false;

      //The terms are compared with Term::Equals
      {
          CLListEquals<CL_NS(index)::Term,Term::Equals,
              const CL_NS(util)::CLVector<CL_NS(index)::Term*>,
              const CL_NS(util)::CLVector<CL_NS(index)::Term*> > termComparer;
          if ( !termComparer.equals(&this->terms,&that->terms) )
              return false;
      }

      //The positions are compared last; their outcome is the final answer
      CLListEquals<int32_t,Equals::Int32,
          const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32>,
          const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32> > positionComparer;
      return positionComparer.equals(&this->positions,&that->positions);
  }


  PhraseQuery::~PhraseQuery(){
  //Func - Destructor
  //Pre  - true
  //Post - The instance has been destroyed

      //Hand every stored term to _CLLDECDELETE, first to last
      const uint32_t termCount = terms.size();
      for (uint32_t t = 0; t < termCount; ++t){
          _CLLDECDELETE(terms[t]);
      }

      //Drop the stored positions as well
      positions.clear();
  }

  size_t PhraseQuery::hashCode() const {
  //Func - Computes a hash code for this query
  //Pre  - true
  //Post - A value derived from the boost, the slop, the hash code of every
  //       term and every position has been returned

		//todo: do cachedHashCode, and invalidate on add/remove clause
		size_t ret = Similarity::floatToByte(getBoost()) ^ Similarity::floatToByte(slop);

		//Fold in the terms. The loop must be bounded by the index: using
		//terms.size() alone as the condition never terminates for a
		//non-empty query and reads past the end of the list.
		const size_t termCount = terms.size();
		for ( size_t t = 0; t < termCount; ++t )
			ret = 31 * ret + terms[t]->hashCode();

		//Fold in the positions, bounded the same way
		const size_t positionCount = positions.size();
		for ( size_t p = 0; p < positionCount; ++p )
			ret = 31 * ret + positions[p];

		return ret;
	}

  const TCHAR* PhraseQuery::getClassName(){
  //Func - Returns the name of this query class
  //Pre  - true
  //Post - The string "PhraseQuery" has been returned
    const TCHAR* className = _T("PhraseQuery");
    return className;
  }
  const TCHAR* PhraseQuery::getQueryName() const{
  //Func - Returns the name of this query; it is the class name
  //Pre  - true
  //Post - The value of getClassName() has been returned
    const TCHAR* queryName = getClassName();
    return queryName;
  }

  
	/**
	* Appends a term to the query phrase.
	* The term gets the position following the last stored position, or
	* position 0 when no position has been stored yet.
	*/
	void PhraseQuery::add(Term* term) {
		CND_PRECONDITION(term != NULL,"term is NULL");

		//Derive the next relative position from the last one stored
		const size_t stored = positions.size();
		const int32_t next = (stored > 0) ? (positions[stored-1]) + 1 : 0;

		add(term, next);
	}

	void PhraseQuery::add(Term* term, int32_t position) {
	//Func - Adds a term to the query phrase at the given relative position.
	//Pre  - term != NULL
	//Post - The term and its position have been stored. The first term added
	//       determines the field of the PhraseQuery. When a later term has
	//       a different field, CL_ERR_IllegalArgument is thrown and nothing
	//       is stored. The function does not return a value.
		CND_PRECONDITION(term != NULL,"term is NULL");

		if (terms.size() == 0)
			field = term->field();
		else{
			//Check if the field of the new term matches the field of the PhraseQuery
			//can use != because fields are interned
			if ( term->field() != field){
				TCHAR buf[200];
				//Leave room for the terminator and write it explicitly: some
				//_sntprintf implementations do not terminate the buffer when
				//the output is truncated (e.g. a very long field name)
				_sntprintf(buf,199,_T("All phrase terms must be in the same field: %s"),term->field());
				buf[199] = 0;
				_CLTHROWT(CL_ERR_IllegalArgument,buf);
			}
		}
		//Store the new term
		terms.push_back(_CL_POINTER(term));

		positions.push_back(position);
	}

	void PhraseQuery::getPositions(Array<int32_t>& result) const{
	//Func - Copies the stored positions into a newly allocated array
	//Pre  - true
	//Post - result.length holds the number of positions and result.values
	//       points to an array allocated with _CL_NEWARRAY that contains
	//       them; the previous members of result are overwritten
		result.length = positions.size();

		int32_t* copy = _CL_NEWARRAY(int32_t,result.length);
		int32_t idx = 0;
		while ( idx < result.length ){
			copy[idx] = positions[idx];
			++idx;
		}
		result.values = copy;
	}
	int32_t* PhraseQuery::getPositions() const{
	//Func - Deprecated variant that returns the positions as a raw array
	//Pre  - true
	//Post - The values pointer filled in by getPositions(Array<int32_t>&) has
	//       been returned; the number of elements is not reported
	//NOTE(review): presumably the caller has to release the array - confirm
	//       against the Array destructor
	    CND_WARNING(false,"getPositions() is deprecated")

		Array<int32_t> holder;
		getPositions(holder);

		int32_t* raw = holder.values;
		return raw;
	}
  
  Weight* PhraseQuery::_createWeight(Searcher* searcher) {
  //Func - Creates the Weight for this query
  //Pre  - true
  //Post - A PhraseWeight has been returned, unless the phrase consists of
  //       exactly one term: then the Weight of an equivalent TermQuery
  //       carrying the same boost has been returned

    //General case: anything but a single term gets a PhraseWeight
    if (terms.size() != 1)
      return _CLNEW PhraseWeight(searcher,this);

    // optimize one-term case
    Query* single = _CLNEW TermQuery(terms[0]);
    single->setBoost(getBoost());
    Weight* weight = single->_createWeight(searcher);

    //The temporary query is released here, before its weight is returned
    _CLDELETE(single);
    return weight;
  }


  Term** PhraseQuery::getTerms() const{
  //Func - Returns the terms of the phrase (added by search highlighter)
  //Pre  - true
  //Post - An array allocated with _CL_NEWARRAY has been returned. It holds
  //       the stored Term pointers in order, followed by a NULL entry

      const int32_t count = terms.size();

      //One extra slot for the terminating NULL
      Term** list = _CL_NEWARRAY(Term*,count+1);
      CND_CONDITION(list != NULL,"Could not allocated memory for ret");

      list[count] = NULL;

      //The pointers themselves are copied; _CL_POINTER is not applied here
      int32_t idx = 0;
      while ( idx < count ){
          list[idx] = terms[idx];
          ++idx;
      }
      return list;
  }

  TCHAR* PhraseQuery::toString(const TCHAR* f) const{
  //Func - Prints a user-readable version of this query.
  //Pre  - f may be NULL
  //Post - NULL has been returned when the phrase has no terms. Otherwise the
  //       query string has been returned: the field prefix (omitted when f
  //       equals the field of this query), the quoted terms, "~slop" when
  //       the slop is not 0 and "^boost" when the boost is not 1.0

      const uint32_t termCount = terms.size();
      if ( termCount == 0 )
          return NULL;

      StringBuffer out;

      //The field prefix is only left out when f names the field of this query
      const bool isDefaultField = ( f != NULL && _tcscmp(field,f) == 0 );
      if ( !isDefaultField ) {
          out.append(field);
          out.append( _T(":") );
      }

      //Quoted list of term texts, separated by single spaces
      out.append( _T("\"") );
      for (uint32_t idx = 0; idx < termCount; ++idx) {
          if ( idx > 0 )
              out.append( _T(" ") );

          Term* current = terms[idx];

          //Ensure current is a valid Term
          CND_CONDITION(current !=NULL,"T is NULL");

          out.append( current->text() );
      }
      out.append( _T("\"") );

      //Optional slop suffix
      if ( slop != 0 ) {
          out.append( _T("~") );
          out.appendFloat(slop,0);
      }

      //Optional boost suffix, only when the boost differs from 1.0
      if ( getBoost() != 1.0f ) {
          out.append( _T("^") );
          out.appendFloat( getBoost(),1 );
      }

      return out.toString();
  }







  
 PhraseQuery::PhraseWeight::PhraseWeight(Searcher* searcher, PhraseQuery* _this) {
 //Func - Constructor
 //Pre  - true
 //Post - The weight refers to the given searcher and query; all of its
 //       numeric members are 0

   //Remember the collaborators
   this->searcher = searcher;
   this->_this    = _this;

   //These stay 0 until sumOfSquaredWeights() and normalize() assign them
   this->idf         = 0;
   this->queryWeight = 0;
   this->queryNorm   = 0;
   this->value       = 0;
 }

 TCHAR* PhraseQuery::PhraseWeight::toString() { 
 //Func - Returns a textual name for this weight
 //Pre  - true
 //Post - A copy of "weight(PhraseQuery)" made with STRDUP_TtoT has been returned
	TCHAR* label = STRDUP_TtoT(_T("weight(PhraseQuery)"));
	return label;
 }
 PhraseQuery::PhraseWeight::~PhraseWeight(){
 //Func - Destructor
 //Pre  - true
 //Post - The instance has been destroyed; the body releases nothing itself
 }

 
 Query* PhraseQuery::PhraseWeight::getQuery() {
   //The query handed to the constructor
   Query* owner = _this;
   return owner;
 }
 float_t PhraseQuery::PhraseWeight::getValue() {
   //0 until normalize() has assigned the value
   return value;
 }

 float_t PhraseQuery::PhraseWeight::sumOfSquaredWeights(){
 //Func - Computes idf and the query weight of the phrase
 //Pre  - true
 //Post - idf and queryWeight have been updated and the square of
 //       queryWeight has been returned

   //idf over all terms of the phrase, as reported by the Similarity
   idf = _this->getSimilarity(searcher)->idf(&_this->terms, searcher);

   //query weight = boost * idf
   queryWeight = _this->getBoost() * idf;

   return queryWeight * queryWeight;
 }

 void PhraseQuery::PhraseWeight::normalize(float_t queryNorm) {
 //Func - Applies the query normalisation factor
 //Pre  - true
 //Post - queryNorm has been stored, queryWeight has been scaled by it and
 //       value has been set to queryWeight * idf

   this->queryNorm = queryNorm;

   //normalize query weight
   queryWeight = queryWeight * queryNorm;

   //idf for document
   value = idf * queryWeight;
 }

  Scorer* PhraseQuery::PhraseWeight::scorer(IndexReader* reader)  {
  //Func - Creates the scorer for the phrase on the given reader
  //Pre  - reader != NULL
  //Post - NULL has been returned when the phrase has no terms or when the
  //       reader delivers no TermPositions for one of the terms. Otherwise
  //       an ExactPhraseScorer (slop == 0) or a SloppyPhraseScorer
  //       (slop != 0) allocated with _CLNEW has been returned

      const int32_t termCount = _this->terms.size();

      //optimize zero-term case
      if (termCount == 0)
          return NULL;

      //One slot per term plus a terminating NULL entry
      TermPositions** perTerm = _CL_NEWARRAY(TermPositions*,termCount+1);
      CND_CONDITION(perTerm != NULL,"Could not allocate memory for tps");

      for (int32_t idx = 0; idx < termCount; ++idx) {
          //Get the TermPositions for the idx-th term
          TermPositions* current = reader->termPositions(_this->terms[idx]);

          if (current == NULL) {
              //Give up: release what was retrieved so far, last one first
              for (int32_t prev = idx-1; prev >= 0; --prev){
                  _CLVDELETE(perTerm[prev]);  //todo: not a clucene object... should be
              }
              _CLDELETE_ARRAY(perTerm);
              return NULL;
          }

          perTerm[idx] = current;
      }
      perTerm[termCount] = NULL;

      //Temporary copy of the relative positions for the scorer constructor
      Array<int32_t> relative;
      _this->getPositions(relative);

      const int32_t slop = _this->getSlop();

      Scorer* result = NULL;
      if ( slop == 0 ){
          // optimize exact case
          result = _CLNEW ExactPhraseScorer(this, perTerm, relative.values,
                                            _this->getSimilarity(searcher),
                                            reader->norms(_this->field));
      }else{
          //todo: need to pass these: this, tps,
          result = _CLNEW SloppyPhraseScorer(this, perTerm, relative.values,
                                             _this->getSimilarity(searcher),
                                             slop, reader->norms(_this->field));
      }

      //The temporary positions array is released once the scorer exists
      relative.deleteArray();

      CND_CONDITION(result != NULL,"Could not allocate memory for ret");

      //Only the array itself is deleted. SloppyPhraseScorer or
      //ExactPhraseScorer will take care of its values
      _CLDELETE_ARRAY(perTerm);
      return result;
  }

 void PhraseQuery::PhraseWeight::explain(IndexReader* reader, int32_t doc, Explanation* result){
 //Func - Fills result with an explanation of the score of document doc
 //Pre  - reader != NULL, result != NULL
 //Post - result describes "weight(query in doc)" as the product of a
 //       queryWeight part (boost * idf * queryNorm) and a fieldWeight part
 //       (tf * idf * fieldNorm). When the queryWeight part equals 1.0,
 //       result is overwritten with the fieldWeight part only

   TCHAR descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN+1];
   //Every _sntprintf below is limited to LUCENE_SEARCH_EXPLANATION_DESC_LEN
   //characters and therefore never touches the last element. Terminating it
   //once keeps descbuf a valid string even when an implementation does not
   //terminate truncated output.
   descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN] = 0;
   TCHAR* tmp;
   
   //toString() returns NULL for a phrase without terms; never pass that to %s
   tmp = getQuery()->toString();
   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,_T("weight(%s in %d), product of:"),
	   tmp != NULL ? tmp : _T(""),doc);
   _CLDELETE_CARRAY(tmp);
   result->setDescription(descbuf);
   
   //Build "term=docFreq ..." for the idf description and the quoted phrase
   //for the fieldWeight description in one pass over the terms
   StringBuffer docFreqs;
   StringBuffer query;
   query.appendChar('\"');
   for (uint32_t i = 0; i < _this->terms.size(); i++) {
     if (i != 0) {
       docFreqs.appendChar(' ');
       query.appendChar(' ');
     }

     Term* term = _this->terms[i];

     docFreqs.append(term->text());
     docFreqs.appendChar('=');
     docFreqs.appendInt(searcher->docFreq(term));

     query.append(term->text());
   }
   query.appendChar('\"');

   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
	   _T("idf(%s: %s)"),_this->field,docFreqs.getBuffer());
   Explanation* idfExpl = _CLNEW Explanation(idf, descbuf);
   
   // explain query weight
   Explanation* queryExpl = _CLNEW Explanation;
   tmp = getQuery()->toString();
   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
		_T("queryWeight(%s), product of:"),tmp != NULL ? tmp : _T(""));
   _CLDELETE_CARRAY(tmp);
   queryExpl->setDescription(descbuf);

   //The boost is only listed as a detail when it differs from 1.0, but its
   //value is needed for the product in either case
   Explanation* boostExpl = _CLNEW Explanation(_this->getBoost(), _T("boost"));
   const bool boostAttached = (_this->getBoost() != 1.0f);
   if (boostAttached)
     queryExpl->addDetail(boostExpl);
   queryExpl->addDetail(idfExpl);
   
   Explanation* queryNormExpl = _CLNEW Explanation(queryNorm,_T("queryNorm"));
   queryExpl->addDetail(queryNormExpl);

   queryExpl->setValue(boostExpl->getValue() *
                      idfExpl->getValue() *
                      queryNormExpl->getValue());

   //A boost explanation that was not attached to queryExpl is referenced by
   //nobody else, so it has to be released here to avoid leaking it
   if (!boostAttached){
     _CLDELETE(boostExpl);
   }

   result->addDetail(queryExpl);
   
   // explain field weight
   Explanation* fieldExpl = _CLNEW Explanation;
    _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
		_T("fieldWeight(%s:%s in %d), product of:"),
		_this->field,query.getBuffer(),doc);
   fieldExpl->setDescription(descbuf);

   
   //scorer() returns NULL when the phrase has no terms or when the reader has
   //no TermPositions for a term; tfExpl then keeps its default content.
   //The scorer is allocated for this call only and is released afterwards.
   Explanation* tfExpl = _CLNEW Explanation;
   Scorer* phraseScorer = scorer(reader);
   if (phraseScorer != NULL){
     phraseScorer->explain(doc, tfExpl);
     _CLDELETE(phraseScorer);
   }
   fieldExpl->addDetail(tfExpl);
   //NOTE(review): idfExpl is attached to queryExpl as well as to fieldExpl -
   //confirm against Explanation that a detail may have two parents
   fieldExpl->addDetail(idfExpl);

   //A reader without norms for the field yields a field norm of 0
   Explanation* fieldNormExpl = _CLNEW Explanation();
   uint8_t* fieldNorms = reader->norms(_this->field);
   float_t fieldNorm =
     fieldNorms!=NULL ? Similarity::decodeNorm(fieldNorms[doc]) : 0.0f;
   fieldNormExpl->setValue(fieldNorm);

   
    _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
		_T("fieldNorm(field=%s, doc=%d)"),_this->field,doc);
   fieldNormExpl->setDescription(descbuf);
   fieldExpl->addDetail(fieldNormExpl);
   
   fieldExpl->setValue(tfExpl->getValue() *
                      idfExpl->getValue() *
                      fieldNormExpl->getValue());
   
   result->addDetail(fieldExpl);

   // combine them
   result->setValue(queryExpl->getValue() * fieldExpl->getValue());

   //A query weight of exactly 1.0 adds no information: report the field
   //weight alone
   //NOTE(review): fieldExpl was attached to result just above and is deleted
   //here after set() - confirm the ownership rules of Explanation::set
   if (queryExpl->getValue() == 1.0f){
     result->set(*fieldExpl);
     _CLDELETE(fieldExpl);
   }
 }


CL_NS_END