searchengine/oss/cl/clucene/src/clucene/search/phrasequery.cpp
changeset 0 671dee74050a
child 1 6f2c1c46032b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/search/phrasequery.cpp	Mon Apr 19 14:40:16 2010 +0300
@@ -0,0 +1,463 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "clucene/stdheader.h"
+#include "PhraseQuery.h"
+
+#include "searchheader.h"
+#include "scorer.h"
+#include "booleanquery.h"
+#include "termquery.h"
+
+#include "clucene/index/term.h"
+#include "clucene/index/terms.h"
+#include "clucene/index/indexreader.h"
+
+#include "clucene/util/stringbuffer.h"
+#include "clucene/util/voidlist.h"
+#include "clucene/util/arrays.h"
+
+#include "exactphrasescorer.h"
+#include "sloppyphrasescorer.h"
+
+CL_NS_USE(index)
+CL_NS_USE(util)
+CL_NS_DEF(search)
+
+  PhraseQuery::PhraseQuery()
+    : terms(false)
+  {
+  //Func - Default constructor
+  //Pre  - true
+  //Post - The query holds no terms and no positions; field is NULL and slop is 0
+
+      //No term has been added yet, so there is no field to remember
+      field = NULL;
+
+      //A slop of 0 is what PhraseWeight::scorer treats as the exact-match case
+      slop = 0;
+  }
+  PhraseQuery::PhraseQuery(const PhraseQuery& clone):
+	Query(clone), terms(false)
+  {
+  //Func - Copy constructor
+  //Pre  - clone is a valid PhraseQuery
+  //Post - This instance has the slop, field, positions and terms of clone.
+  //       Every term is passed through _CL_POINTER before being stored.
+
+      field = clone.field;
+      slop  = clone.slop;
+
+      //Copy the relative positions one by one
+      const int32_t positionCount = clone.positions.size();
+      for ( int32_t p = 0; p < positionCount; ++p ){
+          this->positions.push_back( clone.positions[p] );
+      }
+
+      //Copy the term pointers (distinct index names keep msvc6 happy
+      //without the extra scopes)
+      const int32_t termCount = clone.terms.size();
+      for ( int32_t t = 0; t < termCount; ++t ){
+          this->terms.push_back( _CL_POINTER(clone.terms[t]) );
+      }
+  }
+  Query* PhraseQuery::clone() const{
+  //Func - Creates a copy of this query through the copy constructor
+  //Pre  - true
+  //Post - A newly allocated PhraseQuery has been returned
+	  PhraseQuery* copy = _CLNEW PhraseQuery(*this);
+	  return copy;
+  }
+  bool PhraseQuery::equals(CL_NS(search)::Query *other) const{
+  //Func - Compares this query with other
+  //Pre  - other != NULL
+  //Post - true is returned if other is a PhraseQuery with the same boost,
+  //       slop, terms and positions, otherwise false is returned
+
+      if ( !other->instanceOf(PhraseQuery::getClassName()) )
+          return false;
+
+      PhraseQuery* that = (PhraseQuery*)other;
+
+      //Cheap scalar comparisons first
+      if ( this->getBoost() != that->getBoost() )
+          return false;
+      if ( this->slop != that->slop )
+          return false;
+
+      //Then the terms, element by element
+      {
+          CLListEquals<CL_NS(index)::Term,Term::Equals,
+              const CL_NS(util)::CLVector<CL_NS(index)::Term*>,
+              const CL_NS(util)::CLVector<CL_NS(index)::Term*> > termComparer;
+          if ( !termComparer.equals(&this->terms,&that->terms) )
+              return false;
+      }
+
+      //And finally the relative positions
+      CLListEquals<int32_t,Equals::Int32,
+          const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32>,
+          const CL_NS(util)::CLVector<int32_t,CL_NS(util)::Deletor::DummyInt32> > positionComparer;
+      return positionComparer.equals(&this->positions,&that->positions);
+  }
+
+
+  PhraseQuery::~PhraseQuery(){
+  //Func - Destructor
+  //Pre  - true
+  //Post - Every stored term has been handed to _CLLDECDELETE and the
+  //       positions have been cleared
+
+      //Release each term held by this query
+      const uint32_t termCount = terms.size();
+      for (uint32_t t = 0; t < termCount; ++t){
+          _CLLDECDELETE(terms[t]);
+      }
+
+      positions.clear();
+  }
+
+  size_t PhraseQuery::hashCode() const {
+  //Func - Computes a hash code from the boost, the slop, the terms and
+  //       their positions
+  //Pre  - true
+  //Post - The hash code has been returned
+
+		//todo: do cachedHashCode, and invalidate on add/remove clause
+		size_t ret = Similarity::floatToByte(getBoost()) ^ Similarity::floatToByte(slop);
+
+		//The loop conditions must compare the index against the size: testing
+		//size() alone never becomes false for a non-empty list, so the index
+		//would run past the end of the container.
+		{ //msvc6 scope fix
+			const size_t termCount = terms.size();
+			for ( size_t i=0;i<termCount;i++ )
+				ret = 31 * ret + terms[i]->hashCode();
+		}
+		{ //msvc6 scope fix
+			const size_t positionCount = positions.size();
+			for ( size_t i=0;i<positionCount;i++ )
+				ret = 31 * ret + positions[i];
+		}
+		return ret;
+	}
+
+  const TCHAR* PhraseQuery::getClassName(){
+  //Func - Returns the class name; equals() uses it to recognise PhraseQuery instances
+  //Pre  - true
+  //Post - The string "PhraseQuery" has been returned
+    const TCHAR* className = _T("PhraseQuery");
+    return className;
+  }
+  const TCHAR* PhraseQuery::getQueryName() const{
+  //Func - Returns the name of this query type
+  //Pre  - true
+  //Post - The value of getClassName(), i.e. "PhraseQuery", has been returned
+    const TCHAR* queryName = getClassName();
+    return queryName;
+  }
+
+  
+	/**
+	* Appends a term to the query phrase.
+	* The term is placed one position after the most recently added term, or at
+	* position 0 when the phrase is still empty.
+	*/
+	void PhraseQuery::add(Term* term) {
+		CND_PRECONDITION(term != NULL,"term is NULL");
+
+		//An empty phrase starts at 0, otherwise continue after the last position
+		const int32_t nextPosition =
+			positions.size() > 0 ? (positions[positions.size()-1]) + 1 : 0;
+
+		add(term, nextPosition);
+	}
+
+	void PhraseQuery::add(Term* term, int32_t position) {
+	//Func - Adds a term to the query phrase at the given relative position.
+	//Pre  - term != NULL
+	//Post - The term and its position have been stored. The first term added
+	//       determines the field of the PhraseQuery; if a later term has a
+	//       different field, CL_ERR_IllegalArgument is thrown and nothing is
+	//       stored.
+		CND_PRECONDITION(term != NULL,"term is NULL");
+
+		if (terms.size() == 0)
+			field = term->field();
+		else{
+			//Check if the field of the new term matches the field of the PhraseQuery
+			//can use != because fields are interned
+			if ( term->field() != field){
+				TCHAR buf[200];
+				//Leave room for the terminator and set it explicitly: some
+				//snprintf-style implementations do not terminate the buffer
+				//when the output is truncated (e.g. a very long field name).
+				_sntprintf(buf,199,_T("All phrase terms must be in the same field: %s"),term->field());
+				buf[199] = 0;
+				_CLTHROWT(CL_ERR_IllegalArgument,buf);
+			}
+		}
+		//Store the new term
+		terms.push_back(_CL_POINTER(term));
+
+		positions.push_back(position);
+	}
+
+	void PhraseQuery::getPositions(Array<int32_t>& result) const{
+	//Func - Copies the term positions into result
+	//Pre  - true
+	//Post - result.values points to a newly allocated array holding
+	//       result.length positions; whatever result held before is
+	//       overwritten without being freed
+		result.length = positions.size();
+		result.values = _CL_NEWARRAY(int32_t,result.length);
+
+		for(int32_t idx = 0; idx < result.length; ++idx){
+			result.values[idx] = positions[idx];
+		}
+	}
+	int32_t* PhraseQuery::getPositions() const{
+	//Func - Deprecated variant of getPositions(Array<int32_t>&)
+	//Pre  - true
+	//Post - The array filled in by getPositions(Array<int32_t>&) has been
+	//       returned; its length is not returned with it
+	//NOTE(review): assumes the Array destructor does not free values - confirm
+	    CND_WARNING(false,"getPositions() is deprecated")
+
+        Array<int32_t> holder;
+        getPositions(holder);
+
+        int32_t* values = holder.values;
+        return values;
+	}
+  
+  Weight* PhraseQuery::_createWeight(Searcher* searcher) {
+  //Func - Creates the Weight for this query
+  //Pre  - searcher is valid
+  //Post - For a single-term phrase the weight created by a temporary
+  //       TermQuery carrying the same boost is returned, otherwise a new
+  //       PhraseWeight is returned
+    if (terms.size() != 1)
+      return _CLNEW PhraseWeight(searcher,this);
+
+    // optimize one-term case
+    Term* onlyTerm = terms[0];
+    Query* singleTermQuery = _CLNEW TermQuery(onlyTerm);
+    singleTermQuery->setBoost(getBoost());
+
+    Weight* weight = singleTermQuery->_createWeight(searcher);
+
+    //The temporary query is released before its weight is handed out
+    _CLDELETE(singleTermQuery);
+    return weight;
+  }
+
+
+  Term** PhraseQuery::getTerms() const{
+  //Func - Returns the terms of this query as a NULL-terminated array
+  //       (added by search highlighter)
+  //Pre  - true
+  //Post - A newly allocated array of terms.size()+1 entries has been
+  //       returned. The term pointers are copied as-is, without going
+  //       through _CL_POINTER.
+
+      const int32_t count = terms.size();
+      Term** result = _CL_NEWARRAY(Term*,count+1);
+
+      CND_CONDITION(result != NULL,"Could not allocated memory for ret");
+
+      //Place the terminator, then fill the slots in front of it
+      result[count] = NULL;
+      for ( int32_t slot = 0; slot < count; ++slot ){
+          result[slot] = terms[slot];
+      }
+
+      return result;
+  }
+
+  TCHAR* PhraseQuery::toString(const TCHAR* f) const{
+  //Func - Prints a user-readable version of this query.
+  //Pre  - f may be NULL
+  //Post - NULL is returned if the query has no terms. Otherwise a string of
+  //       the form [field:]"t1 t2 ..."[~slop][^boost] is returned; the field
+  //       prefix is left out when f equals the field of this query.
+
+      const uint32_t termCount = terms.size();
+      if ( termCount == 0 )
+          return NULL;
+
+      StringBuffer out;
+
+      //Only mention the field when it is not the one the caller asked about
+      const bool sameField = ( f != NULL && _tcscmp(field,f) == 0 );
+      if ( !sameField ) {
+          out.append(field);
+          out.append( _T(":"));
+      }
+
+      out.append( _T("\"") );
+
+      for (uint32_t n = 0; n < termCount; ++n) {
+          //Consecutive terms are separated by a single space
+          if ( n != 0 )
+              out.append(_T(" "));
+
+          Term* current = terms[n];
+
+          //Ensure current is a valid Term
+          CND_CONDITION(current !=NULL,"T is NULL");
+
+          out.append( current->text() );
+      }
+
+      out.append( _T("\"") );
+
+      //The slop is only printed when it differs from 0
+      if (slop != 0) {
+          out.append(_T("~"));
+          out.appendFloat(slop,0);
+      }
+
+      //The boost is only printed when it differs from 1.0
+      if (getBoost() != 1.0f) {
+          out.append(_T("^"));
+          out.appendFloat( getBoost(),1 );
+      }
+
+      return out.toString();
+  }
+
+
+
+
+
+
+
+  
+ PhraseQuery::PhraseWeight::PhraseWeight(Searcher* searcher, PhraseQuery* _this) {
+ //Func - Constructor
+ //Pre  - true
+ //Post - The weight refers to the given searcher and query; value, idf,
+ //       queryNorm and queryWeight start at 0
+   this->searcher = searcher;
+   this->_this    = _this;
+
+   //Filled in later by sumOfSquaredWeights() and normalize()
+   this->idf         = 0;
+   this->queryWeight = 0;
+   this->queryNorm   = 0;
+   this->value       = 0;
+ }
+
+ TCHAR* PhraseQuery::PhraseWeight::toString() {
+ //Func - Returns a description of this weight
+ //Post - The result of STRDUP_TtoT on "weight(PhraseQuery)" has been returned
+	TCHAR* description = STRDUP_TtoT(_T("weight(PhraseQuery)"));
+	return description;
+ }
+ PhraseQuery::PhraseWeight::~PhraseWeight(){
+ //Func - Destructor
+ //Post - Nothing is released here: the body is intentionally empty
+ }
+
+ 
+ Query* PhraseQuery::PhraseWeight::getQuery() {
+   //The PhraseQuery that was passed to the constructor
+   return _this;
+ }
+ float_t PhraseQuery::PhraseWeight::getValue() {
+   //Set by normalize(); 0 until then
+   return value;
+ }
+
+ float_t PhraseQuery::PhraseWeight::sumOfSquaredWeights(){
+ //Func - Computes the idf of the phrase terms and the query weight
+ //Post - idf and queryWeight have been updated and the square of
+ //       queryWeight has been returned
+   idf = _this->getSimilarity(searcher)->idf(&_this->terms, searcher);
+
+   // query weight = idf * boost
+   queryWeight = idf * _this->getBoost();
+
+   const float_t squared = queryWeight * queryWeight;
+   return squared;
+ }
+
+ void PhraseQuery::PhraseWeight::normalize(float_t queryNorm) {
+ //Func - Applies the query normalization factor
+ //Post - queryNorm has been stored, queryWeight has been multiplied by it
+ //       and value has been set to queryWeight * idf
+   this->queryNorm = queryNorm;
+
+   // normalize query weight
+   queryWeight = queryWeight * queryNorm;
+
+   // idf for document
+   value = queryWeight * idf;
+ }
+
+  Scorer* PhraseQuery::PhraseWeight::scorer(IndexReader* reader)  {
+  //Func - Creates the scorer for the phrase
+  //Pre  - reader is valid
+  //Post - NULL is returned if the query has no terms or if the reader returns
+  //       NULL term positions for one of them. Otherwise a SloppyPhraseScorer
+  //       (slop != 0) or an ExactPhraseScorer (slop == 0) is returned.
+
+      const int32_t termCount = _this->terms.size();
+
+      //optimize zero-term case
+      if (termCount == 0)
+          return NULL;
+
+      //One slot per term plus a terminating NULL
+      TermPositions** positionEnums = _CL_NEWARRAY(TermPositions*,termCount+1);
+      CND_CONDITION(positionEnums != NULL,"Could not allocate memory for tps");
+
+      for (int32_t t = 0; t < termCount; ++t) {
+          TermPositions* current = reader->termPositions(_this->terms[t]);
+
+          if (current == NULL) {
+              //Release what was collected so far, newest first
+              for (int32_t done = t-1; done >= 0; --done){
+                  _CLVDELETE(positionEnums[done]);  //todo: not a clucene object... should be
+              }
+              _CLDELETE_ARRAY(positionEnums);
+              return NULL;
+          }
+
+          positionEnums[t] = current;
+      }
+      positionEnums[termCount] = NULL;
+
+      Array<int32_t> offsets;
+      _this->getPositions(offsets);
+      const int32_t slop = _this->getSlop();
+
+      Scorer* result = NULL;
+      if ( slop == 0 ) {
+          // optimize exact case
+          result = _CLNEW ExactPhraseScorer(this, positionEnums, offsets.values,
+                                            _this->getSimilarity(searcher),
+                                            reader->norms(_this->field));
+      } else {
+          //todo: need to pass these: this, tps,
+          result = _CLNEW SloppyPhraseScorer(this, positionEnums, offsets.values,
+                                             _this->getSimilarity(searcher),
+                                             slop, reader->norms(_this->field));
+      }
+
+      //The temporary copy of the positions is released once the scorer exists
+      offsets.deleteArray();
+
+      CND_CONDITION(result != NULL,"Could not allocate memory for ret");
+
+      //Only the array itself is deleted here. Per the original author's note
+      //SloppyPhraseScorer or ExactPhraseScorer will take care of its values.
+      _CLDELETE_ARRAY(positionEnums);
+      return result;
+  }
+
+ void PhraseQuery::PhraseWeight::explain(IndexReader* reader, int32_t doc, Explanation* result){
+ //Func - Fills result with an explanation of how doc is scored for this phrase
+ //Pre  - reader and result are valid
+ //Post - result describes the product of the query weight (boost, idf,
+ //       queryNorm) and the field weight (tf, idf, fieldNorm). When the query
+ //       weight equals 1.0 result is replaced by the field weight explanation.
+   TCHAR descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN+1];
+   //Every _sntprintf below is limited to LUCENE_SEARCH_EXPLANATION_DESC_LEN
+   //characters, so this last slot is never overwritten and keeps the buffer
+   //terminated even if an implementation does not terminate on truncation.
+   descbuf[LUCENE_SEARCH_EXPLANATION_DESC_LEN] = 0;
+   TCHAR* tmp;
+   
+   tmp = getQuery()->toString();
+   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,_T("weight(%s in %d), product of:"),
+	   tmp,doc);
+   _CLDELETE_CARRAY(tmp);
+   result->setDescription(descbuf);
+   
+   //Build "term=docFreq ..." for the idf description and the quoted phrase
+   //for the field weight description in one pass over the terms
+   StringBuffer docFreqs;
+   StringBuffer query;
+   query.appendChar('\"');
+   for (uint32_t i = 0; i < _this->terms.size(); i++) {
+     if (i != 0) {
+       docFreqs.appendChar(' ');
+       query.appendChar(' ');
+     }
+
+     Term* term = _this->terms[i];
+
+     docFreqs.append(term->text());
+     docFreqs.appendChar('=');
+     docFreqs.appendInt(searcher->docFreq(term));
+
+     query.append(term->text());
+   }
+   query.appendChar('\"');
+
+   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+	   _T("idf(%s: %s)"),_this->field,docFreqs.getBuffer());
+   Explanation* idfExpl = _CLNEW Explanation(idf, descbuf);
+   
+   // explain query weight
+   Explanation* queryExpl = _CLNEW Explanation;
+   tmp = getQuery()->toString();
+   _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+		_T("queryWeight(%s), product of:"),tmp);
+   _CLDELETE_CARRAY(tmp);
+   queryExpl->setDescription(descbuf);
+
+   Explanation* boostExpl = _CLNEW Explanation(_this->getBoost(), _T("boost"));
+   //A boost of 1.0 is not listed as a detail. In that case nothing else
+   //refers to boostExpl, so it is released below once its value has been used.
+   const bool boostListed = (_this->getBoost() != 1.0f);
+   if (boostListed)
+     queryExpl->addDetail(boostExpl);
+   queryExpl->addDetail(idfExpl);
+   
+   Explanation* queryNormExpl = _CLNEW Explanation(queryNorm,_T("queryNorm"));
+   queryExpl->addDetail(queryNormExpl);
+
+   queryExpl->setValue(boostExpl->getValue() *
+                      idfExpl->getValue() *
+                      queryNormExpl->getValue());
+
+   if (!boostListed)
+     _CLDELETE(boostExpl);
+
+   result->addDetail(queryExpl);
+   
+   // explain field weight
+   Explanation* fieldExpl = _CLNEW Explanation;
+    _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+		_T("fieldWeight(%s:%s in %d), product of:"),
+		_this->field,query.getBuffer(),doc);
+   fieldExpl->setDescription(descbuf);
+
+   
+   Explanation* tfExpl = _CLNEW Explanation;
+   //scorer() returns NULL when the query has no terms or when the reader has
+   //no positions for a term, and it allocates the scorer it returns. Check
+   //for NULL before use and release the scorer afterwards.
+   Scorer* phraseScorer = scorer(reader);
+   if (phraseScorer != NULL){
+     phraseScorer->explain(doc, tfExpl);
+     _CLDELETE(phraseScorer);
+   }
+   fieldExpl->addDetail(tfExpl);
+   //NOTE(review): idfExpl is already a detail of queryExpl; confirm that
+   //Explanation does not release a detail shared by two parents twice.
+   fieldExpl->addDetail(idfExpl);
+
+   Explanation* fieldNormExpl = _CLNEW Explanation();
+   uint8_t* fieldNorms = reader->norms(_this->field);
+   //Without norms for the field the field norm is reported as 0
+   float_t fieldNorm =
+     fieldNorms!=NULL ? Similarity::decodeNorm(fieldNorms[doc]) : 0.0f;
+   fieldNormExpl->setValue(fieldNorm);
+
+   
+    _sntprintf(descbuf,LUCENE_SEARCH_EXPLANATION_DESC_LEN,
+		_T("fieldNorm(field=%s, doc=%d)"),_this->field,doc);
+   fieldNormExpl->setDescription(descbuf);
+   fieldExpl->addDetail(fieldNormExpl);
+   
+   fieldExpl->setValue(tfExpl->getValue() *
+                      idfExpl->getValue() *
+                      fieldNormExpl->getValue());
+   
+   result->addDetail(fieldExpl);
+
+   // combine them
+   result->setValue(queryExpl->getValue() * fieldExpl->getValue());
+
+   if (queryExpl->getValue() == 1.0f){
+     //NOTE(review): fieldExpl was added to result just above; confirm that
+     //Explanation::set and this delete do not release it twice.
+     result->set(*fieldExpl);
+     _CLDELETE(fieldExpl);
+   }
+ }
+
+
+CL_NS_END