searchengine/oss/cl/clucene/src/clucene/highlighter/QueryTermExtractor.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Fri, 17 Sep 2010 08:35:54 +0300
changeset 21 2c484ac32ef0
parent 7 a5fbfefd615f
permissions -rw-r--r--
Revision: 201035 Kit: 201037

#include "CLucene/StdHeader.h"
#include "QueryTermExtractor.h"

CL_NS_DEF2(search,highlight)
CL_NS_USE(index)

	/**
	 * Extracts the terms of a query, leaving out prohibited clauses.
	 *
	 * @param query Query to extract term texts from
	 * @return whatever getTerms(query, false) returns
	 */
	WeightedTerm** QueryTermExtractor::getTerms(const Query *query)
	{
		// Convenience overload: prohibited clauses are not requested.
		return getTerms(query, false);
	}

	/**
	 * Extracts the terms of a query into a freshly allocated array.
	 *
	 * @param query      Query to extract term texts from
	 * @param prohibited true to also visit prohibited clauses of boolean queries
	 * @return array allocated with _CL_NEWARRAY, holding room for every
	 *         collected term plus one extra slot
	 */
	WeightedTerm** QueryTermExtractor::getTerms(const Query * query, bool prohibited)
	{
		// Gather the terms into a temporary list first.
		WeightedTermList collected(false);
		getTerms(query, &collected, prohibited);

		// Copy the list into an array that has one slot more than there are terms.
		WeightedTerm** result = _CL_NEWARRAY(WeightedTerm*,collected.size()+1);
		collected.toArray(result);
		return result;
	}

	void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms,bool prohibited) 
	{
		if (query->instanceOf( BooleanQuery::getClassName() ))
			getTermsFromBooleanQuery((BooleanQuery *) query, terms, prohibited);
		else if (query->instanceOf( PhraseQuery::getClassName() ))
			getTermsFromPhraseQuery((PhraseQuery *) query, terms);
		else if (query->instanceOf( TermQuery::getClassName() ))
			getTermsFromTermQuery((TermQuery *) query, terms);
		// Adding support for prefix Query to have direct comparision of Query text
		else if (query->instanceOf( PrefixQuery::getClassName() ))
			getTermsFromPrefixQuery((PrefixQuery *) query, terms);
		//else if(query->instanceOf(_T("SpanNearQuery"))
		//	getTermsFromSpanNearQuery((SpanNearQuery*) query, terms);
	}

	/**
	 * Extracts all term texts of a given Query into an array of WeightedTerms,
	 * multiplying each term's weight by an inverse-document-frequency factor.
	 *
	 * @param query     Query to extract term texts from
	 * @param reader    used to compute IDF which can be used to a) score selected fragments better
	 *                  b) use graded highlights, e.g. changing intensity of font color
	 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
	 * @return an array of the terms used in a query, plus their weights.
	 * @throws LuceneError any error other than CL_ERR_IO raised while reading the
	 *         document frequency is propagated to the caller
	 */
	WeightedTerm** QueryTermExtractor::getIdfWeightedTerms(const Query* query, IndexReader* reader, const TCHAR* fieldName)
	{
		// NOTE(review): the sibling getTerms(query, prohibited) builds its list with
		// `false`. If this flag makes the list delete its elements on destruction,
		// the pointers copied into the returned array would dangle -- confirm
		// against the WeightedTermList declaration.
		WeightedTermList terms(true);
		getTerms(query,&terms,false);

		const int32_t totalNumDocs = reader->numDocs();

		for (WeightedTermList::iterator itr = terms.begin(); itr != terms.end(); ++itr)
		{
			try
			{
				Term* term = _CLNEW Term(fieldName,(*itr)->getTerm());
				int32_t docFreq = 0;
				try
				{
					docFreq = reader->docFreq(term);
				}
				catch (...)
				{
					// Release our reference before the error travels on; otherwise
					// the Term would never be released when docFreq() fails.
					_CLDECDELETE(term);
					throw;
				}
				_CLDECDELETE(term);

				//IDF algorithm taken from DefaultSimilarity class
				const float_t idf = static_cast<float_t>(log(totalNumDocs/static_cast<float_t>(docFreq+1)) + 1.0);
				(*itr)->setWeight((*itr)->getWeight() * idf);
			}
			catch (LuceneError& e)
			{
				// I/O errors are deliberately ignored: the term keeps its unscaled weight.
				if ( e.number()!=CL_ERR_IO )
					throw; // rethrow the original exception object instead of a copy
			}
		}

		// Return extracted terms
		WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1);
		terms.toArray(ret);

		return ret;
	}

	void QueryTermExtractor::getTermsFromBooleanQuery(const BooleanQuery * query, WeightedTermList * terms, bool prohibited)
	{
		// TODO: change Query to get the queryclauses and their number in one function call
		BooleanClause** queryClauses = query->getClauses();
		uint32_t numClauses = query->getClauseCount();

		for (uint32_t i = 0; i < numClauses; i++)
		{
			if (prohibited || !queryClauses[i]->prohibited){
				Query* qry = queryClauses[i]->query;
				getTerms(qry, terms, prohibited);
			}
		}

		_CLDELETE_ARRAY(queryClauses);
	}

	void QueryTermExtractor::getTermsFromPhraseQuery(const PhraseQuery * query, WeightedTermList * terms)
	{
		Term** queryTerms = query->getTerms();
		int32_t i = 0;
		while ( queryTerms[i] != NULL ){
			WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),queryTerms[i]->text());
			if (terms->find(pWT)==terms->end()) // possible memory leak if key already present
				terms->insert(pWT);
			else
				_CLDELETE(pWT);

			i++;
		}
		_CLDELETE_ARRAY(queryTerms);
	}

	void QueryTermExtractor::getTermsFromTermQuery(const TermQuery * query, WeightedTermList * terms)
	{
		Term * term = query->getTerm();
		WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),term->text());
		_CLDECDELETE(term);
		if (terms->find(pWT)==terms->end()) // possible memory leak if key already present
			terms->insert(pWT);
		else
			_CLDELETE(pWT);
	}

                             
	/**
	 * Adds the prefix term of a prefix query, weighted with the query's boost,
	 * unless an equal entry is already in the list.
	 *
	 * @param query prefix query supplying the prefix term
	 * @param terms list that receives the extracted term
	 */
	void QueryTermExtractor::getTermsFromPrefixQuery( PrefixQuery * query, WeightedTermList * terms)
	{
		Term* prefixTerm = query->getPrefix();
		WeightedTerm* candidate = _CLNEW WeightedTerm(query->getBoost(),prefixTerm->text());
		// The Term is not used past this point, so the reference is given back now.
		_CLDECDELETE(prefixTerm);

		if (terms->find(candidate) != terms->end()) {
			// An equal entry exists already; discard the duplicate so it does not leak.
			_CLDELETE(candidate);
		} else {
			terms->insert(candidate);
		}
	}

	//todo: implement this when span queries are implemented
	/*void getTermsFromSpanNearQuery(SpanNearQuery* query, WeightedTermList* terms){
  	    Collection queryTerms = query.getTerms();

  	    for(Iterator iterator = queryTerms.iterator(); iterator.hasNext();){
  	        // break it out for debugging.
  	        Term term = (Term) iterator.next();
  	        const TCHAR* text = term.text();
  	        terms.add(_CLNEW WeightedTerm(query.getBoost(), text));
  	    }
  	}*/

CL_NS_END2