searchengine/oss/cl/clucene/src/clucene/highlighter/QueryTermExtractor.cpp
changeset 7 a5fbfefd615f
child 21 2c484ac32ef0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/cl/clucene/src/clucene/highlighter/QueryTermExtractor.cpp	Fri Jun 11 14:43:47 2010 +0300
@@ -0,0 +1,136 @@
+#include "CLucene/StdHeader.h"
+#include "QueryTermExtractor.h"
+
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(index)
+
+	/**
+	* Convenience overload: forwards to getTerms(query, prohibited) with
+	* prohibited set to false.
+	*
+	* @param query Query to extract term texts from
+	* @return whatever getTerms(query, false) returns
+	*/
+	WeightedTerm** QueryTermExtractor::getTerms(const Query *query)
+	{
+		return getTerms(query, false);
+	}
+
+	/**
+	* Collects the weighted terms of a query and hands them back as an array.
+	*
+	* @param query      Query to extract term texts from
+	* @param prohibited passed through to the list-filling overload; when true,
+	*                   terms of prohibited boolean clauses are collected as well
+	* @return array created with _CL_NEWARRAY, sized to the number of collected
+	*         terms plus one and filled by WeightedTermList::toArray
+	*/
+	WeightedTerm** QueryTermExtractor::getTerms(const Query * query, bool prohibited)
+	{
+		WeightedTermList collected(false);
+		getTerms(query, &collected, prohibited);
+
+		// One slot more than the number of collected terms
+		// (presumably room for a terminating NULL written by toArray -- confirm).
+		WeightedTerm** result = _CL_NEWARRAY(WeightedTerm*, collected.size() + 1);
+		collected.toArray(result);
+		return result;
+	}
+
+	/**
+	* Dispatches on the concrete query class and adds that query's terms to
+	* the given list. Queries that are neither BooleanQuery, PhraseQuery nor
+	* TermQuery contribute nothing.
+	*
+	* @param query      Query to extract term texts from
+	* @param terms      list receiving the extracted terms
+	* @param prohibited only used for boolean queries, where it is passed on
+	*/
+	void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms,bool prohibited)
+	{
+		if (query->instanceOf( BooleanQuery::getClassName() ))
+		{
+			getTermsFromBooleanQuery(static_cast<const BooleanQuery*>(query), terms, prohibited);
+			return;
+		}
+
+		if (query->instanceOf( PhraseQuery::getClassName() ))
+		{
+			getTermsFromPhraseQuery(static_cast<const PhraseQuery*>(query), terms);
+			return;
+		}
+
+		if (query->instanceOf( TermQuery::getClassName() ))
+		{
+			getTermsFromTermQuery(static_cast<const TermQuery*>(query), terms);
+			return;
+		}
+
+		// Span queries are not handled yet:
+		//if(query->instanceOf(_T("SpanNearQuery"))
+		//	getTermsFromSpanNearQuery((SpanNearQuery*) query, terms);
+	}
+
+	/**
+	* Extracts all term texts of a given Query into an array of WeightedTerms,
+	* multiplying each term's weight by an inverse document frequency (IDF) factor.
+	*
+	* @param query      Query to extract term texts from
+	* @param reader     used to compute IDF which can be used to a) score selected fragments better
+	*                   b) use graded highlights, e.g. changing intensity of font color
+	* @param fieldName  the field on which Inverse Document Frequency (IDF) calculations are based
+	* @return an array of the terms used in a query, plus their weights.
+	*
+	* A LuceneError whose number is CL_ERR_IO, raised while processing a term,
+	* is ignored and leaves that term's weight unchanged; any other LuceneError
+	* is propagated to the caller.
+	*/
+	WeightedTerm** QueryTermExtractor::getIdfWeightedTerms(const Query* query, IndexReader* reader, const TCHAR* fieldName)
+	{
+		// NOTE(review): this list is constructed with `true`, whereas
+		// getTerms(query, prohibited) constructs its list with `false`. If the
+		// flag makes the list delete its elements on destruction, the array
+		// returned below would point at freed objects -- confirm against the
+		// WeightedTermList definition.
+		WeightedTermList terms(true);
+		getTerms(query,&terms,false);
+
+		const int32_t totalNumDocs = reader->numDocs();
+
+		for (WeightedTermList::iterator itr = terms.begin(); itr != terms.end(); ++itr)
+		{
+			try
+			{
+				Term* term = _CLNEW Term(fieldName,(*itr)->getTerm());
+				int32_t docFreq = 0;
+				try
+				{
+					docFreq = reader->docFreq(term);
+				}
+				catch (...)
+				{
+					// Release our reference before the error leaves this scope;
+					// otherwise the term leaks on every failed lookup, including
+					// the CL_ERR_IO failures that are swallowed below.
+					_CLDECDELETE(term);
+					throw;
+				}
+				_CLDECDELETE(term);
+
+				//IDF algorithm taken from DefaultSimilarity class
+				float_t idf=(float_t)(log(totalNumDocs/(float_t)(docFreq+1)) + 1.0);
+				(*itr)->setWeight((*itr)->getWeight() * idf);
+			}
+			catch (LuceneError& e)
+			{
+				// `throw;` re-raises the original exception object; `throw e;`
+				// would throw a copy and could slice a derived error type.
+				if ( e.number()!=CL_ERR_IO )
+					throw;
+			}
+		}
+
+		// Return extracted terms
+		WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1);
+		terms.toArray(ret);
+
+		return ret;
+	}
+
+	/**
+	* Extracts terms from every clause of a BooleanQuery by recursing into the
+	* clause's sub-query. Clauses flagged as prohibited are skipped unless
+	* `prohibited` is true.
+	*
+	* @param query      boolean query whose clauses are visited
+	* @param terms      list receiving the extracted terms
+	* @param prohibited whether prohibited clauses are visited too
+	*/
+	void QueryTermExtractor::getTermsFromBooleanQuery(const BooleanQuery * query, WeightedTermList * terms, bool prohibited)
+	{
+		// TODO: change Query to get the queryclauses and their number in one function call
+		BooleanClause** clauses = query->getClauses();
+		const uint32_t clauseCount = query->getClauseCount();
+
+		for (uint32_t idx = 0; idx < clauseCount; ++idx)
+		{
+			BooleanClause* clause = clauses[idx];
+
+			// Leave out prohibited clauses unless the caller asked for them.
+			if (!prohibited && clause->prohibited)
+				continue;
+
+			getTerms(clause->query, terms, prohibited);
+		}
+
+		// Release the array obtained from getClauses().
+		_CLDELETE_ARRAY(clauses);
+	}
+
+	/**
+	* Adds one WeightedTerm per term of a PhraseQuery, weighted with the
+	* query's boost. A term for which the list already reports a match is
+	* not inserted again.
+	*
+	* @param query phrase query supplying the terms and the boost
+	* @param terms list receiving the extracted terms
+	*/
+	void QueryTermExtractor::getTermsFromPhraseQuery(const PhraseQuery * query, WeightedTermList * terms)
+	{
+		Term** phraseTerms = query->getTerms();
+
+		// The array is walked until its NULL entry.
+		for (int32_t pos = 0; phraseTerms[pos] != NULL; ++pos)
+		{
+			WeightedTerm* candidate = _CLNEW WeightedTerm(query->getBoost(), phraseTerms[pos]->text());
+
+			if (terms->find(candidate) != terms->end())
+			{
+				// Already present: discard the freshly built object so it does not leak.
+				_CLDELETE(candidate);
+			}
+			else
+			{
+				terms->insert(candidate);
+			}
+		}
+
+		// Release the array obtained from getTerms().
+		_CLDELETE_ARRAY(phraseTerms);
+	}
+
+	/**
+	* Adds the single term of a TermQuery, weighted with the query's boost,
+	* unless the list already reports a match for it.
+	*
+	* @param query term query supplying the term and the boost
+	* @param terms list receiving the extracted term
+	*/
+	void QueryTermExtractor::getTermsFromTermQuery(const TermQuery * query, WeightedTermList * terms)
+	{
+		Term* queryTerm = query->getTerm();
+		WeightedTerm* candidate = _CLNEW WeightedTerm(query->getBoost(), queryTerm->text());
+		// The term is only needed to build the WeightedTerm above.
+		_CLDECDELETE(queryTerm);
+
+		const bool alreadyListed = (terms->find(candidate) != terms->end());
+		if (alreadyListed)
+		{
+			// Already present: discard the freshly built object so it does not leak.
+			_CLDELETE(candidate);
+		}
+		else
+		{
+			terms->insert(candidate);
+		}
+	}
+
+	//todo: implement this when span queries are implemented
+	/*void getTermsFromSpanNearQuery(SpanNearQuery* query, WeightedTermList* terms){
+  	    Collection queryTerms = query.getTerms();
+
+  	    for(Iterator iterator = queryTerms.iterator(); iterator.hasNext();){
+  	        // break it out for debugging.
+  	        Term term = (Term) iterator.next();
+  	        const TCHAR* text = term.text();
+  	        terms.add(_CLNEW WeightedTerm(query.getBoost(), text));
+  	    }
+  	}*/
+
+CL_NS_END2