searchengine/oss/cl/clucene/src/clucene/highlighter/QueryTermExtractor.cpp
changeset 7 a5fbfefd615f
child 21 2c484ac32ef0
equal deleted inserted replaced
3:ae3f1779f6da 7:a5fbfefd615f
       
     1 #include "CLucene/StdHeader.h"
       
     2 #include "QueryTermExtractor.h"
       
     3 
       
     4 CL_NS_DEF2(search,highlight)
       
     5 CL_NS_USE(index)
       
     6 
       
     7 	WeightedTerm** QueryTermExtractor::getTerms(const Query *query) 
       
     8 	{
       
     9 		WeightedTerm** ret = getTerms(query,false);
       
    10 		return ret;
       
    11 	}
       
    12 
       
    13 	WeightedTerm** QueryTermExtractor::getTerms(const Query * query, bool prohibited) 
       
    14 	{
       
    15 		WeightedTermList terms(false);
       
    16 		getTerms(query,&terms,prohibited);
       
    17 
       
    18 		// Return extracted terms
       
    19 		WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1);
       
    20 		terms.toArray(ret);
       
    21 
       
    22 		return ret;
       
    23 	}
       
    24 
       
    25 	void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms,bool prohibited) 
       
    26 	{
       
    27 		if (query->instanceOf( BooleanQuery::getClassName() ))
       
    28 			getTermsFromBooleanQuery((BooleanQuery *) query, terms, prohibited);
       
    29 		else if (query->instanceOf( PhraseQuery::getClassName() ))
       
    30 			getTermsFromPhraseQuery((PhraseQuery *) query, terms);
       
    31 		else if (query->instanceOf( TermQuery::getClassName() ))
       
    32 			getTermsFromTermQuery((TermQuery *) query, terms);
       
    33 		//else if(query->instanceOf(_T("SpanNearQuery"))
       
    34 		//	getTermsFromSpanNearQuery((SpanNearQuery*) query, terms);
       
    35 	}
       
    36 
       
    37 	/**
       
    38   	* Extracts all terms texts of a given Query into an array of WeightedTerms
       
    39   	*
       
    40   	* @param query      Query to extract term texts from
       
    41   	* @param reader used to compute IDF which can be used to a) score selected fragments better
       
    42   	* b) use graded highlights eg chaning intensity of font color
       
    43   	* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
       
    44   	* @return an array of the terms used in a query, plus their weights.
       
    45   	*/
       
    46   	WeightedTerm** QueryTermExtractor::getIdfWeightedTerms(const Query* query, IndexReader* reader, const TCHAR* fieldName)
       
    47   	{
       
    48   	    WeightedTermList terms(true);
       
    49 		getTerms(query,&terms,false);
       
    50 
       
    51   	    int32_t totalNumDocs=reader->numDocs();
       
    52 		
       
    53 		WeightedTermList::iterator itr = terms.begin();
       
    54   	    while ( itr != terms.end() )
       
    55   		{
       
    56   			try
       
    57   			{
       
    58 				Term* term = _CLNEW Term(fieldName,(*itr)->getTerm());
       
    59   				int32_t docFreq=reader->docFreq(term);
       
    60 				_CLDECDELETE(term);
       
    61 
       
    62   				//IDF algorithm taken from DefaultSimilarity class
       
    63   				float_t idf=(float_t)(log(totalNumDocs/(float_t)(docFreq+1)) + 1.0);
       
    64   				(*itr)->setWeight((*itr)->getWeight() * idf);
       
    65   			}catch (LuceneError& e){
       
    66   				if ( e.number()!=CL_ERR_IO )
       
    67 					throw e;
       
    68   			}
       
    69 
       
    70 			itr++;
       
    71   		}
       
    72   	   
       
    73 		// Return extracted terms
       
    74 		WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1);
       
    75 		terms.toArray(ret);
       
    76 
       
    77 		return ret;
       
    78   	}
       
    79 
       
    80 	void QueryTermExtractor::getTermsFromBooleanQuery(const BooleanQuery * query, WeightedTermList * terms, bool prohibited)
       
    81 	{
       
    82 		// TODO: change Query to get the queryclauses and their number in one function call
       
    83 		BooleanClause** queryClauses = query->getClauses();
       
    84 		uint32_t numClauses = query->getClauseCount();
       
    85 
       
    86 		for (uint32_t i = 0; i < numClauses; i++)
       
    87 		{
       
    88 			if (prohibited || !queryClauses[i]->prohibited){
       
    89 				Query* qry = queryClauses[i]->query;
       
    90 				getTerms(qry, terms, prohibited);
       
    91 			}
       
    92 		}
       
    93 
       
    94 		_CLDELETE_ARRAY(queryClauses);
       
    95 	}
       
    96 
       
    97 	void QueryTermExtractor::getTermsFromPhraseQuery(const PhraseQuery * query, WeightedTermList * terms)
       
    98 	{
       
    99 		Term** queryTerms = query->getTerms();
       
   100 		int32_t i = 0;
       
   101 		while ( queryTerms[i] != NULL ){
       
   102 			WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),queryTerms[i]->text());
       
   103 			if (terms->find(pWT)==terms->end()) // possible memory leak if key already present
       
   104 				terms->insert(pWT);
       
   105 			else
       
   106 				_CLDELETE(pWT);
       
   107 
       
   108 			i++;
       
   109 		}
       
   110 		_CLDELETE_ARRAY(queryTerms);
       
   111 	}
       
   112 
       
   113 	void QueryTermExtractor::getTermsFromTermQuery(const TermQuery * query, WeightedTermList * terms)
       
   114 	{
       
   115 		Term * term = query->getTerm();
       
   116 		WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),term->text());
       
   117 		_CLDECDELETE(term);
       
   118 		if (terms->find(pWT)==terms->end()) // possible memory leak if key already present
       
   119 			terms->insert(pWT);
       
   120 		else
       
   121 			_CLDELETE(pWT);
       
   122 	}
       
   123 
       
   124 	//todo: implement this when span queries are implemented
       
   125 	/*void getTermsFromSpanNearQuery(SpanNearQuery* query, WeightedTermList* terms){
       
   126   	    Collection queryTerms = query.getTerms();
       
   127 
       
   128   	    for(Iterator iterator = queryTerms.iterator(); iterator.hasNext();){
       
   129   	        // break it out for debugging.
       
   130   	        Term term = (Term) iterator.next();
       
   131   	        const TCHAR* text = term.text();
       
   132   	        terms.add(_CLNEW WeightedTerm(query.getBoost(), text));
       
   133   	    }
       
   134   	}*/
       
   135 
       
   136 CL_NS_END2