|
1 #include "CLucene/StdHeader.h" |
|
2 #include "QueryTermExtractor.h" |
|
3 |
|
4 CL_NS_DEF2(search,highlight) |
|
5 CL_NS_USE(index) |
|
6 |
|
7 WeightedTerm** QueryTermExtractor::getTerms(const Query *query) |
|
8 { |
|
9 WeightedTerm** ret = getTerms(query,false); |
|
10 return ret; |
|
11 } |
|
12 |
|
13 WeightedTerm** QueryTermExtractor::getTerms(const Query * query, bool prohibited) |
|
14 { |
|
15 WeightedTermList terms(false); |
|
16 getTerms(query,&terms,prohibited); |
|
17 |
|
18 // Return extracted terms |
|
19 WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1); |
|
20 terms.toArray(ret); |
|
21 |
|
22 return ret; |
|
23 } |
|
24 |
|
25 void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms,bool prohibited) |
|
26 { |
|
27 if (query->instanceOf( BooleanQuery::getClassName() )) |
|
28 getTermsFromBooleanQuery((BooleanQuery *) query, terms, prohibited); |
|
29 else if (query->instanceOf( PhraseQuery::getClassName() )) |
|
30 getTermsFromPhraseQuery((PhraseQuery *) query, terms); |
|
31 else if (query->instanceOf( TermQuery::getClassName() )) |
|
32 getTermsFromTermQuery((TermQuery *) query, terms); |
|
33 //else if(query->instanceOf(_T("SpanNearQuery")) |
|
34 // getTermsFromSpanNearQuery((SpanNearQuery*) query, terms); |
|
35 } |
|
36 |
|
37 /** |
|
38 * Extracts all terms texts of a given Query into an array of WeightedTerms |
|
39 * |
|
40 * @param query Query to extract term texts from |
|
41 * @param reader used to compute IDF which can be used to a) score selected fragments better |
|
42 * b) use graded highlights eg chaning intensity of font color |
|
43 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based |
|
44 * @return an array of the terms used in a query, plus their weights. |
|
45 */ |
|
46 WeightedTerm** QueryTermExtractor::getIdfWeightedTerms(const Query* query, IndexReader* reader, const TCHAR* fieldName) |
|
47 { |
|
48 WeightedTermList terms(true); |
|
49 getTerms(query,&terms,false); |
|
50 |
|
51 int32_t totalNumDocs=reader->numDocs(); |
|
52 |
|
53 WeightedTermList::iterator itr = terms.begin(); |
|
54 while ( itr != terms.end() ) |
|
55 { |
|
56 try |
|
57 { |
|
58 Term* term = _CLNEW Term(fieldName,(*itr)->getTerm()); |
|
59 int32_t docFreq=reader->docFreq(term); |
|
60 _CLDECDELETE(term); |
|
61 |
|
62 //IDF algorithm taken from DefaultSimilarity class |
|
63 float_t idf=(float_t)(log(totalNumDocs/(float_t)(docFreq+1)) + 1.0); |
|
64 (*itr)->setWeight((*itr)->getWeight() * idf); |
|
65 }catch (LuceneError& e){ |
|
66 if ( e.number()!=CL_ERR_IO ) |
|
67 throw e; |
|
68 } |
|
69 |
|
70 itr++; |
|
71 } |
|
72 |
|
73 // Return extracted terms |
|
74 WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1); |
|
75 terms.toArray(ret); |
|
76 |
|
77 return ret; |
|
78 } |
|
79 |
|
80 void QueryTermExtractor::getTermsFromBooleanQuery(const BooleanQuery * query, WeightedTermList * terms, bool prohibited) |
|
81 { |
|
82 // TODO: change Query to get the queryclauses and their number in one function call |
|
83 BooleanClause** queryClauses = query->getClauses(); |
|
84 uint32_t numClauses = query->getClauseCount(); |
|
85 |
|
86 for (uint32_t i = 0; i < numClauses; i++) |
|
87 { |
|
88 if (prohibited || !queryClauses[i]->prohibited){ |
|
89 Query* qry = queryClauses[i]->query; |
|
90 getTerms(qry, terms, prohibited); |
|
91 } |
|
92 } |
|
93 |
|
94 _CLDELETE_ARRAY(queryClauses); |
|
95 } |
|
96 |
|
97 void QueryTermExtractor::getTermsFromPhraseQuery(const PhraseQuery * query, WeightedTermList * terms) |
|
98 { |
|
99 Term** queryTerms = query->getTerms(); |
|
100 int32_t i = 0; |
|
101 while ( queryTerms[i] != NULL ){ |
|
102 WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),queryTerms[i]->text()); |
|
103 if (terms->find(pWT)==terms->end()) // possible memory leak if key already present |
|
104 terms->insert(pWT); |
|
105 else |
|
106 _CLDELETE(pWT); |
|
107 |
|
108 i++; |
|
109 } |
|
110 _CLDELETE_ARRAY(queryTerms); |
|
111 } |
|
112 |
|
113 void QueryTermExtractor::getTermsFromTermQuery(const TermQuery * query, WeightedTermList * terms) |
|
114 { |
|
115 Term * term = query->getTerm(); |
|
116 WeightedTerm * pWT = _CLNEW WeightedTerm(query->getBoost(),term->text()); |
|
117 _CLDECDELETE(term); |
|
118 if (terms->find(pWT)==terms->end()) // possible memory leak if key already present |
|
119 terms->insert(pWT); |
|
120 else |
|
121 _CLDELETE(pWT); |
|
122 } |
|
123 |
|
124 //todo: implement this when span queries are implemented |
|
125 /*void getTermsFromSpanNearQuery(SpanNearQuery* query, WeightedTermList* terms){ |
|
126 Collection queryTerms = query.getTerms(); |
|
127 |
|
128 for(Iterator iterator = queryTerms.iterator(); iterator.hasNext();){ |
|
129 // break it out for debugging. |
|
130 Term term = (Term) iterator.next(); |
|
131 const TCHAR* text = term.text(); |
|
132 terms.add(_CLNEW WeightedTerm(query.getBoost(), text)); |
|
133 } |
|
134 }*/ |
|
135 |
|
136 CL_NS_END2 |