|
1 #include "CLucene/StdHeader.h" |
|
2 #include "TokenSources.h" |
|
3 |
|
4 #include "CLucene/util/VoidList.h" |
|
5 |
|
6 CL_NS_DEF2(search,highlight) |
|
7 CL_NS_USE(analysis) |
|
8 CL_NS_USE(index) |
|
9 CL_NS_USE(util) |
|
10 |
|
11 TokenSources::TokenSources(void) |
|
12 { |
|
13 } |
|
14 |
|
15 TokenSources::~TokenSources(void) |
|
16 { |
|
17 } |
|
18 |
|
19 TokenStream* TokenSources::getAnyTokenStream(IndexReader* reader,int32_t docId, TCHAR* field, Analyzer* analyzer) |
|
20 { |
|
21 TokenStream* ts=NULL; |
|
22 |
|
23 TermFreqVector* tfv=reader->getTermFreqVector(docId,field); |
|
24 if(tfv!=NULL) |
|
25 { |
|
26 // todo: this is actually very dodgy... we try casting |
|
27 // to TermPositionVector, we take the token stream |
|
28 // only if the cast works... should have a way of |
|
29 // knowing what type this is |
|
30 TermPositionVector* tmp = NULL; |
|
31 try{ |
|
32 tmp = dynamic_cast<TermPositionVector *> (tfv); |
|
33 }catch(...){ |
|
34 //ignore |
|
35 } |
|
36 if ( tmp != NULL ) |
|
37 ts=getTokenStream(tmp); |
|
38 } |
|
39 //No token info stored so fall back to analyzing raw content |
|
40 if(ts==NULL) |
|
41 { |
|
42 ts=getTokenStream(reader,docId,field,analyzer); |
|
43 } |
|
44 return ts; |
|
45 } |
|
46 |
|
47 |
|
48 TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv) |
|
49 { |
|
50 //assumes the worst and makes no assumptions about token position sequences. |
|
51 return getTokenStream(tpv,false); |
|
52 } |
|
53 |
|
54 TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous) |
|
55 { |
|
56 //an object used to iterate across an array of tokens |
|
57 /*class StoredTokenStream extends TokenStream |
|
58 { |
|
59 Token tokens[]; |
|
60 int32_t currentToken=0; |
|
61 StoredTokenStream(Token tokens[]) |
|
62 { |
|
63 this.tokens=tokens; |
|
64 } |
|
65 public Token next() |
|
66 { |
|
67 if(currentToken>=tokens.length) |
|
68 { |
|
69 return NULL; |
|
70 } |
|
71 return tokens[currentToken++]; |
|
72 } |
|
73 } */ |
|
74 //code to reconstruct the original sequence of Tokens |
|
75 const TCHAR** terms=tpv->getTerms(); |
|
76 const int32_t* freq= (int32_t *)tpv->getTermFrequencies(); |
|
77 int32_t freqLen = tpv->size(); |
|
78 |
|
79 size_t totalTokens=0; |
|
80 { |
|
81 for (int32_t t = 0; t < freqLen; t++) |
|
82 totalTokens+=freq[t]; |
|
83 } |
|
84 |
|
85 Token** tokensInOriginalOrder=NULL; |
|
86 CLSetList<Token*,Token::OrderCompare>* unsortedTokens = NULL; |
|
87 for (int32_t t = 0; t < freqLen; t++) |
|
88 { |
|
89 TermVectorOffsetInfo** offsets=(TermVectorOffsetInfo**)tpv->getOffsets(t); |
|
90 if(offsets==NULL) |
|
91 return NULL; |
|
92 |
|
93 int32_t* pos=NULL; |
|
94 int32_t posLen=0; |
|
95 if(tokenPositionsGuaranteedContiguous) |
|
96 { |
|
97 //try get the token position info to speed up assembly of tokens into sorted sequence |
|
98 pos=(int32_t *)tpv->getTermPositions(t); |
|
99 posLen=1;//todo |
|
100 } |
|
101 |
|
102 if ( tokensInOriginalOrder != NULL ) |
|
103 tokensInOriginalOrder = _CL_NEWARRAY(Token*, totalTokens+1); |
|
104 |
|
105 if(pos==NULL) |
|
106 { |
|
107 //tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later |
|
108 if(unsortedTokens==NULL) |
|
109 unsortedTokens=_CLNEW CLSetList<Token*,Token::OrderCompare>(false); |
|
110 for (int32_t tp=0; offsets[tp]!=NULL; tp++) |
|
111 { |
|
112 unsortedTokens->insert(_CLNEW Token(terms[t], |
|
113 offsets[tp]->getStartOffset(), |
|
114 offsets[tp]->getEndOffset())); |
|
115 } |
|
116 } |
|
117 else |
|
118 { |
|
119 //We have positions stored and a guarantee that the token position information is contiguous |
|
120 |
|
121 // This may be fast BUT wont work if Tokenizers used which create >1 token in same position or |
|
122 // creates jumps in position numbers - this code would fail under those circumstances |
|
123 |
|
124 //tokens stored with positions - can use this to index straight into sorted array |
|
125 for (int32_t tp = 0; tp < posLen; tp++) |
|
126 { |
|
127 if ( tokensInOriginalOrder ) |
|
128 { |
|
129 tokensInOriginalOrder[pos[tp]]=_CLNEW Token(terms[t], |
|
130 offsets[tp]->getStartOffset(), |
|
131 offsets[tp]->getEndOffset()); |
|
132 } |
|
133 } |
|
134 } |
|
135 } |
|
136 //If the field has been stored without position data we must perform a sort |
|
137 if(unsortedTokens!=NULL) |
|
138 { |
|
139 if ( totalTokens<unsortedTokens->size() ){ |
|
140 _CLDELETE_ARRAY(tokensInOriginalOrder); |
|
141 tokensInOriginalOrder = _CL_NEWARRAY(Token*,unsortedTokens->size()+1); |
|
142 } |
|
143 //the list has already sorted our items //todo:check that this is true... |
|
144 if ( tokensInOriginalOrder ) |
|
145 unsortedTokens->toArray(tokensInOriginalOrder); |
|
146 |
|
147 return _CLNEW StoredTokenStream(tokensInOriginalOrder,unsortedTokens->size()); |
|
148 }else |
|
149 return _CLNEW StoredTokenStream(tokensInOriginalOrder,totalTokens); |
|
150 } |
|
151 |
|
152 TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field) |
|
153 { |
|
154 TermFreqVector* tfv=reader->getTermFreqVector(docId,field); |
|
155 if(tfv==NULL) |
|
156 { |
|
157 TCHAR buf[250]; |
|
158 _sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId); |
|
159 _CLTHROWT(CL_ERR_IllegalArgument,buf); |
|
160 return NULL; |
|
161 } |
|
162 |
|
163 //todo:bad way of doing this... |
|
164 TermPositionVector* tmp = NULL; |
|
165 try{ |
|
166 tmp = dynamic_cast<TermPositionVector *> (tfv); //check to see if tfv is a Tpv |
|
167 }catch(...){} |
|
168 TokenStream* stream = NULL; |
|
169 if ( tmp != NULL ){ |
|
170 TermPositionVector* tpv = dynamic_cast<TermPositionVector *> (reader->getTermFreqVector(docId,field)); |
|
171 if ( tpv ) |
|
172 stream = getTokenStream(tpv); |
|
173 //return getTokenStream(tpv); |
|
174 }else{ |
|
175 TCHAR buf[250]; |
|
176 _sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId); |
|
177 _CLTHROWT(CL_ERR_IllegalArgument,buf); |
|
178 //return NULL; |
|
179 } |
|
180 return stream; |
|
181 } |
|
182 |
|
183 //convenience method |
|
184 TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field,Analyzer* analyzer) |
|
185 { |
|
186 CL_NS(document)::Document* doc=reader->document(docId); |
|
187 const TCHAR* contents=doc->get(field); |
|
188 if(contents==NULL) |
|
189 { |
|
190 TCHAR buf[250]; |
|
191 _sntprintf(buf,250,_T("Field %s in document #%d is not stored and cannot be analyzed"),field,docId); |
|
192 _CLTHROWT(CL_ERR_IllegalArgument,buf); |
|
193 return NULL; |
|
194 } |
|
195 return analyzer->tokenStream(field,_CLNEW StringReader(contents)); |
|
196 } |
|
197 |
|
198 TokenSources::StoredTokenStream::StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len) |
|
199 { |
|
200 currentToken = 0; |
|
201 this->tokens=tokens; |
|
202 this->length = len; |
|
203 } |
|
204 bool TokenSources::StoredTokenStream::next(CL_NS(analysis)::Token* token) |
|
205 { |
|
206 if(currentToken>=length) |
|
207 { |
|
208 return false; |
|
209 } |
|
210 Token* t = tokens[currentToken++]; |
|
211 |
|
212 token->set(t->termText(),t->startOffset(),t->endOffset(),t->type());; |
|
213 return true; |
|
214 } |
|
215 void TokenSources::StoredTokenStream::close(){ |
|
216 |
|
217 } |
|
218 |
|
219 CL_NS_END2 |