author | hgs |
Fri, 15 Oct 2010 12:09:28 +0530 | |
changeset 24 | 65456528cac2 |
parent 7 | a5fbfefd615f |
permissions | -rw-r--r-- |
7
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
1 |
#include "CLucene/StdHeader.h" |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
2 |
#include "TokenSources.h" |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
3 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
4 |
#include "CLucene/util/VoidList.h" |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
5 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
6 |
CL_NS_DEF2(search,highlight) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
7 |
CL_NS_USE(analysis) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
8 |
CL_NS_USE(index) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
9 |
CL_NS_USE(util) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
10 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
11 |
TokenSources::TokenSources(void) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
12 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
13 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
14 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
15 |
TokenSources::~TokenSources(void) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
16 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
17 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
18 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
19 |
TokenStream* TokenSources::getAnyTokenStream(IndexReader* reader,int32_t docId, TCHAR* field, Analyzer* analyzer) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
20 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
21 |
TokenStream* ts=NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
22 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
23 |
TermFreqVector* tfv=reader->getTermFreqVector(docId,field); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
24 |
if(tfv!=NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
25 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
26 |
// todo: this is actually very dodgy... we try casting |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
27 |
// to TermPositionVector, we take the token stream |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
28 |
// only if the cast works... should have a way of |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
29 |
// knowing what type this is |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
30 |
TermPositionVector* tmp = NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
31 |
try{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
32 |
tmp = dynamic_cast<TermPositionVector *> (tfv); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
33 |
}catch(...){ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
34 |
//ignore |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
35 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
36 |
if ( tmp != NULL ) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
37 |
ts=getTokenStream(tmp); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
38 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
39 |
//No token info stored so fall back to analyzing raw content |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
40 |
if(ts==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
41 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
42 |
ts=getTokenStream(reader,docId,field,analyzer); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
43 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
44 |
return ts; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
45 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
46 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
47 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
48 |
TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
49 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
50 |
//assumes the worst and makes no assumptions about token position sequences. |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
51 |
return getTokenStream(tpv,false); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
52 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
53 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
54 |
TokenStream* TokenSources::getTokenStream(TermPositionVector* tpv, bool tokenPositionsGuaranteedContiguous) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
55 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
56 |
//an object used to iterate across an array of tokens |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
57 |
/*class StoredTokenStream extends TokenStream |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
58 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
59 |
Token tokens[]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
60 |
int32_t currentToken=0; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
61 |
StoredTokenStream(Token tokens[]) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
62 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
63 |
this.tokens=tokens; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
64 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
65 |
public Token next() |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
66 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
67 |
if(currentToken>=tokens.length) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
68 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
69 |
return NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
70 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
71 |
return tokens[currentToken++]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
72 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
73 |
} */ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
74 |
//code to reconstruct the original sequence of Tokens |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
75 |
const TCHAR** terms=tpv->getTerms(); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
76 |
const int32_t* freq= (int32_t *)tpv->getTermFrequencies(); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
77 |
int32_t freqLen = tpv->size(); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
78 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
79 |
size_t totalTokens=0; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
80 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
81 |
for (int32_t t = 0; t < freqLen; t++) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
82 |
totalTokens+=freq[t]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
83 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
84 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
85 |
Token** tokensInOriginalOrder=NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
86 |
CLSetList<Token*,Token::OrderCompare>* unsortedTokens = NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
87 |
for (int32_t t = 0; t < freqLen; t++) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
88 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
89 |
TermVectorOffsetInfo** offsets=(TermVectorOffsetInfo**)tpv->getOffsets(t); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
90 |
if(offsets==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
91 |
return NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
92 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
93 |
int32_t* pos=NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
94 |
int32_t posLen=0; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
95 |
if(tokenPositionsGuaranteedContiguous) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
96 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
97 |
//try get the token position info to speed up assembly of tokens into sorted sequence |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
98 |
pos=(int32_t *)tpv->getTermPositions(t); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
99 |
posLen=1;//todo |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
100 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
101 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
102 |
if ( tokensInOriginalOrder != NULL ) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
103 |
tokensInOriginalOrder = _CL_NEWARRAY(Token*, totalTokens+1); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
104 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
105 |
if(pos==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
106 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
107 |
//tokens NOT stored with positions or not guaranteed contiguous - must add to list and sort later |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
108 |
if(unsortedTokens==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
109 |
unsortedTokens=_CLNEW CLSetList<Token*,Token::OrderCompare>(false); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
110 |
for (int32_t tp=0; offsets[tp]!=NULL; tp++) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
111 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
112 |
unsortedTokens->insert(_CLNEW Token(terms[t], |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
113 |
offsets[tp]->getStartOffset(), |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
114 |
offsets[tp]->getEndOffset())); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
115 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
116 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
117 |
else |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
118 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
119 |
//We have positions stored and a guarantee that the token position information is contiguous |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
120 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
121 |
// This may be fast BUT wont work if Tokenizers used which create >1 token in same position or |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
122 |
// creates jumps in position numbers - this code would fail under those circumstances |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
123 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
124 |
//tokens stored with positions - can use this to index straight into sorted array |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
125 |
for (int32_t tp = 0; tp < posLen; tp++) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
126 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
127 |
if ( tokensInOriginalOrder ) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
128 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
129 |
tokensInOriginalOrder[pos[tp]]=_CLNEW Token(terms[t], |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
130 |
offsets[tp]->getStartOffset(), |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
131 |
offsets[tp]->getEndOffset()); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
132 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
133 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
134 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
135 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
136 |
//If the field has been stored without position data we must perform a sort |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
137 |
if(unsortedTokens!=NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
138 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
139 |
if ( totalTokens<unsortedTokens->size() ){ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
140 |
_CLDELETE_ARRAY(tokensInOriginalOrder); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
141 |
tokensInOriginalOrder = _CL_NEWARRAY(Token*,unsortedTokens->size()+1); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
142 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
143 |
//the list has already sorted our items //todo:check that this is true... |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
144 |
if ( tokensInOriginalOrder ) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
145 |
unsortedTokens->toArray(tokensInOriginalOrder); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
146 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
147 |
return _CLNEW StoredTokenStream(tokensInOriginalOrder,unsortedTokens->size()); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
148 |
}else |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
149 |
return _CLNEW StoredTokenStream(tokensInOriginalOrder,totalTokens); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
150 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
151 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
152 |
TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
153 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
154 |
TermFreqVector* tfv=reader->getTermFreqVector(docId,field); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
155 |
if(tfv==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
156 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
157 |
TCHAR buf[250]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
158 |
_sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
159 |
_CLTHROWT(CL_ERR_IllegalArgument,buf); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
160 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
161 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
162 |
//todo:bad way of doing this... |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
163 |
TermPositionVector* tmp = NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
164 |
try{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
165 |
tmp = dynamic_cast<TermPositionVector *> (tfv); //check to see if tfv is a Tpv |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
166 |
}catch(...){} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
167 |
TokenStream* stream = NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
168 |
if ( tmp != NULL ){ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
169 |
TermPositionVector* tpv = dynamic_cast<TermPositionVector *> (reader->getTermFreqVector(docId,field)); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
170 |
if ( tpv ) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
171 |
stream = getTokenStream(tpv); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
172 |
//return getTokenStream(tpv); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
173 |
}else{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
174 |
TCHAR buf[250]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
175 |
_sntprintf(buf,250,_T("%s in doc #%d does not have any term position data stored"),field,docId); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
176 |
_CLTHROWT(CL_ERR_IllegalArgument,buf); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
177 |
//return NULL; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
178 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
179 |
return stream; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
180 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
181 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
182 |
//convenience method |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
183 |
TokenStream* TokenSources::getTokenStream(IndexReader* reader,int32_t docId, TCHAR* field,Analyzer* analyzer) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
184 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
185 |
CL_NS(document)::Document* doc=reader->document(docId); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
186 |
const TCHAR* contents=doc->get(field); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
187 |
if(contents==NULL) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
188 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
189 |
TCHAR buf[250]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
190 |
_sntprintf(buf,250,_T("Field %s in document #%d is not stored and cannot be analyzed"),field,docId); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
191 |
_CLTHROWT(CL_ERR_IllegalArgument,buf); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
192 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
193 |
return analyzer->tokenStream(field,_CLNEW StringReader(contents)); |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
194 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
195 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
196 |
TokenSources::StoredTokenStream::StoredTokenStream(CL_NS(analysis)::Token** tokens, size_t len) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
197 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
198 |
currentToken = 0; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
199 |
this->tokens=tokens; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
200 |
this->length = len; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
201 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
202 |
bool TokenSources::StoredTokenStream::next(CL_NS(analysis)::Token* token) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
203 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
204 |
if(currentToken>=length) |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
205 |
{ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
206 |
return false; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
207 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
208 |
Token* t = tokens[currentToken++]; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
209 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
210 |
token->set(t->termText(),t->startOffset(),t->endOffset(),t->type());; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
211 |
return true; |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
212 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
213 |
void TokenSources::StoredTokenStream::close(){ |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
214 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
215 |
} |
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
216 |
|
a5fbfefd615f
Revision: 201021
Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
parents:
diff
changeset
|
217 |
CL_NS_END2 |