|
1 /****************************************************************************** |
|
2 * |
|
3 * |
|
4 * |
|
5 * Copyright (C) 1997-2008 by Dimitri van Heesch. |
|
6 * |
|
7 * Permission to use, copy, modify, and distribute this software and its |
|
8 * documentation under the terms of the GNU General Public License is hereby |
|
9 * granted. No representations are made about the suitability of this software |
|
10 * for any purpose. It is provided "as is" without express or implied warranty. |
|
11 * See the GNU General Public License for more details. |
|
12 * |
|
13 * Documents produced by Doxygen are derivative works derived from the |
|
14 * input used in their production; they are not affected by this license. |
|
15 * |
|
16 */ |
|
17 |
|
18 #include "qtbc.h" |
|
19 #include "searchindex.h" |
|
20 #include "config.h" |
|
21 #include "util.h" |
|
22 #include <qfile.h> |
|
23 #include <ctype.h> |
|
24 #include <qregexp.h> |
|
25 |
|
26 |
|
27 // file format: (all multi-byte values are stored in big endian format) |
|
28 // 4 byte header |
|
29 // 256*256*4 byte index (4 bytes) |
|
30 // for each index entry: a zero terminated list of words |
|
31 // for each word: a \0 terminated string + 4 byte offset to the stats info |
|
32 // padding bytes to align at 4 byte boundary |
|
33 // for each word: the number of urls (4 bytes) |
|
34 // + for each url containing the word 8 bytes statistics |
|
35 // (4 bytes index to url string + 4 bytes frequency counter) |
|
36 // for each url: a \0 terminated string |
|
37 |
|
38 const int numIndexEntries = 256*256; |
|
39 |
|
40 //-------------------------------------------------------------------- |
|
41 |
|
42 IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17) |
|
43 { |
|
44 m_urls.setAutoDelete(TRUE); |
|
45 //printf("IndexWord::IndexWord(%s)\n",word); |
|
46 } |
|
47 |
|
48 void IndexWord::addUrlIndex(int idx,bool hiPriority) |
|
49 { |
|
50 //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority); |
|
51 URLInfo *ui = m_urls.find(idx); |
|
52 if (ui==0) |
|
53 { |
|
54 //printf("URLInfo::URLInfo(%d)\n",idx); |
|
55 ui=new URLInfo(idx,0); |
|
56 m_urls.insert(idx,ui); |
|
57 } |
|
58 ui->freq+=2; |
|
59 if (hiPriority) ui->freq|=1; // mark as high priority document |
|
60 } |
|
61 |
|
62 //-------------------------------------------------------------------- |
|
63 |
|
64 SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1) |
|
65 { |
|
66 int i; |
|
67 m_words.setAutoDelete(TRUE); |
|
68 m_urls.setAutoDelete(TRUE); |
|
69 m_index.setAutoDelete(TRUE); |
|
70 for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>); |
|
71 } |
|
72 |
|
73 void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor) |
|
74 { |
|
75 if (name==0 || baseName==0) return; |
|
76 //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor); |
|
77 QCString url=baseName+Config_getString("HTML_FILE_EXTENSION"); |
|
78 if (anchor) url+=(QCString)"#"+anchor; |
|
79 m_urlIndex++; |
|
80 m_urls.insert(m_urlIndex,new URL(name,url)); |
|
81 } |
|
82 |
|
83 static int charsToIndex(const char *word) |
|
84 { |
|
85 if (word==0) return -1; |
|
86 |
|
87 // Fast string hashing algorithm |
|
88 //register ushort h=0; |
|
89 //const char *k = word; |
|
90 //ushort mask=0xfc00; |
|
91 //while ( *k ) |
|
92 //{ |
|
93 // h = (h&mask)^(h<<6)^(*k++); |
|
94 //} |
|
95 //return h; |
|
96 |
|
97 // Simple hashing that allows for substring searching |
|
98 uint c1=word[0]; |
|
99 if (c1==0) return -1; |
|
100 uint c2=word[1]; |
|
101 if (c2==0) return -1; |
|
102 return c1*256+c2; |
|
103 } |
|
104 |
|
105 void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse) |
|
106 { |
|
107 static QRegExp nextPart("[_a-z:][A-Z]"); |
|
108 //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority); |
|
109 QString wStr(word); |
|
110 if (wStr.isEmpty()) return; |
|
111 wStr=wStr.lower(); |
|
112 IndexWord *w = m_words[wStr]; |
|
113 if (w==0) |
|
114 { |
|
115 int idx=charsToIndex(wStr); |
|
116 if (idx<0) return; |
|
117 w = new IndexWord(wStr); |
|
118 //fprintf(stderr,"addWord(%s) at index %d\n",word,idx); |
|
119 m_index[idx]->append(w); |
|
120 m_words.insert(wStr,w); |
|
121 } |
|
122 w->addUrlIndex(m_urlIndex,hiPriority); |
|
123 int i; |
|
124 bool found=FALSE; |
|
125 if (!recurse) // the first time we check if we can strip the prefix |
|
126 { |
|
127 i=getPrefixIndex(word); |
|
128 if (i>0) |
|
129 { |
|
130 addWord(word+i,hiPriority,TRUE); |
|
131 found=TRUE; |
|
132 } |
|
133 } |
|
134 if (!found) // no prefix stripped |
|
135 { |
|
136 if ((i=nextPart.match(word))>=1) |
|
137 { |
|
138 addWord(word+i+1,hiPriority,TRUE); |
|
139 } |
|
140 } |
|
141 } |
|
142 |
|
143 |
|
144 static void writeInt(QFile &f,int index) |
|
145 { |
|
146 f.putch(((uint)index)>>24); |
|
147 f.putch((((uint)index)>>16)&0xff); |
|
148 f.putch((((uint)index)>>8)&0xff); |
|
149 f.putch(((uint)index)&0xff); |
|
150 } |
|
151 |
|
152 static void writeString(QFile &f,const char *s) |
|
153 { |
|
154 const char *p = s; |
|
155 while (*p) f.putch(*p++); |
|
156 f.putch(0); |
|
157 } |
|
158 |
|
159 void SearchIndex::write(const char *fileName) |
|
160 { |
|
161 int i; |
|
162 int size=4; // for the header |
|
163 size+=4*numIndexEntries; // for the index |
|
164 int wordsOffset = size; |
|
165 // first pass: compute the size of the wordlist |
|
166 for (i=0;i<numIndexEntries;i++) |
|
167 { |
|
168 QList<IndexWord> *wlist = m_index[i]; |
|
169 if (!wlist->isEmpty()) |
|
170 { |
|
171 QListIterator<IndexWord> iwi(*wlist); |
|
172 IndexWord *iw; |
|
173 for (iwi.toFirst();(iw=iwi.current());++iwi) |
|
174 { |
|
175 int ws = iw->word().length()+1; |
|
176 size+=ws+4; // word + url info list offset |
|
177 } |
|
178 size+=1; // zero list terminator |
|
179 } |
|
180 } |
|
181 |
|
182 // second pass: compute the offsets in the index |
|
183 int indexOffsets[numIndexEntries]; |
|
184 int offset=wordsOffset; |
|
185 for (i=0;i<numIndexEntries;i++) |
|
186 { |
|
187 QList<IndexWord> *wlist = m_index[i]; |
|
188 if (!wlist->isEmpty()) |
|
189 { |
|
190 indexOffsets[i]=offset; |
|
191 QListIterator<IndexWord> iwi(*wlist); |
|
192 IndexWord *iw; |
|
193 for (iwi.toFirst();(iw=iwi.current());++iwi) |
|
194 { |
|
195 offset+= iw->word().length()+1; |
|
196 offset+=4; // word + offset to url info array |
|
197 } |
|
198 offset+=1; // zero list terminator |
|
199 } |
|
200 else |
|
201 { |
|
202 indexOffsets[i]=0; |
|
203 } |
|
204 } |
|
205 int padding = size; |
|
206 size = (size+3)&~3; // round up to 4 byte boundary |
|
207 padding = size - padding; |
|
208 |
|
209 //int statsOffset = size; |
|
210 QDictIterator<IndexWord> wdi(m_words); |
|
211 //IndexWord *iw; |
|
212 int *wordStatOffsets = new int[m_words.count()]; |
|
213 |
|
214 int count=0; |
|
215 |
|
216 // third pass: compute offset to stats info for each word |
|
217 for (i=0;i<numIndexEntries;i++) |
|
218 { |
|
219 QList<IndexWord> *wlist = m_index[i]; |
|
220 if (!wlist->isEmpty()) |
|
221 { |
|
222 QListIterator<IndexWord> iwi(*wlist); |
|
223 IndexWord *iw; |
|
224 for (iwi.toFirst();(iw=iwi.current());++iwi) |
|
225 { |
|
226 //printf("wordStatOffsets[%d]=%d\n",count,size); |
|
227 wordStatOffsets[count++] = size; |
|
228 size+=4+iw->urls().count()*8; // count + (url_index,freq) per url |
|
229 } |
|
230 } |
|
231 } |
|
232 int *urlOffsets = new int[m_urls.count()]; |
|
233 //int urlsOffset = size; |
|
234 QIntDictIterator<URL> udi(m_urls); |
|
235 URL *url; |
|
236 for (udi.toFirst();(url=udi.current());++udi) |
|
237 { |
|
238 urlOffsets[udi.currentKey()]=size; |
|
239 size+=url->name.length()+1+ |
|
240 url->url.length()+1; |
|
241 } |
|
242 //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset); |
|
243 QFile f(fileName); |
|
244 if (f.open(IO_WriteOnly)) |
|
245 { |
|
246 // write header |
|
247 f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S'); |
|
248 // write index |
|
249 for (i=0;i<numIndexEntries;i++) |
|
250 { |
|
251 writeInt(f,indexOffsets[i]); |
|
252 } |
|
253 // write word lists |
|
254 count=0; |
|
255 for (i=0;i<numIndexEntries;i++) |
|
256 { |
|
257 QList<IndexWord> *wlist = m_index[i]; |
|
258 if (!wlist->isEmpty()) |
|
259 { |
|
260 QListIterator<IndexWord> iwi(*wlist); |
|
261 IndexWord *iw; |
|
262 for (iwi.toFirst();(iw=iwi.current());++iwi) |
|
263 { |
|
264 writeString(f,iw->word()); |
|
265 writeInt(f,wordStatOffsets[count++]); |
|
266 } |
|
267 f.putch(0); |
|
268 } |
|
269 } |
|
270 // write extra padding bytes |
|
271 for (i=0;i<padding;i++) f.putch(0); |
|
272 // write word statistics |
|
273 for (i=0;i<numIndexEntries;i++) |
|
274 { |
|
275 QList<IndexWord> *wlist = m_index[i]; |
|
276 if (!wlist->isEmpty()) |
|
277 { |
|
278 QListIterator<IndexWord> iwi(*wlist); |
|
279 IndexWord *iw; |
|
280 for (iwi.toFirst();(iw=iwi.current());++iwi) |
|
281 { |
|
282 int numUrls = iw->urls().count(); |
|
283 writeInt(f,numUrls); |
|
284 QIntDictIterator<URLInfo> uli(iw->urls()); |
|
285 URLInfo *ui; |
|
286 for (uli.toFirst();(ui=uli.current());++uli) |
|
287 { |
|
288 writeInt(f,urlOffsets[ui->urlIdx]); |
|
289 writeInt(f,ui->freq); |
|
290 } |
|
291 } |
|
292 } |
|
293 } |
|
294 // write urls |
|
295 QIntDictIterator<URL> udi(m_urls); |
|
296 URL *url; |
|
297 for (udi.toFirst();(url=udi.current());++udi) |
|
298 { |
|
299 writeString(f,url->name); |
|
300 writeString(f,url->url); |
|
301 } |
|
302 } |
|
303 |
|
304 delete[] urlOffsets; |
|
305 delete[] wordStatOffsets; |
|
306 } |
|
307 |