|
/**
 * Reads four consecutive bytes from the stream and packs them into a single
 * integer, treating the first byte read as the most significant one.
 *
 * @param resource $file open stream, positioned at the first of the four bytes
 * @return int the combined value
 */
function readInt($file)
{
    $value = 0;
    // Shift the bytes read so far up by one position and append the next one.
    for ($n = 0; $n < 4; $n++)
    {
        $value = ($value << 8) | ord(fgetc($file));
    }
    return $value;
}
|
7 |
|
/**
 * Reads characters from the stream up to (and consuming) the first character
 * whose ord() value is zero, and returns the characters read before it.
 *
 * @param resource $file open stream, positioned at the start of the string
 * @return string the characters read, without the terminator
 */
function readString($file)
{
    $text = "";
    for (;;)
    {
        $ch = fgetc($file);
        // A NUL byte (or a failed read) ends the string.
        if (ord($ch) == 0) break;
        $text .= $ch;
    }
    return $text;
}
|
14 |
|
/**
 * Reads the next four characters from the stream and returns them as one
 * string.
 *
 * @param resource $file open stream, positioned at the header
 * @return string the characters read (shorter than four if the stream ends)
 */
function readHeader($file)
{
    $magic = "";
    for ($n = 0; $n < 4; $n++)
    {
        $magic .= fgetc($file);
    }
    return $magic;
}
|
21 |
|
/**
 * Computes the hash-table slot for a word from its first two bytes.
 *
 * Only the first two bytes take part in the hash, so every word sharing a
 * two-character prefix lands in the same slot; this is what makes prefix
 * (substring) search possible.
 *
 * @param string $word the word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is shorter
 *             than two bytes or either of the two bytes is NUL
 */
function computeIndex($word)
{
    if (strlen($word) < 2) return -1;
    // Square-bracket offsets: the curly-brace form ($word{0}) was removed in
    // PHP 8.0 and no longer parses.
    $hi = ord($word[0]); // high char of the index
    if ($hi == 0) return -1;
    $lo = ord($word[1]); // low char of the index
    if ($lo == 0) return -1;
    return $hi * 256 + $lo;
}
|
35 |
|
/**
 * Looks up all indexed words that start with $word and appends one entry per
 * matching word to $statsList.
 *
 * File layout as read by this function:
 *  - a table of 4-byte entries starting at byte 4, indexed by computeIndex();
 *    each entry is the file offset of a word list (0 = no words),
 *  - a word list is a sequence of (NUL-terminated word, 4-byte statistics
 *    offset) pairs, terminated by an empty word,
 *  - a statistics record is a 4-byte document count followed by that many
 *    (4-byte document offset, 4-byte frequency) pairs; the lowest bit of the
 *    frequency flags a high priority document,
 *  - a document record is two NUL-terminated strings: name, then url.
 *
 * Each appended entry has the keys "word" (the search word), "match" (the
 * indexed word), "index" (statistics offset), "full" (true for an exact
 * match) and "docs" (list of arrays with "idx", "freq", "rank", "hi", "name"
 * and "url").
 *
 * @param resource $file       open index file
 * @param string   $word       word to search for (compared byte-wise)
 * @param array    $statsList  result list, extended in place
 * @return array the (extended) $statsList
 */
function search($file,$word,&$statsList)
{
    $index = computeIndex($word);
    if ($index == -1) return $statsList; // no valid hash key for this word

    fseek($file, $index*4+4); // 4 bytes per entry, skip header
    $listOffset = readInt($file);
    if (!$listOffset) return $statsList; // no words for this hash key

    $start   = sizeof($statsList);
    $count   = $start;
    $wordLen = strlen($word);

    // Collect every indexed word that has $word as a prefix. The list ends
    // at the first empty string; comparing against "" (rather than testing
    // truthiness) keeps a word such as "0" from ending the scan early.
    fseek($file, $listOffset);
    for ($w = readString($file); $w !== ""; $w = readString($file))
    {
        $statIdx = readInt($file);
        // Byte-wise prefix comparison. A loose == would compare
        // numeric-looking strings as numbers ("1e1" == "10").
        if (strncmp($w, $word, $wordLen) == 0)
        {
            $statsList[$count++] = array(
                "word"  => $word,
                "match" => $w,
                "index" => $statIdx,
                "full"  => strlen($w) == $wordLen,
                "docs"  => array()
            );
        }
    }

    // Read the per-document statistics of every new match and accumulate
    // the totals needed for ranking.
    $totalHi     = 0;
    $totalFreqHi = 0;
    $totalFreqLo = 0;
    for ($count = $start; $count < sizeof($statsList); $count++)
    {
        // whole word matches have a double weight
        $multiplier = $statsList[$count]["full"] ? 2 : 1;
        fseek($file, $statsList[$count]["index"]);
        $numDocs = readInt($file);
        $docInfo = array();
        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++)
        {
            $idx  = readInt($file);
            $freq = readInt($file);
            $docInfo[$i] = array(
                "idx"  => $idx,
                "freq" => $freq>>1,
                "rank" => 0.0,
                "hi"   => $freq&1
            );
            // NOTE(review): the totals below use the raw value (priority bit
            // included) while "freq" above is shifted; kept as in the
            // original -- confirm whether this is intended.
            if ($freq&1) // word occurs in high priority doc
            {
                $totalHi++;
                $totalFreqHi += $freq*$multiplier;
            }
            else // word occurs in low priority doc
            {
                $totalFreqLo += $freq*$multiplier;
            }
        }
        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++)
        {
            fseek($file, $docInfo[$i]["idx"]);
            $docInfo[$i]["name"] = readString($file);
            $docInfo[$i]["url"]  = readString($file);
        }
        $statsList[$count]["docs"] = $docInfo;
    }

    $totalFreq = ($totalHi+1)*$totalFreqLo + $totalFreqHi;
    // With a zero total there is nothing to normalise against; the ranks
    // keep their initial 0.0 instead of dividing by zero.
    if ($totalFreq != 0)
    {
        for ($count = $start; $count < sizeof($statsList); $count++)
        {
            // whole word matches have a double weight
            $multiplier = $statsList[$count]["full"] ? 2 : 1;
            $numDocs = sizeof($statsList[$count]["docs"]);
            for ($i = 0; $i < $numDocs; $i++)
            {
                // compute frequency rank of the word in each doc
                $freq = $statsList[$count]["docs"][$i]["freq"];
                if ($statsList[$count]["docs"][$i]["hi"])
                {
                    $statsList[$count]["docs"][$i]["rank"] =
                        (float)($freq*$multiplier+$totalFreqLo)/$totalFreq;
                }
                else
                {
                    $statsList[$count]["docs"][$i]["rank"] =
                        (float)($freq*$multiplier)/$totalFreq;
                }
            }
        }
    }
    return $statsList;
}
|
133 |
|
/**
 * Merges the per-word search results into one entry per document, keyed by
 * the document url.
 *
 * For every document of every word the document's rank is added to the
 * entry's "rank", and a ("word", "match", "freq") record is appended to the
 * entry's "words" list. A new entry is created, with "url", "name" and
 * "rank", the first time a url is seen.
 *
 * @param array $results list of word entries, each with "word", "match" and
 *                       "docs" (list of arrays with "url", "name", "rank",
 *                       "freq")
 * @param array $docs    url => document entry map, updated in place
 * @return array the updated $docs
 */
function combine_results($results,&$docs)
{
    foreach ($results as $wordInfo)
    {
        foreach ($wordInfo["docs"] as $di)
        {
            $key  = $di["url"];
            $rank = $di["rank"];
            // Direct key lookup: constant time, and unlike a loose in_array()
            // over array_keys() it cannot mistake "1e3" for "1000".
            if (isset($docs[$key]))
            {
                $docs[$key]["rank"] += $rank;
            }
            else
            {
                $docs[$key] = array(
                    "url"  => $key,
                    "name" => $di["name"],
                    "rank" => $rank
                );
            }
            $docs[$key]["words"][] = array(
                "word"  => $wordInfo["word"],
                "match" => $wordInfo["match"],
                "freq"  => $di["freq"]
            );
        }
    }
    return $docs;
}
|
163 |
|
/**
 * Returns the documents that contain every required word and none of the
 * forbidden words.
 *
 * A document "contains" a word when one of the records in its "words" list
 * has that word as its "word" value. Words are compared as strings, so
 * numeric-looking words such as "10" and "1e1" are not confused.
 *
 * @param array $docs           key => document entry map; each entry has a
 *                              "words" list of arrays with a "word" key
 * @param array $requiredWords  words every returned document must contain
 * @param array $forbiddenWords words no returned document may contain
 * @return array the entries of $docs that pass both tests, keys preserved
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
    $filteredDocs = array();
    // foreach instead of each(): each() was removed in PHP 8.0.
    foreach ($docs as $key => $doc)
    {
        $words = $doc["words"];
        $copy  = true; // copy entry by default

        foreach ($requiredWords as $reqWord)
        {
            $found = false;
            foreach ($words as $wordInfo)
            {
                if ((string)$wordInfo["word"] === (string)$reqWord)
                {
                    $found = true;
                    break;
                }
            }
            if (!$found)
            {
                $copy = false; // document lacks at least one required word
                break;
            }
        }

        if ($copy)
        {
            foreach ($words as $wordInfo)
            {
                foreach ($forbiddenWords as $badWord)
                {
                    if ((string)$wordInfo["word"] === (string)$badWord)
                    {
                        $copy = false; // document contains a forbidden word
                        break 2;
                    }
                }
            }
        }

        if ($copy) $filteredDocs[$key] = $doc;
    }
    return $filteredDocs;
}
|
203 |
|
/**
 * Comparison callback that orders entries by descending "rank".
 *
 * @param array $a first entry, with a "rank" key
 * @param array $b second entry, with a "rank" key
 * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 when equal
 */
function compare_rank($a,$b)
{
    $left  = $a["rank"];
    $right = $b["rank"];
    if ($left == $right) return 0;
    if ($left > $right) return -1;
    return 1;
}
|
212 |
|
/**
 * Produces a copy of $docs ordered with compare_rank() and re-indexed from
 * zero (usort() discards the original keys).
 *
 * @param array $docs   document entries to order; not modified
 * @param array $sorted receives the ordered list
 * @return array the same ordered list that was stored in $sorted
 */
function sort_results($docs,&$sorted)
{
    $ordered = $docs;
    usort($ordered, "compare_rank");
    $sorted = $ordered;
    return $sorted;
}
|
219 |
|
/**
 * Prints the search results as an HTML table.
 *
 * The table starts with a heading row and a row holding the match-count
 * text. Each document then gets two rows: its number with a link to the
 * document, and the list of matched words with their frequencies. The part
 * of each match that equals the search word is printed in bold.
 *
 * The heading and label texts come from search_results(), matches_text()
 * and report_matches(), which are defined outside this block.
 *
 * @param array $docs list of document entries, each with "url", "name" and
 *                    "words" (list of arrays with "word", "match", "freq")
 * @return void
 */
function report_results(&$docs)
{
    echo "<table cellspacing=\"2\">\n";
    echo " <tr>\n";
    echo " <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
    echo " </tr>\n";
    $numDocs = sizeof($docs);
    if ($numDocs==0)
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
        echo " </tr>\n";
    }
    else
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text($numDocs);
        echo "\n";
        echo " </td>\n";
        echo " </tr>\n";
        $num=1;
        foreach ($docs as $doc)
        {
            // first row: result number and link to the document
            echo " <tr>\n";
            echo " <td align=\"right\">$num.</td>";
            echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
            // Close the first row before opening the second one; it was
            // previously left open, which produced malformed table markup.
            echo " </tr>\n";
            // second row: the matched words and their frequencies
            echo " <tr>\n";
            echo " <td></td><td class=\"tiny\">".report_matches()." ";
            foreach ($doc["words"] as $wordInfo)
            {
                $word = $wordInfo["word"];
                // remainder of the indexed word after the searched prefix
                $matchRight = substr($wordInfo["match"],strlen($word));
                echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
            }
            echo " </td>\n";
            echo " </tr>\n";
            $num++;
        }
    }
    echo "</table>\n";
}
|
261 |
|
/**
 * Script entry point: opens the search index, runs the query taken from the
 * "query" GET parameter and prints the ranked results.
 *
 * The query is split on spaces. A word prefixed with "+" is required, a word
 * prefixed with "-" is forbidden; documents are filtered accordingly after
 * all words have been searched. Words are lowercased before they are
 * searched and before they are recorded as required/forbidden, so both
 * sides of the later comparison use the same case.
 *
 * Terminates the script with an error message when the PHP version is too
 * old, the index file cannot be opened, or its header is not "DOXS".
 *
 * @return void
 */
function main()
{
    // version_compare() orders multi-digit components correctly; a plain
    // string comparison would rank "10.0.0" below "4.1.0".
    if (version_compare(phpversion(), '4.1.0', '<'))
    {
        die("Error: PHP version 4.1.0 or above required!");
    }
    if (!($file=fopen("search/search.idx","rb")))
    {
        die("Error: Search index file could NOT be opened!");
    }
    if (readHeader($file)!="DOXS")
    {
        die("Error: Header of index file is invalid!");
    }

    // Only accept a plain string; "query[]=x" would deliver an array.
    $query = "";
    if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
    {
        $query = $_GET["query"];
    }
    end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
    echo " \n<div class=\"searchresults\">\n";

    $results        = array();
    $requiredWords  = array();
    $forbiddenWords = array();
    $foundWords     = array(); // set of words already searched (word => true)

    // strtok() returns false when the tokens are exhausted; testing for
    // false explicitly keeps a token such as "0" from ending the loop.
    for ($token = strtok($query, " "); $token !== false; $token = strtok(" "))
    {
        $word = strtolower($token);
        // Square-bracket/substr access: the curly-brace offset form
        // ($word{0}) was removed in PHP 8.0.
        if (substr($word, 0, 1) === '+')
        {
            $word = (string)substr($word, 1);
            if ($word !== "") $requiredWords[] = $word;
        }
        if (substr($word, 0, 1) === '-')
        {
            $word = (string)substr($word, 1);
            if ($word !== "") $forbiddenWords[] = $word;
        }
        // A bare "+" or "-" leaves nothing to search for.
        if ($word === "") continue;
        if (!isset($foundWords[$word]))
        {
            $foundWords[$word] = true;
            search($file, $word, $results);
        }
    }

    $docs = array();
    combine_results($results,$docs);
    // filter out documents with forbidden word or that do not contain
    // required words
    $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
    // sort the results based on rank
    $sorted = array();
    sort_results($filteredDocs,$sorted);
    // report results to the user
    report_results($sorted);
    echo "</div>\n";
    fclose($file);
}

main();
|
314 |