|
/**
 * Reads the next four bytes from $file and combines them into a single
 * integer, with the first byte read ending up in the most significant
 * position.
 *
 * @param resource $file open, readable stream positioned at the value
 * @return int the combined four-byte value
 */
function readInt($file)
{
    $value = 0;
    for ($n = 0; $n < 4; $n++)
    {
        // move what was read so far one byte up and append the next byte
        $value = ($value << 8) | ord(fgetc($file));
    }
    return $value;
}
|
7 "\n" |
|
/**
 * Reads bytes from $file up to, and consuming, the next NUL byte and returns
 * the bytes that preceded it. Reading also stops when fgetc() yields nothing
 * more (its result then has an ordinal value of 0 as well).
 *
 * @param resource $file open, readable stream
 * @return string the bytes read, without the terminator
 */
function readString($file)
{
    $text = "";
    for (;;)
    {
        $ch = fgetc($file);
        if (ord($ch) == 0)
        {
            break; // terminator reached
        }
        $text .= $ch;
    }
    return $text;
}
|
14 "\n" |
|
/**
 * Reads the next four bytes from $file and returns them as one string.
 * main() compares the result against the index file signature.
 *
 * @param resource $file open, readable stream
 * @return string the four bytes read
 */
function readHeader($file)
{
    $magic = "";
    for ($n = 0; $n < 4; $n++)
    {
        $magic .= fgetc($file);
    }
    return $magic;
}
|
21 "\n" |
|
/**
 * Computes the hash-table slot for a search word from its first two bytes.
 *
 * Because only the leading two bytes are used, every word sharing that
 * prefix lands in the same slot, which is what lets search() match on
 * word prefixes.
 *
 * @param string $word the word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is shorter
 *             than two bytes or one of its first two bytes is NUL
 */
function computeIndex($word)
{
    if (strlen($word) < 2) return -1;
    // high char of the index ([] offsets: the {} form is a parse error in PHP 8)
    $hi = ord($word[0]);
    if ($hi == 0) return -1;
    // low char of the index
    $lo = ord($word[1]);
    if ($lo == 0) return -1;
    return $hi * 256 + $lo;
}
|
35 "\n" |
|
/**
 * Looks up $word in the opened index file and appends one entry to
 * $statsList for every indexed word that starts with $word.
 *
 * Each appended entry is an array with the keys
 *   "word"  => the search word as passed in,
 *   "match" => the indexed word that matched,
 *   "index" => file offset of the statistics record of the matched word,
 *   "full"  => true when the match has the same length as the search word,
 *   "docs"  => list of arrays with "idx", "freq", "rank", "hi", "name", "url".
 *
 * File layout as read by this function: a table of 4-byte offsets follows a
 * 4-byte header; each offset leads to a list of (NUL-terminated word,
 * 4-byte statistics offset) pairs ended by an empty word; a statistics
 * record is a document count followed by (document offset, frequency) pairs;
 * a document offset leads to two NUL-terminated strings (name, url).
 *
 * @param resource $file       open index file
 * @param string   $word       search word; the caller (main) passes it lowercased
 * @param array    $statsList  result list, extended in place
 * @return array the (extended) $statsList
 */
function search($file,$word,&$statsList)
{
    $index = computeIndex($word);
    if ($index == -1)
    {
        return $statsList; // word cannot be hashed
    }
    fseek($file, $index * 4 + 4); // 4 bytes per entry, skip header
    $listOffset = readInt($file);
    if (!$listOffset)
    {
        return $statsList; // no words stored under this hash key
    }

    $start = sizeof($statsList);
    $count = $start;
    $wordLen = strlen($word);
    fseek($file, $listOffset);
    // Compare against "" explicitly: a bare truthiness test would also stop
    // on the string "0".
    for ($w = readString($file); $w !== ""; $w = readString($file))
    {
        $statIdx = readInt($file);
        // Byte-wise prefix test. A loose == between strings compares
        // numerically when both look like numbers ("000" == "0e1").
        if (strncmp($w, $word, $wordLen) == 0)
        {
            $statsList[$count++] = array(
                "word"  => $word,
                "match" => $w,
                "index" => $statIdx,
                "full"  => strlen($w) == $wordLen,
                "docs"  => array()
            );
        }
    }

    $totalHi = 0;
    $totalFreqHi = 0;
    $totalFreqLo = 0;
    $end = sizeof($statsList);
    for ($count = $start; $count < $end; $count++)
    {
        // whole word matches have a double weight
        $multiplier = $statsList[$count]["full"] ? 2 : 1;
        fseek($file, $statsList[$count]["index"]);
        $numDocs = readInt($file);
        $docInfo = array();
        // read docs info + occurrence frequency of the word
        for ($i = 0; $i < $numDocs; $i++)
        {
            $idx = readInt($file);
            $freq = readInt($file);
            // lowest bit of the stored value is the priority flag, the
            // remaining bits are the frequency
            $docInfo[$i] = array(
                "idx"  => $idx,
                "freq" => $freq >> 1,
                "rank" => 0.0,
                "hi"   => $freq & 1
            );
            // the totals are built from the stored value as read
            if ($freq & 1) // word occurs in high priority doc
            {
                $totalHi++;
                $totalFreqHi += $freq * $multiplier;
            }
            else // word occurs in low priority doc
            {
                $totalFreqLo += $freq * $multiplier;
            }
        }
        // read name and url info for the doc
        for ($i = 0; $i < $numDocs; $i++)
        {
            fseek($file, $docInfo[$i]["idx"]);
            $docInfo[$i]["name"] = readString($file);
            $docInfo[$i]["url"] = readString($file);
        }
        $statsList[$count]["docs"] = $docInfo;
    }

    $totalFreq = ($totalHi + 1) * $totalFreqLo + $totalFreqHi;
    if ($totalFreq == 0)
    {
        // Nothing to normalise against; ranks keep their initial 0.0
        // instead of triggering a division by zero.
        return $statsList;
    }
    for ($count = $start; $count < $end; $count++)
    {
        // whole word matches have a double weight
        $multiplier = $statsList[$count]["full"] ? 2 : 1;
        $numDocs = sizeof($statsList[$count]["docs"]);
        for ($i = 0; $i < $numDocs; $i++)
        {
            // compute frequency rank of the word in each doc
            $weight = $statsList[$count]["docs"][$i]["freq"] * $multiplier;
            if ($statsList[$count]["docs"][$i]["hi"])
            {
                // high priority docs are lifted above all low priority ones
                $weight += $totalFreqLo;
            }
            $statsList[$count]["docs"][$i]["rank"] = (float)($weight) / $totalFreq;
        }
    }
    return $statsList;
}
|
133 "\n" |
|
/**
 * Merges per-word search results into one entry per document, keyed by the
 * document URL.
 *
 * For every document of every word entry the document's rank is added to
 * the entry in $docs (created on first sight with "url", "name", "rank"),
 * and a record with "word", "match" and "freq" is appended to its "words"
 * list.
 *
 * @param array $results  list of word entries as produced by search()
 * @param array $docs     map url => document entry, extended in place
 * @return array the (extended) $docs
 */
function combine_results($results,&$docs)
{
    foreach ($results as $wordInfo)
    {
        foreach ($wordInfo["docs"] as $di)
        {
            $key = $di["url"];
            $rank = $di["rank"];
            // Direct key lookup: constant time and exact, whereas scanning
            // array_keys() with in_array() is linear and compares loosely
            // (e.g. "10" would be taken for "1e1").
            if (isset($docs[$key]))
            {
                $docs[$key]["rank"] += $rank;
            }
            else
            {
                $docs[$key] = array(
                    "url"  => $key,
                    "name" => $di["name"],
                    "rank" => $rank
                );
            }
            $docs[$key]["words"][] = array(
                "word"  => $wordInfo["word"],
                "match" => $wordInfo["match"],
                "freq"  => $di["freq"]
            );
        }
    }
    return $docs;
}
|
163 "\n" |
|
/**
 * Returns the documents from $docs that contain every required word and no
 * forbidden word.
 *
 * A document "contains" a word when one of the records in its "words" list
 * has that word as its "word" value (byte-exact comparison). Keys of $docs
 * are preserved in the result.
 *
 * @param array $docs            map of document entries as built by combine_results()
 * @param array $requiredWords   words that must all be present (only read here)
 * @param array $forbiddenWords  words of which none may be present (only read here)
 * @return array the surviving entries of $docs
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
    $filteredDocs = array();
    // foreach instead of each(): each() no longer exists in PHP 8
    foreach ($docs as $key => $doc)
    {
        $words = isset($doc["words"]) ? $doc["words"] : array();
        $copy = 1; // copy entry by default

        foreach ($requiredWords as $reqWord)
        {
            $found = 0;
            foreach ($words as $wordInfo)
            {
                if (strcmp($wordInfo["word"], $reqWord) == 0)
                {
                    $found = 1;
                    break;
                }
            }
            if (!$found)
            {
                $copy = 0; // document lacks at least one required word
                break;
            }
        }

        if ($copy)
        {
            foreach ($words as $wordInfo)
            {
                foreach ($forbiddenWords as $badWord)
                {
                    if (strcmp($wordInfo["word"], $badWord) == 0)
                    {
                        $copy = 0; // document contains a forbidden word
                        break 2;
                    }
                }
            }
        }

        if ($copy) $filteredDocs[$key] = $doc;
    }
    return $filteredDocs;
}
|
203 "\n" |
|
/**
 * usort() callback ordering document entries by descending "rank".
 *
 * @param array $a entry with a "rank" value
 * @param array $b entry with a "rank" value
 * @return int -1 when $a ranks higher, 1 when it ranks lower, 0 on a tie
 */
function compare_rank($a,$b)
{
    $left = $a["rank"];
    $right = $b["rank"];
    if ($left > $right)
    {
        return -1; // higher rank sorts first
    }
    if ($left == $right)
    {
        return 0;
    }
    return 1;
}
|
212 "\n" |
|
/**
 * Produces a list of the entries of $docs ordered with compare_rank().
 * The original keys are dropped (usort() renumbers the list).
 *
 * @param array $docs    document entries to order
 * @param array $sorted  receives the ordered list
 * @return array the same ordered list
 */
function sort_results($docs,&$sorted)
{
    $ranked = $docs;
    usort($ranked, "compare_rank");
    $sorted = $ranked;
    return $sorted;
}
|
219 "\n" |
|
/**
 * Prints the result list as an HTML table: a heading row, a row with the
 * match count text, and for every document one row with its number and
 * link followed by one row listing the matched words with their
 * frequencies.
 *
 * The heading and label texts come from search_results(), matches_text()
 * and report_matches(), which are defined outside this section.
 *
 * @param array $docs ordered list of document entries ("url", "name", "words")
 * @return void
 */
function report_results(&$docs)
{
    echo "<table cellspacing=\"2\">\n";
    echo " <tr>\n";
    echo " <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
    echo " </tr>\n";
    $numDocs = sizeof($docs);
    if ($numDocs == 0)
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
        echo " </tr>\n";
    }
    else
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text($numDocs);
        echo "\n";
        echo " </td>\n";
        echo " </tr>\n";
        $num = 1;
        foreach ($docs as $doc)
        {
            echo " <tr>\n";
            echo " <td align=\"right\">$num.</td>";
            echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
            // close the link row before opening the row with the matches
            echo " </tr>\n";
            echo " <tr>\n";
            echo " <td></td><td class=\"tiny\">".report_matches()." ";
            foreach ($doc["words"] as $wordInfo)
            {
                $word = $wordInfo["word"];
                // part of the matched word that follows the search word
                $matchRight = substr($wordInfo["match"], strlen($word));
                // the search word originates from the request, so escape it
                echo "<b>".htmlspecialchars($word)."</b>$matchRight(".$wordInfo["freq"].") ";
            }
            echo " </td>\n";
            echo " </tr>\n";
            $num++;
        }
    }
    echo "</table>\n";
}
|
261 "\n" |
|
/**
 * Entry point of the search page: opens and validates the index file, reads
 * the "query" request parameter, searches every word of it, then combines,
 * filters, sorts and prints the results.
 *
 * Query syntax: words are separated by spaces; a leading '+' marks a word as
 * required, a leading '-' marks it as forbidden. Words are lowercased before
 * they are searched and before they are recorded as required/forbidden, so
 * that the filter compares like with like.
 *
 * Terminates the script with an error message when the PHP version is too
 * old, the index file cannot be opened, or its header is not "DOXS".
 *
 * @return void
 */
function main()
{
    // version_compare() orders version numbers numerically; a plain string
    // comparison would rank e.g. "10.0.0" below "4.1.0".
    if (version_compare(phpversion(), '4.1.0') < 0)
    {
        die("Error: PHP version 4.1.0 or above required!");
    }
    if (!($file = fopen("search/search.idx", "rb")))
    {
        die("Error: Search index file could NOT be opened!");
    }
    if (readHeader($file) != "DOXS")
    {
        die("Error: Header of index file is invalid!");
    }
    $query = "";
    // ignore non-string values such as query[]=x
    if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
    {
        $query = $_GET["query"];
    }
    end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
    echo " \n<div class=\"searchresults\">\n";
    $results = array();
    $requiredWords = array();
    $forbiddenWords = array();
    $foundWords = array();
    // strtok() returns false when the tokens are exhausted; testing for that
    // explicitly keeps a token like "0" from ending the loop early.
    for ($word = strtok($query, " "); $word !== false; $word = strtok(" "))
    {
        $word = strtolower($word);
        if ($word === "") continue;
        $prefix = $word[0];
        if ($prefix == '+' || $prefix == '-')
        {
            $word = (string)substr($word, 1);
            if ($word === "") continue; // a lone '+' or '-' carries no word
            if ($prefix == '+')
            {
                $requiredWords[] = $word;
            }
            else
            {
                $forbiddenWords[] = $word;
            }
        }
        if (!in_array($word, $foundWords, true))
        {
            $foundWords[] = $word;
            search($file, $word, $results);
        }
    }
    $docs = array();
    combine_results($results, $docs);
    // filter out documents with forbidden word or that do not contain
    // required words
    $filteredDocs = filter_results($docs, $requiredWords, $forbiddenWords);
    // sort the results based on rank
    $sorted = array();
    sort_results($filteredDocs, $sorted);
    // report results to the user
    report_results($sorted);
    echo "</div>\n";
    fclose($file);
}
|
312 "\n" |
|
// Script entry point: handle the current search request.
main();
|
314 "\n" |