Orb/Doxygen/src/search_php.h
changeset 0 42188c7ea2d9
equal deleted inserted replaced
-1:000000000000 0:42188c7ea2d9
       
     1 "function readInt($file)\n"
       /* Reads the next four bytes of $file and packs them into one integer; the byte read first ends up most significant. */
     2 "{\n"
       
     3 "  $value = 0;\n"
       /* Each pass shifts the bytes gathered so far up by eight bits and ors in the next one. */
     4 "  for ($n=0;$n<4;$n++) $value = ($value<<8)|ord(fgetc($file));\n"
       
     5 "  return $value;\n"
       
     6 "}\n"
       
     7 "\n"
       
     8 "function readString($file)\n"
       /* Collects characters from $file until one whose ord() is 0 is read; that terminator is consumed but not part of the returned string. */
     9 "{\n"
       
    10 "  $text=\"\";\n"
       
    11 "  for ($ch=fgetc($file); ord($ch)!=0; $ch=fgetc($file)) $text.=$ch;\n"
       
    12 "  return $text;\n"
       
    13 "}\n"
       
    14 "\n"
       
    15 "function readHeader($file)\n"
       /* Returns the next four characters of $file joined into one string; main() compares the result with the DOXS tag. */
    16 "{\n"
       
    17 "  $magic = \"\";\n"
       
    18 "  for ($n=0;$n<4;$n++) $magic.=fgetc($file);\n"
       
    19 "  return $magic;\n"
       
    20 "}\n"
       
    21 "\n"
       
    22 "function computeIndex($word)\n"
       /* Maps the first two characters of $word to the slot number hi*256+lo. Returns -1 when $word is shorter than two characters or either character is NUL. */
    23 "{\n"
       
    24 "  // Simple hashing that allows for substring search\n"
       
    25 "  if (strlen($word)<2) return -1;\n"
       
    26 "  // high char of the index\n"
       /* Square brackets replace the curly-brace string offset, which PHP 7.4 deprecated and PHP 8.0 removed. */
    27 "  $hi = ord($word[0]);\n"
       
    28 "  if ($hi==0) return -1;\n"
       
    29 "  // low char of the index\n"
       
    30 "  $lo = ord($word[1]);\n"
       
    31 "  if ($lo==0) return -1;\n"
       
    32 "  // return index\n"
       
    33 "  return $hi*256+$lo;\n"
       
    34 "}\n"
       
    35 "\n"
       
    36 "function search($file,$word,&$statsList)\n"
       /* Looks up $word in the open index file $file. One entry per index word that starts with $word is appended to $statsList (passed by reference), each with its document list and ranks; returns $statsList. */
    37 "{\n"
       
    38 "  $index = computeIndex($word);\n"
       
    39 "  if ($index!=-1) // found a valid index\n"
       
    40 "  {\n"
       /* The slot for this hash is a 4-byte value located after the 4-byte header; readInt() yields the file offset of the word list, or 0 when the slot is empty. */
    41 "    fseek($file,$index*4+4); // 4 bytes per entry, skip header\n"
       
    42 "    $index = readInt($file);\n"
       
    43 "    if ($index) // found words matching the hash key\n"
       
    44 "    {\n"
       /* Entries already present in $statsList are kept; the ones added by this call occupy positions from $start onwards. */
    45 "      $start=sizeof($statsList);\n"
       
    46 "      $count=$start;\n"
       /* First pass: walk the word list. Each record is a NUL-terminated word followed by a 4-byte offset; the loop stops at the first word that evaluates to false (an empty string). */
    47 "      fseek($file,$index);\n"
       
    48 "      $w = readString($file);\n"
       
    49 "      while ($w)\n"
       
    50 "      {\n"
       
    51 "        $statIdx = readInt($file);\n"
       /* Prefix comparison: the index word $w matches when it starts with $word. */
    52 "        if ($word==substr($w,0,strlen($word)))\n"
       
    53 "        { // found word that matches (as substring)\n"
       
    54 "          $statsList[$count++]=array(\n"
       
    55 "              \"word\"=>$word,\n"
       
    56 "              \"match\"=>$w,\n"
       
    57 "              \"index\"=>$statIdx,\n"
       
    58 "              \"full\"=>strlen($w)==strlen($word),\n"
       
    59 "              \"docs\"=>array()\n"
       
    60 "              );\n"
       
    61 "        }\n"
       
    62 "        $w = readString($file);\n"
       
    63 "      }\n"
       /* Second pass: load the document list of every entry added above and accumulate the frequency totals used for ranking. */
    64 "      $totalHi=0;\n"
       
    65 "      $totalFreqHi=0;\n"
       
    66 "      $totalFreqLo=0;\n"
       
    67 "      for ($count=$start;$count<sizeof($statsList);$count++)\n"
       
    68 "      {\n"
       
    69 "        $statInfo = &$statsList[$count];\n"
       
    70 "        $multiplier = 1;\n"
       
    71 "        // whole word matches have a double weight\n"
       
    72 "        if ($statInfo[\"full\"]) $multiplier=2;\n"
       
    73 "        fseek($file,$statInfo[\"index\"]); \n"
       
    74 "        $numDocs = readInt($file);\n"
       
    75 "        $docInfo = array();\n"
       
    76 "        // read docs info + occurrence frequency of the word\n"
       
    77 "        for ($i=0;$i<$numDocs;$i++)\n"
       
    78 "        {\n"
       /* Each document record is two integers: the file offset of its name/url strings and a frequency value whose lowest bit is the high priority flag. */
    79 "          $idx=readInt($file); \n"
       
    80 "          $freq=readInt($file); \n"
       
    81 "          $docInfo[$i]=array(\"idx\"  => $idx,\n"
       
    82 "                             \"freq\" => $freq>>1,\n"
       
    83 "                             \"rank\" => 0.0,\n"
       
    84 "                             \"hi\"   => $freq&1\n"
       
    85 "                            );\n"
       /* NOTE(review): the totals below add the raw value (flag bit included, not shifted) while the stored freq is shifted right by one - confirm this is intended. */
    86 "          if ($freq&1) // word occurs in high priority doc\n"
       
    87 "          {\n"
       
    88 "            $totalHi++;\n"
       
    89 "            $totalFreqHi+=$freq*$multiplier;\n"
       
    90 "          }\n"
       
    91 "          else // word occurs in low priority doc\n"
       
    92 "          {\n"
       
    93 "            $totalFreqLo+=$freq*$multiplier;\n"
       
    94 "          }\n"
       
    95 "        }\n"
       /* The name/url lookups seek elsewhere in the file, so they run only after all records of this entry have been read sequentially. */
    96 "        // read name and url info for the doc\n"
       
    97 "        for ($i=0;$i<$numDocs;$i++)\n"
       
    98 "        {\n"
       
    99 "          fseek($file,$docInfo[$i][\"idx\"]);\n"
       
   100 "          $docInfo[$i][\"name\"]=readString($file);\n"
       
   101 "          $docInfo[$i][\"url\"]=readString($file);\n"
       
   102 "        }\n"
       
   103 "        $statInfo[\"docs\"]=$docInfo;\n"
       
   104 "      }\n"
       /* Divisor shared by every rank computed below. */
   105 "      $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;\n"
       /* Third pass: give each document of each new entry a rank relative to $totalFreq; high priority documents get $totalFreqLo added to the numerator. */
   106 "      for ($count=$start;$count<sizeof($statsList);$count++)\n"
       
   107 "      {\n"
       
   108 "        $statInfo = &$statsList[$count];\n"
       
   109 "        $multiplier = 1;\n"
       
   110 "        // whole word matches have a double weight\n"
       
   111 "        if ($statInfo[\"full\"]) $multiplier=2;\n"
       
   112 "        for ($i=0;$i<sizeof($statInfo[\"docs\"]);$i++)\n"
       
   113 "        {\n"
       
   114 "          $docInfo = &$statInfo[\"docs\"];\n"
       
   115 "          // compute frequency rank of the word in each doc\n"
       
   116 "          $freq=$docInfo[$i][\"freq\"];\n"
       
   117 "          if ($docInfo[$i][\"hi\"])\n"
       
   118 "          {\n"
       
   119 "            $statInfo[\"docs\"][$i][\"rank\"]=\n"
       
   120 "              (float)($freq*$multiplier+$totalFreqLo)/$totalFreq;\n"
       
   121 "          }\n"
       
   122 "          else\n"
       
   123 "          {\n"
       
   124 "            $statInfo[\"docs\"][$i][\"rank\"]=\n"
       
   125 "              (float)($freq*$multiplier)/$totalFreq;\n"
       
   126 "          }\n"
       
   127 "        }\n"
       
   128 "      }\n"
       
   129 "    }\n"
       
   130 "  }\n"
       
   131 "  return $statsList;\n"
       
   132 "}\n"
       
   133 "\n"
       
   134 "function combine_results($results,&$docs)\n"
       /* Merges the per-word results into $docs (by reference, keyed by document url): ranks for the same url are summed and every word hit is appended to the words list of that document. Returns $docs. */
   135 "{\n"
       
   136 "  foreach ($results as $wordInfo)\n"
       
   137 "  {\n"
       
   138 "    $docsList = &$wordInfo[\"docs\"];\n"
       
   139 "    foreach ($docsList as $di)\n"
       
   140 "    {\n"
       
   141 "      $key=$di[\"url\"];\n"
       
   142 "      $rank=$di[\"rank\"];\n"
       /* Direct key lookup: avoids building and scanning array_keys() for every hit and cannot confuse two distinct urls through loose comparison. */
   143 "      if (array_key_exists($key,$docs))\n"
       
   144 "      {\n"
       
   145 "        $docs[$key][\"rank\"]+=$rank;\n"
       
   146 "      }\n"
       
   147 "      else\n"
       
   148 "      {\n"
       
   149 "        $docs[$key] = array(\"url\"=>$key,\n"
       
   150 "            \"name\"=>$di[\"name\"],\n"
       
   151 "            \"rank\"=>$rank\n"
       
   152 "            );\n"
       
   153 "      }\n"
       /* Recorded for new and existing documents alike; filter_results() and report_results() read this list. */
   154 "      $docs[$key][\"words\"][] = array(\n"
       
   155 "               \"word\"=>$wordInfo[\"word\"],\n"
       
   156 "               \"match\"=>$wordInfo[\"match\"],\n"
       
   157 "               \"freq\"=>$di[\"freq\"]\n"
       
   158 "               );\n"
       
   159 "    }\n"
       
   160 "  }\n"
       
   161 "  return $docs;\n"
       
   162 "}\n"
       
   163 "\n"
       
   164 "function filter_results($docs,&$requiredWords,&$forbiddenWords)\n"
       /* Returns the entries of $docs (keys preserved) whose words list contains every word of $requiredWords and no word of $forbiddenWords. */
   165 "{\n"
       
   166 "  $filteredDocs=array();\n"
       /* foreach replaces the each() loop; each() was deprecated in PHP 7.2 and removed in PHP 8.0. */
   167 "  foreach ($docs as $key => $val)\n"
       
   168 "  {\n"
       
   169 "    $words = $val[\"words\"];\n"
       
   170 "    $copy=1; // copy entry by default\n"
       
   171 "    if (sizeof($requiredWords)>0)\n"
       
   172 "    {\n"
       
   173 "      foreach ($requiredWords as $reqWord)\n"
       
   174 "      {\n"
       
   175 "        $found=0;\n"
       
   176 "        foreach ($words as $wordInfo)\n"
       
   177 "        { \n"
       
   178 "          $found = $wordInfo[\"word\"]==$reqWord;\n"
       
   179 "          if ($found) break;\n"
       
   180 "        }\n"
       
   181 "        if (!$found) \n"
       
   182 "        {\n"
       
   183 "          $copy=0; // document lacks at least one required word\n"
       
   184 "          break;\n"
       
   185 "        }\n"
       
   186 "      }\n"
       
   187 "    }\n"
       
   188 "    if (sizeof($forbiddenWords)>0)\n"
       
   189 "    {\n"
       
   190 "      foreach ($words as $wordInfo)\n"
       
   191 "      {\n"
       
   192 "        if (in_array($wordInfo[\"word\"],$forbiddenWords))\n"
       
   193 "        {\n"
       
   194 "          $copy=0; // document contains a forbidden word\n"
       
   195 "          break;\n"
       
   196 "        }\n"
       
   197 "      }\n"
       
   198 "    }\n"
       
   199 "    if ($copy) $filteredDocs[$key]=$val;\n"
       
   200 "  }\n"
       
   201 "  return $filteredDocs;\n"
       
   202 "}\n"
       
   203 "\n"
       
   204 "function compare_rank($a,$b)\n"
       /* Comparison callback handed to usort() by sort_results(): returns -1 when $a has the larger rank, 1 when it has the smaller one and 0 when both ranks are equal. */
   205 "{\n"
       
   206 "  $left  = $a[\"rank\"];\n"
       
   207 "  $right = $b[\"rank\"];\n"
       
   208 "  // the larger rank has to come first\n"
       
   209 "  if ($left > $right) return -1;\n"
       
   210 "  return ($left == $right) ? 0 : 1;\n"
       
   211 "}\n"
       
   212 "\n"
       
   213 "function sort_results($docs,&$sorted)\n"
       /* Stores the entries of $docs, ordered by compare_rank(), in $sorted (by reference) and also returns them. */
   214 "{\n"
       /* $docs arrives by value, so ordering it here leaves the array of the caller untouched. */
   215 "  usort($docs,\"compare_rank\");\n"
       
   216 "  $sorted = $docs;\n"
       
   217 "  return $sorted;\n"
       
   218 "}\n"
       
   219 "\n"
       
   220 "function report_results(&$docs)\n"
       /* Prints the result table for $docs: a heading row, a summary row from matches_text(), then two rows per document (numbered link, followed by the matched words with their frequencies). */
   221 "{\n"
       
   222 "  echo \"<table cellspacing=\\\"2\\\">\\n\";\n"
       
   223 "  echo \"  <tr>\\n\";\n"
       
   224 "  echo \"    <td colspan=\\\"2\\\"><h2>\".search_results().\"</h2></td>\\n\";\n"
       
   225 "  echo \"  </tr>\\n\";\n"
       
   226 "  $numDocs = sizeof($docs);\n"
       
   227 "  if ($numDocs==0)\n"
       
   228 "  {\n"
       
   229 "    echo \"  <tr>\\n\";\n"
       
   230 "    echo \"    <td colspan=\\\"2\\\">\".matches_text(0).\"</td>\\n\";\n"
       
   231 "    echo \"  </tr>\\n\";\n"
       
   232 "  }\n"
       
   233 "  else\n"
       
   234 "  {\n"
       
   235 "    echo \"  <tr>\\n\";\n"
       
   236 "    echo \"    <td colspan=\\\"2\\\">\".matches_text($numDocs);\n"
       
   237 "    echo \"\\n\";\n"
       
   238 "    echo \"    </td>\\n\";\n"
       
   239 "    echo \"  </tr>\\n\";\n"
       
   240 "    $num=1;\n"
       
   241 "    foreach ($docs as $doc)\n"
       
   242 "    {\n"
       
   243 "      echo \"  <tr>\\n\";\n"
       
   244 "      echo \"    <td align=\\\"right\\\">$num.</td>\";\n"
       
   245 "      echo     \"<td><a class=\\\"el\\\" href=\\\"\".$doc[\"url\"].\"\\\">\".$doc[\"name\"].\"</a></td>\\n\";\n"
       /* The link row is closed here before the row listing the matched words is opened; previously a second tr was opened without closing the first. */
   246 "      echo \"  </tr>\\n  <tr>\\n\";\n"
       
   247 "      echo \"    <td></td><td class=\\\"tiny\\\">\".report_matches().\" \";\n"
       
   248 "      foreach ($doc[\"words\"] as $wordInfo)\n"
       
   249 "      {\n"
       
   250 "        $word = $wordInfo[\"word\"];\n"
       /* The searched prefix is printed in bold, followed by the remainder of the matched index word and the frequency in parentheses. */
   251 "        $matchRight = substr($wordInfo[\"match\"],strlen($word));\n"
       
   252 "        echo \"<b>$word</b>$matchRight(\".$wordInfo[\"freq\"].\") \";\n"
       
   253 "      }\n"
       
   254 "      echo \"    </td>\\n\";\n"
       
   255 "      echo \"  </tr>\\n\";\n"
       
   256 "      $num++;\n"
       
   257 "    }\n"
       
   258 "  }\n"
       
   259 "  echo \"</table>\\n\";\n"
       
   260 "}\n"
       
   261 "\n"
       
   262 "function main()\n"
       /* Entry point: opens search/search.idx, verifies its DOXS header, splits the query request parameter into words, searches each distinct word, then combines, filters, sorts and prints the results. */
   263 "{\n"
       /* version_compare() orders versions numerically; the former strcmp() check was lexicographic and would reject any version with a two-digit major number. If the function is missing, PHP is older than 4.1.0. */
   264 "  if (!function_exists('version_compare') || version_compare(phpversion(),'4.1.0')<0)\n"
       
   265 "  {\n"
       
   266 "    die(\"Error: PHP version 4.1.0 or above required!\");\n"
       
   267 "  }\n"
       
   268 "  if (!($file=fopen(\"search/search.idx\",\"rb\"))) \n"
       
   269 "  {\n"
       
   270 "    die(\"Error: Search index file could NOT be opened!\");\n"
       
   271 "  }\n"
       
   272 "  if (readHeader($file)!=\"DOXS\")\n"
       
   273 "  {\n"
       
   274 "    die(\"Error: Header of index file is invalid!\");\n"
       
   275 "  }\n"
       
   276 "  $query=\"\";\n"
       /* A request such as query[]=x delivers an array; only a plain string is accepted as the query. */
   277 "  if (array_key_exists(\"query\", $_GET) && is_string($_GET[\"query\"]))\n"
       
   278 "  {\n"
       
   279 "    $query=$_GET[\"query\"];\n"
       
   280 "  }\n"
       
   281 "  end_form(preg_replace(\"/[^a-zA-Z0-9\\-\\_\\.]/i\", \" \", $query ));\n"
       
   282 "  echo \"&nbsp;\\n<div class=\\\"searchresults\\\">\\n\";\n"
       
   283 "  $results = array();\n"
       
   284 "  $requiredWords = array();\n"
       
   285 "  $forbiddenWords = array();\n"
       
   286 "  $foundWords = array();\n"
       
   287 "  $word=strtok($query,\" \");\n"
       /* strtok() signals the end with false; the explicit test keeps a token consisting of the single digit 0 from ending the loop early. */
   288 "  while ($word!==false) // for each word in the search query\n"
       
   289 "  {\n"
       /* Words are lower-cased before they are stored, because filter_results() compares them with the lower-case words recorded by search(). */
   290 "    $word=strtolower($word);\n"
       /* A leading plus marks a required word and a leading minus a forbidden one; a marker with nothing after it is ignored. */
   291 "    $flag=$word[0];\n"
       
   292 "    if ($flag=='+' || $flag=='-') $word=(string)substr($word,1);\n"
       
   293 "    if ($word!==\"\" && $flag=='+') $requiredWords[]=$word;\n"
       
   294 "    if ($word!==\"\" && $flag=='-') $forbiddenWords[]=$word;\n"
       
   295 "    if ($word!==\"\" && !in_array($word,$foundWords))\n"
       
   296 "    { $foundWords[]=$word; search($file,$word,$results); }\n"
       
   297 "    $word=strtok(\" \");\n"
       
   298 "  }\n"
       
   299 "  $docs = array();\n"
       
   300 "  combine_results($results,$docs);\n"
       
   301 "  // filter out documents with forbidden word or that do not contain\n"
       
   302 "  // required words\n"
       
   303 "  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);\n"
       
   304 "  // sort the results based on rank\n"
       
   305 "  $sorted = array();\n"
       
   306 "  sort_results($filteredDocs,$sorted);\n"
       
   307 "  // report results to the user\n"
       
   308 "  report_results($sorted);\n"
       
   309 "  echo \"</div>\\n\";\n"
       
   310 "  fclose($file);\n"
       
   311 "}\n"
       
   312 "\n"
       
   313 "main();\n" /* top-level call: the search runs when the generated script is executed */
       
   314 "\n"