Orb/Doxygen/src/search.php
changeset 0 42188c7ea2d9
equal deleted inserted replaced
-1:000000000000 0:42188c7ea2d9
       
     1 function readInt($file)
       
     2 {
       
     3   $b1 = ord(fgetc($file)); $b2 = ord(fgetc($file));
       
     4   $b3 = ord(fgetc($file)); $b4 = ord(fgetc($file));
       
     5   return ($b1<<24)|($b2<<16)|($b3<<8)|$b4;
       
     6 }
       
     7 
       
     8 function readString($file)
       
     9 {
       
    10   $result="";
       
    11   while (ord($c=fgetc($file))) $result.=$c;
       
    12   return $result;
       
    13 }
       
    14 
       
    15 function readHeader($file)
       
    16 {
       
    17   $header =fgetc($file); $header.=fgetc($file);
       
    18   $header.=fgetc($file); $header.=fgetc($file);
       
    19   return $header;
       
    20 }
       
    21 
       
    22 function computeIndex($word)
       
    23 {
       
    24   // Simple hashing that allows for substring search
       
    25   if (strlen($word)<2) return -1;
       
    26   // high char of the index
       
    27   $hi = ord($word{0});
       
    28   if ($hi==0) return -1;
       
    29   // low char of the index
       
    30   $lo = ord($word{1});
       
    31   if ($lo==0) return -1;
       
    32   // return index
       
    33   return $hi*256+$lo;
       
    34 }
       
    35 
       
    36 function search($file,$word,&$statsList)
       
    37 {
       
    38   $index = computeIndex($word);
       
    39   if ($index!=-1) // found a valid index
       
    40   {
       
    41     fseek($file,$index*4+4); // 4 bytes per entry, skip header
       
    42     $index = readInt($file);
       
    43     if ($index) // found words matching the hash key
       
    44     {
       
    45       $start=sizeof($statsList);
       
    46       $count=$start;
       
    47       fseek($file,$index);
       
    48       $w = readString($file);
       
    49       while ($w)
       
    50       {
       
    51         $statIdx = readInt($file);
       
    52         if ($word==substr($w,0,strlen($word)))
       
    53         { // found word that matches (as substring)
       
    54           $statsList[$count++]=array(
       
    55               "word"=>$word,
       
    56               "match"=>$w,
       
    57               "index"=>$statIdx,
       
    58               "full"=>strlen($w)==strlen($word),
       
    59               "docs"=>array()
       
    60               );
       
    61         }
       
    62         $w = readString($file);
       
    63       }
       
    64       $totalHi=0;
       
    65       $totalFreqHi=0;
       
    66       $totalFreqLo=0;
       
    67       for ($count=$start;$count<sizeof($statsList);$count++)
       
    68       {
       
    69         $statInfo = &$statsList[$count];
       
    70         $multiplier = 1;
       
    71         // whole word matches have a double weight
       
    72         if ($statInfo["full"]) $multiplier=2;
       
    73         fseek($file,$statInfo["index"]); 
       
    74         $numDocs = readInt($file);
       
    75         $docInfo = array();
       
    76         // read docs info + occurrence frequency of the word
       
    77         for ($i=0;$i<$numDocs;$i++)
       
    78         {
       
    79           $idx=readInt($file); 
       
    80           $freq=readInt($file); 
       
    81           $docInfo[$i]=array("idx"  => $idx,
       
    82                              "freq" => $freq>>1,
       
    83                              "rank" => 0.0,
       
    84                              "hi"   => $freq&1
       
    85                             );
       
    86           if ($freq&1) // word occurs in high priority doc
       
    87           {
       
    88             $totalHi++;
       
    89             $totalFreqHi+=$freq*$multiplier;
       
    90           }
       
    91           else // word occurs in low priority doc
       
    92           {
       
    93             $totalFreqLo+=$freq*$multiplier;
       
    94           }
       
    95         }
       
    96         // read name and url info for the doc
       
    97         for ($i=0;$i<$numDocs;$i++)
       
    98         {
       
    99           fseek($file,$docInfo[$i]["idx"]);
       
   100           $docInfo[$i]["name"]=readString($file);
       
   101           $docInfo[$i]["url"]=readString($file);
       
   102         }
       
   103         $statInfo["docs"]=$docInfo;
       
   104       }
       
   105       $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
       
   106       for ($count=$start;$count<sizeof($statsList);$count++)
       
   107       {
       
   108         $statInfo = &$statsList[$count];
       
   109         $multiplier = 1;
       
   110         // whole word matches have a double weight
       
   111         if ($statInfo["full"]) $multiplier=2;
       
   112         for ($i=0;$i<sizeof($statInfo["docs"]);$i++)
       
   113         {
       
   114           $docInfo = &$statInfo["docs"];
       
   115           // compute frequency rank of the word in each doc
       
   116           $freq=$docInfo[$i]["freq"];
       
   117           if ($docInfo[$i]["hi"])
       
   118           {
       
   119             $statInfo["docs"][$i]["rank"]=
       
   120               (float)($freq*$multiplier+$totalFreqLo)/$totalFreq;
       
   121           }
       
   122           else
       
   123           {
       
   124             $statInfo["docs"][$i]["rank"]=
       
   125               (float)($freq*$multiplier)/$totalFreq;
       
   126           }
       
   127         }
       
   128       }
       
   129     }
       
   130   }
       
   131   return $statsList;
       
   132 }
       
   133 
       
   134 function combine_results($results,&$docs)
       
   135 {
       
   136   foreach ($results as $wordInfo)
       
   137   {
       
   138     $docsList = &$wordInfo["docs"];
       
   139     foreach ($docsList as $di)
       
   140     {
       
   141       $key=$di["url"];
       
   142       $rank=$di["rank"];
       
   143       if (in_array($key, array_keys($docs)))
       
   144       {
       
   145         $docs[$key]["rank"]+=$rank;
       
   146       }
       
   147       else
       
   148       {
       
   149         $docs[$key] = array("url"=>$key,
       
   150             "name"=>$di["name"],
       
   151             "rank"=>$rank
       
   152             );
       
   153       }
       
   154       $docs[$key]["words"][] = array(
       
   155                "word"=>$wordInfo["word"],
       
   156                "match"=>$wordInfo["match"],
       
   157                "freq"=>$di["freq"]
       
   158                );
       
   159     }
       
   160   }
       
   161   return $docs;
       
   162 }
       
   163 
       
   164 function filter_results($docs,&$requiredWords,&$forbiddenWords)
       
   165 {
       
   166   $filteredDocs=array();
       
   167   while (list ($key, $val) = each ($docs)) 
       
   168   {
       
   169     $words = &$docs[$key]["words"];
       
   170     $copy=1; // copy entry by default
       
   171     if (sizeof($requiredWords)>0)
       
   172     {
       
   173       foreach ($requiredWords as $reqWord)
       
   174       {
       
   175         $found=0;
       
   176         foreach ($words as $wordInfo)
       
   177         { 
       
   178           $found = $wordInfo["word"]==$reqWord;
       
   179           if ($found) break;
       
   180         }
       
   181         if (!$found) 
       
   182         {
       
   183           $copy=0; // document contains none of the required words
       
   184           break;
       
   185         }
       
   186       }
       
   187     }
       
   188     if (sizeof($forbiddenWords)>0)
       
   189     {
       
   190       foreach ($words as $wordInfo)
       
   191       {
       
   192         if (in_array($wordInfo["word"],$forbiddenWords))
       
   193         {
       
   194           $copy=0; // document contains a forbidden word
       
   195           break;
       
   196         }
       
   197       }
       
   198     }
       
   199     if ($copy) $filteredDocs[$key]=$docs[$key];
       
   200   }
       
   201   return $filteredDocs;
       
   202 }
       
   203 
       
   204 function compare_rank($a,$b)
       
   205 {
       
   206   if ($a["rank"] == $b["rank"]) 
       
   207   {
       
   208     return 0;
       
   209   }
       
   210   return ($a["rank"]>$b["rank"]) ? -1 : 1; 
       
   211 }
       
   212 
       
   213 function sort_results($docs,&$sorted)
       
   214 {
       
   215   $sorted = $docs;
       
   216   usort($sorted,"compare_rank");
       
   217   return $sorted;
       
   218 }
       
   219 
       
   220 function report_results(&$docs)
       
   221 {
       
   222   echo "<table cellspacing=\"2\">\n";
       
   223   echo "  <tr>\n";
       
   224   echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
       
   225   echo "  </tr>\n";
       
   226   $numDocs = sizeof($docs);
       
   227   if ($numDocs==0)
       
   228   {
       
   229     echo "  <tr>\n";
       
   230     echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
       
   231     echo "  </tr>\n";
       
   232   }
       
   233   else
       
   234   {
       
   235     echo "  <tr>\n";
       
   236     echo "    <td colspan=\"2\">".matches_text($numDocs);
       
   237     echo "\n";
       
   238     echo "    </td>\n";
       
   239     echo "  </tr>\n";
       
   240     $num=1;
       
   241     foreach ($docs as $doc)
       
   242     {
       
   243       echo "  <tr>\n";
       
   244       echo "    <td align=\"right\">$num.</td>";
       
   245       echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
       
   246       echo "  <tr>\n";
       
   247       echo "    <td></td><td class=\"tiny\">".report_matches()." ";
       
   248       foreach ($doc["words"] as $wordInfo)
       
   249       {
       
   250         $word = $wordInfo["word"];
       
   251         $matchRight = substr($wordInfo["match"],strlen($word));
       
   252         echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
       
   253       }
       
   254       echo "    </td>\n";
       
   255       echo "  </tr>\n";
       
   256       $num++;
       
   257     }
       
   258   }
       
   259   echo "</table>\n";
       
   260 }
       
   261 
       
   262 function main()
       
   263 {
       
   264   if(strcmp('4.1.0', phpversion()) > 0) 
       
   265   {
       
   266     die("Error: PHP version 4.1.0 or above required!");
       
   267   }
       
   268   if (!($file=fopen("search/search.idx","rb"))) 
       
   269   {
       
   270     die("Error: Search index file could NOT be opened!");
       
   271   }
       
   272   if (readHeader($file)!="DOXS")
       
   273   {
       
   274     die("Error: Header of index file is invalid!");
       
   275   }
       
   276   $query="";
       
   277   if (array_key_exists("query", $_GET))
       
   278   {
       
   279     $query=$_GET["query"];
       
   280   }
       
   281   end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
       
   282   echo "&nbsp;\n<div class=\"searchresults\">\n";
       
   283   $results = array();
       
   284   $requiredWords = array();
       
   285   $forbiddenWords = array();
       
   286   $foundWords = array();
       
   287   $word=strtok($query," ");
       
   288   while ($word) // for each word in the search query
       
   289   {
       
   290     if (($word{0}=='+')) { $word=substr($word,1); $requiredWords[]=$word; }
       
   291     if (($word{0}=='-')) { $word=substr($word,1); $forbiddenWords[]=$word; }
       
   292     if (!in_array($word,$foundWords))
       
   293     {
       
   294       $foundWords[]=$word;
       
   295       search($file,strtolower($word),$results);
       
   296     }
       
   297     $word=strtok(" ");
       
   298   }
       
   299   $docs = array();
       
   300   combine_results($results,$docs);
       
   301   // filter out documents with forbidden word or that do not contain
       
   302   // required words
       
   303   $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
       
   304   // sort the results based on rank
       
   305   $sorted = array();
       
   306   sort_results($filteredDocs,$sorted);
       
   307   // report results to the user
       
   308   report_results($sorted);
       
   309   echo "</div>\n";
       
   310   fclose($file);
       
   311 }
       
   312 
       
       // Script entry point: invoke main() as soon as the file is executed.
   313 main();
       
   314