Orb/Doxygen/src/searchindex.cpp
changeset 0 42188c7ea2d9
child 4 468f4c8d3d5b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Orb/Doxygen/src/searchindex.cpp	Thu Jan 21 17:29:01 2010 +0000
@@ -0,0 +1,307 @@
+/******************************************************************************
+ *
+ * 
+ *
+ * Copyright (C) 1997-2008 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby 
+ * granted. No representations are made about the suitability of this software 
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+
+#include "qtbc.h"
+#include "searchindex.h"
+#include "config.h"
+#include "util.h"
+#include <qfile.h>
+#include <ctype.h>
+#include <qregexp.h>
+
+
+// file format: (all multi-byte values are stored in big endian format)
+//   4 byte header
+//   256*256*4 byte index (4 bytes)
+//   for each index entry: a zero terminated list of words 
+//   for each word: a \0 terminated string + 4 byte offset to the stats info
+//   padding bytes to align at 4 byte boundary
+//   for each word: the number of urls (4 bytes) 
+//               + for each url containing the word 8 bytes statistics
+//                 (4 bytes index to url string + 4 bytes frequency counter)
+//   for each url: a \0 terminated string
+
+const int numIndexEntries = 256*256;
+
+//--------------------------------------------------------------------
+
+IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
+{
+  m_urls.setAutoDelete(TRUE);
+  //printf("IndexWord::IndexWord(%s)\n",word);
+}
+
+void IndexWord::addUrlIndex(int idx,bool hiPriority)
+{
+  //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
+  URLInfo *ui = m_urls.find(idx);
+  if (ui==0)
+  {
+    //printf("URLInfo::URLInfo(%d)\n",idx);
+    ui=new URLInfo(idx,0);
+    m_urls.insert(idx,ui);
+  }
+  ui->freq+=2;
+  if (hiPriority) ui->freq|=1; // mark as high priority document
+}
+
+//--------------------------------------------------------------------
+
+SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
+{
+  int i;
+  m_words.setAutoDelete(TRUE);
+  m_urls.setAutoDelete(TRUE);
+  m_index.setAutoDelete(TRUE);
+  for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
+}
+
+void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
+{
+  if (name==0 || baseName==0) return;
+  //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
+  QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
+  if (anchor) url+=(QCString)"#"+anchor;  
+  m_urlIndex++;
+  m_urls.insert(m_urlIndex,new URL(name,url));
+}
+
+static int charsToIndex(const char *word)
+{
+  if (word==0) return -1;
+
+  // Fast string hashing algorithm
+  //register ushort h=0;
+  //const char *k = word;
+  //ushort mask=0xfc00;
+  //while ( *k ) 
+  //{
+  //  h = (h&mask)^(h<<6)^(*k++);
+  //}
+  //return h;
+
+  // Simple hashing that allows for substring searching
+  uint c1=word[0];
+  if (c1==0) return -1;
+  uint c2=word[1];
+  if (c2==0) return -1;
+  return c1*256+c2;
+}
+
+void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)
+{
+  static QRegExp nextPart("[_a-z:][A-Z]");
+  //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
+  QString wStr(word);
+  if (wStr.isEmpty()) return;
+  wStr=wStr.lower();
+  IndexWord *w = m_words[wStr];
+  if (w==0)
+  {
+    int idx=charsToIndex(wStr);
+    if (idx<0) return;
+    w = new IndexWord(wStr);
+    //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
+    m_index[idx]->append(w);
+    m_words.insert(wStr,w);
+  }
+  w->addUrlIndex(m_urlIndex,hiPriority);
+  int i;
+  bool found=FALSE;
+  if (!recurse) // the first time we check if we can strip the prefix
+  {
+    i=getPrefixIndex(word);
+    if (i>0)
+    {
+      addWord(word+i,hiPriority,TRUE);
+      found=TRUE;
+    }
+  }
+  if (!found) // no prefix stripped
+  {
+    if ((i=nextPart.match(word))>=1)
+    {
+      addWord(word+i+1,hiPriority,TRUE);
+    }
+  }
+}
+
+
+static void writeInt(QFile &f,int index)
+{
+  f.putch(((uint)index)>>24);
+  f.putch((((uint)index)>>16)&0xff);
+  f.putch((((uint)index)>>8)&0xff);
+  f.putch(((uint)index)&0xff);
+}
+
+static void writeString(QFile &f,const char *s)
+{
+  const char *p = s;
+  while (*p) f.putch(*p++);
+  f.putch(0);
+}
+
+void SearchIndex::write(const char *fileName)
+{
+  int i;
+  int size=4; // for the header
+  size+=4*numIndexEntries; // for the index
+  int wordsOffset = size;
+  // first pass: compute the size of the wordlist
+  for (i=0;i<numIndexEntries;i++)
+  {
+    QList<IndexWord> *wlist = m_index[i];
+    if (!wlist->isEmpty())
+    {
+      QListIterator<IndexWord> iwi(*wlist);
+      IndexWord *iw;
+      for (iwi.toFirst();(iw=iwi.current());++iwi)
+      {
+        int ws = iw->word().length()+1; 
+        size+=ws+4; // word + url info list offset
+      }
+      size+=1; // zero list terminator
+    }
+  }
+
+  // second pass: compute the offsets in the index
+  int indexOffsets[numIndexEntries];
+  int offset=wordsOffset;
+  for (i=0;i<numIndexEntries;i++)
+  {
+    QList<IndexWord> *wlist = m_index[i];
+    if (!wlist->isEmpty())
+    {
+      indexOffsets[i]=offset;
+      QListIterator<IndexWord> iwi(*wlist);
+      IndexWord *iw;
+      for (iwi.toFirst();(iw=iwi.current());++iwi)
+      {
+        offset+= iw->word().length()+1; 
+        offset+=4; // word + offset to url info array 
+      }
+      offset+=1; // zero list terminator
+    }
+    else
+    {
+      indexOffsets[i]=0;
+    }
+  }
+  int padding = size;
+  size = (size+3)&~3; // round up to 4 byte boundary
+  padding = size - padding;
+
+  //int statsOffset = size;
+  QDictIterator<IndexWord> wdi(m_words);
+  //IndexWord *iw;
+  int *wordStatOffsets = new int[m_words.count()];
+  
+  int count=0;
+
+  // third pass: compute offset to stats info for each word
+  for (i=0;i<numIndexEntries;i++)
+  {
+    QList<IndexWord> *wlist = m_index[i];
+    if (!wlist->isEmpty())
+    {
+      QListIterator<IndexWord> iwi(*wlist);
+      IndexWord *iw;
+      for (iwi.toFirst();(iw=iwi.current());++iwi)
+      {
+        //printf("wordStatOffsets[%d]=%d\n",count,size);
+        wordStatOffsets[count++] = size;
+        size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
+      }
+    }
+  }
+  int *urlOffsets = new int[m_urls.count()];
+  //int urlsOffset = size;
+  QIntDictIterator<URL> udi(m_urls);
+  URL *url;
+  for (udi.toFirst();(url=udi.current());++udi)
+  {
+    urlOffsets[udi.currentKey()]=size;
+    size+=url->name.length()+1+
+          url->url.length()+1;
+  }
+  //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
+  QFile f(fileName);
+  if (f.open(IO_WriteOnly))
+  {
+    // write header
+    f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
+    // write index
+    for (i=0;i<numIndexEntries;i++)
+    {
+      writeInt(f,indexOffsets[i]);
+    }
+    // write word lists
+    count=0;
+    for (i=0;i<numIndexEntries;i++)
+    {
+      QList<IndexWord> *wlist = m_index[i];
+      if (!wlist->isEmpty())
+      {
+        QListIterator<IndexWord> iwi(*wlist);
+        IndexWord *iw;
+        for (iwi.toFirst();(iw=iwi.current());++iwi)
+        {
+          writeString(f,iw->word());
+          writeInt(f,wordStatOffsets[count++]);
+        }
+        f.putch(0);
+      }
+    }
+    // write extra padding bytes
+    for (i=0;i<padding;i++) f.putch(0);
+    // write word statistics
+    for (i=0;i<numIndexEntries;i++)
+    {
+      QList<IndexWord> *wlist = m_index[i];
+      if (!wlist->isEmpty())
+      {
+        QListIterator<IndexWord> iwi(*wlist);
+        IndexWord *iw;
+        for (iwi.toFirst();(iw=iwi.current());++iwi)
+        {
+          int numUrls = iw->urls().count();
+          writeInt(f,numUrls);
+          QIntDictIterator<URLInfo> uli(iw->urls());
+          URLInfo *ui;
+          for (uli.toFirst();(ui=uli.current());++uli)
+          {
+            writeInt(f,urlOffsets[ui->urlIdx]);
+            writeInt(f,ui->freq);
+          }
+        }
+      }
+    }
+    // write urls
+    QIntDictIterator<URL> udi(m_urls);
+    URL *url;
+    for (udi.toFirst();(url=udi.current());++udi)
+    {
+      writeString(f,url->name);
+      writeString(f,url->url);
+    }
+  }
+
+  delete[] urlOffsets;
+  delete[] wordStatOffsets;
+}
+