--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Orb/Doxygen/src/searchindex.cpp Thu Jan 21 17:29:01 2010 +0000
@@ -0,0 +1,307 @@
+/******************************************************************************
+ *
+ *
+ *
+ * Copyright (C) 1997-2008 by Dimitri van Heesch.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation under the terms of the GNU General Public License is hereby
+ * granted. No representations are made about the suitability of this software
+ * for any purpose. It is provided "as is" without express or implied warranty.
+ * See the GNU General Public License for more details.
+ *
+ * Documents produced by Doxygen are derivative works derived from the
+ * input used in their production; they are not affected by this license.
+ *
+ */
+
+#include "qtbc.h"
+#include "searchindex.h"
+#include "config.h"
+#include "util.h"
+#include <qfile.h>
+#include <ctype.h>
+#include <qregexp.h>
+
+
+// file format: (all multi-byte values are stored in big endian format)
+// 4 byte header
+// 256*256*4 byte index (4 bytes)
+// for each index entry: a zero terminated list of words
+// for each word: a \0 terminated string + 4 byte offset to the stats info
+// padding bytes to align at 4 byte boundary
+// for each word: the number of urls (4 bytes)
+// + for each url containing the word 8 bytes statistics
+// (4 bytes index to url string + 4 bytes frequency counter)
+// for each url: a \0 terminated string
+
+const int numIndexEntries = 256*256;
+
+//--------------------------------------------------------------------
+
+IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
+{
+ m_urls.setAutoDelete(TRUE);
+ //printf("IndexWord::IndexWord(%s)\n",word);
+}
+
+void IndexWord::addUrlIndex(int idx,bool hiPriority)
+{
+ //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
+ URLInfo *ui = m_urls.find(idx);
+ if (ui==0)
+ {
+ //printf("URLInfo::URLInfo(%d)\n",idx);
+ ui=new URLInfo(idx,0);
+ m_urls.insert(idx,ui);
+ }
+ ui->freq+=2;
+ if (hiPriority) ui->freq|=1; // mark as high priority document
+}
+
+//--------------------------------------------------------------------
+
+SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
+{
+ int i;
+ m_words.setAutoDelete(TRUE);
+ m_urls.setAutoDelete(TRUE);
+ m_index.setAutoDelete(TRUE);
+ for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
+}
+
+void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
+{
+ if (name==0 || baseName==0) return;
+ //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
+ QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
+ if (anchor) url+=(QCString)"#"+anchor;
+ m_urlIndex++;
+ m_urls.insert(m_urlIndex,new URL(name,url));
+}
+
+static int charsToIndex(const char *word)
+{
+ if (word==0) return -1;
+
+ // Fast string hashing algorithm
+ //register ushort h=0;
+ //const char *k = word;
+ //ushort mask=0xfc00;
+ //while ( *k )
+ //{
+ // h = (h&mask)^(h<<6)^(*k++);
+ //}
+ //return h;
+
+ // Simple hashing that allows for substring searching
+ uint c1=word[0];
+ if (c1==0) return -1;
+ uint c2=word[1];
+ if (c2==0) return -1;
+ return c1*256+c2;
+}
+
+void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)
+{
+ static QRegExp nextPart("[_a-z:][A-Z]");
+ //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
+ QString wStr(word);
+ if (wStr.isEmpty()) return;
+ wStr=wStr.lower();
+ IndexWord *w = m_words[wStr];
+ if (w==0)
+ {
+ int idx=charsToIndex(wStr);
+ if (idx<0) return;
+ w = new IndexWord(wStr);
+ //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
+ m_index[idx]->append(w);
+ m_words.insert(wStr,w);
+ }
+ w->addUrlIndex(m_urlIndex,hiPriority);
+ int i;
+ bool found=FALSE;
+ if (!recurse) // the first time we check if we can strip the prefix
+ {
+ i=getPrefixIndex(word);
+ if (i>0)
+ {
+ addWord(word+i,hiPriority,TRUE);
+ found=TRUE;
+ }
+ }
+ if (!found) // no prefix stripped
+ {
+ if ((i=nextPart.match(word))>=1)
+ {
+ addWord(word+i+1,hiPriority,TRUE);
+ }
+ }
+}
+
+
+static void writeInt(QFile &f,int index)
+{
+ f.putch(((uint)index)>>24);
+ f.putch((((uint)index)>>16)&0xff);
+ f.putch((((uint)index)>>8)&0xff);
+ f.putch(((uint)index)&0xff);
+}
+
+static void writeString(QFile &f,const char *s)
+{
+ const char *p = s;
+ while (*p) f.putch(*p++);
+ f.putch(0);
+}
+
+void SearchIndex::write(const char *fileName)
+{
+ int i;
+ int size=4; // for the header
+ size+=4*numIndexEntries; // for the index
+ int wordsOffset = size;
+ // first pass: compute the size of the wordlist
+ for (i=0;i<numIndexEntries;i++)
+ {
+ QList<IndexWord> *wlist = m_index[i];
+ if (!wlist->isEmpty())
+ {
+ QListIterator<IndexWord> iwi(*wlist);
+ IndexWord *iw;
+ for (iwi.toFirst();(iw=iwi.current());++iwi)
+ {
+ int ws = iw->word().length()+1;
+ size+=ws+4; // word + url info list offset
+ }
+ size+=1; // zero list terminator
+ }
+ }
+
+ // second pass: compute the offsets in the index
+ int indexOffsets[numIndexEntries];
+ int offset=wordsOffset;
+ for (i=0;i<numIndexEntries;i++)
+ {
+ QList<IndexWord> *wlist = m_index[i];
+ if (!wlist->isEmpty())
+ {
+ indexOffsets[i]=offset;
+ QListIterator<IndexWord> iwi(*wlist);
+ IndexWord *iw;
+ for (iwi.toFirst();(iw=iwi.current());++iwi)
+ {
+ offset+= iw->word().length()+1;
+ offset+=4; // word + offset to url info array
+ }
+ offset+=1; // zero list terminator
+ }
+ else
+ {
+ indexOffsets[i]=0;
+ }
+ }
+ int padding = size;
+ size = (size+3)&~3; // round up to 4 byte boundary
+ padding = size - padding;
+
+ //int statsOffset = size;
+ QDictIterator<IndexWord> wdi(m_words);
+ //IndexWord *iw;
+ int *wordStatOffsets = new int[m_words.count()];
+
+ int count=0;
+
+ // third pass: compute offset to stats info for each word
+ for (i=0;i<numIndexEntries;i++)
+ {
+ QList<IndexWord> *wlist = m_index[i];
+ if (!wlist->isEmpty())
+ {
+ QListIterator<IndexWord> iwi(*wlist);
+ IndexWord *iw;
+ for (iwi.toFirst();(iw=iwi.current());++iwi)
+ {
+ //printf("wordStatOffsets[%d]=%d\n",count,size);
+ wordStatOffsets[count++] = size;
+ size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
+ }
+ }
+ }
+ int *urlOffsets = new int[m_urls.count()];
+ //int urlsOffset = size;
+ QIntDictIterator<URL> udi(m_urls);
+ URL *url;
+ for (udi.toFirst();(url=udi.current());++udi)
+ {
+ urlOffsets[udi.currentKey()]=size;
+ size+=url->name.length()+1+
+ url->url.length()+1;
+ }
+ //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
+ QFile f(fileName);
+ if (f.open(IO_WriteOnly))
+ {
+ // write header
+ f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
+ // write index
+ for (i=0;i<numIndexEntries;i++)
+ {
+ writeInt(f,indexOffsets[i]);
+ }
+ // write word lists
+ count=0;
+ for (i=0;i<numIndexEntries;i++)
+ {
+ QList<IndexWord> *wlist = m_index[i];
+ if (!wlist->isEmpty())
+ {
+ QListIterator<IndexWord> iwi(*wlist);
+ IndexWord *iw;
+ for (iwi.toFirst();(iw=iwi.current());++iwi)
+ {
+ writeString(f,iw->word());
+ writeInt(f,wordStatOffsets[count++]);
+ }
+ f.putch(0);
+ }
+ }
+ // write extra padding bytes
+ for (i=0;i<padding;i++) f.putch(0);
+ // write word statistics
+ for (i=0;i<numIndexEntries;i++)
+ {
+ QList<IndexWord> *wlist = m_index[i];
+ if (!wlist->isEmpty())
+ {
+ QListIterator<IndexWord> iwi(*wlist);
+ IndexWord *iw;
+ for (iwi.toFirst();(iw=iwi.current());++iwi)
+ {
+ int numUrls = iw->urls().count();
+ writeInt(f,numUrls);
+ QIntDictIterator<URLInfo> uli(iw->urls());
+ URLInfo *ui;
+ for (uli.toFirst();(ui=uli.current());++uli)
+ {
+ writeInt(f,urlOffsets[ui->urlIdx]);
+ writeInt(f,ui->freq);
+ }
+ }
+ }
+ }
+ // write urls
+ QIntDictIterator<URL> udi(m_urls);
+ URL *url;
+ for (udi.toFirst();(url=udi.current());++udi)
+ {
+ writeString(f,url->name);
+ writeString(f,url->url);
+ }
+ }
+
+ delete[] urlOffsets;
+ delete[] wordStatOffsets;
+}
+