diff -r 932c358ece3e -r d8fccb2cd802 Orb/Doxygen/src/searchindex.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Orb/Doxygen/src/searchindex.cpp Fri Apr 23 20:47:58 2010 +0100 @@ -0,0 +1,307 @@ +/****************************************************************************** + * + * + * + * Copyright (C) 1997-2008 by Dimitri van Heesch. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation under the terms of the GNU General Public License is hereby + * granted. No representations are made about the suitability of this software + * for any purpose. It is provided "as is" without express or implied warranty. + * See the GNU General Public License for more details. + * + * Documents produced by Doxygen are derivative works derived from the + * input used in their production; they are not affected by this license. + * + */ + +#include "qtbc.h" +#include "searchindex.h" +#include "config.h" +#include "util.h" +#include +#include +#include + + +// file format: (all multi-byte values are stored in big endian format) +// 4 byte header +// 256*256*4 byte index (4 bytes) +// for each index entry: a zero terminated list of words +// for each word: a \0 terminated string + 4 byte offset to the stats info +// padding bytes to align at 4 byte boundary +// for each word: the number of urls (4 bytes) +// + for each url containing the word 8 bytes statistics +// (4 bytes index to url string + 4 bytes frequency counter) +// for each url: a \0 terminated string + +const int numIndexEntries = 256*256; + +//-------------------------------------------------------------------- + +IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17) +{ + m_urls.setAutoDelete(TRUE); + //printf("IndexWord::IndexWord(%s)\n",word); +} + +void IndexWord::addUrlIndex(int idx,bool hiPriority) +{ + //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority); + URLInfo *ui = m_urls.find(idx); + if (ui==0) + { + //printf("URLInfo::URLInfo(%d)\n",idx); + ui=new URLInfo(idx,0); + m_urls.insert(idx,ui); + } + ui->freq+=2; + if (hiPriority) ui->freq|=1; // mark as high priority document +} + +//-------------------------------------------------------------------- + +SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1) +{ + int i; + m_words.setAutoDelete(TRUE); + m_urls.setAutoDelete(TRUE); + m_index.setAutoDelete(TRUE); + for (i=0;i); +} + +void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor) +{ + if (name==0 || baseName==0) return; + //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor); + QCString url=baseName+Config_getString("HTML_FILE_EXTENSION"); + if (anchor) url+=(QCString)"#"+anchor; + m_urlIndex++; + m_urls.insert(m_urlIndex,new URL(name,url)); +} + +static int charsToIndex(const char *word) +{ + if (word==0) return -1; + + // Fast string hashing algorithm + //register ushort h=0; + //const char *k = word; + //ushort mask=0xfc00; + //while ( *k ) + //{ + // h = (h&mask)^(h<<6)^(*k++); + //} + //return h; + + // Simple hashing that allows for substring searching + uint c1=word[0]; + if (c1==0) return -1; + uint c2=word[1]; + if (c2==0) return -1; + return c1*256+c2; +} + +void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse) +{ + static QRegExp nextPart("[_a-z:][A-Z]"); + //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority); + QString wStr(word); + if (wStr.isEmpty()) return; + wStr=wStr.lower(); + IndexWord *w = m_words[wStr]; + if (w==0) + { + int idx=charsToIndex(wStr); + if (idx<0) return; + w = new IndexWord(wStr); + //fprintf(stderr,"addWord(%s) at index %d\n",word,idx); + m_index[idx]->append(w); + m_words.insert(wStr,w); + } + w->addUrlIndex(m_urlIndex,hiPriority); + int i; + bool found=FALSE; + if (!recurse) // the first time we check if we can strip the prefix + { + i=getPrefixIndex(word); + if (i>0) + { + addWord(word+i,hiPriority,TRUE); + found=TRUE; + } + } + if (!found) // no prefix stripped + { + if ((i=nextPart.match(word))>=1) + { + addWord(word+i+1,hiPriority,TRUE); + } + } +} + + +static void writeInt(QFile &f,int index) +{ + f.putch(((uint)index)>>24); + f.putch((((uint)index)>>16)&0xff); + f.putch((((uint)index)>>8)&0xff); + f.putch(((uint)index)&0xff); +} + +static void writeString(QFile &f,const char *s) +{ + const char *p = s; + while (*p) f.putch(*p++); + f.putch(0); +} + +void SearchIndex::write(const char *fileName) +{ + int i; + int size=4; // for the header + size+=4*numIndexEntries; // for the index + int wordsOffset = size; + // first pass: compute the size of the wordlist + for (i=0;i *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + int ws = iw->word().length()+1; + size+=ws+4; // word + url info list offset + } + size+=1; // zero list terminator + } + } + + // second pass: compute the offsets in the index + int indexOffsets[numIndexEntries]; + int offset=wordsOffset; + for (i=0;i *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + indexOffsets[i]=offset; + QListIterator iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + offset+= iw->word().length()+1; + offset+=4; // word + offset to url info array + } + offset+=1; // zero list terminator + } + else + { + indexOffsets[i]=0; + } + } + int padding = size; + size = (size+3)&~3; // round up to 4 byte boundary + padding = size - padding; + + //int statsOffset = size; + QDictIterator wdi(m_words); + //IndexWord *iw; + int *wordStatOffsets = new int[m_words.count()]; + + int count=0; + + // third pass: compute offset to stats info for each word + for (i=0;i *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + //printf("wordStatOffsets[%d]=%d\n",count,size); + wordStatOffsets[count++] = size; + size+=4+iw->urls().count()*8; // count + (url_index,freq) per url + } + } + } + int *urlOffsets = new int[m_urls.count()]; + //int urlsOffset = size; + QIntDictIterator udi(m_urls); + URL *url; + for (udi.toFirst();(url=udi.current());++udi) + { + urlOffsets[udi.currentKey()]=size; + size+=url->name.length()+1+ + url->url.length()+1; + } + //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset); + QFile f(fileName); + if (f.open(IO_WriteOnly)) + { + // write header + f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S'); + // write index + for (i=0;i *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + writeString(f,iw->word()); + writeInt(f,wordStatOffsets[count++]); + } + f.putch(0); + } + } + // write extra padding bytes + for (i=0;i *wlist = m_index[i]; + if (!wlist->isEmpty()) + { + QListIterator iwi(*wlist); + IndexWord *iw; + for (iwi.toFirst();(iw=iwi.current());++iwi) + { + int numUrls = iw->urls().count(); + writeInt(f,numUrls); + QIntDictIterator uli(iw->urls()); + URLInfo *ui; + for (uli.toFirst();(ui=uli.current());++uli) + { + writeInt(f,urlOffsets[ui->urlIdx]); + writeInt(f,ui->freq); + } + } + } + } + // write urls + QIntDictIterator udi(m_urls); + URL *url; + for (udi.toFirst();(url=udi.current());++udi) + { + writeString(f,url->name); + writeString(f,url->url); + } + } + + delete[] urlOffsets; + delete[] wordStatOffsets; +} +