/******************************************************************************
*
*
*
* Copyright (C) 1997-2008 by Dimitri van Heesch.
*
* Permission to use, copy, modify, and distribute this software and its
* documentation under the terms of the GNU General Public License is hereby
* granted. No representations are made about the suitability of this software
* for any purpose. It is provided "as is" without express or implied warranty.
* See the GNU General Public License for more details.
*
* Documents produced by Doxygen are derivative works derived from the
* input used in their production; they are not affected by this license.
*
*/
#include "qtbc.h"
#include "searchindex.h"
#include "config.h"
#include "util.h"
#include <qfile.h>
#include <ctype.h>
#include <qregexp.h>
// file format: (all multi-byte values are stored in big endian format)
// 4 byte header
// 256*256*4 byte index (4 bytes)
// for each index entry: a zero terminated list of words
// for each word: a \0 terminated string + 4 byte offset to the stats info
// padding bytes to align at 4 byte boundary
// for each word: the number of urls (4 bytes)
// + for each url containing the word 8 bytes statistics
// (4 bytes index to url string + 4 bytes frequency counter)
// for each url: a \0 terminated string
const int numIndexEntries = 256*256;
//--------------------------------------------------------------------
IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
{
m_urls.setAutoDelete(TRUE);
//printf("IndexWord::IndexWord(%s)\n",word);
}
void IndexWord::addUrlIndex(int idx,bool hiPriority)
{
//printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
URLInfo *ui = m_urls.find(idx);
if (ui==0)
{
//printf("URLInfo::URLInfo(%d)\n",idx);
ui=new URLInfo(idx,0);
m_urls.insert(idx,ui);
}
ui->freq+=2;
if (hiPriority) ui->freq|=1; // mark as high priority document
}
//--------------------------------------------------------------------
SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
{
int i;
m_words.setAutoDelete(TRUE);
m_urls.setAutoDelete(TRUE);
m_index.setAutoDelete(TRUE);
for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
}
void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
{
if (name==0 || baseName==0) return;
//printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
if (anchor) url+=(QCString)"#"+anchor;
m_urlIndex++;
m_urls.insert(m_urlIndex,new URL(name,url));
}
static int charsToIndex(const char *word)
{
if (word==0) return -1;
// Fast string hashing algorithm
//register ushort h=0;
//const char *k = word;
//ushort mask=0xfc00;
//while ( *k )
//{
// h = (h&mask)^(h<<6)^(*k++);
//}
//return h;
// Simple hashing that allows for substring searching
uint c1=word[0];
if (c1==0) return -1;
uint c2=word[1];
if (c2==0) return -1;
return c1*256+c2;
}
void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)
{
static QRegExp nextPart("[_a-z:][A-Z]");
//printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
QString wStr(word);
if (wStr.isEmpty()) return;
wStr=wStr.lower();
IndexWord *w = m_words[wStr];
if (w==0)
{
int idx=charsToIndex(wStr);
if (idx<0) return;
w = new IndexWord(wStr);
//fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
m_index[idx]->append(w);
m_words.insert(wStr,w);
}
w->addUrlIndex(m_urlIndex,hiPriority);
int i;
bool found=FALSE;
if (!recurse) // the first time we check if we can strip the prefix
{
i=getPrefixIndex(word);
if (i>0)
{
addWord(word+i,hiPriority,TRUE);
found=TRUE;
}
}
if (!found) // no prefix stripped
{
if ((i=nextPart.match(word))>=1)
{
addWord(word+i+1,hiPriority,TRUE);
}
}
}
static void writeInt(QFile &f,int index)
{
f.putch(((uint)index)>>24);
f.putch((((uint)index)>>16)&0xff);
f.putch((((uint)index)>>8)&0xff);
f.putch(((uint)index)&0xff);
}
static void writeString(QFile &f,const char *s)
{
const char *p = s;
while (*p) f.putch(*p++);
f.putch(0);
}
void SearchIndex::write(const char *fileName)
{
int i;
int size=4; // for the header
size+=4*numIndexEntries; // for the index
int wordsOffset = size;
// first pass: compute the size of the wordlist
for (i=0;i<numIndexEntries;i++)
{
QList<IndexWord> *wlist = m_index[i];
if (!wlist->isEmpty())
{
QListIterator<IndexWord> iwi(*wlist);
IndexWord *iw;
for (iwi.toFirst();(iw=iwi.current());++iwi)
{
int ws = iw->word().length()+1;
size+=ws+4; // word + url info list offset
}
size+=1; // zero list terminator
}
}
// second pass: compute the offsets in the index
int indexOffsets[numIndexEntries];
int offset=wordsOffset;
for (i=0;i<numIndexEntries;i++)
{
QList<IndexWord> *wlist = m_index[i];
if (!wlist->isEmpty())
{
indexOffsets[i]=offset;
QListIterator<IndexWord> iwi(*wlist);
IndexWord *iw;
for (iwi.toFirst();(iw=iwi.current());++iwi)
{
offset+= iw->word().length()+1;
offset+=4; // word + offset to url info array
}
offset+=1; // zero list terminator
}
else
{
indexOffsets[i]=0;
}
}
int padding = size;
size = (size+3)&~3; // round up to 4 byte boundary
padding = size - padding;
//int statsOffset = size;
QDictIterator<IndexWord> wdi(m_words);
//IndexWord *iw;
int *wordStatOffsets = new int[m_words.count()];
int count=0;
// third pass: compute offset to stats info for each word
for (i=0;i<numIndexEntries;i++)
{
QList<IndexWord> *wlist = m_index[i];
if (!wlist->isEmpty())
{
QListIterator<IndexWord> iwi(*wlist);
IndexWord *iw;
for (iwi.toFirst();(iw=iwi.current());++iwi)
{
//printf("wordStatOffsets[%d]=%d\n",count,size);
wordStatOffsets[count++] = size;
size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
}
}
}
int *urlOffsets = new int[m_urls.count()];
//int urlsOffset = size;
QIntDictIterator<URL> udi(m_urls);
URL *url;
for (udi.toFirst();(url=udi.current());++udi)
{
urlOffsets[udi.currentKey()]=size;
size+=url->name.length()+1+
url->url.length()+1;
}
//printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
QFile f(fileName);
if (f.open(IO_WriteOnly))
{
// write header
f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
// write index
for (i=0;i<numIndexEntries;i++)
{
writeInt(f,indexOffsets[i]);
}
// write word lists
count=0;
for (i=0;i<numIndexEntries;i++)
{
QList<IndexWord> *wlist = m_index[i];
if (!wlist->isEmpty())
{
QListIterator<IndexWord> iwi(*wlist);
IndexWord *iw;
for (iwi.toFirst();(iw=iwi.current());++iwi)
{
writeString(f,iw->word());
writeInt(f,wordStatOffsets[count++]);
}
f.putch(0);
}
}
// write extra padding bytes
for (i=0;i<padding;i++) f.putch(0);
// write word statistics
for (i=0;i<numIndexEntries;i++)
{
QList<IndexWord> *wlist = m_index[i];
if (!wlist->isEmpty())
{
QListIterator<IndexWord> iwi(*wlist);
IndexWord *iw;
for (iwi.toFirst();(iw=iwi.current());++iwi)
{
int numUrls = iw->urls().count();
writeInt(f,numUrls);
QIntDictIterator<URLInfo> uli(iw->urls());
URLInfo *ui;
for (uli.toFirst();(ui=uli.current());++uli)
{
writeInt(f,urlOffsets[ui->urlIdx]);
writeInt(f,ui->freq);
}
}
}
}
// write urls
QIntDictIterator<URL> udi(m_urls);
URL *url;
for (udi.toFirst();(url=udi.current());++udi)
{
writeString(f,url->name);
writeString(f,url->url);
}
}
delete[] urlOffsets;
delete[] wordStatOffsets;
}