Orb/Doxygen/src/searchindex.cpp
changeset 0 42188c7ea2d9
child 4 468f4c8d3d5b
equal deleted inserted replaced
-1:000000000000 0:42188c7ea2d9
       
     1 /******************************************************************************
       
     2  *
       
     3  * 
       
     4  *
       
     5  * Copyright (C) 1997-2008 by Dimitri van Heesch.
       
     6  *
       
     7  * Permission to use, copy, modify, and distribute this software and its
       
     8  * documentation under the terms of the GNU General Public License is hereby 
       
     9  * granted. No representations are made about the suitability of this software 
       
    10  * for any purpose. It is provided "as is" without express or implied warranty.
       
    11  * See the GNU General Public License for more details.
       
    12  *
       
    13  * Documents produced by Doxygen are derivative works derived from the
       
    14  * input used in their production; they are not affected by this license.
       
    15  *
       
    16  */
       
    17 
       
    18 #include "qtbc.h"
       
    19 #include "searchindex.h"
       
    20 #include "config.h"
       
    21 #include "util.h"
       
    22 #include <qfile.h>
       
    23 #include <ctype.h>
       
    24 #include <qregexp.h>
       
    25 
       
    26 
       
    27 // file format: (all multi-byte values are stored in big endian format)
       
    28 //   4 byte header
       
    29 //   256*256*4 byte index (4 bytes)
       
    30 //   for each index entry: a zero terminated list of words 
       
    31 //   for each word: a \0 terminated string + 4 byte offset to the stats info
       
    32 //   padding bytes to align at 4 byte boundary
       
    33 //   for each word: the number of urls (4 bytes) 
       
    34 //               + for each url containing the word 8 bytes statistics
       
    35 //                 (4 bytes index to url string + 4 bytes frequency counter)
       
    36 //   for each url: a \0 terminated string
       
    37 
       
    38 const int numIndexEntries = 256*256;
       
    39 
       
    40 //--------------------------------------------------------------------
       
    41 
       
    42 IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
       
    43 {
       
    44   m_urls.setAutoDelete(TRUE);
       
    45   //printf("IndexWord::IndexWord(%s)\n",word);
       
    46 }
       
    47 
       
    48 void IndexWord::addUrlIndex(int idx,bool hiPriority)
       
    49 {
       
    50   //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
       
    51   URLInfo *ui = m_urls.find(idx);
       
    52   if (ui==0)
       
    53   {
       
    54     //printf("URLInfo::URLInfo(%d)\n",idx);
       
    55     ui=new URLInfo(idx,0);
       
    56     m_urls.insert(idx,ui);
       
    57   }
       
    58   ui->freq+=2;
       
    59   if (hiPriority) ui->freq|=1; // mark as high priority document
       
    60 }
       
    61 
       
    62 //--------------------------------------------------------------------
       
    63 
       
    64 SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
       
    65 {
       
    66   int i;
       
    67   m_words.setAutoDelete(TRUE);
       
    68   m_urls.setAutoDelete(TRUE);
       
    69   m_index.setAutoDelete(TRUE);
       
    70   for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
       
    71 }
       
    72 
       
    73 void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
       
    74 {
       
    75   if (name==0 || baseName==0) return;
       
    76   //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
       
    77   QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
       
    78   if (anchor) url+=(QCString)"#"+anchor;  
       
    79   m_urlIndex++;
       
    80   m_urls.insert(m_urlIndex,new URL(name,url));
       
    81 }
       
    82 
       
    83 static int charsToIndex(const char *word)
       
    84 {
       
    85   if (word==0) return -1;
       
    86 
       
    87   // Fast string hashing algorithm
       
    88   //register ushort h=0;
       
    89   //const char *k = word;
       
    90   //ushort mask=0xfc00;
       
    91   //while ( *k ) 
       
    92   //{
       
    93   //  h = (h&mask)^(h<<6)^(*k++);
       
    94   //}
       
    95   //return h;
       
    96 
       
    97   // Simple hashing that allows for substring searching
       
    98   uint c1=word[0];
       
    99   if (c1==0) return -1;
       
   100   uint c2=word[1];
       
   101   if (c2==0) return -1;
       
   102   return c1*256+c2;
       
   103 }
       
   104 
       
   105 void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)
       
   106 {
       
   107   static QRegExp nextPart("[_a-z:][A-Z]");
       
   108   //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
       
   109   QString wStr(word);
       
   110   if (wStr.isEmpty()) return;
       
   111   wStr=wStr.lower();
       
   112   IndexWord *w = m_words[wStr];
       
   113   if (w==0)
       
   114   {
       
   115     int idx=charsToIndex(wStr);
       
   116     if (idx<0) return;
       
   117     w = new IndexWord(wStr);
       
   118     //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
       
   119     m_index[idx]->append(w);
       
   120     m_words.insert(wStr,w);
       
   121   }
       
   122   w->addUrlIndex(m_urlIndex,hiPriority);
       
   123   int i;
       
   124   bool found=FALSE;
       
   125   if (!recurse) // the first time we check if we can strip the prefix
       
   126   {
       
   127     i=getPrefixIndex(word);
       
   128     if (i>0)
       
   129     {
       
   130       addWord(word+i,hiPriority,TRUE);
       
   131       found=TRUE;
       
   132     }
       
   133   }
       
   134   if (!found) // no prefix stripped
       
   135   {
       
   136     if ((i=nextPart.match(word))>=1)
       
   137     {
       
   138       addWord(word+i+1,hiPriority,TRUE);
       
   139     }
       
   140   }
       
   141 }
       
   142 
       
   143 
       
   144 static void writeInt(QFile &f,int index)
       
   145 {
       
   146   f.putch(((uint)index)>>24);
       
   147   f.putch((((uint)index)>>16)&0xff);
       
   148   f.putch((((uint)index)>>8)&0xff);
       
   149   f.putch(((uint)index)&0xff);
       
   150 }
       
   151 
       
   152 static void writeString(QFile &f,const char *s)
       
   153 {
       
   154   const char *p = s;
       
   155   while (*p) f.putch(*p++);
       
   156   f.putch(0);
       
   157 }
       
   158 
       
   159 void SearchIndex::write(const char *fileName)
       
   160 {
       
   161   int i;
       
   162   int size=4; // for the header
       
   163   size+=4*numIndexEntries; // for the index
       
   164   int wordsOffset = size;
       
   165   // first pass: compute the size of the wordlist
       
   166   for (i=0;i<numIndexEntries;i++)
       
   167   {
       
   168     QList<IndexWord> *wlist = m_index[i];
       
   169     if (!wlist->isEmpty())
       
   170     {
       
   171       QListIterator<IndexWord> iwi(*wlist);
       
   172       IndexWord *iw;
       
   173       for (iwi.toFirst();(iw=iwi.current());++iwi)
       
   174       {
       
   175         int ws = iw->word().length()+1; 
       
   176         size+=ws+4; // word + url info list offset
       
   177       }
       
   178       size+=1; // zero list terminator
       
   179     }
       
   180   }
       
   181 
       
   182   // second pass: compute the offsets in the index
       
   183   int indexOffsets[numIndexEntries];
       
   184   int offset=wordsOffset;
       
   185   for (i=0;i<numIndexEntries;i++)
       
   186   {
       
   187     QList<IndexWord> *wlist = m_index[i];
       
   188     if (!wlist->isEmpty())
       
   189     {
       
   190       indexOffsets[i]=offset;
       
   191       QListIterator<IndexWord> iwi(*wlist);
       
   192       IndexWord *iw;
       
   193       for (iwi.toFirst();(iw=iwi.current());++iwi)
       
   194       {
       
   195         offset+= iw->word().length()+1; 
       
   196         offset+=4; // word + offset to url info array 
       
   197       }
       
   198       offset+=1; // zero list terminator
       
   199     }
       
   200     else
       
   201     {
       
   202       indexOffsets[i]=0;
       
   203     }
       
   204   }
       
   205   int padding = size;
       
   206   size = (size+3)&~3; // round up to 4 byte boundary
       
   207   padding = size - padding;
       
   208 
       
   209   //int statsOffset = size;
       
   210   QDictIterator<IndexWord> wdi(m_words);
       
   211   //IndexWord *iw;
       
   212   int *wordStatOffsets = new int[m_words.count()];
       
   213   
       
   214   int count=0;
       
   215 
       
   216   // third pass: compute offset to stats info for each word
       
   217   for (i=0;i<numIndexEntries;i++)
       
   218   {
       
   219     QList<IndexWord> *wlist = m_index[i];
       
   220     if (!wlist->isEmpty())
       
   221     {
       
   222       QListIterator<IndexWord> iwi(*wlist);
       
   223       IndexWord *iw;
       
   224       for (iwi.toFirst();(iw=iwi.current());++iwi)
       
   225       {
       
   226         //printf("wordStatOffsets[%d]=%d\n",count,size);
       
   227         wordStatOffsets[count++] = size;
       
   228         size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
       
   229       }
       
   230     }
       
   231   }
       
   232   int *urlOffsets = new int[m_urls.count()];
       
   233   //int urlsOffset = size;
       
   234   QIntDictIterator<URL> udi(m_urls);
       
   235   URL *url;
       
   236   for (udi.toFirst();(url=udi.current());++udi)
       
   237   {
       
   238     urlOffsets[udi.currentKey()]=size;
       
   239     size+=url->name.length()+1+
       
   240           url->url.length()+1;
       
   241   }
       
   242   //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
       
   243   QFile f(fileName);
       
   244   if (f.open(IO_WriteOnly))
       
   245   {
       
   246     // write header
       
   247     f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
       
   248     // write index
       
   249     for (i=0;i<numIndexEntries;i++)
       
   250     {
       
   251       writeInt(f,indexOffsets[i]);
       
   252     }
       
   253     // write word lists
       
   254     count=0;
       
   255     for (i=0;i<numIndexEntries;i++)
       
   256     {
       
   257       QList<IndexWord> *wlist = m_index[i];
       
   258       if (!wlist->isEmpty())
       
   259       {
       
   260         QListIterator<IndexWord> iwi(*wlist);
       
   261         IndexWord *iw;
       
   262         for (iwi.toFirst();(iw=iwi.current());++iwi)
       
   263         {
       
   264           writeString(f,iw->word());
       
   265           writeInt(f,wordStatOffsets[count++]);
       
   266         }
       
   267         f.putch(0);
       
   268       }
       
   269     }
       
   270     // write extra padding bytes
       
   271     for (i=0;i<padding;i++) f.putch(0);
       
   272     // write word statistics
       
   273     for (i=0;i<numIndexEntries;i++)
       
   274     {
       
   275       QList<IndexWord> *wlist = m_index[i];
       
   276       if (!wlist->isEmpty())
       
   277       {
       
   278         QListIterator<IndexWord> iwi(*wlist);
       
   279         IndexWord *iw;
       
   280         for (iwi.toFirst();(iw=iwi.current());++iwi)
       
   281         {
       
   282           int numUrls = iw->urls().count();
       
   283           writeInt(f,numUrls);
       
   284           QIntDictIterator<URLInfo> uli(iw->urls());
       
   285           URLInfo *ui;
       
   286           for (uli.toFirst();(ui=uli.current());++uli)
       
   287           {
       
   288             writeInt(f,urlOffsets[ui->urlIdx]);
       
   289             writeInt(f,ui->freq);
       
   290           }
       
   291         }
       
   292       }
       
   293     }
       
   294     // write urls
       
   295     QIntDictIterator<URL> udi(m_urls);
       
   296     URL *url;
       
   297     for (udi.toFirst();(url=udi.current());++udi)
       
   298     {
       
   299       writeString(f,url->name);
       
   300       writeString(f,url->url);
       
   301     }
       
   302   }
       
   303 
       
   304   delete[] urlOffsets;
       
   305   delete[] wordStatOffsets;
       
   306 }
       
   307