Orb/Doxygen/src/searchindex.cpp
author Jonathan Harrington <jonathan.harrington@nokia.com>
Wed, 11 Aug 2010 14:49:30 +0100
changeset 4 468f4c8d3d5b
parent 0 42188c7ea2d9
permissions -rw-r--r--
Orb version 0.2.0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     1
/******************************************************************************
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     2
 *
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     3
 * 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     4
 *
4
468f4c8d3d5b Orb version 0.2.0
Jonathan Harrington <jonathan.harrington@nokia.com>
parents: 0
diff changeset
     5
 * Copyright (C) 1997-2010 by Dimitri van Heesch.
0
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     6
 *
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     7
 * Permission to use, copy, modify, and distribute this software and its
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     8
 * documentation under the terms of the GNU General Public License is hereby 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
     9
 * granted. No representations are made about the suitability of this software 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    10
 * for any purpose. It is provided "as is" without express or implied warranty.
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    11
 * See the GNU General Public License for more details.
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    12
 *
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    13
 * Documents produced by Doxygen are derivative works derived from the
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    14
 * input used in their production; they are not affected by this license.
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    15
 *
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    16
 */
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    17
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    18
#include "qtbc.h"
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    19
#include "searchindex.h"
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    20
#include "config.h"
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    21
#include "util.h"
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    22
#include <qfile.h>
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    23
#include <ctype.h>
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    24
#include <qregexp.h>
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    25
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    26
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    27
// file format: (all multi-byte values are stored in big endian format)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    28
//   4 byte header
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    29
//   256*256*4 byte index (4 bytes)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    30
//   for each index entry: a zero terminated list of words 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    31
//   for each word: a \0 terminated string + 4 byte offset to the stats info
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    32
//   padding bytes to align at 4 byte boundary
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    33
//   for each word: the number of urls (4 bytes) 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    34
//               + for each url containing the word 8 bytes statistics
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    35
//                 (4 bytes index to url string + 4 bytes frequency counter)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    36
//   for each url: a \0 terminated string
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    37
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    38
const int numIndexEntries = 256*256;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    39
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    40
//--------------------------------------------------------------------
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    41
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    42
IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    43
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    44
  m_urls.setAutoDelete(TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    45
  //printf("IndexWord::IndexWord(%s)\n",word);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    46
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    47
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    48
void IndexWord::addUrlIndex(int idx,bool hiPriority)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    49
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    50
  //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    51
  URLInfo *ui = m_urls.find(idx);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    52
  if (ui==0)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    53
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    54
    //printf("URLInfo::URLInfo(%d)\n",idx);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    55
    ui=new URLInfo(idx,0);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    56
    m_urls.insert(idx,ui);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    57
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    58
  ui->freq+=2;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    59
  if (hiPriority) ui->freq|=1; // mark as high priority document
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    60
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    61
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    62
//--------------------------------------------------------------------
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    63
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    64
SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    65
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    66
  int i;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    67
  m_words.setAutoDelete(TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    68
  m_urls.setAutoDelete(TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    69
  m_index.setAutoDelete(TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    70
  for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    71
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    72
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    73
void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    74
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    75
  if (name==0 || baseName==0) return;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    76
  //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    77
  QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    78
  if (anchor) url+=(QCString)"#"+anchor;  
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    79
  m_urlIndex++;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    80
  m_urls.insert(m_urlIndex,new URL(name,url));
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    81
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    82
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    83
static int charsToIndex(const char *word)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    84
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    85
  if (word==0) return -1;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    86
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    87
  // Fast string hashing algorithm
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    88
  //register ushort h=0;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    89
  //const char *k = word;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    90
  //ushort mask=0xfc00;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    91
  //while ( *k ) 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    92
  //{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    93
  //  h = (h&mask)^(h<<6)^(*k++);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    94
  //}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    95
  //return h;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    96
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    97
  // Simple hashing that allows for substring searching
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    98
  uint c1=word[0];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
    99
  if (c1==0) return -1;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   100
  uint c2=word[1];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   101
  if (c2==0) return -1;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   102
  return c1*256+c2;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   103
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   104
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   105
void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   106
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   107
  static QRegExp nextPart("[_a-z:][A-Z]");
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   108
  //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   109
  QString wStr(word);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   110
  if (wStr.isEmpty()) return;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   111
  wStr=wStr.lower();
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   112
  IndexWord *w = m_words[wStr];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   113
  if (w==0)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   114
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   115
    int idx=charsToIndex(wStr);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   116
    if (idx<0) return;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   117
    w = new IndexWord(wStr);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   118
    //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   119
    m_index[idx]->append(w);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   120
    m_words.insert(wStr,w);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   121
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   122
  w->addUrlIndex(m_urlIndex,hiPriority);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   123
  int i;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   124
  bool found=FALSE;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   125
  if (!recurse) // the first time we check if we can strip the prefix
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   126
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   127
    i=getPrefixIndex(word);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   128
    if (i>0)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   129
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   130
      addWord(word+i,hiPriority,TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   131
      found=TRUE;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   132
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   133
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   134
  if (!found) // no prefix stripped
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   135
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   136
    if ((i=nextPart.match(word))>=1)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   137
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   138
      addWord(word+i+1,hiPriority,TRUE);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   139
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   140
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   141
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   142
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   143
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   144
static void writeInt(QFile &f,int index)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   145
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   146
  f.putch(((uint)index)>>24);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   147
  f.putch((((uint)index)>>16)&0xff);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   148
  f.putch((((uint)index)>>8)&0xff);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   149
  f.putch(((uint)index)&0xff);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   150
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   151
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   152
static void writeString(QFile &f,const char *s)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   153
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   154
  const char *p = s;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   155
  while (*p) f.putch(*p++);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   156
  f.putch(0);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   157
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   158
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   159
void SearchIndex::write(const char *fileName)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   160
{
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   161
  int i;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   162
  int size=4; // for the header
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   163
  size+=4*numIndexEntries; // for the index
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   164
  int wordsOffset = size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   165
  // first pass: compute the size of the wordlist
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   166
  for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   167
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   168
    QList<IndexWord> *wlist = m_index[i];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   169
    if (!wlist->isEmpty())
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   170
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   171
      QListIterator<IndexWord> iwi(*wlist);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   172
      IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   173
      for (iwi.toFirst();(iw=iwi.current());++iwi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   174
      {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   175
        int ws = iw->word().length()+1; 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   176
        size+=ws+4; // word + url info list offset
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   177
      }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   178
      size+=1; // zero list terminator
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   179
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   180
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   181
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   182
  // second pass: compute the offsets in the index
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   183
  int indexOffsets[numIndexEntries];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   184
  int offset=wordsOffset;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   185
  for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   186
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   187
    QList<IndexWord> *wlist = m_index[i];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   188
    if (!wlist->isEmpty())
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   189
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   190
      indexOffsets[i]=offset;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   191
      QListIterator<IndexWord> iwi(*wlist);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   192
      IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   193
      for (iwi.toFirst();(iw=iwi.current());++iwi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   194
      {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   195
        offset+= iw->word().length()+1; 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   196
        offset+=4; // word + offset to url info array 
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   197
      }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   198
      offset+=1; // zero list terminator
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   199
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   200
    else
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   201
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   202
      indexOffsets[i]=0;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   203
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   204
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   205
  int padding = size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   206
  size = (size+3)&~3; // round up to 4 byte boundary
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   207
  padding = size - padding;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   208
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   209
  //int statsOffset = size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   210
  QDictIterator<IndexWord> wdi(m_words);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   211
  //IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   212
  int *wordStatOffsets = new int[m_words.count()];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   213
  
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   214
  int count=0;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   215
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   216
  // third pass: compute offset to stats info for each word
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   217
  for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   218
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   219
    QList<IndexWord> *wlist = m_index[i];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   220
    if (!wlist->isEmpty())
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   221
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   222
      QListIterator<IndexWord> iwi(*wlist);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   223
      IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   224
      for (iwi.toFirst();(iw=iwi.current());++iwi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   225
      {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   226
        //printf("wordStatOffsets[%d]=%d\n",count,size);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   227
        wordStatOffsets[count++] = size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   228
        size+=4+iw->urls().count()*8; // count + (url_index,freq) per url
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   229
      }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   230
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   231
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   232
  int *urlOffsets = new int[m_urls.count()];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   233
  //int urlsOffset = size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   234
  QIntDictIterator<URL> udi(m_urls);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   235
  URL *url;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   236
  for (udi.toFirst();(url=udi.current());++udi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   237
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   238
    urlOffsets[udi.currentKey()]=size;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   239
    size+=url->name.length()+1+
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   240
          url->url.length()+1;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   241
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   242
  //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   243
  QFile f(fileName);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   244
  if (f.open(IO_WriteOnly))
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   245
  {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   246
    // write header
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   247
    f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   248
    // write index
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   249
    for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   250
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   251
      writeInt(f,indexOffsets[i]);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   252
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   253
    // write word lists
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   254
    count=0;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   255
    for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   256
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   257
      QList<IndexWord> *wlist = m_index[i];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   258
      if (!wlist->isEmpty())
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   259
      {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   260
        QListIterator<IndexWord> iwi(*wlist);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   261
        IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   262
        for (iwi.toFirst();(iw=iwi.current());++iwi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   263
        {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   264
          writeString(f,iw->word());
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   265
          writeInt(f,wordStatOffsets[count++]);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   266
        }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   267
        f.putch(0);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   268
      }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   269
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   270
    // write extra padding bytes
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   271
    for (i=0;i<padding;i++) f.putch(0);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   272
    // write word statistics
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   273
    for (i=0;i<numIndexEntries;i++)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   274
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   275
      QList<IndexWord> *wlist = m_index[i];
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   276
      if (!wlist->isEmpty())
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   277
      {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   278
        QListIterator<IndexWord> iwi(*wlist);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   279
        IndexWord *iw;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   280
        for (iwi.toFirst();(iw=iwi.current());++iwi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   281
        {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   282
          int numUrls = iw->urls().count();
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   283
          writeInt(f,numUrls);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   284
          QIntDictIterator<URLInfo> uli(iw->urls());
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   285
          URLInfo *ui;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   286
          for (uli.toFirst();(ui=uli.current());++uli)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   287
          {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   288
            writeInt(f,urlOffsets[ui->urlIdx]);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   289
            writeInt(f,ui->freq);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   290
          }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   291
        }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   292
      }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   293
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   294
    // write urls
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   295
    QIntDictIterator<URL> udi(m_urls);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   296
    URL *url;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   297
    for (udi.toFirst();(url=udi.current());++udi)
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   298
    {
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   299
      writeString(f,url->name);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   300
      writeString(f,url->url);
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   301
    }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   302
  }
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   303
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   304
  delete[] urlOffsets;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   305
  delete[] wordStatOffsets;
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   306
}
42188c7ea2d9 Initial contribution of ORB delivering Feature bug 1460
szarinda <>
parents:
diff changeset
   307