tools/assistant/lib/qhelpsearchindexreader_default.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
child 7 f7bc934e204c
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Assistant of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qhelpenginecore.h"
       
    43 #include "qhelpsearchindexreader_default_p.h"
       
    44 
       
    45 #include <QtCore/QDir>
       
    46 #include <QtCore/QUrl>
       
    47 #include <QtCore/QFile>
       
    48 #include <QtCore/QVariant>
       
    49 #include <QtCore/QFileInfo>
       
    50 #include <QtCore/QDataStream>
       
    51 #include <QtCore/QTextStream>
       
    52 
       
    53 QT_BEGIN_NAMESPACE
       
    54 
       
    55 namespace qt {
       
    56     namespace fulltextsearch {
       
    57         namespace std {
       
    58 
       
    59 namespace {
       
    60     QStringList split( const QString &str )
       
    61     {
       
    62         QStringList lst;
       
    63         int j = 0;
       
    64         int i = str.indexOf(QLatin1Char('*'), j );
       
    65 
       
    66         if (str.startsWith(QLatin1String("*")))
       
    67             lst << QLatin1String("*");
       
    68 
       
    69         while ( i != -1 ) {
       
    70             if ( i > j && i <= (int)str.length() ) {
       
    71                 lst << str.mid( j, i - j );
       
    72                 lst << QLatin1String("*");
       
    73             }
       
    74             j = i + 1;
       
    75             i = str.indexOf(QLatin1Char('*'), j );
       
    76         }
       
    77 
       
    78         int l = str.length() - 1;
       
    79         if ( str.mid( j, l - j + 1 ).length() > 0 )
       
    80             lst << str.mid( j, l - j + 1 );
       
    81 
       
    82         return lst;
       
    83     }
       
    84 }
       
    85 
       
    86 
       
    87 Reader::Reader()
       
    88     : indexPath(QString())
       
    89     , indexFile(QString())
       
    90     , documentFile(QString())
       
    91 {
       
    92     termList.clear();
       
    93     indexTable.clear();
       
    94     searchIndexTable.clear();
       
    95 }
       
    96 
       
    97 Reader::~Reader()
       
    98 {
       
    99     reset();
       
   100     searchIndexTable.clear();
       
   101 }
       
   102 
       
   103 bool Reader::readIndex()
       
   104 {
       
   105     if (indexTable.contains(indexFile))
       
   106         return true;
       
   107 
       
   108     QFile idxFile(indexFile);
       
   109     if (!idxFile.open(QFile::ReadOnly))
       
   110         return false;
       
   111 
       
   112     QString key;
       
   113     int numOfDocs;
       
   114     EntryTable entryTable;
       
   115     QVector<Document> docs;
       
   116     QDataStream dictStream(&idxFile);
       
   117     while (!dictStream.atEnd()) {
       
   118         dictStream >> key;
       
   119         dictStream >> numOfDocs;
       
   120         docs.resize(numOfDocs);
       
   121         dictStream >> docs;
       
   122         entryTable.insert(key, new Entry(docs));
       
   123     }
       
   124     idxFile.close();
       
   125 
       
   126     if (entryTable.isEmpty())
       
   127         return false;
       
   128 
       
   129     QFile docFile(documentFile);
       
   130     if (!docFile.open(QFile::ReadOnly))
       
   131         return false;
       
   132 
       
   133     QString title, url;
       
   134     DocumentList documentList;
       
   135     QDataStream docStream(&docFile);
       
   136     while (!docStream.atEnd()) {
       
   137         docStream >> title;
       
   138         docStream >> url;
       
   139         documentList.append(QStringList(title) << url);
       
   140     }
       
   141     docFile.close();
       
   142 
       
   143     if (documentList.isEmpty()) {
       
   144         cleanupIndex(entryTable);
       
   145         return false;
       
   146     }
       
   147 
       
   148     indexTable.insert(indexFile, Index(entryTable, documentList));
       
   149     return true;
       
   150 }
       
   151 
       
   152 bool Reader::initCheck() const
       
   153 {
       
   154     return !searchIndexTable.isEmpty();
       
   155 }
       
   156 
       
   157 void Reader::setIndexPath(const QString &path)
       
   158 {
       
   159     indexPath = path;
       
   160 }
       
   161 
       
   162 void Reader::filterFilesForAttributes(const QStringList &attributes)
       
   163 {
       
   164     searchIndexTable.clear();
       
   165     for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
       
   166         const QString fileName = it.key();
       
   167         bool containsAll = true;
       
   168         QStringList split = fileName.split(QLatin1String("@"));
       
   169         foreach (const QString attribute, attributes) {
       
   170             if (!split.contains(attribute, Qt::CaseInsensitive)) {
       
   171                 containsAll = false;
       
   172                 break;
       
   173             }
       
   174         }
       
   175 
       
   176         if (containsAll)
       
   177             searchIndexTable.insert(fileName, it.value());
       
   178     }
       
   179 }
       
   180 
       
   181 void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
       
   182 {
       
   183     QString extention = namespaceName + QLatin1String("@") + attributes;
       
   184     indexFile = indexPath + QLatin1String("/indexdb40.") + extention;
       
   185     documentFile = indexPath + QLatin1String("/indexdoc40.") + extention;
       
   186 }
       
   187 
       
   188 bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
       
   189                                   QStringList *termSeq, QStringList *seqWords)
       
   190 {
       
   191     QString term = searchTerm;
       
   192 
       
   193     term = term.simplified();
       
   194     term = term.replace(QLatin1String("\'"), QLatin1String("\""));
       
   195     term = term.replace(QLatin1String("`"), QLatin1String("\""));
       
   196     term = term.replace(QLatin1String("-"), QLatin1String(" "));
       
   197     term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
       
   198 
       
   199     *terms = term.split(QLatin1Char(' '));
       
   200     QStringList::iterator it = terms->begin();
       
   201     for (; it != terms->end(); ++it) {
       
   202         (*it) = (*it).simplified();
       
   203         (*it) = (*it).toLower();
       
   204         (*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
       
   205     }
       
   206 
       
   207     if (term.contains(QLatin1Char('\"'))) {
       
   208         if ((term.count(QLatin1Char('\"')))%2 == 0) {
       
   209             int beg = 0;
       
   210             int end = 0;
       
   211             QString s;
       
   212             beg = term.indexOf(QLatin1Char('\"'), beg);
       
   213             while (beg != -1) {
       
   214                 beg++;
       
   215                 end = term.indexOf(QLatin1Char('\"'), beg);
       
   216                 s = term.mid(beg, end - beg);
       
   217                 s = s.toLower();
       
   218                 s = s.simplified();
       
   219                 if (s.contains(QLatin1Char('*'))) {
       
   220                     qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
       
   221                     return false;
       
   222                 }
       
   223                 *seqWords += s.split(QLatin1Char(' '));
       
   224                 *termSeq << s;
       
   225                 beg = term.indexOf(QLatin1Char('\"'), end + 1);
       
   226             }
       
   227         } else {
       
   228             qWarning("Full Text Search, the closing quotation mark is missing.");
       
   229             return false;
       
   230         }
       
   231     }
       
   232 
       
   233     return true;
       
   234 }
       
   235 
       
   236 void Reader::searchInIndex(const QStringList &terms)
       
   237 {
       
   238     foreach (const QString term, terms) {
       
   239         QVector<Document> documents;
       
   240 
       
   241         for(IndexTable::ConstIterator it = searchIndexTable.begin();
       
   242             it != searchIndexTable.end(); ++it) {
       
   243             EntryTable entryTable = it.value().first;
       
   244             DocumentList documentList = it.value().second;
       
   245 
       
   246             if (term.contains(QLatin1Char('*')))
       
   247                 documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
       
   248             else if (entryTable.value(term))
       
   249                 documents = entryTable.value(term)->documents;
       
   250             else
       
   251                 continue;
       
   252 
       
   253             if (!documents.isEmpty()) {
       
   254                 DocumentInfo info;
       
   255                 QString title, url;
       
   256                 QVector<DocumentInfo> documentsInfo;
       
   257                 foreach(const Document doc, documents) {
       
   258                     info.docNumber = doc.docNumber;
       
   259                     info.frequency = doc.frequency;
       
   260                     info.documentUrl = documentList.at(doc.docNumber).at(1);
       
   261                     info.documentTitle = documentList.at(doc.docNumber).at(0);
       
   262                     documentsInfo.append(info);
       
   263                 }
       
   264 
       
   265                 bool found = false;
       
   266                 for(QList<TermInfo>::Iterator tit = termList.begin();
       
   267                     tit != termList.end(); ++tit) {
       
   268                     TermInfo *t = &(*tit);
       
   269                     if(t->term == term) {
       
   270                         t->documents += documentsInfo;
       
   271                         t->frequency += documentsInfo.count();
       
   272                         found = true; break;
       
   273                     }
       
   274                 }
       
   275                 if (!found)
       
   276                     termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
       
   277             }
       
   278         }
       
   279     }
       
   280     qSort(termList);
       
   281 }
       
   282 
       
   283 QVector<DocumentInfo> Reader::hits()
       
   284 {
       
   285     QVector<DocumentInfo> documents;
       
   286     if (!termList.count())
       
   287         return documents;
       
   288 
       
   289     documents = termList.takeFirst().documents;
       
   290     for(QList<TermInfo>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   291         TermInfo *t = &(*it);
       
   292         QVector<DocumentInfo> docs = t->documents;
       
   293         for(QVector<DocumentInfo>::Iterator minDoc_it = documents.begin();
       
   294             minDoc_it != documents.end(); ) {
       
   295             bool found = false;
       
   296             for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin();
       
   297                 doc_it != docs.constEnd(); ++doc_it ) {
       
   298                 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
       
   299                     (*minDoc_it).frequency += (*doc_it).frequency;
       
   300                     found = true;
       
   301                     break;
       
   302                 }
       
   303             }
       
   304             if (!found)
       
   305                 minDoc_it = documents.erase(minDoc_it);
       
   306             else
       
   307                 ++minDoc_it;
       
   308         }
       
   309     }
       
   310 
       
   311     qSort(documents);
       
   312     return documents;
       
   313 }
       
   314 
       
   315 bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
       
   316                                    const QByteArray &data)
       
   317 {
       
   318     if (data.isEmpty())
       
   319         return false;
       
   320 
       
   321     for(QHash<QString, PosEntry*>::ConstIterator mit =
       
   322         miniIndex.begin(); mit != miniIndex.end(); ++mit) {
       
   323             delete mit.value();
       
   324     }
       
   325     miniIndex.clear();
       
   326 
       
   327     wordNum = 3;
       
   328     QStringList::ConstIterator cIt = words.begin();
       
   329     for ( ; cIt != words.end(); ++cIt )
       
   330         miniIndex.insert(*cIt, new PosEntry(0));
       
   331 
       
   332     QTextStream s(data);
       
   333     QString text = s.readAll();
       
   334     bool valid = true;
       
   335     const QChar *buf = text.unicode();
       
   336     QChar str[64];
       
   337     QChar c = buf[0];
       
   338     int j = 0;
       
   339     int i = 0;
       
   340     while ( j < text.length() ) {
       
   341         if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
       
   342             valid = false;
       
   343             if ( i > 1 )
       
   344                 buildMiniIndex( QString(str,i) );
       
   345             i = 0;
       
   346             c = buf[++j];
       
   347             continue;
       
   348         }
       
   349         if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
       
   350             valid = true;
       
   351             c = buf[++j];
       
   352             continue;
       
   353         }
       
   354         if ( !valid ) {
       
   355             c = buf[++j];
       
   356             continue;
       
   357         }
       
   358         if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
       
   359             str[i] = c.toLower();
       
   360             ++i;
       
   361         } else {
       
   362             if ( i > 1 )
       
   363                 buildMiniIndex( QString(str,i) );
       
   364             i = 0;
       
   365         }
       
   366         c = buf[++j];
       
   367     }
       
   368     if ( i > 1 )
       
   369         buildMiniIndex( QString(str,i) );
       
   370 
       
   371     QStringList::ConstIterator patIt = patterns.begin();
       
   372     QStringList wordLst;
       
   373     QList<uint> a, b;
       
   374     QList<uint>::iterator aIt;
       
   375     for ( ; patIt != patterns.end(); ++patIt ) {
       
   376         wordLst = (*patIt).split(QLatin1Char(' '));
       
   377         a = miniIndex[ wordLst[0] ]->positions;
       
   378         for ( int j = 1; j < (int)wordLst.count(); ++j ) {
       
   379             b = miniIndex[ wordLst[j] ]->positions;
       
   380             aIt = a.begin();
       
   381             while ( aIt != a.end() ) {
       
   382                 if ( b.contains( *aIt + 1 )) {
       
   383                     (*aIt)++;
       
   384                     ++aIt;
       
   385                 } else {
       
   386                     aIt = a.erase( aIt );
       
   387                 }
       
   388             }
       
   389         }
       
   390     }
       
   391     if ( a.count() )
       
   392         return true;
       
   393     return false;
       
   394 }
       
   395 
       
   396 QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
       
   397                                               const EntryTable &entryTable)
       
   398 {
       
   399     QList<Term> termList;
       
   400     for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
       
   401         if (entryTable.value(*it)) {
       
   402             Entry *e = entryTable.value(*it);
       
   403             termList.append(Term(*it, e->documents.count(), e->documents ) );
       
   404         }
       
   405     }
       
   406     QVector<Document> maxList(0);
       
   407     if ( !termList.count() )
       
   408         return maxList;
       
   409     qSort(termList);
       
   410 
       
   411     maxList = termList.takeLast().documents;
       
   412     for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   413         Term *t = &(*it);
       
   414         QVector<Document> docs = t->documents;
       
   415         for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
       
   416             if ( maxList.indexOf( *docIt ) == -1 )
       
   417                 maxList.append( *docIt );
       
   418         }
       
   419     }
       
   420     return maxList;
       
   421 }
       
   422 
       
   423 QStringList Reader::getWildcardTerms(const QString &term,
       
   424                                           const EntryTable &entryTable)
       
   425 {
       
   426     QStringList lst;
       
   427     QStringList terms = split(term);
       
   428     QStringList::Iterator iter;
       
   429 
       
   430     for(EntryTable::ConstIterator it = entryTable.begin();
       
   431         it != entryTable.end(); ++it) {
       
   432         int index = 0;
       
   433         bool found = false;
       
   434         QString text( it.key() );
       
   435         for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
       
   436             if ( *iter == QLatin1String("*") ) {
       
   437                 found = true;
       
   438                 continue;
       
   439             }
       
   440             if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
       
   441                 found = false;
       
   442                 break;
       
   443             }
       
   444             index = text.indexOf( *iter, index );
       
   445             if ( *iter == terms.last() && index != (int)text.length()-1 ) {
       
   446                 index = text.lastIndexOf( *iter );
       
   447                 if ( index != (int)text.length() - (int)(*iter).length() ) {
       
   448                     found = false;
       
   449                     break;
       
   450                 }
       
   451             }
       
   452             if ( index != -1 ) {
       
   453                 found = true;
       
   454                 index += (*iter).length();
       
   455                 continue;
       
   456             } else {
       
   457                 found = false;
       
   458                 break;
       
   459             }
       
   460         }
       
   461         if (found)
       
   462             lst << text;
       
   463     }
       
   464 
       
   465     return lst;
       
   466 }
       
   467 
       
   468 void Reader::buildMiniIndex(const QString &string)
       
   469 {
       
   470     if (miniIndex[string])
       
   471         miniIndex[string]->positions.append(wordNum);
       
   472     ++wordNum;
       
   473 }
       
   474 
       
   475 void Reader::reset()
       
   476 {
       
   477     for(IndexTable::Iterator it = indexTable.begin();
       
   478         it != indexTable.end(); ++it) {
       
   479         cleanupIndex(it.value().first);
       
   480         it.value().second.clear();
       
   481     }
       
   482 }
       
   483 
       
   484 void Reader::cleanupIndex(EntryTable &entryTable)
       
   485 {
       
   486     for(EntryTable::ConstIterator it =
       
   487         entryTable.begin(); it != entryTable.end(); ++it) {
       
   488             delete it.value();
       
   489     }
       
   490 
       
   491     entryTable.clear();
       
   492 }
       
   493 
       
   494 
       
   495 QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
       
   496     : QHelpSearchIndexReader()
       
   497 {
       
   498     // nothing todo
       
   499 }
       
   500 
       
   501 QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
       
   502 {
       
   503 }
       
   504 
       
   505 void QHelpSearchIndexReaderDefault::run()
       
   506 {
       
   507     mutex.lock();
       
   508 
       
   509     if (m_cancel) {
       
   510         mutex.unlock();
       
   511         return;
       
   512     }
       
   513 
       
   514     const QList<QHelpSearchQuery> &queryList = this->m_query;
       
   515     const QLatin1String key("DefaultSearchNamespaces");
       
   516     const QString collectionFile(this->m_collectionFile);
       
   517     const QString indexPath = m_indexFilesFolder;
       
   518 
       
   519     mutex.unlock();
       
   520 
       
   521     QString queryTerm;
       
   522     foreach (const QHelpSearchQuery query, queryList) {
       
   523         if (query.fieldName == QHelpSearchQuery::DEFAULT) {
       
   524             queryTerm = query.wordList.at(0);
       
   525             break;
       
   526         }
       
   527     }
       
   528 
       
   529     if (queryTerm.isEmpty())
       
   530         return;
       
   531 
       
   532     QHelpEngineCore engine(collectionFile, 0);
       
   533     if (!engine.setupData())
       
   534         return;
       
   535 
       
   536     const QStringList registeredDocs = engine.registeredDocumentations();
       
   537     const QStringList indexedNamespaces = engine.customValue(key).toString().
       
   538         split(QLatin1String("|"), QString::SkipEmptyParts);
       
   539 
       
   540     emit searchingStarted();
       
   541 
       
   542     // setup the reader
       
   543     m_reader.setIndexPath(indexPath);
       
   544     foreach(const QString namespaceName, registeredDocs) {
       
   545         mutex.lock();
       
   546         if (m_cancel) {
       
   547             mutex.unlock();
       
   548             searchingFinished(0);   // TODO: check this ???
       
   549             return;
       
   550         }
       
   551         mutex.unlock();
       
   552 
       
   553         const QList<QStringList> attributeSets =
       
   554             engine.filterAttributeSets(namespaceName);
       
   555 
       
   556         foreach (QStringList attributes, attributeSets) {
       
   557             // read all index files
       
   558             m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
       
   559             if (!m_reader.readIndex()) {
       
   560                 qWarning("Full Text Search, could not read file for namespace: %s.",
       
   561                     namespaceName.toUtf8().constData());
       
   562             }
       
   563         }
       
   564     }
       
   565 
       
   566     // get the current filter attributes and minimize the index files table
       
   567     m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
       
   568 
       
   569     hitList.clear();
       
   570     QStringList terms, termSeq, seqWords;
       
   571     if (m_reader.initCheck() && // check if we could read anything
       
   572         m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
       
   573 
       
   574         // search for term(s)
       
   575         m_reader.searchInIndex(terms);    // TODO: should this be interruptible as well ???
       
   576 
       
   577         QVector<DocumentInfo> hits = m_reader.hits();
       
   578         if (!hits.isEmpty()) {
       
   579             if (termSeq.isEmpty()) {
       
   580                 foreach (const DocumentInfo docInfo, hits) {
       
   581                     mutex.lock();
       
   582                     if (m_cancel) {
       
   583                         mutex.unlock();
       
   584                         searchingFinished(0);   // TODO: check this, speed issue while locking???
       
   585                         return;
       
   586                     }
       
   587                     mutex.unlock();
       
   588                     hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
       
   589                 }
       
   590             } else {
       
   591                 foreach (const DocumentInfo docInfo, hits) {
       
   592                     mutex.lock();
       
   593                     if (m_cancel) {
       
   594                         mutex.unlock();
       
   595                         searchingFinished(0);   // TODO: check this, speed issue while locking???
       
   596                         return;
       
   597                     }
       
   598                     mutex.unlock();
       
   599 
       
   600                     if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
       
   601                         hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
       
   602                 }
       
   603             }
       
   604         }
       
   605     }
       
   606 
       
   607     emit searchingFinished(hitList.count());
       
   608 }
       
   609 
       
   610         }   // namespace std
       
   611     }   // namespace fulltextsearch
       
   612 }   // namespace qt
       
   613 
       
   614 QT_END_NAMESPACE