tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Assistant of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qhelpenginecore.h"
       
    43 #include "fulltextsearch/qsearchable_p.h"
       
    44 #include "fulltextsearch/qqueryparser_p.h"
       
    45 #include "fulltextsearch/qindexreader_p.h"
       
    46 #include "qhelpsearchindexreader_clucene_p.h"
       
    47 
       
    48 #include <QtCore/QDir>
       
    49 #include <QtCore/QSet>
       
    50 #include <QtCore/QString>
       
    51 #include <QtCore/QFileInfo>
       
    52 #include <QtCore/QStringList>
       
    53 #include <QtCore/QTextStream>
       
    54 #include <QtCore/QMutexLocker>
       
    55 
       
    56 QT_BEGIN_NAMESPACE
       
    57 
       
    58 namespace qt {
       
    59     namespace fulltextsearch {
       
    60         namespace clucene {
       
    61 
       
    62 QHelpSearchIndexReaderClucene::QHelpSearchIndexReaderClucene()
       
    63     : QHelpSearchIndexReader()
       
    64 {
       
    65     // nothing todo
       
    66 }
       
    67 
       
    68 QHelpSearchIndexReaderClucene::~QHelpSearchIndexReaderClucene()
       
    69 {
       
    70 }
       
    71 
       
    72 
       
    73 void QHelpSearchIndexReaderClucene::run()
       
    74 {
       
    75     mutex.lock();
       
    76 
       
    77     if (m_cancel) {
       
    78         mutex.unlock();
       
    79         return;
       
    80     }
       
    81 
       
    82     const QString collectionFile(this->m_collectionFile);
       
    83     const QList<QHelpSearchQuery> &queryList = this->m_query;
       
    84     const QString indexPath(m_indexFilesFolder);
       
    85 
       
    86     mutex.unlock();
       
    87 
       
    88     QHelpEngineCore engine(collectionFile, 0);
       
    89     if (!engine.setupData())
       
    90         return;
       
    91 
       
    92     QFileInfo fInfo(indexPath);
       
    93     if (fInfo.exists() && !fInfo.isWritable()) {
       
    94         qWarning("Full Text Search, could not read index (missing permissions).");
       
    95         return;
       
    96     }
       
    97 
       
    98     if(QCLuceneIndexReader::indexExists(indexPath)) {
       
    99         mutex.lock();
       
   100         if (m_cancel) {
       
   101             mutex.unlock();
       
   102             return;
       
   103         }
       
   104         mutex.unlock();
       
   105 
       
   106         emit searchingStarted();
       
   107 
       
   108 #if !defined(QT_NO_EXCEPTIONS)
       
   109         try {
       
   110 #endif
       
   111             QCLuceneBooleanQuery booleanQuery;
       
   112             QCLuceneStandardAnalyzer analyzer;
       
   113             if (!buildQuery(booleanQuery, queryList, analyzer)) {
       
   114                 emit searchingFinished(0);
       
   115                 return;
       
   116             }
       
   117 
       
   118             const QStringList attribList = engine.filterAttributes(engine.currentFilter());
       
   119             if (!attribList.isEmpty()) {
       
   120                 QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
       
   121                     + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), analyzer);
       
   122 
       
   123                 if (!query) {
       
   124                     emit searchingFinished(0);
       
   125                     return;
       
   126                 }
       
   127                 booleanQuery.add(query, true, true, false);
       
   128             }
       
   129 
       
   130             QCLuceneIndexSearcher indexSearcher(indexPath);
       
   131             QCLuceneHits hits = indexSearcher.search(booleanQuery);
       
   132 
       
   133             bool boost = true;
       
   134             QCLuceneBooleanQuery tryHarderQuery;
       
   135             if (hits.length() == 0) {
       
   136                 if (buildTryHarderQuery(tryHarderQuery, queryList, analyzer)) {
       
   137                     if (!attribList.isEmpty()) {
       
   138                         QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
       
   139                             + attribList.join(QLatin1String(" +")), QLatin1String("attribute"),
       
   140                             analyzer);
       
   141                         tryHarderQuery.add(query, true, true, false);
       
   142                     }
       
   143                     hits = indexSearcher.search(tryHarderQuery);
       
   144                     boost = (hits.length() == 0);
       
   145                 }
       
   146             }
       
   147 
       
   148             QSet<QString> pathSet;
       
   149             QCLuceneDocument document;
       
   150             const QStringList namespaceList = engine.registeredDocumentations();
       
   151 
       
   152             for (qint32 i = 0; i < hits.length(); i++) {
       
   153                 document = hits.document(i);
       
   154                 const QString path = document.get(QLatin1String("path"));
       
   155                 if (!pathSet.contains(path) && namespaceList.contains(
       
   156                     document.get(QLatin1String("namespace")), Qt::CaseInsensitive)) {
       
   157                     pathSet.insert(path);
       
   158                     hitList.append(qMakePair(path, document.get(QLatin1String("title"))));
       
   159                 }
       
   160                 document.clear();
       
   161 
       
   162                 mutex.lock();
       
   163                 if (m_cancel) {
       
   164                     mutex.unlock();
       
   165                     emit searchingFinished(0);
       
   166                     return;
       
   167                 }
       
   168                 mutex.unlock();
       
   169             }
       
   170 
       
   171             indexSearcher.close();
       
   172             const int count = hitList.count();
       
   173             if ((count > 0) && boost)
       
   174                 boostSearchHits(engine, hitList, queryList);
       
   175             emit searchingFinished(hitList.count());
       
   176 
       
   177 #if !defined(QT_NO_EXCEPTIONS)
       
   178         } catch(...) {
       
   179             mutex.lock();
       
   180             hitList.clear();
       
   181             mutex.unlock();
       
   182             emit searchingFinished(0);
       
   183         }
       
   184 #endif
       
   185     }
       
   186 }
       
   187 
       
   188 bool QHelpSearchIndexReaderClucene::defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery,
       
   189     QCLuceneStandardAnalyzer &analyzer)
       
   190 {
       
   191     const QLatin1String c("content");
       
   192     const QLatin1String t("titleTokenized");
       
   193 
       
   194     QCLuceneQuery *query = QCLuceneQueryParser::parse(term, c, analyzer);
       
   195     QCLuceneQuery *query2 = QCLuceneQueryParser::parse(term, t, analyzer);
       
   196     if (query && query2) {
       
   197         booleanQuery.add(query, true, false, false);
       
   198         booleanQuery.add(query2, true, false, false);
       
   199         return true;
       
   200     }
       
   201 
       
   202     return false;
       
   203 }
       
   204 
       
   205 bool QHelpSearchIndexReaderClucene::buildQuery(QCLuceneBooleanQuery &booleanQuery,
       
   206     const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
       
   207 {
       
   208     foreach (const QHelpSearchQuery query, queryList) {
       
   209         switch (query.fieldName) {
       
   210             case QHelpSearchQuery::FUZZY: {
       
   211                 const QLatin1String fuzzy("~");
       
   212                 foreach (const QString &term, query.wordList) {
       
   213                     if (term.isEmpty()
       
   214                         || !defaultQuery(term.toLower() + fuzzy, booleanQuery, analyzer)) {
       
   215                         return false;
       
   216                     }
       
   217                 }
       
   218             }   break;
       
   219 
       
   220             case QHelpSearchQuery::WITHOUT: {
       
   221                 QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   222                 foreach (const QString &term, query.wordList) {
       
   223                     if (stopWords.contains(term, Qt::CaseInsensitive))
       
   224                         continue;
       
   225 
       
   226                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   227                         QLatin1String("content"), term.toLower()));
       
   228                     QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
       
   229                         QLatin1String("titleTokenized"), term.toLower()));
       
   230 
       
   231                     if (query && query2) {
       
   232                         booleanQuery.add(query, true, false, true);
       
   233                         booleanQuery.add(query2, true, false, true);
       
   234                     } else {
       
   235                         return false;
       
   236                     }
       
   237                 }
       
   238             }   break;
       
   239 
       
   240             case QHelpSearchQuery::PHRASE: {
       
   241                 const QString &term = query.wordList.at(0).toLower();
       
   242                 if (term.contains(QLatin1Char(' '))) {
       
   243                     QStringList termList = term.split(QLatin1String(" "));
       
   244                     QCLucenePhraseQuery *q = new QCLucenePhraseQuery();
       
   245                     QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   246                     foreach (const QString &term, termList) {
       
   247                         if (!stopWords.contains(term, Qt::CaseInsensitive))
       
   248                             q->addTerm(QCLuceneTerm(QLatin1String("content"), term.toLower()));
       
   249                     }
       
   250                     booleanQuery.add(q, true, true, false);
       
   251                 } else {
       
   252                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   253                         QLatin1String("content"), term.toLower()));
       
   254                     QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
       
   255                         QLatin1String("titleTokenized"), term.toLower()));
       
   256 
       
   257                     if (query && query2) {
       
   258                         booleanQuery.add(query, true, true, false);
       
   259                         booleanQuery.add(query2, true, false, false);
       
   260                     } else {
       
   261                         return false;
       
   262                     }
       
   263                 }
       
   264             }   break;
       
   265 
       
   266             case QHelpSearchQuery::ALL: {
       
   267                 QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
       
   268                 foreach (const QString &term, query.wordList) {
       
   269                     if (stopWords.contains(term, Qt::CaseInsensitive))
       
   270                         continue;
       
   271 
       
   272                     QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
       
   273                         QLatin1String("content"), term.toLower()));
       
   274 
       
   275                     if (query) {
       
   276                         booleanQuery.add(query, true, true, false);
       
   277                     } else {
       
   278                         return false;
       
   279                     }
       
   280                 }
       
   281             }   break;
       
   282 
       
   283             case QHelpSearchQuery::DEFAULT: {
       
   284                 foreach (const QString &term, query.wordList) {
       
   285                     QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
       
   286                         QLatin1String("content"), analyzer);
       
   287 
       
   288                     if (query)
       
   289                         booleanQuery.add(query, true, true, false);
       
   290                 }
       
   291             }   break;
       
   292 
       
   293             case QHelpSearchQuery::ATLEAST: {
       
   294                 foreach (const QString &term, query.wordList) {
       
   295                     if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery, analyzer))
       
   296                         return false;
       
   297                 }
       
   298             }
       
   299         }
       
   300     }
       
   301 
       
   302     return true;
       
   303 }
       
   304 
       
   305 bool QHelpSearchIndexReaderClucene::buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery,
       
   306     const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
       
   307 {
       
   308     bool retVal = false;
       
   309     foreach (const QHelpSearchQuery query, queryList) {
       
   310         switch (query.fieldName) {
       
   311             default:    break;
       
   312             case QHelpSearchQuery::DEFAULT: {
       
   313                 foreach (const QString &term, query.wordList) {
       
   314                     QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
       
   315                         QLatin1String("content"), analyzer);
       
   316 
       
   317                     if (query) {
       
   318                         retVal = true;
       
   319                         booleanQuery.add(query, true, false, false);
       
   320                     }
       
   321                 }
       
   322             }   break;
       
   323         }
       
   324     }
       
   325     return retVal;
       
   326 }
       
   327 
       
   328 void QHelpSearchIndexReaderClucene::boostSearchHits(const QHelpEngineCore &engine,
       
   329     QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList)
       
   330 {
       
   331     foreach (const QHelpSearchQuery query, queryList) {
       
   332         if (query.fieldName != QHelpSearchQuery::DEFAULT)
       
   333             continue;
       
   334 
       
   335         QString joinedQuery = query.wordList.join(QLatin1String(" "));
       
   336 
       
   337         QCLuceneStandardAnalyzer analyzer;
       
   338         QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse(
       
   339             joinedQuery, QLatin1String("content"), analyzer);
       
   340 
       
   341         if (parsedQuery) {
       
   342             joinedQuery = parsedQuery->toString();
       
   343             delete parsedQuery;
       
   344         }
       
   345 
       
   346         int length = QString(QLatin1String("content:")).length();
       
   347         int index = joinedQuery.indexOf(QLatin1String("content:"));
       
   348 
       
   349         QString term;
       
   350         int nextIndex = 0;
       
   351         QStringList searchTerms;
       
   352         while (index != -1) {
       
   353             nextIndex = joinedQuery.indexOf(QLatin1String("content:"), index + 1);
       
   354             term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified();
       
   355             if (term.startsWith(QLatin1String("\""))
       
   356                 && term.endsWith(QLatin1String("\""))) {
       
   357                 searchTerms.append(term.remove(QLatin1String("\"")));
       
   358             } else {
       
   359                 searchTerms += term.split(QLatin1Char(' '));
       
   360             }
       
   361             index = nextIndex;
       
   362         }
       
   363         searchTerms.removeDuplicates();
       
   364 
       
   365         int count = qMin(75, hitList.count());
       
   366         QMap<int, QHelpSearchEngine::SearchHit> hitMap;
       
   367         for (int i = 0; i < count; ++i) {
       
   368             const QHelpSearchEngine::SearchHit &hit = hitList.at(i);
       
   369             QString data = QString::fromUtf8(engine.fileData(hit.first));
       
   370 
       
   371             int counter = 0;
       
   372             foreach (const QString &term, searchTerms)
       
   373                 counter += data.count(term, Qt::CaseInsensitive);
       
   374             hitMap.insertMulti(counter, hit);
       
   375         }
       
   376 
       
   377         QList<QHelpSearchEngine::SearchHit> boostedList;
       
   378         QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd();
       
   379         do {
       
   380             --it;
       
   381             boostedList.append(it.value());
       
   382         } while (it != hitMap.constBegin());
       
   383         boostedList += hitList.mid(count, hitList.count());
       
   384         mutex.lock();
       
   385         hitList = boostedList;
       
   386         mutex.unlock();
       
   387     }
       
   388 }
       
   389 
       
   390         }   // namespace clucene
       
   391     }   // namespace fulltextsearch
       
   392 }   // namespace qt
       
   393 
       
   394 QT_END_NAMESPACE