tools/assistant/lib/qhelpsearchindexreader_clucene.cpp
author Eckhart Koeppen <eckhart.koppen@nokia.com>
Mon, 19 Apr 2010 10:15:19 +0300
branchRCL_3
changeset 9 b5b118452c7d
parent 7 3f74d0d4af4c
permissions -rw-r--r--
a8c775f23625952540b5349744722bcb9e37de45

/****************************************************************************
**
** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the Qt Assistant of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file.  Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights.  These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "qhelpenginecore.h"
#include "fulltextsearch/qsearchable_p.h"
#include "fulltextsearch/qqueryparser_p.h"
#include "fulltextsearch/qindexreader_p.h"
#include "qhelpsearchindexreader_clucene_p.h"

#include <QtCore/QDir>
#include <QtCore/QSet>
#include <QtCore/QString>
#include <QtCore/QFileInfo>
#include <QtCore/QStringList>
#include <QtCore/QTextStream>
#include <QtCore/QMutexLocker>

QT_BEGIN_NAMESPACE

namespace fulltextsearch {
namespace clucene {

QHelpSearchIndexReaderClucene::QHelpSearchIndexReaderClucene()
    : QHelpSearchIndexReader()
{
    // nothing todo
}

QHelpSearchIndexReaderClucene::~QHelpSearchIndexReaderClucene()
{
}


void QHelpSearchIndexReaderClucene::run()
{
    mutex.lock();

    if (m_cancel) {
        mutex.unlock();
        return;
    }

    const QString collectionFile(this->m_collectionFile);
    const QList<QHelpSearchQuery> &queryList = this->m_query;
    const QString indexPath(m_indexFilesFolder);

    mutex.unlock();

    QHelpEngineCore engine(collectionFile, 0);
    if (!engine.setupData())
        return;

    QFileInfo fInfo(indexPath);
    if (fInfo.exists() && !fInfo.isWritable()) {
        qWarning("Full Text Search, could not read index (missing permissions).");
        return;
    }

    if(QCLuceneIndexReader::indexExists(indexPath)) {
        mutex.lock();
        if (m_cancel) {
            mutex.unlock();
            return;
        }
        mutex.unlock();

        emit searchingStarted();

#if !defined(QT_NO_EXCEPTIONS)
        try {
#endif
            QCLuceneBooleanQuery booleanQuery;
            QCLuceneStandardAnalyzer analyzer;
            if (!buildQuery(booleanQuery, queryList, analyzer)) {
                emit searchingFinished(0);
                return;
            }

            const QStringList attribList = engine.filterAttributes(engine.currentFilter());
            if (!attribList.isEmpty()) {
                QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
                    + attribList.join(QLatin1String(" +")), QLatin1String("attribute"), analyzer);

                if (!query) {
                    emit searchingFinished(0);
                    return;
                }
                booleanQuery.add(query, true, true, false);
            }

            QCLuceneIndexSearcher indexSearcher(indexPath);
            QCLuceneHits hits = indexSearcher.search(booleanQuery);

            bool boost = true;
            QCLuceneBooleanQuery tryHarderQuery;
            if (hits.length() == 0) {
                if (buildTryHarderQuery(tryHarderQuery, queryList, analyzer)) {
                    if (!attribList.isEmpty()) {
                        QCLuceneQuery* query = QCLuceneQueryParser::parse(QLatin1String("+")
                            + attribList.join(QLatin1String(" +")), QLatin1String("attribute"),
                            analyzer);
                        tryHarderQuery.add(query, true, true, false);
                    }
                    hits = indexSearcher.search(tryHarderQuery);
                    boost = (hits.length() == 0);
                }
            }

            QSet<QString> pathSet;
            QCLuceneDocument document;
            const QStringList namespaceList = engine.registeredDocumentations();

            for (qint32 i = 0; i < hits.length(); i++) {
                document = hits.document(i);
                const QString path = document.get(QLatin1String("path"));
                if (!pathSet.contains(path) && namespaceList.contains(
                    document.get(QLatin1String("namespace")), Qt::CaseInsensitive)) {
                    pathSet.insert(path);
                    hitList.append(qMakePair(path, document.get(QLatin1String("title"))));
                }
                document.clear();

                mutex.lock();
                if (m_cancel) {
                    mutex.unlock();
                    emit searchingFinished(0);
                    return;
                }
                mutex.unlock();
            }

            indexSearcher.close();
            const int count = hitList.count();
            if ((count > 0) && boost)
                boostSearchHits(engine, hitList, queryList);
            emit searchingFinished(hitList.count());

#if !defined(QT_NO_EXCEPTIONS)
        } catch(...) {
            mutex.lock();
            hitList.clear();
            mutex.unlock();
            emit searchingFinished(0);
        }
#endif
    }
}

bool QHelpSearchIndexReaderClucene::defaultQuery(const QString &term, QCLuceneBooleanQuery &booleanQuery,
    QCLuceneStandardAnalyzer &analyzer)
{
    const QLatin1String c("content");
    const QLatin1String t("titleTokenized");

    QCLuceneQuery *query = QCLuceneQueryParser::parse(term, c, analyzer);
    QCLuceneQuery *query2 = QCLuceneQueryParser::parse(term, t, analyzer);
    if (query && query2) {
        booleanQuery.add(query, true, false, false);
        booleanQuery.add(query2, true, false, false);
        return true;
    }

    return false;
}

bool QHelpSearchIndexReaderClucene::buildQuery(QCLuceneBooleanQuery &booleanQuery,
    const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
{
    foreach (const QHelpSearchQuery query, queryList) {
        switch (query.fieldName) {
            case QHelpSearchQuery::FUZZY: {
                const QLatin1String fuzzy("~");
                foreach (const QString &term, query.wordList) {
                    if (term.isEmpty()
                        || !defaultQuery(term.toLower() + fuzzy, booleanQuery, analyzer)) {
                        return false;
                    }
                }
            }   break;

            case QHelpSearchQuery::WITHOUT: {
                QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
                foreach (const QString &term, query.wordList) {
                    if (stopWords.contains(term, Qt::CaseInsensitive))
                        continue;

                    QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
                        QLatin1String("content"), term.toLower()));
                    QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
                        QLatin1String("titleTokenized"), term.toLower()));

                    if (query && query2) {
                        booleanQuery.add(query, true, false, true);
                        booleanQuery.add(query2, true, false, true);
                    } else {
                        return false;
                    }
                }
            }   break;

            case QHelpSearchQuery::PHRASE: {
                const QString &term = query.wordList.at(0).toLower();
                if (term.contains(QLatin1Char(' '))) {
                    QStringList termList = term.split(QLatin1String(" "));
                    QCLucenePhraseQuery *q = new QCLucenePhraseQuery();
                    QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
                    foreach (const QString &term, termList) {
                        if (!stopWords.contains(term, Qt::CaseInsensitive))
                            q->addTerm(QCLuceneTerm(QLatin1String("content"), term.toLower()));
                    }
                    booleanQuery.add(q, true, true, false);
                } else {
                    QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
                        QLatin1String("content"), term.toLower()));
                    QCLuceneQuery *query2 = new QCLuceneTermQuery(QCLuceneTerm(
                        QLatin1String("titleTokenized"), term.toLower()));

                    if (query && query2) {
                        booleanQuery.add(query, true, true, false);
                        booleanQuery.add(query2, true, false, false);
                    } else {
                        return false;
                    }
                }
            }   break;

            case QHelpSearchQuery::ALL: {
                QStringList stopWords = QCLuceneStopAnalyzer().englishStopWords();
                foreach (const QString &term, query.wordList) {
                    if (stopWords.contains(term, Qt::CaseInsensitive))
                        continue;

                    QCLuceneQuery *query = new QCLuceneTermQuery(QCLuceneTerm(
                        QLatin1String("content"), term.toLower()));

                    if (query) {
                        booleanQuery.add(query, true, true, false);
                    } else {
                        return false;
                    }
                }
            }   break;

            case QHelpSearchQuery::DEFAULT: {
                foreach (const QString &term, query.wordList) {
                    QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
                        QLatin1String("content"), analyzer);

                    if (query)
                        booleanQuery.add(query, true, true, false);
                }
            }   break;

            case QHelpSearchQuery::ATLEAST: {
                foreach (const QString &term, query.wordList) {
                    if (term.isEmpty() || !defaultQuery(term.toLower(), booleanQuery, analyzer))
                        return false;
                }
            }
        }
    }

    return true;
}

bool QHelpSearchIndexReaderClucene::buildTryHarderQuery(QCLuceneBooleanQuery &booleanQuery,
    const QList<QHelpSearchQuery> &queryList, QCLuceneStandardAnalyzer &analyzer)
{
    bool retVal = false;
    foreach (const QHelpSearchQuery query, queryList) {
        switch (query.fieldName) {
            default:    break;
            case QHelpSearchQuery::DEFAULT: {
                foreach (const QString &term, query.wordList) {
                    QCLuceneQuery *query = QCLuceneQueryParser::parse(term.toLower(),
                        QLatin1String("content"), analyzer);

                    if (query) {
                        retVal = true;
                        booleanQuery.add(query, true, false, false);
                    }
                }
            }   break;
        }
    }
    return retVal;
}

void QHelpSearchIndexReaderClucene::boostSearchHits(const QHelpEngineCore &engine,
    QList<QHelpSearchEngine::SearchHit> &hitList, const QList<QHelpSearchQuery> &queryList)
{
    foreach (const QHelpSearchQuery query, queryList) {
        if (query.fieldName != QHelpSearchQuery::DEFAULT)
            continue;

        QString joinedQuery = query.wordList.join(QLatin1String(" "));

        QCLuceneStandardAnalyzer analyzer;
        QCLuceneQuery *parsedQuery = QCLuceneQueryParser::parse(
            joinedQuery, QLatin1String("content"), analyzer);

        if (parsedQuery) {
            joinedQuery = parsedQuery->toString();
            delete parsedQuery;
        }

        int length = QString(QLatin1String("content:")).length();
        int index = joinedQuery.indexOf(QLatin1String("content:"));

        QString term;
        int nextIndex = 0;
        QStringList searchTerms;
        while (index != -1) {
            nextIndex = joinedQuery.indexOf(QLatin1String("content:"), index + 1);
            term = joinedQuery.mid(index + length, nextIndex - (length + index)).simplified();
            if (term.startsWith(QLatin1String("\""))
                && term.endsWith(QLatin1String("\""))) {
                searchTerms.append(term.remove(QLatin1String("\"")));
            } else {
                searchTerms += term.split(QLatin1Char(' '));
            }
            index = nextIndex;
        }
        searchTerms.removeDuplicates();

        int count = qMin(75, hitList.count());
        QMap<int, QHelpSearchEngine::SearchHit> hitMap;
        for (int i = 0; i < count; ++i) {
            const QHelpSearchEngine::SearchHit &hit = hitList.at(i);
            QString data = QString::fromUtf8(engine.fileData(hit.first));

            int counter = 0;
            foreach (const QString &term, searchTerms)
                counter += data.count(term, Qt::CaseInsensitive);
            hitMap.insertMulti(counter, hit);
        }

        QList<QHelpSearchEngine::SearchHit> boostedList;
        QMap<int, QHelpSearchEngine::SearchHit>::const_iterator it = hitMap.constEnd();
        do {
            --it;
            boostedList.append(it.value());
        } while (it != hitMap.constBegin());
        boostedList += hitList.mid(count, hitList.count());
        mutex.lock();
        hitList = boostedList;
        mutex.unlock();
    }
}

}   // namespace clucene
}   // namespace fulltextsearch

QT_END_NAMESPACE