tools/assistant/lib/qhelpsearchindexreader_default.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
child 7 f7bc934e204c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/assistant/lib/qhelpsearchindexreader_default.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,614 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the Qt Assistant of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qhelpenginecore.h"
+#include "qhelpsearchindexreader_default_p.h"
+
+#include <QtCore/QDir>
+#include <QtCore/QUrl>
+#include <QtCore/QFile>
+#include <QtCore/QVariant>
+#include <QtCore/QFileInfo>
+#include <QtCore/QDataStream>
+#include <QtCore/QTextStream>
+
+QT_BEGIN_NAMESPACE
+
+namespace qt {
+    namespace fulltextsearch {
+        namespace std {
+
+namespace {
+    // Breaks a wildcard search term into its literal pieces and "*" markers.
+    // A leading '*' yields a leading "*" entry; every non-empty piece that is
+    // followed by a '*' is emitted together with a "*" entry; a non-empty
+    // remainder after the last '*' is emitted last. Runs of consecutive '*'
+    // therefore collapse into a single marker.
+    QStringList split( const QString &str )
+    {
+        const QLatin1Char star('*');
+        const QString wildcard = QLatin1String("*");
+
+        QStringList parts;
+        if (str.startsWith(wildcard))
+            parts << wildcard;
+
+        int start = 0;
+        for (int pos = str.indexOf(star, start); pos != -1; pos = str.indexOf(star, start)) {
+            if (pos > start)
+                parts << str.mid(start, pos - start) << wildcard;
+            start = pos + 1;
+        }
+
+        // Whatever follows the last '*' (or the whole string if there was none).
+        const QString tail = str.mid(start);
+        if (!tail.isEmpty())
+            parts << tail;
+
+        return parts;
+    }
+}
+
+
+// Creates a reader with empty index path, index file and document file names
+// and with empty term and index tables.
+Reader::Reader()
+{
+    // The path/file strings and the term/index containers all default-construct
+    // to their empty state, so there is nothing to set up explicitly.
+}
+
+// Releases everything the reader owns: the Entry objects of all loaded index
+// tables (via reset()), the filtered table and the per-document position
+// entries of the mini index.
+Reader::~Reader()
+{
+    reset();
+    searchIndexTable.clear();
+
+    // searchForPattern() only frees the PosEntry objects of the previous call
+    // when it is called again, so the ones left over from the last call have
+    // to be released here. Null values are harmless to delete.
+    for (QHash<QString, PosEntry*>::ConstIterator it = miniIndex.constBegin();
+        it != miniIndex.constEnd(); ++it) {
+        delete it.value();
+    }
+    miniIndex.clear();
+}
+
+// Loads the index file and the document file currently selected with
+// setIndexFile() and stores them in indexTable under the index file name.
+//
+// Returns true if the index file is already present in indexTable or was
+// loaded successfully. Returns false if either file cannot be opened or
+// yields no records; in that case nothing is added to indexTable and every
+// Entry allocated while reading is freed again.
+bool Reader::readIndex()
+{
+    if (indexTable.contains(indexFile))
+        return true;
+
+    QFile idxFile(indexFile);
+    if (!idxFile.open(QFile::ReadOnly))
+        return false;
+
+    // Each record is: term, number of documents, vector of documents.
+    QString key;
+    int numOfDocs = 0;
+    EntryTable entryTable;
+    QVector<Document> docs;
+    QDataStream dictStream(&idxFile);
+    while (!dictStream.atEnd()) {
+        dictStream >> key;
+        dictStream >> numOfDocs;
+        // Never hand a negative count (damaged file) to resize(); the stream
+        // operator below replaces the vector contents anyway.
+        if (numOfDocs > 0)
+            docs.resize(numOfDocs);
+        dictStream >> docs;
+        entryTable.insert(key, new Entry(docs));
+    }
+    idxFile.close();
+
+    if (entryTable.isEmpty())
+        return false;
+
+    QFile docFile(documentFile);
+    if (!docFile.open(QFile::ReadOnly)) {
+        // The entries were allocated with new above; free them before bailing out.
+        cleanupIndex(entryTable);
+        return false;
+    }
+
+    // Each record is a title followed by a url; the position in the list is
+    // what the documents in the index refer to by number.
+    QString title, url;
+    DocumentList documentList;
+    QDataStream docStream(&docFile);
+    while (!docStream.atEnd()) {
+        docStream >> title;
+        docStream >> url;
+        documentList.append(QStringList(title) << url);
+    }
+    docFile.close();
+
+    if (documentList.isEmpty()) {
+        cleanupIndex(entryTable);
+        return false;
+    }
+
+    indexTable.insert(indexFile, Index(entryTable, documentList));
+    return true;
+}
+
+// Returns true if filterFilesForAttributes() left at least one index to
+// search in, false if the filtered table is empty.
+bool Reader::initCheck() const
+{
+    const bool nothingToSearch = searchIndexTable.isEmpty();
+    return !nothingToSearch;
+}
+
+// Remembers the folder that setIndexFile() prepends to the index and
+// document file names.
+void Reader::setIndexPath(const QString &path)
+{
+    this->indexPath = path;
+}
+
+// Rebuilds searchIndexTable from indexTable, keeping only those index files
+// whose name, split at '@', contains every one of the given attributes
+// (compared case-insensitively). An empty attribute list keeps all files.
+void Reader::filterFilesForAttributes(const QStringList &attributes)
+{
+    searchIndexTable.clear();
+
+    IndexTable::ConstIterator fileIt = indexTable.begin();
+    while (fileIt != indexTable.end()) {
+        const QStringList nameParts = fileIt.key().split(QLatin1String("@"));
+
+        // Stop looking as soon as one requested attribute is absent.
+        bool missingAttribute = false;
+        for (int i = 0; i < attributes.count() && !missingAttribute; ++i)
+            missingAttribute = !nameParts.contains(attributes.at(i), Qt::CaseInsensitive);
+
+        if (!missingAttribute)
+            searchIndexTable.insert(fileIt.key(), fileIt.value());
+
+        ++fileIt;
+    }
+}
+
+// Selects the index/document file pair that the next readIndex() call loads.
+// Both names are built from the index path, a fixed prefix and the suffix
+// "<namespaceName>@<attributes>".
+void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
+{
+    const QString suffix = namespaceName + QLatin1String("@") + attributes;
+
+    indexFile = indexPath + QLatin1String("/indexdb40.") + suffix;
+    documentFile = indexPath + QLatin1String("/indexdoc40.") + suffix;
+}
+
+// Normalises a raw search string and splits it into its parts.
+//
+// terms    receives every space-separated word, lower-cased and with
+//          quotation marks removed (replaces the previous content).
+// termSeq  gets one entry appended per quoted phrase (lower-cased).
+// seqWords gets the individual words of all quoted phrases appended.
+//
+// Returns false, after printing a warning, if the number of quotation marks
+// is odd or if a quoted phrase contains a '*'; returns true otherwise.
+bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
+                                  QStringList *termSeq, QStringList *seqWords)
+{
+    // Unify the quote characters, turn hyphens into spaces and drop tokens of
+    // at most one character that stand between two whitespace characters.
+    QString term = searchTerm.simplified();
+    term.replace(QLatin1String("\'"), QLatin1String("\""));
+    term.replace(QLatin1String("`"), QLatin1String("\""));
+    term.replace(QLatin1String("-"), QLatin1String(" "));
+    term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
+
+    *terms = term.split(QLatin1Char(' '));
+    for (int i = 0; i < terms->count(); ++i) {
+        QString &word = (*terms)[i];
+        word = word.simplified().toLower().replace(QLatin1String("\""), QLatin1String(""));
+    }
+
+    const QLatin1Char quote('\"');
+    if (!term.contains(quote))
+        return true;
+
+    if (term.count(quote) % 2 != 0) {
+        qWarning("Full Text Search, the closing quotation mark is missing.");
+        return false;
+    }
+
+    // The quotes are balanced, so every opening quote has a closing partner.
+    int openPos = term.indexOf(quote, 0);
+    while (openPos != -1) {
+        const int phraseStart = openPos + 1;
+        const int closePos = term.indexOf(quote, phraseStart);
+        const QString phrase =
+            term.mid(phraseStart, closePos - phraseStart).toLower().simplified();
+        if (phrase.contains(QLatin1Char('*'))) {
+            qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
+            return false;
+        }
+        *seqWords += phrase.split(QLatin1Char(' '));
+        *termSeq << phrase;
+        openPos = term.indexOf(quote, closePos + 1);
+    }
+
+    return true;
+}
+
+// Looks up every term in every index of searchIndexTable and records the
+// documents found in termList (one TermInfo per term, merged across index
+// files). Terms containing '*' are expanded through getWildcardTerms() and
+// setupDummyTerm(). Terms without any match are not added to termList.
+// termList is sorted before returning.
+void Reader::searchInIndex(const QStringList &terms)
+{
+    foreach (const QString &term, terms) {
+        for (IndexTable::ConstIterator it = searchIndexTable.begin();
+            it != searchIndexTable.end(); ++it) {
+            const EntryTable &entryTable = it.value().first;
+            const DocumentList &documentList = it.value().second;
+
+            QVector<Document> documents;
+            if (term.contains(QLatin1Char('*'))) {
+                documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
+            } else {
+                Entry *entry = entryTable.value(term);
+                if (!entry)
+                    continue;
+                documents = entry->documents;
+            }
+
+            QVector<DocumentInfo> documentsInfo;
+            foreach (const Document &doc, documents) {
+                // The document number comes from the index file while the list
+                // comes from a separate document file; skip references that do
+                // not fit instead of reading past the end of the list.
+                const int docIndex = doc.docNumber;
+                if (docIndex < 0 || docIndex >= documentList.count())
+                    continue;
+
+                const QStringList &titleAndUrl = documentList.at(docIndex);
+                DocumentInfo info;
+                info.docNumber = doc.docNumber;
+                info.frequency = doc.frequency;
+                info.documentUrl = titleAndUrl.at(1);
+                info.documentTitle = titleAndUrl.at(0);
+                documentsInfo.append(info);
+            }
+
+            if (documentsInfo.isEmpty())
+                continue;
+
+            // Merge into an existing entry for this term, if another index
+            // file already produced one.
+            bool found = false;
+            for (QList<TermInfo>::Iterator tit = termList.begin();
+                tit != termList.end(); ++tit) {
+                if ((*tit).term == term) {
+                    (*tit).documents += documentsInfo;
+                    (*tit).frequency += documentsInfo.count();
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+                termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
+        }
+    }
+    qSort(termList);
+}
+
+// Returns the documents that were found for every term collected by
+// searchInIndex(), sorted with qSort(). Starting from the documents of the
+// first term in the (sorted) term list, each further term removes the
+// documents it does not contain and adds its frequency to the ones it does.
+//
+// The collected terms are consumed: termList is empty afterwards, so a later
+// search on the same reader does not get intersected with terms of this one.
+QVector<DocumentInfo> Reader::hits()
+{
+    QVector<DocumentInfo> documents;
+    if (termList.isEmpty())
+        return documents;
+
+    documents = termList.takeFirst().documents;
+    for (QList<TermInfo>::ConstIterator it = termList.constBegin();
+        it != termList.constEnd(); ++it) {
+        const QVector<DocumentInfo> &docs = (*it).documents;
+        for (QVector<DocumentInfo>::Iterator minDoc_it = documents.begin();
+            minDoc_it != documents.end(); ) {
+            bool found = false;
+            for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin();
+                doc_it != docs.constEnd(); ++doc_it ) {
+                if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
+                    (*minDoc_it).frequency += (*doc_it).frequency;
+                    found = true;
+                    break;
+                }
+            }
+            if (!found)
+                minDoc_it = documents.erase(minDoc_it);
+            else
+                ++minDoc_it;
+        }
+    }
+
+    // Only the first term was taken out above; drop the rest as well so no
+    // stale terms survive into the next searchInIndex()/hits() round.
+    termList.clear();
+
+    qSort(documents);
+    return documents;
+}
+
+// Checks whether the document text in data contains every phrase in patterns.
+//
+// patterns  phrases, each a list of words separated by single spaces.
+// words     all words that occur in the phrases; only these are indexed.
+// data      the raw document; text between '<' and '>' and between '&' and
+//           ';' is skipped.
+//
+// Returns true only if each phrase occurs as a run of consecutive words.
+// Returns false for empty data, for an empty pattern list, or if a phrase
+// uses a word that is not part of words.
+bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
+                                   const QByteArray &data)
+{
+    if (data.isEmpty())
+        return false;
+
+    // Throw away the mini index of the previously examined document.
+    for(QHash<QString, PosEntry*>::ConstIterator mit =
+        miniIndex.begin(); mit != miniIndex.end(); ++mit) {
+            delete mit.value();
+    }
+    miniIndex.clear();
+
+    wordNum = 3;
+    QStringList::ConstIterator cIt = words.begin();
+    for ( ; cIt != words.end(); ++cIt )
+        miniIndex.insert(*cIt, new PosEntry(0));
+
+    // Tokenise the document: words consist of letters, digits and '_', are
+    // lower-cased and cut after 63 characters; one-character words are not
+    // passed to buildMiniIndex().
+    QTextStream s(data);
+    QString text = s.readAll();
+    bool valid = true;
+    const QChar *buf = text.unicode();
+    QChar str[64];
+    QChar c = buf[0];
+    int j = 0;
+    int i = 0;
+    while ( j < text.length() ) {
+        if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
+            valid = false;
+            if ( i > 1 )
+                buildMiniIndex( QString(str,i) );
+            i = 0;
+            c = buf[++j];
+            continue;
+        }
+        if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
+            valid = true;
+            c = buf[++j];
+            continue;
+        }
+        if ( !valid ) {
+            c = buf[++j];
+            continue;
+        }
+        if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
+            str[i] = c.toLower();
+            ++i;
+        } else {
+            if ( i > 1 )
+                buildMiniIndex( QString(str,i) );
+            i = 0;
+        }
+        c = buf[++j];
+    }
+    if ( i > 1 )
+        buildMiniIndex( QString(str,i) );
+
+    // For each phrase, start with the positions of its first word and keep
+    // only those that are followed, word by word, by the rest of the phrase.
+    QList<uint> a;
+    for (QStringList::ConstIterator patIt = patterns.begin();
+        patIt != patterns.end(); ++patIt) {
+        const QStringList wordLst = (*patIt).split(QLatin1Char(' '));
+
+        // value() returns 0 for unknown words instead of inserting a null
+        // entry that would then be dereferenced.
+        const PosEntry *firstWord = miniIndex.value(wordLst.at(0));
+        if (!firstWord)
+            return false;
+        a = firstWord->positions;
+
+        for (int k = 1; k < wordLst.count(); ++k) {
+            const PosEntry *nextWord = miniIndex.value(wordLst.at(k));
+            if (!nextWord)
+                return false;
+            const QList<uint> &b = nextWord->positions;
+
+            QList<uint>::iterator aIt = a.begin();
+            while ( aIt != a.end() ) {
+                if ( b.contains( *aIt + 1 )) {
+                    (*aIt)++;
+                    ++aIt;
+                } else {
+                    aIt = a.erase( aIt );
+                }
+            }
+        }
+
+        // A phrase without any occurrence rules the document out; without
+        // this check only the last phrase would decide the result.
+        if (a.isEmpty())
+            return false;
+    }
+
+    return !a.isEmpty();
+}
+
+// Combines the documents of all given terms that exist in entryTable into one
+// list without duplicates. The list starts with the documents of the term
+// that sorts last under qSort(); documents of the remaining terms are
+// appended if not yet present. Returns an empty vector if none of the terms
+// is in entryTable.
+QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
+                                              const EntryTable &entryTable)
+{
+    QList<Term> matches;
+    foreach (const QString &word, terms) {
+        Entry *entry = entryTable.value(word);
+        if (entry)
+            matches.append(Term(word, entry->documents.count(), entry->documents));
+    }
+
+    if (matches.isEmpty())
+        return QVector<Document>(0);
+
+    qSort(matches);
+
+    QVector<Document> merged = matches.takeLast().documents;
+    for (int t = 0; t < matches.count(); ++t) {
+        const QVector<Document> &docs = matches.at(t).documents;
+        for (int d = 0; d < docs.count(); ++d) {
+            if (merged.indexOf(docs.at(d)) == -1)
+                merged.append(docs.at(d));
+        }
+    }
+    return merged;
+}
+
+// Returns every key of entryTable that matches the wildcard pattern in term.
+// The pattern is broken up with split() into literal pieces and "*" markers;
+// each key is then tested piece by piece, and the key is accepted if the
+// walk over the pieces ends with found == true.
+QStringList Reader::getWildcardTerms(const QString &term,
+                                          const EntryTable &entryTable)
+{
+    QStringList lst;
+    QStringList terms = split(term);
+    QStringList::Iterator iter;
+
+    for(EntryTable::ConstIterator it = entryTable.begin();
+        it != entryTable.end(); ++it) {
+        // index is the position in the key from which the next literal piece
+        // is searched.
+        int index = 0;
+        bool found = false;
+        QString text( it.key() );
+        for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
+            // A "*" marker consumes nothing; it only marks the key as a
+            // candidate so far.
+            if ( *iter == QLatin1String("*") ) {
+                found = true;
+                continue;
+            }
+            // A literal first piece must start with the key's first character.
+            if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
+                found = false;
+                break;
+            }
+            index = text.indexOf( *iter, index );
+            // A piece equal to the last piece must end exactly at the end of
+            // the key, so it is re-located with lastIndexOf().
+            // NOTE(review): the comparison is by value, so an earlier piece
+            // with the same text as the last one is treated the same way -
+            // confirm this is intended.
+            if ( *iter == terms.last() && index != (int)text.length()-1 ) {
+                index = text.lastIndexOf( *iter );
+                if ( index != (int)text.length() - (int)(*iter).length() ) {
+                    found = false;
+                    break;
+                }
+            }
+            if ( index != -1 ) {
+                // Continue searching behind the piece just matched.
+                found = true;
+                index += (*iter).length();
+                continue;
+            } else {
+                found = false;
+                break;
+            }
+        }
+        if (found)
+            lst << text;
+    }
+
+    return lst;
+}
+
+// Records the current word position for string if it is one of the words
+// registered in miniIndex, then advances the word counter. Words that are
+// not registered only advance the counter.
+void Reader::buildMiniIndex(const QString &string)
+{
+    // value() does not insert: looking words up with operator[] would add a
+    // null entry to the hash for every distinct word of the document.
+    PosEntry *entry = miniIndex.value(string);
+    if (entry)
+        entry->positions.append(wordNum);
+    ++wordNum;
+}
+
+// Frees all loaded index data and forgets which index files were loaded, so
+// that a later readIndex() reads them from disk again.
+void Reader::reset()
+{
+    for(IndexTable::Iterator it = indexTable.begin();
+        it != indexTable.end(); ++it) {
+        cleanupIndex(it.value().first);
+        it.value().second.clear();
+    }
+
+    // Without removing the keys readIndex() would report the emptied indexes
+    // as already loaded.
+    indexTable.clear();
+
+    // The filtered table holds copies of the entry tables, i.e. pointers to
+    // the Entry objects that were just deleted.
+    searchIndexTable.clear();
+}
+
+// Deletes every Entry stored in entryTable and leaves the table empty.
+void Reader::cleanupIndex(EntryTable &entryTable)
+{
+    qDeleteAll(entryTable);
+    entryTable.clear();
+}
+
+
+// Constructs the default index reader. Everything is set up by the base
+// class constructor; the class adds no initialisation of its own.
+QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
+    : QHelpSearchIndexReader()
+{
+}
+
+// Destroys the index reader. The body is intentionally empty; members such
+// as m_reader clean up in their own destructors.
+// NOTE(review): presumably the base class stops a running search before the
+// members are destroyed - confirm in QHelpSearchIndexReader.
+QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
+{
+}
+
+// Runs one search.
+//
+// Takes the first word of the first DEFAULT query as the search string, loads
+// the index files of all registered documentations, restricts them to the
+// current filter and collects the matching (title, url) pairs in hitList.
+// If the search string contains quoted phrases, each candidate document is
+// additionally scanned for those phrases.
+//
+// Emits searchingStarted() once the help engine is set up and
+// searchingFinished() with the number of hits at the end, or with 0 when the
+// search is cancelled after it was started. Returns without emitting anything
+// if the search was cancelled before it began, if there is no search string,
+// or if the help engine cannot be set up.
+void QHelpSearchIndexReaderDefault::run()
+{
+    mutex.lock();
+
+    if (m_cancel) {
+        mutex.unlock();
+        return;
+    }
+
+    // Copy the shared state while the mutex is held. A reference to m_query
+    // would still be read after the unlock below.
+    const QList<QHelpSearchQuery> queryList = this->m_query;
+    const QString collectionFile(this->m_collectionFile);
+    const QString indexPath = m_indexFilesFolder;
+
+    mutex.unlock();
+
+    QString queryTerm;
+    foreach (const QHelpSearchQuery &query, queryList) {
+        if (query.fieldName == QHelpSearchQuery::DEFAULT) {
+            // A query without words leaves queryTerm empty and ends the run below.
+            if (!query.wordList.isEmpty())
+                queryTerm = query.wordList.at(0);
+            break;
+        }
+    }
+
+    if (queryTerm.isEmpty())
+        return;
+
+    QHelpEngineCore engine(collectionFile, 0);
+    if (!engine.setupData())
+        return;
+
+    const QStringList registeredDocs = engine.registeredDocumentations();
+
+    emit searchingStarted();
+
+    // setup the reader
+    m_reader.setIndexPath(indexPath);
+    foreach (const QString &namespaceName, registeredDocs) {
+        mutex.lock();
+        if (m_cancel) {
+            mutex.unlock();
+            emit searchingFinished(0);   // TODO: check this ???
+            return;
+        }
+        mutex.unlock();
+
+        const QList<QStringList> attributeSets =
+            engine.filterAttributeSets(namespaceName);
+
+        foreach (const QStringList &attributes, attributeSets) {
+            // read all index files
+            m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
+            if (!m_reader.readIndex()) {
+                qWarning("Full Text Search, could not read file for namespace: %s.",
+                    namespaceName.toUtf8().constData());
+            }
+        }
+    }
+
+    // get the current filter attributes and minimize the index files table
+    m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
+
+    hitList.clear();
+    QStringList terms, termSeq, seqWords;
+    if (m_reader.initCheck() && // check if we could read anything
+        m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
+
+        // search for term(s)
+        m_reader.searchInIndex(terms);    // TODO: should this be interruptible as well ???
+
+        // Without quoted phrases every candidate is a hit; otherwise the
+        // document text has to contain the phrases as well.
+        const bool phraseSearch = !termSeq.isEmpty();
+        const QVector<DocumentInfo> hits = m_reader.hits();
+        foreach (const DocumentInfo &docInfo, hits) {
+            mutex.lock();
+            if (m_cancel) {
+                mutex.unlock();
+                emit searchingFinished(0);   // TODO: check this, speed issue while locking???
+                return;
+            }
+            mutex.unlock();
+
+            if (phraseSearch
+                && !m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
+                continue;
+
+            hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
+        }
+    }
+
+    emit searchingFinished(hitList.count());
+}
+
+        }   // namespace std
+    }   // namespace fulltextsearch
+}   // namespace qt
+
+QT_END_NAMESPACE