diff -r 000000000000 -r 1918ee327afb tools/assistant/lib/qhelpsearchindexreader_default.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assistant/lib/qhelpsearchindexreader_default.cpp Mon Jan 11 14:00:40 2010 +0000 @@ -0,0 +1,614 @@ +/**************************************************************************** +** +** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the Qt Assistant of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qhelpenginecore.h" +#include "qhelpsearchindexreader_default_p.h" + +#include +#include +#include +#include +#include +#include +#include + +QT_BEGIN_NAMESPACE + +namespace qt { + namespace fulltextsearch { + namespace std { + +namespace { + QStringList split( const QString &str ) + { + QStringList lst; + int j = 0; + int i = str.indexOf(QLatin1Char('*'), j ); + + if (str.startsWith(QLatin1String("*"))) + lst << QLatin1String("*"); + + while ( i != -1 ) { + if ( i > j && i <= (int)str.length() ) { + lst << str.mid( j, i - j ); + lst << QLatin1String("*"); + } + j = i + 1; + i = str.indexOf(QLatin1Char('*'), j ); + } + + int l = str.length() - 1; + if ( str.mid( j, l - j + 1 ).length() > 0 ) + lst << str.mid( j, l - j + 1 ); + + return lst; + } +} + + +Reader::Reader() + : indexPath(QString()) + , indexFile(QString()) + , documentFile(QString()) +{ + termList.clear(); + indexTable.clear(); + searchIndexTable.clear(); +} + +Reader::~Reader() +{ + reset(); + searchIndexTable.clear(); +} + +bool Reader::readIndex() +{ + if (indexTable.contains(indexFile)) + return true; + + QFile idxFile(indexFile); + if (!idxFile.open(QFile::ReadOnly)) + return false; + + QString key; + int numOfDocs; + EntryTable entryTable; + QVector docs; + QDataStream dictStream(&idxFile); + while (!dictStream.atEnd()) { + dictStream >> key; + dictStream >> numOfDocs; + docs.resize(numOfDocs); + dictStream >> docs; + entryTable.insert(key, new Entry(docs)); + } + idxFile.close(); + + if (entryTable.isEmpty()) + return false; + + QFile docFile(documentFile); + if (!docFile.open(QFile::ReadOnly)) + return false; + + QString title, url; + DocumentList documentList; + QDataStream docStream(&docFile); + while (!docStream.atEnd()) { + docStream >> title; + docStream >> url; + documentList.append(QStringList(title) << url); + } + docFile.close(); + + if (documentList.isEmpty()) { + cleanupIndex(entryTable); + return false; + } + + indexTable.insert(indexFile, Index(entryTable, documentList)); + return true; +} + +bool Reader::initCheck() const +{ + return !searchIndexTable.isEmpty(); +} + +void Reader::setIndexPath(const QString &path) +{ + indexPath = path; +} + +void Reader::filterFilesForAttributes(const QStringList &attributes) +{ + searchIndexTable.clear(); + for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) { + const QString fileName = it.key(); + bool containsAll = true; + QStringList split = fileName.split(QLatin1String("@")); + foreach (const QString attribute, attributes) { + if (!split.contains(attribute, Qt::CaseInsensitive)) { + containsAll = false; + break; + } + } + + if (containsAll) + searchIndexTable.insert(fileName, it.value()); + } +} + +void Reader::setIndexFile(const QString &namespaceName, const QString &attributes) +{ + QString extention = namespaceName + QLatin1String("@") + attributes; + indexFile = indexPath + QLatin1String("/indexdb40.") + extention; + documentFile = indexPath + QLatin1String("/indexdoc40.") + extention; +} + +bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms, + QStringList *termSeq, QStringList *seqWords) +{ + QString term = searchTerm; + + term = term.simplified(); + term = term.replace(QLatin1String("\'"), QLatin1String("\"")); + term = term.replace(QLatin1String("`"), QLatin1String("\"")); + term = term.replace(QLatin1String("-"), QLatin1String(" ")); + term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" ")); + + *terms = term.split(QLatin1Char(' ')); + QStringList::iterator it = terms->begin(); + for (; it != terms->end(); ++it) { + (*it) = (*it).simplified(); + (*it) = (*it).toLower(); + (*it) = (*it).replace(QLatin1String("\""), QLatin1String("")); + } + + if (term.contains(QLatin1Char('\"'))) { + if ((term.count(QLatin1Char('\"')))%2 == 0) { + int beg = 0; + int end = 0; + QString s; + beg = term.indexOf(QLatin1Char('\"'), beg); + while (beg != -1) { + beg++; + end = term.indexOf(QLatin1Char('\"'), beg); + s = term.mid(beg, end - beg); + s = s.toLower(); + s = s.simplified(); + if (s.contains(QLatin1Char('*'))) { + qWarning("Full Text Search, using a wildcard within phrases is not allowed."); + return false; + } + *seqWords += s.split(QLatin1Char(' ')); + *termSeq << s; + beg = term.indexOf(QLatin1Char('\"'), end + 1); + } + } else { + qWarning("Full Text Search, the closing quotation mark is missing."); + return false; + } + } + + return true; +} + +void Reader::searchInIndex(const QStringList &terms) +{ + foreach (const QString term, terms) { + QVector documents; + + for(IndexTable::ConstIterator it = searchIndexTable.begin(); + it != searchIndexTable.end(); ++it) { + EntryTable entryTable = it.value().first; + DocumentList documentList = it.value().second; + + if (term.contains(QLatin1Char('*'))) + documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable); + else if (entryTable.value(term)) + documents = entryTable.value(term)->documents; + else + continue; + + if (!documents.isEmpty()) { + DocumentInfo info; + QString title, url; + QVector documentsInfo; + foreach(const Document doc, documents) { + info.docNumber = doc.docNumber; + info.frequency = doc.frequency; + info.documentUrl = documentList.at(doc.docNumber).at(1); + info.documentTitle = documentList.at(doc.docNumber).at(0); + documentsInfo.append(info); + } + + bool found = false; + for(QList::Iterator tit = termList.begin(); + tit != termList.end(); ++tit) { + TermInfo *t = &(*tit); + if(t->term == term) { + t->documents += documentsInfo; + t->frequency += documentsInfo.count(); + found = true; break; + } + } + if (!found) + termList.append(TermInfo(term, documentsInfo.count(), documentsInfo)); + } + } + } + qSort(termList); +} + +QVector Reader::hits() +{ + QVector documents; + if (!termList.count()) + return documents; + + documents = termList.takeFirst().documents; + for(QList::Iterator it = termList.begin(); it != termList.end(); ++it) { + TermInfo *t = &(*it); + QVector docs = t->documents; + for(QVector::Iterator minDoc_it = documents.begin(); + minDoc_it != documents.end(); ) { + bool found = false; + for (QVector::ConstIterator doc_it = docs.constBegin(); + doc_it != docs.constEnd(); ++doc_it ) { + if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) { + (*minDoc_it).frequency += (*doc_it).frequency; + found = true; + break; + } + } + if (!found) + minDoc_it = documents.erase(minDoc_it); + else + ++minDoc_it; + } + } + + qSort(documents); + return documents; +} + +bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words, + const QByteArray &data) +{ + if (data.isEmpty()) + return false; + + for(QHash::ConstIterator mit = + miniIndex.begin(); mit != miniIndex.end(); ++mit) { + delete mit.value(); + } + miniIndex.clear(); + + wordNum = 3; + QStringList::ConstIterator cIt = words.begin(); + for ( ; cIt != words.end(); ++cIt ) + miniIndex.insert(*cIt, new PosEntry(0)); + + QTextStream s(data); + QString text = s.readAll(); + bool valid = true; + const QChar *buf = text.unicode(); + QChar str[64]; + QChar c = buf[0]; + int j = 0; + int i = 0; + while ( j < text.length() ) { + if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) { + valid = false; + if ( i > 1 ) + buildMiniIndex( QString(str,i) ); + i = 0; + c = buf[++j]; + continue; + } + if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) { + valid = true; + c = buf[++j]; + continue; + } + if ( !valid ) { + c = buf[++j]; + continue; + } + if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) { + str[i] = c.toLower(); + ++i; + } else { + if ( i > 1 ) + buildMiniIndex( QString(str,i) ); + i = 0; + } + c = buf[++j]; + } + if ( i > 1 ) + buildMiniIndex( QString(str,i) ); + + QStringList::ConstIterator patIt = patterns.begin(); + QStringList wordLst; + QList a, b; + QList::iterator aIt; + for ( ; patIt != patterns.end(); ++patIt ) { + wordLst = (*patIt).split(QLatin1Char(' ')); + a = miniIndex[ wordLst[0] ]->positions; + for ( int j = 1; j < (int)wordLst.count(); ++j ) { + b = miniIndex[ wordLst[j] ]->positions; + aIt = a.begin(); + while ( aIt != a.end() ) { + if ( b.contains( *aIt + 1 )) { + (*aIt)++; + ++aIt; + } else { + aIt = a.erase( aIt ); + } + } + } + } + if ( a.count() ) + return true; + return false; +} + +QVector Reader::setupDummyTerm(const QStringList &terms, + const EntryTable &entryTable) +{ + QList termList; + for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) { + if (entryTable.value(*it)) { + Entry *e = entryTable.value(*it); + termList.append(Term(*it, e->documents.count(), e->documents ) ); + } + } + QVector maxList(0); + if ( !termList.count() ) + return maxList; + qSort(termList); + + maxList = termList.takeLast().documents; + for(QList::Iterator it = termList.begin(); it != termList.end(); ++it) { + Term *t = &(*it); + QVector docs = t->documents; + for (QVector::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) { + if ( maxList.indexOf( *docIt ) == -1 ) + maxList.append( *docIt ); + } + } + return maxList; +} + +QStringList Reader::getWildcardTerms(const QString &term, + const EntryTable &entryTable) +{ + QStringList lst; + QStringList terms = split(term); + QStringList::Iterator iter; + + for(EntryTable::ConstIterator it = entryTable.begin(); + it != entryTable.end(); ++it) { + int index = 0; + bool found = false; + QString text( it.key() ); + for ( iter = terms.begin(); iter != terms.end(); ++iter ) { + if ( *iter == QLatin1String("*") ) { + found = true; + continue; + } + if ( iter == terms.begin() && (*iter)[0] != text[0] ) { + found = false; + break; + } + index = text.indexOf( *iter, index ); + if ( *iter == terms.last() && index != (int)text.length()-1 ) { + index = text.lastIndexOf( *iter ); + if ( index != (int)text.length() - (int)(*iter).length() ) { + found = false; + break; + } + } + if ( index != -1 ) { + found = true; + index += (*iter).length(); + continue; + } else { + found = false; + break; + } + } + if (found) + lst << text; + } + + return lst; +} + +void Reader::buildMiniIndex(const QString &string) +{ + if (miniIndex[string]) + miniIndex[string]->positions.append(wordNum); + ++wordNum; +} + +void Reader::reset() +{ + for(IndexTable::Iterator it = indexTable.begin(); + it != indexTable.end(); ++it) { + cleanupIndex(it.value().first); + it.value().second.clear(); + } +} + +void Reader::cleanupIndex(EntryTable &entryTable) +{ + for(EntryTable::ConstIterator it = + entryTable.begin(); it != entryTable.end(); ++it) { + delete it.value(); + } + + entryTable.clear(); +} + + +QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault() + : QHelpSearchIndexReader() +{ + // nothing todo +} + +QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault() +{ +} + +void QHelpSearchIndexReaderDefault::run() +{ + mutex.lock(); + + if (m_cancel) { + mutex.unlock(); + return; + } + + const QList &queryList = this->m_query; + const QLatin1String key("DefaultSearchNamespaces"); + const QString collectionFile(this->m_collectionFile); + const QString indexPath = m_indexFilesFolder; + + mutex.unlock(); + + QString queryTerm; + foreach (const QHelpSearchQuery query, queryList) { + if (query.fieldName == QHelpSearchQuery::DEFAULT) { + queryTerm = query.wordList.at(0); + break; + } + } + + if (queryTerm.isEmpty()) + return; + + QHelpEngineCore engine(collectionFile, 0); + if (!engine.setupData()) + return; + + const QStringList registeredDocs = engine.registeredDocumentations(); + const QStringList indexedNamespaces = engine.customValue(key).toString(). + split(QLatin1String("|"), QString::SkipEmptyParts); + + emit searchingStarted(); + + // setup the reader + m_reader.setIndexPath(indexPath); + foreach(const QString namespaceName, registeredDocs) { + mutex.lock(); + if (m_cancel) { + mutex.unlock(); + searchingFinished(0); // TODO: check this ??? + return; + } + mutex.unlock(); + + const QList attributeSets = + engine.filterAttributeSets(namespaceName); + + foreach (QStringList attributes, attributeSets) { + // read all index files + m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@"))); + if (!m_reader.readIndex()) { + qWarning("Full Text Search, could not read file for namespace: %s.", + namespaceName.toUtf8().constData()); + } + } + } + + // get the current filter attributes and minimize the index files table + m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter())); + + hitList.clear(); + QStringList terms, termSeq, seqWords; + if (m_reader.initCheck() && // check if we could read anything + m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) { + + // search for term(s) + m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ??? + + QVector hits = m_reader.hits(); + if (!hits.isEmpty()) { + if (termSeq.isEmpty()) { + foreach (const DocumentInfo docInfo, hits) { + mutex.lock(); + if (m_cancel) { + mutex.unlock(); + searchingFinished(0); // TODO: check this, speed issue while locking??? + return; + } + mutex.unlock(); + hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl)); + } + } else { + foreach (const DocumentInfo docInfo, hits) { + mutex.lock(); + if (m_cancel) { + mutex.unlock(); + searchingFinished(0); // TODO: check this, speed issue while locking??? + return; + } + mutex.unlock(); + + if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ??? + hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl)); + } + } + } + } + + emit searchingFinished(hitList.count()); +} + + } // namespace std + } // namespace fulltextsearch +} // namespace qt + +QT_END_NAMESPACE