tools/assistant/compat/index.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Assistant of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "index.h"
       
    43 
       
    44 #include <QFile>
       
    45 #include <QDir>
       
    46 #include <QStringList>
       
    47 #include <QApplication>
       
    48 #include <QByteArray>
       
    49 #include <QTextStream>
       
    50 #include <QtAlgorithms>
       
    51 #include <QUrl>
       
    52 #include <QTextCodec>
       
    53 #include <ctype.h>
       
    54 #include <QTextDocument>
       
    55 
       
    56 QT_BEGIN_NAMESPACE
       
    57 
       
    58 struct Term {
       
    59     Term() : frequency(-1) {}
       
    60     Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {}
       
    61     QString term;
       
    62     int frequency;
       
    63     QVector<Document>documents;
       
    64     bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
       
    65 };
       
    66 
       
    67 QDataStream &operator>>( QDataStream &s, Document &l )
       
    68 {
       
    69     s >> l.docNumber;
       
    70     s >> l.frequency;
       
    71     return s;
       
    72 }
       
    73 
       
    74 QDataStream &operator<<( QDataStream &s, const Document &l )
       
    75 {
       
    76     s << (qint16)l.docNumber;
       
    77     s << (qint16)l.frequency;
       
    78     return s;
       
    79 }
       
    80 
       
    81 Index::Index( const QString &dp, const QString &hp )
       
    82     : QObject( 0 ), docPath( dp )
       
    83 {
       
    84     Q_UNUSED(hp);
       
    85 
       
    86     alreadyHaveDocList = false;
       
    87     lastWindowClosed = false;
       
    88     connect( qApp, SIGNAL(lastWindowClosed()),
       
    89              this, SLOT(setLastWinClosed()) );
       
    90 }
       
    91 
       
    92 Index::Index( const QStringList &dl, const QString &hp )
       
    93     : QObject( 0 )
       
    94 {
       
    95     Q_UNUSED(hp);
       
    96     docList = dl;
       
    97     alreadyHaveDocList = true;
       
    98     lastWindowClosed = false;
       
    99     connect( qApp, SIGNAL(lastWindowClosed()),
       
   100              this, SLOT(setLastWinClosed()) );
       
   101 }
       
   102 
       
   103 void Index::setLastWinClosed()
       
   104 {
       
   105     lastWindowClosed = true;
       
   106 }
       
   107 
       
   108 void Index::setDictionaryFile( const QString &f )
       
   109 {
       
   110     dictFile = f;
       
   111 }
       
   112 
       
   113 void Index::setDocListFile( const QString &f )
       
   114 {
       
   115     docListFile = f;
       
   116 }
       
   117 
       
   118 void Index::setDocList( const QStringList &lst )
       
   119 {
       
   120     docList = lst;
       
   121 }
       
   122 
       
   123 int Index::makeIndex()
       
   124 {
       
   125     if ( !alreadyHaveDocList )
       
   126         setupDocumentList();
       
   127     if ( docList.isEmpty() )
       
   128         return 1;
       
   129     QStringList::Iterator it = docList.begin();
       
   130     int steps = docList.count() / 100;
       
   131     if ( !steps )
       
   132         steps++;
       
   133     int prog = 0;
       
   134     for ( int i = 0; it != docList.end(); ++it, ++i ) {
       
   135         if ( lastWindowClosed ) {
       
   136             return -1;
       
   137         }
       
   138         QUrl url(*it);
       
   139         parseDocument( url.toLocalFile(), i );
       
   140         if ( i%steps == 0 ) {
       
   141             prog++;
       
   142             emit indexingProgress( prog );
       
   143         }
       
   144     }
       
   145     return 0;
       
   146 }
       
   147 
       
   148 void Index::setupDocumentList()
       
   149 {
       
   150     QDir d( docPath );
       
   151     QStringList filters;
       
   152     filters.append(QLatin1String("*.html"));
       
   153     QStringList lst = d.entryList(filters);
       
   154     QStringList::ConstIterator it = lst.constBegin();
       
   155     for ( ; it != lst.constEnd(); ++it )
       
   156         docList.append( QLatin1String("file:") + docPath + QLatin1String("/") + *it );
       
   157 }
       
   158 
       
   159 void Index::insertInDict( const QString &str, int docNum )
       
   160 {
       
   161     if ( str == QLatin1String("amp") || str == QLatin1String("nbsp"))
       
   162         return;
       
   163     Entry *e = 0;
       
   164     if ( dict.count() )
       
   165         e = dict[ str ];
       
   166 
       
   167     if ( e ) {
       
   168         if ( e->documents.last().docNumber != docNum )
       
   169             e->documents.append( Document(docNum, 1 ) );
       
   170         else
       
   171             e->documents.last().frequency++;
       
   172     } else {
       
   173         dict.insert( str, new Entry( docNum ) );
       
   174     }
       
   175 }
       
   176 
       
   177 QString Index::getCharsetForDocument(QFile *file)
       
   178 {
       
   179     QTextStream s(file);
       
   180     QString contents = s.readAll();
       
   181 
       
   182     QString encoding;
       
   183     int start = contents.indexOf(QLatin1String("<meta"), 0, Qt::CaseInsensitive);
       
   184     if (start > 0) {
       
   185         int end = contents.indexOf(QLatin1String(">"), start);
       
   186         QString meta = contents.mid(start+5, end-start);
       
   187         meta = meta.toLower();
       
   188         QRegExp r(QLatin1String("charset=([^\"\\s]+)"));
       
   189         if (r.indexIn(meta) != -1) {
       
   190             encoding = r.cap(1);        
       
   191         }
       
   192     }
       
   193 
       
   194     file->seek(0);
       
   195     if (encoding.isEmpty())
       
   196         return QLatin1String("utf-8");
       
   197     return encoding;
       
   198 }
       
   199 
       
   200 void Index::parseDocument( const QString &filename, int docNum )
       
   201 {
       
   202     QFile file( filename );
       
   203     if ( !file.open(QFile::ReadOnly) ) {
       
   204         qWarning( "can not open file %s", qPrintable(filename) );
       
   205         return;
       
   206     }
       
   207 
       
   208     QTextStream s(&file);
       
   209     QString en = getCharsetForDocument(&file);
       
   210     s.setCodec(QTextCodec::codecForName(en.toLatin1().constData()));
       
   211 
       
   212     QString text = s.readAll();
       
   213     if (text.isNull())
       
   214         return;
       
   215 
       
   216     bool valid = true;
       
   217     const QChar *buf = text.unicode();
       
   218     QChar str[64];
       
   219     QChar c = buf[0];
       
   220     int j = 0;
       
   221     int i = 0;
       
   222     while ( j < text.length() ) {
       
   223         if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
       
   224             valid = false;
       
   225             if ( i > 1 )
       
   226                 insertInDict( QString(str,i), docNum );
       
   227             i = 0;
       
   228             c = buf[++j];
       
   229             continue;
       
   230         }
       
   231         if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
       
   232             valid = true;
       
   233             c = buf[++j];
       
   234             continue;
       
   235         }
       
   236         if ( !valid ) {
       
   237             c = buf[++j];
       
   238             continue;
       
   239         }
       
   240         if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
       
   241             str[i] = c.toLower();
       
   242             ++i;
       
   243         } else {
       
   244             if ( i > 1 )
       
   245                 insertInDict( QString(str,i), docNum );
       
   246             i = 0;
       
   247         }
       
   248         c = buf[++j];
       
   249     }
       
   250     if ( i > 1 )
       
   251         insertInDict( QString(str,i), docNum );
       
   252     file.close();
       
   253 }
       
   254 
       
   255 void Index::writeDict()
       
   256 {
       
   257     QFile f( dictFile );
       
   258     if ( !f.open(QFile::WriteOnly ) )
       
   259         return;
       
   260     QDataStream s( &f );
       
   261     for(QHash<QString, Entry *>::Iterator it = dict.begin(); it != dict.end(); ++it) {
       
   262         s << it.key();
       
   263         s << it.value()->documents.count();
       
   264         s << it.value()->documents;
       
   265     }
       
   266     f.close();
       
   267     writeDocumentList();
       
   268 }
       
   269 
       
   270 void Index::writeDocumentList()
       
   271 {
       
   272     QFile f( docListFile );
       
   273     if ( !f.open(QFile::WriteOnly ) )
       
   274         return;
       
   275     QDataStream s( &f );
       
   276     s << docList;
       
   277 }
       
   278 
       
   279 void Index::readDict()
       
   280 {
       
   281     QFile f( dictFile );
       
   282     if ( !f.open(QFile::ReadOnly ) )
       
   283         return;
       
   284 
       
   285     dict.clear();
       
   286     QDataStream s( &f );
       
   287     QString key;
       
   288     int numOfDocs;
       
   289     QVector<Document> docs;
       
   290     while ( !s.atEnd() ) {
       
   291         s >> key;
       
   292         s >> numOfDocs;
       
   293         docs.resize(numOfDocs);
       
   294         s >> docs;
       
   295         dict.insert( key, new Entry( docs ) );
       
   296     }
       
   297     f.close();
       
   298     readDocumentList();
       
   299 }
       
   300 
       
   301 void Index::readDocumentList()
       
   302 {
       
   303     QFile f( docListFile );
       
   304     if ( !f.open(QFile::ReadOnly ) )
       
   305         return;
       
   306     QDataStream s( &f );
       
   307     s >> docList;
       
   308 }
       
   309 
       
   310 QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
       
   311 {
       
   312     QList<Term> termList;
       
   313     for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it ) {
       
   314         Entry *e = 0;
       
   315         if ( (*it).contains(QLatin1Char('*')) ) {
       
   316             QVector<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
       
   317             termList.append( Term(QLatin1String("dummy"), wcts.count(), wcts ) );
       
   318         } else if ( dict[ *it ] ) {
       
   319             e = dict[ *it ];
       
   320             termList.append( Term( *it, e->documents.count(), e->documents ) );
       
   321         } else {
       
   322             return QStringList();
       
   323         }
       
   324     }
       
   325     if ( !termList.count() )
       
   326         return QStringList();
       
   327     qSort(termList);
       
   328 
       
   329     QVector<Document> minDocs = termList.takeFirst().documents;
       
   330     for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   331         Term *t = &(*it);
       
   332         QVector<Document> docs = t->documents;
       
   333         for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
       
   334             bool found = false;
       
   335             for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
       
   336                 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
       
   337                     (*minDoc_it).frequency += (*doc_it).frequency;
       
   338                     found = true;
       
   339                     break;
       
   340                 }
       
   341             }
       
   342             if ( !found )
       
   343                 minDoc_it = minDocs.erase( minDoc_it );
       
   344             else
       
   345                 ++minDoc_it;
       
   346         }
       
   347     }
       
   348 
       
   349     QStringList results;
       
   350     qSort( minDocs );
       
   351     if ( termSeq.isEmpty() ) {
       
   352         for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it)
       
   353             results << docList.at((int)(*it).docNumber);
       
   354         return results;
       
   355     }
       
   356 
       
   357     QString fileName;
       
   358     for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) {
       
   359         fileName =  docList[ (int)(*it).docNumber ];
       
   360         if ( searchForPattern( termSeq, seqWords, fileName ) )
       
   361             results << fileName;
       
   362     }
       
   363     return results;
       
   364 }
       
   365 
       
   366 QString Index::getDocumentTitle( const QString &fullFileName )
       
   367 {
       
   368     QUrl url(fullFileName);
       
   369     QString fileName = url.toLocalFile();
       
   370 
       
   371     if (documentTitleCache.contains(fileName))
       
   372         return documentTitleCache.value(fileName);
       
   373 
       
   374     QFile file( fileName );
       
   375     if ( !file.open( QFile::ReadOnly ) ) {
       
   376         qWarning( "cannot open file %s", qPrintable(fileName) );
       
   377         return fileName;
       
   378     }
       
   379     QTextStream s( &file );
       
   380     QString text = s.readAll();
       
   381 
       
   382     int start = text.indexOf(QLatin1String("<title>"), 0, Qt::CaseInsensitive) + 7;
       
   383     int end = text.indexOf(QLatin1String("</title>"), 0, Qt::CaseInsensitive);
       
   384 
       
   385     QString title = tr("Untitled");
       
   386     if (end - start > 0) {
       
   387         title = text.mid(start, end - start);
       
   388         if (Qt::mightBeRichText(title)) {
       
   389             QTextDocument doc;
       
   390             doc.setHtml(title);
       
   391             title = doc.toPlainText();
       
   392         }
       
   393     }
       
   394     documentTitleCache.insert(fileName, title);
       
   395     return title;
       
   396 }
       
   397 
       
   398 QStringList Index::getWildcardTerms( const QString &term )
       
   399 {
       
   400     QStringList lst;
       
   401     QStringList terms = split( term );
       
   402     QStringList::Iterator iter;
       
   403 
       
   404     for(QHash<QString, Entry*>::Iterator it = dict.begin(); it != dict.end(); ++it) {
       
   405         int index = 0;
       
   406         bool found = false;
       
   407         QString text( it.key() );
       
   408         for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
       
   409             if ( *iter == QLatin1String("*") ) {
       
   410                 found = true;
       
   411                 continue;
       
   412             }
       
   413             if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
       
   414                 found = false;
       
   415                 break;
       
   416             }
       
   417             index = text.indexOf( *iter, index );
       
   418             if ( *iter == terms.last() && index != (int)text.length()-1 ) {
       
   419                 index = text.lastIndexOf( *iter );
       
   420                 if ( index != (int)text.length() - (int)(*iter).length() ) {
       
   421                     found = false;
       
   422                     break;
       
   423                 }
       
   424             }
       
   425             if ( index != -1 ) {
       
   426                 found = true;
       
   427                 index += (*iter).length();
       
   428                 continue;
       
   429             } else {
       
   430                 found = false;
       
   431                 break;
       
   432             }
       
   433         }
       
   434         if ( found )
       
   435             lst << text;
       
   436     }
       
   437 
       
   438     return lst;
       
   439 }
       
   440 
       
   441 QStringList Index::split( const QString &str )
       
   442 {
       
   443     QStringList lst;
       
   444     int j = 0;
       
   445     int i = str.indexOf(QLatin1Char('*'), j );
       
   446 
       
   447     if (str.startsWith(QLatin1String("*")))
       
   448         lst << QLatin1String("*");
       
   449 
       
   450     while ( i != -1 ) {
       
   451         if ( i > j && i <= (int)str.length() ) {
       
   452             lst << str.mid( j, i - j );
       
   453             lst << QLatin1String("*");
       
   454         }
       
   455         j = i + 1;
       
   456         i = str.indexOf(QLatin1Char('*'), j );
       
   457     }
       
   458 
       
   459     int l = str.length() - 1;
       
   460     if ( str.mid( j, l - j + 1 ).length() > 0 )
       
   461         lst << str.mid( j, l - j + 1 );
       
   462 
       
   463     return lst;
       
   464 }
       
   465 
       
   466 QVector<Document> Index::setupDummyTerm( const QStringList &terms )
       
   467 {
       
   468     QList<Term> termList;
       
   469     for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
       
   470         Entry *e = 0;
       
   471         if ( dict[ *it ] ) {
       
   472             e = dict[ *it ];
       
   473             termList.append( Term( *it, e->documents.count(), e->documents ) );
       
   474         }
       
   475     }
       
   476     QVector<Document> maxList(0);
       
   477     if ( !termList.count() )
       
   478         return maxList;
       
   479     qSort(termList);
       
   480 
       
   481     maxList = termList.takeLast().documents;
       
   482     for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
       
   483         Term *t = &(*it);
       
   484         QVector<Document> docs = t->documents;
       
   485         for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
       
   486             if ( maxList.indexOf( *docIt ) == -1 )
       
   487                 maxList.append( *docIt );
       
   488         }
       
   489     }
       
   490     return maxList;
       
   491 }
       
   492 
       
   493 void Index::buildMiniDict( const QString &str )
       
   494 {
       
   495     if ( miniDict[ str ] )
       
   496         miniDict[ str ]->positions.append( wordNum );
       
   497     ++wordNum;
       
   498 }
       
   499 
       
   500 bool Index::searchForPattern( const QStringList &patterns, const QStringList &words, const QString &fileName )
       
   501 {
       
   502     QUrl url(fileName);
       
   503     QString fName = url.toLocalFile();
       
   504     QFile file( fName );
       
   505     if ( !file.open( QFile::ReadOnly ) ) {
       
   506         qWarning( "cannot open file %s", qPrintable(fName) );
       
   507         return false;
       
   508     }
       
   509 
       
   510     wordNum = 3;
       
   511     miniDict.clear();
       
   512     QStringList::ConstIterator cIt = words.begin();
       
   513     for ( ; cIt != words.end(); ++cIt )
       
   514         miniDict.insert( *cIt, new PosEntry( 0 ) );
       
   515 
       
   516     QTextStream s( &file );
       
   517     QString text = s.readAll();
       
   518     bool valid = true;
       
   519     const QChar *buf = text.unicode();
       
   520     QChar str[64];
       
   521     QChar c = buf[0];
       
   522     int j = 0;
       
   523     int i = 0;
       
   524     while ( j < text.length() ) {
       
   525         if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
       
   526             valid = false;
       
   527             if ( i > 1 )
       
   528                 buildMiniDict( QString(str,i) );
       
   529             i = 0;
       
   530             c = buf[++j];
       
   531             continue;
       
   532         }
       
   533         if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
       
   534             valid = true;
       
   535             c = buf[++j];
       
   536             continue;
       
   537         }
       
   538         if ( !valid ) {
       
   539             c = buf[++j];
       
   540             continue;
       
   541         }
       
   542         if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
       
   543             str[i] = c.toLower();
       
   544             ++i;
       
   545         } else {
       
   546             if ( i > 1 )
       
   547                 buildMiniDict( QString(str,i) );
       
   548             i = 0;
       
   549         }
       
   550         c = buf[++j];
       
   551     }
       
   552     if ( i > 1 )
       
   553         buildMiniDict( QString(str,i) );
       
   554     file.close();
       
   555 
       
   556     QStringList::ConstIterator patIt = patterns.begin();
       
   557     QStringList wordLst;
       
   558     QList<uint> a, b;
       
   559     QList<uint>::iterator aIt;
       
   560     for ( ; patIt != patterns.end(); ++patIt ) {
       
   561         wordLst = (*patIt).split(QLatin1Char(' '));
       
   562         a = miniDict[ wordLst[0] ]->positions;
       
   563         for ( int j = 1; j < (int)wordLst.count(); ++j ) {
       
   564             b = miniDict[ wordLst[j] ]->positions;
       
   565             aIt = a.begin();
       
   566             while ( aIt != a.end() ) {
       
   567                 if ( b.contains( *aIt + 1 )) {
       
   568                     (*aIt)++;
       
   569                     ++aIt;
       
   570                 } else {
       
   571                     aIt = a.erase( aIt );
       
   572                 }
       
   573             }
       
   574         }
       
   575     }
       
   576     if ( a.count() )
       
   577         return true;
       
   578     return false;
       
   579 }
       
   580 
       
   581 QT_END_NAMESPACE