|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the Qt Assistant of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 |
|
42 #include "index.h" |
|
43 |
|
44 #include <QFile> |
|
45 #include <QDir> |
|
46 #include <QStringList> |
|
47 #include <QApplication> |
|
48 #include <QByteArray> |
|
49 #include <QTextStream> |
|
50 #include <QtAlgorithms> |
|
51 #include <QUrl> |
|
52 #include <QTextCodec> |
|
53 #include <ctype.h> |
|
54 #include <QTextDocument> |
|
55 |
|
56 QT_BEGIN_NAMESPACE |
|
57 |
|
58 struct Term { |
|
59 Term() : frequency(-1) {} |
|
60 Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {} |
|
61 QString term; |
|
62 int frequency; |
|
63 QVector<Document>documents; |
|
64 bool operator<( const Term &i2 ) const { return frequency < i2.frequency; } |
|
65 }; |
|
66 |
|
67 QDataStream &operator>>( QDataStream &s, Document &l ) |
|
68 { |
|
69 s >> l.docNumber; |
|
70 s >> l.frequency; |
|
71 return s; |
|
72 } |
|
73 |
|
74 QDataStream &operator<<( QDataStream &s, const Document &l ) |
|
75 { |
|
76 s << (qint16)l.docNumber; |
|
77 s << (qint16)l.frequency; |
|
78 return s; |
|
79 } |
|
80 |
|
81 Index::Index( const QString &dp, const QString &hp ) |
|
82 : QObject( 0 ), docPath( dp ) |
|
83 { |
|
84 Q_UNUSED(hp); |
|
85 |
|
86 alreadyHaveDocList = false; |
|
87 lastWindowClosed = false; |
|
88 connect( qApp, SIGNAL(lastWindowClosed()), |
|
89 this, SLOT(setLastWinClosed()) ); |
|
90 } |
|
91 |
|
92 Index::Index( const QStringList &dl, const QString &hp ) |
|
93 : QObject( 0 ) |
|
94 { |
|
95 Q_UNUSED(hp); |
|
96 docList = dl; |
|
97 alreadyHaveDocList = true; |
|
98 lastWindowClosed = false; |
|
99 connect( qApp, SIGNAL(lastWindowClosed()), |
|
100 this, SLOT(setLastWinClosed()) ); |
|
101 } |
|
102 |
|
103 void Index::setLastWinClosed() |
|
104 { |
|
105 lastWindowClosed = true; |
|
106 } |
|
107 |
|
108 void Index::setDictionaryFile( const QString &f ) |
|
109 { |
|
110 dictFile = f; |
|
111 } |
|
112 |
|
113 void Index::setDocListFile( const QString &f ) |
|
114 { |
|
115 docListFile = f; |
|
116 } |
|
117 |
|
118 void Index::setDocList( const QStringList &lst ) |
|
119 { |
|
120 docList = lst; |
|
121 } |
|
122 |
|
123 int Index::makeIndex() |
|
124 { |
|
125 if ( !alreadyHaveDocList ) |
|
126 setupDocumentList(); |
|
127 if ( docList.isEmpty() ) |
|
128 return 1; |
|
129 QStringList::Iterator it = docList.begin(); |
|
130 int steps = docList.count() / 100; |
|
131 if ( !steps ) |
|
132 steps++; |
|
133 int prog = 0; |
|
134 for ( int i = 0; it != docList.end(); ++it, ++i ) { |
|
135 if ( lastWindowClosed ) { |
|
136 return -1; |
|
137 } |
|
138 QUrl url(*it); |
|
139 parseDocument( url.toLocalFile(), i ); |
|
140 if ( i%steps == 0 ) { |
|
141 prog++; |
|
142 emit indexingProgress( prog ); |
|
143 } |
|
144 } |
|
145 return 0; |
|
146 } |
|
147 |
|
148 void Index::setupDocumentList() |
|
149 { |
|
150 QDir d( docPath ); |
|
151 QStringList filters; |
|
152 filters.append(QLatin1String("*.html")); |
|
153 QStringList lst = d.entryList(filters); |
|
154 QStringList::ConstIterator it = lst.constBegin(); |
|
155 for ( ; it != lst.constEnd(); ++it ) |
|
156 docList.append( QLatin1String("file:") + docPath + QLatin1String("/") + *it ); |
|
157 } |
|
158 |
|
159 void Index::insertInDict( const QString &str, int docNum ) |
|
160 { |
|
161 if ( str == QLatin1String("amp") || str == QLatin1String("nbsp")) |
|
162 return; |
|
163 Entry *e = 0; |
|
164 if ( dict.count() ) |
|
165 e = dict[ str ]; |
|
166 |
|
167 if ( e ) { |
|
168 if ( e->documents.last().docNumber != docNum ) |
|
169 e->documents.append( Document(docNum, 1 ) ); |
|
170 else |
|
171 e->documents.last().frequency++; |
|
172 } else { |
|
173 dict.insert( str, new Entry( docNum ) ); |
|
174 } |
|
175 } |
|
176 |
|
177 QString Index::getCharsetForDocument(QFile *file) |
|
178 { |
|
179 QTextStream s(file); |
|
180 QString contents = s.readAll(); |
|
181 |
|
182 QString encoding; |
|
183 int start = contents.indexOf(QLatin1String("<meta"), 0, Qt::CaseInsensitive); |
|
184 if (start > 0) { |
|
185 int end = contents.indexOf(QLatin1String(">"), start); |
|
186 QString meta = contents.mid(start+5, end-start); |
|
187 meta = meta.toLower(); |
|
188 QRegExp r(QLatin1String("charset=([^\"\\s]+)")); |
|
189 if (r.indexIn(meta) != -1) { |
|
190 encoding = r.cap(1); |
|
191 } |
|
192 } |
|
193 |
|
194 file->seek(0); |
|
195 if (encoding.isEmpty()) |
|
196 return QLatin1String("utf-8"); |
|
197 return encoding; |
|
198 } |
|
199 |
|
200 void Index::parseDocument( const QString &filename, int docNum ) |
|
201 { |
|
202 QFile file( filename ); |
|
203 if ( !file.open(QFile::ReadOnly) ) { |
|
204 qWarning( "can not open file %s", qPrintable(filename) ); |
|
205 return; |
|
206 } |
|
207 |
|
208 QTextStream s(&file); |
|
209 QString en = getCharsetForDocument(&file); |
|
210 s.setCodec(QTextCodec::codecForName(en.toLatin1().constData())); |
|
211 |
|
212 QString text = s.readAll(); |
|
213 if (text.isNull()) |
|
214 return; |
|
215 |
|
216 bool valid = true; |
|
217 const QChar *buf = text.unicode(); |
|
218 QChar str[64]; |
|
219 QChar c = buf[0]; |
|
220 int j = 0; |
|
221 int i = 0; |
|
222 while ( j < text.length() ) { |
|
223 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) { |
|
224 valid = false; |
|
225 if ( i > 1 ) |
|
226 insertInDict( QString(str,i), docNum ); |
|
227 i = 0; |
|
228 c = buf[++j]; |
|
229 continue; |
|
230 } |
|
231 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) { |
|
232 valid = true; |
|
233 c = buf[++j]; |
|
234 continue; |
|
235 } |
|
236 if ( !valid ) { |
|
237 c = buf[++j]; |
|
238 continue; |
|
239 } |
|
240 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) { |
|
241 str[i] = c.toLower(); |
|
242 ++i; |
|
243 } else { |
|
244 if ( i > 1 ) |
|
245 insertInDict( QString(str,i), docNum ); |
|
246 i = 0; |
|
247 } |
|
248 c = buf[++j]; |
|
249 } |
|
250 if ( i > 1 ) |
|
251 insertInDict( QString(str,i), docNum ); |
|
252 file.close(); |
|
253 } |
|
254 |
|
255 void Index::writeDict() |
|
256 { |
|
257 QFile f( dictFile ); |
|
258 if ( !f.open(QFile::WriteOnly ) ) |
|
259 return; |
|
260 QDataStream s( &f ); |
|
261 for(QHash<QString, Entry *>::Iterator it = dict.begin(); it != dict.end(); ++it) { |
|
262 s << it.key(); |
|
263 s << it.value()->documents.count(); |
|
264 s << it.value()->documents; |
|
265 } |
|
266 f.close(); |
|
267 writeDocumentList(); |
|
268 } |
|
269 |
|
270 void Index::writeDocumentList() |
|
271 { |
|
272 QFile f( docListFile ); |
|
273 if ( !f.open(QFile::WriteOnly ) ) |
|
274 return; |
|
275 QDataStream s( &f ); |
|
276 s << docList; |
|
277 } |
|
278 |
|
279 void Index::readDict() |
|
280 { |
|
281 QFile f( dictFile ); |
|
282 if ( !f.open(QFile::ReadOnly ) ) |
|
283 return; |
|
284 |
|
285 dict.clear(); |
|
286 QDataStream s( &f ); |
|
287 QString key; |
|
288 int numOfDocs; |
|
289 QVector<Document> docs; |
|
290 while ( !s.atEnd() ) { |
|
291 s >> key; |
|
292 s >> numOfDocs; |
|
293 docs.resize(numOfDocs); |
|
294 s >> docs; |
|
295 dict.insert( key, new Entry( docs ) ); |
|
296 } |
|
297 f.close(); |
|
298 readDocumentList(); |
|
299 } |
|
300 |
|
301 void Index::readDocumentList() |
|
302 { |
|
303 QFile f( docListFile ); |
|
304 if ( !f.open(QFile::ReadOnly ) ) |
|
305 return; |
|
306 QDataStream s( &f ); |
|
307 s >> docList; |
|
308 } |
|
309 |
|
310 QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords ) |
|
311 { |
|
312 QList<Term> termList; |
|
313 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it ) { |
|
314 Entry *e = 0; |
|
315 if ( (*it).contains(QLatin1Char('*')) ) { |
|
316 QVector<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) ); |
|
317 termList.append( Term(QLatin1String("dummy"), wcts.count(), wcts ) ); |
|
318 } else if ( dict[ *it ] ) { |
|
319 e = dict[ *it ]; |
|
320 termList.append( Term( *it, e->documents.count(), e->documents ) ); |
|
321 } else { |
|
322 return QStringList(); |
|
323 } |
|
324 } |
|
325 if ( !termList.count() ) |
|
326 return QStringList(); |
|
327 qSort(termList); |
|
328 |
|
329 QVector<Document> minDocs = termList.takeFirst().documents; |
|
330 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) { |
|
331 Term *t = &(*it); |
|
332 QVector<Document> docs = t->documents; |
|
333 for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) { |
|
334 bool found = false; |
|
335 for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) { |
|
336 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) { |
|
337 (*minDoc_it).frequency += (*doc_it).frequency; |
|
338 found = true; |
|
339 break; |
|
340 } |
|
341 } |
|
342 if ( !found ) |
|
343 minDoc_it = minDocs.erase( minDoc_it ); |
|
344 else |
|
345 ++minDoc_it; |
|
346 } |
|
347 } |
|
348 |
|
349 QStringList results; |
|
350 qSort( minDocs ); |
|
351 if ( termSeq.isEmpty() ) { |
|
352 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) |
|
353 results << docList.at((int)(*it).docNumber); |
|
354 return results; |
|
355 } |
|
356 |
|
357 QString fileName; |
|
358 for(QVector<Document>::Iterator it = minDocs.begin(); it != minDocs.end(); ++it) { |
|
359 fileName = docList[ (int)(*it).docNumber ]; |
|
360 if ( searchForPattern( termSeq, seqWords, fileName ) ) |
|
361 results << fileName; |
|
362 } |
|
363 return results; |
|
364 } |
|
365 |
|
366 QString Index::getDocumentTitle( const QString &fullFileName ) |
|
367 { |
|
368 QUrl url(fullFileName); |
|
369 QString fileName = url.toLocalFile(); |
|
370 |
|
371 if (documentTitleCache.contains(fileName)) |
|
372 return documentTitleCache.value(fileName); |
|
373 |
|
374 QFile file( fileName ); |
|
375 if ( !file.open( QFile::ReadOnly ) ) { |
|
376 qWarning( "cannot open file %s", qPrintable(fileName) ); |
|
377 return fileName; |
|
378 } |
|
379 QTextStream s( &file ); |
|
380 QString text = s.readAll(); |
|
381 |
|
382 int start = text.indexOf(QLatin1String("<title>"), 0, Qt::CaseInsensitive) + 7; |
|
383 int end = text.indexOf(QLatin1String("</title>"), 0, Qt::CaseInsensitive); |
|
384 |
|
385 QString title = tr("Untitled"); |
|
386 if (end - start > 0) { |
|
387 title = text.mid(start, end - start); |
|
388 if (Qt::mightBeRichText(title)) { |
|
389 QTextDocument doc; |
|
390 doc.setHtml(title); |
|
391 title = doc.toPlainText(); |
|
392 } |
|
393 } |
|
394 documentTitleCache.insert(fileName, title); |
|
395 return title; |
|
396 } |
|
397 |
|
398 QStringList Index::getWildcardTerms( const QString &term ) |
|
399 { |
|
400 QStringList lst; |
|
401 QStringList terms = split( term ); |
|
402 QStringList::Iterator iter; |
|
403 |
|
404 for(QHash<QString, Entry*>::Iterator it = dict.begin(); it != dict.end(); ++it) { |
|
405 int index = 0; |
|
406 bool found = false; |
|
407 QString text( it.key() ); |
|
408 for ( iter = terms.begin(); iter != terms.end(); ++iter ) { |
|
409 if ( *iter == QLatin1String("*") ) { |
|
410 found = true; |
|
411 continue; |
|
412 } |
|
413 if ( iter == terms.begin() && (*iter)[0] != text[0] ) { |
|
414 found = false; |
|
415 break; |
|
416 } |
|
417 index = text.indexOf( *iter, index ); |
|
418 if ( *iter == terms.last() && index != (int)text.length()-1 ) { |
|
419 index = text.lastIndexOf( *iter ); |
|
420 if ( index != (int)text.length() - (int)(*iter).length() ) { |
|
421 found = false; |
|
422 break; |
|
423 } |
|
424 } |
|
425 if ( index != -1 ) { |
|
426 found = true; |
|
427 index += (*iter).length(); |
|
428 continue; |
|
429 } else { |
|
430 found = false; |
|
431 break; |
|
432 } |
|
433 } |
|
434 if ( found ) |
|
435 lst << text; |
|
436 } |
|
437 |
|
438 return lst; |
|
439 } |
|
440 |
|
441 QStringList Index::split( const QString &str ) |
|
442 { |
|
443 QStringList lst; |
|
444 int j = 0; |
|
445 int i = str.indexOf(QLatin1Char('*'), j ); |
|
446 |
|
447 if (str.startsWith(QLatin1String("*"))) |
|
448 lst << QLatin1String("*"); |
|
449 |
|
450 while ( i != -1 ) { |
|
451 if ( i > j && i <= (int)str.length() ) { |
|
452 lst << str.mid( j, i - j ); |
|
453 lst << QLatin1String("*"); |
|
454 } |
|
455 j = i + 1; |
|
456 i = str.indexOf(QLatin1Char('*'), j ); |
|
457 } |
|
458 |
|
459 int l = str.length() - 1; |
|
460 if ( str.mid( j, l - j + 1 ).length() > 0 ) |
|
461 lst << str.mid( j, l - j + 1 ); |
|
462 |
|
463 return lst; |
|
464 } |
|
465 |
|
466 QVector<Document> Index::setupDummyTerm( const QStringList &terms ) |
|
467 { |
|
468 QList<Term> termList; |
|
469 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) { |
|
470 Entry *e = 0; |
|
471 if ( dict[ *it ] ) { |
|
472 e = dict[ *it ]; |
|
473 termList.append( Term( *it, e->documents.count(), e->documents ) ); |
|
474 } |
|
475 } |
|
476 QVector<Document> maxList(0); |
|
477 if ( !termList.count() ) |
|
478 return maxList; |
|
479 qSort(termList); |
|
480 |
|
481 maxList = termList.takeLast().documents; |
|
482 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) { |
|
483 Term *t = &(*it); |
|
484 QVector<Document> docs = t->documents; |
|
485 for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) { |
|
486 if ( maxList.indexOf( *docIt ) == -1 ) |
|
487 maxList.append( *docIt ); |
|
488 } |
|
489 } |
|
490 return maxList; |
|
491 } |
|
492 |
|
493 void Index::buildMiniDict( const QString &str ) |
|
494 { |
|
495 if ( miniDict[ str ] ) |
|
496 miniDict[ str ]->positions.append( wordNum ); |
|
497 ++wordNum; |
|
498 } |
|
499 |
|
500 bool Index::searchForPattern( const QStringList &patterns, const QStringList &words, const QString &fileName ) |
|
501 { |
|
502 QUrl url(fileName); |
|
503 QString fName = url.toLocalFile(); |
|
504 QFile file( fName ); |
|
505 if ( !file.open( QFile::ReadOnly ) ) { |
|
506 qWarning( "cannot open file %s", qPrintable(fName) ); |
|
507 return false; |
|
508 } |
|
509 |
|
510 wordNum = 3; |
|
511 miniDict.clear(); |
|
512 QStringList::ConstIterator cIt = words.begin(); |
|
513 for ( ; cIt != words.end(); ++cIt ) |
|
514 miniDict.insert( *cIt, new PosEntry( 0 ) ); |
|
515 |
|
516 QTextStream s( &file ); |
|
517 QString text = s.readAll(); |
|
518 bool valid = true; |
|
519 const QChar *buf = text.unicode(); |
|
520 QChar str[64]; |
|
521 QChar c = buf[0]; |
|
522 int j = 0; |
|
523 int i = 0; |
|
524 while ( j < text.length() ) { |
|
525 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) { |
|
526 valid = false; |
|
527 if ( i > 1 ) |
|
528 buildMiniDict( QString(str,i) ); |
|
529 i = 0; |
|
530 c = buf[++j]; |
|
531 continue; |
|
532 } |
|
533 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) { |
|
534 valid = true; |
|
535 c = buf[++j]; |
|
536 continue; |
|
537 } |
|
538 if ( !valid ) { |
|
539 c = buf[++j]; |
|
540 continue; |
|
541 } |
|
542 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) { |
|
543 str[i] = c.toLower(); |
|
544 ++i; |
|
545 } else { |
|
546 if ( i > 1 ) |
|
547 buildMiniDict( QString(str,i) ); |
|
548 i = 0; |
|
549 } |
|
550 c = buf[++j]; |
|
551 } |
|
552 if ( i > 1 ) |
|
553 buildMiniDict( QString(str,i) ); |
|
554 file.close(); |
|
555 |
|
556 QStringList::ConstIterator patIt = patterns.begin(); |
|
557 QStringList wordLst; |
|
558 QList<uint> a, b; |
|
559 QList<uint>::iterator aIt; |
|
560 for ( ; patIt != patterns.end(); ++patIt ) { |
|
561 wordLst = (*patIt).split(QLatin1Char(' ')); |
|
562 a = miniDict[ wordLst[0] ]->positions; |
|
563 for ( int j = 1; j < (int)wordLst.count(); ++j ) { |
|
564 b = miniDict[ wordLst[j] ]->positions; |
|
565 aIt = a.begin(); |
|
566 while ( aIt != a.end() ) { |
|
567 if ( b.contains( *aIt + 1 )) { |
|
568 (*aIt)++; |
|
569 ++aIt; |
|
570 } else { |
|
571 aIt = a.erase( aIt ); |
|
572 } |
|
573 } |
|
574 } |
|
575 } |
|
576 if ( a.count() ) |
|
577 return true; |
|
578 return false; |
|
579 } |
|
580 |
|
581 QT_END_NAMESPACE |