tools/linguist/shared/simtexth.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Linguist of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "simtexth.h"
       
    43 #include "translator.h"
       
    44 
       
    45 #include <QtCore/QByteArray>
       
    46 #include <QtCore/QString>
       
    47 #include <QtCore/QList>
       
    48 
       
    49 
       
    50 QT_BEGIN_NAMESPACE
       
    51 
       
    52 typedef QList<TranslatorMessage> TML;
       
    53 
       
    54 /*
       
    55   How similar are two texts?  The approach used here relies on co-occurrence
       
    56   matrices and is very efficient.
       
    57 
       
    58   Let's see with an example: how similar are "here" and "hither"?  The
       
    59   co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
       
    60   elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
       
    61   N[h,e] = 1, N[e,r] = 1, and 0 elsewhere.  The union U of both matrices is the
       
    62   matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
       
    63   V[i,j] = min { M[i,j], N[i,j] }.  The score for a pair of texts is
       
    64 
       
    65       score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
       
    66 
       
    67   a formula suggested by Arnt Gulbrandsen.  Here we have
       
    68 
       
    69       score = 2 / 6,
       
    70 
       
    71   or one third.
       
    72 
       
    73   The implementation differs from this in a few details.  Most importantly,
       
    74   repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
       
    75 */
       
    76 
       
    77 /*
       
    78   Every character is assigned to one of 20 buckets so that the co-occurrence
       
    79   matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
       
    80   more if we want the whole Unicode.  Which character falls in which bucket is
       
    81   arbitrary.
       
    82 
       
    83   The second half of the table is a replica of the first half, because of
       
    84   laziness.
       
    85 */
       
    86 static const int indexOf[256] = {
       
    87     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       
    88     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       
    89 //      !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
       
    90     0,  2,  6,  7,  10, 12, 15, 19, 2,  6,  7,  10, 12, 15, 19, 0,
       
    91 //  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
       
    92     1,  3,  4,  5,  8,  9,  11, 13, 14, 16, 2,  6,  7,  10, 12, 15,
       
    93 //  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
       
    94     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
       
    95 //  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
       
    96     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
       
    97 //  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
       
    98     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
       
    99 //  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~
       
   100     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
       
   101 
       
   102     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       
   103     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
       
   104     0,  2,  6,  7,  10, 12, 15, 19, 2,  6,  7,  10, 12, 15, 19, 0,
       
   105     1,  3,  4,  5,  8,  9,  11, 13, 14, 16, 2,  6,  7,  10, 12, 15,
       
   106     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
       
   107     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0,
       
   108     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  6,  10, 11, 12, 13, 14,
       
   109     15, 12, 16, 17, 18, 19, 2,  10, 15, 7,  19, 2,  6,  7,  10, 0
       
   110 };
       
   111 
       
   112 /*
       
   113   The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
       
   114   represent i in binary.
       
   115 */
       
   116 static const int bitCount[256] = {
       
   117     0,  1,  1,  2,  1,  2,  2,  3,  1,  2,  2,  3,  2,  3,  3,  4,
       
   118     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
       
   119     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
       
   120     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   121     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
       
   122     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   123     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   124     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
       
   125     1,  2,  2,  3,  2,  3,  3,  4,  2,  3,  3,  4,  3,  4,  4,  5,
       
   126     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   127     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   128     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
       
   129     2,  3,  3,  4,  3,  4,  4,  5,  3,  4,  4,  5,  4,  5,  5,  6,
       
   130     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
       
   131     3,  4,  4,  5,  4,  5,  5,  6,  4,  5,  5,  6,  5,  6,  6,  7,
       
   132     4,  5,  5,  6,  5,  6,  6,  7,  5,  6,  6,  7,  6,  7,  7,  8
       
   133 };
       
   134 
       
   135 struct CoMatrix
       
   136 {
       
   137     /*
       
   138       The matrix has 20 * 20 = 400 entries.  This requires 50 bytes, or 13
       
   139       words.  Some operations are performed on words for more efficiency.
       
   140     */
       
   141     union {
       
   142         quint8 b[52];
       
   143         quint32 w[13];
       
   144     };
       
   145 
       
   146     CoMatrix() { memset( b, 0, 52 ); }
       
   147 
       
   148     CoMatrix(const QString &str)
       
   149     {
       
   150         QByteArray ba = str.toUtf8();
       
   151         const char *text = ba.constData();
       
   152         char c = '\0', d;
       
   153         memset( b, 0, 52 );
       
   154         /*
       
   155           The Knuth books are not in the office only for show; they help make
       
   156           loops 30% faster and 20% as readable.
       
   157         */
       
   158         while ( (d = *text) != '\0' ) {
       
   159             setCoOccurence( c, d );
       
   160             if ( (c = *++text) != '\0' ) {
       
   161                 setCoOccurence( d, c );
       
   162                 text++;
       
   163             }
       
   164         }
       
   165     }
       
   166 
       
   167     void setCoOccurence( char c, char d ) {
       
   168         int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
       
   169         b[k >> 3] |= (1 << (k & 0x7));
       
   170     }
       
   171 
       
   172     int worth() const {
       
   173         int w = 0;
       
   174         for ( int i = 0; i < 50; i++ )
       
   175             w += bitCount[b[i]];
       
   176         return w;
       
   177     }
       
   178 };
       
   179 
       
   180 static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
       
   181 {
       
   182     CoMatrix p;
       
   183     for (int i = 0; i < 13; ++i)
       
   184         p.w[i] = m.w[i] | n.w[i];
       
   185     return p;
       
   186 }
       
   187 
       
   188 static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
       
   189 {
       
   190     CoMatrix p;
       
   191     for (int i = 0; i < 13; ++i)
       
   192         p.w[i] = m.w[i] & n.w[i];
       
   193     return p;
       
   194 }
       
   195 
       
   196 StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
       
   197 {
       
   198     m_cm = new CoMatrix(stringToMatch);
       
   199     m_length = stringToMatch.length();
       
   200 }
       
   201 
       
   202 int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
       
   203 {
       
   204     CoMatrix cmTarget(strCandidate);
       
   205     int delta = qAbs(m_length - strCandidate.size());
       
   206     int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
       
   207         ( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
       
   208     return score;
       
   209 }
       
   210 
       
   211 StringSimilarityMatcher::~StringSimilarityMatcher()
       
   212 {
       
   213     delete m_cm;
       
   214 }
       
   215 
       
   216 /**
       
   217  * Checks how similar two strings are.
       
   218  * The return value is the score, and a higher score is more similar
       
   219  * than one with a low score.
       
   220  * Linguist considers a score over 190 to be a good match.
       
   221  * \sa StringSimilarityMatcher
       
   222  */
       
   223 int getSimilarityScore(const QString &str1, const QString &str2)
       
   224 {
       
   225     CoMatrix cmTarget(str2);
       
   226     CoMatrix cm(str1);
       
   227     int delta = qAbs(str1.size() - str2.size());
       
   228 
       
   229     int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
       
   230         / ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
       
   231 
       
   232     return score;
       
   233 }
       
   234 
       
   235 CandidateList similarTextHeuristicCandidates(const Translator *tor,
       
   236     const QString &text, int maxCandidates)
       
   237 {
       
   238     QList<int> scores;
       
   239     CandidateList candidates;
       
   240 
       
   241     TML all = tor->translatedMessages();
       
   242 
       
   243     foreach (const TranslatorMessage &mtm, all) {
       
   244         if (mtm.type() == TranslatorMessage::Unfinished
       
   245             || mtm.translation().isEmpty())
       
   246             continue;
       
   247 
       
   248         QString s = mtm.sourceText();
       
   249         int score = getSimilarityScore(s, text);
       
   250 
       
   251         if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
       
   252             candidates.removeLast();
       
   253 
       
   254         if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
       
   255             Candidate cand( s, mtm.translation() );
       
   256 
       
   257             int i;
       
   258             for (i = 0; i < candidates.size(); i++) {
       
   259                 if (score >= scores.at(i)) {
       
   260                     if (score == scores.at(i)) {
       
   261                         if (candidates.at(i) == cand)
       
   262                             goto continue_outer_loop;
       
   263                     } else {
       
   264                         break;
       
   265                     }
       
   266                 }
       
   267             }
       
   268             scores.insert(i, score);
       
   269             candidates.insert(i, cand);
       
   270         }
       
   271         continue_outer_loop:
       
   272         ;
       
   273     }
       
   274     return candidates;
       
   275 }
       
   276 
       
   277 QT_END_NAMESPACE