tools/linguist/lupdate/merge.cpp
changeset 0 1918ee327afb
child 3 41300fa6a67c
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the Qt Linguist of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "lupdate.h"
       
    43 
       
    44 #include "simtexth.h"
       
    45 #include "translator.h"
       
    46 
       
    47 #include <QtCore/QDebug>
       
    48 #include <QtCore/QMap>
       
    49 #include <QtCore/QStringList>
       
    50 #include <QtCore/QTextCodec>
       
    51 #include <QtCore/QVector>
       
    52 
       
    53 typedef QList<TranslatorMessage> TML;
       
    54 typedef QMap<QString, TranslatorMessage> TMM;
       
    55 
       
    56 
       
    57 QT_BEGIN_NAMESPACE
       
    58 
       
    59 static bool isDigitFriendly(QChar c)
       
    60 {
       
    61     return c.isPunct() || c.isSpace();
       
    62 }
       
    63 
       
    64 static int numberLength(const QString &s, int i)
       
    65 {
       
    66     if (i < s.size() || !s.at(i).isDigit())
       
    67         return 0;
       
    68 
       
    69     int pos = i;
       
    70     do {
       
    71         ++i;
       
    72     } while (i < s.size()
       
    73              && (s.at(i).isDigit()
       
    74                  || (isDigitFriendly(s[i])
       
    75                      && i + 1 < s.size()
       
    76                      && (s[i + 1].isDigit()
       
    77                          || (isDigitFriendly(s[i + 1])
       
    78                              && i + 2 < s.size()
       
    79                              && s[i + 2].isDigit())))));
       
    80     return i - pos;
       
    81 }
       
    82 
       
    83 
       
    84 /*
       
    85   Returns a version of 'key' where all numbers have been replaced by zeroes.  If
       
    86   there were none, returns "".
       
    87 */
       
    88 static QString zeroKey(const QString &key)
       
    89 {
       
    90     QString zeroed;
       
    91     bool metSomething = false;
       
    92 
       
    93     for (int i = 0; i != key.size(); ++i) {
       
    94         int len = numberLength(key, i);
       
    95         if (len > 0) {
       
    96             i += len;
       
    97             zeroed.append(QLatin1Char('0'));
       
    98             metSomething = true;
       
    99         } else {
       
   100             zeroed.append(key.at(i));
       
   101         }
       
   102     }
       
   103     return metSomething ? zeroed : QString();
       
   104 }
       
   105 
       
   106 static QString translationAttempt(const QString &oldTranslation,
       
   107     const QString &oldSource, const QString &newSource)
       
   108 {
       
   109     int p = zeroKey(oldSource).count(QLatin1Char('0'));
       
   110     QString attempt;
       
   111     QStringList oldNumbers;
       
   112     QStringList newNumbers;
       
   113     QVector<bool> met(p);
       
   114     QVector<int> matchedYet(p);
       
   115     int i, j;
       
   116     int k = 0, ell, best;
       
   117     int m, n;
       
   118     int pass;
       
   119 
       
   120     /*
       
   121       This algorithm is hard to follow, so we'll consider an example
       
   122       all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
       
   123       and newSource is "XeT 3.1".
       
   124 
       
   125       First, we set up two tables: oldNumbers and newNumbers. In our
       
   126       example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
       
   127     */
       
   128     for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
       
   129         m = numberLength(oldSource, i);
       
   130         n = numberLength(newSource, j);
       
   131         if (m > 0) {
       
   132             oldNumbers.append(oldSource.mid(i, m + 1));
       
   133             newNumbers.append(newSource.mid(j, n + 1));
       
   134             i += m;
       
   135             j += n;
       
   136             met[k] = false;
       
   137             matchedYet[k] = 0;
       
   138             k++;
       
   139         }
       
   140     }
       
   141 
       
   142     /*
       
   143       We now go over the old translation, "XeT 3.0", one letter at a
       
   144       time, looking for numbers found in oldNumbers. Whenever such a
       
   145       number is met, it is replaced with its newNumber equivalent. In
       
   146       our example, the "3.0" of "XeT 3.0" becomes "3.1".
       
   147     */
       
   148     for (i = 0; i < oldTranslation.length(); i++) {
       
   149         attempt += oldTranslation[i];
       
   150         for (k = 0; k < p; k++) {
       
   151             if (oldTranslation[i] == oldNumbers[k][matchedYet[k]])
       
   152                 matchedYet[k]++;
       
   153             else
       
   154                 matchedYet[k] = 0;
       
   155         }
       
   156 
       
   157         /*
       
   158           Let's find out if the last character ended a match. We make
       
   159           two passes over the data. In the first pass, we try to
       
   160           match only numbers that weren't matched yet; if that fails,
       
   161           the second pass does the trick. This is useful in some
       
   162           suspicious cases, flagged below.
       
   163         */
       
   164         for (pass = 0; pass < 2; pass++) {
       
   165             best = p; // an impossible value
       
   166             for (k = 0; k < p; k++) {
       
   167                 if ((!met[k] || pass > 0) &&
       
   168                      matchedYet[k] == oldNumbers[k].length() &&
       
   169                      numberLength(oldTranslation, i + 1 - matchedYet[k]) == matchedYet[k]) {
       
   170                     // the longer the better
       
   171                     if (best == p || matchedYet[k] > matchedYet[best])
       
   172                         best = k;
       
   173                 }
       
   174             }
       
   175             if (best != p) {
       
   176                 attempt.truncate(attempt.length() - matchedYet[best]);
       
   177                 attempt += newNumbers[best];
       
   178                 met[best] = true;
       
   179                 for (k = 0; k < p; k++)
       
   180                     matchedYet[k] = 0;
       
   181                 break;
       
   182             }
       
   183         }
       
   184     }
       
   185 
       
   186     /*
       
   187       We flag two kinds of suspicious cases. They are identified as
       
   188       such with comments such as "{2000?}" at the end.
       
   189 
       
   190       Example of the first kind: old source text "TeX 3.0" translated
       
   191       as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
       
   192       new text is.
       
   193     */
       
   194     for (k = 0; k < p; k++) {
       
   195         if (!met[k])
       
   196             attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String("?}");
       
   197     }
       
   198 
       
   199     /*
       
   200       Example of the second kind: "1 of 1" translated as "1 af 1",
       
   201       with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
       
   202       because it's not clear which of "1 af 2" and "2 af 1" is right.
       
   203     */
       
   204     for (k = 0; k < p; k++) {
       
   205         for (ell = 0; ell < p; ell++) {
       
   206             if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
       
   207                     newNumbers[k] < newNumbers[ell])
       
   208                 attempt += QLatin1String(" {") + newNumbers[k] + QLatin1String(" or ") +
       
   209                            newNumbers[ell] + QLatin1String("?}");
       
   210         }
       
   211     }
       
   212     return attempt;
       
   213 }
       
   214 
       
   215 
       
   216 /*
       
   217   Augments a Translator with translations easily derived from
       
   218   similar existing (probably obsolete) translations.
       
   219 
       
   220   For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
       
   221   has no translation, "XeT 3.1" is added to the translator and is
       
   222   marked Unfinished.
       
   223 
       
   224   Returns the number of additional messages that this heuristic translated.
       
   225 */
       
   226 int applyNumberHeuristic(Translator &tor)
       
   227 {
       
   228     TMM translated, untranslated;
       
   229     TMM::Iterator t, u;
       
   230     TML all = tor.messages();
       
   231     TML::Iterator it;
       
   232     int inserted = 0;
       
   233 
       
   234     for (it = all.begin(); it != all.end(); ++it) {
       
   235         bool hasTranslation = it->isTranslated();
       
   236         if (it->type() == TranslatorMessage::Unfinished) {
       
   237             if (!hasTranslation)
       
   238                 untranslated.insert(it->context() + QLatin1Char('\n')
       
   239                     + it->sourceText() + QLatin1Char('\n')
       
   240                     + it->comment(), *it);
       
   241         } else if (hasTranslation && it->translations().count() == 1) {
       
   242             translated.insert(zeroKey(it->sourceText()), *it);
       
   243         }
       
   244     }
       
   245 
       
   246     for (u = untranslated.begin(); u != untranslated.end(); ++u) {
       
   247         t = translated.find(zeroKey((*u).sourceText()));
       
   248         if (t != translated.end() && !t.key().isEmpty()
       
   249             && t->sourceText() != u->sourceText()) {
       
   250             TranslatorMessage m = *u;
       
   251             m.setTranslation(translationAttempt(t->translation(), t->sourceText(),
       
   252                                                 u->sourceText()));
       
   253             tor.replace(m);
       
   254             inserted++;
       
   255         }
       
   256     }
       
   257     return inserted;
       
   258 }
       
   259 
       
   260 
       
   261 /*
       
   262   Augments a Translator with trivially derived translations.
       
   263 
       
   264   For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
       
   265   matter the context or the comment, "Eingeschaltet:" is added as the
       
   266   translation of any untranslated "Enabled:" text and is marked Unfinished.
       
   267 
       
   268   Returns the number of additional messages that this heuristic translated.
       
   269 */
       
   270 
       
   271 int applySameTextHeuristic(Translator &tor)
       
   272 {
       
   273     TMM translated;
       
   274     TMM avoid;
       
   275     TMM::Iterator t;
       
   276     TML untranslated;
       
   277     TML::Iterator u;
       
   278     TML all = tor.messages();
       
   279     TML::Iterator it;
       
   280     int inserted = 0;
       
   281 
       
   282     for (it = all.begin(); it != all.end(); ++it) {
       
   283         if (!it->isTranslated()) {
       
   284             if (it->type() == TranslatorMessage::Unfinished)
       
   285                 untranslated.append(*it);
       
   286         } else {
       
   287             QString key = it->sourceText();
       
   288             t = translated.find(key);
       
   289             if (t != translated.end()) {
       
   290                 /*
       
   291                   The same source text is translated at least two
       
   292                   different ways. Do nothing then.
       
   293                 */
       
   294                 if (t->translations() != it->translations()) {
       
   295                     translated.remove(key);
       
   296                     avoid.insert(key, *it);
       
   297                 }
       
   298             } else if (!avoid.contains(key)) {
       
   299                 translated.insert(key, *it);
       
   300             }
       
   301         }
       
   302     }
       
   303 
       
   304     for (u = untranslated.begin(); u != untranslated.end(); ++u) {
       
   305         QString key = u->sourceText();
       
   306         t = translated.find(key);
       
   307         if (t != translated.end()) {
       
   308             TranslatorMessage m = *u;
       
   309             m.setTranslations(t->translations());
       
   310             tor.replace(m);
       
   311             ++inserted;
       
   312         }
       
   313     }
       
   314     return inserted;
       
   315 }
       
   316 
       
   317 
       
   318 
       
   319 /*
       
   320   Merges two Translator objects. The first one
       
   321   is a set of source texts and translations for a previous version of
       
   322   the internationalized program; the second one is a set of fresh
       
   323   source texts newly extracted from the source code, without any
       
   324   translation yet.
       
   325 */
       
   326 
       
   327 Translator merge(const Translator &tor, const Translator &virginTor,
       
   328     UpdateOptions options, QString &err)
       
   329 {
       
   330     int known = 0;
       
   331     int neww = 0;
       
   332     int obsoleted = 0;
       
   333     int similarTextHeuristicCount = 0;
       
   334 
       
   335     Translator outTor;
       
   336     outTor.setLanguageCode(tor.languageCode());
       
   337     outTor.setSourceLanguageCode(tor.sourceLanguageCode());
       
   338     outTor.setLocationsType(tor.locationsType());
       
   339     outTor.setCodecName(tor.codecName());
       
   340 
       
   341     /*
       
   342       The types of all the messages from the vernacular translator
       
   343       are updated according to the virgin translator.
       
   344     */
       
   345     foreach (TranslatorMessage m, tor.messages()) {
       
   346         TranslatorMessage::Type newType = TranslatorMessage::Finished;
       
   347 
       
   348         if (m.sourceText().isEmpty()) {
       
   349             // context/file comment
       
   350             TranslatorMessage mv = virginTor.find(m.context());
       
   351             if (!mv.isNull())
       
   352                 m.setComment(mv.comment());
       
   353         } else {
       
   354             TranslatorMessage mv = virginTor.find(m.context(), m.sourceText(), m.comment());
       
   355             if (mv.isNull()) {
       
   356                 if (!(options & HeuristicSimilarText)) {
       
   357                     newType = TranslatorMessage::Obsolete;
       
   358                     if (m.type() != TranslatorMessage::Obsolete)
       
   359                         obsoleted++;
       
   360                     m.clearReferences();
       
   361                 } else {
       
   362                     mv = virginTor.find(m.context(), m.comment(), m.allReferences());
       
   363                     if (mv.isNull()) {
       
   364                         // did not find it in the virgin, mark it as obsolete
       
   365                         newType = TranslatorMessage::Obsolete;
       
   366                         if (m.type() != TranslatorMessage::Obsolete)
       
   367                             obsoleted++;
       
   368                         m.clearReferences();
       
   369                     } else {
       
   370                         // Do not just accept it if its on the same line number,
       
   371                         // but different source text.
       
   372                         // Also check if the texts are more or less similar before
       
   373                         // we consider them to represent the same message...
       
   374                         if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold) {
       
   375                             // It is just slightly modified, assume that it is the same string
       
   376 
       
   377                             // Mark it as unfinished. (Since the source text
       
   378                             // was changed it might require re-translating...)
       
   379                             newType = TranslatorMessage::Unfinished;
       
   380                             ++similarTextHeuristicCount;
       
   381                             neww++;
       
   382 
       
   383                             m.setOldSourceText(m.sourceText());
       
   384                             m.setSourceText(mv.sourceText());
       
   385                             const QString &oldpluralsource = m.extra(QLatin1String("po-msgid_plural"));
       
   386                             if (!oldpluralsource.isEmpty()) {
       
   387                                 m.setExtra(QLatin1String("po-old_msgid_plural"), oldpluralsource);
       
   388                                 m.unsetExtra(QLatin1String("po-msgid_plural"));
       
   389                             }
       
   390                             m.setReferences(mv.allReferences()); // Update secondary references
       
   391                             m.setPlural(mv.isPlural());
       
   392                             m.setUtf8(mv.isUtf8());
       
   393                             m.setExtraComment(mv.extraComment());
       
   394                         } else {
       
   395                             // The virgin and vernacular sourceTexts are so
       
   396                             // different that we could not find it.
       
   397                             newType = TranslatorMessage::Obsolete;
       
   398                             if (m.type() != TranslatorMessage::Obsolete)
       
   399                                 obsoleted++;
       
   400                             m.clearReferences();
       
   401                         }
       
   402                     }
       
   403                 }
       
   404             } else {
       
   405                 switch (m.type()) {
       
   406                 case TranslatorMessage::Finished:
       
   407                 default:
       
   408                     if (m.isPlural() == mv.isPlural()) {
       
   409                         newType = TranslatorMessage::Finished;
       
   410                     } else {
       
   411                         newType = TranslatorMessage::Unfinished;
       
   412                     }
       
   413                     known++;
       
   414                     break;
       
   415                 case TranslatorMessage::Unfinished:
       
   416                     newType = TranslatorMessage::Unfinished;
       
   417                     known++;
       
   418                     break;
       
   419                 case TranslatorMessage::Obsolete:
       
   420                     newType = TranslatorMessage::Unfinished;
       
   421                     neww++;
       
   422                 }
       
   423 
       
   424                 // Always get the filename and linenumber info from the
       
   425                 // virgin Translator, in case it has changed location.
       
   426                 // This should also enable us to read a file that does not
       
   427                 // have the <location> element.
       
   428                 // why not use operator=()? Because it overwrites e.g. userData.
       
   429                 m.setReferences(mv.allReferences());
       
   430                 m.setPlural(mv.isPlural());
       
   431                 m.setUtf8(mv.isUtf8());
       
   432                 m.setExtraComment(mv.extraComment());
       
   433             }
       
   434         }
       
   435 
       
   436         m.setType(newType);
       
   437         outTor.append(m);
       
   438     }
       
   439 
       
   440     /*
       
   441       Messages found only in the virgin translator are added to the
       
   442       vernacular translator.
       
   443     */
       
   444     foreach (const TranslatorMessage &mv, virginTor.messages()) {
       
   445         if (mv.sourceText().isEmpty()) {
       
   446             if (tor.contains(mv.context()))
       
   447                 continue;
       
   448         } else {
       
   449             if (tor.contains(mv.context(), mv.sourceText(), mv.comment()))
       
   450                 continue;
       
   451             if (options & HeuristicSimilarText) {
       
   452                 TranslatorMessage m = tor.find(mv.context(), mv.comment(), mv.allReferences());
       
   453                 if (!m.isNull()) {
       
   454                     if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold)
       
   455                         continue;
       
   456                 }
       
   457             }
       
   458         }
       
   459         if (options & NoLocations)
       
   460             outTor.append(mv);
       
   461         else
       
   462             outTor.appendSorted(mv);
       
   463         if (!mv.sourceText().isEmpty())
       
   464             ++neww;
       
   465     }
       
   466 
       
   467     /*
       
   468       The same-text heuristic handles cases where a message has an
       
   469       obsolete counterpart with a different context or comment.
       
   470     */
       
   471     int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(outTor) : 0;
       
   472 
       
   473     /*
       
   474       The number heuristic handles cases where a message has an
       
   475       obsolete counterpart with mostly numbers differing in the
       
   476       source text.
       
   477     */
       
   478     int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(outTor) : 0;
       
   479 
       
   480     if (options & Verbose) {
       
   481         int totalFound = neww + known;
       
   482         err += QObject::tr("    Found %n source text(s) (%1 new and %2 already existing)\n", 0, totalFound).arg(neww).arg(known);
       
   483 
       
   484         if (obsoleted) {
       
   485             if (options & NoObsolete) {
       
   486                 err += QObject::tr("    Removed %n obsolete entries\n", 0, obsoleted);
       
   487             } else {
       
   488                 err += QObject::tr("    Kept %n obsolete entries\n", 0, obsoleted);
       
   489             }
       
   490         }
       
   491 
       
   492         if (sameNumberHeuristicCount)
       
   493             err += QObject::tr("    Number heuristic provided %n translation(s)\n",
       
   494                       0, sameNumberHeuristicCount);
       
   495         if (sameTextHeuristicCount)
       
   496             err += QObject::tr("    Same-text heuristic provided %n translation(s)\n",
       
   497                       0, sameTextHeuristicCount);
       
   498         if (similarTextHeuristicCount)
       
   499             err += QObject::tr("    Similar-text heuristic provided %n translation(s)\n",
       
   500                       0, similarTextHeuristicCount);
       
   501     }
       
   502     return outTor;
       
   503 }
       
   504 
       
   505 QT_END_NAMESPACE