tests/auto/utf8/tst_utf8.cpp
changeset 30 5dc02b23752f
parent 18 2f34d5167611
equal deleted inserted replaced
29:b72c6db6890b 30:5dc02b23752f
    71     void charByChar_data();
    71     void charByChar_data();
    72     void charByChar();
    72     void charByChar();
    73 
    73 
    74     void invalidUtf8_data();
    74     void invalidUtf8_data();
    75     void invalidUtf8();
    75     void invalidUtf8();
       
    76 
       
    77     void nonCharacters_data();
       
    78     void nonCharacters();
    76 };
    79 };
    77 
    80 
    78 void tst_Utf8::initTestCase()
    81 void tst_Utf8::initTestCase()
    79 {
    82 {
    80     QTest::addColumn<bool>("useLocale");
    83     QTest::addColumn<bool>("useLocale");
   132 
   135 
   133     static const char utf8_5[] = "\360\220\210\203"; // U+010203
   136     static const char utf8_5[] = "\360\220\210\203"; // U+010203
   134     static const uint utf32_5[] = { 0x010203 };
   137     static const uint utf32_5[] = { 0x010203 };
   135     QTest::newRow("utf8_5") << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, 1);
   138     QTest::newRow("utf8_5") << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, 1);
   136 
   139 
   137     static const char utf8_6[] = "\364\217\277\277"; // U+10FFFF
   140     static const char utf8_6[] = "\364\217\277\275"; // U+10FFFD
   138     static const uint utf32_6[] = { 0x10FFFF };
   141     static const uint utf32_6[] = { 0x10FFFD };
   139     QTest::newRow("utf8_6") << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, 1);
   142     QTest::newRow("utf8_6") << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, 1);
   140 
   143 
   141     static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def";
   144     static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def";
   142     static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0,
   145     static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0,
   143                                       0x00E1, 0x00E9, 0x01FD,
   146                                       0x00E1, 0x00E9, 0x01FD,
   144                                       ' ', 0x20AC, 'd', 'e', 'f', 0 };
   147                                       ' ', 0x20AC, 'd', 'e', 'f', 0 };
   145     QTest::newRow("utf8_7") << QByteArray(utf8_7) << QString::fromUtf16(utf16_7);
   148     QTest::newRow("utf8_7") << QByteArray(utf8_7) << QString::fromUtf16(utf16_7);
   146 
   149 
   147     static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\277 \342\202\254def";
   150     static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def";
   148     static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0,
   151     static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0,
   149                                     0x00E1, 0x00E9, 0x01FD,
   152                                     0x00E1, 0x00E9, 0x01FD,
   150                                     ' ', 0x10FFFF, ' ',
   153                                     ' ', 0x10FFFD, ' ',
   151                                     0x20AC, 'd', 'e', 'f', 0 };
   154                                     0x20AC, 'd', 'e', 'f', 0 };
   152     QTest::newRow("utf8_8") << QByteArray(utf8_8) << QString::fromUcs4(utf32_8);
   155     QTest::newRow("utf8_8") << QByteArray(utf8_8) << QString::fromUcs4(utf32_8);
   153 }
   156 }
   154 
   157 
   155 void tst_Utf8::roundTrip()
   158 void tst_Utf8::roundTrip()
   211     QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0");
   214     QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0");
   212     QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0");
   215     QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0");
   213     QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0");
   216     QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0");
   214     QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80");
   217     QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80");
   215     QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80");
   218     QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80");
   216 
       
   217     // U+FFFE and U+FFFF are non-characters and must not be present
       
   218     // U+FFFE:        1111   11 1111   11 1110
       
   219     // encoding: xxxz:1111 xz11:1111 xz11:1110
       
   220     QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE");
       
   221     // U+FFFF:        1111   11 1111   11 1111
       
   222     // encoding: xxxz:1111 xz11:1111 xz11:1111
       
   223     QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF");
       
   224 
   219 
   225     // Surrogate pairs must not be present either
   220     // Surrogate pairs must not be present either
   226     // U+D800:        1101   10 0000   00 0000
   221     // U+D800:        1101   10 0000   00 0000
   227     // encoding: xxxz:1101 xz10:0000 xz00:0000
   222     // encoding: xxxz:1101 xz10:0000 xz00:0000
   228     QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80");
   223     QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80");
   300 {
   295 {
   301     QFETCH(QByteArray, utf8);
   296     QFETCH(QByteArray, utf8);
   302     QFETCH_GLOBAL(bool, useLocale);
   297     QFETCH_GLOBAL(bool, useLocale);
   303 
   298 
   304     QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder());
   299     QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder());
   305     QString decoded = decoder->toUnicode(utf8);
   300     decoder->toUnicode(utf8);
   306 
   301 
   307     // Only enforce correctness on our UTF-8 decoder
   302     // Only enforce correctness on our UTF-8 decoder
   308     // The system's UTF-8 codec is sometimes buggy
   303     // The system's UTF-8 codec is sometimes buggy
   309     //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
   304     //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
   310     //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
   305     //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
   312         QVERIFY(decoder->hasFailure());
   307         QVERIFY(decoder->hasFailure());
   313     else if (!decoder->hasFailure())
   308     else if (!decoder->hasFailure())
   314         qWarning("System codec does not report failure when it should. Should report bug upstream.");
   309         qWarning("System codec does not report failure when it should. Should report bug upstream.");
   315 }
   310 }
   316 
   311 
       
   312 void tst_Utf8::nonCharacters_data()
       
   313 {
       
   314     QTest::addColumn<QByteArray>("utf8");
       
   315     QTest::addColumn<QString>("utf16");
       
   316 
       
   317     // Unicode has a couple of "non-characters" that one can use internally,
       
   318     // but are not allowed to be used for text interchange.
       
   319     //
       
   320     // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF,
       
   321     // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and
       
   322     // U+FDEF (inclusive)
       
   323 
       
   324     // U+FDD0 through U+FDEF
       
   325     for (int i = 0; i < 16; ++i) {
       
   326         char utf8[] = { 0357, 0267, 0220 + i, 0 };
       
   327         QString utf16 = QChar(0xfdd0 + i);
       
   328         QTest::newRow(qPrintable(QString::number(0xfdd0 + i, 16))) << QByteArray(utf8) << utf16;
       
   329     }
       
   330 
       
   331     // the last two in Planes 1 through 16
       
   332     for (uint plane = 1; plane <= 16; ++plane) {
       
   333         for (uint lower = 0xfffe; lower < 0x10000; ++lower) {
       
   334             uint ucs4 = (plane << 16) | lower;
       
   335             char utf8[] = { 0xf0 | uchar(ucs4 >> 18),
       
   336                             0x80 | (uchar(ucs4 >> 12) & 0x3f),
       
   337                             0x80 | (uchar(ucs4 >> 6) & 0x3f),
       
   338                             0x80 | (uchar(ucs4) & 0x3f),
       
   339                             0 };
       
   340             ushort utf16[] = { QChar::highSurrogate(ucs4), QChar::lowSurrogate(ucs4), 0 };
       
   341 
       
   342             QTest::newRow(qPrintable(QString::number(ucs4, 16))) << QByteArray(utf8) << QString::fromUtf16(utf16);
       
   343         }
       
   344     }
       
   345 
       
   346     QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE") << QString(QChar(0xfffe));
       
   347     QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF") << QString(QChar(0xffff));
       
   348 }
       
   349 
       
   350 void tst_Utf8::nonCharacters()
       
   351 {
       
   352     QFETCH(QByteArray, utf8);
       
   353     QFETCH(QString, utf16);
       
   354     QFETCH_GLOBAL(bool, useLocale);
       
   355 
       
   356     QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder());
       
   357     decoder->toUnicode(utf8);
       
   358 
       
   359     // Only enforce correctness on our UTF-8 decoder
       
   360     // The system's UTF-8 codec is sometimes buggy
       
   361     //  GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8
       
   362     //  OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF
       
   363     if (!useLocale)
       
   364         QVERIFY(decoder->hasFailure());
       
   365     else if (!decoder->hasFailure())
       
   366         qWarning("System codec does not report failure when it should. Should report bug upstream.");
       
   367 
       
   368     QSharedPointer<QTextEncoder> encoder(codec->makeEncoder());
       
   369     encoder->fromUnicode(utf16);
       
   370     if (!useLocale)
       
   371         QVERIFY(encoder->hasFailure());
       
   372     else if (!encoder->hasFailure())
       
   373         qWarning("System codec does not report failure when it should. Should report bug upstream.");
       
   374 }
       
   375 
   317 QTEST_MAIN(tst_Utf8)
   376 QTEST_MAIN(tst_Utf8)
   318 #include "tst_utf8.moc"
   377 #include "tst_utf8.moc"