132 |
135 |
133 static const char utf8_5[] = "\360\220\210\203"; // U+010203 |
136 static const char utf8_5[] = "\360\220\210\203"; // U+010203 |
134 static const uint utf32_5[] = { 0x010203 }; |
137 static const uint utf32_5[] = { 0x010203 }; |
135 QTest::newRow("utf8_5") << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, 1); |
138 QTest::newRow("utf8_5") << QByteArray(utf8_5) << QString::fromUcs4(utf32_5, 1); |
136 |
139 |
137 static const char utf8_6[] = "\364\217\277\277"; // U+10FFFF |
140 static const char utf8_6[] = "\364\217\277\275"; // U+10FFFD |
138 static const uint utf32_6[] = { 0x10FFFF }; |
141 static const uint utf32_6[] = { 0x10FFFD }; |
139 QTest::newRow("utf8_6") << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, 1); |
142 QTest::newRow("utf8_6") << QByteArray(utf8_6) << QString::fromUcs4(utf32_6, 1); |
140 |
143 |
141 static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def"; |
144 static const char utf8_7[] = "abc\302\240\303\241\303\251\307\275 \342\202\254def"; |
142 static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0, |
145 static const ushort utf16_7[] = { 'a', 'b', 'c', 0x00A0, |
143 0x00E1, 0x00E9, 0x01FD, |
146 0x00E1, 0x00E9, 0x01FD, |
144 ' ', 0x20AC, 'd', 'e', 'f', 0 }; |
147 ' ', 0x20AC, 'd', 'e', 'f', 0 }; |
145 QTest::newRow("utf8_7") << QByteArray(utf8_7) << QString::fromUtf16(utf16_7); |
148 QTest::newRow("utf8_7") << QByteArray(utf8_7) << QString::fromUtf16(utf16_7); |
146 |
149 |
147 static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\277 \342\202\254def"; |
150 static const char utf8_8[] = "abc\302\240\303\241\303\251\307\275 \364\217\277\275 \342\202\254def"; |
148 static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0, |
151 static const uint utf32_8[] = { 'a', 'b', 'c', 0x00A0, |
149 0x00E1, 0x00E9, 0x01FD, |
152 0x00E1, 0x00E9, 0x01FD, |
150 ' ', 0x10FFFF, ' ', |
153 ' ', 0x10FFFD, ' ', |
151 0x20AC, 'd', 'e', 'f', 0 }; |
154 0x20AC, 'd', 'e', 'f', 0 }; |
152 QTest::newRow("utf8_8") << QByteArray(utf8_8) << QString::fromUcs4(utf32_8); |
155 QTest::newRow("utf8_8") << QByteArray(utf8_8) << QString::fromUcs4(utf32_8); |
153 } |
156 } |
154 |
157 |
155 void tst_Utf8::roundTrip() |
158 void tst_Utf8::roundTrip() |
211 QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0"); |
214 QTest::newRow("3chars-1") << QByteArray("\xE0\xA0\xC0"); |
212 QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0"); |
215 QTest::newRow("3chars-2") << QByteArray("\xE0\xC0\xA0"); |
213 QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0"); |
216 QTest::newRow("4chars-1") << QByteArray("\xF0\x90\x80\xC0"); |
214 QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80"); |
217 QTest::newRow("4chars-2") << QByteArray("\xF0\x90\xC0\x80"); |
215 QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80"); |
218 QTest::newRow("4chars-3") << QByteArray("\xF0\xC0\x80\x80"); |
216 |
|
217 // U+FFFE and U+FFFF are non-characters and must not be present |
|
218 // U+FFFE: 1111 11 1111 11 1110 |
|
219 // encoding: xxxz:1111 xz11:1111 xz11:1110 |
|
220 QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE"); |
|
221 // U+FFFF: 1111 11 1111 11 1111 |
|
222 // encoding: xxxz:1111 xz11:1111 xz11:1111 |
|
223 QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF"); |
|
224 |
219 |
225 // Surrogate pairs must not be present either |
220 // Surrogate pairs must not be present either |
226 // U+D800: 1101 10 0000 00 0000 |
221 // U+D800: 1101 10 0000 00 0000 |
227 // encoding: xxxz:1101 xz10:0000 xz00:0000 |
222 // encoding: xxxz:1101 xz10:0000 xz00:0000 |
228 QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80"); |
223 QTest::newRow("hi-surrogate") << QByteArray("\xED\xA0\x80"); |
300 { |
295 { |
301 QFETCH(QByteArray, utf8); |
296 QFETCH(QByteArray, utf8); |
302 QFETCH_GLOBAL(bool, useLocale); |
297 QFETCH_GLOBAL(bool, useLocale); |
303 |
298 |
304 QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder()); |
299 QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder()); |
305 QString decoded = decoder->toUnicode(utf8); |
300 decoder->toUnicode(utf8); |
306 |
301 |
307 // Only enforce correctness on our UTF-8 decoder |
302 // Only enforce correctness on our UTF-8 decoder |
308 // The system's UTF-8 codec is sometimes buggy |
303 // The system's UTF-8 codec is sometimes buggy |
309 // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 |
304 // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 |
310 // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF |
305 // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF |
312 QVERIFY(decoder->hasFailure()); |
307 QVERIFY(decoder->hasFailure()); |
313 else if (!decoder->hasFailure()) |
308 else if (!decoder->hasFailure()) |
314 qWarning("System codec does not report failure when it should. Should report bug upstream."); |
309 qWarning("System codec does not report failure when it should. Should report bug upstream."); |
315 } |
310 } |
316 |
311 |
|
312 void tst_Utf8::nonCharacters_data() |
|
313 { |
|
314 QTest::addColumn<QByteArray>("utf8"); |
|
315 QTest::addColumn<QString>("utf16"); |
|
316 |
|
317 // Unicode has a couple of "non-characters" that one can use internally, |
|
318 // but are not allowed to be used for text interchange. |
|
319 // |
|
320 // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, |
|
321 // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and |
|
322 // U+FDEF (inclusive) |
|
323 |
|
324 // U+FDD0 through U+FDEF |
|
325 for (int i = 0; i < 16; ++i) { |
|
326 char utf8[] = { 0357, 0267, 0220 + i, 0 }; |
|
327 QString utf16 = QChar(0xfdd0 + i); |
|
328 QTest::newRow(qPrintable(QString::number(0xfdd0 + i, 16))) << QByteArray(utf8) << utf16; |
|
329 } |
|
330 |
|
331 // the last two in Planes 1 through 16 |
|
332 for (uint plane = 1; plane <= 16; ++plane) { |
|
333 for (uint lower = 0xfffe; lower < 0x10000; ++lower) { |
|
334 uint ucs4 = (plane << 16) | lower; |
|
335 char utf8[] = { 0xf0 | uchar(ucs4 >> 18), |
|
336 0x80 | (uchar(ucs4 >> 12) & 0x3f), |
|
337 0x80 | (uchar(ucs4 >> 6) & 0x3f), |
|
338 0x80 | (uchar(ucs4) & 0x3f), |
|
339 0 }; |
|
340 ushort utf16[] = { QChar::highSurrogate(ucs4), QChar::lowSurrogate(ucs4), 0 }; |
|
341 |
|
342 QTest::newRow(qPrintable(QString::number(ucs4, 16))) << QByteArray(utf8) << QString::fromUtf16(utf16); |
|
343 } |
|
344 } |
|
345 |
|
346 QTest::newRow("fffe") << QByteArray("\xEF\xBF\xBE") << QString(QChar(0xfffe)); |
|
347 QTest::newRow("ffff") << QByteArray("\xEF\xBF\xBF") << QString(QChar(0xffff)); |
|
348 } |
|
349 |
|
350 void tst_Utf8::nonCharacters() |
|
351 { |
|
352 QFETCH(QByteArray, utf8); |
|
353 QFETCH(QString, utf16); |
|
354 QFETCH_GLOBAL(bool, useLocale); |
|
355 |
|
356 QSharedPointer<QTextDecoder> decoder = QSharedPointer<QTextDecoder>(codec->makeDecoder()); |
|
357 decoder->toUnicode(utf8); |
|
358 |
|
359 // Only enforce correctness on our UTF-8 decoder |
|
360 // The system's UTF-8 codec is sometimes buggy |
|
361 // GNU libc's iconv is known to accept U+FFFF and U+FFFE encoded as UTF-8 |
|
362 // OS X's iconv is known to accept those, plus surrogates and codepoints above U+10FFFF |
|
363 if (!useLocale) |
|
364 QVERIFY(decoder->hasFailure()); |
|
365 else if (!decoder->hasFailure()) |
|
366 qWarning("System codec does not report failure when it should. Should report bug upstream."); |
|
367 |
|
368 QSharedPointer<QTextEncoder> encoder(codec->makeEncoder()); |
|
369 encoder->fromUnicode(utf16); |
|
370 if (!useLocale) |
|
371 QVERIFY(encoder->hasFailure()); |
|
372 else if (!encoder->hasFailure()) |
|
373 qWarning("System codec does not report failure when it should. Should report bug upstream."); |
|
374 } |
|
375 |
317 QTEST_MAIN(tst_Utf8) |
376 QTEST_MAIN(tst_Utf8) |
318 #include "tst_utf8.moc" |
377 #include "tst_utf8.moc" |