src/corelib/tools/qstring.cpp
branch GCC_SURGE
changeset 31 5daf16870df6
parent 30 5dc02b23752f
child 33 3e2da88830cd
--- a/src/corelib/tools/qstring.cpp	Mon Jun 21 22:38:13 2010 +0100
+++ b/src/corelib/tools/qstring.cpp	Thu Jul 22 16:41:55 2010 +0100
@@ -46,6 +46,7 @@
 #include <qtextcodec.h>
 #endif
 #include <private/qutfcodec_p.h>
+#include "qsimd_p.h"
 #include <qdatastream.h>
 #include <qlist.h>
 #include "qlocale.h"
@@ -55,6 +56,7 @@
 #include "qtools_p.h"
 #include "qhash.h"
 #include "qdebug.h"
+#include "qendian.h"
 
 #ifdef Q_OS_MAC
 #include <private/qcore_mac_p.h>
@@ -333,7 +335,7 @@
   \macro QT_NO_CAST_TO_ASCII
   \relates QString
 
-  disables automatic conversion from QString to ASCII 8-bit strings (char *)
+  disables automatic conversion from QString to 8-bit strings (char *)
 
   \sa QT_NO_CAST_FROM_ASCII, QT_NO_CAST_FROM_BYTEARRAY
 */
@@ -389,10 +391,10 @@
     with code values above 65535 are stored using surrogate pairs,
     i.e., two consecutive \l{QChar}s.)
 
-    \l{Unicode} is an international standard that supports most of
-    the writing systems in use today. It is a superset of ASCII and
-    Latin-1 (ISO 8859-1), and all the ASCII/Latin-1 characters are
-    available at the same code positions.
+    \l{Unicode} is an international standard that supports most of the
+    writing systems in use today. It is a superset of US-ASCII (ANSI
+    X3.4-1986) and Latin-1 (ISO 8859-1), and all the US-ASCII/Latin-1
+    characters are available at the same code positions.
 
     Behind the scenes, QString uses \l{implicit sharing}
     (copy-on-write) to reduce memory usage and to avoid the needless
@@ -560,11 +562,13 @@
     toLatin1(), toUtf8(), and toLocal8Bit().
 
     \list
-    \o toAscii() returns an ASCII encoded 8-bit string.
+    \o toAscii() returns an 8-bit string encoded using the codec
+       specified by QTextCodec::codecForCStrings (by default, that is
+       Latin 1).
     \o toLatin1() returns a Latin-1 (ISO 8859-1) encoded 8-bit string.
     \o toUtf8() returns a UTF-8 encoded 8-bit string. UTF-8 is a
-       superset of ASCII that supports the entire Unicode character
-       set through multibyte sequences.
+       superset of US-ASCII (ANSI X3.4-1986) that supports the entire
+       Unicode character set through multibyte sequences.
     \o toLocal8Bit() returns an 8-bit string using the system's local
        encoding.
     \endlist
@@ -576,7 +580,7 @@
     As mentioned above, QString provides a lot of functions and
     operators that make it easy to interoperate with \c{const char *}
     strings. But this functionality is a double-edged sword: It makes
-    QString more convenient to use if all strings are ASCII or
+    QString more convenient to use if all strings are US-ASCII or
     Latin-1, but there is always the risk that an implicit conversion
     from or to \c{const char *} is done using the wrong 8-bit
     encoding. To minimize these risks, you can turn off these implicit
@@ -584,9 +588,9 @@
 
     \list
     \o \c QT_NO_CAST_FROM_ASCII disables automatic conversions from
-       ASCII to Unicode.
+       C string literals and pointers to Unicode.
     \o \c QT_NO_CAST_TO_ASCII disables automatic conversion from QString
-       to ASCII.
+       to C strings.
     \endlist
 
     One way to define these preprocessor symbols globally for your
@@ -835,7 +839,7 @@
 
 /*! \fn QString::QString(const char *str)
 
-    Constructs a string initialized with the ASCII string \a str. The
+    Constructs a string initialized with the 8-bit string \a str. The
     given const char pointer is converted to Unicode using the
     fromAscii() function.
 
@@ -935,11 +939,11 @@
         const unsigned short *uc = utf16();
         for (int i = 0; i < length(); ++i) {
             uint u = uc[i];
-            if (u >= 0xd800 && u < 0xdc00 && i < length()-1) {
+            if (QChar::isHighSurrogate(u) && i + 1 < length()) {
                 ushort low = uc[i+1];
-                if (low >= 0xdc00 && low < 0xe000) {
+                if (QChar::isLowSurrogate(low)) {
+                    u = QChar::surrogateToUcs4(u, low);
                     ++i;
-                    u = (u - 0xd800)*0x400 + (low - 0xdc00) + 0x10000;
                 }
             }
             *a = wchar_t(u);
@@ -988,6 +992,40 @@
     }
 }
 
+/*!
+    \since 4.7
+
+    Constructs a string initialized with the characters of the QChar array
+    \a unicode, which must be terminated with a 0.
+
+    QString makes a deep copy of the string data. The unicode data is copied as
+    is and the Byte Order Mark is preserved if present.
+*/
+QString::QString(const QChar *unicode)
+{
+     if (!unicode) { // no input pointer: use the static shared_null data
+         d = &shared_null;
+         d->ref.ref(); // take a reference on the shared data block
+     } else {
+         int size = 0;
+         while (unicode[size] != 0) // count the characters in front of the 0 terminator
+             ++size;
+         if (!size) { // zero-length input: use the static shared_empty data
+             d = &shared_empty;
+             d->ref.ref(); // take a reference on the shared data block
+         } else {
+             d = (Data*) qMalloc(sizeof(Data)+size*sizeof(QChar)); // Data header plus size characters
+             Q_CHECK_PTR(d);
+             d->ref = 1; // this string is the only user of the new block
+             d->alloc = d->size = size;
+             d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
+             d->data = d->array; // characters live in the block's own array
+             memcpy(d->array, unicode, size * sizeof(QChar)); // verbatim copy, no conversion
+             d->array[size] = '\0'; // 0 terminator after the last copied character
+         }
+     }
+}
+
 
 /*!
     Constructs a string of the given \a size with every character set
@@ -1091,7 +1129,12 @@
     \internal
 */
 
-/*! \fn void QString::isDetached() const
+/*! \fn bool QString::isDetached() const
+
+    \internal
+*/
+
+/*! \fn bool QString::isSharedWith(const QString &other) const
 
     \internal
 */
@@ -1296,8 +1339,9 @@
 
     \overload operator=()
 
-    Assigns \a ba to this string. The byte array is converted to
-    Unicode using the fromAscii() function.
+    Assigns \a ba to this string. The byte array is converted to Unicode
+    using the fromAscii() function. This function stops conversion at the
+    first NUL character found, or the end of the \a ba byte array.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -1760,13 +1804,14 @@
     }
 
     QT_TRY {
-        detach();
         if (blen == alen) {
             // replace in place
+            detach();
             for (int i = 0; i < nIndices; ++i)
                 memcpy(d->data + indices[i], afterBuffer, alen * sizeof(QChar));
         } else if (alen < blen) {
             // replace from front
+            detach();
             uint to = indices[0];
             if (alen)
                 memcpy(d->data+to, after, alen*sizeof(QChar));
@@ -2089,7 +2134,8 @@
     \overload operator==()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. This function stops conversion at the
+    first NUL character found, or the end of the byte array.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2150,7 +2196,8 @@
     \overload operator<()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2192,7 +2239,8 @@
     \overload operator<=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2250,7 +2298,8 @@
     \overload operator>()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2292,7 +2341,8 @@
     \overload operator>=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded in
+    the byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
     when you compile your applications. This can be useful if you want
@@ -2307,10 +2357,10 @@
     The \a other const char pointer is converted to a QString using
     the fromAscii() function.
 
-    You can disable this operator by defining \c
-    QT_NO_CAST_FROM_ASCII when you compile your applications. This
-    can be useful if you want to ensure that all user-visible strings
-    go through QObject::tr(), for example.
+    You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
+    when you compile your applications. This can be useful if you want
+    to ensure that all user-visible strings go through QObject::tr(),
+    for example.
 */
 
 /*! \fn bool QString::operator!=(const QString &other) const
@@ -2334,7 +2384,8 @@
     \overload operator!=()
 
     The \a other byte array is converted to a QString using the
-    fromAscii() function.
+    fromAscii() function. If any NUL characters ('\0') are embedded
+    in the byte array, they will be included in the transformation.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
     when you compile your applications. This can be useful if you want
@@ -3438,12 +3489,82 @@
     QByteArray ba;
     if (length) {
         ba.resize(length);
-        const ushort *i = reinterpret_cast<const ushort *>(data);
-        const ushort *e = i + length;
-        uchar *s = (uchar*) ba.data();
-        while (i != e) {
-            *s++ = (*i>0xff) ? '?' : (uchar) *i;
-            ++i;
+        const ushort *src = reinterpret_cast<const ushort *>(data);
+        uchar *dst = (uchar*) ba.data();
+#if defined(QT_ALWAYS_HAVE_SSE2)
+        if (length >= 16) {
+            const int chunkCount = length >> 4; // divided by 16
+            const __m128i questionMark = _mm_set1_epi16('?');
+            // SSE has no compare instruction for unsigned comparison.
+            // The values must be offset by 0x8000 so a signed compare can be used
+            const __m128i signedBitOffset = _mm_set1_epi16(0x8000);
+            const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000);
+            for (int i = 0; i < chunkCount; ++i) {
+                __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
+                src += 8;
+                {
+                    // each 16-bit lane is 0xFFFF if the source is outside Latin-1 (> 0xff)
+                    const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+
+                    // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+                    // the 16 bits that were correct contains zeros
+                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+
+                    // correctBytes contains the bytes that were in limit
+                    // the 16 bits that were off limits contains zeros
+                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1);
+
+                    // merge offLimitQuestionMark and correctBytes to have the result
+                    chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+                }
+
+                __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
+                src += 8;
+                {
+                    // exactly the same operations as for the previous chunk of data
+                    const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
+                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
+                    chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+                }
+
+                // pack the two vectors into 16 x 8-bit elements
+                const __m128i result = _mm_packus_epi16(chunk1, chunk2);
+
+                _mm_storeu_si128((__m128i*)dst, result); // store
+                dst += 16;
+            }
+            length = length % 16;
+        }
+#elif QT_HAVE_NEON
+        // Refer to the documentation of the SSE2 implementation
+        // this uses exactly the same method as for SSE except:
+        // 1) neon has unsigned comparison
+        // 2) packing is done to 64 bits (8 x 8bits component).
+        if (length >= 16) {
+            const int chunkCount = length >> 3; // divided by 8
+            const uint16x8_t questionMark = vdupq_n_u16('?'); // set
+            const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set
+            for (int i = 0; i < chunkCount; ++i) {
+                uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
+                src += 8;
+
+                const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
+                const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
+                const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
+                chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
+                const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing
+                vst1_u8(dst, result); // store
+                dst += 8;
+            }
+            length = length % 8;
+        }
+#endif
+        while (length--) {
+            *dst++ = (*src>0xff) ? '?' : (uchar) *src;
+            ++src;
         }
     }
     return ba;
@@ -3451,8 +3572,10 @@
 
 /*!
     Returns a Latin-1 representation of the string as a QByteArray.
-    The returned byte array is undefined if the string contains
-    non-Latin1 characters.
+
+    The returned byte array is undefined if the string contains non-Latin1
+    characters. Those characters may be suppressed or replaced with a
+    question mark.
 
     \sa fromLatin1(), toAscii(), toUtf8(), toLocal8Bit(), QTextCodec
 */
@@ -3466,12 +3589,15 @@
 // isn't necessary in the header. See task 177402.
 
 /*!
-    Returns an 8-bit ASCII representation of the string as a QByteArray.
+    Returns an 8-bit representation of the string as a QByteArray.
 
     If a codec has been set using QTextCodec::setCodecForCStrings(),
     it is used to convert Unicode to 8-bit char; otherwise this
     function does the same as toLatin1().
 
+    Note that, despite the name, this function does not necessarily return a US-ASCII
+    (ANSI X3.4-1986) string and its result may not be US-ASCII compatible.
+
     \sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec
 */
 QByteArray QString::toAscii() const
@@ -3499,8 +3625,13 @@
     QByteArray. The returned byte array is undefined if the string
     contains characters not supported by the local 8-bit encoding.
 
-    QTextCodec::codecForLocale() is used to perform the conversion
-    from Unicode.
+    QTextCodec::codecForLocale() is used to perform the conversion from
+    Unicode. If the locale encoding could not be determined, this function
+    does the same as toLatin1().
+
+    If this string contains any characters that cannot be encoded in the
+    locale, the returned byte array is undefined. Those characters may be
+    suppressed or replaced by another character.
 
     \sa fromLocal8Bit(), toAscii(), toLatin1(), toUtf8(), QTextCodec
 */
@@ -3516,54 +3647,34 @@
 /*!
     Returns a UTF-8 representation of the string as a QByteArray.
 
+    UTF-8 is a Unicode codec and can represent all characters in a Unicode
+    string like QString.
+
+    However, in the Unicode range, there are certain codepoints that are not
+    considered characters. The Unicode standard reserves the last two
+    codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF,
+    U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF,
+    inclusive, as non-characters. If any of those appear in the string, they
+    may be discarded and will not appear in the UTF-8 representation, or they
+    may be replaced by one or more replacement characters.
+
     \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec
 */
 QByteArray QString::toUtf8() const
 {
-    QByteArray ba;
-    if (d->size) {
-        int l = d->size;
-        int rlen = l*3+1;
-        ba.resize(rlen);
-        uchar *cursor = (uchar*)ba.data();
-        const ushort *ch =d->data;
-        for (int i=0; i < l; i++) {
-            uint u = *ch;
-            if (u < 0x80) {
-                *cursor++ = (uchar)u;
-            } else {
-                if (u < 0x0800) {
-                    *cursor++ = 0xc0 | ((uchar) (u >> 6));
-                } else {
-                    if (QChar(u).isHighSurrogate() && i < l-1) {
-                        ushort low = ch[1];
-                        if (QChar(low).isLowSurrogate()) {
-                            ++ch;
-                            ++i;
-                            u = QChar::surrogateToUcs4(u,low);
-                        }
-                    }
-                    if (u > 0xffff) {
-                        *cursor++ = 0xf0 | ((uchar) (u >> 18));
-                        *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
-                    } else {
-                        *cursor++ = 0xe0 | ((uchar) (u >> 12));
-                    }
-                    *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
-                }
-                *cursor++ = 0x80 | ((uchar) (u&0x3f));
-            }
-            ++ch;
-        }
-        ba.resize(cursor - (uchar*)ba.constData());
-    }
-    return ba;
+    if (isNull()) // a null string yields a default-constructed byte array
+        return QByteArray();
+
+    return QUtf8::convertFromUnicode(constData(), length(), 0); // the encoding itself is delegated to QUtf8::convertFromUnicode
 }
 
 /*!
     \since 4.2
 
-    Returns a UCS-4 representation of the string as a QVector<uint>.
+    Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>.
+
+    UCS-4 is a Unicode codec and is lossless. All characters from this string
+    can be encoded in UCS-4.
 
     \sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray()
 */
@@ -3606,10 +3717,35 @@
         d->alloc = d->size = size;
         d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
         d->data = d->array;
-        ushort *i = d->data;
         d->array[size] = '\0';
+        ushort *dst = d->data;
+        /* SIMD:
+         * Unpacking with SSE has been shown to improve performance on recent CPUs
+         * The same method gives no improvement with NEON.
+         */
+#if defined(QT_ALWAYS_HAVE_SSE2)
+        if (size >= 16) {
+            int chunkCount = size >> 4; // divided by 16
+            const __m128i nullMask = _mm_set1_epi32(0);
+            for (int i = 0; i < chunkCount; ++i) {
+                const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+                str += 16;
+
+                // unpack the first 8 bytes, padding with zeros
+                const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+                _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+                dst += 8;
+
+                // unpack the last 8 bytes, padding with zeros
+                const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+                _mm_storeu_si128((__m128i*)dst, secondHalf); // store
+                dst += 8;
+            }
+            size = size % 16;
+        }
+#endif
         while (size--)
-            *i++ = (uchar)*str++;
+            *dst++ = (uchar)*str++;
     }
     return d;
 }
@@ -3691,100 +3827,6 @@
 
 #endif
 
-QT_END_NAMESPACE
-
-#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
-#include "qt_windows.h"
-
-QT_BEGIN_NAMESPACE
-
-QByteArray qt_winQString2MB(const QString& s, int uclen)
-{
-    if (uclen < 0)
-        uclen = s.length();
-    if (s.isNull())
-        return QByteArray();
-    if (uclen == 0)
-        return QByteArray("");
-    return qt_winQString2MB(s.constData(), uclen);
-}
-
-QByteArray qt_winQString2MB(const QChar *ch, int uclen)
-{
-    if (!ch)
-	return QByteArray();
-    if (uclen == 0)
-        return QByteArray("");
-    BOOL used_def;
-    QByteArray mb(4096, 0);
-    int len;
-    while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
-                mb.data(), mb.size()-1, 0, &used_def)))
-    {
-        int r = GetLastError();
-        if (r == ERROR_INSUFFICIENT_BUFFER) {
-            mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
-                                (const wchar_t*)ch, uclen,
-                                0, 0, 0, &used_def));
-                // and try again...
-        } else {
-#ifndef QT_NO_DEBUG
-            // Fail.
-            qWarning("WideCharToMultiByte: Cannot convert multibyte text (error %d): %s (UTF-8)",
-                r, QString(ch, uclen).toLocal8Bit().data());
-#endif
-            break;
-        }
-    }
-    mb.resize(len);
-    return mb;
-}
-
-QString qt_winMB2QString(const char *mb, int mblen)
-{
-    if (!mb || !mblen)
-        return QString();
-    const int wclen_auto = 4096;
-    wchar_t wc_auto[wclen_auto];
-    int wclen = wclen_auto;
-    wchar_t *wc = wc_auto;
-    int len;
-    while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
-                mb, mblen, wc, wclen)))
-    {
-        int r = GetLastError();
-        if (r == ERROR_INSUFFICIENT_BUFFER) {
-            if (wc != wc_auto) {
-                qWarning("MultiByteToWideChar: Size changed");
-                break;
-            } else {
-                wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
-                                    mb, mblen, 0, 0);
-                wc = new wchar_t[wclen];
-                // and try again...
-            }
-        } else {
-            // Fail.
-            qWarning("MultiByteToWideChar: Cannot convert multibyte text");
-            break;
-        }
-    }
-    if (len <= 0)
-        return QString();
-    if (wc[len-1] == 0) // len - 1: we don't want terminator
-        --len;
-    QString s((QChar*)wc, len);
-    if (wc != wc_auto)
-        delete [] wc;
-    return s;
-}
-
-QT_END_NAMESPACE
-
-#endif // Q_OS_WIN32
-
-QT_BEGIN_NAMESPACE
-
 /*!
     Returns a QString initialized with the first \a size characters
     of the 8-bit string \a str.
@@ -3815,14 +3857,16 @@
 
 /*!
     Returns a QString initialized with the first \a size characters
-    of the 8-bit ASCII string \a str.
+    of the 8-bit string \a str.
 
     If \a size is -1 (default), it is taken to be qstrlen(\a
     str).
 
-    If a codec has been set using QTextCodec::setCodecForCStrings(),
-    it is used to convert \a str to Unicode; otherwise this function
-    does the same as fromLatin1().
+    Note that, despite the name, this function actually uses the codec
+    defined by QTextCodec::setCodecForCStrings() to convert \a str to
+    Unicode. Depending on the codec, it may not accept valid US-ASCII (ANSI
+    X3.4-1986) input. If no codec has been set, this function does the same
+    as fromLatin1().
 
     \sa toAscii(), fromLatin1(), fromUtf8(), fromLocal8Bit()
 */
@@ -3838,6 +3882,18 @@
     If \a size is -1 (default), it is taken to be qstrlen(\a
     str).
 
+    UTF-8 is a Unicode codec and can represent all characters in a Unicode
+    string like QString. However, invalid sequences are possible with UTF-8
+    and, if any such are found, they will be replaced with one or more
+    "replacement characters", or suppressed. These include non-Unicode
+    sequences, non-characters, overlong sequences or surrogate codepoints
+    encoded into UTF-8.
+
+    Non-characters are codepoints that the Unicode standard reserves and must
+    not be used in text interchange. They are the last two codepoints in each
+    Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, etc.), as well
+    as 16 codepoints in the range U+FDD0..U+FDDF, inclusive.
+
     \sa toUtf8(), fromAscii(), fromLatin1(), fromLocal8Bit()
 */
 QString QString::fromUtf8(const char *str, int size)
@@ -3861,7 +3917,7 @@
     host byte order is assumed.
 
     This function is comparatively slow.
-    Use QString(const ushort *, int) if possible.
+    Use QString(const ushort *, int) or QString(const ushort *) if possible.
 
     QString makes a deep copy of the Unicode data.
 
@@ -3954,24 +4010,74 @@
 {
     if (d->size == 0)
         return *this;
-    QString result(d->size, Qt::Uninitialized);
-    const QChar *from = (const QChar*) d->data;
-    const QChar *fromend = (const QChar*) from+d->size;
-    int outc=0;
-    QChar *to   = (QChar*) result.d->data;
-    for (;;) {
-        while (from!=fromend && from->isSpace())
-            from++;
-        while (from!=fromend && !from->isSpace())
-            to[outc++] = *from++;
-        if (from!=fromend)
-            to[outc++] = QLatin1Char(' ');
-        else
+
+    const QChar * const start = reinterpret_cast<QChar *>(d->data);
+    const QChar *from = start;
+    const QChar *fromEnd = start + d->size;
+    forever {
+        QChar ch = *from;
+        if (!ch.isSpace())
+            break;
+        if (++from == fromEnd) {
+            // All-whitespace string
+            shared_empty.ref.ref();
+            return QString(&shared_empty, 0);
+        }
+    }
+    // This loop needs no underflow check, as we already determined that
+    // the string contains non-whitespace. If the string has exactly one
+    // non-whitespace, it will be checked twice - we can live with that.
+    while (fromEnd[-1].isSpace())
+        fromEnd--;
+    // The rest of the function depends on the fact that we already know
+    // that the last character in the source is not whitespace.
+    const QChar *copyFrom = from;
+    int copyCount;
+    forever {
+        if (++from == fromEnd) {
+            // Only leading and/or trailing whitespace, if any at all
+            return mid(copyFrom - start, from - copyFrom);
+        }
+        QChar ch = *from;
+        if (!ch.isSpace())
+            continue;
+        if (ch != QLatin1Char(' ')) {
+            copyCount = from - copyFrom;
             break;
+        }
+        ch = *++from;
+        if (ch.isSpace()) {
+            copyCount = from - copyFrom - 1;
+            break;
+        }
     }
-    if (outc > 0 && to[outc-1] == QLatin1Char(' '))
-        outc--;
-    result.truncate(outc);
+    // 'from' now points at the non-trailing whitespace which made the
+    // string not simplified in the first place. 'copyCount' is the number
+    // of already simplified characters - at least one, obviously -
+    // without a trailing space.
+    QString result((fromEnd - from) + copyCount, Qt::Uninitialized);
+    QChar *to = reinterpret_cast<QChar *>(result.d->data);
+    ::memcpy(to, copyFrom, copyCount * 2);
+    to += copyCount;
+    fromEnd--;
+    QChar ch;
+    forever {
+        *to++ = QLatin1Char(' ');
+        do {
+            ch = *++from;
+        } while (ch.isSpace());
+        if (from == fromEnd)
+            break;
+        do {
+            *to++ = ch;
+            ch = *++from;
+            if (from == fromEnd)
+                goto done;
+        } while (!ch.isSpace());
+    }
+  done:
+    *to++ = ch;
+    result.truncate(to - reinterpret_cast<QChar *>(result.d->data));
     return result;
 }
 
@@ -4212,8 +4318,10 @@
 
     \overload operator+=()
 
-    Appends the byte array \a ba to this string. The byte array is
-    converted to Unicode using the fromAscii() function.
+    Appends the byte array \a ba to this string. The byte array is converted
+    to Unicode using the fromAscii() function. If any NUL characters ('\0')
+    are embedded in the \a ba byte array, they will be included in the
+    transformation.
 
     You can disable this function by defining \c
     QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -4613,6 +4721,12 @@
     return localeAwareCompare_helper(constData(), length(), other.constData(), other.length());
 }
 
+#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
+QT_END_NAMESPACE
+#include "qt_windows.h"
+QT_BEGIN_NAMESPACE
+#endif
+
 /*!
     \internal
     \since 4.5
@@ -4990,8 +5104,19 @@
     const char *c = cformat;
     for (;;) {
         // Copy non-escape chars to result
+#ifndef QT_NO_TEXTCODEC
+        int i = 0;
+        while (*(c + i) != '\0' && *(c + i) != '%')
+            ++i;
+        if (codecForCStrings)
+            result.append(codecForCStrings->toUnicode(c, i));
+        else
+            result.append(fromLatin1(c, i));
+        c += i;
+#else
         while (*c != '\0' && *c != '%')
             result.append(QLatin1Char(*c++));
+#endif
 
         if (*c == '\0')
             break;
@@ -5989,7 +6114,7 @@
 */
 QString QString::normalized(QString::NormalizationForm mode) const
 {
-    return normalized(mode, CURRENT_VERSION);
+    return normalized(mode, UNICODE_DATA_VERSION); // forwards to the two-argument overload, passing UNICODE_DATA_VERSION as the version
 }
 
 /*!
@@ -6070,8 +6195,10 @@
     if (simple)
         return;
 
-    QString &s = *data;
-    if (version != CURRENT_VERSION) {
+    if (version == QChar::Unicode_Unassigned) {
+        version = UNICODE_DATA_VERSION;
+    } else if (version != UNICODE_DATA_VERSION) {
+        QString &s = *data;
         for (int i = 0; i < NumNormalizationCorrections; ++i) {
             const NormalizationCorrection &n = uc_normalization_corrections[i];
             if (n.version > version) {
@@ -6898,9 +7025,9 @@
     This operator is mostly useful to pass a QString to a function
     that accepts a std::string object.
 
-    If the QString contains non-ASCII Unicode characters, using this
-    operator can lead to loss of information, since the implementation
-    calls toAscii().
+    If the QString contains Unicode characters that the
+    QTextCodec::codecForCStrings() codec cannot handle, using this operator
+    can lead to loss of information.
 
     This operator is only available if Qt is configured with STL
     compatibility enabled.
@@ -6931,7 +7058,7 @@
     '\\0'-terminated string (although utf16() does, at the cost of
     copying the raw data).
 
-    \sa fromUtf16()
+    \sa fromUtf16(), setRawData()
 */
 QString QString::fromRawData(const QChar *unicode, int size)
 {
@@ -6950,8 +7077,46 @@
     return QString(x, 0);
 }
 
+/*!
+    \since 4.7
+
+    Resets the QString to use the first \a size Unicode characters
+    in the array \a unicode. The data in \a unicode is \e not
+    copied. The caller must be able to guarantee that \a unicode will
+    not be deleted or modified as long as the QString (or an
+    unmodified copy of it) exists.
+
+    This function can be used instead of fromRawData() to re-use
+    existing QString objects to save memory re-allocations.
+
+    \sa fromRawData()
+*/
+QString &QString::setRawData(const QChar *unicode, int size)
+{
+    if (d->ref != 1 || d->alloc) { // data is shared, or alloc is non-zero: build a fresh string instead
+        *this = fromRawData(unicode, size);
+    } else {
+#ifdef QT3_SUPPORT
+        if (d->asciiCache) { // drop this data block's entry from the ascii cache
+            Q_ASSERT(asciiCache);
+            asciiCache->remove(d);
+        }
+#endif
+        if (unicode) {
+            d->data = (ushort *)unicode; // point at the caller's buffer; nothing is copied
+        } else {
+            d->data = d->array; // null input: fall back to the block's own array with size 0
+            size = 0;
+        }
+        d->alloc = d->size = size; // NOTE(review): a non-zero size here makes the d->alloc test above send the next call down the fromRawData() path - confirm this is intended
+        *d->array = '\0';
+        d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
+    }
+    return *this;
+}
+
 /*! \class QLatin1String
-    \brief The QLatin1String class provides a thin wrapper around an ASCII/Latin-1 encoded string literal.
+    \brief The QLatin1String class provides a thin wrapper around a US-ASCII/Latin-1 encoded string literal.
 
     \ingroup string-processing
     \reentrant
@@ -7038,7 +7203,7 @@
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7063,7 +7228,7 @@
     \since 4.3
     \overload operator!=()
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7089,7 +7254,7 @@
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
@@ -7115,7 +7280,7 @@
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7141,7 +7306,7 @@
     \since 4.3
     \overload
 
-    The \a other const char pointer is converted to a QLatin1String using
+    The \a other const char pointer is converted to a QString using
     the QString::fromAscii() function.
 
     You can disable this operator by defining \c
@@ -7222,7 +7387,7 @@
 
     Writes the given \a string to the specified \a stream.
 
-    \sa {Format of the QDataStream Operators}
+    \sa {Serializing Qt Data Types}
 */
 
 QDataStream &operator<<(QDataStream &out, const QString &str)
@@ -7270,7 +7435,7 @@
 
     Reads a string from the specified \a stream into the given \a string.
 
-    \sa {Format of the QDataStream Operators}
+    \sa {Serializing Qt Data Types}
 */
 
 QDataStream &operator>>(QDataStream &in, QString &str)
@@ -7317,7 +7482,7 @@
                     != (QSysInfo::ByteOrder == QSysInfo::BigEndian)) {
                 ushort *data = reinterpret_cast<ushort *>(str.data());
                 while (len--) {
-                    *data = (*data >> 8) | (*data << 8);
+                    *data = qbswap(*data);
                     ++data;
                 }
             }