--- a/src/corelib/tools/qstring.cpp Mon Jun 21 22:38:13 2010 +0100
+++ b/src/corelib/tools/qstring.cpp Thu Jul 22 16:41:55 2010 +0100
@@ -46,6 +46,7 @@
#include <qtextcodec.h>
#endif
#include <private/qutfcodec_p.h>
+#include "qsimd_p.h"
#include <qdatastream.h>
#include <qlist.h>
#include "qlocale.h"
@@ -55,6 +56,7 @@
#include "qtools_p.h"
#include "qhash.h"
#include "qdebug.h"
+#include "qendian.h"
#ifdef Q_OS_MAC
#include <private/qcore_mac_p.h>
@@ -333,7 +335,7 @@
\macro QT_NO_CAST_TO_ASCII
\relates QString
- disables automatic conversion from QString to ASCII 8-bit strings (char *)
+ disables automatic conversion from QString to 8-bit strings (char *)
\sa QT_NO_CAST_FROM_ASCII, QT_NO_CAST_FROM_BYTEARRAY
*/
@@ -389,10 +391,10 @@
with code values above 65535 are stored using surrogate pairs,
i.e., two consecutive \l{QChar}s.)
- \l{Unicode} is an international standard that supports most of
- the writing systems in use today. It is a superset of ASCII and
- Latin-1 (ISO 8859-1), and all the ASCII/Latin-1 characters are
- available at the same code positions.
+ \l{Unicode} is an international standard that supports most of the
+ writing systems in use today. It is a superset of US-ASCII (ANSI
+ X3.4-1986) and Latin-1 (ISO 8859-1), and all the US-ASCII/Latin-1
+ characters are available at the same code positions.
Behind the scenes, QString uses \l{implicit sharing}
(copy-on-write) to reduce memory usage and to avoid the needless
@@ -560,11 +562,13 @@
toLatin1(), toUtf8(), and toLocal8Bit().
\list
- \o toAscii() returns an ASCII encoded 8-bit string.
+ \o toAscii() returns an 8-bit string encoded using the codec
+ specified by QTextCodec::codecForCStrings (by default, that is
+ Latin 1).
\o toLatin1() returns a Latin-1 (ISO 8859-1) encoded 8-bit string.
\o toUtf8() returns a UTF-8 encoded 8-bit string. UTF-8 is a
- superset of ASCII that supports the entire Unicode character
- set through multibyte sequences.
+ superset of US-ASCII (ANSI X3.4-1986) that supports the entire
+ Unicode character set through multibyte sequences.
\o toLocal8Bit() returns an 8-bit string using the system's local
encoding.
\endlist
@@ -576,7 +580,7 @@
As mentioned above, QString provides a lot of functions and
operators that make it easy to interoperate with \c{const char *}
strings. But this functionality is a double-edged sword: It makes
- QString more convenient to use if all strings are ASCII or
+ QString more convenient to use if all strings are US-ASCII or
Latin-1, but there is always the risk that an implicit conversion
from or to \c{const char *} is done using the wrong 8-bit
encoding. To minimize these risks, you can turn off these implicit
@@ -584,9 +588,9 @@
\list
\o \c QT_NO_CAST_FROM_ASCII disables automatic conversions from
- ASCII to Unicode.
+ C string literals and pointers to Unicode.
\o \c QT_NO_CAST_TO_ASCII disables automatic conversion from QString
- to ASCII.
+ to C strings.
\endlist
One way to define these preprocessor symbols globally for your
@@ -835,7 +839,7 @@
/*! \fn QString::QString(const char *str)
- Constructs a string initialized with the ASCII string \a str. The
+ Constructs a string initialized with the 8-bit string \a str. The
given const char pointer is converted to Unicode using the
fromAscii() function.
@@ -935,11 +939,11 @@
const unsigned short *uc = utf16();
for (int i = 0; i < length(); ++i) {
uint u = uc[i];
- if (u >= 0xd800 && u < 0xdc00 && i < length()-1) {
+ if (QChar::isHighSurrogate(u) && i + 1 < length()) {
ushort low = uc[i+1];
- if (low >= 0xdc00 && low < 0xe000) {
+ if (QChar::isLowSurrogate(low)) {
+ u = QChar::surrogateToUcs4(u, low);
++i;
- u = (u - 0xd800)*0x400 + (low - 0xdc00) + 0x10000;
}
}
*a = wchar_t(u);
@@ -988,6 +992,40 @@
}
}
+/*!
+ \since 4.7
+
+ Constructs a string initialized with the characters of the QChar array
+ \a unicode, which must be terminated with a 0.
+
+ QString makes a deep copy of the string data. The unicode data is copied as
+ is and the Byte Order Mark is preserved if present.
+*/
+QString::QString(const QChar *unicode)
+{
+ if (!unicode) { // null pointer: share the static shared_null data and take a reference on it
+ d = &shared_null;
+ d->ref.ref();
+ } else {
+ int size = 0;
+ while (unicode[size] != 0) // count the characters that precede the 0 terminator
+ ++size;
+ if (!size) { // the terminator came first: share the static shared_empty data
+ d = &shared_empty;
+ d->ref.ref();
+ } else {
+ d = (Data*) qMalloc(sizeof(Data)+size*sizeof(QChar)); // one Data header plus size characters
+ Q_CHECK_PTR(d);
+ d->ref = 1;
+ d->alloc = d->size = size;
+ d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
+ d->data = d->array; // data points at the block's own array member
+ memcpy(d->array, unicode, size * sizeof(QChar)); // copy the source characters unchanged
+ d->array[size] = '\0'; // terminate the copy; NOTE(review): assumes Data::array itself supplies the slot for the terminator - confirm
+ }
+ }
+}
+
/*!
Constructs a string of the given \a size with every character set
@@ -1091,7 +1129,12 @@
\internal
*/
-/*! \fn void QString::isDetached() const
+/*! \fn bool QString::isDetached() const
+
+ \internal
+*/
+
+/*! \fn bool QString::isSharedWith(const QString &other) const
\internal
*/
@@ -1296,8 +1339,9 @@
\overload operator=()
- Assigns \a ba to this string. The byte array is converted to
- Unicode using the fromAscii() function.
+ Assigns \a ba to this string. The byte array is converted to Unicode
+ using the fromAscii() function. This function stops conversion at the
+ first NUL character found, or the end of the \a ba byte array.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -1760,13 +1804,14 @@
}
QT_TRY {
- detach();
if (blen == alen) {
// replace in place
+ detach();
for (int i = 0; i < nIndices; ++i)
memcpy(d->data + indices[i], afterBuffer, alen * sizeof(QChar));
} else if (alen < blen) {
// replace from front
+ detach();
uint to = indices[0];
if (alen)
memcpy(d->data+to, after, alen*sizeof(QChar));
@@ -2089,7 +2134,8 @@
\overload operator==()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. This function stops conversion at the
+ first NUL character found, or the end of the byte array.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2150,7 +2196,8 @@
\overload operator<()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2192,7 +2239,8 @@
\overload operator<=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2250,7 +2298,8 @@
\overload operator>()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the byte array, they will be included in the transformation.
You can disable this operator by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -2292,7 +2341,8 @@
\overload operator>=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded in
+ the byte array, they will be included in the transformation.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
when you compile your applications. This can be useful if you want
@@ -2307,10 +2357,10 @@
The \a other const char pointer is converted to a QString using
the fromAscii() function.
- You can disable this operator by defining \c
- QT_NO_CAST_FROM_ASCII when you compile your applications. This
- can be useful if you want to ensure that all user-visible strings
- go through QObject::tr(), for example.
+ You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
+ when you compile your applications. This can be useful if you want
+ to ensure that all user-visible strings go through QObject::tr(),
+ for example.
*/
/*! \fn bool QString::operator!=(const QString &other) const
@@ -2334,7 +2384,8 @@
\overload operator!=()
The \a other byte array is converted to a QString using the
- fromAscii() function.
+ fromAscii() function. If any NUL characters ('\0') are embedded
+ in the byte array, they will be included in the transformation.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
when you compile your applications. This can be useful if you want
@@ -3438,12 +3489,82 @@
QByteArray ba;
if (length) {
ba.resize(length);
- const ushort *i = reinterpret_cast<const ushort *>(data);
- const ushort *e = i + length;
- uchar *s = (uchar*) ba.data();
- while (i != e) {
- *s++ = (*i>0xff) ? '?' : (uchar) *i;
- ++i;
+ const ushort *src = reinterpret_cast<const ushort *>(data);
+ uchar *dst = (uchar*) ba.data();
+#if defined(QT_ALWAYS_HAVE_SSE2)
+ if (length >= 16) {
+ const int chunkCount = length >> 4; // divided by 16
+ const __m128i questionMark = _mm_set1_epi16('?');
+ // SSE has no compare instruction for unsigned comparison.
+ // The values must be shifted by +0x8000 to be compared
+ const __m128i signedBitOffset = _mm_set1_epi16(0x8000);
+ const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000);
+ for (int i = 0; i < chunkCount; ++i) {
+ __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
+ src += 8;
+ {
+ // each 16-bit element is equal to 0xFFFF if the source is outside latin 1 (>0xff)
+ const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
+ const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+
+ // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+ // the 16 bits that were correct contains zeros
+ const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+
+ // correctBytes contains the bytes that were in limit
+ // the 16 bits that were off limits contains zeros
+ const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1);
+
+ // merge offLimitQuestionMark and correctBytes to have the result
+ chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+ }
+
+ __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
+ src += 8;
+ {
+ // exactly the same operations as for the previous chunk of data
+ const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
+ const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+ const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+ const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
+ chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
+ }
+
+ // pack the two vector to 16 x 8bits elements
+ const __m128i result = _mm_packus_epi16(chunk1, chunk2);
+
+ _mm_storeu_si128((__m128i*)dst, result); // store
+ dst += 16;
+ }
+ length = length % 16;
+ }
+#elif QT_HAVE_NEON
+ // Refer to the documentation of the SSE2 implementation
+ // this uses exactly the same method as for SSE except:
+ // 1) neon has unsigned comparison
+ // 2) packing is done to 64 bits (8 x 8bits component).
+ if (length >= 16) {
+ const int chunkCount = length >> 3; // divided by 8
+ const uint16x8_t questionMark = vdupq_n_u16('?'); // set
+ const uint16x8_t thresholdMask = vdupq_n_u16(0xff); // set
+ for (int i = 0; i < chunkCount; ++i) {
+ uint16x8_t chunk = vld1q_u16((uint16_t *)src); // load
+ src += 8;
+
+ const uint16x8_t offLimitMask = vcgtq_u16(chunk, thresholdMask); // chunk > thresholdMask
+ const uint16x8_t offLimitQuestionMark = vandq_u16(offLimitMask, questionMark); // offLimitMask & questionMark
+ const uint16x8_t correctBytes = vbicq_u16(chunk, offLimitMask); // !offLimitMask & chunk
+ chunk = vorrq_u16(correctBytes, offLimitQuestionMark); // correctBytes | offLimitQuestionMark
+ const uint8x8_t result = vmovn_u16(chunk); // narrowing move->packing
+ vst1_u8(dst, result); // store
+ dst += 8;
+ }
+ length = length % 8;
+ }
+#endif
+ while (length--) {
+ *dst++ = (*src>0xff) ? '?' : (uchar) *src;
+ ++src;
}
}
return ba;
@@ -3451,8 +3572,10 @@
/*!
Returns a Latin-1 representation of the string as a QByteArray.
- The returned byte array is undefined if the string contains
- non-Latin1 characters.
+
+ The returned byte array is undefined if the string contains non-Latin1
+ characters. Those characters may be suppressed or replaced with a
+ question mark.
\sa fromLatin1(), toAscii(), toUtf8(), toLocal8Bit(), QTextCodec
*/
@@ -3466,12 +3589,15 @@
// isn't necessary in the header. See task 177402.
/*!
- Returns an 8-bit ASCII representation of the string as a QByteArray.
+ Returns an 8-bit representation of the string as a QByteArray.
If a codec has been set using QTextCodec::setCodecForCStrings(),
it is used to convert Unicode to 8-bit char; otherwise this
function does the same as toLatin1().
+ Note that, despite the name, this function does not necessarily return a US-ASCII
+ (ANSI X3.4-1986) string and its result may not be US-ASCII compatible.
+
\sa fromAscii(), toLatin1(), toUtf8(), toLocal8Bit(), QTextCodec
*/
QByteArray QString::toAscii() const
@@ -3499,8 +3625,13 @@
QByteArray. The returned byte array is undefined if the string
contains characters not supported by the local 8-bit encoding.
- QTextCodec::codecForLocale() is used to perform the conversion
- from Unicode.
+ QTextCodec::codecForLocale() is used to perform the conversion from
+ Unicode. If the locale encoding could not be determined, this function
+ does the same as toLatin1().
+
+ If this string contains any characters that cannot be encoded in the
+ locale, the returned byte array is undefined. Those characters may be
+ suppressed or replaced by another character.
\sa fromLocal8Bit(), toAscii(), toLatin1(), toUtf8(), QTextCodec
*/
@@ -3516,54 +3647,34 @@
/*!
Returns a UTF-8 representation of the string as a QByteArray.
+ UTF-8 is a Unicode codec and can represent all characters in a Unicode
+ string like QString.
+
+ However, in the Unicode range, there are certain codepoints that are not
+ considered characters. The Unicode standard reserves the last two
+ codepoints in each Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF,
+ U+2FFFE, etc.), as well as 16 codepoints in the range U+FDD0..U+FDDF,
+ inclusive, as non-characters. If any of those appear in the string, they
+ may be discarded and will not appear in the UTF-8 representation, or they
+ may be replaced by one or more replacement characters.
+
\sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec
*/
QByteArray QString::toUtf8() const
{
- QByteArray ba;
- if (d->size) {
- int l = d->size;
- int rlen = l*3+1;
- ba.resize(rlen);
- uchar *cursor = (uchar*)ba.data();
- const ushort *ch =d->data;
- for (int i=0; i < l; i++) {
- uint u = *ch;
- if (u < 0x80) {
- *cursor++ = (uchar)u;
- } else {
- if (u < 0x0800) {
- *cursor++ = 0xc0 | ((uchar) (u >> 6));
- } else {
- if (QChar(u).isHighSurrogate() && i < l-1) {
- ushort low = ch[1];
- if (QChar(low).isLowSurrogate()) {
- ++ch;
- ++i;
- u = QChar::surrogateToUcs4(u,low);
- }
- }
- if (u > 0xffff) {
- *cursor++ = 0xf0 | ((uchar) (u >> 18));
- *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
- } else {
- *cursor++ = 0xe0 | ((uchar) (u >> 12));
- }
- *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
- }
- *cursor++ = 0x80 | ((uchar) (u&0x3f));
- }
- ++ch;
- }
- ba.resize(cursor - (uchar*)ba.constData());
- }
- return ba;
+ if (isNull()) // a null string yields a default-constructed QByteArray
+ return QByteArray();
+
+ return QUtf8::convertFromUnicode(constData(), length(), 0); // encoding is delegated to QUtf8; NOTE(review): the third argument 0 is presumably "no converter state" - confirm
}
/*!
\since 4.2
- Returns a UCS-4 representation of the string as a QVector<uint>.
+ Returns a UCS-4/UTF-32 representation of the string as a QVector<uint>.
+
+ UCS-4 is a Unicode codec and is lossless. All characters from this string
+ can be encoded in UCS-4.
\sa fromUtf8(), toAscii(), toLatin1(), toLocal8Bit(), QTextCodec, fromUcs4(), toWCharArray()
*/
@@ -3606,10 +3717,35 @@
d->alloc = d->size = size;
d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
d->data = d->array;
- ushort *i = d->data;
d->array[size] = '\0';
+ ushort *dst = d->data;
+ /* SIMD:
+ * Unpacking with SSE has been shown to improve performance on recent CPUs
+ * The same method gives no improvement with NEON.
+ */
+#if defined(QT_ALWAYS_HAVE_SSE2)
+ if (size >= 16) {
+ int chunkCount = size >> 4; // divided by 16
+ const __m128i nullMask = _mm_set1_epi32(0);
+ for (int i = 0; i < chunkCount; ++i) {
+ const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
+ str += 16;
+
+ // unpack the first 8 bytes, padding with zeros
+ const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+ _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+ dst += 8;
+
+ // unpack the last 8 bytes, padding with zeros
+ const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+ _mm_storeu_si128((__m128i*)dst, secondHalf); // store
+ dst += 8;
+ }
+ size = size % 16;
+ }
+#endif
while (size--)
- *i++ = (uchar)*str++;
+ *dst++ = (uchar)*str++;
}
return d;
}
@@ -3691,100 +3827,6 @@
#endif
-QT_END_NAMESPACE
-
-#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
-#include "qt_windows.h"
-
-QT_BEGIN_NAMESPACE
-
-QByteArray qt_winQString2MB(const QString& s, int uclen)
-{
- if (uclen < 0)
- uclen = s.length();
- if (s.isNull())
- return QByteArray();
- if (uclen == 0)
- return QByteArray("");
- return qt_winQString2MB(s.constData(), uclen);
-}
-
-QByteArray qt_winQString2MB(const QChar *ch, int uclen)
-{
- if (!ch)
- return QByteArray();
- if (uclen == 0)
- return QByteArray("");
- BOOL used_def;
- QByteArray mb(4096, 0);
- int len;
- while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen,
- mb.data(), mb.size()-1, 0, &used_def)))
- {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- mb.resize(1+WideCharToMultiByte(CP_ACP, 0,
- (const wchar_t*)ch, uclen,
- 0, 0, 0, &used_def));
- // and try again...
- } else {
-#ifndef QT_NO_DEBUG
- // Fail.
- qWarning("WideCharToMultiByte: Cannot convert multibyte text (error %d): %s (UTF-8)",
- r, QString(ch, uclen).toLocal8Bit().data());
-#endif
- break;
- }
- }
- mb.resize(len);
- return mb;
-}
-
-QString qt_winMB2QString(const char *mb, int mblen)
-{
- if (!mb || !mblen)
- return QString();
- const int wclen_auto = 4096;
- wchar_t wc_auto[wclen_auto];
- int wclen = wclen_auto;
- wchar_t *wc = wc_auto;
- int len;
- while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- mb, mblen, wc, wclen)))
- {
- int r = GetLastError();
- if (r == ERROR_INSUFFICIENT_BUFFER) {
- if (wc != wc_auto) {
- qWarning("MultiByteToWideChar: Size changed");
- break;
- } else {
- wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
- mb, mblen, 0, 0);
- wc = new wchar_t[wclen];
- // and try again...
- }
- } else {
- // Fail.
- qWarning("MultiByteToWideChar: Cannot convert multibyte text");
- break;
- }
- }
- if (len <= 0)
- return QString();
- if (wc[len-1] == 0) // len - 1: we don't want terminator
- --len;
- QString s((QChar*)wc, len);
- if (wc != wc_auto)
- delete [] wc;
- return s;
-}
-
-QT_END_NAMESPACE
-
-#endif // Q_OS_WIN32
-
-QT_BEGIN_NAMESPACE
-
/*!
Returns a QString initialized with the first \a size characters
of the 8-bit string \a str.
@@ -3815,14 +3857,16 @@
/*!
Returns a QString initialized with the first \a size characters
- of the 8-bit ASCII string \a str.
+ of the 8-bit string \a str.
If \a size is -1 (default), it is taken to be qstrlen(\a
str).
- If a codec has been set using QTextCodec::setCodecForCStrings(),
- it is used to convert \a str to Unicode; otherwise this function
- does the same as fromLatin1().
+ Note that, despite the name, this function actually uses the codec
+ defined by QTextCodec::setCodecForCStrings() to convert \a str to
+ Unicode. Depending on the codec, it may not accept valid US-ASCII (ANSI
+ X3.4-1986) input. If no codec has been set, this function does the same
+ as fromLatin1().
\sa toAscii(), fromLatin1(), fromUtf8(), fromLocal8Bit()
*/
@@ -3838,6 +3882,18 @@
If \a size is -1 (default), it is taken to be qstrlen(\a
str).
+ UTF-8 is a Unicode codec and can represent all characters in a Unicode
+ string like QString. However, invalid sequences are possible with UTF-8
+ and, if any such are found, they will be replaced with one or more
+ "replacement characters", or suppressed. These include non-Unicode
+ sequences, non-characters, overlong sequences or surrogate codepoints
+ encoded into UTF-8.
+
+ Non-characters are codepoints that the Unicode standard reserves and must
+ not be used in text interchange. They are the last two codepoints in each
+ Unicode Plane (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, etc.), as well
+ as 16 codepoints in the range U+FDD0..U+FDDF, inclusive.
+
\sa toUtf8(), fromAscii(), fromLatin1(), fromLocal8Bit()
*/
QString QString::fromUtf8(const char *str, int size)
@@ -3861,7 +3917,7 @@
host byte order is assumed.
This function is comparatively slow.
- Use QString(const ushort *, int) if possible.
+ Use QString(const QChar *, int) or QString(const QChar *) if possible.
QString makes a deep copy of the Unicode data.
@@ -3954,24 +4010,74 @@
{
if (d->size == 0)
return *this;
- QString result(d->size, Qt::Uninitialized);
- const QChar *from = (const QChar*) d->data;
- const QChar *fromend = (const QChar*) from+d->size;
- int outc=0;
- QChar *to = (QChar*) result.d->data;
- for (;;) {
- while (from!=fromend && from->isSpace())
- from++;
- while (from!=fromend && !from->isSpace())
- to[outc++] = *from++;
- if (from!=fromend)
- to[outc++] = QLatin1Char(' ');
- else
+
+ const QChar * const start = reinterpret_cast<QChar *>(d->data);
+ const QChar *from = start;
+ const QChar *fromEnd = start + d->size;
+ forever {
+ QChar ch = *from;
+ if (!ch.isSpace())
+ break;
+ if (++from == fromEnd) {
+ // All-whitespace string
+ shared_empty.ref.ref();
+ return QString(&shared_empty, 0);
+ }
+ }
+ // This loop needs no underflow check, as we already determined that
+ // the string contains non-whitespace. If the string has exactly one
+ // non-whitespace, it will be checked twice - we can live with that.
+ while (fromEnd[-1].isSpace())
+ fromEnd--;
+ // The rest of the function depends on the fact that we already know
+ // that the last character in the source is no whitespace.
+ const QChar *copyFrom = from;
+ int copyCount;
+ forever {
+ if (++from == fromEnd) {
+ // Only leading and/or trailing whitespace, if any at all
+ return mid(copyFrom - start, from - copyFrom);
+ }
+ QChar ch = *from;
+ if (!ch.isSpace())
+ continue;
+ if (ch != QLatin1Char(' ')) {
+ copyCount = from - copyFrom;
break;
+ }
+ ch = *++from;
+ if (ch.isSpace()) {
+ copyCount = from - copyFrom - 1;
+ break;
+ }
}
- if (outc > 0 && to[outc-1] == QLatin1Char(' '))
- outc--;
- result.truncate(outc);
+ // 'from' now points at the non-trailing whitespace which made the
+ // string not simplified in the first place. 'copyCount' is the number
+ // of already simplified characters - at least one, obviously -
+ // without a trailing space.
+ QString result((fromEnd - from) + copyCount, Qt::Uninitialized);
+ QChar *to = reinterpret_cast<QChar *>(result.d->data);
+ ::memcpy(to, copyFrom, copyCount * 2);
+ to += copyCount;
+ fromEnd--;
+ QChar ch;
+ forever {
+ *to++ = QLatin1Char(' ');
+ do {
+ ch = *++from;
+ } while (ch.isSpace());
+ if (from == fromEnd)
+ break;
+ do {
+ *to++ = ch;
+ ch = *++from;
+ if (from == fromEnd)
+ goto done;
+ } while (!ch.isSpace());
+ }
+ done:
+ *to++ = ch;
+ result.truncate(to - reinterpret_cast<QChar *>(result.d->data));
return result;
}
@@ -4212,8 +4318,10 @@
\overload operator+=()
- Appends the byte array \a ba to this string. The byte array is
- converted to Unicode using the fromAscii() function.
+ Appends the byte array \a ba to this string. The byte array is converted
+ to Unicode using the fromAscii() function. If any NUL characters ('\0')
+ are embedded in the \a ba byte array, they will be included in the
+ transformation.
You can disable this function by defining \c
QT_NO_CAST_FROM_ASCII when you compile your applications. This
@@ -4613,6 +4721,12 @@
return localeAwareCompare_helper(constData(), length(), other.constData(), other.length());
}
+#if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
+QT_END_NAMESPACE
+#include "qt_windows.h"
+QT_BEGIN_NAMESPACE
+#endif
+
/*!
\internal
\since 4.5
@@ -4990,8 +5104,19 @@
const char *c = cformat;
for (;;) {
// Copy non-escape chars to result
+#ifndef QT_NO_TEXTCODEC
+ int i = 0;
+ while (*(c + i) != '\0' && *(c + i) != '%')
+ ++i;
+ if (codecForCStrings)
+ result.append(codecForCStrings->toUnicode(c, i));
+ else
+ result.append(fromLatin1(c, i));
+ c += i;
+#else
while (*c != '\0' && *c != '%')
result.append(QLatin1Char(*c++));
+#endif
if (*c == '\0')
break;
@@ -5989,7 +6114,7 @@
*/
QString QString::normalized(QString::NormalizationForm mode) const
{
- return normalized(mode, CURRENT_VERSION);
+ return normalized(mode, UNICODE_DATA_VERSION); // forwards to the two-argument overload, passing UNICODE_DATA_VERSION as the version
}
/*!
@@ -6070,8 +6195,10 @@
if (simple)
return;
- QString &s = *data;
- if (version != CURRENT_VERSION) {
+ if (version == QChar::Unicode_Unassigned) {
+ version = UNICODE_DATA_VERSION;
+ } else if (version != UNICODE_DATA_VERSION) {
+ QString &s = *data;
for (int i = 0; i < NumNormalizationCorrections; ++i) {
const NormalizationCorrection &n = uc_normalization_corrections[i];
if (n.version > version) {
@@ -6898,9 +7025,9 @@
This operator is mostly useful to pass a QString to a function
that accepts a std::string object.
- If the QString contains non-ASCII Unicode characters, using this
- operator can lead to loss of information, since the implementation
- calls toAscii().
+ If the QString contains Unicode characters that the
+ QTextCodec::codecForCStrings() codec cannot handle, using this operator
+ can lead to loss of information.
This operator is only available if Qt is configured with STL
compatibility enabled.
@@ -6931,7 +7058,7 @@
'\\0'-terminated string (although utf16() does, at the cost of
copying the raw data).
- \sa fromUtf16()
+ \sa fromUtf16(), setRawData()
*/
QString QString::fromRawData(const QChar *unicode, int size)
{
@@ -6950,8 +7077,46 @@
return QString(x, 0);
}
+/*!
+ \since 4.7
+
+ Resets the QString to use the first \a size Unicode characters
+ in the array \a unicode. The data in \a unicode is \e not
+ copied. The caller must be able to guarantee that \a unicode will
+ not be deleted or modified as long as the QString (or an
+ unmodified copy of it) exists.
+
+ This function can be used instead of fromRawData() to re-use
+ existing QString objects to save memory re-allocations.
+
+ \sa fromRawData()
+*/
+QString &QString::setRawData(const QChar *unicode, int size)
+{
+ if (d->ref != 1 || d->alloc) { // ref is not 1, or alloc is non-zero: assign a fresh fromRawData() string instead of reusing d
+ *this = fromRawData(unicode, size);
+ } else {
+#ifdef QT3_SUPPORT
+ if (d->asciiCache) { // remove this data block from asciiCache before repointing it
+ Q_ASSERT(asciiCache);
+ asciiCache->remove(d);
+ }
+#endif
+ if (unicode) {
+ d->data = (ushort *)unicode; // point at the caller's buffer; nothing is copied
+ } else {
+ d->data = d->array; // null input: use the block's own array and force size to 0
+ size = 0;
+ }
+ d->alloc = d->size = size; // NOTE(review): a non-zero alloc here sends the next call down the fromRawData() branch above - confirm this is intended
+ *d->array = '\0';
+ d->clean = d->asciiCache = d->simpletext = d->righttoleft = d->capacity = 0;
+ }
+ return *this;
+}
+
/*! \class QLatin1String
- \brief The QLatin1String class provides a thin wrapper around an ASCII/Latin-1 encoded string literal.
+ \brief The QLatin1String class provides a thin wrapper around a US-ASCII/Latin-1 encoded string literal.
\ingroup string-processing
\reentrant
@@ -7038,7 +7203,7 @@
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7063,7 +7228,7 @@
\since 4.3
\overload operator!=()
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7089,7 +7254,7 @@
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c QT_NO_CAST_FROM_ASCII
@@ -7115,7 +7280,7 @@
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7141,7 +7306,7 @@
\since 4.3
\overload
- The \a other const char pointer is converted to a QLatin1String using
+ The \a other const char pointer is converted to a QString using
the QString::fromAscii() function.
You can disable this operator by defining \c
@@ -7222,7 +7387,7 @@
Writes the given \a string to the specified \a stream.
- \sa {Format of the QDataStream Operators}
+ \sa {Serializing Qt Data Types}
*/
QDataStream &operator<<(QDataStream &out, const QString &str)
@@ -7270,7 +7435,7 @@
Reads a string from the specified \a stream into the given \a string.
- \sa {Format of the QDataStream Operators}
+ \sa {Serializing Qt Data Types}
*/
QDataStream &operator>>(QDataStream &in, QString &str)
@@ -7317,7 +7482,7 @@
!= (QSysInfo::ByteOrder == QSysInfo::BigEndian)) {
ushort *data = reinterpret_cast<ushort *>(str.data());
while (len--) {
- *data = (*data >> 8) | (*data << 8);
+ *data = qbswap(*data);
++data;
}
}