src/corelib/tools/qchar.cpp
changeset 30 5dc02b23752f
parent 19 fcece45ef507
--- a/src/corelib/tools/qchar.cpp	Wed Jun 23 19:07:03 2010 +0300
+++ b/src/corelib/tools/qchar.cpp	Tue Jul 06 15:10:48 2010 +0300
@@ -42,10 +42,10 @@
 // Don't define it while compiling this module, or USERS of Qt will
 // not be able to link.
 #ifdef QT_NO_CAST_FROM_ASCII
-#undef QT_NO_CAST_FROM_ASCII
+#  undef QT_NO_CAST_FROM_ASCII
 #endif
 #ifdef QT_NO_CAST_TO_ASCII
-#undef QT_NO_CAST_TO_ASCII
+#  undef QT_NO_CAST_TO_ASCII
 #endif
 #include "qchar.h"
 #include "qdatastream.h"
@@ -57,17 +57,16 @@
 
 QT_BEGIN_NAMESPACE
 
-#define LAST_UNICODE_CHAR 0x10ffff
-
 #ifndef QT_NO_CODEC_FOR_C_STRINGS
-#ifdef QT_NO_TEXTCODEC
-#define QT_NO_CODEC_FOR_C_STRINGS
-#endif
+#  ifdef QT_NO_TEXTCODEC
+#    define QT_NO_CODEC_FOR_C_STRINGS
+#  endif
 #endif
 
 #define FLAG(x) (1 << (x))
 
-/*! \class QLatin1Char
+/*!
+    \class QLatin1Char
     \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
 
     \ingroup string-processing
@@ -554,7 +553,7 @@
 /*!
     Returns true if the character is a mark (Mark_* categories);
     otherwise returns false.
-    
+
     See QChar::Category for more information regarding marks.
 */
 bool QChar::isMark() const
@@ -651,45 +650,71 @@
 }
 
 /*!
-  \fn bool QChar::isHighSurrogate() const
+    \fn bool QChar::isHighSurrogate() const
+
+    Returns true if the QChar is the high part of a UTF-16 surrogate
+    (i.e. if its code point is between 0xd800 and 0xdbff, inclusive).
+*/
 
-  Returns true if the QChar is the high part of a utf16 surrogate
-  (ie. if its code point is between 0xd800 and 0xdbff).
+/*!
+    \fn bool QChar::isLowSurrogate() const
+
+    Returns true if the QChar is the low part of a UTF-16 surrogate
+    (i.e. if its code point is between 0xdc00 and 0xdfff, inclusive).
 */
 
 /*!
-  \fn bool QChar::isLowSurrogate() const
+    \fn static bool QChar::isHighSurrogate(uint ucs4)
+    \since 4.7
+
+    Returns true if the UCS-4-encoded character specified by \a ucs4
+    is the high part of a UTF-16 surrogate
+    (i.e. if its code point is between 0xd800 and 0xdbff, inclusive).
+*/
 
-  Returns true if the QChar is the low part of a utf16 surrogate
-  (ie. if its code point is between 0xdc00 and 0xdfff).
+/*!
+    \fn static bool QChar::isLowSurrogate(uint ucs4)
+    \since 4.7
+
+    Returns true if the UCS-4-encoded character specified by \a ucs4
+    is the low part of a UTF-16 surrogate
+    (i.e. if its code point is between 0xdc00 and 0xdfff, inclusive).
 */
 
 /*!
-  \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
+    \fn static bool QChar::requiresSurrogates(uint ucs4)
+    \since 4.7
 
-  Converts a UTF16 surrogate pair with the given \a high and \a low values
-  to its UCS-4 code point.
+    Returns true if the UCS-4-encoded character specified by \a ucs4
+    can be split into the high and low parts of a UTF-16 surrogate pair
+    (i.e. if its code point is greater than or equal to 0x10000).
+*/
+
+/*!
+    \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)
+
+    Converts a UTF16 surrogate pair with the given \a high and \a low values
+    to its UCS-4 code point.
 */
 
 /*!
-  \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
+    \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
 
-  Converts a utf16 surrogate pair (\a high, \a low) to its ucs4 code
-  point.
+    Converts a utf16 surrogate pair (\a high, \a low) to its ucs4 code point.
 */
 
 /*!
-  \fn static ushort QChar::highSurrogate(uint ucs4)
+    \fn static ushort QChar::highSurrogate(uint ucs4)
 
-  Returns the high surrogate value of a ucs4 code point.
-  The returned result is undefined if \a ucs4 is smaller than 0x10000.
+    Returns the high surrogate value of a ucs4 code point.
+    The returned result is undefined if \a ucs4 is smaller than 0x10000.
 */
 
 /*!
-  \fn static ushort QChar::lowSurrogate(uint ucs4)
+    \fn static ushort QChar::lowSurrogate(uint ucs4)
 
-  Returns the low surrogate value of a ucs4 code point.
-  The returned result is undefined if \a ucs4 is smaller than 0x10000.
+    Returns the low surrogate value of a ucs4 code point.
+    The returned result is undefined if \a ucs4 is smaller than 0x10000.
 */
 
 /*!
@@ -718,7 +743,7 @@
 */
 int QChar::digitValue(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return 0;
     return qGetProp(ucs4)->digitValue;
 }
@@ -731,22 +756,22 @@
     return (QChar::Category) qGetProp(ucs)->category;
 }
 
-/*! 
+/*!
     \overload
     \since 4.3
     Returns the category of the UCS-4-encoded character specified by \a ucs4.
- */
+*/
 QChar::Category QChar::category(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return QChar::NoCategory;
     return (QChar::Category) qGetProp(ucs4)->category;
 }
 
-/*! 
+/*!
     \overload
     Returns the category of the UCS-2-encoded character specified by \a ucs2.
- */
+*/
 QChar::Category QChar::category(ushort ucs2)
 {
     return (QChar::Category) qGetProp(ucs2)->category;
@@ -761,21 +786,21 @@
     return (QChar::Direction) qGetProp(ucs)->direction;
 }
 
-/*! 
-\overload
-Returns the direction of the UCS-4-encoded character specified by \a ucs4.
- */
+/*!
+    \overload
+    Returns the direction of the UCS-4-encoded character specified by \a ucs4.
+*/
 QChar::Direction QChar::direction(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return QChar::DirL;
     return (QChar::Direction) qGetProp(ucs4)->direction;
 }
 
-/*! 
-\overload
-Returns the direction of the UCS-2-encoded character specified by \a ucs2.
- */
+/*!
+    \overload
+    Returns the direction of the UCS-2-encoded character specified by \a ucs2.
+*/
 QChar::Direction QChar::direction(ushort ucs2)
 {
     return (QChar::Direction) qGetProp(ucs2)->direction;
@@ -790,25 +815,25 @@
     return (QChar::Joining) qGetProp(ucs)->joining;
 }
 
-/*! 
-\overload
-Returns information about the joining properties of the UCS-4-encoded
-character specified by \a ucs4 (needed for certain languages such as
-Arabic).
- */
+/*!
+    \overload
+    Returns information about the joining properties of the UCS-4-encoded
+    character specified by \a ucs4 (needed for certain languages such as
+    Arabic).
+*/
 QChar::Joining QChar::joining(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return QChar::OtherJoining;
     return (QChar::Joining) qGetProp(ucs4)->joining;
 }
 
-/*! 
-\overload
-Returns information about the joining properties of the UCS-2-encoded
-character specified by \a ucs2 (needed for certain languages such as
-Arabic).
- */
+/*!
+    \overload
+    Returns information about the joining properties of the UCS-2-encoded
+    character specified by \a ucs2 (needed for certain languages such as
+    Arabic).
+*/
 QChar::Joining QChar::joining(ushort ucs2)
 {
     return (QChar::Joining) qGetProp(ucs2)->joining;
@@ -867,26 +892,27 @@
     return ucs + qGetProp(ucs)->mirrorDiff;
 }
 
-/*! \overload
-Returns the mirrored character if the UCS-4-encoded character specified
-by \a ucs4 is a mirrored character; otherwise returns the character itself.
+/*!
+    \overload
+    Returns the mirrored character if the UCS-4-encoded character specified
+    by \a ucs4 is a mirrored character; otherwise returns the character itself.
 
-\sa hasMirrored()
- */
+    \sa hasMirrored()
+*/
 uint QChar::mirroredChar(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return ucs4;
     return ucs4 + qGetProp(ucs4)->mirrorDiff;
 }
 
-/*! 
-\overload
-Returns the mirrored character if the UCS-2-encoded character specified
-by \a ucs2 is a mirrored character; otherwise returns the character itself.
+/*!
+    \overload
+    Returns the mirrored character if the UCS-2-encoded character specified
+    by \a ucs2 is a mirrored character; otherwise returns the character itself.
 
-\sa hasMirrored()
- */
+    \sa hasMirrored()
+*/
 ushort QChar::mirroredChar(ushort ucs2)
 {
     return ucs2 + qGetProp(ucs2)->mirrorDiff;
@@ -910,7 +936,7 @@
     (uint ucs4, int *length, int *tag, unsigned short *buffer)
 {
     *length = 0;
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return 0;
     if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
         int SIndex = ucs4 - Hangul_SBase;
@@ -940,11 +966,11 @@
     return decomposition(ucs);
 }
 
-/*! 
-\overload
-Decomposes the UCS-4-encoded character specified by \a ucs4 into its
-constituent parts. Returns an empty string if no decomposition exists.
- */
+/*!
+    \overload
+    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
+    constituent parts. Returns an empty string if no decomposition exists.
+*/
 QString QChar::decomposition(uint ucs4)
 {
     unsigned short buffer[3];
@@ -963,14 +989,14 @@
     return decompositionTag(ucs);
 }
 
-/*! 
-\overload
-Returns the tag defining the composition of the UCS-4-encoded character
-specified by \a ucs4. Returns QChar::Single if no decomposition exists.
- */
+/*!
+    \overload
+    Returns the tag defining the composition of the UCS-4-encoded character
+    specified by \a ucs4. Returns QChar::Single if no decomposition exists.
+*/
 QChar::Decomposition QChar::decompositionTag(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return QChar::NoDecomposition;
     const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
     if (index == 0xffff)
@@ -991,27 +1017,28 @@
     return (unsigned char) qGetProp(ucs)->combiningClass;
 }
 
-/*! \overload
-Returns the combining class for the UCS-4-encoded character specified by
-\a ucs4, as defined in the Unicode standard.
- */
+/*!
+    \overload
+    Returns the combining class for the UCS-4-encoded character specified by
+    \a ucs4, as defined in the Unicode standard.
+*/
 unsigned char QChar::combiningClass(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return 0;
     return (unsigned char) qGetProp(ucs4)->combiningClass;
 }
 
-/*! \overload
-Returns the combining class for the UCS-2-encoded character specified by
-\a ucs2, as defined in the Unicode standard.
- */
+/*!
+    \overload
+    Returns the combining class for the UCS-2-encoded character specified by
+    \a ucs2, as defined in the Unicode standard.
+*/
 unsigned char QChar::combiningClass(ushort ucs2)
 {
     return (unsigned char) qGetProp(ucs2)->combiningClass;
 }
 
-
 /*!
     Returns the Unicode version that introduced this character.
 */
@@ -1020,21 +1047,23 @@
     return (QChar::UnicodeVersion) qGetProp(ucs)->unicodeVersion;
 }
 
-/*! \overload
-Returns the Unicode version that introduced the character specified in
-its UCS-4-encoded form as \a ucs4.
- */
+/*!
+    \overload
+    Returns the Unicode version that introduced the character specified in
+    its UCS-4-encoded form as \a ucs4.
+*/
 QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return QChar::Unicode_Unassigned;
     return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
 }
 
-/*! \overload
-Returns the Unicode version that introduced the character specified in
-its UCS-2-encoded form as \a ucs2.
- */
+/*!
+    \overload
+    Returns the Unicode version that introduced the character specified in
+    its UCS-2-encoded form as \a ucs2.
+*/
 QChar::UnicodeVersion QChar::unicodeVersion(ushort ucs2)
 {
     return (QChar::UnicodeVersion) qGetProp(ucs2)->unicodeVersion;
@@ -1053,14 +1082,15 @@
     return ucs;
 }
 
-/*! \overload
-Returns the lowercase equivalent of the UCS-4-encoded character specified
-by \a ucs4 if the character is uppercase or titlecase; otherwise returns
-the character itself.
- */
+/*!
+    \overload
+    Returns the lowercase equivalent of the UCS-4-encoded character specified
+    by \a ucs4 if the character is uppercase or titlecase; otherwise returns
+    the character itself.
+*/
 uint QChar::toLower(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return ucs4;
     const QUnicodeTables::Properties *p = qGetProp(ucs4);
     if (!p->lowerCaseSpecial)
@@ -1068,11 +1098,12 @@
     return ucs4;
 }
 
-/*! \overload
-Returns the lowercase equivalent of the UCS-2-encoded character specified
-by \a ucs2 if the character is uppercase or titlecase; otherwise returns
-the character itself.
- */
+/*!
+    \overload
+    Returns the lowercase equivalent of the UCS-2-encoded character specified
+    by \a ucs2 if the character is uppercase or titlecase; otherwise returns
+    the character itself.
+*/
 ushort QChar::toLower(ushort ucs2)
 {
     const QUnicodeTables::Properties *p = qGetProp(ucs2);
@@ -1093,14 +1124,15 @@
     return ucs;
 }
 
-/*! \overload
-Returns the uppercase equivalent of the UCS-4-encoded character specified
-by \a ucs4 if the character is lowercase or titlecase; otherwise returns
-the character itself.
- */
+/*!
+    \overload
+    Returns the uppercase equivalent of the UCS-4-encoded character specified
+    by \a ucs4 if the character is lowercase or titlecase; otherwise returns
+    the character itself.
+*/
 uint QChar::toUpper(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return ucs4;
     const QUnicodeTables::Properties *p = qGetProp(ucs4);
     if (!p->upperCaseSpecial)
@@ -1108,11 +1140,12 @@
     return ucs4;
 }
 
-/*! \overload
-Returns the uppercase equivalent of the UCS-2-encoded character specified
-by \a ucs2 if the character is lowercase or titlecase; otherwise returns
-the character itself.
- */
+/*!
+    \overload
+    Returns the uppercase equivalent of the UCS-2-encoded character specified
+    by \a ucs2 if the character is lowercase or titlecase; otherwise returns
+    the character itself.
+*/
 ushort QChar::toUpper(ushort ucs2)
 {
     const QUnicodeTables::Properties *p = qGetProp(ucs2);
@@ -1141,7 +1174,7 @@
 */
 uint QChar::toTitleCase(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return ucs4;
     const QUnicodeTables::Properties *p = qGetProp(ucs4);
     if (!p->titleCaseSpecial)
@@ -1202,7 +1235,7 @@
 */
 uint QChar::toCaseFolded(uint ucs4)
 {
-    if (ucs4 > LAST_UNICODE_CHAR)
+    if (ucs4 > UNICODE_LAST_CODEPOINT)
         return ucs4;
     return ucs4 + qGetProp(ucs4)->caseFoldDiff;
 }
@@ -1296,28 +1329,25 @@
 
 #ifndef QT_NO_DATASTREAM
 /*!
-  \relates QChar
-
-  Writes the char \a chr to the stream \a out.
+    \relates QChar
 
-  \sa {Format of the QDataStream operators}
- */
+    Writes the char \a chr to the stream \a out.
 
+    \sa {Serializing Qt Data Types}
+*/
 QDataStream &operator<<(QDataStream &out, const QChar &chr)
 {
     out << quint16(chr.unicode());
     return out;
 }
 
-
 /*!
-  \relates QChar
+    \relates QChar
 
-  Reads a char from the stream \a in into char \a chr.
+    Reads a char from the stream \a in into char \a chr.
 
-  \sa {Format of the QDataStream operators}
- */
-
+    \sa {Serializing Qt Data Types}
+*/
 QDataStream &operator>>(QDataStream &in, QChar &chr)
 {
     quint16 u;
@@ -1450,9 +1480,9 @@
         if (!d || (canonical && tag != QChar::Canonical))
             continue;
 
-        s.replace(uc - utf16, ucs4 > 0x10000 ? 2 : 1, (const QChar *)d, length);
+        int pos = uc - utf16;
+        s.replace(pos, QChar::requiresSurrogates(ucs4) ? 2 : 1, reinterpret_cast<const QChar *>(d), length);
         // since the insert invalidates the pointers and we do decomposition recursive
-        int pos = uc - utf16;
         utf16 = reinterpret_cast<unsigned short *>(s.data());
         uc = utf16 + pos + length;
     }
@@ -1537,46 +1567,52 @@
         int p2 = pos+1;
         uint u1 = s.at(pos).unicode();
         if (QChar(u1).isHighSurrogate()) {
-            ushort low = s.at(pos+1).unicode();
+            ushort low = s.at(p2).unicode();
             if (QChar(low).isLowSurrogate()) {
-                p2++;
                 u1 = QChar::surrogateToUcs4(u1, low);
                 if (p2 >= l)
                     break;
+                ++p2;
             }
         }
         uint u2 = s.at(p2).unicode();
-        if (QChar(u2).isHighSurrogate() && p2 < l-1) {
+        if (QChar(u2).isHighSurrogate() && p2 < l) {
             ushort low = s.at(p2+1).unicode();
             if (QChar(low).isLowSurrogate()) {
-                p2++;
                 u2 = QChar::surrogateToUcs4(u2, low);
+                ++p2;
             }
         }
 
-        int c2 = QChar::combiningClass(u2);
-        if (QChar::unicodeVersion(u2) > version)
-            c2 = 0;
-
+        ushort c2 = 0;
+        {
+            const QUnicodeTables::Properties *p = qGetProp(u2);
+            if ((QChar::UnicodeVersion)p->unicodeVersion <= version)
+                c2 = p->combiningClass;
+        }
         if (c2 == 0) {
             pos = p2+1;
             continue;
         }
-        int c1 = QChar::combiningClass(u1);
-        if (QChar::unicodeVersion(u1) > version)
-            c1 = 0;
+
+        ushort c1 = 0;
+        {
+            const QUnicodeTables::Properties *p = qGetProp(u1);
+            if ((QChar::UnicodeVersion)p->unicodeVersion <= version)
+                c1 = p->combiningClass;
+        }
 
         if (c1 > c2) {
             QChar *uc = s.data();
             int p = pos;
             // exchange characters
-            if (u2 < 0x10000) {
+            if (!QChar::requiresSurrogates(u2)) {
                 uc[p++] = u2;
             } else {
                 uc[p++] = QChar::highSurrogate(u2);
                 uc[p++] = QChar::lowSurrogate(u2);
             }
-            if (u1 < 0x10000) {
+            if (!QChar::requiresSurrogates(u1)) {
                 uc[p++] = u1;
             } else {
                 uc[p++] = QChar::highSurrogate(u1);
@@ -1588,7 +1624,7 @@
                 --pos;
         } else {
             ++pos;
-            if (u1 > 0x10000)
+            if (QChar::requiresSurrogates(u1))
                 ++pos;
         }
     }
@@ -1606,11 +1642,9 @@
     return script;
 }
 
-
 Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL QUnicodeTables::lineBreakClass(uint ucs4)
 {
     return (QUnicodeTables::LineBreakClass) qGetProp(ucs4)->line_break_class;
 }
 
-
 QT_END_NAMESPACE