src/corelib/codecs/qutfcodec.cpp
changeset 0 1918ee327afb
child 4 3b1da2848fc7
child 7 f7bc934e204c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/corelib/codecs/qutfcodec.cpp	Mon Jan 11 14:00:40 2010 +0000
@@ -0,0 +1,656 @@
+/****************************************************************************
+**
+** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
+** All rights reserved.
+** Contact: Nokia Corporation (qt-info@nokia.com)
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** No Commercial Usage
+** This file contains pre-release code and may not be distributed.
+** You may use this file in accordance with the terms and conditions
+** contained in the Technology Preview License Agreement accompanying
+** this package.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 2.1 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL included in the
+** packaging of this file.  Please review the following information to
+** ensure the GNU Lesser General Public License version 2.1 requirements
+** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+**
+** In addition, as a special exception, Nokia gives you certain additional
+** rights.  These rights are described in the Nokia Qt LGPL Exception
+** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+**
+** If you have questions regarding the use of this file, please contact
+** Nokia at qt-info@nokia.com.
+**
+**
+**
+**
+**
+**
+**
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qutfcodec_p.h"
+#include "qlist.h"
+#include "qendian.h"
+#include "qchar.h"
+
+QT_BEGIN_NAMESPACE
+
+// Indices into QTextCodec::ConverterState::state_data as used by the UTF-16
+// and UTF-32 decoders in this file: state_data[Endian] stores the byte order
+// determined so far, state_data[Data] stores the bytes of a code unit that
+// was still incomplete when the previous chunk ended.
+enum { Endian = 0, Data = 1 };
+
+/*
+    Encodes the len UTF-16 code units starting at uc as UTF-8.
+
+    state may be null. When it is given:
+      - QTextCodec::ConvertInvalidToNull selects a 0 byte instead of '?' as
+        the replacement for unpaired surrogates;
+      - a UTF-8 byte order mark (EF BB BF) is written first unless
+        QTextCodec::IgnoreHeader is already set (no BOM is written when
+        state is null);
+      - a high surrogate left over at the end of the input is kept in
+        state_data[0] / remainingChars and combined with the first unit of
+        the next call;
+      - invalidChars is increased by the number of replaced units and
+        IgnoreHeader is set on return.
+
+    Returns the encoded bytes.
+*/
+QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state)
+{
+    uchar replacement = '?';
+    // every UTF-16 unit needs at most 3 bytes (a surrogate pair: 4 bytes for 2 units)
+    int rlen = 3*len;
+    int surrogate_high = -1;
+    if (state) {
+        if (state->flags & QTextCodec::ConvertInvalidToNull)
+            replacement = 0;
+        if (!(state->flags & QTextCodec::IgnoreHeader))
+            rlen += 3;
+        if (state->remainingChars) {
+            surrogate_high = state->state_data[0];
+            // the carried-over surrogate is not counted in len: together with
+            // the first unit it can produce 4 bytes, one more than 3*len allows
+            rlen += 1;
+        }
+    }
+
+    QByteArray rstr;
+    rstr.resize(rlen);
+    uchar* cursor = (uchar*)rstr.data();
+    const QChar *ch = uc;
+    int invalid = 0;
+    if (state && !(state->flags & QTextCodec::IgnoreHeader)) {
+        *cursor++ = 0xef;
+        *cursor++ = 0xbb;
+        *cursor++ = 0xbf;
+    }
+
+    const QChar *end = ch + len;
+    while (ch < end) {
+        uint u = ch->unicode();
+        if (surrogate_high >= 0) {
+            if (u >= 0xdc00 && u < 0xe000) {
+                u = (surrogate_high - 0xd800)*0x400 + (u - 0xdc00) + 0x10000;
+                surrogate_high = -1;
+            } else {
+                // high surrogate without low: replace only the surrogate and
+                // leave ch where it is so the current unit is encoded on the
+                // next iteration instead of being dropped
+                *cursor++ = replacement;
+                ++invalid;
+                surrogate_high = -1;
+                continue;
+            }
+        } else if (u >= 0xdc00 && u < 0xe000) {
+            // low surrogate without high
+            *cursor++ = replacement;
+            ++ch;
+            ++invalid;
+            continue;
+        } else if (u >= 0xd800 && u < 0xdc00) {
+            // remember the high surrogate; it emits nothing until its partner is seen
+            surrogate_high = u;
+            ++ch;
+            continue;
+        }
+
+        if (u < 0x80) {
+            *cursor++ = (uchar)u;
+        } else {
+            if (u < 0x0800) {
+                *cursor++ = 0xc0 | ((uchar) (u >> 6));
+            } else {
+                if (u > 0xffff) {
+                    // see QString::fromUtf8() and QString::utf8() for explanations
+                    // (code points in this range are written back as the single
+                    // byte u - 0x10fe00)
+                    if (u > 0x10fe00 && u < 0x10ff00) {
+                        *cursor++ = (u - 0x10fe00);
+                        ++ch;
+                        continue;
+                    } else {
+                        *cursor++ = 0xf0 | ((uchar) (u >> 18));
+                        *cursor++ = 0x80 | (((uchar) (u >> 12)) & 0x3f);
+                    }
+                } else {
+                    *cursor++ = 0xe0 | (((uchar) (u >> 12)) & 0x3f);
+                }
+                *cursor++ = 0x80 | (((uchar) (u >> 6)) & 0x3f);
+            }
+            *cursor++ = 0x80 | ((uchar) (u&0x3f));
+        }
+        ++ch;
+    }
+
+    if (!state && surrogate_high >= 0) {
+        // without a state there is no later call that could supply the low
+        // surrogate, so the dangling high surrogate is invalid input
+        *cursor++ = replacement;
+    }
+
+    // shrink the worst-case buffer to the bytes actually written
+    rstr.resize(cursor - (const uchar*)rstr.constData());
+    if (state) {
+        state->invalidChars += invalid;
+        state->flags |= QTextCodec::IgnoreHeader;
+        state->remainingChars = 0;
+        if (surrogate_high >= 0) {
+            state->remainingChars = 1;
+            state->state_data[0] = surrogate_high;
+        }
+    }
+    return rstr;
+}
+
+/*
+    Decodes len bytes of UTF-8 starting at chars into a QString.
+
+    state may be null. When it is given:
+      - QTextCodec::IgnoreHeader disables skipping of a leading byte order mark;
+      - QTextCodec::ConvertInvalidToNull selects QChar::Null instead of
+        QChar::ReplacementCharacter for invalid input;
+      - a multi-byte sequence cut off at the end of the input is resumed on the
+        next call: remainingChars holds the number of continuation bytes still
+        needed, state_data[0] the partially assembled code point and
+        state_data[1] the smallest code point the sequence may legally encode;
+      - invalidChars is increased by the number of replacements written.
+
+    Without a state, a sequence left unterminated at the end of the input is
+    turned into one replacement character per byte.
+*/
+QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
+{
+    bool headerdone = false;
+    ushort replacement = QChar::ReplacementCharacter;
+    int need = 0;       // continuation bytes still expected for the current sequence
+    int error = -1;     // index of the lead byte of the current sequence, -1 if it began in an earlier call
+    uint uc = 0;        // code point assembled so far
+    uint min_uc = 0;    // lower bound used to reject overlong encodings
+    if (state) {
+        if (state->flags & QTextCodec::IgnoreHeader)
+            headerdone = true;
+        if (state->flags & QTextCodec::ConvertInvalidToNull)
+            replacement = QChar::Null;
+        need = state->remainingChars;
+        if (need) {
+            uc = state->state_data[0];
+            min_uc = state->state_data[1];
+        }
+    }
+    // Fast path for a BOM at the very start; this only triggers when more than
+    // three bytes are available. A BOM in a shorter chunk is still skipped by
+    // the uc == 0xfeff check inside the loop.
+    if (!headerdone && len > 3
+        && (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
+        // starts with a byte order mark
+        chars += 3;
+        len -= 3;
+        headerdone = true;
+    }
+
+    QString result(need + len + 1, Qt::Uninitialized); // worst case
+    ushort *qch = (ushort *)result.unicode();
+    uchar ch;
+    int invalid = 0;
+
+    for (int i = 0; i < len; ++i) {
+        ch = chars[i];
+        if (need) {
+            // inside a multi-byte sequence: expect a 10xxxxxx continuation byte
+            if ((ch&0xc0) == 0x80) {
+                uc = (uc << 6) | (ch & 0x3f);
+                --need;
+                if (!need) {
+                    // utf-8 bom composes into 0xfeff code point
+                    if (!headerdone && uc == 0xfeff) {
+                        // don't do anything, just skip the BOM
+                    } else if (uc > 0xffff && uc < 0x110000) {
+                        // surrogate pair
+                        Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
+                        *qch++ = QChar::highSurrogate(uc);
+                        *qch++ = QChar::lowSurrogate(uc);
+                    } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || (uc >= 0xfffe)) {
+                        // error: overlong sequence, UTF16 surrogate or BOM
+                        *qch++ = replacement;
+                        ++invalid;
+                    } else {
+                        *qch++ = uc;
+                    }
+                    headerdone = true;
+                }
+            } else {
+                // error
+                // Rewind to the lead byte of the broken sequence; the loop's ++i
+                // then resumes with the byte right after it. If the sequence
+                // started in a previous call, error is -1 and decoding resumes
+                // at index 0, i.e. the current byte is examined again.
+                i = error;
+                *qch++ = replacement;
+                ++invalid;
+                need = 0;
+                headerdone = true;
+            }
+        } else {
+            // start of a new character: classify the lead byte
+            if (ch < 128) {
+                *qch++ = ushort(ch);
+                headerdone = true;
+            } else if ((ch & 0xe0) == 0xc0) {
+                uc = ch & 0x1f;
+                need = 1;
+                error = i;
+                min_uc = 0x80;
+                headerdone = true;
+            } else if ((ch & 0xf0) == 0xe0) {
+                // headerdone is deliberately left untouched here: a three-byte
+                // sequence may turn out to be the BOM, which is checked once
+                // the sequence is complete
+                uc = ch & 0x0f;
+                need = 2;
+                error = i;
+                min_uc = 0x800;
+            } else if ((ch&0xf8) == 0xf0) {
+                uc = ch & 0x07;
+                need = 3;
+                error = i;
+                min_uc = 0x10000;
+                headerdone = true;
+            } else {
+                // error
+                *qch++ = replacement;
+                ++invalid;
+                headerdone = true;
+            }
+        }
+    }
+    if (!state && need > 0) {
+        // unterminated UTF sequence
+        // (one replacement per byte, starting at the sequence's lead byte)
+        for (int i = error; i < len; ++i) {
+            *qch++ = replacement;
+            ++invalid;
+        }
+    }
+    // cut the worst-case allocation down to what was actually written
+    result.truncate(qch - (ushort *)result.unicode());
+    if (state) {
+        state->invalidChars += invalid;
+        state->remainingChars = need;
+        if (headerdone)
+            state->flags |= QTextCodec::IgnoreHeader;
+        state->state_data[0] = need ? uc : 0;
+        state->state_data[1] = need ? min_uc : 0;
+    }
+    return result;
+}
+
+/*
+    Serializes the len UTF-16 code units starting at uc into a byte array,
+    two bytes per unit.
+
+    e selects the byte order; DetectEndianness means the host's byte order.
+    A byte order mark is written first unless state is given and already has
+    QTextCodec::IgnoreHeader set. If state is given, IgnoreHeader is set and
+    remainingChars is cleared on return.
+*/
+QByteArray QUtf16::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+    const bool writeBom = !state || !(state->flags & QTextCodec::IgnoreHeader);
+
+    // resolve "detect" to the byte order of the machine we are running on
+    DataEndianness order = e;
+    if (e == DetectEndianness) {
+        if (QSysInfo::ByteOrder == QSysInfo::BigEndian)
+            order = BigEndianness;
+        else
+            order = LittleEndianness;
+    }
+    const bool bigEndian = (order == BigEndianness);
+
+    int byteCount = 2*len;
+    if (writeBom)
+        byteCount += 2;
+
+    QByteArray bytes;
+    bytes.resize(byteCount);
+    char *out = bytes.data();
+
+    if (writeBom) {
+        QChar bom(QChar::ByteOrderMark);
+        if (bigEndian) {
+            out[0] = bom.row();
+            out[1] = bom.cell();
+        } else {
+            out[0] = bom.cell();
+            out[1] = bom.row();
+        }
+        out += 2;
+    }
+
+    // row() is the high byte of a unit, cell() the low byte
+    for (int i = 0; i < len; ++i) {
+        const QChar &unit = uc[i];
+        if (bigEndian) {
+            *out++ = unit.row();
+            *out++ = unit.cell();
+        } else {
+            *out++ = unit.cell();
+            *out++ = unit.row();
+        }
+    }
+
+    if (state) {
+        state->flags |= QTextCodec::IgnoreHeader;
+        state->remainingChars = 0;
+    }
+    return bytes;
+}
+
+/*
+    Decodes len bytes of UTF-16 starting at chars into a QString.
+
+    e selects the byte order. With DetectEndianness the order is taken from
+    state_data[Endian] of a given state, otherwise from the first decoded
+    unit: a byte order mark fixes the order and is dropped, anything else
+    falls back to the host's byte order. With an explicit byte order a leading
+    QChar::ByteOrderMark is dropped as well.
+
+    If state is given, a trailing odd byte is kept in state_data[Data]
+    (remainingChars == 1) and completed by the next call, the byte order is
+    stored in state_data[Endian], and QTextCodec::IgnoreHeader is set once the
+    first unit has been processed.
+*/
+QString QUtf16::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+    DataEndianness endian = e;
+    bool half = false;      // true while the first byte of a unit is waiting in buf
+    uchar buf = 0;
+    bool headerdone = false;
+    if (state) {
+        headerdone = state->flags & QTextCodec::IgnoreHeader;
+        if (endian == DetectEndianness)
+            endian = (DataEndianness)state->state_data[Endian];
+        if (state->remainingChars) {
+            half = true;
+            buf = state->state_data[Data];
+        }
+    }
+    // header already consumed but still no byte order known: use the host's
+    if (headerdone && endian == DetectEndianness)
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+    QString result(len, Qt::Uninitialized); // worst case
+    QChar *qch = (QChar *)result.unicode();
+    while (len--) {
+        if (half) {
+            // second byte of a unit: combine it with the buffered first byte.
+            // While the order is still undetected the unit is assembled as
+            // big endian (the else branch).
+            QChar ch;
+            if (endian == LittleEndianness) {
+                ch.setRow(*chars++);
+                ch.setCell(buf);
+            } else {
+                ch.setRow(buf);
+                ch.setCell(*chars++);
+            }
+            if (!headerdone) {
+                // first unit of the stream: it may be a byte order mark
+                if (endian == DetectEndianness) {
+                    if (ch == QChar::ByteOrderSwapped && endian != BigEndianness) {
+                        endian = LittleEndianness;
+                    } else if (ch == QChar::ByteOrderMark && endian != LittleEndianness) {
+                        // ignore BOM
+                        endian = BigEndianness;
+                    } else {
+                        // no BOM: assume host byte order and keep the unit
+                        if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+                            endian = BigEndianness;
+                        } else {
+                            endian = LittleEndianness;
+                            // the unit was assembled as big endian above; swap its bytes
+                            ch = QChar((ch.unicode() >> 8) | ((ch.unicode() & 0xff) << 8));
+                        }
+                        *qch++ = ch;
+                    }
+                } else if (ch != QChar::ByteOrderMark) {
+                    *qch++ = ch;
+                }
+                headerdone = true;
+            } else {
+                *qch++ = ch;
+            }
+            half = false;
+        } else {
+            // first byte of a unit: stash it until its partner arrives
+            buf = *chars++;
+            half = true;
+        }
+    }
+    // shrink the worst-case allocation to the units actually written
+    result.truncate(qch - result.unicode());
+
+    if (state) {
+        if (headerdone)
+            state->flags |= QTextCodec::IgnoreHeader;
+        state->state_data[Endian] = endian;
+        if (half) {
+            state->remainingChars = 1;
+            state->state_data[Data] = buf;
+        } else {
+            state->remainingChars = 0;
+            state->state_data[Data] = 0;
+        }
+    }
+    return result;
+}
+
+/*
+    Encodes the len UTF-16 code units starting at uc as UTF-32, four bytes
+    per code point.
+
+    e selects the byte order; DetectEndianness means the host's byte order.
+    A byte order mark is written first unless state is given and already has
+    QTextCodec::IgnoreHeader set. A high surrogate that is followed by another
+    unit is combined with that unit into a single code point. If state is
+    given, IgnoreHeader is set and remainingChars is cleared on return.
+
+    The returned array contains exactly the bytes written.
+*/
+QByteArray QUtf32::convertFromUnicode(const QChar *uc, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+    const bool writeBom = !state || !(state->flags & QTextCodec::IgnoreHeader);
+
+    DataEndianness endian = e;
+    if (e == DetectEndianness) {
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+    }
+    const bool bigEndian = (endian == BigEndianness);
+
+    // upper bound: one code point per input unit, plus the optional BOM
+    int length = 4*len;
+    if (writeBom)
+        length += 4;
+
+    QByteArray d(length, Qt::Uninitialized);
+    char *data = d.data();
+    if (writeBom) {
+        if (bigEndian) {
+            data[0] = 0;
+            data[1] = 0;
+            data[2] = (char)0xfe;
+            data[3] = (char)0xff;
+        } else {
+            data[0] = (char)0xff;
+            data[1] = (char)0xfe;
+            data[2] = 0;
+            data[3] = 0;
+        }
+        data += 4;
+    }
+
+    for (int i = 0; i < len; ++i) {
+        uint cp = uc[i].unicode();
+        // a surrogate pair consumes two input units but yields one code point
+        if (uc[i].isHighSurrogate() && i < len - 1)
+            cp = QChar::surrogateToUcs4(cp, uc[++i].unicode());
+        if (bigEndian) {
+            *(data++) = cp >> 24;
+            *(data++) = (cp >> 16) & 0xff;
+            *(data++) = (cp >> 8) & 0xff;
+            *(data++) = cp & 0xff;
+        } else {
+            *(data++) = cp & 0xff;
+            *(data++) = (cp >> 8) & 0xff;
+            *(data++) = (cp >> 16) & 0xff;
+            *(data++) = cp >> 24;
+        }
+    }
+
+    // Every surrogate pair leaves four bytes of the worst-case allocation
+    // unused; cut them off so no uninitialized bytes are returned.
+    d.resize(int(data - d.constData()));
+
+    if (state) {
+        state->remainingChars = 0;
+        state->flags |= QTextCodec::IgnoreHeader;
+    }
+    return d;
+}
+
+/*
+    Decodes len bytes of UTF-32 starting at chars into a QString. Code points
+    from 0x10000 upwards are written as a surrogate pair.
+
+    e selects the byte order. With DetectEndianness the order is taken from
+    state_data[Endian] of a given state, otherwise from the first complete
+    four-byte tuple: a byte order mark fixes the order and is dropped, anything
+    else falls back to the host's byte order. With an explicit byte order a
+    leading byte order mark is dropped as well. Only the first tuple of a
+    stream is treated as a possible byte order mark.
+
+    If state is given, the bytes of an incomplete trailing tuple are kept in
+    state_data[Data] (their count in remainingChars) and completed by the next
+    call, the byte order is stored in state_data[Endian], and
+    QTextCodec::IgnoreHeader is set once the first tuple has been processed.
+*/
+QString QUtf32::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state, DataEndianness e)
+{
+    DataEndianness endian = e;
+    uchar tuple[4];
+    int num = 0;            // bytes of the current tuple collected so far
+    bool headerdone = false;
+    if (state) {
+        headerdone = state->flags & QTextCodec::IgnoreHeader;
+        if (endian == DetectEndianness) {
+            endian = (DataEndianness)state->state_data[Endian];
+        }
+        num = state->remainingChars;
+        memcpy(tuple, &state->state_data[Data], 4);
+    }
+    // header already consumed but still no byte order known: use the host's
+    if (headerdone && endian == DetectEndianness)
+        endian = (QSysInfo::ByteOrder == QSysInfo::BigEndian) ? BigEndianness : LittleEndianness;
+
+    QString result;
+    result.resize((num + len) >> 2 << 1); // worst case
+    QChar *qch = (QChar *)result.unicode();
+
+    const char *end = chars + len;
+    while (chars < end) {
+        tuple[num++] = *chars++;
+        if (num == 4) {
+            if (!headerdone) {
+                // Only the very first tuple can be a byte order mark. Mark the
+                // header as handled before any 'continue' below so that a
+                // later U+FEFF is decoded as ordinary data.
+                headerdone = true;
+                if (endian == DetectEndianness) {
+                    if (tuple[0] == 0xff && tuple[1] == 0xfe && tuple[2] == 0 && tuple[3] == 0) {
+                        endian = LittleEndianness;
+                        num = 0;
+                        continue;
+                    } else if (tuple[0] == 0 && tuple[1] == 0 && tuple[2] == 0xfe && tuple[3] == 0xff) {
+                        endian = BigEndianness;
+                        num = 0;
+                        continue;
+                    } else if (QSysInfo::ByteOrder == QSysInfo::BigEndian) {
+                        // no BOM: assume host byte order and decode the tuple as data
+                        endian = BigEndianness;
+                    } else {
+                        endian = LittleEndianness;
+                    }
+                } else if (((endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple)) == QChar::ByteOrderMark) {
+                    // explicit byte order: still swallow a leading BOM
+                    num = 0;
+                    continue;
+                }
+            }
+            uint code = (endian == BigEndianness) ? qFromBigEndian<quint32>(tuple) : qFromLittleEndian<quint32>(tuple);
+            if (code >= 0x10000) {
+                *qch++ = QChar::highSurrogate(code);
+                *qch++ = QChar::lowSurrogate(code);
+            } else {
+                *qch++ = code;
+            }
+            num = 0;
+        }
+    }
+    // shrink the worst-case allocation to the units actually written
+    result.truncate(qch - result.unicode());
+
+    if (state) {
+        if (headerdone)
+            state->flags |= QTextCodec::IgnoreHeader;
+        state->state_data[Endian] = endian;
+        state->remainingChars = num;
+        memcpy(&state->state_data[Data], tuple, 4);
+    }
+    return result;
+}
+
+
+#ifndef QT_NO_TEXTCODEC
+
+// Nothing to clean up.
+QUtf8Codec::~QUtf8Codec()
+{
+}
+
+// Encodes via the shared QUtf8 helper.
+QByteArray QUtf8Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+    QByteArray encoded = QUtf8::convertFromUnicode(uc, len, state);
+    return encoded;
+}
+
+// Decodes via the shared QUtf8 helper and appends the text to *target.
+void QUtf8Codec::convertToUnicode(QString *target, const char *chars, int len, ConverterState *state) const
+{
+    target->append(QUtf8::convertToUnicode(chars, len, state));
+}
+
+// Decodes via the shared QUtf8 helper and returns the text.
+QString QUtf8Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+    QString decoded = QUtf8::convertToUnicode(chars, len, state);
+    return decoded;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf8Codec::name() const
+{
+    return QByteArray("UTF-8");
+}
+
+// MIB enum number reported for this codec.
+int QUtf8Codec::mibEnum() const
+{
+    const int mib = 106;
+    return mib;
+}
+
+// Nothing to clean up.
+QUtf16Codec::~QUtf16Codec()
+{
+}
+
+// Encodes via the shared QUtf16 helper, using this codec's byte order e.
+QByteArray QUtf16Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+    QByteArray encoded = QUtf16::convertFromUnicode(uc, len, state, e);
+    return encoded;
+}
+
+// Decodes via the shared QUtf16 helper, using this codec's byte order e.
+QString QUtf16Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+    QString decoded = QUtf16::convertToUnicode(chars, len, state, e);
+    return decoded;
+}
+
+// MIB enum number reported for this codec.
+int QUtf16Codec::mibEnum() const
+{
+    const int mib = 1015;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf16Codec::name() const
+{
+    return QByteArray("UTF-16");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf16Codec::aliases() const
+{
+    QList<QByteArray> none;
+    return none;
+}
+
+// MIB enum number reported for this codec.
+int QUtf16BECodec::mibEnum() const
+{
+    const int mib = 1013;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf16BECodec::name() const
+{
+    return QByteArray("UTF-16BE");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf16BECodec::aliases() const
+{
+    return QList<QByteArray>();
+}
+
+// MIB enum number reported for this codec.
+int QUtf16LECodec::mibEnum() const
+{
+    const int mib = 1014;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf16LECodec::name() const
+{
+    return QByteArray("UTF-16LE");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf16LECodec::aliases() const
+{
+    return QList<QByteArray>();
+}
+
+// Nothing to clean up.
+QUtf32Codec::~QUtf32Codec()
+{
+}
+
+// Encodes via the shared QUtf32 helper, using this codec's byte order e.
+QByteArray QUtf32Codec::convertFromUnicode(const QChar *uc, int len, ConverterState *state) const
+{
+    QByteArray encoded = QUtf32::convertFromUnicode(uc, len, state, e);
+    return encoded;
+}
+
+// Decodes via the shared QUtf32 helper, using this codec's byte order e.
+QString QUtf32Codec::convertToUnicode(const char *chars, int len, ConverterState *state) const
+{
+    QString decoded = QUtf32::convertToUnicode(chars, len, state, e);
+    return decoded;
+}
+
+// MIB enum number reported for this codec.
+int QUtf32Codec::mibEnum() const
+{
+    const int mib = 1017;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf32Codec::name() const
+{
+    return QByteArray("UTF-32");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf32Codec::aliases() const
+{
+    return QList<QByteArray>();
+}
+
+// MIB enum number reported for this codec.
+int QUtf32BECodec::mibEnum() const
+{
+    const int mib = 1018;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf32BECodec::name() const
+{
+    return QByteArray("UTF-32BE");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf32BECodec::aliases() const
+{
+    return QList<QByteArray>();
+}
+
+// MIB enum number reported for this codec.
+int QUtf32LECodec::mibEnum() const
+{
+    const int mib = 1019;
+    return mib;
+}
+
+// Name under which this codec is known.
+QByteArray QUtf32LECodec::name() const
+{
+    return QByteArray("UTF-32LE");
+}
+
+// This codec has no alternative names.
+QList<QByteArray> QUtf32LECodec::aliases() const
+{
+    return QList<QByteArray>();
+}
+
+#endif //QT_NO_TEXTCODEC
+
+QT_END_NAMESPACE