webengine/osswebengine/WebCore/platform/TextEncoding.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Thu, 24 Sep 2009 12:53:48 +0300
changeset 11 c8a366e56285
parent 10 a359256acfc6
permissions -rw-r--r--
Revision: 200937 Kit: 200939

/*
 * Copyright (C) 2004, 2006, 2007 Apple Inc. All rights reserved.
 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "TextEncoding.h"

#include "CString.h"
#include "PlatformString.h"
#include "TextCodec.h"
#include "TextDecoder.h"
#include "TextEncodingRegistry.h"
#if USE(ICU_UNICODE)
#include <unicode/unorm.h>
#elif USE(QT4_UNICODE)
#include <QString>
#endif
#include <wtf/HashSet.h>
#include <wtf/OwnPtr.h>

namespace WebCore {

// Looks up the atomic canonical form of |name| and, when the lookup yields a
// non-null pointer, inserts that pointer into |set|. Names for which the
// lookup returns null are silently skipped.
static void addEncodingName(HashSet<const char*>& set, const char* name)
{
    if (const char* canonicalName = atomicCanonicalTextEncodingName(name))
        set.add(canonicalName);
}

// Constructs an encoding from a C-string name. m_name stores whatever
// atomicCanonicalTextEncodingName() returns for |name|; other code in this file
// checks m_name for null, so the lookup can evidently yield a null pointer
// (presumably for names the registry does not know -- confirm in TextEncodingRegistry).
TextEncoding::TextEncoding(const char* name)
    : m_name(atomicCanonicalTextEncodingName(name))
{
}

// Constructs an encoding from a String name. The string's character buffer and
// length are handed to the (characters, length) overload of
// atomicCanonicalTextEncodingName(), and the result is stored in m_name.
// As with the const char* constructor, m_name may end up null.
TextEncoding::TextEncoding(const String& name)
    : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
{
}

// Decodes |length| bytes starting at |data| using this encoding.
// Returns a default-constructed String when this encoding has no name;
// otherwise a TextDecoder built for this encoding does the work in a single
// call (the final argument passed to the decoder is always true).
String TextEncoding::decode(const char* data, size_t length) const
{
    if (m_name) {
        TextDecoder decoder(*this);
        return decoder.decode(data, length, true);
    }

    return String();
}

// Encodes |length| UTF-16 code units starting at |characters| into this encoding.
//
// Returns a default-constructed CString when this encoding has no name, and an
// empty string when |length| is zero. Otherwise the text is normalized to NFC
// (via ICU or Qt, depending on the build configuration) and handed to a codec
// created for this encoding. |allowEntities| is forwarded to the codec unchanged
// (presumably it controls how unencodable characters are emitted -- see TextCodec).
//
// If ICU normalization fails, or if no normalization backend is configured, the
// characters are encoded as given rather than producing truncated output or
// falling off the end of the function.
CString TextEncoding::encode(const UChar* characters, size_t length, bool allowEntities) const
{
    if (!m_name)
        return CString();

    if (!length)
        return "";

#if USE(ICU_UNICODE)
    // FIXME: What's the right place to do normalization?
    // It's a little strange to do it inside the encode function.
    // Perhaps normalization should be an explicit step done before calling encode.

    const UChar* source = characters;
    size_t sourceLength = length;

    Vector<UChar> normalizedCharacters;

    UErrorCode err = U_ZERO_ERROR;
    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
        // First try using the length of the original string, since normalization to NFC rarely increases length.
        normalizedCharacters.resize(sourceLength);
        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            // ICU reported the required size; grow the buffer and normalize again.
            err = U_ZERO_ERROR;
            normalizedCharacters.resize(normalizedLength);
            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
        }
        ASSERT(U_SUCCESS(err));

        // Only switch to the normalized buffer when ICU actually filled it in.
        // In builds where ASSERT compiles away, a failure would otherwise make us
        // encode the scratch buffer with whatever length ICU returned.
        if (U_SUCCESS(err)) {
            source = normalizedCharacters.data();
            sourceLength = normalizedLength;
        }
    }
    return newTextCodec(*this)->encode(source, sourceLength, allowEntities);
#elif USE(QT4_UNICODE)
    QString str(reinterpret_cast<const QChar*>(characters), length);
    str = str.normalized(QString::NormalizationForm_C);
    return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), allowEntities);
#else
    // No normalization backend is configured: encode the input unchanged so that
    // every build configuration returns a value from this function.
    return newTextCodec(*this)->encode(characters, length, allowEntities);
#endif
}

// Reports whether this encoding is the one registered under "ISO-8859-8".
// The comparison is between name pointers, and the canonical pointer for
// "ISO-8859-8" is looked up only once.
bool TextEncoding::usesVisualOrdering() const
{
    // Skip the lookup entirely while no extended encoding name has been used.
    if (!noExtendedTextEncodingNameUsed()) {
        static const char* const iso88598Name = atomicCanonicalTextEncodingName("ISO-8859-8");
        return m_name == iso88598Name;
    }

    return false;
}

bool TextEncoding::isJapanese() const
{
    if (noExtendedTextEncodingNameUsed())
        return false;

    static HashSet<const char*> set;
    if (set.isEmpty()) {
        addEncodingName(set, "x-mac-japanese");
        addEncodingName(set, "cp932");
        addEncodingName(set, "JIS_X0201");
        addEncodingName(set, "JIS_X0208-1983");
        addEncodingName(set, "JIS_X0208-1990");
        addEncodingName(set, "JIS_X0212-1990");
        addEncodingName(set, "JIS_C6226-1978");
        addEncodingName(set, "Shift_JIS_X0213-2000");
        addEncodingName(set, "ISO-2022-JP");
        addEncodingName(set, "ISO-2022-JP-2");
        addEncodingName(set, "ISO-2022-JP-1");
        addEncodingName(set, "ISO-2022-JP-3");
        addEncodingName(set, "EUC-JP");
        addEncodingName(set, "Shift_JIS");
    }
    return m_name && set.contains(m_name);
}

// Returns the character that a backslash should be shown as for this encoding:
// U+00A5 for the two encodings listed below, and a plain backslash otherwise.
UChar TextEncoding::backslashAsCurrencySymbol() const
{
    if (noExtendedTextEncodingNameUsed())
        return '\\';

    // The text encodings below treat backslash as a currency symbol.
    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    static const char* const shiftJISX0213Name = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
    static const char* const eucJPName = atomicCanonicalTextEncodingName("EUC-JP");

    // Name pointers are compared directly against the canonical pointers above.
    if (m_name == shiftJISX0213Name || m_name == eucJPName)
        return 0x00A5;

    return '\\';
}

// Returns UTF-8 when this encoding is UTF-16 (big- or little-endian); every
// other encoding, including the UTF-32 variants, is returned unchanged.
const TextEncoding& TextEncoding::closest8BitEquivalent() const
{
    const bool isUTF16 = *this == UTF16BigEndianEncoding() || *this == UTF16LittleEndianEncoding();
    return isUTF16 ? UTF8Encoding() : *this;
}

// Shared "ASCII" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalASCIIEncoding;
const TextEncoding& ASCIIEncoding()
{
    if (globalASCIIEncoding)
        return *globalASCIIEncoding;

    globalASCIIEncoding = new TextEncoding("ASCII");
    return *globalASCIIEncoding;
}

// Shared "Latin-1" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalLatin1Encoding;
const TextEncoding& Latin1Encoding()
{
    if (globalLatin1Encoding)
        return *globalLatin1Encoding;

    globalLatin1Encoding = new TextEncoding("Latin-1");
    return *globalLatin1Encoding;
}

// Shared "UTF-16BE" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalUTF16BigEndianEncoding;
const TextEncoding& UTF16BigEndianEncoding()
{
    if (globalUTF16BigEndianEncoding)
        return *globalUTF16BigEndianEncoding;

    globalUTF16BigEndianEncoding = new TextEncoding("UTF-16BE");
    return *globalUTF16BigEndianEncoding;
}

// Shared "UTF-16LE" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalUTF16LittleEndianEncoding;
const TextEncoding& UTF16LittleEndianEncoding()
{
    if (globalUTF16LittleEndianEncoding)
        return *globalUTF16LittleEndianEncoding;

    globalUTF16LittleEndianEncoding = new TextEncoding("UTF-16LE");
    return *globalUTF16LittleEndianEncoding;
}

// Shared "UTF-32BE" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalUTF32BigEndianEncoding;
const TextEncoding& UTF32BigEndianEncoding()
{
    if (globalUTF32BigEndianEncoding)
        return *globalUTF32BigEndianEncoding;

    globalUTF32BigEndianEncoding = new TextEncoding("UTF-32BE");
    return *globalUTF32BigEndianEncoding;
}

// Shared "UTF-32LE" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalUTF32LittleEndianEncoding;
const TextEncoding& UTF32LittleEndianEncoding()
{
    if (globalUTF32LittleEndianEncoding)
        return *globalUTF32LittleEndianEncoding;

    globalUTF32LittleEndianEncoding = new TextEncoding("UTF-32LE");
    return *globalUTF32LittleEndianEncoding;
}

// Shared "UTF-8" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalUTF8Encoding;
const TextEncoding& UTF8Encoding()
{
    if (globalUTF8Encoding)
        return *globalUTF8Encoding;

    globalUTF8Encoding = new TextEncoding("UTF-8");
    return *globalUTF8Encoding;
}

// Shared "WinLatin-1" encoding instance. It is created on the first call that finds
// the pointer null, and deleted by cleanupTextEncodings' destructor.
static TextEncoding* globalWindowsLatin1Encoding;
const TextEncoding& WindowsLatin1Encoding()
{
    if (globalWindowsLatin1Encoding)
        return *globalWindowsLatin1Encoding;

    globalWindowsLatin1Encoding = new TextEncoding("WinLatin-1");
    return *globalWindowsLatin1Encoding;
}

struct cleanupTextEncodings  {
    ~cleanupTextEncodings() {
    // Delete all encodings and set to NULL
    delete globalASCIIEncoding;
    globalASCIIEncoding = NULL;
    delete globalLatin1Encoding;
    globalLatin1Encoding = NULL;
    delete globalUTF16BigEndianEncoding;
    globalUTF16BigEndianEncoding = NULL;
    delete globalUTF16LittleEndianEncoding;
    globalUTF16LittleEndianEncoding = NULL;
    delete globalUTF32BigEndianEncoding;
    globalUTF32BigEndianEncoding = NULL;
    delete globalUTF32LittleEndianEncoding;
    globalUTF32LittleEndianEncoding = NULL;
    delete globalUTF8Encoding;
    globalUTF8Encoding = NULL;
    delete globalWindowsLatin1Encoding;
    globalWindowsLatin1Encoding = NULL;

    }
};
struct cleanupTextEncodings textEncodings;
} // namespace WebCore