src/corelib/codecs/qtextcodec.cpp
changeset 0 1918ee327afb
child 3 41300fa6a67c
equal deleted inserted replaced
-1:000000000000 0:1918ee327afb
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtCore module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include "qplatformdefs.h"
       
    43 #include "qtextcodec.h"
       
    44 #include "qtextcodec_p.h"
       
    45 
       
    46 #ifndef QT_NO_TEXTCODEC
       
    47 
       
    48 #include "qlist.h"
       
    49 #include "qfile.h"
       
    50 #ifndef QT_NO_LIBRARY
       
    51 # include "qcoreapplication.h"
       
    52 # include "qtextcodecplugin.h"
       
    53 # include "private/qfactoryloader_p.h"
       
    54 #endif
       
    55 #include "qstringlist.h"
       
    56 
       
    57 #ifdef Q_OS_UNIX
       
    58 #  include "qiconvcodec_p.h"
       
    59 #endif
       
    60 
       
    61 #include "qutfcodec_p.h"
       
    62 #include "qsimplecodec_p.h"
       
    63 #include "qlatincodec_p.h"
       
    64 #ifndef QT_NO_CODECS
       
    65 #  include "qtsciicodec_p.h"
       
    66 #  include "qisciicodec_p.h"
       
    67 #  if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
       
    68 // no iconv(3) support, must build all codecs into the library
       
    69 #    include "../../plugins/codecs/cn/qgb18030codec.h"
       
    70 #    include "../../plugins/codecs/jp/qeucjpcodec.h"
       
    71 #    include "../../plugins/codecs/jp/qjiscodec.h"
       
    72 #    include "../../plugins/codecs/jp/qsjiscodec.h"
       
    73 #    include "../../plugins/codecs/kr/qeuckrcodec.h"
       
    74 #    include "../../plugins/codecs/tw/qbig5codec.h"
       
    75 #  endif // QT_NO_ICONV
       
    76 #  if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
       
    77 #    include "qfontlaocodec_p.h"
       
    78 #    include "../../plugins/codecs/jp/qfontjpcodec.h"
       
    79 #  endif
       
    80 #endif // QT_NO_CODECS
       
    81 #include "qlocale.h"
       
    82 #include "private/qmutexpool_p.h"
       
    83 
       
    84 #include <stdlib.h>
       
    85 #include <ctype.h>
       
    86 #include <locale.h>
       
    87 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
       
    88 #include <langinfo.h>
       
    89 #endif
       
    90 
       
    91 #if defined(Q_OS_WINCE)
       
    92 #  define QT_NO_SETLOCALE
       
    93 #endif
       
    94 
       
    95 // enabling this is not exception safe!
       
    96 // #define Q_DEBUG_TEXTCODEC
       
    97 
       
    98 QT_BEGIN_NAMESPACE
       
    99 
       
#ifndef QT_NO_TEXTCODECPLUGIN
// Defines the global accessor loader(), a lazily created QFactoryLoader
// constructed with the text codec factory interface id and the "/codecs"
// path. createForName() and createForMib() use it to reach codec plugins.
Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader,
    (QTextCodecFactoryInterface_iid, QLatin1String("/codecs")))
#endif
       
   104 
       
   105 static char qtolower(register char c)
       
   106 { if (c >= 'A' && c <= 'Z') return c + 0x20; return c; }
       
   107 static bool qisalnum(register char c)
       
   108 { return (c >= '0' && c <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'z'); }
       
   109 
       
   110 static bool nameMatch(const QByteArray &name, const QByteArray &test)
       
   111 {
       
   112     // if they're the same, return a perfect score
       
   113     if (qstricmp(name, test) == 0)
       
   114         return true;
       
   115 
       
   116     const char *n = name.constData();
       
   117     const char *h = test.constData();
       
   118 
       
   119     // if the letters and numbers are the same, we have a match
       
   120     while (*n != '\0') {
       
   121         if (qisalnum(*n)) {
       
   122             for (;;) {
       
   123                 if (*h == '\0')
       
   124                     return false;
       
   125                 if (qisalnum(*h))
       
   126                     break;
       
   127                 ++h;
       
   128             }
       
   129             if (qtolower(*n) != qtolower(*h))
       
   130                 return false;
       
   131             ++h;
       
   132         }
       
   133         ++n;
       
   134     }
       
   135     while (*h && !qisalnum(*h))
       
   136            ++h;
       
   137     return (*h == '\0');
       
   138 }
       
   139 
       
   140 
       
   141 static QTextCodec *createForName(const QByteArray &name)
       
   142 {
       
   143 #ifndef QT_NO_TEXTCODECPLUGIN
       
   144     QFactoryLoader *l = loader();
       
   145     QStringList keys = l->keys();
       
   146     for (int i = 0; i < keys.size(); ++i) {
       
   147         if (nameMatch(name, keys.at(i).toLatin1())) {
       
   148             QString realName = keys.at(i);
       
   149             if (QTextCodecFactoryInterface *factory
       
   150                 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) {
       
   151                 return factory->create(realName);
       
   152             }
       
   153         }
       
   154     }
       
   155 #else
       
   156     Q_UNUSED(name);
       
   157 #endif
       
   158     return 0;
       
   159 }
       
   160 
       
   161 static QTextCodec *createForMib(int mib)
       
   162 {
       
   163 #ifndef QT_NO_TEXTCODECPLUGIN
       
   164     QString name = QLatin1String("MIB: ") + QString::number(mib);
       
   165     if (QTextCodecFactoryInterface *factory
       
   166         = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name)))
       
   167         return factory->create(name);
       
   168 #else
       
   169     Q_UNUSED(mib);
       
   170 #endif
       
   171     return 0;
       
   172 }
       
   173 
       
// List of codecs. Allocated by setup() and destroyed, together with
// every codec it holds, by QTextCodecCleanup's destructor.
static QList<QTextCodec*> *all = 0;
#ifdef Q_DEBUG_TEXTCODEC
// Debug aid: true only while QTextCodecCleanup's destructor is running.
static bool destroying_is_ok = false;
#endif

// Codec for the current locale; filled in by setupLocaleMapper() and reset
// to 0 when the codecs are cleaned up.
static QTextCodec *localeMapper = 0;
// NOTE(review): presumably the codec used by tr(); its users are not
// visible in this part of the file.
QTextCodec *QTextCodec::cftr = 0;
       
   181 
       
   182 
       
// Helper whose only job is its destructor: when the single global instance
// (see createQTextCodecCleanup) is destroyed, all codecs are deleted.
class QTextCodecCleanup
{
public:
    ~QTextCodecCleanup();
};
       
   188 
       
   189 /*
       
   190     Deletes all the created codecs. This destructor is called just
       
   191     before exiting to delete any QTextCodec objects that may be lying
       
   192     around.
       
   193 */
       
   194 QTextCodecCleanup::~QTextCodecCleanup()
       
   195 {
       
   196     if (!all)
       
   197         return;
       
   198 
       
   199 #ifdef Q_DEBUG_TEXTCODEC
       
   200     destroying_is_ok = true;
       
   201 #endif
       
   202 
       
   203     for (QList<QTextCodec *>::const_iterator it = all->constBegin()
       
   204             ; it != all->constEnd(); ++it) {
       
   205         delete *it;
       
   206     }
       
   207     delete all;
       
   208     all = 0;
       
   209     localeMapper = 0;
       
   210 
       
   211 #ifdef Q_DEBUG_TEXTCODEC
       
   212     destroying_is_ok = false;
       
   213 #endif
       
   214 }
       
   215 
       
// Defines createQTextCodecCleanup(), the accessor for the global
// QTextCodecCleanup instance. setup() calls it once so that the cleanup
// object exists and its destructor can delete the codecs.
Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup)
       
   217 
       
   218 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
       
// Codec named "System" for Windows builds. It converts to Unicode with
// MultiByteToWideChar() on the ANSI code page (CP_ACP) and from Unicode
// with qt_winQString2MB().
class QWindowsLocalCodec: public QTextCodec
{
public:
    QWindowsLocalCodec();
    ~QWindowsLocalCodec();

    // Bulk conversion; keeps a trailing incomplete byte in the ConverterState.
    QString convertToUnicode(const char *, int, ConverterState *) const;
    QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const;
    // Fallback used by convertToUnicode() when the bulk conversion hits an
    // invalid sequence: converts one multibyte character at a time.
    QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const;

    QByteArray name() const;
    int mibEnum() const;

};
       
   233 
       
// Nothing to initialise: the codec keeps no state of its own.
QWindowsLocalCodec::QWindowsLocalCodec()
{
}
       
   237 
       
// Nothing to release.
QWindowsLocalCodec::~QWindowsLocalCodec()
{
}
       
   241 
       
   242 QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const
       
   243 {
       
   244     const char *mb = chars;
       
   245     int mblen = length;
       
   246 
       
   247     if (!mb || !mblen)
       
   248         return QString();
       
   249 
       
   250     const int wclen_auto = 4096;
       
   251     wchar_t wc_auto[wclen_auto];
       
   252     int wclen = wclen_auto;
       
   253     wchar_t *wc = wc_auto;
       
   254     int len;
       
   255     QString sp;
       
   256     bool prepend = false;
       
   257     char state_data = 0;
       
   258     int remainingChars = 0;
       
   259 
       
   260     //save the current state information
       
   261     if (state) {
       
   262         state_data = (char)state->state_data[0];
       
   263         remainingChars = state->remainingChars;
       
   264     }
       
   265 
       
   266     //convert the pending charcter (if available)
       
   267     if (state && remainingChars) {
       
   268         char prev[3] = {0};
       
   269         prev[0] = state_data;
       
   270         prev[1] = mb[0];
       
   271         remainingChars = 0;
       
   272         len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
       
   273                                     prev, 2, wc, wclen);
       
   274         if (len) {
       
   275             prepend = true;
       
   276             sp.append(QChar(wc[0]));
       
   277             mb++;
       
   278             mblen--;
       
   279             wc[0] = 0;
       
   280         }
       
   281     }
       
   282 
       
   283     while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS,
       
   284                 mb, mblen, wc, wclen))) {
       
   285         int r = GetLastError();
       
   286         if (r == ERROR_INSUFFICIENT_BUFFER) {
       
   287             if (wc != wc_auto) {
       
   288                 qWarning("MultiByteToWideChar: Size changed");
       
   289                 break;
       
   290             } else {
       
   291                 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED,
       
   292                                     mb, mblen, 0, 0);
       
   293                 wc = new wchar_t[wclen];
       
   294                 // and try again...
       
   295             }
       
   296         } else if (r == ERROR_NO_UNICODE_TRANSLATION) {
       
   297             //find the last non NULL character
       
   298             while (mblen > 1  && !(mb[mblen-1]))
       
   299                 mblen--;
       
   300             //check whether,  we hit an invalid character in the middle
       
   301             if ((mblen <= 1) || (remainingChars && state_data))
       
   302                 return convertToUnicodeCharByChar(chars, length, state);
       
   303             //Remove the last character and try again...
       
   304             state_data = mb[mblen-1];
       
   305             remainingChars = 1;
       
   306             mblen--;
       
   307         } else {
       
   308             // Fail.
       
   309             qWarning("MultiByteToWideChar: Cannot convert multibyte text");
       
   310             break;
       
   311         }
       
   312     }
       
   313     if (len <= 0)
       
   314         return QString();
       
   315     if (wc[len-1] == 0) // len - 1: we don't want terminator
       
   316         --len;
       
   317 
       
   318     //save the new state information
       
   319     if (state) {
       
   320         state->state_data[0] = (char)state_data;
       
   321         state->remainingChars = remainingChars;
       
   322     }
       
   323     QString s((QChar*)wc, len);
       
   324     if (wc != wc_auto)
       
   325         delete [] wc;
       
   326     if (prepend) {
       
   327         return sp+s;
       
   328     }
       
   329     return s;
       
   330 }
       
   331 
       
   332 QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const
       
   333 {
       
   334     if (!chars || !length)
       
   335         return QString();
       
   336 
       
   337     int copyLocation = 0;
       
   338     int extra = 2;
       
   339     if (state && state->remainingChars) {
       
   340         copyLocation = state->remainingChars;
       
   341         extra += copyLocation;
       
   342     }
       
   343     int newLength = length + extra;
       
   344     char *mbcs = new char[newLength];
       
   345     //ensure that we have a NULL terminated string
       
   346     mbcs[newLength-1] = 0;
       
   347     mbcs[newLength-2] = 0;
       
   348     memcpy(&(mbcs[copyLocation]), chars, length);
       
   349     if (copyLocation) {
       
   350         //copy the last character from the state
       
   351         mbcs[0] = (char)state->state_data[0];
       
   352         state->remainingChars = 0;
       
   353     }
       
   354     const char *mb = mbcs;
       
   355 #ifndef Q_OS_WINCE
       
   356     const char *next = 0;
       
   357     QString s;
       
   358     while((next = CharNextExA(CP_ACP, mb, 0)) != mb) {
       
   359         wchar_t wc[2] ={0};
       
   360         int charlength = next - mb;
       
   361         int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2);
       
   362         if (len>0) {
       
   363             s.append(QChar(wc[0]));
       
   364         } else {
       
   365             int r = GetLastError();
       
   366             //check if the character being dropped is the last character
       
   367             if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) {
       
   368                 state->remainingChars = 1;
       
   369                 state->state_data[0] = (char)*mb;
       
   370             }
       
   371         }
       
   372         mb = next;
       
   373     }
       
   374 #else
       
   375     QString s;
       
   376     int size = mbstowcs(NULL, mb, length);
       
   377     if (size < 0) {
       
   378         Q_ASSERT("Error in CE TextCodec");
       
   379         return QString();
       
   380     }
       
   381     wchar_t* ws = new wchar_t[size + 2];
       
   382     ws[size +1] = 0;
       
   383     ws[size] = 0;
       
   384     size = mbstowcs(ws, mb, length);
       
   385     for (int i=0; i< size; i++)
       
   386         s.append(QChar(ws[i]));
       
   387     delete [] ws;
       
   388 #endif
       
   389     delete mbcs;
       
   390     return s;
       
   391 }
       
   392 
       
// Converts \a len QChars at \a uc to multibyte text by delegating to
// qt_winQString2MB(). The converter state is not used.
QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *) const
{
    return qt_winQString2MB(uc, len);
}
       
   397 
       
   398 
       
// The codec is registered under the name "System"; setupLocaleMapper()
// looks it up by that name.
QByteArray QWindowsLocalCodec::name() const
{
    return "System";
}
       
   403 
       
// Always reports MIB enum 0.
int QWindowsLocalCodec::mibEnum() const
{
    return 0;
}
       
   408 
       
   409 #else
       
   410 
       
/* locale names mostly copied from XFree86 */
// Each table below is a 0-terminated list of locale names.
// setupLocaleMapper() searches them with try_locale_list() to guess a
// codec when the locale name itself does not carry a codeset.

// Locales mapped to "ISO 8859-2".
static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ","cz", "cz_CZ", "czech", "hr",
    "hr_HR", "hu", "hu_HU", "hungarian", "pl", "pl_PL", "polish", "ro",
    "ro_RO", "rumanian", "serbocroatian", "sh", "sh_SP", "sh_YU", "sk",
    "sk_SK", "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP", 0 };

// Locales mapped to "ISO 8859-3".
static const char * const iso8859_3locales[] = {
    "eo", 0 };

// Locales mapped to "ISO 8859-4".
static const char * const iso8859_4locales[] = {
    "ee", "ee_EE", 0 };

// Locales mapped to "ISO 8859-5".
static const char * const iso8859_5locales[] = {
    "mk", "mk_MK", "sp", "sp_YU", 0 };

// Locales mapped to "CP 1251".
static const char * const cp_1251locales[] = {
    "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };

// Locales mapped to "PT 154".
static const char * const pt_154locales[] = {
    "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };

// Locales mapped to "ISO 8859-6".
static const char * const iso8859_6locales[] = {
    "ar_AA", "ar_SA", "arabic", 0 };

// Locales mapped to "ISO 8859-7".
static const char * const iso8859_7locales[] = {
    "el", "el_GR", "greek", 0 };

// Locales mapped to "ISO 8859-8-I".
static const char * const iso8859_8locales[] = {
    "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };

// Locales mapped to "ISO 8859-9".
static const char * const iso8859_9locales[] = {
    "tr", "tr_TR", "turkish", 0 };

// Locales mapped to "ISO 8859-13".
static const char * const iso8859_13locales[] = {
    "lt", "lt_LT", "lv", "lv_LV", 0 };

// Locales mapped to "ISO 8859-15". This table is tried first.
static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE", "de_LU", "en_IE",
    "es", "es_ES", "eu_ES", "fi", "fi_FI", "finnish", "fr", "fr_FR",
    "fr_BE", "fr_LU", "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI", "wa_BE",
    0 };

// Locales mapped to "KOI8-U".
static const char * const koi8_ulocales[] = {
    "uk", "uk_UA", "ru_UA", "ukrainian", 0 };

// Thai locales; setupLocaleMapper() looks these up as "ISO 8859-11".
static const char * const tis_620locales[] = {
    "th", "th_TH", "thai", 0 };

// Disabled table (Vietnamese locale names); not referenced anywhere in the
// visible code.
// static const char * const tcvnlocales[] = {
//     "vi", "vi_VN", 0 };
       
   465 
       
   466 static bool try_locale_list(const char * const locale[], const QByteArray &lang)
       
   467 {
       
   468     int i;
       
   469     for(i=0; locale[i] && lang != locale[i]; i++)
       
   470         ;
       
   471     return locale[i] != 0;
       
   472 }
       
   473 
       
   474 // For the probably_koi8_locales we have to look. the standard says
       
   475 // these are 8859-5, but almost all Russian users use KOI8-R and
       
   476 // incorrectly set $LANG to ru_RU. We'll check tolower() to see what
       
   477 // it thinks ru_RU means.
       
   478 
       
   479 // If you read the history, it seems that many Russians blame ISO and
       
   480 // Perestroika for the confusion.
       
   481 //
       
   482 // The real bug is that some programs break if the user specifies
       
   483 // ru_RU.KOI8-R.
       
   484 
       
// 0-terminated list of Russian locale names whose real encoding has to be
// probed at run time; setupLocaleMapper() passes matches to ru_RU_hack().
static const char * const probably_koi8_rlocales[] = {
    "ru", "ru_SU", "ru_RU", "russian", 0 };
       
   487 
       
   488 static QTextCodec * ru_RU_hack(const char * i) {
       
   489     QTextCodec * ru_RU_codec = 0;
       
   490 
       
   491 #if !defined(QT_NO_SETLOCALE)
       
   492     QByteArray origlocale(setlocale(LC_CTYPE, i));
       
   493 #else
       
   494     QByteArray origlocale(i);
       
   495 #endif
       
   496     // unicode   koi8r   latin5   name
       
   497     // 0x044E    0xC0    0xEE     CYRILLIC SMALL LETTER YU
       
   498     // 0x042E    0xE0    0xCE     CYRILLIC CAPITAL LETTER YU
       
   499     int latin5 = tolower(0xCE);
       
   500     int koi8r = tolower(0xE0);
       
   501     if (koi8r == 0xC0 && latin5 != 0xEE) {
       
   502         ru_RU_codec = QTextCodec::codecForName("KOI8-R");
       
   503     } else if (koi8r != 0xC0 && latin5 == 0xEE) {
       
   504         ru_RU_codec = QTextCodec::codecForName("ISO 8859-5");
       
   505     } else {
       
   506         // something else again... let's assume... *throws dice*
       
   507         ru_RU_codec = QTextCodec::codecForName("KOI8-R");
       
   508         qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)",
       
   509                   koi8r, latin5, i);
       
   510     }
       
   511 #if !defined(QT_NO_SETLOCALE)
       
   512     setlocale(LC_CTYPE, origlocale);
       
   513 #endif
       
   514 
       
   515     return ru_RU_codec;
       
   516 }
       
   517 
       
   518 #endif
       
   519 
       
   520 #if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE)
       
   521 static QTextCodec *checkForCodec(const QByteArray &name) {
       
   522     QTextCodec *c = QTextCodec::codecForName(name);
       
   523     if (!c) {
       
   524         const int index = name.indexOf('@');
       
   525         if (index != -1) {
       
   526             c = QTextCodec::codecForName(name.left(index));
       
   527         }
       
   528     }
       
   529     return c;
       
   530 }
       
   531 #endif
       
   532 
       
    533 /* the next two functions are implicitly thread safe,
       
   534    as they are only called by setup() which uses a mutex.
       
   535 */
       
   536 static void setupLocaleMapper()
       
   537 {
       
   538 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
       
   539     localeMapper = QTextCodec::codecForName("System");
       
   540 #else
       
   541 
       
   542 #ifndef QT_NO_ICONV
       
   543     localeMapper = QTextCodec::codecForName("System");
       
   544 #endif
       
   545 
       
   546 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF)
       
   547     if (!localeMapper) {
       
   548         char *charset = nl_langinfo (CODESET);
       
   549         if (charset)
       
   550             localeMapper = QTextCodec::codecForName(charset);
       
   551     }
       
   552 #endif
       
   553 
       
   554     if (!localeMapper) {
       
   555         // Very poorly defined and followed standards causes lots of
       
   556         // code to try to get all the cases... This logic is
       
   557         // duplicated in QIconvCodec, so if you change it here, change
       
   558         // it there too.
       
   559 
       
   560         // Try to determine locale codeset from locale name assigned to
       
   561         // LC_CTYPE category.
       
   562 
       
   563         // First part is getting that locale name.  First try setlocale() which
       
   564         // definitely knows it, but since we cannot fully trust it, get ready
       
   565         // to fall back to environment variables.
       
   566 #if !defined(QT_NO_SETLOCALE)
       
   567         const QByteArray ctype = setlocale(LC_CTYPE, 0);
       
   568 #else
       
   569         const QByteArray ctype;
       
   570 #endif
       
   571 
       
   572         // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG
       
   573         // environment variables.
       
   574         QByteArray lang = qgetenv("LC_ALL");
       
   575         if (lang.isEmpty() || lang == "C") {
       
   576             lang = qgetenv("LC_CTYPE");
       
   577         }
       
   578         if (lang.isEmpty() || lang == "C") {
       
   579             lang = qgetenv("LANG");
       
   580         }
       
   581 
       
   582         // Now try these in order:
       
   583         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
       
   584         // 2. CODESET from lang if it contains a .CODESET part
       
   585         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
       
   586         // 4. locale (ditto)
       
   587         // 5. check for "@euro"
       
   588         // 6. guess locale from ctype unless ctype is "C"
       
   589         // 7. guess locale from lang
       
   590 
       
   591         // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15)
       
   592         int indexOfDot = ctype.indexOf('.');
       
   593         if (indexOfDot != -1)
       
   594             localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) );
       
   595 
       
   596         // 2. CODESET from lang if it contains a .CODESET part
       
   597         if (!localeMapper) {
       
   598             indexOfDot = lang.indexOf('.');
       
   599             if (indexOfDot != -1)
       
   600                 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) );
       
   601         }
       
   602 
       
   603         // 3. ctype (maybe the locale is named "ISO-8859-1" or something)
       
   604         if (!localeMapper && !ctype.isEmpty() && ctype != "C")
       
   605             localeMapper = checkForCodec(ctype);
       
   606 
       
   607         // 4. locale (ditto)
       
   608         if (!localeMapper && !lang.isEmpty())
       
   609             localeMapper = checkForCodec(lang);
       
   610 
       
   611         // 5. "@euro"
       
   612         if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro"))
       
   613             localeMapper = checkForCodec("ISO 8859-15");
       
   614 
       
   615         // 6. guess locale from ctype unless ctype is "C"
       
   616         // 7. guess locale from lang
       
   617         const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype;
       
   618 
       
   619         // Now do the guessing.
       
   620         if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) {
       
   621             if (try_locale_list(iso8859_15locales, lang))
       
   622                 localeMapper = QTextCodec::codecForName("ISO 8859-15");
       
   623             else if (try_locale_list(iso8859_2locales, lang))
       
   624                 localeMapper = QTextCodec::codecForName("ISO 8859-2");
       
   625             else if (try_locale_list(iso8859_3locales, lang))
       
   626                 localeMapper = QTextCodec::codecForName("ISO 8859-3");
       
   627             else if (try_locale_list(iso8859_4locales, lang))
       
   628                 localeMapper = QTextCodec::codecForName("ISO 8859-4");
       
   629             else if (try_locale_list(iso8859_5locales, lang))
       
   630                 localeMapper = QTextCodec::codecForName("ISO 8859-5");
       
   631             else if (try_locale_list(iso8859_6locales, lang))
       
   632                 localeMapper = QTextCodec::codecForName("ISO 8859-6");
       
   633             else if (try_locale_list(iso8859_7locales, lang))
       
   634                 localeMapper = QTextCodec::codecForName("ISO 8859-7");
       
   635             else if (try_locale_list(iso8859_8locales, lang))
       
   636                 localeMapper = QTextCodec::codecForName("ISO 8859-8-I");
       
   637             else if (try_locale_list(iso8859_9locales, lang))
       
   638                 localeMapper = QTextCodec::codecForName("ISO 8859-9");
       
   639             else if (try_locale_list(iso8859_13locales, lang))
       
   640                 localeMapper = QTextCodec::codecForName("ISO 8859-13");
       
   641             else if (try_locale_list(tis_620locales, lang))
       
   642                 localeMapper = QTextCodec::codecForName("ISO 8859-11");
       
   643             else if (try_locale_list(koi8_ulocales, lang))
       
   644                 localeMapper = QTextCodec::codecForName("KOI8-U");
       
   645             else if (try_locale_list(cp_1251locales, lang))
       
   646                 localeMapper = QTextCodec::codecForName("CP 1251");
       
   647             else if (try_locale_list(pt_154locales, lang))
       
   648                 localeMapper = QTextCodec::codecForName("PT 154");
       
   649             else if (try_locale_list(probably_koi8_rlocales, lang))
       
   650                 localeMapper = ru_RU_hack(lang);
       
   651         }
       
   652 
       
   653     }
       
   654 
       
   655     // If everything failed, we default to 8859-1
       
   656     // We could perhaps default to 8859-15.
       
   657     if (!localeMapper)
       
   658         localeMapper = QTextCodec::codecForName("ISO 8859-1");
       
   659 #endif
       
   660 }
       
   661 
       
   662 
       
   663 static void setup()
       
   664 {
       
   665 #ifndef QT_NO_THREAD
       
   666     QMutexLocker locker(QMutexPool::globalInstanceGet(&all));
       
   667 #endif
       
   668 
       
   669     if (all)
       
   670         return;
       
   671 
       
   672 #ifdef Q_DEBUG_TEXTCODEC
       
   673     if (destroying_is_ok)
       
   674         qWarning("QTextCodec: Creating new codec during codec cleanup");
       
   675 #endif
       
   676     all = new QList<QTextCodec*>;
       
   677     // create the cleanup object to cleanup all codecs on exit
       
   678     (void) createQTextCodecCleanup();
       
   679 
       
   680 #ifndef QT_NO_CODECS
       
   681 #  if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED)
       
   682     // no font codecs when bootstrapping
       
   683     (void)new QFontLaoCodec;
       
   684 #    if defined(QT_NO_ICONV)
       
   685     // no iconv(3) support, must build all codecs into the library
       
   686     (void)new QFontGb2312Codec;
       
   687     (void)new QFontGbkCodec;
       
   688     (void)new QFontGb18030_0Codec;
       
   689     (void)new QFontJis0208Codec;
       
   690     (void)new QFontJis0201Codec;
       
   691     (void)new QFontKsc5601Codec;
       
   692     (void)new QFontBig5hkscsCodec;
       
   693     (void)new QFontBig5Codec;
       
   694 #    endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
       
   695 #  endif // Q_WS_X11
       
   696 
       
   697     (void)new QTsciiCodec;
       
   698 
       
   699     for (int i = 0; i < 9; ++i)
       
   700         (void)new QIsciiCodec(i);
       
   701 
       
   702 
       
   703 #  if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
       
   704     // no asian codecs when bootstrapping, sorry
       
   705     (void)new QGb18030Codec;
       
   706     (void)new QGbkCodec;
       
   707     (void)new QGb2312Codec;
       
   708     (void)new QEucJpCodec;
       
   709     (void)new QJisCodec;
       
   710     (void)new QSjisCodec;
       
   711     (void)new QEucKrCodec;
       
   712     (void)new QCP949Codec;
       
   713     (void)new QBig5Codec;
       
   714     (void)new QBig5hkscsCodec;
       
   715 #  endif // QT_NO_ICONV && !QT_BOOTSTRAPPED
       
   716 #endif // QT_NO_CODECS
       
   717 
       
   718 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE)
       
   719     (void) new QWindowsLocalCodec;
       
   720 #endif // Q_OS_WIN32
       
   721 
       
   722     (void)new QUtf16Codec;
       
   723     (void)new QUtf16BECodec;
       
   724     (void)new QUtf16LECodec;
       
   725     (void)new QUtf32Codec;
       
   726     (void)new QUtf32BECodec;
       
   727     (void)new QUtf32LECodec;
       
   728     (void)new QLatin15Codec;
       
   729     (void)new QLatin1Codec;
       
   730     (void)new QUtf8Codec;
       
   731 
       
   732     for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i)
       
   733         (void)new QSimpleTextCodec(i);
       
   734 
       
   735 #if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED)
       
   736     // QIconvCodec depends on the UTF-16 codec, so it needs to be created last
       
   737     (void) new QIconvCodec();
       
   738 #endif
       
   739 
       
   740     if (!localeMapper)
       
   741         setupLocaleMapper();
       
   742 }
       
   743 
       
   744 /*!
       
   745     \enum QTextCodec::ConversionFlag
       
   746 
       
   747     \value DefaultConversion  No flag is set.
       
   748     \value ConvertInvalidToNull  If this flag is set, each invalid input
       
   749                                  character is output as a null character.
       
   750     \value IgnoreHeader  Ignore any Unicode byte-order mark and don't generate any.
       
   751 
       
   752     \omitvalue FreeFunction
       
   753 */
       
   754 
       
   755 /*!
       
   756     \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags)
       
   757 
       
   758     Constructs a ConverterState object initialized with the given \a flags.
       
   759 */
       
   760 
       
   761 /*!
       
   762     Destroys the ConverterState object.
       
   763 */
       
   764 QTextCodec::ConverterState::~ConverterState()
       
   765 {
       
   766     if (flags & FreeFunction)
       
   767         (QTextCodecUnalignedPointer::decode(state_data))(this);
       
   768     else if (d)
       
   769         qFree(d);
       
   770 }
       
   771 
       
   772 /*!
       
   773     \class QTextCodec
       
   774     \brief The QTextCodec class provides conversions between text encodings.
       
   775     \reentrant
       
   776     \ingroup i18n
       
   777 
       
   778     Qt uses Unicode to store, draw and manipulate strings. In many
       
   779     situations you may wish to deal with data that uses a different
       
   780     encoding. For example, most Japanese documents are still stored
       
   781     in Shift-JIS or ISO 2022-JP, while Russian users often have their
       
   782     documents in KOI8-R or Windows-1251.
       
   783 
       
   784     Qt provides a set of QTextCodec classes to help with converting
       
   785     non-Unicode formats to and from Unicode. You can also create your
       
   786     own codec classes.
       
   787 
       
   788     The supported encodings are:
       
   789 
       
   790     \list
       
   791     \o Apple Roman
       
   792     \o \l{Big5 Text Codec}{Big5}
       
   793     \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS}
       
   794     \o CP949
       
   795     \o \l{EUC-JP Text Codec}{EUC-JP}
       
   796     \o \l{EUC-KR Text Codec}{EUC-KR}
       
   797     \o \l{GBK Text Codec}{GB18030-0}
       
   798     \o IBM 850
       
   799     \o IBM 866
       
   800     \o IBM 874
       
   801     \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP}
       
   802     \o ISO 8859-1 to 10
       
   803     \o ISO 8859-13 to 16
       
   804     \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml
       
   805     \o JIS X 0201
       
   806     \o JIS X 0208
       
   807     \o KOI8-R
       
   808     \o KOI8-U
       
   809     \o MuleLao-1
       
   810     \o ROMAN8
       
   811     \o \l{Shift-JIS Text Codec}{Shift-JIS}
       
   812     \o TIS-620
       
   813     \o \l{TSCII Text Codec}{TSCII}
       
   814     \o UTF-8
       
   815     \o UTF-16
       
   816     \o UTF-16BE
       
   817     \o UTF-16LE
       
   818     \o UTF-32
       
   819     \o UTF-32BE
       
   820     \o UTF-32LE
       
   821     \o Windows-1250 to 1258
       
   822     \o WINSAMI2
       
   823     \endlist
       
   824 
       
   825     QTextCodecs can be used as follows to convert some locally encoded
       
   826     string to Unicode. Suppose you have some string encoded in Russian
       
   827     KOI8-R encoding, and want to convert it to Unicode. The simple way
       
   828     to do it is like this:
       
   829 
       
   830     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0
       
   831 
       
   832     After this, \c string holds the text converted to Unicode.
       
   833     Converting a string from Unicode to the local encoding is just as
       
   834     easy:
       
   835 
       
   836     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1
       
   837 
       
   838     To read or write files in various encodings, use QTextStream and
       
   839     its \l{QTextStream::setCodec()}{setCodec()} function. See the
       
   840     \l{tools/codecs}{Codecs} example for an application of QTextCodec
       
   841     to file I/O.
       
   842 
       
   843     Some care must be taken when trying to convert the data in chunks,
       
   844     for example, when receiving it over a network. In such cases it is
       
   845     possible that a multi-byte character will be split over two
       
   846     chunks. At best this might result in the loss of a character and
       
   847     at worst cause the entire conversion to fail.
       
   848 
       
   849     The approach to use in these situations is to create a QTextDecoder
       
   850     object for the codec and use this QTextDecoder for the whole
       
   851     decoding process, as shown below:
       
   852 
       
   853     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2
       
   854 
       
   855     The QTextDecoder object maintains state between chunks and therefore
       
   856     works correctly even if a multi-byte character is split between
       
   857     chunks.
       
   858 
       
   859     \section1 Creating Your Own Codec Class
       
   860 
       
   861     Support for new text encodings can be added to Qt by creating
       
   862     QTextCodec subclasses.
       
   863 
       
   864     The pure virtual functions describe the encoder to the system and
       
   865     the coder is used as required in the different text file formats
       
   866     supported by QTextStream, and under X11, for the locale-specific
       
   867     character input and output.
       
   868 
       
   869     To add support for another encoding to Qt, make a subclass of
       
   870     QTextCodec and implement the functions listed in the table below.
       
   871 
       
   872     \table
       
   873     \header \o Function \o Description
       
   874 
       
   875     \row \o name()
       
   876          \o Returns the official name for the encoding. If the
       
   877             encoding is listed in the
       
   878             \l{IANA character-sets encoding file}, the name
       
   879             should be the preferred MIME name for the encoding.
       
   880 
       
   881     \row \o aliases()
       
   882          \o Returns a list of alternative names for the encoding.
       
   883             QTextCodec provides a default implementation that returns
       
   884             an empty list. For example, "ISO-8859-1" has "latin1",
       
   885             "CP819", "IBM819", and "iso-ir-100" as aliases.
       
   886 
       
   887     \row \o mibEnum()
       
   888          \o Return the MIB enum for the encoding if it is listed in
       
   889             the \l{IANA character-sets encoding file}.
       
   890 
       
   891     \row \o convertToUnicode()
       
   892          \o Converts an 8-bit character string to Unicode.
       
   893 
       
   894     \row \o convertFromUnicode()
       
   895          \o Converts a Unicode string to an 8-bit character string.
       
   896     \endtable
       
   897 
       
   898     You may find it more convenient to make your codec class
       
   899     available as a plugin; see \l{How to Create Qt Plugins} for
       
   900     details.
       
   901 
       
   902     \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example}
       
   903 */
       
   904 
       
   905 /*!
       
   906     \nonreentrant
       
   907 
       
   908     Constructs a QTextCodec, and gives it the highest precedence. The
       
   909     QTextCodec should always be constructed on the heap (i.e. with \c
       
   910     new). Qt takes ownership and will delete it when the application
       
   911     terminates.
       
   912 */
       
   913 QTextCodec::QTextCodec()
       
   914 {
       
   915     setup();
       
   916     all->prepend(this);
       
   917 }
       
   918 
       
   919 
       
   920 /*!
       
   921     \nonreentrant
       
   922 
       
   923     Destroys the QTextCodec. Note that you should not delete codecs
       
   924     yourself: once created they become Qt's responsibility.
       
   925 */
       
   926 QTextCodec::~QTextCodec()
       
   927 {
       
   928 #ifdef Q_DEBUG_TEXTCODEC
       
   929     if (!destroying_is_ok)
       
   930         qWarning("QTextCodec::~QTextCodec: Called by application");
       
   931 #endif
       
   932     if (all)
       
   933         all->removeAll(this);
       
   934 }
       
   935 
       
   936 /*!
       
   937     \fn QTextCodec *QTextCodec::codecForName(const char *name)
       
   938 
       
   939     Searches all installed QTextCodec objects and returns the one
       
   940     which best matches \a name; the match is case-insensitive. Returns
       
   941     0 if no codec matching the name \a name could be found.
       
   942 */
       
   943 
       
   944 /*!
       
   945     Searches all installed QTextCodec objects and returns the one
       
   946     which best matches \a name; the match is case-insensitive. Returns
       
   947     0 if no codec matching the name \a name could be found.
       
   948 */
       
   949 QTextCodec *QTextCodec::codecForName(const QByteArray &name)
       
   950 {
       
   951     if (name.isEmpty())
       
   952         return 0;
       
   953 
       
   954     setup();
       
   955 
       
   956     for (int i = 0; i < all->size(); ++i) {
       
   957         QTextCodec *cursor = all->at(i);
       
   958         if (nameMatch(cursor->name(), name))
       
   959             return cursor;
       
   960         QList<QByteArray> aliases = cursor->aliases();
       
   961         for (int i = 0; i < aliases.size(); ++i)
       
   962             if (nameMatch(aliases.at(i), name))
       
   963                 return cursor;
       
   964     }
       
   965 
       
   966     return createForName(name);
       
   967 }
       
   968 
       
   969 
       
   970 /*!
       
   971     Returns the QTextCodec which matches the \link
       
   972     QTextCodec::mibEnum() MIBenum\endlink \a mib.
       
   973 */
       
   974 QTextCodec* QTextCodec::codecForMib(int mib)
       
   975 {
       
   976     setup();
       
   977 
       
   978     // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map
       
   979     // this correctly for compatibility.
       
   980     if (mib == 1000)
       
   981         mib = 1015;
       
   982 
       
   983     QList<QTextCodec*>::ConstIterator i;
       
   984     for (int i = 0; i < all->size(); ++i) {
       
   985         QTextCodec *cursor = all->at(i);
       
   986         if (cursor->mibEnum() == mib)
       
   987             return cursor;
       
   988     }
       
   989 
       
   990     return createForMib(mib);
       
   991 }
       
   992 
       
   993 /*!
       
   994     Returns the list of all available codecs, by name. Call
       
   995     QTextCodec::codecForName() to obtain the QTextCodec for the name.
       
   996 
       
   997     The list may contain many mentions of the same codec
       
   998     if the codec has aliases.
       
   999 
       
  1000     \sa availableMibs(), name(), aliases()
       
  1001 */
       
  1002 QList<QByteArray> QTextCodec::availableCodecs()
       
  1003 {
       
  1004     setup();
       
  1005 
       
  1006     QList<QByteArray> codecs;
       
  1007     for (int i = 0; i < all->size(); ++i) {
       
  1008         codecs += all->at(i)->name();
       
  1009         codecs += all->at(i)->aliases();
       
  1010     }
       
  1011 #ifndef QT_NO_TEXTCODECPLUGIN
       
  1012     QFactoryLoader *l = loader();
       
  1013     QStringList keys = l->keys();
       
  1014     for (int i = 0; i < keys.size(); ++i) {
       
  1015         if (!keys.at(i).startsWith(QLatin1String("MIB: "))) {
       
  1016             QByteArray name = keys.at(i).toLatin1();
       
  1017             if (!codecs.contains(name))
       
  1018                 codecs += name;
       
  1019         }
       
  1020     }
       
  1021 #endif
       
  1022 
       
  1023     return codecs;
       
  1024 }
       
  1025 
       
  1026 /*!
       
  1027     Returns the list of MIBs for all available codecs. Call
       
  1028     QTextCodec::codecForMib() to obtain the QTextCodec for the MIB.
       
  1029 
       
  1030     \sa availableCodecs(), mibEnum()
       
  1031 */
       
  1032 QList<int> QTextCodec::availableMibs()
       
  1033 {
       
  1034     setup();
       
  1035 
       
  1036     QList<int> codecs;
       
  1037     for (int i = 0; i < all->size(); ++i)
       
  1038         codecs += all->at(i)->mibEnum();
       
  1039 #ifndef QT_NO_TEXTCODECPLUGIN
       
  1040     QFactoryLoader *l = loader();
       
  1041     QStringList keys = l->keys();
       
  1042     for (int i = 0; i < keys.size(); ++i) {
       
  1043         if (keys.at(i).startsWith(QLatin1String("MIB: "))) {
       
  1044             int mib = keys.at(i).mid(5).toInt();
       
  1045             if (!codecs.contains(mib))
       
  1046                 codecs += mib;
       
  1047         }
       
  1048     }
       
  1049 #endif
       
  1050 
       
  1051     return codecs;
       
  1052 }
       
  1053 
       
  1054 /*!
       
  1055     Set the codec to \a c; this will be returned by
       
  1056     codecForLocale(). If \a c is a null pointer, the codec is reset to
       
  1057     the default.
       
  1058 
       
  1059     This might be needed for some applications that want to use their
       
  1060     own mechanism for setting the locale.
       
  1061 
       
  1062     \sa codecForLocale()
       
  1063 */
       
  1064 void QTextCodec::setCodecForLocale(QTextCodec *c)
       
  1065 {
       
  1066     localeMapper = c;
       
  1067     if (!localeMapper)
       
  1068         setupLocaleMapper();
       
  1069 }
       
  1070 
       
  1071 /*!
       
  1072     Returns a pointer to the codec most suitable for this locale.
       
  1073 
       
  1074     On Windows, the codec will be based on a system locale. On Unix
       
  1075     systems, starting with Qt 4.2, the codec will be using the \e
       
  1076     iconv library. Note that in both cases the codec's name will be
       
  1077     "System".
       
  1078 */
       
  1079 
       
  1080 QTextCodec* QTextCodec::codecForLocale()
       
  1081 {
       
  1082     if (localeMapper)
       
  1083         return localeMapper;
       
  1084 
       
  1085     setup();
       
  1086 
       
  1087     return localeMapper;
       
  1088 }
       
  1089 
       
  1090 
       
  1091 /*!
       
  1092     \fn QByteArray QTextCodec::name() const
       
  1093 
       
  1094     QTextCodec subclasses must reimplement this function. It returns
       
  1095     the name of the encoding supported by the subclass.
       
  1096 
       
  1097     If the codec is registered as a character set in the
       
  1098     \l{IANA character-sets encoding file} this method should
       
  1099     return the preferred mime name for the codec if defined,
       
  1100     otherwise its name.
       
  1101 */
       
  1102 
       
  1103 /*!
       
  1104     \fn int QTextCodec::mibEnum() const
       
  1105 
       
  1106     Subclasses of QTextCodec must reimplement this function. It
       
  1107     returns the MIBenum (see \l{IANA character-sets encoding file}
       
  1108     for more information). It is important that each QTextCodec
       
  1109     subclass returns the correct unique value for this function.
       
  1110 */
       
  1111 
       
  1112 /*!
       
  1113   Subclasses can return a number of aliases for the codec in question.
       
  1114 
       
  1115   Standard aliases for codecs can be found in the
       
  1116   \l{IANA character-sets encoding file}.
       
  1117 */
       
  1118 QList<QByteArray> QTextCodec::aliases() const
       
  1119 {
       
  1120     return QList<QByteArray>();
       
  1121 }
       
  1122 
       
  1123 /*!
       
  1124     \fn QString QTextCodec::convertToUnicode(const char *chars, int len,
       
  1125                                              ConverterState *state) const
       
  1126 
       
  1127     QTextCodec subclasses must reimplement this function.
       
  1128 
       
  1129     Converts the first \a len characters of \a chars from the
       
  1130     encoding of the subclass to Unicode, and returns the result in a
       
  1131     QString.
       
  1132 
       
  1133     \a state can be 0, in which case the conversion is stateless and
       
  1134     default conversion rules should be used. If state is not 0, the
       
  1135     codec should save the state after the conversion in \a state, and
       
  1136     adjust the remainingChars and invalidChars members of the struct.
       
  1137 */
       
  1138 
       
  1139 /*!
       
  1140     \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number,
       
  1141                                                   ConverterState *state) const
       
  1142 
       
  1143     QTextCodec subclasses must reimplement this function.
       
  1144 
       
  1145     Converts the first \a number of characters from the \a input array
       
  1146     from Unicode to the encoding of the subclass, and returns the result
       
  1147     in a QByteArray.
       
  1148 
       
  1149     \a state can be 0 in which case the conversion is stateless and
       
  1150     default conversion rules should be used. If state is not 0, the
       
  1151     codec should save the state after the conversion in \a state, and
       
  1152     adjust the remainingChars and invalidChars members of the struct.
       
  1153 */
       
  1154 
       
  1155 /*!
       
  1156     Creates a QTextDecoder which stores enough state to decode chunks
       
  1157     of \c{char *} data to create chunks of Unicode data.
       
  1158 
       
  1159     The caller is responsible for deleting the returned object.
       
  1160 */
       
  1161 QTextDecoder* QTextCodec::makeDecoder() const
       
  1162 {
       
  1163     return new QTextDecoder(this);
       
  1164 }
       
  1165 
       
  1166 
       
  1167 /*!
       
  1168     Creates a QTextEncoder which stores enough state to encode chunks
       
  1169     of Unicode data as \c{char *} data.
       
  1170 
       
  1171     The caller is responsible for deleting the returned object.
       
  1172 */
       
  1173 QTextEncoder* QTextCodec::makeEncoder() const
       
  1174 {
       
  1175     return new QTextEncoder(this);
       
  1176 }
       
  1177 
       
  1178 /*!
       
  1179     \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number,
       
  1180                                            ConverterState *state) const
       
  1181 
       
  1182     Converts the first \a number of characters from the \a input array
       
  1183     from Unicode to the encoding of this codec, and returns the result
       
  1184     in a QByteArray.
       
  1185 
       
  1186     The \a state of the convertor used is updated.
       
  1187 */
       
  1188 
       
  1189 /*!
       
  1190     Converts \a str from Unicode to the encoding of this codec, and
       
  1191     returns the result in a QByteArray.
       
  1192 */
       
  1193 QByteArray QTextCodec::fromUnicode(const QString& str) const
       
  1194 {
       
  1195     return convertFromUnicode(str.constData(), str.length(), 0);
       
  1196 }
       
  1197 
       
  1198 /*!
       
  1199     \fn QString QTextCodec::toUnicode(const char *input, int size,
       
  1200                                       ConverterState *state) const
       
  1201 
       
  1202     Converts the first \a size characters from the \a input from the
       
  1203     encoding of this codec to Unicode, and returns the result in a
       
  1204     QString.
       
  1205 
       
  1206     The \a state of the convertor used is updated.
       
  1207 */
       
  1208 
       
  1209 /*!
       
  1210     Converts \a a from the encoding of this codec to Unicode, and
       
  1211     returns the result in a QString.
       
  1212 */
       
  1213 QString QTextCodec::toUnicode(const QByteArray& a) const
       
  1214 {
       
  1215     return convertToUnicode(a.constData(), a.length(), 0);
       
  1216 }
       
  1217 
       
  1218 /*!
       
  1219     Returns true if the Unicode character \a ch can be fully encoded
       
  1220     with this codec; otherwise returns false.
       
  1221 */
       
  1222 bool QTextCodec::canEncode(QChar ch) const
       
  1223 {
       
  1224     ConverterState state;
       
  1225     state.flags = ConvertInvalidToNull;
       
  1226     convertFromUnicode(&ch, 1, &state);
       
  1227     return (state.invalidChars == 0);
       
  1228 }
       
  1229 
       
  1230 /*!
       
  1231     \overload
       
  1232 
       
  1233     \a s contains the string being tested for encode-ability.
       
  1234 */
       
  1235 bool QTextCodec::canEncode(const QString& s) const
       
  1236 {
       
  1237     ConverterState state;
       
  1238     state.flags = ConvertInvalidToNull;
       
  1239     convertFromUnicode(s.constData(), s.length(), &state);
       
  1240     return (state.invalidChars == 0);
       
  1241 }
       
  1242 
       
  1243 #ifdef QT3_SUPPORT
       
  1244 /*!
       
  1245     Returns a string representing the current language and
       
  1246     sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil.
       
  1247 
       
  1248     \sa QLocale
       
  1249 */
       
  1250 const char *QTextCodec::locale()
       
  1251 {
       
  1252     static char locale[6];
       
  1253     QByteArray l = QLocale::system().name().toLatin1();
       
  1254     int len = qMin(l.length(), 5);
       
  1255     memcpy(locale, l.constData(), len);
       
  1256     locale[len] = '\0';
       
  1257 
       
  1258     return locale;
       
  1259 }
       
  1260 
       
  1261 /*!
       
  1262     \overload
       
  1263 */
       
  1264 
       
  1265 QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const
       
  1266 {
       
  1267     QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0);
       
  1268     lenInOut = result.length();
       
  1269     return result;
       
  1270 }
       
  1271 
       
  1272 /*!
       
  1273     \overload
       
  1274 
       
  1275     \a a contains the source characters; \a len contains the number of
       
  1276     characters in \a a to use.
       
  1277 */
       
  1278 QString QTextCodec::toUnicode(const QByteArray& a, int len) const
       
  1279 {
       
  1280     len = qMin(a.size(), len);
       
  1281     return convertToUnicode(a.constData(), len, 0);
       
  1282 }
       
  1283 #endif
       
  1284 
       
  1285 /*!
       
  1286     \overload
       
  1287 
       
  1288     \a chars contains the source characters.
       
  1289 */
       
  1290 QString QTextCodec::toUnicode(const char *chars) const
       
  1291 {
       
  1292     int len = qstrlen(chars);
       
  1293     return convertToUnicode(chars, len, 0);
       
  1294 }
       
  1295 
       
  1296 
       
  1297 /*!
       
  1298     \class QTextEncoder
       
  1299     \brief The QTextEncoder class provides a state-based encoder.
       
  1300     \reentrant
       
  1301     \ingroup i18n
       
  1302 
       
  1303     A text encoder converts text from Unicode into an encoded text format
       
  1304     using a specific codec.
       
  1305 
       
  1306     The encoder converts Unicode into another format, remembering any
       
  1307     state that is required between calls.
       
  1308 
       
  1309     \sa QTextCodec::makeEncoder(), QTextDecoder
       
  1310 */
       
  1311 
       
  1312 /*!
       
  1313     \fn QTextEncoder::QTextEncoder(const QTextCodec *codec)
       
  1314 
       
  1315     Constructs a text encoder for the given \a codec.
       
  1316 */
       
  1317 
       
  1318 /*!
       
  1319     Destroys the encoder.
       
  1320 */
       
  1321 QTextEncoder::~QTextEncoder()
       
  1322 {
       
  1323 }
       
  1324 
       
  1325 /*! \internal
       
  1326     \since 4.5
       
  1327     Determines whether the encoder encountered a failure while encoding the input. If
       
  1328     an error was encountered, the produced result is undefined, and gets converted as according
       
  1329     to the conversion flags.
       
  1330  */
       
  1331 bool QTextEncoder::hasFailure() const
       
  1332 {
       
  1333     return state.invalidChars != 0;
       
  1334 }
       
  1335 
       
  1336 /*!
       
  1337     Converts the Unicode string \a str into an encoded QByteArray.
       
  1338 */
       
  1339 QByteArray QTextEncoder::fromUnicode(const QString& str)
       
  1340 {
       
  1341     QByteArray result = c->fromUnicode(str.constData(), str.length(), &state);
       
  1342     return result;
       
  1343 }
       
  1344 
       
  1345 /*!
       
  1346     \overload
       
  1347 
       
  1348     Converts \a len characters (not bytes) from \a uc, and returns the
       
  1349     result in a QByteArray.
       
  1350 */
       
  1351 QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len)
       
  1352 {
       
  1353     QByteArray result = c->fromUnicode(uc, len, &state);
       
  1354     return result;
       
  1355 }
       
  1356 
       
  1357 #ifdef QT3_SUPPORT
       
  1358 /*!
       
  1359   \overload
       
  1360 
       
  1361   Converts \a lenInOut characters (not bytes) from \a uc, and returns the
       
  1362   result in a QByteArray. On return, \a lenInOut is set to the length
       
  1363   of the resulting QByteArray.
       
  1364 */
       
  1365 QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut)
       
  1366 {
       
  1367     QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state);
       
  1368     lenInOut = result.length();
       
  1369     return result;
       
  1370 }
       
  1371 #endif
       
  1372 
       
  1373 /*!
       
  1374     \class QTextDecoder
       
  1375     \brief The QTextDecoder class provides a state-based decoder.
       
  1376     \reentrant
       
  1377     \ingroup i18n
       
  1378 
       
  1379     A text decoder converts text from an encoded text format into Unicode
       
  1380     using a specific codec.
       
  1381 
       
  1382     The decoder converts text in this format into Unicode, remembering any
       
  1383     state that is required between calls.
       
  1384 
       
  1385     \sa QTextCodec::makeDecoder(), QTextEncoder
       
  1386 */
       
  1387 
       
  1388 /*!
       
  1389     \fn QTextDecoder::QTextDecoder(const QTextCodec *codec)
       
  1390 
       
  1391     Constructs a text decoder for the given \a codec.
       
  1392 */
       
  1393 
       
  1394 /*!
       
  1395     Destroys the decoder.
       
  1396 */
       
  1397 QTextDecoder::~QTextDecoder()
       
  1398 {
       
  1399 }
       
  1400 
       
  1401 /*!
       
  1402     \fn QString QTextDecoder::toUnicode(const char *chars, int len)
       
  1403 
       
  1404     Converts the first \a len bytes in \a chars to Unicode, returning
       
  1405     the result.
       
  1406 
       
  1407     If not all characters are used (e.g. if only part of a multi-byte
       
  1408     encoding is at the end of the characters), the decoder remembers
       
  1409     enough state to continue with the next call to this function.
       
  1410 */
       
  1411 QString QTextDecoder::toUnicode(const char *chars, int len)
       
  1412 {
       
  1413     return c->toUnicode(chars, len, &state);
       
  1414 }
       
  1415 
       
  1416 
       
  1417 /*! \overload
       
  1418 
       
  1419     The converted string is returned in \a target.
       
  1420  */
       
  1421 void QTextDecoder::toUnicode(QString *target, const char *chars, int len)
       
  1422 {
       
  1423     Q_ASSERT(target);
       
  1424     switch (c->mibEnum()) {
       
  1425     case 106: // utf8
       
  1426         static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state);
       
  1427         break;
       
  1428     case 4: { // latin1
       
  1429         target->resize(len);
       
  1430         ushort *data = (ushort*)target->data();
       
  1431         for (int i = len; i >=0; --i)
       
  1432             data[i] = (uchar) chars[i];
       
  1433     } break;
       
  1434     default:
       
  1435         *target = c->toUnicode(chars, len, &state);
       
  1436     }
       
  1437 }
       
  1438 
       
  1439 
       
  1440 /*!
       
  1441     \overload
       
  1442 
       
  1443     Converts the bytes in the byte array specified by \a ba to Unicode
       
  1444     and returns the result.
       
  1445 */
       
  1446 QString QTextDecoder::toUnicode(const QByteArray &ba)
       
  1447 {
       
  1448     return c->toUnicode(ba.constData(), ba.length(), &state);
       
  1449 }
       
  1450 
       
  1451 
       
  1452 /*!
       
  1453     \fn QTextCodec* QTextCodec::codecForTr()
       
  1454 
       
  1455     Returns the codec used by QObject::tr() on its argument. If this
       
  1456     function returns 0 (the default), tr() assumes Latin-1.
       
  1457 
       
  1458     \sa setCodecForTr()
       
  1459 */
       
  1460 
       
  1461 /*!
       
  1462     \fn void QTextCodec::setCodecForTr(QTextCodec *c)
       
  1463     \nonreentrant
       
  1464 
       
  1465     Sets the codec used by QObject::tr() on its argument to \a c. If
       
  1466     \a c is 0 (the default), tr() assumes Latin-1.
       
  1467 
       
  1468     If the literal quoted text in the program is not in the Latin-1
       
  1469     encoding, this function can be used to set the appropriate
       
  1470     encoding. For example, software developed by Korean programmers
       
  1471     might use eucKR for all the text in the program, in which case the
       
  1472     main() function might look like this:
       
  1473 
       
  1474     \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3
       
  1475 
       
  1476     Note that this is not the way to select the encoding that the \e
       
  1477     user has chosen. For example, to convert an application containing
       
  1478     literal English strings to Korean, all that is needed is for the
       
  1479     English strings to be passed through tr() and for translation
       
  1480     files to be loaded. For details of internationalization, see
       
  1481     \l{Internationalization with Qt}.
       
  1482 
       
  1483     \sa codecForTr(), setCodecForCStrings()
       
  1484 */
       
  1485 
       
  1486 
       
  1487 /*!
       
  1488     \fn QTextCodec* QTextCodec::codecForCStrings()
       
  1489 
       
  1490     Returns the codec used by QString to convert to and from \c{const
       
  1491     char *} and QByteArrays. If this function returns 0 (the default),
       
  1492     QString assumes Latin-1.
       
  1493 
       
  1494     \sa setCodecForCStrings()
       
  1495 */
       
  1496 
       
  1497 /*!
       
  1498     \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec)
       
  1499     \nonreentrant
       
  1500 
       
  1501     Sets the codec used by QString to convert to and from \c{const
       
  1502     char *} and QByteArrays. If the \a codec is 0 (the default),
       
  1503     QString assumes Latin-1.
       
  1504 
       
  1505     \warning Some codecs do not preserve the characters in the ASCII
       
  1506     range (0x00 to 0x7F). For example, the Japanese Shift-JIS
       
  1507     encoding maps the backslash character (0x5C) to the Yen
       
  1508     character. To avoid undesirable side-effects, we recommend
       
  1509     avoiding such codecs with setCodecForCStrings().
       
  1510 
       
  1511     \sa codecForCStrings(), setCodecForTr()
       
  1512 */
       
  1513 
       
  1514 /*!
       
  1515     \since 4.4
       
  1516 
       
  1517     Tries to detect the encoding of the provided snippet of HTML in
       
  1518     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
       
  1519     and the content-type meta header and returns a QTextCodec instance
       
  1520     that is capable of decoding the html to unicode.  If the codec
       
  1521     cannot be detected from the content provided, \a defaultCodec is
       
  1522     returned.
       
  1523 
       
  1524     \sa codecForUtfText()
       
  1525 */
       
  1526 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec)
       
  1527 {
       
  1528     // determine charset
       
  1529     int pos;
       
  1530     QTextCodec *c = 0;
       
  1531 
       
  1532     c = QTextCodec::codecForUtfText(ba, c);
       
  1533     if (!c) {
       
  1534         QByteArray header = ba.left(512).toLower();
       
  1535         if ((pos = header.indexOf("http-equiv=")) != -1) {
       
  1536             if ((pos = header.lastIndexOf("meta ", pos)) != -1) {
       
  1537                 pos = header.indexOf("charset=", pos) + int(strlen("charset="));
       
  1538                 if (pos != -1) {
       
  1539                     int pos2 = header.indexOf('\"', pos+1);
       
  1540                     QByteArray cs = header.mid(pos, pos2-pos);
       
  1541                     //            qDebug("found charset: %s", cs.data());
       
  1542                     c = QTextCodec::codecForName(cs);
       
  1543                 }
       
  1544             }
       
  1545         }
       
  1546     }
       
  1547     if (!c)
       
  1548         c = defaultCodec;
       
  1549 
       
  1550     return c;
       
  1551 }
       
  1552 
       
  1553 /*!
       
  1554     \overload
       
  1555 
       
  1556     Tries to detect the encoding of the provided snippet of HTML in
       
  1557     the given byte array, \a ba, by checking the BOM (Byte Order Mark)
       
  1558     and the content-type meta header and returns a QTextCodec instance
       
  1559     that is capable of decoding the html to unicode. If the codec cannot
       
  1560     be detected, this overload returns a Latin-1 QTextCodec.
       
  1561 */
       
  1562 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba)
       
  1563 {
       
  1564     return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
       
  1565 }
       
  1566 
       
  1567 /*!
       
  1568     \since 4.6
       
  1569 
       
  1570     Tries to detect the encoding of the provided snippet \a ba by
       
  1571     using the BOM (Byte Order Mark) and returns a QTextCodec instance
       
  1572     that is capable of decoding the text to unicode. If the codec
       
  1573     cannot be detected from the content provided, \a defaultCodec is
       
  1574     returned.
       
  1575 
       
  1576     The behavior of this function is undefined if \a ba is not
       
  1577     encoded in unicode.
       
  1578 
       
  1579     \sa codecForHtml()
       
  1580 */
       
  1581 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec)
       
  1582 {
       
  1583     const int arraySize = ba.size();
       
  1584 
       
  1585     if (arraySize > 3) {
       
  1586         if ((uchar)ba[0] == 0x00
       
  1587             && (uchar)ba[1] == 0x00
       
  1588             && (uchar)ba[2] == 0xFE
       
  1589             && (uchar)ba[3] == 0xFF)
       
  1590             return QTextCodec::codecForMib(1018); // utf-32 be
       
  1591         else if ((uchar)ba[0] == 0xFF
       
  1592                  && (uchar)ba[1] == 0xFE
       
  1593                  && (uchar)ba[2] == 0x00
       
  1594                  && (uchar)ba[3] == 0x00)
       
  1595             return QTextCodec::codecForMib(1019); // utf-32 le
       
  1596     }
       
  1597 
       
  1598     if (arraySize < 2)
       
  1599         return defaultCodec;
       
  1600     if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff)
       
  1601         return QTextCodec::codecForMib(1013); // utf16 be
       
  1602     else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe)
       
  1603         return QTextCodec::codecForMib(1014); // utf16 le
       
  1604 
       
  1605     if (arraySize < 3)
       
  1606         return defaultCodec;
       
  1607     if ((uchar)ba[0] == 0xef
       
  1608         && (uchar)ba[1] == 0xbb
       
  1609         && (uchar)ba[2] == 0xbf)
       
  1610         return QTextCodec::codecForMib(106); // utf-8
       
  1611 
       
  1612     return defaultCodec;
       
  1613 }
       
  1614 
       
  1615 /*!
       
  1616     \overload
       
  1617 
       
  1618     Tries to detect the encoding of the provided snippet \a ba by
       
  1619     using the BOM (Byte Order Mark) and returns a QTextCodec instance
       
  1620     that is capable of decoding the text to unicode. If the codec
       
  1621     cannot be detected, this overload returns a Latin-1 QTextCodec.
       
  1622 
       
  1623     The behavior of this function is undefined if \a ba is not
       
  1624     encoded in unicode.
       
  1625 
       
  1626     \sa codecForHtml()
       
  1627 */
       
  1628 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba)
       
  1629 {
       
  1630     return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4));
       
  1631 }
       
  1632 
       
  1633 
       
  1634 /*! \internal
       
  1635     \since 4.3
       
  1636     Determines whether the decoder encountered a failure while decoding the input. If
       
  1637     an error was encountered, the produced result is undefined, and gets converted according
       
  1638     to the conversion flags.
       
  1639  */
       
  1640 bool QTextDecoder::hasFailure() const
       
  1641 {
       
  1642     return state.invalidChars != 0;
       
  1643 }
       
  1644 
       
  1645 /*!
       
  1646     \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size)
       
  1647 
       
  1648     This functionality is no longer provided by Qt. This
       
  1649     compatibility function always returns a null pointer.
       
  1650 */
       
  1651 
       
  1652 /*!
       
  1653     \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy)
       
  1654 
       
  1655     Use the codecForName(const QByteArray &) overload instead.
       
  1656 */
       
  1657 
       
  1658 /*!
       
  1659     \fn QTextCodec *QTextCodec::codecForIndex(int i)
       
  1660 
       
  1661     Use availableCodecs() or availableMibs() instead and iterate
       
  1662     through the resulting list.
       
  1663 */
       
  1664 
       
  1665 
       
  1666 /*!
       
  1667     \fn QByteArray QTextCodec::mimeName() const
       
  1668 
       
  1669     Use name() instead.
       
  1670 */
       
  1671 
       
  1672 QT_END_NAMESPACE
       
  1673 
       
  1674 #endif // QT_NO_TEXTCODEC