|
1 /**************************************************************************** |
|
2 ** |
|
3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
4 ** All rights reserved. |
|
5 ** Contact: Nokia Corporation (qt-info@nokia.com) |
|
6 ** |
|
7 ** This file is part of the QtCore module of the Qt Toolkit. |
|
8 ** |
|
9 ** $QT_BEGIN_LICENSE:LGPL$ |
|
10 ** No Commercial Usage |
|
11 ** This file contains pre-release code and may not be distributed. |
|
12 ** You may use this file in accordance with the terms and conditions |
|
13 ** contained in the Technology Preview License Agreement accompanying |
|
14 ** this package. |
|
15 ** |
|
16 ** GNU Lesser General Public License Usage |
|
17 ** Alternatively, this file may be used under the terms of the GNU Lesser |
|
18 ** General Public License version 2.1 as published by the Free Software |
|
19 ** Foundation and appearing in the file LICENSE.LGPL included in the |
|
20 ** packaging of this file. Please review the following information to |
|
21 ** ensure the GNU Lesser General Public License version 2.1 requirements |
|
22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. |
|
23 ** |
|
24 ** In addition, as a special exception, Nokia gives you certain additional |
|
25 ** rights. These rights are described in the Nokia Qt LGPL Exception |
|
26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. |
|
27 ** |
|
28 ** If you have questions regarding the use of this file, please contact |
|
29 ** Nokia at qt-info@nokia.com. |
|
30 ** |
|
31 ** |
|
32 ** |
|
33 ** |
|
34 ** |
|
35 ** |
|
36 ** |
|
37 ** |
|
38 ** $QT_END_LICENSE$ |
|
39 ** |
|
40 ****************************************************************************/ |
|
41 |
|
42 #include "qplatformdefs.h" |
|
43 #include "qtextcodec.h" |
|
44 #include "qtextcodec_p.h" |
|
45 |
|
46 #ifndef QT_NO_TEXTCODEC |
|
47 |
|
48 #include "qlist.h" |
|
49 #include "qfile.h" |
|
50 #ifndef QT_NO_LIBRARY |
|
51 # include "qcoreapplication.h" |
|
52 # include "qtextcodecplugin.h" |
|
53 # include "private/qfactoryloader_p.h" |
|
54 #endif |
|
55 #include "qstringlist.h" |
|
56 |
|
57 #ifdef Q_OS_UNIX |
|
58 # include "qiconvcodec_p.h" |
|
59 #endif |
|
60 |
|
61 #include "qutfcodec_p.h" |
|
62 #include "qsimplecodec_p.h" |
|
63 #include "qlatincodec_p.h" |
|
64 #ifndef QT_NO_CODECS |
|
65 # include "qtsciicodec_p.h" |
|
66 # include "qisciicodec_p.h" |
|
67 # if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED) |
|
68 // no iconv(3) support, must build all codecs into the library |
|
69 # include "../../plugins/codecs/cn/qgb18030codec.h" |
|
70 # include "../../plugins/codecs/jp/qeucjpcodec.h" |
|
71 # include "../../plugins/codecs/jp/qjiscodec.h" |
|
72 # include "../../plugins/codecs/jp/qsjiscodec.h" |
|
73 # include "../../plugins/codecs/kr/qeuckrcodec.h" |
|
74 # include "../../plugins/codecs/tw/qbig5codec.h" |
|
75 # endif // QT_NO_ICONV |
|
76 # if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED) |
|
77 # include "qfontlaocodec_p.h" |
|
78 # include "../../plugins/codecs/jp/qfontjpcodec.h" |
|
79 # endif |
|
80 #endif // QT_NO_CODECS |
|
81 #include "qlocale.h" |
|
82 #include "private/qmutexpool_p.h" |
|
83 |
|
84 #include <stdlib.h> |
|
85 #include <ctype.h> |
|
86 #include <locale.h> |
|
87 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF) |
|
88 #include <langinfo.h> |
|
89 #endif |
|
90 |
|
91 #if defined(Q_OS_WINCE) |
|
92 # define QT_NO_SETLOCALE |
|
93 #endif |
|
94 |
|
95 // enabling this is not exception safe! |
|
96 // #define Q_DEBUG_TEXTCODEC |
|
97 |
|
98 QT_BEGIN_NAMESPACE |
|
99 |
|
100 #ifndef QT_NO_TEXTCODECPLUGIN |
|
101 Q_GLOBAL_STATIC_WITH_ARGS(QFactoryLoader, loader, |
|
102 (QTextCodecFactoryInterface_iid, QLatin1String("/codecs"))) |
|
103 #endif |
|
104 |
|
/*
    Returns the lower-case form of \a c for the ASCII letters 'A'..'Z'; any
    other value is returned unchanged. No C library locale function is
    involved, so the result does not depend on the current locale.
*/
static char qtolower(char c)
{
    if (c >= 'A' && c <= 'Z')
        return static_cast<char>(c + 0x20); // 'a' - 'A' == 0x20 in ASCII
    return c;
}
|
/*
    Returns true if \a c is an ASCII digit ('0'..'9') or an ASCII letter
    ('a'..'z', 'A'..'Z'); false for everything else.
*/
static bool qisalnum(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    // Setting bit 0x20 folds 'A'..'Z' onto 'a'..'z', so one range test
    // covers both cases.
    const int folded = c | 0x20;
    return folded >= 'a' && folded <= 'z';
}
|
109 |
|
110 static bool nameMatch(const QByteArray &name, const QByteArray &test) |
|
111 { |
|
112 // if they're the same, return a perfect score |
|
113 if (qstricmp(name, test) == 0) |
|
114 return true; |
|
115 |
|
116 const char *n = name.constData(); |
|
117 const char *h = test.constData(); |
|
118 |
|
119 // if the letters and numbers are the same, we have a match |
|
120 while (*n != '\0') { |
|
121 if (qisalnum(*n)) { |
|
122 for (;;) { |
|
123 if (*h == '\0') |
|
124 return false; |
|
125 if (qisalnum(*h)) |
|
126 break; |
|
127 ++h; |
|
128 } |
|
129 if (qtolower(*n) != qtolower(*h)) |
|
130 return false; |
|
131 ++h; |
|
132 } |
|
133 ++n; |
|
134 } |
|
135 while (*h && !qisalnum(*h)) |
|
136 ++h; |
|
137 return (*h == '\0'); |
|
138 } |
|
139 |
|
140 |
|
141 static QTextCodec *createForName(const QByteArray &name) |
|
142 { |
|
143 #ifndef QT_NO_TEXTCODECPLUGIN |
|
144 QFactoryLoader *l = loader(); |
|
145 QStringList keys = l->keys(); |
|
146 for (int i = 0; i < keys.size(); ++i) { |
|
147 if (nameMatch(name, keys.at(i).toLatin1())) { |
|
148 QString realName = keys.at(i); |
|
149 if (QTextCodecFactoryInterface *factory |
|
150 = qobject_cast<QTextCodecFactoryInterface*>(l->instance(realName))) { |
|
151 return factory->create(realName); |
|
152 } |
|
153 } |
|
154 } |
|
155 #else |
|
156 Q_UNUSED(name); |
|
157 #endif |
|
158 return 0; |
|
159 } |
|
160 |
|
161 static QTextCodec *createForMib(int mib) |
|
162 { |
|
163 #ifndef QT_NO_TEXTCODECPLUGIN |
|
164 QString name = QLatin1String("MIB: ") + QString::number(mib); |
|
165 if (QTextCodecFactoryInterface *factory |
|
166 = qobject_cast<QTextCodecFactoryInterface*>(loader()->instance(name))) |
|
167 return factory->create(name); |
|
168 #else |
|
169 Q_UNUSED(mib); |
|
170 #endif |
|
171 return 0; |
|
172 } |
|
173 |
|
174 static QList<QTextCodec*> *all = 0; |
|
175 #ifdef Q_DEBUG_TEXTCODEC |
|
176 static bool destroying_is_ok = false; |
|
177 #endif |
|
178 |
|
179 static QTextCodec *localeMapper = 0; |
|
180 QTextCodec *QTextCodec::cftr = 0; |
|
181 |
|
182 |
|
183 class QTextCodecCleanup |
|
184 { |
|
185 public: |
|
186 ~QTextCodecCleanup(); |
|
187 }; |
|
188 |
|
189 /* |
|
190 Deletes all the created codecs. This destructor is called just |
|
191 before exiting to delete any QTextCodec objects that may be lying |
|
192 around. |
|
193 */ |
|
194 QTextCodecCleanup::~QTextCodecCleanup() |
|
195 { |
|
196 if (!all) |
|
197 return; |
|
198 |
|
199 #ifdef Q_DEBUG_TEXTCODEC |
|
200 destroying_is_ok = true; |
|
201 #endif |
|
202 |
|
203 for (QList<QTextCodec *>::const_iterator it = all->constBegin() |
|
204 ; it != all->constEnd(); ++it) { |
|
205 delete *it; |
|
206 } |
|
207 delete all; |
|
208 all = 0; |
|
209 localeMapper = 0; |
|
210 |
|
211 #ifdef Q_DEBUG_TEXTCODEC |
|
212 destroying_is_ok = false; |
|
213 #endif |
|
214 } |
|
215 |
|
216 Q_GLOBAL_STATIC(QTextCodecCleanup, createQTextCodecCleanup) |
|
217 |
|
218 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE) |
|
219 class QWindowsLocalCodec: public QTextCodec |
|
220 { |
|
221 public: |
|
222 QWindowsLocalCodec(); |
|
223 ~QWindowsLocalCodec(); |
|
224 |
|
225 QString convertToUnicode(const char *, int, ConverterState *) const; |
|
226 QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const; |
|
227 QString convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const; |
|
228 |
|
229 QByteArray name() const; |
|
230 int mibEnum() const; |
|
231 |
|
232 }; |
|
233 |
|
234 QWindowsLocalCodec::QWindowsLocalCodec() |
|
235 { |
|
236 } |
|
237 |
|
238 QWindowsLocalCodec::~QWindowsLocalCodec() |
|
239 { |
|
240 } |
|
241 |
|
242 QString QWindowsLocalCodec::convertToUnicode(const char *chars, int length, ConverterState *state) const |
|
243 { |
|
244 const char *mb = chars; |
|
245 int mblen = length; |
|
246 |
|
247 if (!mb || !mblen) |
|
248 return QString(); |
|
249 |
|
250 const int wclen_auto = 4096; |
|
251 wchar_t wc_auto[wclen_auto]; |
|
252 int wclen = wclen_auto; |
|
253 wchar_t *wc = wc_auto; |
|
254 int len; |
|
255 QString sp; |
|
256 bool prepend = false; |
|
257 char state_data = 0; |
|
258 int remainingChars = 0; |
|
259 |
|
260 //save the current state information |
|
261 if (state) { |
|
262 state_data = (char)state->state_data[0]; |
|
263 remainingChars = state->remainingChars; |
|
264 } |
|
265 |
|
266 //convert the pending charcter (if available) |
|
267 if (state && remainingChars) { |
|
268 char prev[3] = {0}; |
|
269 prev[0] = state_data; |
|
270 prev[1] = mb[0]; |
|
271 remainingChars = 0; |
|
272 len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, |
|
273 prev, 2, wc, wclen); |
|
274 if (len) { |
|
275 prepend = true; |
|
276 sp.append(QChar(wc[0])); |
|
277 mb++; |
|
278 mblen--; |
|
279 wc[0] = 0; |
|
280 } |
|
281 } |
|
282 |
|
283 while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, |
|
284 mb, mblen, wc, wclen))) { |
|
285 int r = GetLastError(); |
|
286 if (r == ERROR_INSUFFICIENT_BUFFER) { |
|
287 if (wc != wc_auto) { |
|
288 qWarning("MultiByteToWideChar: Size changed"); |
|
289 break; |
|
290 } else { |
|
291 wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, |
|
292 mb, mblen, 0, 0); |
|
293 wc = new wchar_t[wclen]; |
|
294 // and try again... |
|
295 } |
|
296 } else if (r == ERROR_NO_UNICODE_TRANSLATION) { |
|
297 //find the last non NULL character |
|
298 while (mblen > 1 && !(mb[mblen-1])) |
|
299 mblen--; |
|
300 //check whether, we hit an invalid character in the middle |
|
301 if ((mblen <= 1) || (remainingChars && state_data)) |
|
302 return convertToUnicodeCharByChar(chars, length, state); |
|
303 //Remove the last character and try again... |
|
304 state_data = mb[mblen-1]; |
|
305 remainingChars = 1; |
|
306 mblen--; |
|
307 } else { |
|
308 // Fail. |
|
309 qWarning("MultiByteToWideChar: Cannot convert multibyte text"); |
|
310 break; |
|
311 } |
|
312 } |
|
313 if (len <= 0) |
|
314 return QString(); |
|
315 if (wc[len-1] == 0) // len - 1: we don't want terminator |
|
316 --len; |
|
317 |
|
318 //save the new state information |
|
319 if (state) { |
|
320 state->state_data[0] = (char)state_data; |
|
321 state->remainingChars = remainingChars; |
|
322 } |
|
323 QString s((QChar*)wc, len); |
|
324 if (wc != wc_auto) |
|
325 delete [] wc; |
|
326 if (prepend) { |
|
327 return sp+s; |
|
328 } |
|
329 return s; |
|
330 } |
|
331 |
|
332 QString QWindowsLocalCodec::convertToUnicodeCharByChar(const char *chars, int length, ConverterState *state) const |
|
333 { |
|
334 if (!chars || !length) |
|
335 return QString(); |
|
336 |
|
337 int copyLocation = 0; |
|
338 int extra = 2; |
|
339 if (state && state->remainingChars) { |
|
340 copyLocation = state->remainingChars; |
|
341 extra += copyLocation; |
|
342 } |
|
343 int newLength = length + extra; |
|
344 char *mbcs = new char[newLength]; |
|
345 //ensure that we have a NULL terminated string |
|
346 mbcs[newLength-1] = 0; |
|
347 mbcs[newLength-2] = 0; |
|
348 memcpy(&(mbcs[copyLocation]), chars, length); |
|
349 if (copyLocation) { |
|
350 //copy the last character from the state |
|
351 mbcs[0] = (char)state->state_data[0]; |
|
352 state->remainingChars = 0; |
|
353 } |
|
354 const char *mb = mbcs; |
|
355 #ifndef Q_OS_WINCE |
|
356 const char *next = 0; |
|
357 QString s; |
|
358 while((next = CharNextExA(CP_ACP, mb, 0)) != mb) { |
|
359 wchar_t wc[2] ={0}; |
|
360 int charlength = next - mb; |
|
361 int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); |
|
362 if (len>0) { |
|
363 s.append(QChar(wc[0])); |
|
364 } else { |
|
365 int r = GetLastError(); |
|
366 //check if the character being dropped is the last character |
|
367 if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) { |
|
368 state->remainingChars = 1; |
|
369 state->state_data[0] = (char)*mb; |
|
370 } |
|
371 } |
|
372 mb = next; |
|
373 } |
|
374 #else |
|
375 QString s; |
|
376 int size = mbstowcs(NULL, mb, length); |
|
377 if (size < 0) { |
|
378 Q_ASSERT("Error in CE TextCodec"); |
|
379 return QString(); |
|
380 } |
|
381 wchar_t* ws = new wchar_t[size + 2]; |
|
382 ws[size +1] = 0; |
|
383 ws[size] = 0; |
|
384 size = mbstowcs(ws, mb, length); |
|
385 for (int i=0; i< size; i++) |
|
386 s.append(QChar(ws[i])); |
|
387 delete [] ws; |
|
388 #endif |
|
389 delete mbcs; |
|
390 return s; |
|
391 } |
|
392 |
|
393 QByteArray QWindowsLocalCodec::convertFromUnicode(const QChar *uc, int len, ConverterState *) const |
|
394 { |
|
395 return qt_winQString2MB(uc, len); |
|
396 } |
|
397 |
|
398 |
|
399 QByteArray QWindowsLocalCodec::name() const |
|
400 { |
|
401 return "System"; |
|
402 } |
|
403 |
|
404 int QWindowsLocalCodec::mibEnum() const |
|
405 { |
|
406 return 0; |
|
407 } |
|
408 |
|
409 #else |
|
410 |
|
411 /* locale names mostly copied from XFree86 */ |
|
// Each table below is a 0-terminated list of locale names; try_locale_list()
// scans them and setupLocaleMapper() maps each table to one codec.

static const char * const iso8859_2locales[] = {
    "croatian", "cs", "cs_CS", "cs_CZ", "cz", "cz_CZ",
    "czech", "hr", "hr_HR", "hu", "hu_HU", "hungarian",
    "pl", "pl_PL", "polish", "ro", "ro_RO", "rumanian",
    "serbocroatian", "sh", "sh_SP", "sh_YU", "sk", "sk_SK",
    "sl", "sl_CS", "sl_SI", "slovak", "slovene", "sr_SP",
    0
};

static const char * const iso8859_3locales[] = { "eo", 0 };

static const char * const iso8859_4locales[] = { "ee", "ee_EE", 0 };

static const char * const iso8859_5locales[] = { "mk", "mk_MK", "sp", "sp_YU", 0 };

static const char * const cp_1251locales[] = { "be", "be_BY", "bg", "bg_BG", "bulgarian", 0 };

static const char * const pt_154locales[] = { "ba_RU", "ky", "ky_KG", "kk", "kk_KZ", 0 };

static const char * const iso8859_6locales[] = { "ar_AA", "ar_SA", "arabic", 0 };

static const char * const iso8859_7locales[] = { "el", "el_GR", "greek", 0 };

static const char * const iso8859_8locales[] = { "hebrew", "he", "he_IL", "iw", "iw_IL", 0 };

static const char * const iso8859_9locales[] = { "tr", "tr_TR", "turkish", 0 };

static const char * const iso8859_13locales[] = { "lt", "lt_LT", "lv", "lv_LV", 0 };

static const char * const iso8859_15locales[] = {
    "et", "et_EE",
    // Euro countries
    "br_FR", "ca_ES", "de", "de_AT", "de_BE", "de_DE",
    "de_LU", "en_IE", "es", "es_ES", "eu_ES", "fi",
    "fi_FI", "finnish", "fr", "fr_FR", "fr_BE", "fr_LU",
    "french", "ga_IE", "gl_ES", "it", "it_IT", "oc_FR",
    "nl", "nl_BE", "nl_NL", "pt", "pt_PT", "sv_FI",
    "wa_BE",
    0
};

static const char * const koi8_ulocales[] = { "uk", "uk_UA", "ru_UA", "ukrainian", 0 };

static const char * const tis_620locales[] = { "th", "th_TH", "thai", 0 };
|
462 |
|
463 // static const char * const tcvnlocales[] = { |
|
464 // "vi", "vi_VN", 0 }; |
|
465 |
|
466 static bool try_locale_list(const char * const locale[], const QByteArray &lang) |
|
467 { |
|
468 int i; |
|
469 for(i=0; locale[i] && lang != locale[i]; i++) |
|
470 ; |
|
471 return locale[i] != 0; |
|
472 } |
|
473 |
|
474 // For the probably_koi8_locales we have to look. The standard says |
|
475 // these are 8859-5, but almost all Russian users use KOI8-R and |
|
476 // incorrectly set $LANG to ru_RU. We'll check tolower() to see what |
|
477 // it thinks ru_RU means. |
|
478 |
|
479 // If you read the history, it seems that many Russians blame ISO and |
|
480 // Perestroika for the confusion. |
|
481 // |
|
482 // The real bug is that some programs break if the user specifies |
|
483 // ru_RU.KOI8-R. |
|
484 |
|
// Locale names whose encoding has to be probed by ru_RU_hack().
static const char * const probably_koi8_rlocales[] = {
    "ru",
    "ru_SU",
    "ru_RU",
    "russian",
    0
};
|
487 |
|
488 static QTextCodec * ru_RU_hack(const char * i) { |
|
489 QTextCodec * ru_RU_codec = 0; |
|
490 |
|
491 #if !defined(QT_NO_SETLOCALE) |
|
492 QByteArray origlocale(setlocale(LC_CTYPE, i)); |
|
493 #else |
|
494 QByteArray origlocale(i); |
|
495 #endif |
|
496 // unicode koi8r latin5 name |
|
497 // 0x044E 0xC0 0xEE CYRILLIC SMALL LETTER YU |
|
498 // 0x042E 0xE0 0xCE CYRILLIC CAPITAL LETTER YU |
|
499 int latin5 = tolower(0xCE); |
|
500 int koi8r = tolower(0xE0); |
|
501 if (koi8r == 0xC0 && latin5 != 0xEE) { |
|
502 ru_RU_codec = QTextCodec::codecForName("KOI8-R"); |
|
503 } else if (koi8r != 0xC0 && latin5 == 0xEE) { |
|
504 ru_RU_codec = QTextCodec::codecForName("ISO 8859-5"); |
|
505 } else { |
|
506 // something else again... let's assume... *throws dice* |
|
507 ru_RU_codec = QTextCodec::codecForName("KOI8-R"); |
|
508 qWarning("QTextCodec: Using KOI8-R, probe failed (%02x %02x %s)", |
|
509 koi8r, latin5, i); |
|
510 } |
|
511 #if !defined(QT_NO_SETLOCALE) |
|
512 setlocale(LC_CTYPE, origlocale); |
|
513 #endif |
|
514 |
|
515 return ru_RU_codec; |
|
516 } |
|
517 |
|
518 #endif |
|
519 |
|
520 #if !defined(Q_OS_WIN32) && !defined(Q_OS_WINCE) |
|
521 static QTextCodec *checkForCodec(const QByteArray &name) { |
|
522 QTextCodec *c = QTextCodec::codecForName(name); |
|
523 if (!c) { |
|
524 const int index = name.indexOf('@'); |
|
525 if (index != -1) { |
|
526 c = QTextCodec::codecForName(name.left(index)); |
|
527 } |
|
528 } |
|
529 return c; |
|
530 } |
|
531 #endif |
|
532 |
|
533 /* the next two functions are implicitly thread safe, |
|
534 as they are only called by setup() which uses a mutex. |
|
535 */ |
|
536 static void setupLocaleMapper() |
|
537 { |
|
538 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE) |
|
539 localeMapper = QTextCodec::codecForName("System"); |
|
540 #else |
|
541 |
|
542 #ifndef QT_NO_ICONV |
|
543 localeMapper = QTextCodec::codecForName("System"); |
|
544 #endif |
|
545 |
|
546 #if defined (_XOPEN_UNIX) && !defined(Q_OS_QNX) && !defined(Q_OS_OSF) |
|
547 if (!localeMapper) { |
|
548 char *charset = nl_langinfo (CODESET); |
|
549 if (charset) |
|
550 localeMapper = QTextCodec::codecForName(charset); |
|
551 } |
|
552 #endif |
|
553 |
|
554 if (!localeMapper) { |
|
555 // Very poorly defined and followed standards causes lots of |
|
556 // code to try to get all the cases... This logic is |
|
557 // duplicated in QIconvCodec, so if you change it here, change |
|
558 // it there too. |
|
559 |
|
560 // Try to determine locale codeset from locale name assigned to |
|
561 // LC_CTYPE category. |
|
562 |
|
563 // First part is getting that locale name. First try setlocale() which |
|
564 // definitely knows it, but since we cannot fully trust it, get ready |
|
565 // to fall back to environment variables. |
|
566 #if !defined(QT_NO_SETLOCALE) |
|
567 const QByteArray ctype = setlocale(LC_CTYPE, 0); |
|
568 #else |
|
569 const QByteArray ctype; |
|
570 #endif |
|
571 |
|
572 // Get the first nonempty value from $LC_ALL, $LC_CTYPE, and $LANG |
|
573 // environment variables. |
|
574 QByteArray lang = qgetenv("LC_ALL"); |
|
575 if (lang.isEmpty() || lang == "C") { |
|
576 lang = qgetenv("LC_CTYPE"); |
|
577 } |
|
578 if (lang.isEmpty() || lang == "C") { |
|
579 lang = qgetenv("LANG"); |
|
580 } |
|
581 |
|
582 // Now try these in order: |
|
583 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
|
584 // 2. CODESET from lang if it contains a .CODESET part |
|
585 // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
|
586 // 4. locale (ditto) |
|
587 // 5. check for "@euro" |
|
588 // 6. guess locale from ctype unless ctype is "C" |
|
589 // 7. guess locale from lang |
|
590 |
|
591 // 1. CODESET from ctype if it contains a .CODESET part (e.g. en_US.ISO8859-15) |
|
592 int indexOfDot = ctype.indexOf('.'); |
|
593 if (indexOfDot != -1) |
|
594 localeMapper = checkForCodec( ctype.mid(indexOfDot + 1) ); |
|
595 |
|
596 // 2. CODESET from lang if it contains a .CODESET part |
|
597 if (!localeMapper) { |
|
598 indexOfDot = lang.indexOf('.'); |
|
599 if (indexOfDot != -1) |
|
600 localeMapper = checkForCodec( lang.mid(indexOfDot + 1) ); |
|
601 } |
|
602 |
|
603 // 3. ctype (maybe the locale is named "ISO-8859-1" or something) |
|
604 if (!localeMapper && !ctype.isEmpty() && ctype != "C") |
|
605 localeMapper = checkForCodec(ctype); |
|
606 |
|
607 // 4. locale (ditto) |
|
608 if (!localeMapper && !lang.isEmpty()) |
|
609 localeMapper = checkForCodec(lang); |
|
610 |
|
611 // 5. "@euro" |
|
612 if ((!localeMapper && ctype.contains("@euro")) || lang.contains("@euro")) |
|
613 localeMapper = checkForCodec("ISO 8859-15"); |
|
614 |
|
615 // 6. guess locale from ctype unless ctype is "C" |
|
616 // 7. guess locale from lang |
|
617 const QByteArray &try_by_name = (!ctype.isEmpty() && ctype != "C") ? lang : ctype; |
|
618 |
|
619 // Now do the guessing. |
|
620 if (!lang.isEmpty() && !localeMapper && !try_by_name.isEmpty()) { |
|
621 if (try_locale_list(iso8859_15locales, lang)) |
|
622 localeMapper = QTextCodec::codecForName("ISO 8859-15"); |
|
623 else if (try_locale_list(iso8859_2locales, lang)) |
|
624 localeMapper = QTextCodec::codecForName("ISO 8859-2"); |
|
625 else if (try_locale_list(iso8859_3locales, lang)) |
|
626 localeMapper = QTextCodec::codecForName("ISO 8859-3"); |
|
627 else if (try_locale_list(iso8859_4locales, lang)) |
|
628 localeMapper = QTextCodec::codecForName("ISO 8859-4"); |
|
629 else if (try_locale_list(iso8859_5locales, lang)) |
|
630 localeMapper = QTextCodec::codecForName("ISO 8859-5"); |
|
631 else if (try_locale_list(iso8859_6locales, lang)) |
|
632 localeMapper = QTextCodec::codecForName("ISO 8859-6"); |
|
633 else if (try_locale_list(iso8859_7locales, lang)) |
|
634 localeMapper = QTextCodec::codecForName("ISO 8859-7"); |
|
635 else if (try_locale_list(iso8859_8locales, lang)) |
|
636 localeMapper = QTextCodec::codecForName("ISO 8859-8-I"); |
|
637 else if (try_locale_list(iso8859_9locales, lang)) |
|
638 localeMapper = QTextCodec::codecForName("ISO 8859-9"); |
|
639 else if (try_locale_list(iso8859_13locales, lang)) |
|
640 localeMapper = QTextCodec::codecForName("ISO 8859-13"); |
|
641 else if (try_locale_list(tis_620locales, lang)) |
|
642 localeMapper = QTextCodec::codecForName("ISO 8859-11"); |
|
643 else if (try_locale_list(koi8_ulocales, lang)) |
|
644 localeMapper = QTextCodec::codecForName("KOI8-U"); |
|
645 else if (try_locale_list(cp_1251locales, lang)) |
|
646 localeMapper = QTextCodec::codecForName("CP 1251"); |
|
647 else if (try_locale_list(pt_154locales, lang)) |
|
648 localeMapper = QTextCodec::codecForName("PT 154"); |
|
649 else if (try_locale_list(probably_koi8_rlocales, lang)) |
|
650 localeMapper = ru_RU_hack(lang); |
|
651 } |
|
652 |
|
653 } |
|
654 |
|
655 // If everything failed, we default to 8859-1 |
|
656 // We could perhaps default to 8859-15. |
|
657 if (!localeMapper) |
|
658 localeMapper = QTextCodec::codecForName("ISO 8859-1"); |
|
659 #endif |
|
660 } |
|
661 |
|
662 |
|
663 static void setup() |
|
664 { |
|
665 #ifndef QT_NO_THREAD |
|
666 QMutexLocker locker(QMutexPool::globalInstanceGet(&all)); |
|
667 #endif |
|
668 |
|
669 if (all) |
|
670 return; |
|
671 |
|
672 #ifdef Q_DEBUG_TEXTCODEC |
|
673 if (destroying_is_ok) |
|
674 qWarning("QTextCodec: Creating new codec during codec cleanup"); |
|
675 #endif |
|
676 all = new QList<QTextCodec*>; |
|
677 // create the cleanup object to cleanup all codecs on exit |
|
678 (void) createQTextCodecCleanup(); |
|
679 |
|
680 #ifndef QT_NO_CODECS |
|
681 # if defined(Q_WS_X11) && !defined(QT_BOOTSTRAPPED) |
|
682 // no font codecs when bootstrapping |
|
683 (void)new QFontLaoCodec; |
|
684 # if defined(QT_NO_ICONV) |
|
685 // no iconv(3) support, must build all codecs into the library |
|
686 (void)new QFontGb2312Codec; |
|
687 (void)new QFontGbkCodec; |
|
688 (void)new QFontGb18030_0Codec; |
|
689 (void)new QFontJis0208Codec; |
|
690 (void)new QFontJis0201Codec; |
|
691 (void)new QFontKsc5601Codec; |
|
692 (void)new QFontBig5hkscsCodec; |
|
693 (void)new QFontBig5Codec; |
|
694 # endif // QT_NO_ICONV && !QT_BOOTSTRAPPED |
|
695 # endif // Q_WS_X11 |
|
696 |
|
697 (void)new QTsciiCodec; |
|
698 |
|
699 for (int i = 0; i < 9; ++i) |
|
700 (void)new QIsciiCodec(i); |
|
701 |
|
702 |
|
703 # if defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED) |
|
704 // no asian codecs when bootstrapping, sorry |
|
705 (void)new QGb18030Codec; |
|
706 (void)new QGbkCodec; |
|
707 (void)new QGb2312Codec; |
|
708 (void)new QEucJpCodec; |
|
709 (void)new QJisCodec; |
|
710 (void)new QSjisCodec; |
|
711 (void)new QEucKrCodec; |
|
712 (void)new QCP949Codec; |
|
713 (void)new QBig5Codec; |
|
714 (void)new QBig5hkscsCodec; |
|
715 # endif // QT_NO_ICONV && !QT_BOOTSTRAPPED |
|
716 #endif // QT_NO_CODECS |
|
717 |
|
718 #if defined(Q_OS_WIN32) || defined(Q_OS_WINCE) |
|
719 (void) new QWindowsLocalCodec; |
|
720 #endif // Q_OS_WIN32 |
|
721 |
|
722 (void)new QUtf16Codec; |
|
723 (void)new QUtf16BECodec; |
|
724 (void)new QUtf16LECodec; |
|
725 (void)new QUtf32Codec; |
|
726 (void)new QUtf32BECodec; |
|
727 (void)new QUtf32LECodec; |
|
728 (void)new QLatin15Codec; |
|
729 (void)new QLatin1Codec; |
|
730 (void)new QUtf8Codec; |
|
731 |
|
732 for (int i = 0; i < QSimpleTextCodec::numSimpleCodecs; ++i) |
|
733 (void)new QSimpleTextCodec(i); |
|
734 |
|
735 #if defined(Q_OS_UNIX) && !defined(QT_NO_ICONV) && !defined(QT_BOOTSTRAPPED) |
|
736 // QIconvCodec depends on the UTF-16 codec, so it needs to be created last |
|
737 (void) new QIconvCodec(); |
|
738 #endif |
|
739 |
|
740 if (!localeMapper) |
|
741 setupLocaleMapper(); |
|
742 } |
|
743 |
|
744 /*! |
|
745 \enum QTextCodec::ConversionFlag |
|
746 |
|
747 \value DefaultConversion No flag is set. |
|
748 \value ConvertInvalidToNull If this flag is set, each invalid input |
|
749 character is output as a null character. |
|
750 \value IgnoreHeader Ignore any Unicode byte-order mark and don't generate any. |
|
751 |
|
752 \omitvalue FreeFunction |
|
753 */ |
|
754 |
|
755 /*! |
|
756 \fn QTextCodec::ConverterState::ConverterState(ConversionFlags flags) |
|
757 |
|
758 Constructs a ConverterState object initialized with the given \a flags. |
|
759 */ |
|
760 |
|
761 /*! |
|
762 Destroys the ConverterState object. |
|
763 */ |
|
764 QTextCodec::ConverterState::~ConverterState() |
|
765 { |
|
766 if (flags & FreeFunction) |
|
767 (QTextCodecUnalignedPointer::decode(state_data))(this); |
|
768 else if (d) |
|
769 qFree(d); |
|
770 } |
|
771 |
|
772 /*! |
|
773 \class QTextCodec |
|
774 \brief The QTextCodec class provides conversions between text encodings. |
|
775 \reentrant |
|
776 \ingroup i18n |
|
777 |
|
778 Qt uses Unicode to store, draw and manipulate strings. In many |
|
779 situations you may wish to deal with data that uses a different |
|
780 encoding. For example, most Japanese documents are still stored |
|
781 in Shift-JIS or ISO 2022-JP, while Russian users often have their |
|
782 documents in KOI8-R or Windows-1251. |
|
783 |
|
784 Qt provides a set of QTextCodec classes to help with converting |
|
785 non-Unicode formats to and from Unicode. You can also create your |
|
786 own codec classes. |
|
787 |
|
788 The supported encodings are: |
|
789 |
|
790 \list |
|
791 \o Apple Roman |
|
792 \o \l{Big5 Text Codec}{Big5} |
|
793 \o \l{Big5-HKSCS Text Codec}{Big5-HKSCS} |
|
794 \o CP949 |
|
795 \o \l{EUC-JP Text Codec}{EUC-JP} |
|
796 \o \l{EUC-KR Text Codec}{EUC-KR} |
|
797 \o \l{GBK Text Codec}{GB18030-0} |
|
798 \o IBM 850 |
|
799 \o IBM 866 |
|
800 \o IBM 874 |
|
801 \o \l{ISO 2022-JP (JIS) Text Codec}{ISO 2022-JP} |
|
802 \o ISO 8859-1 to 10 |
|
803 \o ISO 8859-13 to 16 |
|
804 \o Iscii-Bng, Dev, Gjr, Knd, Mlm, Ori, Pnj, Tlg, and Tml |
|
805 \o JIS X 0201 |
|
806 \o JIS X 0208 |
|
807 \o KOI8-R |
|
808 \o KOI8-U |
|
809 \o MuleLao-1 |
|
810 \o ROMAN8 |
|
811 \o \l{Shift-JIS Text Codec}{Shift-JIS} |
|
812 \o TIS-620 |
|
813 \o \l{TSCII Text Codec}{TSCII} |
|
814 \o UTF-8 |
|
815 \o UTF-16 |
|
816 \o UTF-16BE |
|
817 \o UTF-16LE |
|
818 \o UTF-32 |
|
819 \o UTF-32BE |
|
820 \o UTF-32LE |
|
821 \o Windows-1250 to 1258 |
|
822 \o WINSAMI2 |
|
823 \endlist |
|
824 |
|
825 QTextCodecs can be used as follows to convert some locally encoded |
|
826 string to Unicode. Suppose you have some string encoded in Russian |
|
827 KOI8-R encoding, and want to convert it to Unicode. The simple way |
|
828 to do it is like this: |
|
829 |
|
830 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 0 |
|
831 |
|
832 After this, \c string holds the text converted to Unicode. |
|
833 Converting a string from Unicode to the local encoding is just as |
|
834 easy: |
|
835 |
|
836 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 1 |
|
837 |
|
838 To read or write files in various encodings, use QTextStream and |
|
839 its \l{QTextStream::setCodec()}{setCodec()} function. See the |
|
840 \l{tools/codecs}{Codecs} example for an application of QTextCodec |
|
841 to file I/O. |
|
842 |
|
843 Some care must be taken when trying to convert the data in chunks, |
|
844 for example, when receiving it over a network. In such cases it is |
|
845 possible that a multi-byte character will be split over two |
|
846 chunks. At best this might result in the loss of a character and |
|
847 at worst cause the entire conversion to fail. |
|
848 |
|
849 The approach to use in these situations is to create a QTextDecoder |
|
850 object for the codec and use this QTextDecoder for the whole |
|
851 decoding process, as shown below: |
|
852 |
|
853 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 2 |
|
854 |
|
855 The QTextDecoder object maintains state between chunks and therefore |
|
856 works correctly even if a multi-byte character is split between |
|
857 chunks. |
|
858 |
|
859 \section1 Creating Your Own Codec Class |
|
860 |
|
861 Support for new text encodings can be added to Qt by creating |
|
862 QTextCodec subclasses. |
|
863 |
|
864 The pure virtual functions describe the encoder to the system and |
|
865 the coder is used as required in the different text file formats |
|
866 supported by QTextStream, and under X11, for the locale-specific |
|
867 character input and output. |
|
868 |
|
869 To add support for another encoding to Qt, make a subclass of |
|
870 QTextCodec and implement the functions listed in the table below. |
|
871 |
|
872 \table |
|
873 \header \o Function \o Description |
|
874 |
|
875 \row \o name() |
|
876 \o Returns the official name for the encoding. If the |
|
877 encoding is listed in the |
|
878 \l{IANA character-sets encoding file}, the name |
|
879 should be the preferred MIME name for the encoding. |
|
880 |
|
881 \row \o aliases() |
|
882 \o Returns a list of alternative names for the encoding. |
|
883 QTextCodec provides a default implementation that returns |
|
884 an empty list. For example, "ISO-8859-1" has "latin1", |
|
885 "CP819", "IBM819", and "iso-ir-100" as aliases. |
|
886 |
|
887 \row \o mibEnum() |
|
888 \o Return the MIB enum for the encoding if it is listed in |
|
889 the \l{IANA character-sets encoding file}. |
|
890 |
|
891 \row \o convertToUnicode() |
|
892 \o Converts an 8-bit character string to Unicode. |
|
893 |
|
894 \row \o convertFromUnicode() |
|
895 \o Converts a Unicode string to an 8-bit character string. |
|
896 \endtable |
|
897 |
|
898 You may find it more convenient to make your codec class |
|
899 available as a plugin; see \l{How to Create Qt Plugins} for |
|
900 details. |
|
901 |
|
902 \sa QTextStream, QTextDecoder, QTextEncoder, {Codecs Example} |
|
903 */ |
|
904 |
|
905 /*! |
|
906 \nonreentrant |
|
907 |
|
908 Constructs a QTextCodec, and gives it the highest precedence. The |
|
909 QTextCodec should always be constructed on the heap (i.e. with \c |
|
910 new). Qt takes ownership and will delete it when the application |
|
911 terminates. |
|
912 */ |
|
913 QTextCodec::QTextCodec() |
|
914 { |
|
915 setup(); |
|
916 all->prepend(this); |
|
917 } |
|
918 |
|
919 |
|
920 /*! |
|
921 \nonreentrant |
|
922 |
|
923 Destroys the QTextCodec. Note that you should not delete codecs |
|
924 yourself: once created they become Qt's responsibility. |
|
925 */ |
|
926 QTextCodec::~QTextCodec() |
|
927 { |
|
928 #ifdef Q_DEBUG_TEXTCODEC |
|
929 if (!destroying_is_ok) |
|
930 qWarning("QTextCodec::~QTextCodec: Called by application"); |
|
931 #endif |
|
932 if (all) |
|
933 all->removeAll(this); |
|
934 } |
|
935 |
|
936 /*! |
|
937 \fn QTextCodec *QTextCodec::codecForName(const char *name) |
|
938 |
|
939 Searches all installed QTextCodec objects and returns the one |
|
940 which best matches \a name; the match is case-insensitive. Returns |
|
941 0 if no codec matching the name \a name could be found. |
|
942 */ |
|
943 |
|
944 /*! |
|
945 Searches all installed QTextCodec objects and returns the one |
|
946 which best matches \a name; the match is case-insensitive. Returns |
|
947 0 if no codec matching the name \a name could be found. |
|
948 */ |
|
949 QTextCodec *QTextCodec::codecForName(const QByteArray &name) |
|
950 { |
|
951 if (name.isEmpty()) |
|
952 return 0; |
|
953 |
|
954 setup(); |
|
955 |
|
956 for (int i = 0; i < all->size(); ++i) { |
|
957 QTextCodec *cursor = all->at(i); |
|
958 if (nameMatch(cursor->name(), name)) |
|
959 return cursor; |
|
960 QList<QByteArray> aliases = cursor->aliases(); |
|
961 for (int i = 0; i < aliases.size(); ++i) |
|
962 if (nameMatch(aliases.at(i), name)) |
|
963 return cursor; |
|
964 } |
|
965 |
|
966 return createForName(name); |
|
967 } |
|
968 |
|
969 |
|
970 /*! |
|
971 Returns the QTextCodec which matches the \link |
|
972 QTextCodec::mibEnum() MIBenum\endlink \a mib. |
|
973 */ |
|
974 QTextCodec* QTextCodec::codecForMib(int mib) |
|
975 { |
|
976 setup(); |
|
977 |
|
978 // Qt 3 used 1000 (mib for UCS2) as its identifier for the utf16 codec. Map |
|
979 // this correctly for compatibility. |
|
980 if (mib == 1000) |
|
981 mib = 1015; |
|
982 |
|
983 QList<QTextCodec*>::ConstIterator i; |
|
984 for (int i = 0; i < all->size(); ++i) { |
|
985 QTextCodec *cursor = all->at(i); |
|
986 if (cursor->mibEnum() == mib) |
|
987 return cursor; |
|
988 } |
|
989 |
|
990 return createForMib(mib); |
|
991 } |
|
992 |
|
993 /*! |
|
994 Returns the list of all available codecs, by name. Call |
|
995 QTextCodec::codecForName() to obtain the QTextCodec for the name. |
|
996 |
|
997 The list may contain many mentions of the same codec |
|
998 if the codec has aliases. |
|
999 |
|
1000 \sa availableMibs(), name(), aliases() |
|
1001 */ |
|
1002 QList<QByteArray> QTextCodec::availableCodecs() |
|
1003 { |
|
1004 setup(); |
|
1005 |
|
1006 QList<QByteArray> codecs; |
|
1007 for (int i = 0; i < all->size(); ++i) { |
|
1008 codecs += all->at(i)->name(); |
|
1009 codecs += all->at(i)->aliases(); |
|
1010 } |
|
1011 #ifndef QT_NO_TEXTCODECPLUGIN |
|
1012 QFactoryLoader *l = loader(); |
|
1013 QStringList keys = l->keys(); |
|
1014 for (int i = 0; i < keys.size(); ++i) { |
|
1015 if (!keys.at(i).startsWith(QLatin1String("MIB: "))) { |
|
1016 QByteArray name = keys.at(i).toLatin1(); |
|
1017 if (!codecs.contains(name)) |
|
1018 codecs += name; |
|
1019 } |
|
1020 } |
|
1021 #endif |
|
1022 |
|
1023 return codecs; |
|
1024 } |
|
1025 |
|
1026 /*! |
|
1027 Returns the list of MIBs for all available codecs. Call |
|
1028 QTextCodec::codecForMib() to obtain the QTextCodec for the MIB. |
|
1029 |
|
1030 \sa availableCodecs(), mibEnum() |
|
1031 */ |
|
1032 QList<int> QTextCodec::availableMibs() |
|
1033 { |
|
1034 setup(); |
|
1035 |
|
1036 QList<int> codecs; |
|
1037 for (int i = 0; i < all->size(); ++i) |
|
1038 codecs += all->at(i)->mibEnum(); |
|
1039 #ifndef QT_NO_TEXTCODECPLUGIN |
|
1040 QFactoryLoader *l = loader(); |
|
1041 QStringList keys = l->keys(); |
|
1042 for (int i = 0; i < keys.size(); ++i) { |
|
1043 if (keys.at(i).startsWith(QLatin1String("MIB: "))) { |
|
1044 int mib = keys.at(i).mid(5).toInt(); |
|
1045 if (!codecs.contains(mib)) |
|
1046 codecs += mib; |
|
1047 } |
|
1048 } |
|
1049 #endif |
|
1050 |
|
1051 return codecs; |
|
1052 } |
|
1053 |
|
1054 /*! |
|
1055 Set the codec to \a c; this will be returned by |
|
1056 codecForLocale(). If \a c is a null pointer, the codec is reset to |
|
1057 the default. |
|
1058 |
|
1059 This might be needed for some applications that want to use their |
|
1060 own mechanism for setting the locale. |
|
1061 |
|
1062 \sa codecForLocale() |
|
1063 */ |
|
1064 void QTextCodec::setCodecForLocale(QTextCodec *c) |
|
1065 { |
|
1066 localeMapper = c; |
|
1067 if (!localeMapper) |
|
1068 setupLocaleMapper(); |
|
1069 } |
|
1070 |
|
1071 /*! |
|
1072 Returns a pointer to the codec most suitable for this locale. |
|
1073 |
|
1074 On Windows, the codec will be based on a system locale. On Unix |
|
1075 systems, starting with Qt 4.2, the codec will be using the \e |
|
1076 iconv library. Note that in both cases the codec's name will be |
|
1077 "System". |
|
1078 */ |
|
1079 |
|
1080 QTextCodec* QTextCodec::codecForLocale() |
|
1081 { |
|
1082 if (localeMapper) |
|
1083 return localeMapper; |
|
1084 |
|
1085 setup(); |
|
1086 |
|
1087 return localeMapper; |
|
1088 } |
|
1089 |
|
1090 |
|
1091 /*! |
|
1092 \fn QByteArray QTextCodec::name() const |
|
1093 |
|
1094 QTextCodec subclasses must reimplement this function. It returns |
|
1095 the name of the encoding supported by the subclass. |
|
1096 |
|
1097 If the codec is registered as a character set in the |
|
1098 \l{IANA character-sets encoding file} this method should |
|
1099 return the preferred mime name for the codec if defined, |
|
1100 otherwise its name. |
|
1101 */ |
|
1102 |
|
1103 /*! |
|
1104 \fn int QTextCodec::mibEnum() const |
|
1105 |
|
1106 Subclasses of QTextCodec must reimplement this function. It |
|
1107 returns the MIBenum (see \l{IANA character-sets encoding file} |
|
1108 for more information). It is important that each QTextCodec |
|
1109 subclass returns the correct unique value for this function. |
|
1110 */ |
|
1111 |
|
1112 /*! |
|
1113 Subclasses can return a number of aliases for the codec in question. |
|
1114 |
|
1115 Standard aliases for codecs can be found in the |
|
1116 \l{IANA character-sets encoding file}. |
|
1117 */ |
|
1118 QList<QByteArray> QTextCodec::aliases() const |
|
1119 { |
|
1120 return QList<QByteArray>(); |
|
1121 } |
|
1122 |
|
1123 /*! |
|
1124 \fn QString QTextCodec::convertToUnicode(const char *chars, int len, |
|
1125 ConverterState *state) const |
|
1126 |
|
1127 QTextCodec subclasses must reimplement this function. |
|
1128 |
|
1129 Converts the first \a len characters of \a chars from the |
|
1130 encoding of the subclass to Unicode, and returns the result in a |
|
1131 QString. |
|
1132 |
|
1133 \a state can be 0, in which case the conversion is stateless and |
|
1134 default conversion rules should be used. If state is not 0, the |
|
1135 codec should save the state after the conversion in \a state, and |
|
1136 adjust the remainingChars and invalidChars members of the struct. |
|
1137 */ |
|
1138 |
|
1139 /*! |
|
1140 \fn QByteArray QTextCodec::convertFromUnicode(const QChar *input, int number, |
|
1141 ConverterState *state) const |
|
1142 |
|
1143 QTextCodec subclasses must reimplement this function. |
|
1144 |
|
1145 Converts the first \a number of characters from the \a input array |
|
1146 from Unicode to the encoding of the subclass, and returns the result |
|
1147 in a QByteArray. |
|
1148 |
|
1149 \a state can be 0 in which case the conversion is stateless and |
|
1150 default conversion rules should be used. If state is not 0, the |
|
1151 codec should save the state after the conversion in \a state, and |
|
1152 adjust the remainingChars and invalidChars members of the struct. |
|
1153 */ |
|
1154 |
|
1155 /*! |
|
1156 Creates a QTextDecoder which stores enough state to decode chunks |
|
1157 of \c{char *} data to create chunks of Unicode data. |
|
1158 |
|
1159 The caller is responsible for deleting the returned object. |
|
1160 */ |
|
1161 QTextDecoder* QTextCodec::makeDecoder() const |
|
1162 { |
|
1163 return new QTextDecoder(this); |
|
1164 } |
|
1165 |
|
1166 |
|
1167 /*! |
|
1168 Creates a QTextEncoder which stores enough state to encode chunks |
|
1169 of Unicode data as \c{char *} data. |
|
1170 |
|
1171 The caller is responsible for deleting the returned object. |
|
1172 */ |
|
1173 QTextEncoder* QTextCodec::makeEncoder() const |
|
1174 { |
|
1175 return new QTextEncoder(this); |
|
1176 } |
|
1177 |
|
1178 /*! |
|
1179 \fn QByteArray QTextCodec::fromUnicode(const QChar *input, int number, |
|
1180 ConverterState *state) const |
|
1181 |
|
1182 Converts the first \a number of characters from the \a input array |
|
1183 from Unicode to the encoding of this codec, and returns the result |
|
1184 in a QByteArray. |
|
1185 |
|
1186 The \a state of the converter used is updated. |
|
1187 */ |
|
1188 |
|
1189 /*! |
|
1190 Converts \a str from Unicode to the encoding of this codec, and |
|
1191 returns the result in a QByteArray. |
|
1192 */ |
|
1193 QByteArray QTextCodec::fromUnicode(const QString& str) const |
|
1194 { |
|
1195 return convertFromUnicode(str.constData(), str.length(), 0); |
|
1196 } |
|
1197 |
|
1198 /*! |
|
1199 \fn QString QTextCodec::toUnicode(const char *input, int size, |
|
1200 ConverterState *state) const |
|
1201 |
|
1202 Converts the first \a size characters from the \a input from the |
|
1203 encoding of this codec to Unicode, and returns the result in a |
|
1204 QString. |
|
1205 |
|
1206 The \a state of the converter used is updated. |
|
1207 */ |
|
1208 |
|
1209 /*! |
|
1210 Converts \a a from the encoding of this codec to Unicode, and |
|
1211 returns the result in a QString. |
|
1212 */ |
|
1213 QString QTextCodec::toUnicode(const QByteArray& a) const |
|
1214 { |
|
1215 return convertToUnicode(a.constData(), a.length(), 0); |
|
1216 } |
|
1217 |
|
1218 /*! |
|
1219 Returns true if the Unicode character \a ch can be fully encoded |
|
1220 with this codec; otherwise returns false. |
|
1221 */ |
|
1222 bool QTextCodec::canEncode(QChar ch) const |
|
1223 { |
|
1224 ConverterState state; |
|
1225 state.flags = ConvertInvalidToNull; |
|
1226 convertFromUnicode(&ch, 1, &state); |
|
1227 return (state.invalidChars == 0); |
|
1228 } |
|
1229 |
|
1230 /*! |
|
1231 \overload |
|
1232 |
|
1233 \a s contains the string being tested for encode-ability. |
|
1234 */ |
|
1235 bool QTextCodec::canEncode(const QString& s) const |
|
1236 { |
|
1237 ConverterState state; |
|
1238 state.flags = ConvertInvalidToNull; |
|
1239 convertFromUnicode(s.constData(), s.length(), &state); |
|
1240 return (state.invalidChars == 0); |
|
1241 } |
|
1242 |
|
1243 #ifdef QT3_SUPPORT |
|
1244 /*! |
|
1245 Returns a string representing the current language and |
|
1246 sublanguage, e.g. "pt" for Portuguese, or "pt_br" for Portuguese/Brazil. |
|
1247 |
|
1248 \sa QLocale |
|
1249 */ |
|
1250 const char *QTextCodec::locale() |
|
1251 { |
|
1252 static char locale[6]; |
|
1253 QByteArray l = QLocale::system().name().toLatin1(); |
|
1254 int len = qMin(l.length(), 5); |
|
1255 memcpy(locale, l.constData(), len); |
|
1256 locale[len] = '\0'; |
|
1257 |
|
1258 return locale; |
|
1259 } |
|
1260 |
|
1261 /*! |
|
1262 \overload |
|
1263 */ |
|
1264 |
|
1265 QByteArray QTextCodec::fromUnicode(const QString& uc, int& lenInOut) const |
|
1266 { |
|
1267 QByteArray result = convertFromUnicode(uc.constData(), lenInOut, 0); |
|
1268 lenInOut = result.length(); |
|
1269 return result; |
|
1270 } |
|
1271 |
|
1272 /*! |
|
1273 \overload |
|
1274 |
|
1275 \a a contains the source characters; \a len contains the number of |
|
1276 characters in \a a to use. |
|
1277 */ |
|
1278 QString QTextCodec::toUnicode(const QByteArray& a, int len) const |
|
1279 { |
|
1280 len = qMin(a.size(), len); |
|
1281 return convertToUnicode(a.constData(), len, 0); |
|
1282 } |
|
1283 #endif |
|
1284 |
|
1285 /*! |
|
1286 \overload |
|
1287 |
|
1288 \a chars contains the source characters. |
|
1289 */ |
|
1290 QString QTextCodec::toUnicode(const char *chars) const |
|
1291 { |
|
1292 int len = qstrlen(chars); |
|
1293 return convertToUnicode(chars, len, 0); |
|
1294 } |
|
1295 |
|
1296 |
|
1297 /*! |
|
1298 \class QTextEncoder |
|
1299 \brief The QTextEncoder class provides a state-based encoder. |
|
1300 \reentrant |
|
1301 \ingroup i18n |
|
1302 |
|
1303 A text encoder converts text from Unicode into an encoded text format |
|
1304 using a specific codec. |
|
1305 |
|
1306 The encoder converts Unicode into another format, remembering any |
|
1307 state that is required between calls. |
|
1308 |
|
1309 \sa QTextCodec::makeEncoder(), QTextDecoder |
|
1310 */ |
|
1311 |
|
1312 /*! |
|
1313 \fn QTextEncoder::QTextEncoder(const QTextCodec *codec) |
|
1314 |
|
1315 Constructs a text encoder for the given \a codec. |
|
1316 */ |
|
1317 |
|
1318 /*! |
|
1319 Destroys the encoder. |
|
1320 */ |
|
1321 QTextEncoder::~QTextEncoder() |
|
1322 { |
|
1323 } |
|
1324 |
|
1325 /*! \internal |
|
1326 \since 4.5 |
|
1327 Determines whether the encoder encountered a failure while encoding the input. If |
|
1328 an error was encountered, the produced result is undefined, and gets converted according |
|
1329 to the conversion flags. |
|
1330 */ |
|
1331 bool QTextEncoder::hasFailure() const |
|
1332 { |
|
1333 return state.invalidChars != 0; |
|
1334 } |
|
1335 |
|
1336 /*! |
|
1337 Converts the Unicode string \a str into an encoded QByteArray. |
|
1338 */ |
|
1339 QByteArray QTextEncoder::fromUnicode(const QString& str) |
|
1340 { |
|
1341 QByteArray result = c->fromUnicode(str.constData(), str.length(), &state); |
|
1342 return result; |
|
1343 } |
|
1344 |
|
1345 /*! |
|
1346 \overload |
|
1347 |
|
1348 Converts \a len characters (not bytes) from \a uc, and returns the |
|
1349 result in a QByteArray. |
|
1350 */ |
|
1351 QByteArray QTextEncoder::fromUnicode(const QChar *uc, int len) |
|
1352 { |
|
1353 QByteArray result = c->fromUnicode(uc, len, &state); |
|
1354 return result; |
|
1355 } |
|
1356 |
|
1357 #ifdef QT3_SUPPORT |
|
1358 /*! |
|
1359 \overload |
|
1360 |
|
1361 Converts \a lenInOut characters (not bytes) from \a uc, and returns the |
|
1362 result in a QByteArray. On return, \a lenInOut holds the length of the |
|
1363 resulting byte array. |
|
1364 */ |
|
1365 QByteArray QTextEncoder::fromUnicode(const QString& uc, int& lenInOut) |
|
1366 { |
|
1367 QByteArray result = c->fromUnicode(uc.constData(), lenInOut, &state); |
|
1368 lenInOut = result.length(); |
|
1369 return result; |
|
1370 } |
|
1371 #endif |
|
1372 |
|
1373 /*! |
|
1374 \class QTextDecoder |
|
1375 \brief The QTextDecoder class provides a state-based decoder. |
|
1376 \reentrant |
|
1377 \ingroup i18n |
|
1378 |
|
1379 A text decoder converts text from an encoded text format into Unicode |
|
1380 using a specific codec. |
|
1381 |
|
1382 The decoder converts text in this format into Unicode, remembering any |
|
1383 state that is required between calls. |
|
1384 |
|
1385 \sa QTextCodec::makeDecoder(), QTextEncoder |
|
1386 */ |
|
1387 |
|
1388 /*! |
|
1389 \fn QTextDecoder::QTextDecoder(const QTextCodec *codec) |
|
1390 |
|
1391 Constructs a text decoder for the given \a codec. |
|
1392 */ |
|
1393 |
|
1394 /*! |
|
1395 Destroys the decoder. |
|
1396 */ |
|
1397 QTextDecoder::~QTextDecoder() |
|
1398 { |
|
1399 } |
|
1400 |
|
1401 /*! |
|
1402 \fn QString QTextDecoder::toUnicode(const char *chars, int len) |
|
1403 |
|
1404 Converts the first \a len bytes in \a chars to Unicode, returning |
|
1405 the result. |
|
1406 |
|
1407 If not all characters are used (e.g. if only part of a multi-byte |
|
1408 encoding is at the end of the characters), the decoder remembers |
|
1409 enough state to continue with the next call to this function. |
|
1410 */ |
|
1411 QString QTextDecoder::toUnicode(const char *chars, int len) |
|
1412 { |
|
1413 return c->toUnicode(chars, len, &state); |
|
1414 } |
|
1415 |
|
1416 |
|
1417 /*! \overload |
|
1418 |
|
1419 The converted string is returned in \a target. |
|
1420 */ |
|
1421 void QTextDecoder::toUnicode(QString *target, const char *chars, int len) |
|
1422 { |
|
1423 Q_ASSERT(target); |
|
1424 switch (c->mibEnum()) { |
|
1425 case 106: // utf8 |
|
1426 static_cast<const QUtf8Codec*>(c)->convertToUnicode(target, chars, len, &state); |
|
1427 break; |
|
1428 case 4: { // latin1 |
|
1429 target->resize(len); |
|
1430 ushort *data = (ushort*)target->data(); |
|
1431 for (int i = len; i >=0; --i) |
|
1432 data[i] = (uchar) chars[i]; |
|
1433 } break; |
|
1434 default: |
|
1435 *target = c->toUnicode(chars, len, &state); |
|
1436 } |
|
1437 } |
|
1438 |
|
1439 |
|
1440 /*! |
|
1441 \overload |
|
1442 |
|
1443 Converts the bytes in the byte array specified by \a ba to Unicode |
|
1444 and returns the result. |
|
1445 */ |
|
1446 QString QTextDecoder::toUnicode(const QByteArray &ba) |
|
1447 { |
|
1448 return c->toUnicode(ba.constData(), ba.length(), &state); |
|
1449 } |
|
1450 |
|
1451 |
|
1452 /*! |
|
1453 \fn QTextCodec* QTextCodec::codecForTr() |
|
1454 |
|
1455 Returns the codec used by QObject::tr() on its argument. If this |
|
1456 function returns 0 (the default), tr() assumes Latin-1. |
|
1457 |
|
1458 \sa setCodecForTr() |
|
1459 */ |
|
1460 |
|
1461 /*! |
|
1462 \fn void QTextCodec::setCodecForTr(QTextCodec *c) |
|
1463 \nonreentrant |
|
1464 |
|
1465 Sets the codec used by QObject::tr() on its argument to \a c. If |
|
1466 \a c is 0 (the default), tr() assumes Latin-1. |
|
1467 |
|
1468 If the literal quoted text in the program is not in the Latin-1 |
|
1469 encoding, this function can be used to set the appropriate |
|
1470 encoding. For example, software developed by Korean programmers |
|
1471 might use eucKR for all the text in the program, in which case the |
|
1472 main() function might look like this: |
|
1473 |
|
1474 \snippet doc/src/snippets/code/src_corelib_codecs_qtextcodec.cpp 3 |
|
1475 |
|
1476 Note that this is not the way to select the encoding that the \e |
|
1477 user has chosen. For example, to convert an application containing |
|
1478 literal English strings to Korean, all that is needed is for the |
|
1479 English strings to be passed through tr() and for translation |
|
1480 files to be loaded. For details of internationalization, see |
|
1481 \l{Internationalization with Qt}. |
|
1482 |
|
1483 \sa codecForTr(), setCodecForCStrings() |
|
1484 */ |
|
1485 |
|
1486 |
|
1487 /*! |
|
1488 \fn QTextCodec* QTextCodec::codecForCStrings() |
|
1489 |
|
1490 Returns the codec used by QString to convert to and from \c{const |
|
1491 char *} and QByteArrays. If this function returns 0 (the default), |
|
1492 QString assumes Latin-1. |
|
1493 |
|
1494 \sa setCodecForCStrings() |
|
1495 */ |
|
1496 |
|
1497 /*! |
|
1498 \fn void QTextCodec::setCodecForCStrings(QTextCodec *codec) |
|
1499 \nonreentrant |
|
1500 |
|
1501 Sets the codec used by QString to convert to and from \c{const |
|
1502 char *} and QByteArrays. If the \a codec is 0 (the default), |
|
1503 QString assumes Latin-1. |
|
1504 |
|
1505 \warning Some codecs do not preserve the characters in the ASCII |
|
1506 range (0x00 to 0x7F). For example, the Japanese Shift-JIS |
|
1507 encoding maps the backslash character (0x5C) to the Yen |
|
1508 character. To avoid undesirable side-effects, we recommend |
|
1509 avoiding such codecs with setCodecForCStrings(). |
|
1510 |
|
1511 \sa codecForCStrings(), setCodecForTr() |
|
1512 */ |
|
1513 |
|
1514 /*! |
|
1515 \since 4.4 |
|
1516 |
|
1517 Tries to detect the encoding of the provided snippet of HTML in |
|
1518 the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
|
1519 and the content-type meta header and returns a QTextCodec instance |
|
1520 that is capable of decoding the html to unicode. If the codec |
|
1521 cannot be detected from the content provided, \a defaultCodec is |
|
1522 returned. |
|
1523 |
|
1524 \sa codecForUtfText() |
|
1525 */ |
|
1526 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba, QTextCodec *defaultCodec) |
|
1527 { |
|
1528 // determine charset |
|
1529 int pos; |
|
1530 QTextCodec *c = 0; |
|
1531 |
|
1532 c = QTextCodec::codecForUtfText(ba, c); |
|
1533 if (!c) { |
|
1534 QByteArray header = ba.left(512).toLower(); |
|
1535 if ((pos = header.indexOf("http-equiv=")) != -1) { |
|
1536 if ((pos = header.lastIndexOf("meta ", pos)) != -1) { |
|
1537 pos = header.indexOf("charset=", pos) + int(strlen("charset=")); |
|
1538 if (pos != -1) { |
|
1539 int pos2 = header.indexOf('\"', pos+1); |
|
1540 QByteArray cs = header.mid(pos, pos2-pos); |
|
1541 // qDebug("found charset: %s", cs.data()); |
|
1542 c = QTextCodec::codecForName(cs); |
|
1543 } |
|
1544 } |
|
1545 } |
|
1546 } |
|
1547 if (!c) |
|
1548 c = defaultCodec; |
|
1549 |
|
1550 return c; |
|
1551 } |
|
1552 |
|
1553 /*! |
|
1554 \overload |
|
1555 |
|
1556 Tries to detect the encoding of the provided snippet of HTML in |
|
1557 the given byte array, \a ba, by checking the BOM (Byte Order Mark) |
|
1558 and the content-type meta header and returns a QTextCodec instance |
|
1559 that is capable of decoding the html to unicode. If the codec cannot |
|
1560 be detected, this overload returns a Latin-1 QTextCodec. |
|
1561 */ |
|
1562 QTextCodec *QTextCodec::codecForHtml(const QByteArray &ba) |
|
1563 { |
|
1564 return codecForHtml(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); |
|
1565 } |
|
1566 |
|
1567 /*! |
|
1568 \since 4.6 |
|
1569 |
|
1570 Tries to detect the encoding of the provided snippet \a ba by |
|
1571 using the BOM (Byte Order Mark) and returns a QTextCodec instance |
|
1572 that is capable of decoding the text to unicode. If the codec |
|
1573 cannot be detected from the content provided, \a defaultCodec is |
|
1574 returned. |
|
1575 |
|
1576 The behavior of this function is undefined if \a ba is not |
|
1577 encoded in unicode. |
|
1578 |
|
1579 \sa codecForHtml() |
|
1580 */ |
|
1581 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba, QTextCodec *defaultCodec) |
|
1582 { |
|
1583 const int arraySize = ba.size(); |
|
1584 |
|
1585 if (arraySize > 3) { |
|
1586 if ((uchar)ba[0] == 0x00 |
|
1587 && (uchar)ba[1] == 0x00 |
|
1588 && (uchar)ba[2] == 0xFE |
|
1589 && (uchar)ba[3] == 0xFF) |
|
1590 return QTextCodec::codecForMib(1018); // utf-32 be |
|
1591 else if ((uchar)ba[0] == 0xFF |
|
1592 && (uchar)ba[1] == 0xFE |
|
1593 && (uchar)ba[2] == 0x00 |
|
1594 && (uchar)ba[3] == 0x00) |
|
1595 return QTextCodec::codecForMib(1019); // utf-32 le |
|
1596 } |
|
1597 |
|
1598 if (arraySize < 2) |
|
1599 return defaultCodec; |
|
1600 if ((uchar)ba[0] == 0xfe && (uchar)ba[1] == 0xff) |
|
1601 return QTextCodec::codecForMib(1013); // utf16 be |
|
1602 else if ((uchar)ba[0] == 0xff && (uchar)ba[1] == 0xfe) |
|
1603 return QTextCodec::codecForMib(1014); // utf16 le |
|
1604 |
|
1605 if (arraySize < 3) |
|
1606 return defaultCodec; |
|
1607 if ((uchar)ba[0] == 0xef |
|
1608 && (uchar)ba[1] == 0xbb |
|
1609 && (uchar)ba[2] == 0xbf) |
|
1610 return QTextCodec::codecForMib(106); // utf-8 |
|
1611 |
|
1612 return defaultCodec; |
|
1613 } |
|
1614 |
|
1615 /*! |
|
1616 \overload |
|
1617 |
|
1618 Tries to detect the encoding of the provided snippet \a ba by |
|
1619 using the BOM (Byte Order Mark) and returns a QTextCodec instance |
|
1620 that is capable of decoding the text to unicode. If the codec |
|
1621 cannot be detected, this overload returns a Latin-1 QTextCodec. |
|
1622 |
|
1623 The behavior of this function is undefined if \a ba is not |
|
1624 encoded in unicode. |
|
1625 |
|
1626 \sa codecForHtml() |
|
1627 */ |
|
1628 QTextCodec *QTextCodec::codecForUtfText(const QByteArray &ba) |
|
1629 { |
|
1630 return codecForUtfText(ba, QTextCodec::codecForMib(/*Latin 1*/ 4)); |
|
1631 } |
|
1632 |
|
1633 |
|
1634 /*! \internal |
|
1635 \since 4.3 |
|
1636 Determines whether the decoder encountered a failure while decoding the input. If |
|
1637 an error was encountered, the produced result is undefined, and gets converted according |
|
1638 to the conversion flags. |
|
1639 */ |
|
1640 bool QTextDecoder::hasFailure() const |
|
1641 { |
|
1642 return state.invalidChars != 0; |
|
1643 } |
|
1644 |
|
1645 /*! |
|
1646 \fn QTextCodec *QTextCodec::codecForContent(const char *str, int size) |
|
1647 |
|
1648 This functionality is no longer provided by Qt. This |
|
1649 compatibility function always returns a null pointer. |
|
1650 */ |
|
1651 |
|
1652 /*! |
|
1653 \fn QTextCodec *QTextCodec::codecForName(const char *hint, int accuracy) |
|
1654 |
|
1655 Use the codecForName(const QByteArray &) overload instead. |
|
1656 */ |
|
1657 |
|
1658 /*! |
|
1659 \fn QTextCodec *QTextCodec::codecForIndex(int i) |
|
1660 |
|
1661 Use availableCodecs() or availableMibs() instead and iterate |
|
1662 through the resulting list. |
|
1663 */ |
|
1664 |
|
1665 |
|
1666 /*! |
|
1667 \fn QByteArray QTextCodec::mimeName() const |
|
1668 |
|
1669 Use name() instead. |
|
1670 */ |
|
1671 |
|
1672 QT_END_NAMESPACE |
|
1673 |
|
1674 #endif // QT_NO_TEXTCODEC |