src/plugins/codecs/cn/qgb18030codec.cpp
changeset 30 5dc02b23752f
parent 18 2f34d5167611
child 33 3e2da88830cd
equal deleted inserted replaced
29:b72c6db6890b 30:5dc02b23752f
    // Second byte of a four-byte sequence: tested against 0x30..0x39.
    // (InRange is defined outside this excerpt; presumably an inclusive bounds check -- confirm.)
    58 #define Is2ndByteIn4Bytes(c)        (InRange((c), 0x30, 0x39))
    58 #define Is2ndByteIn4Bytes(c)        (InRange((c), 0x30, 0x39))
    // Any acceptable second byte: one that ends a two-byte code or one that continues a four-byte code.
    59 #define Is2ndByte(c)        (Is2ndByteIn2Bytes(c) || Is2ndByteIn4Bytes(c))
    59 #define Is2ndByte(c)        (Is2ndByteIn2Bytes(c) || Is2ndByteIn4Bytes(c))
    // Third byte of a four-byte sequence: tested against 0x81..0xFE.
    60 #define Is3rdByte(c)        (InRange((c), 0x81, 0xFE))
    60 #define Is3rdByte(c)        (InRange((c), 0x81, 0xFE))
    // Fourth (final) byte of a four-byte sequence: tested against 0x30..0x39.
    61 #define Is4thByte(c)        (InRange((c), 0x30, 0x39))
    61 #define Is4thByte(c)        (InRange((c), 0x30, 0x39))
    62 
    62 
    // Replaces a zero code unit with QChar::ReplacementCharacter and passes any other value through.
    // QValidChar wraps the result in a QChar (after a ushort cast); qValidChar keeps it numeric for
    // direct stores into a ushort buffer. Both expand (u) twice, so the argument must be free of
    // side effects. qValidChar does not narrow by itself: with a uint argument the expression
    // stays uint and any truncation happens at the caller's assignment.
    63 #define QValidChar(u)        ((u) ? QChar((ushort)(u)) : QChar(QChar::ReplacementCharacter))
    63 #define qValidChar(u)        ((u) ? (u) : static_cast<ushort>(QChar::ReplacementCharacter))
    64 
    64 
    65 /* User-defined areas:        UDA 1: 0xAAA1 - 0xAFFE (564/0)
    65 /* User-defined areas:        UDA 1: 0xAAA1 - 0xAFFE (564/0)
    66                         UDA 2: 0xF8A1 - 0xFEFE (658/0)
    66                         UDA 2: 0xF8A1 - 0xFEFE (658/0)
    67                         UDA 3: 0xA140 - 0xA7A0 (672/0) */
    67                         UDA 3: 0xA140 - 0xA7A0 (672/0) */
    // True when the byte pair (a, b) lies in user-defined area 1: first byte 0xAA..0xAF and
    // second byte 0xA1..0xFE, i.e. the 0xAAA1 - 0xAFFE block listed in the comment above.
    68 #define IsUDA1(a, b)        (InRange((a), 0xAA, 0xAF) && InRange((b), 0xA1, 0xFE))
    68 #define IsUDA1(a, b)        (InRange((a), 0xAA, 0xAF) && InRange((b), 0xA1, 0xFE))
   158 
   158 
   159 QString QGb18030Codec::convertToUnicode(const char* chars, int len, ConverterState *state) const
   159 QString QGb18030Codec::convertToUnicode(const char* chars, int len, ConverterState *state) const
   160 {
   160 {
   161     uchar buf[4];
   161     uchar buf[4];
   162     int nbuf = 0;
   162     int nbuf = 0;
   163     QChar replacement = QChar::ReplacementCharacter;
   163     ushort replacement = QChar::ReplacementCharacter;
   164     if (state) {
   164     if (state) {
   165         if (state->flags & ConvertInvalidToNull)
   165         if (state->flags & ConvertInvalidToNull)
   166             replacement = QChar::Null;
   166             replacement = QChar::Null;
   167         nbuf = state->remainingChars;
   167         nbuf = state->remainingChars;
   168         buf[0] = (state->state_data[0] >> 24) & 0xff;
   168         buf[0] = (state->state_data[0] >> 24) & 0xff;
   171         buf[3] = (state->state_data[0] >>  0) & 0xff;
   171         buf[3] = (state->state_data[0] >>  0) & 0xff;
   172     }
   172     }
   173     int invalid = 0;
   173     int invalid = 0;
   174 
   174 
   175     QString result;
   175     QString result;
       
   176     result.resize(len);
       
   177     int unicodeLen = 0;
       
   178     ushort *const resultData = reinterpret_cast<ushort*>(result.data());
   176     //qDebug("QGb18030Decoder::toUnicode(const char* chars, int len = %d)", len);
   179     //qDebug("QGb18030Decoder::toUnicode(const char* chars, int len = %d)", len);
   177     for (int i = 0; i < len; i++) {
   180     for (int i = 0; i < len; i++) {
   178         uchar ch = chars[i];
   181         uchar ch = chars[i];
   179         switch (nbuf) {
   182         switch (nbuf) {
   180         case 0:
   183         case 0:
   181             if (ch < 0x80) {
   184             if (ch < 0x80) {
   182                 // ASCII
   185                 // ASCII
   183                 result += QLatin1Char(ch);
   186                 resultData[unicodeLen] = ch;
       
   187                 ++unicodeLen;
   184             } else if (Is1stByte(ch)) {
   188             } else if (Is1stByte(ch)) {
   185                 // GB18030?
   189                 // GB18030?
   186                 buf[0] = ch;
   190                 buf[0] = ch;
   187                 nbuf = 1;
   191                 nbuf = 1;
   188             } else {
   192             } else {
   189                 // Invalid
   193                 // Invalid
   190                 result += replacement;
   194                 resultData[unicodeLen] = replacement;
       
   195                 ++unicodeLen;
   191                 ++invalid;
   196                 ++invalid;
   192             }
   197             }
   193             break;
   198             break;
   194         case 1:
   199         case 1:
   195             // GB18030 2 bytes
   200             // GB18030 2 bytes
   196             if (Is2ndByteIn2Bytes(ch)) {
   201             if (Is2ndByteIn2Bytes(ch)) {
   197                 buf[1] = ch;
   202                 buf[1] = ch;
   198                 int clen = 2;
   203                 int clen = 2;
   199                 uint u = qt_Gb18030ToUnicode(buf, clen);
   204                 uint u = qt_Gb18030ToUnicode(buf, clen);
   200                 if (clen == 2) {
   205                 if (clen == 2) {
   201                     result += QValidChar(u);
   206                     resultData[unicodeLen] = qValidChar(static_cast<ushort>(u));
       
   207                     ++unicodeLen;
   202                 } else {
   208                 } else {
   203                     result += replacement;
   209                     resultData[unicodeLen] = replacement;
       
   210                     ++unicodeLen;
   204                     ++invalid;
   211                     ++invalid;
   205                 }
   212                 }
   206                 nbuf = 0;
   213                 nbuf = 0;
   207             } else if (Is2ndByteIn4Bytes(ch)) {
   214             } else if (Is2ndByteIn4Bytes(ch)) {
   208                 buf[1] = ch;
   215                 buf[1] = ch;
   209                 nbuf = 2;
   216                 nbuf = 2;
   210             } else {
   217             } else {
   211                 // Error
   218                 // Error
   212                 result += replacement;
   219                 resultData[unicodeLen] = replacement;
       
   220                 ++unicodeLen;
   213                 ++invalid;
   221                 ++invalid;
   214                 nbuf = 0;
   222                 nbuf = 0;
   215             }
   223             }
   216             break;
   224             break;
   217         case 2:
   225         case 2:
   218             // GB18030 3 bytes
   226             // GB18030 3 bytes
   219             if (Is3rdByte(ch)) {
   227             if (Is3rdByte(ch)) {
   220                 buf[2] = ch;
   228                 buf[2] = ch;
   221                 nbuf = 3;
   229                 nbuf = 3;
   222             } else {
   230             } else {
   223                 result += replacement;
   231                 resultData[unicodeLen] = replacement;
       
   232                 ++unicodeLen;
   224                 ++invalid;
   233                 ++invalid;
   225                 nbuf = 0;
   234                 nbuf = 0;
   226             }
   235             }
   227             break;
   236             break;
   228         case 3:
   237         case 3:
   230             if (Is4thByte(ch)) {
   239             if (Is4thByte(ch)) {
   231                 buf[3] = ch;
   240                 buf[3] = ch;
   232                 int clen = 4;
   241                 int clen = 4;
   233                 uint u = qt_Gb18030ToUnicode(buf, clen);
   242                 uint u = qt_Gb18030ToUnicode(buf, clen);
   234                 if (clen == 4) {
   243                 if (clen == 4) {
   235                     result += QValidChar(u);
   244                     resultData[unicodeLen] = qValidChar(u);
       
   245                     ++unicodeLen;
   236                 } else {
   246                 } else {
   237                     result += replacement;
   247                     resultData[unicodeLen] = replacement;
       
   248                     ++unicodeLen;
   238                     ++invalid;
   249                     ++invalid;
   239                 }
   250                 }
   240             } else {
   251             } else {
   241                 result += replacement;
   252                 resultData[unicodeLen] = replacement;
       
   253                 ++unicodeLen;
   242                 ++invalid;
   254                 ++invalid;
   243             }
   255             }
   244             nbuf = 0;
   256             nbuf = 0;
   245             break;
   257             break;
   246         }
   258         }
   247     }
   259     }
       
   260     result.resize(unicodeLen);
       
   261 
   248     if (state) {
   262     if (state) {
   249         state->remainingChars = nbuf;
   263         state->remainingChars = nbuf;
   250         state->state_data[0] = (buf[0] << 24) + (buf[1] << 16) + (buf[2] << 8) + buf[3];
   264         state->state_data[0] = (buf[0] << 24) + (buf[1] << 16) + (buf[2] << 8) + buf[3];
   251         state->invalidChars += invalid;
   265         state->invalidChars += invalid;
   252     }
   266     }
   340             if (Is2ndByteIn2Bytes(ch)) {
   354             if (Is2ndByteIn2Bytes(ch)) {
   341                 buf[1] = ch;
   355                 buf[1] = ch;
   342                 int clen = 2;
   356                 int clen = 2;
   343                 uint u = qt_Gb18030ToUnicode(buf, clen);
   357                 uint u = qt_Gb18030ToUnicode(buf, clen);
   344                 if (clen == 2) {
   358                 if (clen == 2) {
   345                     result += QValidChar(u);
   359                     result += qValidChar(u);
   346                 } else {
   360                 } else {
   347                     result += replacement;
   361                     result += replacement;
   348                     ++invalid;
   362                     ++invalid;
   349                 }
   363                 }
   350                 nbuf = 0;
   364                 nbuf = 0;
   443 
   457 
   444 QString QGb2312Codec::convertToUnicode(const char* chars, int len, ConverterState *state) const
   458 QString QGb2312Codec::convertToUnicode(const char* chars, int len, ConverterState *state) const
   445 {
   459 {
   446     uchar buf[2];
   460     uchar buf[2];
   447     int nbuf = 0;
   461     int nbuf = 0;
   448     QChar replacement = QChar::ReplacementCharacter;
   462     ushort replacement = QChar::ReplacementCharacter;
   449     if (state) {
   463     if (state) {
   450         if (state->flags & ConvertInvalidToNull)
   464         if (state->flags & ConvertInvalidToNull)
   451             replacement = QChar::Null;
   465             replacement = QChar::Null;
   452         nbuf = state->remainingChars;
   466         nbuf = state->remainingChars;
   453         buf[0] = state->state_data[0];
   467         buf[0] = state->state_data[0];
   454         buf[1] = state->state_data[1];
   468         buf[1] = state->state_data[1];
   455     }
   469     }
   456     int invalid = 0;
   470     int invalid = 0;
   457 
   471 
   458     QString result;
   472     QString result;
       
   473     result.resize(len);
       
   474     int unicodeLen = 0;
       
   475     ushort *const resultData = reinterpret_cast<ushort*>(result.data());
   459     //qDebug("QGb2312Decoder::toUnicode(const char* chars, int len = %d)", len);
   476     //qDebug("QGb2312Decoder::toUnicode(const char* chars, int len = %d)", len);
   460     for (int i=0; i<len; i++) {
   477     for (int i=0; i<len; i++) {
   461         uchar ch = chars[i];
   478         uchar ch = chars[i];
   462         switch (nbuf) {
   479         switch (nbuf) {
   463         case 0:
   480         case 0:
   464             if (ch < 0x80) {
   481             if (ch < 0x80) {
   465                 // ASCII
   482                 // ASCII
   466                 result += QLatin1Char(ch);
   483                 resultData[unicodeLen] = ch;
       
   484                 ++unicodeLen;
   467             } else if (IsByteInGb2312(ch)) {
   485             } else if (IsByteInGb2312(ch)) {
   468                 // GB2312 1st byte?
   486                 // GB2312 1st byte?
   469                 buf[0] = ch;
   487                 buf[0] = ch;
   470                 nbuf = 1;
   488                 nbuf = 1;
   471             } else {
   489             } else {
   472                 // Invalid
   490                 // Invalid
   473                 result += replacement;
   491                 resultData[unicodeLen] = replacement;
       
   492                 ++unicodeLen;
   474                 ++invalid;
   493                 ++invalid;
   475             }
   494             }
   476             break;
   495             break;
   477         case 1:
   496         case 1:
   478             // GB2312 2nd byte
   497             // GB2312 2nd byte
   479             if (IsByteInGb2312(ch)) {
   498             if (IsByteInGb2312(ch)) {
   480                 buf[1] = ch;
   499                 buf[1] = ch;
   481                 int clen = 2;
   500                 int clen = 2;
   482                 uint u = qt_Gb18030ToUnicode(buf, clen);
   501                 uint u = qt_Gb18030ToUnicode(buf, clen);
   483                 if (clen == 2) {
   502                 if (clen == 2) {
   484                     result += QValidChar(u);
   503                     resultData[unicodeLen] = qValidChar(static_cast<ushort>(u));
       
   504                     ++unicodeLen;
   485                 } else {
   505                 } else {
   486                     result += replacement;
   506                     resultData[unicodeLen] = replacement;
       
   507                     ++unicodeLen;
   487                     ++invalid;
   508                     ++invalid;
   488                 }
   509                 }
   489                 nbuf = 0;
   510                 nbuf = 0;
   490             } else {
   511             } else {
   491                 // Error
   512                 // Error
   492                 result += replacement;
   513                 resultData[unicodeLen] = replacement;
       
   514                 ++unicodeLen;
   493                 ++invalid;
   515                 ++invalid;
   494                 nbuf = 0;
   516                 nbuf = 0;
   495             }
   517             }
   496             break;
   518             break;
   497         }
   519         }
   498     }
   520     }
       
   521     result.resize(unicodeLen);
   499 
   522 
   500     if (state) {
   523     if (state) {
   501         state->remainingChars = nbuf;
   524         state->remainingChars = nbuf;
   502         state->state_data[0] = buf[0];
   525         state->state_data[0] = buf[0];
   503         state->state_data[1] = buf[1];
   526         state->state_data[1] = buf[1];