charconvfw/Charconv/ongoing/Source/tool/UTF.CPP
changeset 32 8b9155204a54
equal deleted inserted replaced
31:b9ad20498fb4 32:8b9155204a54
       
     1 /*
       
     2 * Copyright (c) 1997-1999 Nokia Corporation and/or its subsidiary(-ies). 
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of the License "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description:      
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 
       
    20 
       
    21 
       
    22 
       
    23 
       
    24 
       
    25 #include <stdlib.h>
       
    26 
       
    27 const int KErrorIllFormedInput=-1;
       
    28 
       
    29 int Utf8ToUnicode(wchar_t* aUnicode, const char* aUtf8)
       
    30 // must '\0'-terminate the output
       
    31 	{
       
    32 	wchar_t* startOfUnicode=aUnicode;
       
    33 	for (;;)
       
    34 		{
       
    35 		unsigned int currentUtf8Byte=*aUtf8;
       
    36 		if (currentUtf8Byte=='\0')
       
    37 			{
       
    38 			break;
       
    39 			}
       
    40 		if ((currentUtf8Byte&0x80)==0x00)
       
    41 			{
       
    42 			if (startOfUnicode!=NULL)
       
    43 				{
       
    44 				*aUnicode=(wchar_t)currentUtf8Byte;
       
    45 				}
       
    46 			}
       
    47 		else if ((currentUtf8Byte&0xe0)==0xc0)
       
    48 			{
       
    49 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x1f)<<6);
       
    50 			++aUtf8;
       
    51 			currentUtf8Byte=*aUtf8;
       
    52 			if ((currentUtf8Byte&0xc0)!=0x80)
       
    53 				{
       
    54 				return KErrorIllFormedInput;
       
    55 				}
       
    56 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
       
    57 			if (startOfUnicode!=NULL)
       
    58 				{
       
    59 				*aUnicode=(wchar_t)currentUnicodeCharacter;
       
    60 				}
       
    61 			}
       
    62 		else if ((currentUtf8Byte&0xf0)==0xe0)
       
    63 			{
       
    64 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x0f)<<12);
       
    65 			++aUtf8;
       
    66 			currentUtf8Byte=*aUtf8;
       
    67 			if ((currentUtf8Byte&0xc0)!=0x80)
       
    68 				{
       
    69 				return KErrorIllFormedInput;
       
    70 				}
       
    71 			currentUnicodeCharacter|=((currentUtf8Byte&0x3f)<<6);
       
    72 			++aUtf8;
       
    73 			currentUtf8Byte=*aUtf8;
       
    74 			if ((currentUtf8Byte&0xc0)!=0x80)
       
    75 				{
       
    76 				return KErrorIllFormedInput;
       
    77 				}
       
    78 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
       
    79 			if (startOfUnicode!=NULL)
       
    80 				{
       
    81 				*aUnicode=(wchar_t)currentUnicodeCharacter;
       
    82 				}
       
    83 			}
       
    84 		else if ((currentUtf8Byte&0xf8)==0xf0)
       
    85 			{
       
    86 			unsigned int currentUnicodeCharacter=((currentUtf8Byte&0x07)<<8);
       
    87 			++aUtf8;
       
    88 			currentUtf8Byte=*aUtf8;
       
    89 			if ((currentUtf8Byte&0xc0)!=0x80)
       
    90 				{
       
    91 				return KErrorIllFormedInput;
       
    92 				}
       
    93 			currentUnicodeCharacter|=((currentUtf8Byte&0x3f)<<2);
       
    94 			if (currentUnicodeCharacter<0x0040)
       
    95 				{
       
    96 				return KErrorIllFormedInput;
       
    97 				}
       
    98 			currentUnicodeCharacter-=0x0040;
       
    99 			if (currentUnicodeCharacter>=0x0400)
       
   100 				{
       
   101 				return KErrorIllFormedInput;
       
   102 				}
       
   103 			++aUtf8;
       
   104 			currentUtf8Byte=*aUtf8;
       
   105 			if ((currentUtf8Byte&0xc0)!=0x80)
       
   106 				{
       
   107 				return KErrorIllFormedInput;
       
   108 				}
       
   109 			currentUnicodeCharacter|=((currentUtf8Byte&0x30)>>4);
       
   110 			if (startOfUnicode!=NULL)
       
   111 				{
       
   112 				*aUnicode=(wchar_t)(0xd800|currentUnicodeCharacter);
       
   113 				}
       
   114 			currentUnicodeCharacter=((currentUtf8Byte&0x0f)<<6);
       
   115 			++aUtf8;
       
   116 			currentUtf8Byte=*aUtf8;
       
   117 			if ((currentUtf8Byte&0xc0)!=0x80)
       
   118 				{
       
   119 				return KErrorIllFormedInput;
       
   120 				}
       
   121 			currentUnicodeCharacter|=(currentUtf8Byte&0x3f);
       
   122 			++aUnicode;
       
   123 			if (startOfUnicode!=NULL)
       
   124 				{
       
   125 				*aUnicode=(wchar_t)(0xdc00|currentUnicodeCharacter);
       
   126 				}
       
   127 			}
       
   128 		else
       
   129 			{
       
   130 			return KErrorIllFormedInput;
       
   131 			}
       
   132 		++aUnicode;
       
   133 		++aUtf8;
       
   134 		}
       
   135 	if (startOfUnicode!=NULL)
       
   136 		{
       
   137 		*aUnicode='\0';
       
   138 		}
       
   139 	return aUnicode-startOfUnicode;
       
   140 	}
       
   141 #include <STDIO.H>
       
   142 int UnicodeToUtf8(char* aUtf8, const wchar_t* aUnicode)
       
   143 // must '\0'-terminate the output
       
   144 	{
       
   145 	char* startOfUtf8=aUtf8;
       
   146 	for (;;)
       
   147 		{
       
   148 		unsigned int currentUnicodeCharacter=*aUnicode;
       
   149 		if (currentUnicodeCharacter=='\0')
       
   150 			{
       
   151 			break;
       
   152 			}
       
   153 		if ((currentUnicodeCharacter&0xff80)==0x0000)
       
   154 			{
       
   155 			if (startOfUtf8!=NULL)
       
   156 				{
       
   157 				*aUtf8=(char)currentUnicodeCharacter;
       
   158 				}
       
   159 			}
       
   160 		else if ((currentUnicodeCharacter&0xf800)==0x0000)
       
   161 			{
       
   162 			if (startOfUtf8!=NULL)
       
   163 				{
       
   164 				*aUtf8=(char)(0xc0|(currentUnicodeCharacter>>6));
       
   165 				}
       
   166 			++aUtf8;
       
   167 			if (startOfUtf8!=NULL)
       
   168 				{
       
   169 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
       
   170 				}
       
   171 			}
       
   172 		else if ((currentUnicodeCharacter&0xfc00)==0xd800)
       
   173 			{
       
   174 			currentUnicodeCharacter+=0x0040;
       
   175 			if (startOfUtf8!=NULL)
       
   176 				{
       
   177 				*aUtf8=(char)(0xf0|((currentUnicodeCharacter>>8)&0x07));
       
   178 				}
       
   179 			++aUtf8;
       
   180 			if (startOfUtf8!=NULL)
       
   181 				{
       
   182 				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>2)&0x3f));
       
   183 				}
       
   184 			{
       
   185 			unsigned int currentUtf8Byte=(0x80|((currentUnicodeCharacter&0x03)<<4));
       
   186 			++aUnicode;
       
   187 			currentUnicodeCharacter=*aUnicode;
       
   188 			if ((currentUnicodeCharacter&0xfc00)!=0xdc00)
       
   189 				{
       
   190 				return KErrorIllFormedInput;
       
   191 				}
       
   192 			currentUtf8Byte|=((currentUnicodeCharacter>>6)&0x0f);
       
   193 			++aUtf8;
       
   194 			if (startOfUtf8!=NULL)
       
   195 				{
       
   196 				*aUtf8=(char)currentUtf8Byte;
       
   197 				}
       
   198 			}
       
   199 			++aUtf8;
       
   200 			if (startOfUtf8!=NULL)
       
   201 				{
       
   202 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
       
   203 				}
       
   204 			}
       
   205 		else
       
   206 			{
       
   207 			if (startOfUtf8!=NULL)
       
   208 				{
       
   209 				*aUtf8=(char)(0xe0|(currentUnicodeCharacter>>12));
       
   210 				}
       
   211 			++aUtf8;
       
   212 			if (startOfUtf8!=NULL)
       
   213 				{
       
   214 				*aUtf8=(char)(0x80|((currentUnicodeCharacter>>6)&0x3f));
       
   215 				}
       
   216 			++aUtf8;
       
   217 			if (startOfUtf8!=NULL)
       
   218 				{
       
   219 				*aUtf8=(char)(0x80|(currentUnicodeCharacter&0x3f));
       
   220 				}
       
   221 			}
       
   222 		++aUtf8;
       
   223 		++aUnicode;
       
   224 		}
       
   225 	if (startOfUtf8!=NULL)
       
   226 		{
       
   227 		*aUtf8='\0';
       
   228 		}
       
   229 	return aUtf8-startOfUtf8;
       
   230 	}
       
   231