1 /* |
|
2 * Copyright (c) 2000-2005 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 /* ================================================================ */ |
|
18 /* |
|
19 File: ConvertUTF.C |
|
20 Author: Mark E. Davis |
|
21 Copyright (C) 1994 Taligent, Inc. All rights reserved. |
|
22 |
|
23 This code is copyrighted. Under the copyright laws, this code may not |
|
24 be copied, in whole or part, without prior written consent of Taligent. |
|
25 |
|
26 Taligent grants the right to use or reprint this code as long as this |
|
27 ENTIRE copyright notice is reproduced in the code or reproduction. |
|
28 The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES, |
|
29 EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED |
|
30 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN |
|
31 NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING, |
|
32 WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS |
|
33 INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY |
|
34 LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN |
|
35 IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. |
|
36 BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF |
|
37 LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE |
|
38 LIMITATION MAY NOT APPLY TO YOU. |
|
39 |
|
40 RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the |
|
41 government is subject to restrictions as set forth in subparagraph |
|
42 (c)(l)(ii) of the Rights in Technical Data and Computer Software |
|
43 clause at DFARS 252.227-7013 and FAR 52.227-19. |
|
44 |
|
45 This code may be protected by one or more U.S. and International |
|
46 Patents. |
|
47 |
|
48 TRADEMARKS: Taligent and the Taligent Design Mark are registered |
|
49 trademarks of Taligent, Inc. |
|
50 */ |
|
51 /* ================================================================ */ |
|
52 |
|
53 // #include "CVTUTF.H" // commented out by DPB |
|
54 #include "UTF8.H" // added by DPB |
|
55 |
|
56 /* ================================================================ */ |
|
57 |
|
58 const int halfShift = 10; |
|
59 const UCS4 halfBase = 0x0010000UL; |
|
60 const UCS4 halfMask = 0x3FFUL; |
|
61 const UCS4 kSurrogateHighStart = 0xD800UL; |
|
62 const UCS4 kSurrogateHighEnd = 0xDBFFUL; |
|
63 const UCS4 kSurrogateLowStart = 0xDC00UL; |
|
64 const UCS4 kSurrogateLowEnd = 0xDFFFUL; |
|
65 |
|
66 /* ================================================================ */ |
|
67 |
|
68 EXPORT_C // added by DPB |
|
69 ConversionResult ConvertUCS4toUTF16 ( |
|
70 UCS4** sourceStart, const UCS4* sourceEnd, |
|
71 UTF16** targetStart, const UTF16* targetEnd) { |
|
72 ConversionResult result = ok; |
|
73 register UCS4* source = *sourceStart; |
|
74 register UTF16* target = *targetStart; |
|
75 while (source < sourceEnd) { |
|
76 register UCS4 ch; |
|
77 if (target >= targetEnd) { |
|
78 result = targetExhausted; break; |
|
79 }; |
|
80 ch = *source++; |
|
81 if (ch <= kMaximumUCS2) { |
|
82 *target++ = (UTF16)ch; // cast added by DPB |
|
83 } else if (ch > kMaximumUTF16) { |
|
84 *target++ = kReplacementCharacter; |
|
85 } else { |
|
86 if (target + 1 >= targetEnd) { |
|
87 result = targetExhausted; break; |
|
88 }; |
|
89 ch -= halfBase; |
|
90 *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart); // cast added by DPB |
|
91 *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart); // cast added by DPB |
|
92 }; |
|
93 }; |
|
94 *sourceStart = source; |
|
95 *targetStart = target; |
|
96 return result; |
|
97 }; |
|
98 |
|
99 /* ================================================================ */ |
|
100 |
|
101 EXPORT_C // added by DPB |
|
102 ConversionResult ConvertUTF16toUCS4 ( |
|
103 UTF16** sourceStart, UTF16* sourceEnd, |
|
104 UCS4** targetStart, const UCS4* targetEnd) { |
|
105 ConversionResult result = ok; |
|
106 register UTF16* source = *sourceStart; |
|
107 register UCS4* target = *targetStart; |
|
108 while (source < sourceEnd) { |
|
109 register UCS4 ch; |
|
110 ch = *source++; |
|
111 if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd && source < sourceEnd) { |
|
112 register UCS4 ch2 = *source; |
|
113 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { |
|
114 ch = ((ch - kSurrogateHighStart) << halfShift) |
|
115 + (ch2 - kSurrogateLowStart) + halfBase; |
|
116 ++source; |
|
117 }; |
|
118 }; |
|
119 if (target >= targetEnd) { |
|
120 result = targetExhausted; break; |
|
121 }; |
|
122 *target++ = ch; |
|
123 }; |
|
124 *sourceStart = source; |
|
125 *targetStart = target; |
|
126 return result; |
|
127 }; |
|
128 |
|
129 /* ================================================================ */ |
|
130 |
|
131 const UCS4 offsetsFromUTF8[6] = {0x00000000UL, 0x00003080UL, 0x000E2080UL, // "const" added by DPB |
|
132 0x03C82080UL, 0xFA082080UL, 0x82082080UL}; |
|
133 const char bytesFromUTF8[256] = { // "const" added by DPB |
|
134 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
135 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
136 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
137 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
138 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
139 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
140 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5}; |
|
142 |
|
143 const UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC}; // "const" added by DPB |
|
144 |
|
145 /* ================================================================ */ |
|
146 /* This code is similar in effect to making successive calls on the |
|
147 mbtowc and wctomb routines in FSS-UTF. However, it is considerably |
|
148 different in code: |
|
149 * it is adapted to be consistent with UTF16, |
|
150 * the interface converts a whole buffer to avoid function-call overhead |
|
151 * constants have been gathered. |
|
152 * loops & conditionals have been removed as much as possible for |
|
153 efficiency, in favor of drop-through switch statements. |
|
154 */ |
|
155 |
|
156 /* ================================================================ */ |
|
157 EXPORT_C // added by DPB |
|
158 ConversionResult ConvertUTF16toUTF8 ( |
|
159 UTF16** sourceStart, const UTF16* sourceEnd, |
|
160 UTF8** targetStart, const UTF8* targetEnd) |
|
161 { |
|
162 ConversionResult result = ok; |
|
163 register UTF16* source = *sourceStart; |
|
164 register UTF8* target = *targetStart; |
|
165 while (source < sourceEnd) { |
|
166 register UCS4 ch; |
|
167 register unsigned short bytesToWrite = 0; |
|
168 register const UCS4 byteMask = 0xBF; |
|
169 register const UCS4 byteMark = 0x80; |
|
170 ch = *source++; |
|
171 if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd |
|
172 && source < sourceEnd) { |
|
173 register UCS4 ch2 = *source; |
|
174 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { |
|
175 ch = ((ch - kSurrogateHighStart) << halfShift) |
|
176 + (ch2 - kSurrogateLowStart) + halfBase; |
|
177 ++source; |
|
178 }; |
|
179 }; |
|
180 if (ch < 0x80) { bytesToWrite = 1; |
|
181 } else if (ch < 0x800) { bytesToWrite = 2; |
|
182 } else if (ch < 0x10000) { bytesToWrite = 3; |
|
183 } else if (ch < 0x200000) { bytesToWrite = 4; |
|
184 } else if (ch < 0x4000000) { bytesToWrite = 5; |
|
185 } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; |
|
186 } else { bytesToWrite = 2; |
|
187 ch = kReplacementCharacter; |
|
188 }; /* I wish there were a smart way to avoid this conditional */ |
|
189 |
|
190 target += bytesToWrite; |
|
191 if (target > targetEnd) { |
|
192 target -= bytesToWrite; result = targetExhausted; break; |
|
193 }; |
|
194 switch (bytesToWrite) { /* note: code falls through cases! */ |
|
195 case 6: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
196 case 5: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
197 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
198 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
199 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
200 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); // cast added by DPB |
|
201 }; |
|
202 target += bytesToWrite; |
|
203 }; |
|
204 *sourceStart = source; |
|
205 *targetStart = target; |
|
206 return result; |
|
207 }; |
|
208 |
|
209 /* ================================================================ */ |
|
210 |
|
211 EXPORT_C // added by DPB |
|
212 ConversionResult ConvertUTF8toUTF16 ( |
|
213 UTF8** sourceStart, UTF8* sourceEnd, |
|
214 UTF16** targetStart, const UTF16* targetEnd) |
|
215 { |
|
216 ConversionResult result = ok; |
|
217 register UTF8* source = *sourceStart; |
|
218 register UTF16* target = *targetStart; |
|
219 while (source < sourceEnd) { |
|
220 register UCS4 ch = 0; |
|
221 register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; |
|
222 if (source + extraBytesToWrite > sourceEnd) { |
|
223 result = sourceExhausted; break; |
|
224 }; |
|
225 switch(extraBytesToWrite) { /* note: code falls through cases! */ |
|
226 case 5: ch += *source++; ch <<= 6; |
|
227 case 4: ch += *source++; ch <<= 6; |
|
228 case 3: ch += *source++; ch <<= 6; |
|
229 case 2: ch += *source++; ch <<= 6; |
|
230 case 1: ch += *source++; ch <<= 6; |
|
231 case 0: ch += *source++; |
|
232 }; |
|
233 ch -= offsetsFromUTF8[extraBytesToWrite]; |
|
234 |
|
235 if (target >= targetEnd) { |
|
236 result = targetExhausted; break; |
|
237 }; |
|
238 if (ch <= kMaximumUCS2) { |
|
239 *target++ = (UTF16)ch; |
|
240 } else if (ch > kMaximumUTF16) { |
|
241 *target++ = kReplacementCharacter; |
|
242 } else { |
|
243 if (target + 1 >= targetEnd) { |
|
244 result = targetExhausted; break; |
|
245 }; |
|
246 ch -= halfBase; |
|
247 *target++ = (UTF16)((ch >> halfShift) + kSurrogateHighStart); |
|
248 *target++ = (UTF16)((ch & halfMask) + kSurrogateLowStart); |
|
249 }; |
|
250 }; |
|
251 *sourceStart = source; |
|
252 *targetStart = target; |
|
253 return result; |
|
254 }; |
|
255 |
|
256 /* ================================================================ */ |
|
257 EXPORT_C // added by DPB |
|
258 ConversionResult ConvertUCS4toUTF8 ( |
|
259 UCS4** sourceStart, const UCS4* sourceEnd, |
|
260 UTF8** targetStart, const UTF8* targetEnd) |
|
261 { |
|
262 ConversionResult result = ok; |
|
263 register UCS4* source = *sourceStart; |
|
264 register UTF8* target = *targetStart; |
|
265 while (source < sourceEnd) { |
|
266 register UCS4 ch; |
|
267 register unsigned short bytesToWrite = 0; |
|
268 register const UCS4 byteMask = 0xBF; |
|
269 register const UCS4 byteMark = 0x80; |
|
270 ch = *source++; |
|
271 if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd |
|
272 && source < sourceEnd) { |
|
273 register UCS4 ch2 = *source; |
|
274 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) { |
|
275 ch = ((ch - kSurrogateHighStart) << halfShift) |
|
276 + (ch2 - kSurrogateLowStart) + halfBase; |
|
277 ++source; |
|
278 }; |
|
279 }; |
|
280 if (ch < 0x80) { bytesToWrite = 1; |
|
281 } else if (ch < 0x800) { bytesToWrite = 2; |
|
282 } else if (ch < 0x10000) { bytesToWrite = 3; |
|
283 } else if (ch < 0x200000) { bytesToWrite = 4; |
|
284 } else if (ch < 0x4000000) { bytesToWrite = 5; |
|
285 } else if (ch <= kMaximumUCS4){ bytesToWrite = 6; |
|
286 } else { bytesToWrite = 2; |
|
287 ch = kReplacementCharacter; |
|
288 }; /* I wish there were a smart way to avoid this conditional */ |
|
289 |
|
290 target += bytesToWrite; |
|
291 if (target > targetEnd) { |
|
292 target -= bytesToWrite; result = targetExhausted; break; |
|
293 }; |
|
294 switch (bytesToWrite) { /* note: code falls through cases! */ |
|
295 case 6: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
296 case 5: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
297 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
298 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
299 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; // cast added by DPB |
|
300 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); // cast added by DPB |
|
301 }; |
|
302 target += bytesToWrite; |
|
303 }; |
|
304 *sourceStart = source; |
|
305 *targetStart = target; |
|
306 return result; |
|
307 }; |
|
308 |
|
309 /* ================================================================ */ |
|
310 |
|
311 EXPORT_C // added by DPB |
|
312 ConversionResult ConvertUTF8toUCS4 ( |
|
313 UTF8** sourceStart, UTF8* sourceEnd, |
|
314 UCS4** targetStart, const UCS4* targetEnd) |
|
315 { |
|
316 ConversionResult result = ok; |
|
317 register UTF8* source = *sourceStart; |
|
318 register UCS4* target = *targetStart; |
|
319 while (source < sourceEnd) { |
|
320 register UCS4 ch = 0; |
|
321 register unsigned short extraBytesToWrite = bytesFromUTF8[*source]; |
|
322 if (source + extraBytesToWrite > sourceEnd) { |
|
323 result = sourceExhausted; break; |
|
324 }; |
|
325 switch(extraBytesToWrite) { /* note: code falls through cases! */ |
|
326 case 5: ch += *source++; ch <<= 6; |
|
327 case 4: ch += *source++; ch <<= 6; |
|
328 case 3: ch += *source++; ch <<= 6; |
|
329 case 2: ch += *source++; ch <<= 6; |
|
330 case 1: ch += *source++; ch <<= 6; |
|
331 case 0: ch += *source++; |
|
332 }; |
|
333 ch -= offsetsFromUTF8[extraBytesToWrite]; |
|
334 |
|
335 if (target >= targetEnd) { |
|
336 result = targetExhausted; break; |
|
337 }; |
|
338 if (ch <= kMaximumUCS2) { |
|
339 *target++ = ch; |
|
340 } else if (ch > kMaximumUCS4) { |
|
341 *target++ = kReplacementCharacter; |
|
342 } else { |
|
343 if (target + 1 >= targetEnd) { |
|
344 result = targetExhausted; break; |
|
345 }; |
|
346 ch -= halfBase; |
|
347 *target++ = (ch >> halfShift) + kSurrogateHighStart; |
|
348 *target++ = (ch & halfMask) + kSurrogateLowStart; |
|
349 }; |
|
350 }; |
|
351 *sourceStart = source; |
|
352 *targetStart = target; |
|
353 return result; |
|
354 }; |
|