|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2004, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * |
|
 *   ucnv_cnv.h:
|
8 * defines all the low level conversion functions |
|
9 * T_UnicodeConverter_{to,from}Unicode_$ConversionType |
|
10 * |
|
11 * Modification History: |
|
12 * |
|
13 * Date Name Description |
|
14 * 05/09/00 helena Added implementation to handle fallback mappings. |
|
15 * 06/29/2000 helena Major rewrite of the callback APIs. |
|
16 */ |
|
17 |
|
18 #ifndef UCNV_CNV_H |
|
19 #define UCNV_CNV_H |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 |
|
23 #if !UCONFIG_NO_CONVERSION |
|
24 |
|
25 #include "unicode/ucnv.h" |
|
26 #include "unicode/ucnv_err.h" |
|
27 #include "unicode/uset.h" |
|
28 #include "uset_imp.h" |
|
29 |
|
30 U_CDECL_BEGIN |
|
31 |
|
/* Value stored in fromUnicode DBCS tables to mark an "unassigned" entry. */
#define missingCharMarker 0xffff
|
34 |
|
35 /* |
|
36 * #define missingUCharMarker 0xfffe |
|
37 * |
|
38 * commented out because there are actually two values used in toUnicode tables: |
|
39 * U+fffe "unassigned" |
|
40 * U+ffff "illegal" |
|
41 */ |
|
42 |
|
/**
 * Forward declaration of the shared converter data type; see ucnv_bld.h
 * for the full definition. Only pointers to it are used in this header.
 */
typedef struct UConverterSharedData UConverterSharedData;
|
46 |
|
47 /* function types for UConverterImpl ---------------------------------------- */ |
|
48 |
|
/*
 * Argument bundle handed to UConverterLoad implementations and to ucnv_load().
 */
typedef struct {
    int32_t size;           /* sizeof(UConverterLoadArgs) */
    int32_t nestedLoads;    /* number of nested ucnv_load() calls */
    int32_t reserved;       /* reserved; keeps the pointers below well aligned */
    uint32_t options;
    const char *pkg;
    const char *name;
} UConverterLoadArgs;
|
57 |
|
58 typedef void (*UConverterLoad) (UConverterSharedData *sharedData, |
|
59 UConverterLoadArgs *pArgs, |
|
60 const uint8_t *raw, UErrorCode *pErrorCode); |
|
61 typedef void (*UConverterUnload) (UConverterSharedData *sharedData); |
|
62 |
|
63 typedef void (*UConverterOpen) (UConverter *cnv, const char *name, const char *locale,uint32_t options, UErrorCode *pErrorCode); |
|
64 typedef void (*UConverterClose) (UConverter *cnv); |
|
65 |
|
/*
 * Choice argument for UConverterReset implementations.
 * The enumerator values are the implicit ones (0, 1, 2), spelled out here.
 */
typedef enum UConverterResetChoice {
    UCNV_RESET_BOTH = 0,
    UCNV_RESET_TO_UNICODE = 1,
    UCNV_RESET_FROM_UNICODE = 2
} UConverterResetChoice;
|
71 |
|
72 typedef void (*UConverterReset) (UConverter *cnv, UConverterResetChoice choice); |
|
73 |
|
74 /* |
|
75 * Converter implementation function(s) for ucnv_toUnicode(). |
|
76 * If the toUnicodeWithOffsets function pointer is NULL, |
|
77 * then the toUnicode function will be used and the offsets will be set to -1. |
|
78 * |
|
79 * Must maintain state across buffers. Use toUBytes[toULength] for partial input |
|
80 * sequences; it will be checked in ucnv.c at the end of the input stream |
|
81 * to detect truncated input. |
|
82 * Some converters may need additional detection and may then set U_TRUNCATED_CHAR_FOUND. |
|
83 * |
|
84 * The toUnicodeWithOffsets must write exactly as many offset values as target |
|
85 * units. Write offset values of -1 for when the source index corresponding to |
|
86 * the output unit is not known (e.g., the character started in an earlier buffer). |
|
87 * The pArgs->offsets pointer need not be moved forward. |
|
88 * |
|
89 * At function return, either one of the following conditions must be true: |
|
90 * - U_BUFFER_OVERFLOW_ERROR and the target is full: target==targetLimit |
|
91 * - another error code with toUBytes[toULength] set to the offending input |
|
92 * - no error, and the source is consumed: source==sourceLimit |
|
93 * |
|
94 * The ucnv.c code will handle the end of the input (reset) |
|
95 * (reset, and truncation detection) and callbacks. |
|
96 */ |
|
97 typedef void (*UConverterToUnicode) (UConverterToUnicodeArgs *, UErrorCode *); |
|
98 |
|
99 /* |
|
100 * Same rules as for UConverterToUnicode. |
|
101 * A lead surrogate is kept in fromUChar32 across buffers, and if an error |
|
102 * occurs, then the offending input code point must be put into fromUChar32 |
|
103 * as well. |
|
104 */ |
|
105 typedef void (*UConverterFromUnicode) (UConverterFromUnicodeArgs *, UErrorCode *); |
|
106 |
|
107 /* |
|
108 * Converter implementation function for ucnv_getNextUChar(). |
|
109 * If the function pointer is NULL, then the toUnicode function will be used. |
|
110 * |
|
111 * Will be called at a character boundary (toULength==0). |
|
112 * May return with |
|
113 * - U_INDEX_OUTOFBOUNDS_ERROR if there was no output for the input |
|
114 * (the return value will be ignored) |
|
115 * - U_TRUNCATED_CHAR_FOUND or another error code (never U_BUFFER_OVERFLOW_ERROR!) |
|
116 * with toUBytes[toULength] set to the offending input |
|
117 * (the return value will be ignored) |
|
118 * - return UCNV_GET_NEXT_UCHAR_USE_TO_U, without moving the source pointer, |
|
119 * to indicate that the ucnv.c code shall call the toUnicode function instead |
|
120 * - return a real code point result |
|
121 * |
|
122 * Unless UCNV_GET_NEXT_UCHAR_USE_TO_U is returned, the source bytes must be consumed. |
|
123 * |
|
124 * The ucnv.c code will handle the end of the input (reset) |
|
125 * (except for truncation detection!) and callbacks. |
|
126 */ |
|
127 typedef UChar32 (*UConverterGetNextUChar) (UConverterToUnicodeArgs *, UErrorCode *); |
|
128 |
|
129 typedef void (*UConverterGetStarters)(const UConverter* converter, |
|
130 UBool starters[256], |
|
131 UErrorCode *pErrorCode); |
|
132 |
|
133 /* If this function pointer is null or if the function returns null |
|
134 * the name field in static data struct should be returned by |
|
135 * ucnv_getName() API function |
|
136 */ |
|
137 typedef const char * (*UConverterGetName) (const UConverter *cnv); |
|
138 |
|
139 /** |
|
140 * Write the codepage substitution character. |
|
141 * If this function is not set, then ucnv_cbFromUWriteSub() writes |
|
142 * the substitution character from UConverter. |
|
143 * For stateful converters, it is typically necessary to handle this |
|
144 * specificially for the converter in order to properly maintain the state. |
|
145 */ |
|
146 typedef void (*UConverterWriteSub) (UConverterFromUnicodeArgs *pArgs, int32_t offsetIndex, UErrorCode *pErrorCode); |
|
147 |
|
148 /** |
|
149 * For converter-specific safeClone processing |
|
150 * If this function is not set, then ucnv_safeClone assumes that the converter has no private data that changes |
|
151 * after the converter is done opening. |
|
152 * If this function is set, then it is called just after a memcpy() of |
|
153 * converter data to the new, empty converter, and is expected to set up |
|
154 * the initial state of the converter. It is not expected to increment the |
|
155 * reference counts of the standard data types such as the shared data. |
|
156 */ |
|
157 typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv, |
|
158 void *stackBuffer, |
|
159 int32_t *pBufferSize, |
|
160 UErrorCode *status); |
|
161 |
|
162 /** |
|
163 * Fills the set of Unicode code points that can be converted by an ICU converter. |
|
164 * The API function ucnv_getUnicodeSet() clears the USet before calling |
|
165 * the converter's getUnicodeSet() implementation; the converter should only |
|
166 * add the appropriate code points to allow recursive use. |
|
167 * For example, the ISO-2022-JP converter will call each subconverter's |
|
168 * getUnicodeSet() implementation to consecutively add code points to |
|
169 * the same USet, which will result in a union of the sets of all subconverters. |
|
170 * |
|
171 * For more documentation, see ucnv_getUnicodeSet() in ucnv.h. |
|
172 */ |
|
173 typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv, |
|
174 const USetAdder *sa, |
|
175 UConverterUnicodeSet which, |
|
176 UErrorCode *pErrorCode); |
|
177 |
|
178 UBool CONVERSION_U_SUCCESS (UErrorCode err); |
|
179 |
|
180 /** |
|
181 * UConverterImpl contains all the data and functions for a converter type. |
|
182 * Its function pointers work much like a C++ vtable. |
|
183 * Many converter types need to define only a subset of the functions; |
|
184 * when a function pointer is NULL, then a default action will be performed. |
|
185 * |
|
186 * Every converter type must implement toUnicode, fromUnicode, and getNextUChar, |
|
187 * otherwise the converter may crash. |
|
188 * Every converter type that has variable-length codepage sequences should |
|
189 * also implement toUnicodeWithOffsets and fromUnicodeWithOffsets for |
|
190 * correct offset handling. |
|
191 * All other functions may or may not be implemented - it depends only on |
|
192 * whether the converter type needs them. |
|
193 * |
|
194 * When open() fails, then close() will be called, if present. |
|
195 */ |
|
196 struct UConverterImpl { |
|
197 UConverterType type; |
|
198 |
|
199 UConverterLoad load; |
|
200 UConverterUnload unload; |
|
201 |
|
202 UConverterOpen open; |
|
203 UConverterClose close; |
|
204 UConverterReset reset; |
|
205 |
|
206 UConverterToUnicode toUnicode; |
|
207 UConverterToUnicode toUnicodeWithOffsets; |
|
208 UConverterFromUnicode fromUnicode; |
|
209 UConverterFromUnicode fromUnicodeWithOffsets; |
|
210 UConverterGetNextUChar getNextUChar; |
|
211 |
|
212 UConverterGetStarters getStarters; |
|
213 UConverterGetName getName; |
|
214 UConverterWriteSub writeSub; |
|
215 UConverterSafeClone safeClone; |
|
216 UConverterGetUnicodeSet getUnicodeSet; |
|
217 }; |
|
218 |
|
219 extern const UConverterSharedData |
|
220 _MBCSData, _Latin1Data, |
|
221 _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData, |
|
222 _ISO2022Data, |
|
223 _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6, |
|
224 _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19, |
|
225 _HZData,_ISCIIData, _SCSUData, _ASCIIData, |
|
226 _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData; |
|
227 |
|
228 U_CDECL_END |
|
229 |
|
/**
 * Fallbacks from codepage to Unicode are always used:
 * both macros ignore their argument and expand to TRUE.
 */
#define TO_U_USE_FALLBACK(useFallback) TRUE
#define UCNV_TO_U_USE_FALLBACK(cnv) TRUE
|
233 |
|
/**
 * Fallback policy for conversion from Unicode to codepage: fallbacks are
 * used when the converter's useFallback flag is set, and always for
 * private-use code points.
 *
 * IS_PRIVATE_USE(c) is true for U+E000..U+F8FF and U+F0000..U+10FFFF.
 * The unsigned casts turn each two-sided range check into one comparison:
 * values below the start of a range wrap around to large numbers.
 *
 * The whole expansion is parenthesized so that the macro combines safely
 * with other operators, for example in !IS_PRIVATE_USE(c).
 * Note that c is evaluated twice; do not pass expressions with side effects.
 */
#define IS_PRIVATE_USE(c) \
    ((uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000)
#define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || IS_PRIVATE_USE(c))
#define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
|
238 |
|
/**
 * Magic number for ucnv_getNextUChar().
 * A getNextUChar() implementation returns it to indicate that the
 * converter's toUnicode() should be used instead of the native function.
 * @internal
 */
#define UCNV_GET_NEXT_UCHAR_USE_TO_U (-9)
|
246 |
|
247 U_CFUNC void |
|
248 ucnv_getCompleteUnicodeSet(const UConverter *cnv, |
|
249 const USetAdder *sa, |
|
250 UConverterUnicodeSet which, |
|
251 UErrorCode *pErrorCode); |
|
252 |
|
253 U_CFUNC void |
|
254 ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv, |
|
255 const USetAdder *sa, |
|
256 UConverterUnicodeSet which, |
|
257 UErrorCode *pErrorCode); |
|
258 |
|
259 U_CFUNC void |
|
260 ucnv_fromUWriteBytes(UConverter *cnv, |
|
261 const char *bytes, int32_t length, |
|
262 char **target, const char *targetLimit, |
|
263 int32_t **offsets, |
|
264 int32_t sourceIndex, |
|
265 UErrorCode *pErrorCode); |
|
266 U_CFUNC void |
|
267 ucnv_toUWriteUChars(UConverter *cnv, |
|
268 const UChar *uchars, int32_t length, |
|
269 UChar **target, const UChar *targetLimit, |
|
270 int32_t **offsets, |
|
271 int32_t sourceIndex, |
|
272 UErrorCode *pErrorCode); |
|
273 |
|
274 U_CFUNC void |
|
275 ucnv_toUWriteCodePoint(UConverter *cnv, |
|
276 UChar32 c, |
|
277 UChar **target, const UChar *targetLimit, |
|
278 int32_t **offsets, |
|
279 int32_t sourceIndex, |
|
280 UErrorCode *pErrorCode); |
|
281 |
|
#endif /* !UCONFIG_NO_CONVERSION */

#endif /* UCNV_CNV_H */