|
/*
*******************************************************************************
*
*   Copyright (C) 1999-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf8.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep13
*   created by: Markus W. Scherer
*/
|
16 |
|
/**
 * \file
 * \brief C API: 8-bit Unicode handling macros
 *
 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
 * utf8.h is included by utf.h after unicode/umachine.h
 * and some common definitions.
 *
 * For more information see utf.h and the ICU User Guide Strings chapter
 * (http://icu.sourceforge.net/userguide/strings.html).
 *
 * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
 */
|
33 |
|
34 #ifndef __UTF8_H__ |
|
35 #define __UTF8_H__ |
|
36 |
|
37 /* utf.h must be included first. */ |
|
38 #ifndef __UTF_H__ |
|
39 # include "unicode/utf.h" |
|
40 #endif |
|
41 |
|
/* internal definitions ----------------------------------------------------- */
|
43 |
|
44 /** |
|
45 * \var utf8_countTrailBytes |
|
46 * Internal array with numbers of trail bytes for any given byte used in |
|
47 * lead byte position. |
|
48 * @internal |
|
49 */ |
|
50 #ifdef U_UTF8_IMPL |
|
51 U_INTERNAL const uint8_t |
|
52 #elif defined(U_STATIC_IMPLEMENTATION) |
|
53 U_CFUNC const uint8_t |
|
54 #else |
|
55 U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ |
|
56 #endif |
|
57 utf8_countTrailBytes[256]; |
|
58 |
|
/**
 * Count the trail bytes for a UTF-8 lead byte.
 * The whole argument expression is converted to uint8_t before it is used
 * as an index into utf8_countTrailBytes, so compound expressions
 * (for example "word >> 8") are looked up by their resulting byte value.
 *
 * @param leadByte byte (or expression yielding a byte) in lead byte position
 * @return the utf8_countTrailBytes entry for that byte value
 * @internal
 */
#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)])
|
64 |
|
/**
 * Strip the length-marker bits from a UTF-8 lead byte in place, keeping only
 * the low (6-countTrailBytes) bits that contribute to the code point value.
 * The first argument must be a modifiable lvalue; it is updated with "&=".
 *
 * @param leadByte variable holding the lead byte, modified in place
 * @param countTrailBytes number of trail bytes that follow the lead byte
 * @internal
 */
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte) &= (1 << (6 - (countTrailBytes))) - 1)
|
70 |
|
71 /** |
|
72 * Function for handling "next code point" with error-checking. |
|
73 * @internal |
|
74 */ |
|
75 U_INTERNAL UChar32 U_EXPORT2 |
|
76 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); |
|
77 |
|
78 /** |
|
79 * Function for handling "append code point" with error-checking. |
|
80 * @internal |
|
81 */ |
|
82 U_INTERNAL int32_t U_EXPORT2 |
|
83 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); |
|
84 |
|
85 /** |
|
86 * Function for handling "previous code point" with error-checking. |
|
87 * @internal |
|
88 */ |
|
89 U_INTERNAL UChar32 U_EXPORT2 |
|
90 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); |
|
91 |
|
92 /** |
|
93 * Function for handling "skip backward one code point" with error-checking. |
|
94 * @internal |
|
95 */ |
|
96 U_INTERNAL int32_t U_EXPORT2 |
|
97 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); |
|
98 |
|
/* single-code point definitions -------------------------------------------- */
|
100 |
|
/**
 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
 * True exactly when bit 7 of the argument is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_SINGLE(c) (((c) & 0x80) == 0)
|
108 |
|
/**
 * Is this code unit (byte) a UTF-8 lead byte?
 * The test is a single unsigned range check: it is true for byte values
 * 0xc0..0xfd and false for everything else.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_LEAD(c) ((uint8_t)((c) - 0xc0) < 0x3e)
|
116 |
|
/**
 * Is this code unit (byte) a UTF-8 trail byte?
 * True exactly when the top two bits of the argument are "10"
 * (byte values 0x80..0xbf).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U8_IS_TRAIL(c) (((c) & 0xc0) == 0x80)
|
124 |
|
/**
 * How many code units (bytes) are used for the UTF-8 encoding
 * of this Unicode code point?
 * The argument is compared as an unsigned 32-bit value, so negative inputs
 * are treated as out of range. The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define U8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
     (uint32_t)(c)<=0x7ff ? 2 : \
     (uint32_t)(c)<=0xd7ff ? 3 : \
     ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff) ? 0 : \
     (uint32_t)(c)<=0xffff ? 3 : 4)
|
142 |
|
/**
 * Upper bound on the number of UTF-8 code units (bytes) needed for one
 * Unicode code point (U+0000..U+10ffff).
 * @return 4
 * @stable ICU 2.4
 */
#define U8_MAX_LENGTH 4
|
149 |
|
/**
 * Random-access read of one code point; the caller's offset is left untouched.
 * The macro copies the offset into a private index, moves that index back to
 * the start of the code point with U8_SET_CP_START_UNSAFE, and then decodes
 * with U8_NEXT_UNSAFE. The offset may therefore point to the lead byte or to
 * any trail byte of the code point.
 * The result is undefined if the offset points to an illegal UTF-8
 * byte sequence.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_GET
 * @stable ICU 2.4
 */
#define U8_GET_UNSAFE(s, i, c) { \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
}
|
171 |
|
/**
 * Random-access read of one code point with error checking; the caller's
 * offset is left untouched.
 * The macro copies the offset into a private index, moves that index back to
 * the start of the code point with U8_SET_CP_START, and then decodes with
 * U8_NEXT. The offset may therefore point to the lead byte or to any trail
 * byte of the code point.
 * If the offset points to an illegal UTF-8 byte sequence, then
 * c is set to a negative value.
 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
 *
 * @param s const uint8_t * string
 * @param start starting string offset
 * @param i string offset, start<=i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U8_GET(s, start, i, length, c) { \
    int32_t _u8_get_pos=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
}
|
195 |
|
/* definitions with forward iteration --------------------------------------- */
|
197 |
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The first byte is always consumed. If it lies in 0xc0..0xf4 it is treated
 * as a lead byte: its marker bits are masked off and up to three trail bytes
 * (as reported by U8_COUNT_TRAIL_BYTES) are folded in, six bits each.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_NEXT
 * @stable ICU 2.4
 */
#define U8_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if((uint8_t)((c)-0xc0)<0x35) { \
        uint8_t u8_next_trails_=U8_COUNT_TRAIL_BYTES(c); \
        U8_MASK_LEAD_BYTE(c, u8_next_trails_); \
        /* only counts 1..3 consume trail bytes; any other count reads none */ \
        if(u8_next_trails_<=3) { \
            while(u8_next_trails_>0) { \
                (c)=((c)<<6)|((s)[(i)++]&0x3f); \
                --u8_next_trails_; \
            } \
        } \
    } \
}
|
233 |
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The byte at the offset is read as an unsigned value (0..255), so the macro
 * behaves the same whether s points to uint8_t or to (possibly signed) char.
 * A byte below 0x80 is returned as is. A lead byte is handed to
 * utf8_nextCharSafeBody, which reads the rest of the sequence and advances
 * the offset. Any other byte (a trail byte or a non-lead value) sets c to
 * U_SENTINEL.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_NEXT(s, i, length, c) { \
    /* normalize the byte to 0..255 before it is widened into c */ \
    (c)=(uint8_t)(s)[(i)++]; \
    if(((uint8_t)(c))>=0x80) { \
        if(U8_IS_LEAD(c)) { \
            (c)=utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (int32_t)(length), c, -1); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
|
262 |
|
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) past the bytes written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 * The code point argument is evaluated more than once.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        /* one byte: US-ASCII */ \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xffff) { \
        /* three bytes */ \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}
|
294 |
|
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * A code point up to 0x7f is stored directly as one byte without a
 * capacity check (the caller must ensure i<length). Every other value is
 * passed to utf8_appendCharSafeBody together with the buffer size and the
 * address of isError, and the offset is set to that function's return value.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset, i<length
 * @param length size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, length, c, isError) { \
    if((uint32_t)(c)>0x7f) { \
        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \
    } else { \
        (s)[(i)++]=(uint8_t)(c); \
    } \
}
|
319 |
|
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 * The offset is increased by one plus the trail byte count that
 * U8_COUNT_TRAIL_BYTES reports for the byte at the offset; the trail bytes
 * themselves are not inspected.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i) += 1 + U8_COUNT_TRAIL_BYTES((s)[i]); \
}
|
333 |
|
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The byte at the offset is always skipped. If it is a lead byte, the macro
 * then skips at most the number of trail bytes the lead byte announces,
 * clipped so that the offset never passes length, and stops early at the
 * first byte that is not a trail byte.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) { \
    uint8_t u8_fwd_lead_=(s)[(i)++]; \
    if(U8_IS_LEAD(u8_fwd_lead_)) { \
        uint8_t u8_fwd_left_=U8_COUNT_TRAIL_BYTES(u8_fwd_lead_); \
        /* do not look beyond the end of the string */ \
        if((i)+u8_fwd_left_>(length)) { \
            u8_fwd_left_=(uint8_t)((length)-(i)); \
        } \
        for(; u8_fwd_left_>0 && U8_IS_TRAIL((s)[i]); --u8_fwd_left_) { \
            ++(i); \
        } \
    } \
}
|
358 |
|
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Applies U8_FWD_1_UNSAFE n times; nothing happens when n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    int32_t u8_fwd_n_todo_=(n); \
    for(; u8_fwd_n_todo_>0; --u8_fwd_n_todo_) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
}
|
378 |
|
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Applies U8_FWD_1 up to n times and stops early once the offset reaches
 * length; nothing happens when n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) { \
    int32_t u8_fwd_n_todo_=(n); \
    for(; u8_fwd_n_todo_>0 && (i)<(length); --u8_fwd_n_todo_) { \
        U8_FWD_1(s, i, length); \
    } \
}
|
399 |
|
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * While the byte at the offset is a UTF-8 trail byte, the offset is
 * decremented, so it ends up on the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
}
|
416 |
|
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the byte at the offset is a UTF-8 trail byte, the offset is replaced by
 * the result of utf8_back1SafeBody, which moves it backward to the
 * corresponding lead byte.
 * Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i) = utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
|
436 |
|
/* definitions with backward iteration -------------------------------------- */
|
438 |
|
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the byte before the offset is a trail byte, the macro keeps stepping
 * backward, collecting six bits per trail byte, until it reaches a byte
 * >=0xc0, which it masks as the lead byte and merges in as the top bits.
 * If the byte before the offset is not a trail byte (for example a single
 * byte or a lead byte), that byte itself is returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t u8_prev_byte_, u8_prev_trails_=1, u8_prev_shift_=6; \
        \
        /* c holds the last trail byte: keep its six payload bits */ \
        (c)&=0x3f; \
        /* walk back over further trail bytes until the lead byte */ \
        while((u8_prev_byte_=(s)[--(i)])<0xc0) { \
            (c)|=(UChar32)(u8_prev_byte_&0x3f)<<u8_prev_shift_; \
            ++u8_prev_trails_; \
            u8_prev_shift_+=6; \
        } \
        U8_MASK_LEAD_BYTE(u8_prev_byte_, u8_prev_trails_); \
        (c)|=(UChar32)u8_prev_byte_<<u8_prev_shift_; \
    } \
}
|
479 |
|
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * The byte before the offset is read as an unsigned value (0..255), so the
 * macro behaves the same whether s points to uint8_t or to (possibly signed)
 * char. A byte below 0x80 is returned as is. A trail byte (0x80..0xbf) is
 * handed to utf8_prevCharSafeBody, which reads the whole sequence backward
 * and adjusts the offset. Any byte above 0xbf directly before the offset
 * (a lead byte with no trail bytes) sets c to U_SENTINEL.
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) { \
    /* normalize the byte to 0..255 so the range checks below are reliable */ \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}
|
510 |
|
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * The offset is decremented at least once and then further for as long as
 * the byte it lands on is a trail byte; there is no lower bound check.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
}
|
525 |
|
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The offset is first decremented by one. If the byte it then points to is
 * a trail byte, the offset is replaced by the result of utf8_back1SafeBody.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) { \
    --(i); \
    if(U8_IS_TRAIL((s)[i])) { \
        (i) = utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
|
543 |
|
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Applies U8_BACK_1_UNSAFE n times; nothing happens when n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) { \
    int32_t u8_back_n_todo_=(n); \
    for(; u8_back_n_todo_>0; --u8_back_n_todo_) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
}
|
564 |
|
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * Applies U8_BACK_1 up to n times and stops early once the offset is no
 * longer greater than start; nothing happens when n<=0.
 *
 * @param s const uint8_t * string
 * @param start index of the start of the string
 * @param i string offset, start<=i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) { \
    int32_t u8_back_n_todo_=(n); \
    for(; u8_back_n_todo_>0 && (i)>(start); --u8_back_n_todo_) { \
        U8_BACK_1(s, start, i); \
    } \
}
|
586 |
|
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 * Implemented as one step back to the start of the preceding code point
 * (U8_BACK_1_UNSAFE) followed by one step forward over it (U8_FWD_1_UNSAFE).
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    /* back to the lead byte of the code point before the offset ... */ \
    U8_BACK_1_UNSAFE(s, i); \
    /* ... then forward past all of its bytes */ \
    U8_FWD_1_UNSAFE(s, i); \
}
|
604 |
|
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 * The adjustment (U8_BACK_1 followed by U8_FWD_1) is only attempted when
 * start<i<length; at either end of the string the offset is left as is.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i<=length
 * @param length string length
 * @see U8_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i)) { \
        if((i)<(length)) { \
            U8_BACK_1(s, start, i); \
            U8_FWD_1(s, i, length); \
        } \
    } \
}
|
626 |
|
627 #endif |