|
1 /* |
|
2 |
|
3 Unicode implementation based on original code by Fredrik Lundh, |
|
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the |
|
5 Unicode Integration Proposal (see file Misc/unicode.txt). |
|
6 |
|
7 Major speed upgrades to the method implementations at the Reykjavik |
|
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. |
|
9 |
|
10 Copyright (c) Corporation for National Research Initiatives. |
|
11 |
|
12 -------------------------------------------------------------------- |
|
13 The original string type implementation is: |
|
14 |
|
15 Copyright (c) 1999 by Secret Labs AB |
|
16 Copyright (c) 1999 by Fredrik Lundh |
|
17 |
|
18 By obtaining, using, and/or copying this software and/or its |
|
19 associated documentation, you agree that you have read, understood, |
|
20 and will comply with the following terms and conditions: |
|
21 |
|
22 Permission to use, copy, modify, and distribute this software and its |
|
23 associated documentation for any purpose and without fee is hereby |
|
24 granted, provided that the above copyright notice appears in all |
|
25 copies, and that both that copyright notice and this permission notice |
|
26 appear in supporting documentation, and that the name of Secret Labs |
|
27 AB or the author not be used in advertising or publicity pertaining to |
|
28 distribution of the software without specific, written prior |
|
29 permission. |
|
30 |
|
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO |
|
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
|
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR |
|
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT |
|
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
38 -------------------------------------------------------------------- |
|
39 |
|
40 */ |
|
41 |
|
42 #define PY_SSIZE_T_CLEAN |
|
43 #include "Python.h" |
|
44 |
|
45 #include "unicodeobject.h" |
|
46 #include "ucnhash.h" |
|
47 |
|
48 #ifdef MS_WINDOWS |
|
49 #include <windows.h> |
|
50 #endif |
|
51 |
|
52 /* Limit for the Unicode object free list */ |
|
53 |
|
54 #define PyUnicode_MAXFREELIST 1024 |
|
55 |
|
56 /* Limit for the Unicode object free list stay alive optimization. |
|
57 |
|
58 The implementation will keep allocated Unicode memory intact for |
|
59 all objects on the free list having a size less than this |
|
60 limit. This reduces malloc() overhead for small Unicode objects. |
|
61 |
|
62 At worst this will result in PyUnicode_MAXFREELIST * |
|
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + |
|
64 malloc()-overhead) bytes of unused garbage. |
|
65 |
|
66 Setting the limit to 0 effectively turns the feature off. |
|
67 |
|
68 Note: This is an experimental feature ! If you get core dumps when |
|
69 using Unicode objects, turn this feature off. |
|
70 |
|
71 */ |
|
72 |
|
73 #define KEEPALIVE_SIZE_LIMIT 9 |
|
74 |
|
75 /* Endianness switches; defaults to little endian */ |
|
76 |
|
77 #ifdef WORDS_BIGENDIAN |
|
78 # define BYTEORDER_IS_BIG_ENDIAN |
|
79 #else |
|
80 # define BYTEORDER_IS_LITTLE_ENDIAN |
|
81 #endif |
|
82 |
|
83 /* --- Globals ------------------------------------------------------------ |
|
84 |
|
85 The globals are initialized by the _PyUnicode_Init() API and should |
|
86 not be used before calling that API. |
|
87 |
|
88 */ |
|
89 |
|
90 |
|
91 #ifdef __cplusplus |
|
92 extern "C" { |
|
93 #endif |
|
94 |
|
95 /* Free list for Unicode objects */ |
|
96 static PyUnicodeObject *free_list; |
|
97 static int numfree; |
|
98 |
|
99 /* The empty Unicode object is shared to improve performance. */ |
|
100 static PyUnicodeObject *unicode_empty; |
|
101 |
|
102 /* Single character Unicode strings in the Latin-1 range are being |
|
103 shared as well. */ |
|
104 static PyUnicodeObject *unicode_latin1[256]; |
|
105 |
|
106 /* Default encoding to use and assume when NULL is passed as encoding |
|
107 parameter; it is initialized by _PyUnicode_Init(). |
|
108 |
|
109 Always use the PyUnicode_SetDefaultEncoding() and |
|
110 PyUnicode_GetDefaultEncoding() APIs to access this global. |
|
111 |
|
112 */ |
|
113 static char unicode_default_encoding[100]; |
|
114 |
|
/* Lookup table for the most frequent whitespace characters: entry i is
   non-zero when ASCII code point i (0..127) is treated as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
|
145 |
|
/* Lookup table for line breaks: entry i is non-zero when ASCII code
   point i (0..127) is treated as a line break.  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /*         0x000A, * LINE FEED */
    /*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /*         0x001C, * FILE SEPARATOR */
    /*         0x001D, * GROUP SEPARATOR */
    /*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
|
171 |
|
172 |
|
173 Py_UNICODE |
|
174 PyUnicode_GetMax(void) |
|
175 { |
|
176 #ifdef Py_UNICODE_WIDE |
|
177 return 0x10FFFF; |
|
178 #else |
|
179 /* This is actually an illegal character, so it should |
|
180 not be passed to unichr. */ |
|
181 return 0xFFFF; |
|
182 #endif |
|
183 } |
|
184 |
|
185 /* --- Bloom Filters ----------------------------------------------------- */ |
|
186 |
|
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used, with the low 5
   bits of each Unicode character serving as the bit index. */
|
190 |
|
191 /* the linebreak mask is set up by Unicode_Init below */ |
|
192 |
|
193 #define BLOOM_MASK unsigned long |
|
194 |
|
195 static BLOOM_MASK bloom_linebreak; |
|
196 |
|
197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F)))) |
|
198 |
|
199 #define BLOOM_LINEBREAK(ch) \ |
|
200 ((ch) < 128U ? ascii_linebreak[(ch)] : \ |
|
201 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) |
|
202 |
|
203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) |
|
204 { |
|
205 /* calculate simple bloom-style bitmask for a given unicode string */ |
|
206 |
|
207 long mask; |
|
208 Py_ssize_t i; |
|
209 |
|
210 mask = 0; |
|
211 for (i = 0; i < len; i++) |
|
212 mask |= (1 << (ptr[i] & 0x1F)); |
|
213 |
|
214 return mask; |
|
215 } |
|
216 |
|
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) |
|
218 { |
|
219 Py_ssize_t i; |
|
220 |
|
221 for (i = 0; i < setlen; i++) |
|
222 if (set[i] == chr) |
|
223 return 1; |
|
224 |
|
225 return 0; |
|
226 } |
|
227 |
|
228 #define BLOOM_MEMBER(mask, chr, set, setlen)\ |
|
229 BLOOM(mask, chr) && unicode_member(chr, set, setlen) |
|
230 |
|
231 /* --- Unicode Object ----------------------------------------------------- */ |
|
232 |
|
233 static |
|
234 int unicode_resize(register PyUnicodeObject *unicode, |
|
235 Py_ssize_t length) |
|
236 { |
|
237 void *oldstr; |
|
238 |
|
239 /* Shortcut if there's nothing much to do. */ |
|
240 if (unicode->length == length) |
|
241 goto reset; |
|
242 |
|
243 /* Resizing shared object (unicode_empty or single character |
|
244 objects) in-place is not allowed. Use PyUnicode_Resize() |
|
245 instead ! */ |
|
246 |
|
247 if (unicode == unicode_empty || |
|
248 (unicode->length == 1 && |
|
249 unicode->str[0] < 256U && |
|
250 unicode_latin1[unicode->str[0]] == unicode)) { |
|
251 PyErr_SetString(PyExc_SystemError, |
|
252 "can't resize shared unicode objects"); |
|
253 return -1; |
|
254 } |
|
255 |
|
256 /* We allocate one more byte to make sure the string is Ux0000 terminated. |
|
257 The overallocation is also used by fastsearch, which assumes that it's |
|
258 safe to look at str[length] (without making any assumptions about what |
|
259 it contains). */ |
|
260 |
|
261 oldstr = unicode->str; |
|
262 unicode->str = PyObject_REALLOC(unicode->str, |
|
263 sizeof(Py_UNICODE) * (length + 1)); |
|
264 if (!unicode->str) { |
|
265 unicode->str = (Py_UNICODE *)oldstr; |
|
266 PyErr_NoMemory(); |
|
267 return -1; |
|
268 } |
|
269 unicode->str[length] = 0; |
|
270 unicode->length = length; |
|
271 |
|
272 reset: |
|
273 /* Reset the object caches */ |
|
274 if (unicode->defenc) { |
|
275 Py_DECREF(unicode->defenc); |
|
276 unicode->defenc = NULL; |
|
277 } |
|
278 unicode->hash = -1; |
|
279 |
|
280 return 0; |
|
281 } |
|
282 |
|
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below one entry.

*/
|
290 |
|
291 static |
|
292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length) |
|
293 { |
|
294 register PyUnicodeObject *unicode; |
|
295 |
|
296 /* Optimization for empty strings */ |
|
297 if (length == 0 && unicode_empty != NULL) { |
|
298 Py_INCREF(unicode_empty); |
|
299 return unicode_empty; |
|
300 } |
|
301 |
|
302 /* Ensure we won't overflow the size. */ |
|
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) { |
|
304 return (PyUnicodeObject *)PyErr_NoMemory(); |
|
305 } |
|
306 |
|
307 /* Unicode freelist & memory allocation */ |
|
308 if (free_list) { |
|
309 unicode = free_list; |
|
310 free_list = *(PyUnicodeObject **)unicode; |
|
311 numfree--; |
|
312 if (unicode->str) { |
|
313 /* Keep-Alive optimization: we only upsize the buffer, |
|
314 never downsize it. */ |
|
315 if ((unicode->length < length) && |
|
316 unicode_resize(unicode, length) < 0) { |
|
317 PyObject_DEL(unicode->str); |
|
318 unicode->str = NULL; |
|
319 } |
|
320 } |
|
321 else { |
|
322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); |
|
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); |
|
324 } |
|
325 PyObject_INIT(unicode, &PyUnicode_Type); |
|
326 } |
|
327 else { |
|
328 size_t new_size; |
|
329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type); |
|
330 if (unicode == NULL) |
|
331 return NULL; |
|
332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1); |
|
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size); |
|
334 } |
|
335 |
|
336 if (!unicode->str) { |
|
337 PyErr_NoMemory(); |
|
338 goto onError; |
|
339 } |
|
340 /* Initialize the first element to guard against cases where |
|
341 * the caller fails before initializing str -- unicode_resize() |
|
342 * reads str[0], and the Keep-Alive optimization can keep memory |
|
343 * allocated for str alive across a call to unicode_dealloc(unicode). |
|
344 * We don't want unicode_resize to read uninitialized memory in |
|
345 * that case. |
|
346 */ |
|
347 unicode->str[0] = 0; |
|
348 unicode->str[length] = 0; |
|
349 unicode->length = length; |
|
350 unicode->hash = -1; |
|
351 unicode->defenc = NULL; |
|
352 return unicode; |
|
353 |
|
354 onError: |
|
355 /* XXX UNREF/NEWREF interface should be more symmetrical */ |
|
356 _Py_DEC_REFTOTAL; |
|
357 _Py_ForgetReference((PyObject *)unicode); |
|
358 PyObject_Del(unicode); |
|
359 return NULL; |
|
360 } |
|
361 |
|
362 static |
|
363 void unicode_dealloc(register PyUnicodeObject *unicode) |
|
364 { |
|
365 if (PyUnicode_CheckExact(unicode) && |
|
366 numfree < PyUnicode_MAXFREELIST) { |
|
367 /* Keep-Alive optimization */ |
|
368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) { |
|
369 PyObject_DEL(unicode->str); |
|
370 unicode->str = NULL; |
|
371 unicode->length = 0; |
|
372 } |
|
373 if (unicode->defenc) { |
|
374 Py_DECREF(unicode->defenc); |
|
375 unicode->defenc = NULL; |
|
376 } |
|
377 /* Add to free list */ |
|
378 *(PyUnicodeObject **)unicode = free_list; |
|
379 free_list = unicode; |
|
380 numfree++; |
|
381 } |
|
382 else { |
|
383 PyObject_DEL(unicode->str); |
|
384 Py_XDECREF(unicode->defenc); |
|
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode); |
|
386 } |
|
387 } |
|
388 |
|
389 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) |
|
390 { |
|
391 register PyUnicodeObject *v; |
|
392 |
|
393 /* Argument checks */ |
|
394 if (unicode == NULL) { |
|
395 PyErr_BadInternalCall(); |
|
396 return -1; |
|
397 } |
|
398 v = (PyUnicodeObject *)*unicode; |
|
399 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) { |
|
400 PyErr_BadInternalCall(); |
|
401 return -1; |
|
402 } |
|
403 |
|
404 /* Resizing unicode_empty and single character objects is not |
|
405 possible since these are being shared. We simply return a fresh |
|
406 copy with the same Unicode content. */ |
|
407 if (v->length != length && |
|
408 (v == unicode_empty || v->length == 1)) { |
|
409 PyUnicodeObject *w = _PyUnicode_New(length); |
|
410 if (w == NULL) |
|
411 return -1; |
|
412 Py_UNICODE_COPY(w->str, v->str, |
|
413 length < v->length ? length : v->length); |
|
414 Py_DECREF(*unicode); |
|
415 *unicode = (PyObject *)w; |
|
416 return 0; |
|
417 } |
|
418 |
|
419 /* Note that we don't have to modify *unicode for unshared Unicode |
|
420 objects, since we can modify them in-place. */ |
|
421 return unicode_resize(v, length); |
|
422 } |
|
423 |
|
424 /* Internal API for use in unicodeobject.c only ! */ |
|
425 #define _PyUnicode_Resize(unicodevar, length) \ |
|
426 PyUnicode_Resize(((PyObject **)(unicodevar)), length) |
|
427 |
|
428 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u, |
|
429 Py_ssize_t size) |
|
430 { |
|
431 PyUnicodeObject *unicode; |
|
432 |
|
433 /* If the Unicode data is known at construction time, we can apply |
|
434 some optimizations which share commonly used objects. */ |
|
435 if (u != NULL) { |
|
436 |
|
437 /* Optimization for empty strings */ |
|
438 if (size == 0 && unicode_empty != NULL) { |
|
439 Py_INCREF(unicode_empty); |
|
440 return (PyObject *)unicode_empty; |
|
441 } |
|
442 |
|
443 /* Single character Unicode objects in the Latin-1 range are |
|
444 shared when using this constructor */ |
|
445 if (size == 1 && *u < 256) { |
|
446 unicode = unicode_latin1[*u]; |
|
447 if (!unicode) { |
|
448 unicode = _PyUnicode_New(1); |
|
449 if (!unicode) |
|
450 return NULL; |
|
451 unicode->str[0] = *u; |
|
452 unicode_latin1[*u] = unicode; |
|
453 } |
|
454 Py_INCREF(unicode); |
|
455 return (PyObject *)unicode; |
|
456 } |
|
457 } |
|
458 |
|
459 unicode = _PyUnicode_New(size); |
|
460 if (!unicode) |
|
461 return NULL; |
|
462 |
|
463 /* Copy the Unicode data into the new object */ |
|
464 if (u != NULL) |
|
465 Py_UNICODE_COPY(unicode->str, u, size); |
|
466 |
|
467 return (PyObject *)unicode; |
|
468 } |
|
469 |
|
470 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) |
|
471 { |
|
472 PyUnicodeObject *unicode; |
|
473 |
|
474 if (size < 0) { |
|
475 PyErr_SetString(PyExc_SystemError, |
|
476 "Negative size passed to PyUnicode_FromStringAndSize"); |
|
477 return NULL; |
|
478 } |
|
479 |
|
480 /* If the Unicode data is known at construction time, we can apply |
|
481 some optimizations which share commonly used objects. |
|
482 Also, this means the input must be UTF-8, so fall back to the |
|
483 UTF-8 decoder at the end. */ |
|
484 if (u != NULL) { |
|
485 |
|
486 /* Optimization for empty strings */ |
|
487 if (size == 0 && unicode_empty != NULL) { |
|
488 Py_INCREF(unicode_empty); |
|
489 return (PyObject *)unicode_empty; |
|
490 } |
|
491 |
|
492 /* Single characters are shared when using this constructor. |
|
493 Restrict to ASCII, since the input must be UTF-8. */ |
|
494 if (size == 1 && Py_CHARMASK(*u) < 128) { |
|
495 unicode = unicode_latin1[Py_CHARMASK(*u)]; |
|
496 if (!unicode) { |
|
497 unicode = _PyUnicode_New(1); |
|
498 if (!unicode) |
|
499 return NULL; |
|
500 unicode->str[0] = Py_CHARMASK(*u); |
|
501 unicode_latin1[Py_CHARMASK(*u)] = unicode; |
|
502 } |
|
503 Py_INCREF(unicode); |
|
504 return (PyObject *)unicode; |
|
505 } |
|
506 |
|
507 return PyUnicode_DecodeUTF8(u, size, NULL); |
|
508 } |
|
509 |
|
510 unicode = _PyUnicode_New(size); |
|
511 if (!unicode) |
|
512 return NULL; |
|
513 |
|
514 return (PyObject *)unicode; |
|
515 } |
|
516 |
|
517 PyObject *PyUnicode_FromString(const char *u) |
|
518 { |
|
519 size_t size = strlen(u); |
|
520 if (size > PY_SSIZE_T_MAX) { |
|
521 PyErr_SetString(PyExc_OverflowError, "input too long"); |
|
522 return NULL; |
|
523 } |
|
524 |
|
525 return PyUnicode_FromStringAndSize(u, size); |
|
526 } |
|
527 |
|
528 #ifdef HAVE_WCHAR_H |
|
529 |
|
530 PyObject *PyUnicode_FromWideChar(register const wchar_t *w, |
|
531 Py_ssize_t size) |
|
532 { |
|
533 PyUnicodeObject *unicode; |
|
534 |
|
535 if (w == NULL) { |
|
536 PyErr_BadInternalCall(); |
|
537 return NULL; |
|
538 } |
|
539 |
|
540 unicode = _PyUnicode_New(size); |
|
541 if (!unicode) |
|
542 return NULL; |
|
543 |
|
544 /* Copy the wchar_t data into the new object */ |
|
545 #ifdef HAVE_USABLE_WCHAR_T |
|
546 memcpy(unicode->str, w, size * sizeof(wchar_t)); |
|
547 #else |
|
548 { |
|
549 register Py_UNICODE *u; |
|
550 register Py_ssize_t i; |
|
551 u = PyUnicode_AS_UNICODE(unicode); |
|
552 for (i = size; i > 0; i--) |
|
553 *u++ = *w++; |
|
554 } |
|
555 #endif |
|
556 |
|
557 return (PyObject *)unicode; |
|
558 } |
|
559 |
|
560 static void |
|
561 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c) |
|
562 { |
|
563 *fmt++ = '%'; |
|
564 if (width) { |
|
565 if (zeropad) |
|
566 *fmt++ = '0'; |
|
567 fmt += sprintf(fmt, "%d", width); |
|
568 } |
|
569 if (precision) |
|
570 fmt += sprintf(fmt, ".%d", precision); |
|
571 if (longflag) |
|
572 *fmt++ = 'l'; |
|
573 else if (size_tflag) { |
|
574 char *f = PY_FORMAT_SIZE_T; |
|
575 while (*f) |
|
576 *fmt++ = *f++; |
|
577 } |
|
578 *fmt++ = c; |
|
579 *fmt = '\0'; |
|
580 } |
|
581 |
|
582 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;} |
|
583 |
|
584 PyObject * |
|
585 PyUnicode_FromFormatV(const char *format, va_list vargs) |
|
586 { |
|
587 va_list count; |
|
588 Py_ssize_t callcount = 0; |
|
589 PyObject **callresults = NULL; |
|
590 PyObject **callresult = NULL; |
|
591 Py_ssize_t n = 0; |
|
592 int width = 0; |
|
593 int precision = 0; |
|
594 int zeropad; |
|
595 const char* f; |
|
596 Py_UNICODE *s; |
|
597 PyObject *string; |
|
598 /* used by sprintf */ |
|
599 char buffer[21]; |
|
600 /* use abuffer instead of buffer, if we need more space |
|
601 * (which can happen if there's a format specifier with width). */ |
|
602 char *abuffer = NULL; |
|
603 char *realbuffer; |
|
604 Py_ssize_t abuffersize = 0; |
|
605 char fmt[60]; /* should be enough for %0width.precisionld */ |
|
606 const char *copy; |
|
607 |
|
608 #ifdef VA_LIST_IS_ARRAY |
|
609 Py_MEMCPY(count, vargs, sizeof(va_list)); |
|
610 #else |
|
611 #ifdef __va_copy |
|
612 __va_copy(count, vargs); |
|
613 #else |
|
614 count = vargs; |
|
615 #endif |
|
616 #endif |
|
617 /* step 1: count the number of %S/%R format specifications |
|
618 * (we call PyObject_Str()/PyObject_Repr() for these objects |
|
619 * once during step 3 and put the result in an array) */ |
|
620 for (f = format; *f; f++) { |
|
621 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R')) |
|
622 ++callcount; |
|
623 } |
|
624 /* step 2: allocate memory for the results of |
|
625 * PyObject_Str()/PyObject_Repr() calls */ |
|
626 if (callcount) { |
|
627 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount); |
|
628 if (!callresults) { |
|
629 PyErr_NoMemory(); |
|
630 return NULL; |
|
631 } |
|
632 callresult = callresults; |
|
633 } |
|
634 /* step 3: figure out how large a buffer we need */ |
|
635 for (f = format; *f; f++) { |
|
636 if (*f == '%') { |
|
637 const char* p = f; |
|
638 width = 0; |
|
639 while (isdigit((unsigned)*f)) |
|
640 width = (width*10) + *f++ - '0'; |
|
641 while (*++f && *f != '%' && !isalpha((unsigned)*f)) |
|
642 ; |
|
643 |
|
644 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since |
|
645 * they don't affect the amount of space we reserve. |
|
646 */ |
|
647 if ((*f == 'l' || *f == 'z') && |
|
648 (f[1] == 'd' || f[1] == 'u')) |
|
649 ++f; |
|
650 |
|
651 switch (*f) { |
|
652 case 'c': |
|
653 (void)va_arg(count, int); |
|
654 /* fall through... */ |
|
655 case '%': |
|
656 n++; |
|
657 break; |
|
658 case 'd': case 'u': case 'i': case 'x': |
|
659 (void) va_arg(count, int); |
|
660 /* 20 bytes is enough to hold a 64-bit |
|
661 integer. Decimal takes the most space. |
|
662 This isn't enough for octal. |
|
663 If a width is specified we need more |
|
664 (which we allocate later). */ |
|
665 if (width < 20) |
|
666 width = 20; |
|
667 n += width; |
|
668 if (abuffersize < width) |
|
669 abuffersize = width; |
|
670 break; |
|
671 case 's': |
|
672 { |
|
673 /* UTF-8 */ |
|
674 unsigned char*s; |
|
675 s = va_arg(count, unsigned char*); |
|
676 while (*s) { |
|
677 if (*s < 128) { |
|
678 n++; s++; |
|
679 } else if (*s < 0xc0) { |
|
680 /* invalid UTF-8 */ |
|
681 n++; s++; |
|
682 } else if (*s < 0xc0) { |
|
683 n++; |
|
684 s++; if(!*s)break; |
|
685 s++; |
|
686 } else if (*s < 0xe0) { |
|
687 n++; |
|
688 s++; if(!*s)break; |
|
689 s++; if(!*s)break; |
|
690 s++; |
|
691 } else { |
|
692 #ifdef Py_UNICODE_WIDE |
|
693 n++; |
|
694 #else |
|
695 n+=2; |
|
696 #endif |
|
697 s++; if(!*s)break; |
|
698 s++; if(!*s)break; |
|
699 s++; if(!*s)break; |
|
700 s++; |
|
701 } |
|
702 } |
|
703 break; |
|
704 } |
|
705 case 'U': |
|
706 { |
|
707 PyObject *obj = va_arg(count, PyObject *); |
|
708 assert(obj && PyUnicode_Check(obj)); |
|
709 n += PyUnicode_GET_SIZE(obj); |
|
710 break; |
|
711 } |
|
712 case 'V': |
|
713 { |
|
714 PyObject *obj = va_arg(count, PyObject *); |
|
715 const char *str = va_arg(count, const char *); |
|
716 assert(obj || str); |
|
717 assert(!obj || PyUnicode_Check(obj)); |
|
718 if (obj) |
|
719 n += PyUnicode_GET_SIZE(obj); |
|
720 else |
|
721 n += strlen(str); |
|
722 break; |
|
723 } |
|
724 case 'S': |
|
725 { |
|
726 PyObject *obj = va_arg(count, PyObject *); |
|
727 PyObject *str; |
|
728 assert(obj); |
|
729 str = PyObject_Str(obj); |
|
730 if (!str) |
|
731 goto fail; |
|
732 n += PyUnicode_GET_SIZE(str); |
|
733 /* Remember the str and switch to the next slot */ |
|
734 *callresult++ = str; |
|
735 break; |
|
736 } |
|
737 case 'R': |
|
738 { |
|
739 PyObject *obj = va_arg(count, PyObject *); |
|
740 PyObject *repr; |
|
741 assert(obj); |
|
742 repr = PyObject_Repr(obj); |
|
743 if (!repr) |
|
744 goto fail; |
|
745 n += PyUnicode_GET_SIZE(repr); |
|
746 /* Remember the repr and switch to the next slot */ |
|
747 *callresult++ = repr; |
|
748 break; |
|
749 } |
|
750 case 'p': |
|
751 (void) va_arg(count, int); |
|
752 /* maximum 64-bit pointer representation: |
|
753 * 0xffffffffffffffff |
|
754 * so 19 characters is enough. |
|
755 * XXX I count 18 -- what's the extra for? |
|
756 */ |
|
757 n += 19; |
|
758 break; |
|
759 default: |
|
760 /* if we stumble upon an unknown |
|
761 formatting code, copy the rest of |
|
762 the format string to the output |
|
763 string. (we cannot just skip the |
|
764 code, since there's no way to know |
|
765 what's in the argument list) */ |
|
766 n += strlen(p); |
|
767 goto expand; |
|
768 } |
|
769 } else |
|
770 n++; |
|
771 } |
|
772 expand: |
|
773 if (abuffersize > 20) { |
|
774 abuffer = PyObject_Malloc(abuffersize); |
|
775 if (!abuffer) { |
|
776 PyErr_NoMemory(); |
|
777 goto fail; |
|
778 } |
|
779 realbuffer = abuffer; |
|
780 } |
|
781 else |
|
782 realbuffer = buffer; |
|
783 /* step 4: fill the buffer */ |
|
784 /* Since we've analyzed how much space we need for the worst case, |
|
785 we don't have to resize the string. |
|
786 There can be no errors beyond this point. */ |
|
787 string = PyUnicode_FromUnicode(NULL, n); |
|
788 if (!string) |
|
789 goto fail; |
|
790 |
|
791 s = PyUnicode_AS_UNICODE(string); |
|
792 callresult = callresults; |
|
793 |
|
794 for (f = format; *f; f++) { |
|
795 if (*f == '%') { |
|
796 const char* p = f++; |
|
797 int longflag = 0; |
|
798 int size_tflag = 0; |
|
799 zeropad = (*f == '0'); |
|
800 /* parse the width.precision part */ |
|
801 width = 0; |
|
802 while (isdigit((unsigned)*f)) |
|
803 width = (width*10) + *f++ - '0'; |
|
804 precision = 0; |
|
805 if (*f == '.') { |
|
806 f++; |
|
807 while (isdigit((unsigned)*f)) |
|
808 precision = (precision*10) + *f++ - '0'; |
|
809 } |
|
810 /* handle the long flag, but only for %ld and %lu. |
|
811 others can be added when necessary. */ |
|
812 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) { |
|
813 longflag = 1; |
|
814 ++f; |
|
815 } |
|
816 /* handle the size_t flag. */ |
|
817 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) { |
|
818 size_tflag = 1; |
|
819 ++f; |
|
820 } |
|
821 |
|
822 switch (*f) { |
|
823 case 'c': |
|
824 *s++ = va_arg(vargs, int); |
|
825 break; |
|
826 case 'd': |
|
827 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd'); |
|
828 if (longflag) |
|
829 sprintf(realbuffer, fmt, va_arg(vargs, long)); |
|
830 else if (size_tflag) |
|
831 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t)); |
|
832 else |
|
833 sprintf(realbuffer, fmt, va_arg(vargs, int)); |
|
834 appendstring(realbuffer); |
|
835 break; |
|
836 case 'u': |
|
837 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u'); |
|
838 if (longflag) |
|
839 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long)); |
|
840 else if (size_tflag) |
|
841 sprintf(realbuffer, fmt, va_arg(vargs, size_t)); |
|
842 else |
|
843 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int)); |
|
844 appendstring(realbuffer); |
|
845 break; |
|
846 case 'i': |
|
847 makefmt(fmt, 0, 0, zeropad, width, precision, 'i'); |
|
848 sprintf(realbuffer, fmt, va_arg(vargs, int)); |
|
849 appendstring(realbuffer); |
|
850 break; |
|
851 case 'x': |
|
852 makefmt(fmt, 0, 0, zeropad, width, precision, 'x'); |
|
853 sprintf(realbuffer, fmt, va_arg(vargs, int)); |
|
854 appendstring(realbuffer); |
|
855 break; |
|
856 case 's': |
|
857 { |
|
858 /* Parameter must be UTF-8 encoded. |
|
859 In case of encoding errors, use |
|
860 the replacement character. */ |
|
861 PyObject *u; |
|
862 p = va_arg(vargs, char*); |
|
863 u = PyUnicode_DecodeUTF8(p, strlen(p), |
|
864 "replace"); |
|
865 if (!u) |
|
866 goto fail; |
|
867 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u), |
|
868 PyUnicode_GET_SIZE(u)); |
|
869 s += PyUnicode_GET_SIZE(u); |
|
870 Py_DECREF(u); |
|
871 break; |
|
872 } |
|
873 case 'U': |
|
874 { |
|
875 PyObject *obj = va_arg(vargs, PyObject *); |
|
876 Py_ssize_t size = PyUnicode_GET_SIZE(obj); |
|
877 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); |
|
878 s += size; |
|
879 break; |
|
880 } |
|
881 case 'V': |
|
882 { |
|
883 PyObject *obj = va_arg(vargs, PyObject *); |
|
884 const char *str = va_arg(vargs, const char *); |
|
885 if (obj) { |
|
886 Py_ssize_t size = PyUnicode_GET_SIZE(obj); |
|
887 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size); |
|
888 s += size; |
|
889 } else { |
|
890 appendstring(str); |
|
891 } |
|
892 break; |
|
893 } |
|
894 case 'S': |
|
895 case 'R': |
|
896 { |
|
897 Py_UNICODE *ucopy; |
|
898 Py_ssize_t usize; |
|
899 Py_ssize_t upos; |
|
900 /* unused, since we already have the result */ |
|
901 (void) va_arg(vargs, PyObject *); |
|
902 ucopy = PyUnicode_AS_UNICODE(*callresult); |
|
903 usize = PyUnicode_GET_SIZE(*callresult); |
|
904 for (upos = 0; upos<usize;) |
|
905 *s++ = ucopy[upos++]; |
|
906 /* We're done with the unicode()/repr() => forget it */ |
|
907 Py_DECREF(*callresult); |
|
908 /* switch to next unicode()/repr() result */ |
|
909 ++callresult; |
|
910 break; |
|
911 } |
|
912 case 'p': |
|
913 sprintf(buffer, "%p", va_arg(vargs, void*)); |
|
914 /* %p is ill-defined: ensure leading 0x. */ |
|
915 if (buffer[1] == 'X') |
|
916 buffer[1] = 'x'; |
|
917 else if (buffer[1] != 'x') { |
|
918 memmove(buffer+2, buffer, strlen(buffer)+1); |
|
919 buffer[0] = '0'; |
|
920 buffer[1] = 'x'; |
|
921 } |
|
922 appendstring(buffer); |
|
923 break; |
|
924 case '%': |
|
925 *s++ = '%'; |
|
926 break; |
|
927 default: |
|
928 appendstring(p); |
|
929 goto end; |
|
930 } |
|
931 } else |
|
932 *s++ = *f; |
|
933 } |
|
934 |
|
935 end: |
|
936 if (callresults) |
|
937 PyObject_Free(callresults); |
|
938 if (abuffer) |
|
939 PyObject_Free(abuffer); |
|
940 _PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string)); |
|
941 return string; |
|
942 fail: |
|
943 if (callresults) { |
|
944 PyObject **callresult2 = callresults; |
|
945 while (callresult2 < callresult) { |
|
946 Py_DECREF(*callresult2); |
|
947 ++callresult2; |
|
948 } |
|
949 PyObject_Free(callresults); |
|
950 } |
|
951 if (abuffer) |
|
952 PyObject_Free(abuffer); |
|
953 return NULL; |
|
954 } |
|
955 |
|
956 #undef appendstring |
|
957 |
|
958 PyObject * |
|
959 PyUnicode_FromFormat(const char *format, ...) |
|
960 { |
|
961 PyObject* ret; |
|
962 va_list vargs; |
|
963 |
|
964 #ifdef HAVE_STDARG_PROTOTYPES |
|
965 va_start(vargs, format); |
|
966 #else |
|
967 va_start(vargs); |
|
968 #endif |
|
969 ret = PyUnicode_FromFormatV(format, vargs); |
|
970 va_end(vargs); |
|
971 return ret; |
|
972 } |
|
973 |
|
974 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode, |
|
975 wchar_t *w, |
|
976 Py_ssize_t size) |
|
977 { |
|
978 if (unicode == NULL) { |
|
979 PyErr_BadInternalCall(); |
|
980 return -1; |
|
981 } |
|
982 |
|
983 /* If possible, try to copy the 0-termination as well */ |
|
984 if (size > PyUnicode_GET_SIZE(unicode)) |
|
985 size = PyUnicode_GET_SIZE(unicode) + 1; |
|
986 |
|
987 #ifdef HAVE_USABLE_WCHAR_T |
|
988 memcpy(w, unicode->str, size * sizeof(wchar_t)); |
|
989 #else |
|
990 { |
|
991 register Py_UNICODE *u; |
|
992 register Py_ssize_t i; |
|
993 u = PyUnicode_AS_UNICODE(unicode); |
|
994 for (i = size; i > 0; i--) |
|
995 *w++ = *u++; |
|
996 } |
|
997 #endif |
|
998 |
|
999 if (size > PyUnicode_GET_SIZE(unicode)) |
|
1000 return PyUnicode_GET_SIZE(unicode); |
|
1001 else |
|
1002 return size; |
|
1003 } |
|
1004 |
|
1005 #endif |
|
1006 |
|
1007 PyObject *PyUnicode_FromOrdinal(int ordinal) |
|
1008 { |
|
1009 Py_UNICODE s[1]; |
|
1010 |
|
1011 #ifdef Py_UNICODE_WIDE |
|
1012 if (ordinal < 0 || ordinal > 0x10ffff) { |
|
1013 PyErr_SetString(PyExc_ValueError, |
|
1014 "unichr() arg not in range(0x110000) " |
|
1015 "(wide Python build)"); |
|
1016 return NULL; |
|
1017 } |
|
1018 #else |
|
1019 if (ordinal < 0 || ordinal > 0xffff) { |
|
1020 PyErr_SetString(PyExc_ValueError, |
|
1021 "unichr() arg not in range(0x10000) " |
|
1022 "(narrow Python build)"); |
|
1023 return NULL; |
|
1024 } |
|
1025 #endif |
|
1026 |
|
1027 s[0] = (Py_UNICODE)ordinal; |
|
1028 return PyUnicode_FromUnicode(s, 1); |
|
1029 } |
|
1030 |
|
1031 PyObject *PyUnicode_FromObject(register PyObject *obj) |
|
1032 { |
|
1033 /* XXX Perhaps we should make this API an alias of |
|
1034 PyObject_Unicode() instead ?! */ |
|
1035 if (PyUnicode_CheckExact(obj)) { |
|
1036 Py_INCREF(obj); |
|
1037 return obj; |
|
1038 } |
|
1039 if (PyUnicode_Check(obj)) { |
|
1040 /* For a Unicode subtype that's not a Unicode object, |
|
1041 return a true Unicode object with the same data. */ |
|
1042 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), |
|
1043 PyUnicode_GET_SIZE(obj)); |
|
1044 } |
|
1045 return PyUnicode_FromEncodedObject(obj, NULL, "strict"); |
|
1046 } |
|
1047 |
|
1048 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, |
|
1049 const char *encoding, |
|
1050 const char *errors) |
|
1051 { |
|
1052 const char *s = NULL; |
|
1053 Py_ssize_t len; |
|
1054 PyObject *v; |
|
1055 |
|
1056 if (obj == NULL) { |
|
1057 PyErr_BadInternalCall(); |
|
1058 return NULL; |
|
1059 } |
|
1060 |
|
1061 #if 0 |
|
1062 /* For b/w compatibility we also accept Unicode objects provided |
|
1063 that no encodings is given and then redirect to |
|
1064 PyObject_Unicode() which then applies the additional logic for |
|
1065 Unicode subclasses. |
|
1066 |
|
1067 NOTE: This API should really only be used for object which |
|
1068 represent *encoded* Unicode ! |
|
1069 |
|
1070 */ |
|
1071 if (PyUnicode_Check(obj)) { |
|
1072 if (encoding) { |
|
1073 PyErr_SetString(PyExc_TypeError, |
|
1074 "decoding Unicode is not supported"); |
|
1075 return NULL; |
|
1076 } |
|
1077 return PyObject_Unicode(obj); |
|
1078 } |
|
1079 #else |
|
1080 if (PyUnicode_Check(obj)) { |
|
1081 PyErr_SetString(PyExc_TypeError, |
|
1082 "decoding Unicode is not supported"); |
|
1083 return NULL; |
|
1084 } |
|
1085 #endif |
|
1086 |
|
1087 /* Coerce object */ |
|
1088 if (PyString_Check(obj)) { |
|
1089 s = PyString_AS_STRING(obj); |
|
1090 len = PyString_GET_SIZE(obj); |
|
1091 } |
|
1092 else if (PyByteArray_Check(obj)) { |
|
1093 /* Python 2.x specific */ |
|
1094 PyErr_Format(PyExc_TypeError, |
|
1095 "decoding bytearray is not supported"); |
|
1096 return NULL; |
|
1097 } |
|
1098 else if (PyObject_AsCharBuffer(obj, &s, &len)) { |
|
1099 /* Overwrite the error message with something more useful in |
|
1100 case of a TypeError. */ |
|
1101 if (PyErr_ExceptionMatches(PyExc_TypeError)) |
|
1102 PyErr_Format(PyExc_TypeError, |
|
1103 "coercing to Unicode: need string or buffer, " |
|
1104 "%.80s found", |
|
1105 Py_TYPE(obj)->tp_name); |
|
1106 goto onError; |
|
1107 } |
|
1108 |
|
1109 /* Convert to Unicode */ |
|
1110 if (len == 0) { |
|
1111 Py_INCREF(unicode_empty); |
|
1112 v = (PyObject *)unicode_empty; |
|
1113 } |
|
1114 else |
|
1115 v = PyUnicode_Decode(s, len, encoding, errors); |
|
1116 |
|
1117 return v; |
|
1118 |
|
1119 onError: |
|
1120 return NULL; |
|
1121 } |
|
1122 |
|
1123 PyObject *PyUnicode_Decode(const char *s, |
|
1124 Py_ssize_t size, |
|
1125 const char *encoding, |
|
1126 const char *errors) |
|
1127 { |
|
1128 PyObject *buffer = NULL, *unicode; |
|
1129 |
|
1130 if (encoding == NULL) |
|
1131 encoding = PyUnicode_GetDefaultEncoding(); |
|
1132 |
|
1133 /* Shortcuts for common default encodings */ |
|
1134 if (strcmp(encoding, "utf-8") == 0) |
|
1135 return PyUnicode_DecodeUTF8(s, size, errors); |
|
1136 else if (strcmp(encoding, "latin-1") == 0) |
|
1137 return PyUnicode_DecodeLatin1(s, size, errors); |
|
1138 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
|
1139 else if (strcmp(encoding, "mbcs") == 0) |
|
1140 return PyUnicode_DecodeMBCS(s, size, errors); |
|
1141 #endif |
|
1142 else if (strcmp(encoding, "ascii") == 0) |
|
1143 return PyUnicode_DecodeASCII(s, size, errors); |
|
1144 |
|
1145 /* Decode via the codec registry */ |
|
1146 buffer = PyBuffer_FromMemory((void *)s, size); |
|
1147 if (buffer == NULL) |
|
1148 goto onError; |
|
1149 unicode = PyCodec_Decode(buffer, encoding, errors); |
|
1150 if (unicode == NULL) |
|
1151 goto onError; |
|
1152 if (!PyUnicode_Check(unicode)) { |
|
1153 PyErr_Format(PyExc_TypeError, |
|
1154 "decoder did not return an unicode object (type=%.400s)", |
|
1155 Py_TYPE(unicode)->tp_name); |
|
1156 Py_DECREF(unicode); |
|
1157 goto onError; |
|
1158 } |
|
1159 Py_DECREF(buffer); |
|
1160 return unicode; |
|
1161 |
|
1162 onError: |
|
1163 Py_XDECREF(buffer); |
|
1164 return NULL; |
|
1165 } |
|
1166 |
|
1167 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, |
|
1168 const char *encoding, |
|
1169 const char *errors) |
|
1170 { |
|
1171 PyObject *v; |
|
1172 |
|
1173 if (!PyUnicode_Check(unicode)) { |
|
1174 PyErr_BadArgument(); |
|
1175 goto onError; |
|
1176 } |
|
1177 |
|
1178 if (encoding == NULL) |
|
1179 encoding = PyUnicode_GetDefaultEncoding(); |
|
1180 |
|
1181 /* Decode via the codec registry */ |
|
1182 v = PyCodec_Decode(unicode, encoding, errors); |
|
1183 if (v == NULL) |
|
1184 goto onError; |
|
1185 return v; |
|
1186 |
|
1187 onError: |
|
1188 return NULL; |
|
1189 } |
|
1190 |
|
1191 PyObject *PyUnicode_Encode(const Py_UNICODE *s, |
|
1192 Py_ssize_t size, |
|
1193 const char *encoding, |
|
1194 const char *errors) |
|
1195 { |
|
1196 PyObject *v, *unicode; |
|
1197 |
|
1198 unicode = PyUnicode_FromUnicode(s, size); |
|
1199 if (unicode == NULL) |
|
1200 return NULL; |
|
1201 v = PyUnicode_AsEncodedString(unicode, encoding, errors); |
|
1202 Py_DECREF(unicode); |
|
1203 return v; |
|
1204 } |
|
1205 |
|
1206 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, |
|
1207 const char *encoding, |
|
1208 const char *errors) |
|
1209 { |
|
1210 PyObject *v; |
|
1211 |
|
1212 if (!PyUnicode_Check(unicode)) { |
|
1213 PyErr_BadArgument(); |
|
1214 goto onError; |
|
1215 } |
|
1216 |
|
1217 if (encoding == NULL) |
|
1218 encoding = PyUnicode_GetDefaultEncoding(); |
|
1219 |
|
1220 /* Encode via the codec registry */ |
|
1221 v = PyCodec_Encode(unicode, encoding, errors); |
|
1222 if (v == NULL) |
|
1223 goto onError; |
|
1224 return v; |
|
1225 |
|
1226 onError: |
|
1227 return NULL; |
|
1228 } |
|
1229 |
|
1230 PyObject *PyUnicode_AsEncodedString(PyObject *unicode, |
|
1231 const char *encoding, |
|
1232 const char *errors) |
|
1233 { |
|
1234 PyObject *v; |
|
1235 |
|
1236 if (!PyUnicode_Check(unicode)) { |
|
1237 PyErr_BadArgument(); |
|
1238 goto onError; |
|
1239 } |
|
1240 |
|
1241 if (encoding == NULL) |
|
1242 encoding = PyUnicode_GetDefaultEncoding(); |
|
1243 |
|
1244 /* Shortcuts for common default encodings */ |
|
1245 if (errors == NULL) { |
|
1246 if (strcmp(encoding, "utf-8") == 0) |
|
1247 return PyUnicode_AsUTF8String(unicode); |
|
1248 else if (strcmp(encoding, "latin-1") == 0) |
|
1249 return PyUnicode_AsLatin1String(unicode); |
|
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
|
1251 else if (strcmp(encoding, "mbcs") == 0) |
|
1252 return PyUnicode_AsMBCSString(unicode); |
|
1253 #endif |
|
1254 else if (strcmp(encoding, "ascii") == 0) |
|
1255 return PyUnicode_AsASCIIString(unicode); |
|
1256 } |
|
1257 |
|
1258 /* Encode via the codec registry */ |
|
1259 v = PyCodec_Encode(unicode, encoding, errors); |
|
1260 if (v == NULL) |
|
1261 goto onError; |
|
1262 if (!PyString_Check(v)) { |
|
1263 PyErr_Format(PyExc_TypeError, |
|
1264 "encoder did not return a string object (type=%.400s)", |
|
1265 Py_TYPE(v)->tp_name); |
|
1266 Py_DECREF(v); |
|
1267 goto onError; |
|
1268 } |
|
1269 return v; |
|
1270 |
|
1271 onError: |
|
1272 return NULL; |
|
1273 } |
|
1274 |
|
1275 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, |
|
1276 const char *errors) |
|
1277 { |
|
1278 PyObject *v = ((PyUnicodeObject *)unicode)->defenc; |
|
1279 |
|
1280 if (v) |
|
1281 return v; |
|
1282 v = PyUnicode_AsEncodedString(unicode, NULL, errors); |
|
1283 if (v && errors == NULL) |
|
1284 ((PyUnicodeObject *)unicode)->defenc = v; |
|
1285 return v; |
|
1286 } |
|
1287 |
|
1288 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode) |
|
1289 { |
|
1290 if (!PyUnicode_Check(unicode)) { |
|
1291 PyErr_BadArgument(); |
|
1292 goto onError; |
|
1293 } |
|
1294 return PyUnicode_AS_UNICODE(unicode); |
|
1295 |
|
1296 onError: |
|
1297 return NULL; |
|
1298 } |
|
1299 |
|
1300 Py_ssize_t PyUnicode_GetSize(PyObject *unicode) |
|
1301 { |
|
1302 if (!PyUnicode_Check(unicode)) { |
|
1303 PyErr_BadArgument(); |
|
1304 goto onError; |
|
1305 } |
|
1306 return PyUnicode_GET_SIZE(unicode); |
|
1307 |
|
1308 onError: |
|
1309 return -1; |
|
1310 } |
|
1311 |
|
1312 const char *PyUnicode_GetDefaultEncoding(void) |
|
1313 { |
|
1314 return unicode_default_encoding; |
|
1315 } |
|
1316 |
|
1317 int PyUnicode_SetDefaultEncoding(const char *encoding) |
|
1318 { |
|
1319 PyObject *v; |
|
1320 |
|
1321 /* Make sure the encoding is valid. As side effect, this also |
|
1322 loads the encoding into the codec registry cache. */ |
|
1323 v = _PyCodec_Lookup(encoding); |
|
1324 if (v == NULL) |
|
1325 goto onError; |
|
1326 Py_DECREF(v); |
|
1327 strncpy(unicode_default_encoding, |
|
1328 encoding, |
|
1329 sizeof(unicode_default_encoding)); |
|
1330 return 0; |
|
1331 |
|
1332 onError: |
|
1333 return -1; |
|
1334 } |
|
1335 |
|
1336 /* error handling callback helper: |
|
1337 build arguments, call the callback and check the arguments, |
|
1338 if no exception occurred, copy the replacement to the output |
|
1339 and adjust various state variables. |
|
1340 return 0 on success, -1 on error |
|
1341 */ |
|
1342 |
|
1343 static |
|
1344 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, |
|
1345 const char *encoding, const char *reason, |
|
1346 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos, |
|
1347 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, |
|
1348 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) |
|
1349 { |
|
1350 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple"; |
|
1351 |
|
1352 PyObject *restuple = NULL; |
|
1353 PyObject *repunicode = NULL; |
|
1354 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); |
|
1355 Py_ssize_t requiredsize; |
|
1356 Py_ssize_t newpos; |
|
1357 Py_UNICODE *repptr; |
|
1358 Py_ssize_t repsize; |
|
1359 int res = -1; |
|
1360 |
|
1361 if (*errorHandler == NULL) { |
|
1362 *errorHandler = PyCodec_LookupError(errors); |
|
1363 if (*errorHandler == NULL) |
|
1364 goto onError; |
|
1365 } |
|
1366 |
|
1367 if (*exceptionObject == NULL) { |
|
1368 *exceptionObject = PyUnicodeDecodeError_Create( |
|
1369 encoding, input, insize, *startinpos, *endinpos, reason); |
|
1370 if (*exceptionObject == NULL) |
|
1371 goto onError; |
|
1372 } |
|
1373 else { |
|
1374 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos)) |
|
1375 goto onError; |
|
1376 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos)) |
|
1377 goto onError; |
|
1378 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) |
|
1379 goto onError; |
|
1380 } |
|
1381 |
|
1382 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL); |
|
1383 if (restuple == NULL) |
|
1384 goto onError; |
|
1385 if (!PyTuple_Check(restuple)) { |
|
1386 PyErr_Format(PyExc_TypeError, &argparse[4]); |
|
1387 goto onError; |
|
1388 } |
|
1389 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) |
|
1390 goto onError; |
|
1391 if (newpos<0) |
|
1392 newpos = insize+newpos; |
|
1393 if (newpos<0 || newpos>insize) { |
|
1394 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos); |
|
1395 goto onError; |
|
1396 } |
|
1397 |
|
1398 /* need more space? (at least enough for what we |
|
1399 have+the replacement+the rest of the string (starting |
|
1400 at the new input position), so we won't have to check space |
|
1401 when there are no errors in the rest of the string) */ |
|
1402 repptr = PyUnicode_AS_UNICODE(repunicode); |
|
1403 repsize = PyUnicode_GET_SIZE(repunicode); |
|
1404 requiredsize = *outpos + repsize + insize-newpos; |
|
1405 if (requiredsize > outsize) { |
|
1406 if (requiredsize<2*outsize) |
|
1407 requiredsize = 2*outsize; |
|
1408 if (PyUnicode_Resize(output, requiredsize) < 0) |
|
1409 goto onError; |
|
1410 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; |
|
1411 } |
|
1412 *endinpos = newpos; |
|
1413 *inptr = input + newpos; |
|
1414 Py_UNICODE_COPY(*outptr, repptr, repsize); |
|
1415 *outptr += repsize; |
|
1416 *outpos += repsize; |
|
1417 /* we made it! */ |
|
1418 res = 0; |
|
1419 |
|
1420 onError: |
|
1421 Py_XDECREF(restuple); |
|
1422 return res; |
|
1423 } |
|
1424 |
|
1425 /* --- UTF-7 Codec -------------------------------------------------------- */ |
|
1426 |
|
1427 /* see RFC2152 for details */ |
|
1428 |
|
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character code.  It indicates whether a character is
   special, i.e. cannot be directly encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00-0x0f: TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
|
1447 |
|
/* SPECIAL(c, encodeO, encodeWS): true when c must go through a base64
   shift sequence.  Code points above 127 are always special; for ASCII
   the utf7_special table decides, with the whitespace and Set O classes
   being special only when the corresponding flag is set.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     (encodeWS && (utf7_special[(c)] == 2)) ||                  \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is one of the 64 base64 digits.  Explicit
   ASCII ranges are used instead of isalnum() so the result neither
   depends on the current locale nor passes out-of-range values to a
   <ctype.h> function. */
#define B64CHAR(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)                                                 \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?           \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   write the topmost six as a base64 digit through out and reduce bits
   accordingly.  out and bits are modified in place. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one code unit.

   A code unit in the range 0xDC00..0xDFFF is not stored: it is treated
   as part of a surrogate pair, which can't be represented here, so the
   macro sets `surrogate`, sets `errmsg` and jumps to the `utf7Error`
   label.  The code unit following such an error is dropped silently
   (an error has already been generated for the pair).

   The expansion site must provide the variable `errmsg` and the label
   `utf7Error`. */
#define DECODE(out, ch, bits, surrogate)                                    \
    do {                                                                    \
        while ((bits) >= 16) {                                              \
            Py_UNICODE outCh =                                              \
                (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff);              \
            (bits) -= 16;                                                   \
            if (surrogate) {                                                \
                surrogate = 0;                                              \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
                surrogate = 1;                                              \
                errmsg = "code pairs are not supported";                    \
                goto utf7Error;                                             \
            } else {                                                        \
                *(out)++ = outCh;                                           \
            }                                                               \
        }                                                                   \
    } while (0)
|
1490 |
|
1491 PyObject *PyUnicode_DecodeUTF7(const char *s, |
|
1492 Py_ssize_t size, |
|
1493 const char *errors) |
|
1494 { |
|
1495 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL); |
|
1496 } |
|
1497 |
|
1498 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s, |
|
1499 Py_ssize_t size, |
|
1500 const char *errors, |
|
1501 Py_ssize_t *consumed) |
|
1502 { |
|
1503 const char *starts = s; |
|
1504 Py_ssize_t startinpos; |
|
1505 Py_ssize_t endinpos; |
|
1506 Py_ssize_t outpos; |
|
1507 const char *e; |
|
1508 PyUnicodeObject *unicode; |
|
1509 Py_UNICODE *p; |
|
1510 const char *errmsg = ""; |
|
1511 int inShift = 0; |
|
1512 unsigned int bitsleft = 0; |
|
1513 unsigned long charsleft = 0; |
|
1514 int surrogate = 0; |
|
1515 PyObject *errorHandler = NULL; |
|
1516 PyObject *exc = NULL; |
|
1517 |
|
1518 unicode = _PyUnicode_New(size); |
|
1519 if (!unicode) |
|
1520 return NULL; |
|
1521 if (size == 0) { |
|
1522 if (consumed) |
|
1523 *consumed = 0; |
|
1524 return (PyObject *)unicode; |
|
1525 } |
|
1526 |
|
1527 p = unicode->str; |
|
1528 e = s + size; |
|
1529 |
|
1530 while (s < e) { |
|
1531 Py_UNICODE ch; |
|
1532 restart: |
|
1533 ch = (unsigned char) *s; |
|
1534 |
|
1535 if (inShift) { |
|
1536 if ((ch == '-') || !B64CHAR(ch)) { |
|
1537 inShift = 0; |
|
1538 s++; |
|
1539 |
|
1540 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); |
|
1541 if (bitsleft >= 6) { |
|
1542 /* The shift sequence has a partial character in it. If |
|
1543 bitsleft < 6 then we could just classify it as padding |
|
1544 but that is not the case here */ |
|
1545 |
|
1546 errmsg = "partial character in shift sequence"; |
|
1547 goto utf7Error; |
|
1548 } |
|
1549 /* According to RFC2152 the remaining bits should be zero. We |
|
1550 choose to signal an error/insert a replacement character |
|
1551 here so indicate the potential of a misencoded character. */ |
|
1552 |
|
1553 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ |
|
1554 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { |
|
1555 errmsg = "non-zero padding bits in shift sequence"; |
|
1556 goto utf7Error; |
|
1557 } |
|
1558 |
|
1559 if (ch == '-') { |
|
1560 if ((s < e) && (*(s) == '-')) { |
|
1561 *p++ = '-'; |
|
1562 inShift = 1; |
|
1563 } |
|
1564 } else if (SPECIAL(ch,0,0)) { |
|
1565 errmsg = "unexpected special character"; |
|
1566 goto utf7Error; |
|
1567 } else { |
|
1568 *p++ = ch; |
|
1569 } |
|
1570 } else { |
|
1571 charsleft = (charsleft << 6) | UB64(ch); |
|
1572 bitsleft += 6; |
|
1573 s++; |
|
1574 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); |
|
1575 } |
|
1576 } |
|
1577 else if ( ch == '+' ) { |
|
1578 startinpos = s-starts; |
|
1579 s++; |
|
1580 if (s < e && *s == '-') { |
|
1581 s++; |
|
1582 *p++ = '+'; |
|
1583 } else |
|
1584 { |
|
1585 inShift = 1; |
|
1586 bitsleft = 0; |
|
1587 } |
|
1588 } |
|
1589 else if (SPECIAL(ch,0,0)) { |
|
1590 startinpos = s-starts; |
|
1591 errmsg = "unexpected special character"; |
|
1592 s++; |
|
1593 goto utf7Error; |
|
1594 } |
|
1595 else { |
|
1596 *p++ = ch; |
|
1597 s++; |
|
1598 } |
|
1599 continue; |
|
1600 utf7Error: |
|
1601 outpos = p-PyUnicode_AS_UNICODE(unicode); |
|
1602 endinpos = s-starts; |
|
1603 if (unicode_decode_call_errorhandler( |
|
1604 errors, &errorHandler, |
|
1605 "utf7", errmsg, |
|
1606 starts, size, &startinpos, &endinpos, &exc, &s, |
|
1607 (PyObject **)&unicode, &outpos, &p)) |
|
1608 goto onError; |
|
1609 } |
|
1610 |
|
1611 if (inShift && !consumed) { |
|
1612 outpos = p-PyUnicode_AS_UNICODE(unicode); |
|
1613 endinpos = size; |
|
1614 if (unicode_decode_call_errorhandler( |
|
1615 errors, &errorHandler, |
|
1616 "utf7", "unterminated shift sequence", |
|
1617 starts, size, &startinpos, &endinpos, &exc, &s, |
|
1618 (PyObject **)&unicode, &outpos, &p)) |
|
1619 goto onError; |
|
1620 if (s < e) |
|
1621 goto restart; |
|
1622 } |
|
1623 if (consumed) { |
|
1624 if(inShift) |
|
1625 *consumed = startinpos; |
|
1626 else |
|
1627 *consumed = s-starts; |
|
1628 } |
|
1629 |
|
1630 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) |
|
1631 goto onError; |
|
1632 |
|
1633 Py_XDECREF(errorHandler); |
|
1634 Py_XDECREF(exc); |
|
1635 return (PyObject *)unicode; |
|
1636 |
|
1637 onError: |
|
1638 Py_XDECREF(errorHandler); |
|
1639 Py_XDECREF(exc); |
|
1640 Py_DECREF(unicode); |
|
1641 return NULL; |
|
1642 } |
|
1643 |
|
1644 |
|
1645 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, |
|
1646 Py_ssize_t size, |
|
1647 int encodeSetO, |
|
1648 int encodeWhiteSpace, |
|
1649 const char *errors) |
|
1650 { |
|
1651 PyObject *v; |
|
1652 /* It might be possible to tighten this worst case */ |
|
1653 Py_ssize_t cbAllocated = 5 * size; |
|
1654 int inShift = 0; |
|
1655 Py_ssize_t i = 0; |
|
1656 unsigned int bitsleft = 0; |
|
1657 unsigned long charsleft = 0; |
|
1658 char * out; |
|
1659 char * start; |
|
1660 |
|
1661 if (cbAllocated / 5 != size) |
|
1662 return PyErr_NoMemory(); |
|
1663 |
|
1664 if (size == 0) |
|
1665 return PyString_FromStringAndSize(NULL, 0); |
|
1666 |
|
1667 v = PyString_FromStringAndSize(NULL, cbAllocated); |
|
1668 if (v == NULL) |
|
1669 return NULL; |
|
1670 |
|
1671 start = out = PyString_AS_STRING(v); |
|
1672 for (;i < size; ++i) { |
|
1673 Py_UNICODE ch = s[i]; |
|
1674 |
|
1675 if (!inShift) { |
|
1676 if (ch == '+') { |
|
1677 *out++ = '+'; |
|
1678 *out++ = '-'; |
|
1679 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { |
|
1680 charsleft = ch; |
|
1681 bitsleft = 16; |
|
1682 *out++ = '+'; |
|
1683 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); |
|
1684 inShift = bitsleft > 0; |
|
1685 } else { |
|
1686 *out++ = (char) ch; |
|
1687 } |
|
1688 } else { |
|
1689 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { |
|
1690 *out++ = B64(charsleft << (6-bitsleft)); |
|
1691 charsleft = 0; |
|
1692 bitsleft = 0; |
|
1693 /* Characters not in the BASE64 set implicitly unshift the sequence |
|
1694 so no '-' is required, except if the character is itself a '-' */ |
|
1695 if (B64CHAR(ch) || ch == '-') { |
|
1696 *out++ = '-'; |
|
1697 } |
|
1698 inShift = 0; |
|
1699 *out++ = (char) ch; |
|
1700 } else { |
|
1701 bitsleft += 16; |
|
1702 charsleft = (charsleft << 16) | ch; |
|
1703 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); |
|
1704 |
|
1705 /* If the next character is special then we dont' need to terminate |
|
1706 the shift sequence. If the next character is not a BASE64 character |
|
1707 or '-' then the shift sequence will be terminated implicitly and we |
|
1708 don't have to insert a '-'. */ |
|
1709 |
|
1710 if (bitsleft == 0) { |
|
1711 if (i + 1 < size) { |
|
1712 Py_UNICODE ch2 = s[i+1]; |
|
1713 |
|
1714 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { |
|
1715 |
|
1716 } else if (B64CHAR(ch2) || ch2 == '-') { |
|
1717 *out++ = '-'; |
|
1718 inShift = 0; |
|
1719 } else { |
|
1720 inShift = 0; |
|
1721 } |
|
1722 |
|
1723 } |
|
1724 else { |
|
1725 *out++ = '-'; |
|
1726 inShift = 0; |
|
1727 } |
|
1728 } |
|
1729 } |
|
1730 } |
|
1731 } |
|
1732 if (bitsleft) { |
|
1733 *out++= B64(charsleft << (6-bitsleft) ); |
|
1734 *out++ = '-'; |
|
1735 } |
|
1736 |
|
1737 _PyString_Resize(&v, out - start); |
|
1738 return v; |
|
1739 } |
|
1740 |
|
1741 #undef SPECIAL |
|
1742 #undef B64 |
|
1743 #undef B64CHAR |
|
1744 #undef UB64 |
|
1745 #undef ENCODE |
|
1746 #undef DECODE |
|
1747 |
|
1748 /* --- UTF-8 Codec -------------------------------------------------------- */ |
|
1749 |
|
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00-0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xbf: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0-0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0-0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0-0xf7: four, 0xf8-0xfb: five, 0xfc-0xfd: six,
       0xfe-0xff: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
|
1771 |
|
1772 PyObject *PyUnicode_DecodeUTF8(const char *s, |
|
1773 Py_ssize_t size, |
|
1774 const char *errors) |
|
1775 { |
|
1776 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL); |
|
1777 } |
|
1778 |
|
1779 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s, |
|
1780 Py_ssize_t size, |
|
1781 const char *errors, |
|
1782 Py_ssize_t *consumed) |
|
1783 { |
|
1784 const char *starts = s; |
|
1785 int n; |
|
1786 Py_ssize_t startinpos; |
|
1787 Py_ssize_t endinpos; |
|
1788 Py_ssize_t outpos; |
|
1789 const char *e; |
|
1790 PyUnicodeObject *unicode; |
|
1791 Py_UNICODE *p; |
|
1792 const char *errmsg = ""; |
|
1793 PyObject *errorHandler = NULL; |
|
1794 PyObject *exc = NULL; |
|
1795 |
|
1796 /* Note: size will always be longer than the resulting Unicode |
|
1797 character count */ |
|
1798 unicode = _PyUnicode_New(size); |
|
1799 if (!unicode) |
|
1800 return NULL; |
|
1801 if (size == 0) { |
|
1802 if (consumed) |
|
1803 *consumed = 0; |
|
1804 return (PyObject *)unicode; |
|
1805 } |
|
1806 |
|
1807 /* Unpack UTF-8 encoded data */ |
|
1808 p = unicode->str; |
|
1809 e = s + size; |
|
1810 |
|
1811 while (s < e) { |
|
1812 Py_UCS4 ch = (unsigned char)*s; |
|
1813 |
|
1814 if (ch < 0x80) { |
|
1815 *p++ = (Py_UNICODE)ch; |
|
1816 s++; |
|
1817 continue; |
|
1818 } |
|
1819 |
|
1820 n = utf8_code_length[ch]; |
|
1821 |
|
1822 if (s + n > e) { |
|
1823 if (consumed) |
|
1824 break; |
|
1825 else { |
|
1826 errmsg = "unexpected end of data"; |
|
1827 startinpos = s-starts; |
|
1828 endinpos = size; |
|
1829 goto utf8Error; |
|
1830 } |
|
1831 } |
|
1832 |
|
1833 switch (n) { |
|
1834 |
|
1835 case 0: |
|
1836 errmsg = "unexpected code byte"; |
|
1837 startinpos = s-starts; |
|
1838 endinpos = startinpos+1; |
|
1839 goto utf8Error; |
|
1840 |
|
1841 case 1: |
|
1842 errmsg = "internal error"; |
|
1843 startinpos = s-starts; |
|
1844 endinpos = startinpos+1; |
|
1845 goto utf8Error; |
|
1846 |
|
1847 case 2: |
|
1848 if ((s[1] & 0xc0) != 0x80) { |
|
1849 errmsg = "invalid data"; |
|
1850 startinpos = s-starts; |
|
1851 endinpos = startinpos+2; |
|
1852 goto utf8Error; |
|
1853 } |
|
1854 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); |
|
1855 if (ch < 0x80) { |
|
1856 startinpos = s-starts; |
|
1857 endinpos = startinpos+2; |
|
1858 errmsg = "illegal encoding"; |
|
1859 goto utf8Error; |
|
1860 } |
|
1861 else |
|
1862 *p++ = (Py_UNICODE)ch; |
|
1863 break; |
|
1864 |
|
1865 case 3: |
|
1866 if ((s[1] & 0xc0) != 0x80 || |
|
1867 (s[2] & 0xc0) != 0x80) { |
|
1868 errmsg = "invalid data"; |
|
1869 startinpos = s-starts; |
|
1870 endinpos = startinpos+3; |
|
1871 goto utf8Error; |
|
1872 } |
|
1873 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); |
|
1874 if (ch < 0x0800) { |
|
1875 /* Note: UTF-8 encodings of surrogates are considered |
|
1876 legal UTF-8 sequences; |
|
1877 |
|
1878 XXX For wide builds (UCS-4) we should probably try |
|
1879 to recombine the surrogates into a single code |
|
1880 unit. |
|
1881 */ |
|
1882 errmsg = "illegal encoding"; |
|
1883 startinpos = s-starts; |
|
1884 endinpos = startinpos+3; |
|
1885 goto utf8Error; |
|
1886 } |
|
1887 else |
|
1888 *p++ = (Py_UNICODE)ch; |
|
1889 break; |
|
1890 |
|
1891 case 4: |
|
1892 if ((s[1] & 0xc0) != 0x80 || |
|
1893 (s[2] & 0xc0) != 0x80 || |
|
1894 (s[3] & 0xc0) != 0x80) { |
|
1895 errmsg = "invalid data"; |
|
1896 startinpos = s-starts; |
|
1897 endinpos = startinpos+4; |
|
1898 goto utf8Error; |
|
1899 } |
|
1900 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + |
|
1901 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); |
|
1902 /* validate and convert to UTF-16 */ |
|
1903 if ((ch < 0x10000) /* minimum value allowed for 4 |
|
1904 byte encoding */ |
|
1905 || (ch > 0x10ffff)) /* maximum value allowed for |
|
1906 UTF-16 */ |
|
1907 { |
|
1908 errmsg = "illegal encoding"; |
|
1909 startinpos = s-starts; |
|
1910 endinpos = startinpos+4; |
|
1911 goto utf8Error; |
|
1912 } |
|
1913 #ifdef Py_UNICODE_WIDE |
|
1914 *p++ = (Py_UNICODE)ch; |
|
1915 #else |
|
1916 /* compute and append the two surrogates: */ |
|
1917 |
|
1918 /* translate from 10000..10FFFF to 0..FFFF */ |
|
1919 ch -= 0x10000; |
|
1920 |
|
1921 /* high surrogate = top 10 bits added to D800 */ |
|
1922 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10)); |
|
1923 |
|
1924 /* low surrogate = bottom 10 bits added to DC00 */ |
|
1925 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF)); |
|
1926 #endif |
|
1927 break; |
|
1928 |
|
1929 default: |
|
1930 /* Other sizes are only needed for UCS-4 */ |
|
1931 errmsg = "unsupported Unicode code range"; |
|
1932 startinpos = s-starts; |
|
1933 endinpos = startinpos+n; |
|
1934 goto utf8Error; |
|
1935 } |
|
1936 s += n; |
|
1937 continue; |
|
1938 |
|
1939 utf8Error: |
|
1940 outpos = p-PyUnicode_AS_UNICODE(unicode); |
|
1941 if (unicode_decode_call_errorhandler( |
|
1942 errors, &errorHandler, |
|
1943 "utf8", errmsg, |
|
1944 starts, size, &startinpos, &endinpos, &exc, &s, |
|
1945 (PyObject **)&unicode, &outpos, &p)) |
|
1946 goto onError; |
|
1947 } |
|
1948 if (consumed) |
|
1949 *consumed = s-starts; |
|
1950 |
|
1951 /* Adjust length */ |
|
1952 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) |
|
1953 goto onError; |
|
1954 |
|
1955 Py_XDECREF(errorHandler); |
|
1956 Py_XDECREF(exc); |
|
1957 return (PyObject *)unicode; |
|
1958 |
|
1959 onError: |
|
1960 Py_XDECREF(errorHandler); |
|
1961 Py_XDECREF(exc); |
|
1962 Py_DECREF(unicode); |
|
1963 return NULL; |
|
1964 } |
|
1965 |
|
1966 /* Allocation strategy: if the string is short, convert into a stack buffer |
|
1967 and allocate exactly as much space needed at the end. Else allocate the |
|
1968 maximum possible needed (4 result bytes per Unicode character), and return |
|
1969 the excess memory at the end. |
|
1970 */ |
|
1971 PyObject * |
|
1972 PyUnicode_EncodeUTF8(const Py_UNICODE *s, |
|
1973 Py_ssize_t size, |
|
1974 const char *errors) |
|
1975 { |
|
1976 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ |
|
1977 |
|
1978 Py_ssize_t i; /* index into s of next input byte */ |
|
1979 PyObject *v; /* result string object */ |
|
1980 char *p; /* next free byte in output buffer */ |
|
1981 Py_ssize_t nallocated; /* number of result bytes allocated */ |
|
1982 Py_ssize_t nneeded; /* number of result bytes needed */ |
|
1983 char stackbuf[MAX_SHORT_UNICHARS * 4]; |
|
1984 |
|
1985 assert(s != NULL); |
|
1986 assert(size >= 0); |
|
1987 |
|
1988 if (size <= MAX_SHORT_UNICHARS) { |
|
1989 /* Write into the stack buffer; nallocated can't overflow. |
|
1990 * At the end, we'll allocate exactly as much heap space as it |
|
1991 * turns out we need. |
|
1992 */ |
|
1993 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); |
|
1994 v = NULL; /* will allocate after we're done */ |
|
1995 p = stackbuf; |
|
1996 } |
|
1997 else { |
|
1998 /* Overallocate on the heap, and give the excess back at the end. */ |
|
1999 nallocated = size * 4; |
|
2000 if (nallocated / 4 != size) /* overflow! */ |
|
2001 return PyErr_NoMemory(); |
|
2002 v = PyString_FromStringAndSize(NULL, nallocated); |
|
2003 if (v == NULL) |
|
2004 return NULL; |
|
2005 p = PyString_AS_STRING(v); |
|
2006 } |
|
2007 |
|
2008 for (i = 0; i < size;) { |
|
2009 Py_UCS4 ch = s[i++]; |
|
2010 |
|
2011 if (ch < 0x80) |
|
2012 /* Encode ASCII */ |
|
2013 *p++ = (char) ch; |
|
2014 |
|
2015 else if (ch < 0x0800) { |
|
2016 /* Encode Latin-1 */ |
|
2017 *p++ = (char)(0xc0 | (ch >> 6)); |
|
2018 *p++ = (char)(0x80 | (ch & 0x3f)); |
|
2019 } |
|
2020 else { |
|
2021 /* Encode UCS2 Unicode ordinals */ |
|
2022 if (ch < 0x10000) { |
|
2023 /* Special case: check for high surrogate */ |
|
2024 if (0xD800 <= ch && ch <= 0xDBFF && i != size) { |
|
2025 Py_UCS4 ch2 = s[i]; |
|
2026 /* Check for low surrogate and combine the two to |
|
2027 form a UCS4 value */ |
|
2028 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { |
|
2029 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; |
|
2030 i++; |
|
2031 goto encodeUCS4; |
|
2032 } |
|
2033 /* Fall through: handles isolated high surrogates */ |
|
2034 } |
|
2035 *p++ = (char)(0xe0 | (ch >> 12)); |
|
2036 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
|
2037 *p++ = (char)(0x80 | (ch & 0x3f)); |
|
2038 continue; |
|
2039 } |
|
2040 encodeUCS4: |
|
2041 /* Encode UCS4 Unicode ordinals */ |
|
2042 *p++ = (char)(0xf0 | (ch >> 18)); |
|
2043 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); |
|
2044 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); |
|
2045 *p++ = (char)(0x80 | (ch & 0x3f)); |
|
2046 } |
|
2047 } |
|
2048 |
|
2049 if (v == NULL) { |
|
2050 /* This was stack allocated. */ |
|
2051 nneeded = p - stackbuf; |
|
2052 assert(nneeded <= nallocated); |
|
2053 v = PyString_FromStringAndSize(stackbuf, nneeded); |
|
2054 } |
|
2055 else { |
|
2056 /* Cut back to size actually needed. */ |
|
2057 nneeded = p - PyString_AS_STRING(v); |
|
2058 assert(nneeded <= nallocated); |
|
2059 _PyString_Resize(&v, nneeded); |
|
2060 } |
|
2061 return v; |
|
2062 |
|
2063 #undef MAX_SHORT_UNICHARS |
|
2064 } |
|
2065 |
|
2066 PyObject *PyUnicode_AsUTF8String(PyObject *unicode) |
|
2067 { |
|
2068 if (!PyUnicode_Check(unicode)) { |
|
2069 PyErr_BadArgument(); |
|
2070 return NULL; |
|
2071 } |
|
2072 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), |
|
2073 PyUnicode_GET_SIZE(unicode), |
|
2074 NULL); |
|
2075 } |
|
2076 |
|
2077 /* --- UTF-32 Codec ------------------------------------------------------- */ |
|
2078 |
|
2079 PyObject * |
|
2080 PyUnicode_DecodeUTF32(const char *s, |
|
2081 Py_ssize_t size, |
|
2082 const char *errors, |
|
2083 int *byteorder) |
|
2084 { |
|
2085 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); |
|
2086 } |
|
2087 |
|
2088 PyObject * |
|
2089 PyUnicode_DecodeUTF32Stateful(const char *s, |
|
2090 Py_ssize_t size, |
|
2091 const char *errors, |
|
2092 int *byteorder, |
|
2093 Py_ssize_t *consumed) |
|
2094 { |
|
2095 const char *starts = s; |
|
2096 Py_ssize_t startinpos; |
|
2097 Py_ssize_t endinpos; |
|
2098 Py_ssize_t outpos; |
|
2099 PyUnicodeObject *unicode; |
|
2100 Py_UNICODE *p; |
|
2101 #ifndef Py_UNICODE_WIDE |
|
2102 int i, pairs; |
|
2103 #else |
|
2104 const int pairs = 0; |
|
2105 #endif |
|
2106 const unsigned char *q, *e; |
|
2107 int bo = 0; /* assume native ordering by default */ |
|
2108 const char *errmsg = ""; |
|
2109 /* Offsets from q for retrieving bytes in the right order. */ |
|
2110 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2111 int iorder[] = {0, 1, 2, 3}; |
|
2112 #else |
|
2113 int iorder[] = {3, 2, 1, 0}; |
|
2114 #endif |
|
2115 PyObject *errorHandler = NULL; |
|
2116 PyObject *exc = NULL; |
|
2117 /* On narrow builds we split characters outside the BMP into two |
|
2118 codepoints => count how much extra space we need. */ |
|
2119 #ifndef Py_UNICODE_WIDE |
|
2120 for (i = pairs = 0; i < size/4; i++) |
|
2121 if (((Py_UCS4 *)s)[i] >= 0x10000) |
|
2122 pairs++; |
|
2123 #endif |
|
2124 |
|
2125 /* This might be one to much, because of a BOM */ |
|
2126 unicode = _PyUnicode_New((size+3)/4+pairs); |
|
2127 if (!unicode) |
|
2128 return NULL; |
|
2129 if (size == 0) |
|
2130 return (PyObject *)unicode; |
|
2131 |
|
2132 /* Unpack UTF-32 encoded data */ |
|
2133 p = unicode->str; |
|
2134 q = (unsigned char *)s; |
|
2135 e = q + size; |
|
2136 |
|
2137 if (byteorder) |
|
2138 bo = *byteorder; |
|
2139 |
|
2140 /* Check for BOM marks (U+FEFF) in the input and adjust current |
|
2141 byte order setting accordingly. In native mode, the leading BOM |
|
2142 mark is skipped, in all other modes, it is copied to the output |
|
2143 stream as-is (giving a ZWNBSP character). */ |
|
2144 if (bo == 0) { |
|
2145 if (size >= 4) { |
|
2146 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | |
|
2147 (q[iorder[1]] << 8) | q[iorder[0]]; |
|
2148 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2149 if (bom == 0x0000FEFF) { |
|
2150 q += 4; |
|
2151 bo = -1; |
|
2152 } |
|
2153 else if (bom == 0xFFFE0000) { |
|
2154 q += 4; |
|
2155 bo = 1; |
|
2156 } |
|
2157 #else |
|
2158 if (bom == 0x0000FEFF) { |
|
2159 q += 4; |
|
2160 bo = 1; |
|
2161 } |
|
2162 else if (bom == 0xFFFE0000) { |
|
2163 q += 4; |
|
2164 bo = -1; |
|
2165 } |
|
2166 #endif |
|
2167 } |
|
2168 } |
|
2169 |
|
2170 if (bo == -1) { |
|
2171 /* force LE */ |
|
2172 iorder[0] = 0; |
|
2173 iorder[1] = 1; |
|
2174 iorder[2] = 2; |
|
2175 iorder[3] = 3; |
|
2176 } |
|
2177 else if (bo == 1) { |
|
2178 /* force BE */ |
|
2179 iorder[0] = 3; |
|
2180 iorder[1] = 2; |
|
2181 iorder[2] = 1; |
|
2182 iorder[3] = 0; |
|
2183 } |
|
2184 |
|
2185 while (q < e) { |
|
2186 Py_UCS4 ch; |
|
2187 /* remaining bytes at the end? (size should be divisible by 4) */ |
|
2188 if (e-q<4) { |
|
2189 if (consumed) |
|
2190 break; |
|
2191 errmsg = "truncated data"; |
|
2192 startinpos = ((const char *)q)-starts; |
|
2193 endinpos = ((const char *)e)-starts; |
|
2194 goto utf32Error; |
|
2195 /* The remaining input chars are ignored if the callback |
|
2196 chooses to skip the input */ |
|
2197 } |
|
2198 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | |
|
2199 (q[iorder[1]] << 8) | q[iorder[0]]; |
|
2200 |
|
2201 if (ch >= 0x110000) |
|
2202 { |
|
2203 errmsg = "codepoint not in range(0x110000)"; |
|
2204 startinpos = ((const char *)q)-starts; |
|
2205 endinpos = startinpos+4; |
|
2206 goto utf32Error; |
|
2207 } |
|
2208 #ifndef Py_UNICODE_WIDE |
|
2209 if (ch >= 0x10000) |
|
2210 { |
|
2211 *p++ = 0xD800 | ((ch-0x10000) >> 10); |
|
2212 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); |
|
2213 } |
|
2214 else |
|
2215 #endif |
|
2216 *p++ = ch; |
|
2217 q += 4; |
|
2218 continue; |
|
2219 utf32Error: |
|
2220 outpos = p-PyUnicode_AS_UNICODE(unicode); |
|
2221 if (unicode_decode_call_errorhandler( |
|
2222 errors, &errorHandler, |
|
2223 "utf32", errmsg, |
|
2224 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2225 (PyObject **)&unicode, &outpos, &p)) |
|
2226 goto onError; |
|
2227 } |
|
2228 |
|
2229 if (byteorder) |
|
2230 *byteorder = bo; |
|
2231 |
|
2232 if (consumed) |
|
2233 *consumed = (const char *)q-starts; |
|
2234 |
|
2235 /* Adjust length */ |
|
2236 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) |
|
2237 goto onError; |
|
2238 |
|
2239 Py_XDECREF(errorHandler); |
|
2240 Py_XDECREF(exc); |
|
2241 return (PyObject *)unicode; |
|
2242 |
|
2243 onError: |
|
2244 Py_DECREF(unicode); |
|
2245 Py_XDECREF(errorHandler); |
|
2246 Py_XDECREF(exc); |
|
2247 return NULL; |
|
2248 } |
|
2249 |
|
2250 PyObject * |
|
2251 PyUnicode_EncodeUTF32(const Py_UNICODE *s, |
|
2252 Py_ssize_t size, |
|
2253 const char *errors, |
|
2254 int byteorder) |
|
2255 { |
|
2256 PyObject *v; |
|
2257 unsigned char *p; |
|
2258 Py_ssize_t nsize, bytesize; |
|
2259 #ifndef Py_UNICODE_WIDE |
|
2260 Py_ssize_t i, pairs; |
|
2261 #else |
|
2262 const int pairs = 0; |
|
2263 #endif |
|
2264 /* Offsets from p for storing byte pairs in the right order. */ |
|
2265 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2266 int iorder[] = {0, 1, 2, 3}; |
|
2267 #else |
|
2268 int iorder[] = {3, 2, 1, 0}; |
|
2269 #endif |
|
2270 |
|
2271 #define STORECHAR(CH) \ |
|
2272 do { \ |
|
2273 p[iorder[3]] = ((CH) >> 24) & 0xff; \ |
|
2274 p[iorder[2]] = ((CH) >> 16) & 0xff; \ |
|
2275 p[iorder[1]] = ((CH) >> 8) & 0xff; \ |
|
2276 p[iorder[0]] = (CH) & 0xff; \ |
|
2277 p += 4; \ |
|
2278 } while(0) |
|
2279 |
|
2280 /* In narrow builds we can output surrogate pairs as one codepoint, |
|
2281 so we need less space. */ |
|
2282 #ifndef Py_UNICODE_WIDE |
|
2283 for (i = pairs = 0; i < size-1; i++) |
|
2284 if (0xD800 <= s[i] && s[i] <= 0xDBFF && |
|
2285 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) |
|
2286 pairs++; |
|
2287 #endif |
|
2288 nsize = (size - pairs + (byteorder == 0)); |
|
2289 bytesize = nsize * 4; |
|
2290 if (bytesize / 4 != nsize) |
|
2291 return PyErr_NoMemory(); |
|
2292 v = PyString_FromStringAndSize(NULL, bytesize); |
|
2293 if (v == NULL) |
|
2294 return NULL; |
|
2295 |
|
2296 p = (unsigned char *)PyString_AS_STRING(v); |
|
2297 if (byteorder == 0) |
|
2298 STORECHAR(0xFEFF); |
|
2299 if (size == 0) |
|
2300 return v; |
|
2301 |
|
2302 if (byteorder == -1) { |
|
2303 /* force LE */ |
|
2304 iorder[0] = 0; |
|
2305 iorder[1] = 1; |
|
2306 iorder[2] = 2; |
|
2307 iorder[3] = 3; |
|
2308 } |
|
2309 else if (byteorder == 1) { |
|
2310 /* force BE */ |
|
2311 iorder[0] = 3; |
|
2312 iorder[1] = 2; |
|
2313 iorder[2] = 1; |
|
2314 iorder[3] = 0; |
|
2315 } |
|
2316 |
|
2317 while (size-- > 0) { |
|
2318 Py_UCS4 ch = *s++; |
|
2319 #ifndef Py_UNICODE_WIDE |
|
2320 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { |
|
2321 Py_UCS4 ch2 = *s; |
|
2322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { |
|
2323 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; |
|
2324 s++; |
|
2325 size--; |
|
2326 } |
|
2327 } |
|
2328 #endif |
|
2329 STORECHAR(ch); |
|
2330 } |
|
2331 return v; |
|
2332 #undef STORECHAR |
|
2333 } |
|
2334 |
|
2335 PyObject *PyUnicode_AsUTF32String(PyObject *unicode) |
|
2336 { |
|
2337 if (!PyUnicode_Check(unicode)) { |
|
2338 PyErr_BadArgument(); |
|
2339 return NULL; |
|
2340 } |
|
2341 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), |
|
2342 PyUnicode_GET_SIZE(unicode), |
|
2343 NULL, |
|
2344 0); |
|
2345 } |
|
2346 |
|
2347 /* --- UTF-16 Codec ------------------------------------------------------- */ |
|
2348 |
|
2349 PyObject * |
|
2350 PyUnicode_DecodeUTF16(const char *s, |
|
2351 Py_ssize_t size, |
|
2352 const char *errors, |
|
2353 int *byteorder) |
|
2354 { |
|
2355 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); |
|
2356 } |
|
2357 |
|
2358 PyObject * |
|
2359 PyUnicode_DecodeUTF16Stateful(const char *s, |
|
2360 Py_ssize_t size, |
|
2361 const char *errors, |
|
2362 int *byteorder, |
|
2363 Py_ssize_t *consumed) |
|
2364 { |
|
2365 const char *starts = s; |
|
2366 Py_ssize_t startinpos; |
|
2367 Py_ssize_t endinpos; |
|
2368 Py_ssize_t outpos; |
|
2369 PyUnicodeObject *unicode; |
|
2370 Py_UNICODE *p; |
|
2371 const unsigned char *q, *e; |
|
2372 int bo = 0; /* assume native ordering by default */ |
|
2373 const char *errmsg = ""; |
|
2374 /* Offsets from q for retrieving byte pairs in the right order. */ |
|
2375 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2376 int ihi = 1, ilo = 0; |
|
2377 #else |
|
2378 int ihi = 0, ilo = 1; |
|
2379 #endif |
|
2380 PyObject *errorHandler = NULL; |
|
2381 PyObject *exc = NULL; |
|
2382 |
|
2383 /* Note: size will always be longer than the resulting Unicode |
|
2384 character count */ |
|
2385 unicode = _PyUnicode_New(size); |
|
2386 if (!unicode) |
|
2387 return NULL; |
|
2388 if (size == 0) |
|
2389 return (PyObject *)unicode; |
|
2390 |
|
2391 /* Unpack UTF-16 encoded data */ |
|
2392 p = unicode->str; |
|
2393 q = (unsigned char *)s; |
|
2394 e = q + size; |
|
2395 |
|
2396 if (byteorder) |
|
2397 bo = *byteorder; |
|
2398 |
|
2399 /* Check for BOM marks (U+FEFF) in the input and adjust current |
|
2400 byte order setting accordingly. In native mode, the leading BOM |
|
2401 mark is skipped, in all other modes, it is copied to the output |
|
2402 stream as-is (giving a ZWNBSP character). */ |
|
2403 if (bo == 0) { |
|
2404 if (size >= 2) { |
|
2405 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo]; |
|
2406 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2407 if (bom == 0xFEFF) { |
|
2408 q += 2; |
|
2409 bo = -1; |
|
2410 } |
|
2411 else if (bom == 0xFFFE) { |
|
2412 q += 2; |
|
2413 bo = 1; |
|
2414 } |
|
2415 #else |
|
2416 if (bom == 0xFEFF) { |
|
2417 q += 2; |
|
2418 bo = 1; |
|
2419 } |
|
2420 else if (bom == 0xFFFE) { |
|
2421 q += 2; |
|
2422 bo = -1; |
|
2423 } |
|
2424 #endif |
|
2425 } |
|
2426 } |
|
2427 |
|
2428 if (bo == -1) { |
|
2429 /* force LE */ |
|
2430 ihi = 1; |
|
2431 ilo = 0; |
|
2432 } |
|
2433 else if (bo == 1) { |
|
2434 /* force BE */ |
|
2435 ihi = 0; |
|
2436 ilo = 1; |
|
2437 } |
|
2438 |
|
2439 while (q < e) { |
|
2440 Py_UNICODE ch; |
|
2441 /* remaining bytes at the end? (size should be even) */ |
|
2442 if (e-q<2) { |
|
2443 if (consumed) |
|
2444 break; |
|
2445 errmsg = "truncated data"; |
|
2446 startinpos = ((const char *)q)-starts; |
|
2447 endinpos = ((const char *)e)-starts; |
|
2448 goto utf16Error; |
|
2449 /* The remaining input chars are ignored if the callback |
|
2450 chooses to skip the input */ |
|
2451 } |
|
2452 ch = (q[ihi] << 8) | q[ilo]; |
|
2453 |
|
2454 q += 2; |
|
2455 |
|
2456 if (ch < 0xD800 || ch > 0xDFFF) { |
|
2457 *p++ = ch; |
|
2458 continue; |
|
2459 } |
|
2460 |
|
2461 /* UTF-16 code pair: */ |
|
2462 if (q >= e) { |
|
2463 errmsg = "unexpected end of data"; |
|
2464 startinpos = (((const char *)q)-2)-starts; |
|
2465 endinpos = ((const char *)e)-starts; |
|
2466 goto utf16Error; |
|
2467 } |
|
2468 if (0xD800 <= ch && ch <= 0xDBFF) { |
|
2469 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; |
|
2470 q += 2; |
|
2471 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { |
|
2472 #ifndef Py_UNICODE_WIDE |
|
2473 *p++ = ch; |
|
2474 *p++ = ch2; |
|
2475 #else |
|
2476 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; |
|
2477 #endif |
|
2478 continue; |
|
2479 } |
|
2480 else { |
|
2481 errmsg = "illegal UTF-16 surrogate"; |
|
2482 startinpos = (((const char *)q)-4)-starts; |
|
2483 endinpos = startinpos+2; |
|
2484 goto utf16Error; |
|
2485 } |
|
2486 |
|
2487 } |
|
2488 errmsg = "illegal encoding"; |
|
2489 startinpos = (((const char *)q)-2)-starts; |
|
2490 endinpos = startinpos+2; |
|
2491 /* Fall through to report the error */ |
|
2492 |
|
2493 utf16Error: |
|
2494 outpos = p-PyUnicode_AS_UNICODE(unicode); |
|
2495 if (unicode_decode_call_errorhandler( |
|
2496 errors, &errorHandler, |
|
2497 "utf16", errmsg, |
|
2498 starts, size, &startinpos, &endinpos, &exc, (const char **)&q, |
|
2499 (PyObject **)&unicode, &outpos, &p)) |
|
2500 goto onError; |
|
2501 } |
|
2502 |
|
2503 if (byteorder) |
|
2504 *byteorder = bo; |
|
2505 |
|
2506 if (consumed) |
|
2507 *consumed = (const char *)q-starts; |
|
2508 |
|
2509 /* Adjust length */ |
|
2510 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) |
|
2511 goto onError; |
|
2512 |
|
2513 Py_XDECREF(errorHandler); |
|
2514 Py_XDECREF(exc); |
|
2515 return (PyObject *)unicode; |
|
2516 |
|
2517 onError: |
|
2518 Py_DECREF(unicode); |
|
2519 Py_XDECREF(errorHandler); |
|
2520 Py_XDECREF(exc); |
|
2521 return NULL; |
|
2522 } |
|
2523 |
|
2524 PyObject * |
|
2525 PyUnicode_EncodeUTF16(const Py_UNICODE *s, |
|
2526 Py_ssize_t size, |
|
2527 const char *errors, |
|
2528 int byteorder) |
|
2529 { |
|
2530 PyObject *v; |
|
2531 unsigned char *p; |
|
2532 Py_ssize_t nsize, bytesize; |
|
2533 #ifdef Py_UNICODE_WIDE |
|
2534 Py_ssize_t i, pairs; |
|
2535 #else |
|
2536 const int pairs = 0; |
|
2537 #endif |
|
2538 /* Offsets from p for storing byte pairs in the right order. */ |
|
2539 #ifdef BYTEORDER_IS_LITTLE_ENDIAN |
|
2540 int ihi = 1, ilo = 0; |
|
2541 #else |
|
2542 int ihi = 0, ilo = 1; |
|
2543 #endif |
|
2544 |
|
2545 #define STORECHAR(CH) \ |
|
2546 do { \ |
|
2547 p[ihi] = ((CH) >> 8) & 0xff; \ |
|
2548 p[ilo] = (CH) & 0xff; \ |
|
2549 p += 2; \ |
|
2550 } while(0) |
|
2551 |
|
2552 #ifdef Py_UNICODE_WIDE |
|
2553 for (i = pairs = 0; i < size; i++) |
|
2554 if (s[i] >= 0x10000) |
|
2555 pairs++; |
|
2556 #endif |
|
2557 /* 2 * (size + pairs + (byteorder == 0)) */ |
|
2558 if (size > PY_SSIZE_T_MAX || |
|
2559 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0)) |
|
2560 return PyErr_NoMemory(); |
|
2561 nsize = size + pairs + (byteorder == 0); |
|
2562 bytesize = nsize * 2; |
|
2563 if (bytesize / 2 != nsize) |
|
2564 return PyErr_NoMemory(); |
|
2565 v = PyString_FromStringAndSize(NULL, bytesize); |
|
2566 if (v == NULL) |
|
2567 return NULL; |
|
2568 |
|
2569 p = (unsigned char *)PyString_AS_STRING(v); |
|
2570 if (byteorder == 0) |
|
2571 STORECHAR(0xFEFF); |
|
2572 if (size == 0) |
|
2573 return v; |
|
2574 |
|
2575 if (byteorder == -1) { |
|
2576 /* force LE */ |
|
2577 ihi = 1; |
|
2578 ilo = 0; |
|
2579 } |
|
2580 else if (byteorder == 1) { |
|
2581 /* force BE */ |
|
2582 ihi = 0; |
|
2583 ilo = 1; |
|
2584 } |
|
2585 |
|
2586 while (size-- > 0) { |
|
2587 Py_UNICODE ch = *s++; |
|
2588 Py_UNICODE ch2 = 0; |
|
2589 #ifdef Py_UNICODE_WIDE |
|
2590 if (ch >= 0x10000) { |
|
2591 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF); |
|
2592 ch = 0xD800 | ((ch-0x10000) >> 10); |
|
2593 } |
|
2594 #endif |
|
2595 STORECHAR(ch); |
|
2596 if (ch2) |
|
2597 STORECHAR(ch2); |
|
2598 } |
|
2599 return v; |
|
2600 #undef STORECHAR |
|
2601 } |
|
2602 |
|
2603 PyObject *PyUnicode_AsUTF16String(PyObject *unicode) |
|
2604 { |
|
2605 if (!PyUnicode_Check(unicode)) { |
|
2606 PyErr_BadArgument(); |
|
2607 return NULL; |
|
2608 } |
|
2609 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode), |
|
2610 PyUnicode_GET_SIZE(unicode), |
|
2611 NULL, |
|
2612 0); |
|
2613 } |
|
2614 |
|
2615 /* --- Unicode Escape Codec ----------------------------------------------- */ |
|
2616 |
|
2617 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; |
|
2618 |
|
2619 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, |
|
2620 Py_ssize_t size, |
|
2621 const char *errors) |
|
2622 { |
|
2623 const char *starts = s; |
|
2624 Py_ssize_t startinpos; |
|
2625 Py_ssize_t endinpos; |
|
2626 Py_ssize_t outpos; |
|
2627 int i; |
|
2628 PyUnicodeObject *v; |
|
2629 Py_UNICODE *p; |
|
2630 const char *end; |
|
2631 char* message; |
|
2632 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ |
|
2633 PyObject *errorHandler = NULL; |
|
2634 PyObject *exc = NULL; |
|
2635 |
|
2636 /* Escaped strings will always be longer than the resulting |
|
2637 Unicode string, so we start with size here and then reduce the |
|
2638 length after conversion to the true value. |
|
2639 (but if the error callback returns a long replacement string |
|
2640 we'll have to allocate more space) */ |
|
2641 v = _PyUnicode_New(size); |
|
2642 if (v == NULL) |
|
2643 goto onError; |
|
2644 if (size == 0) |
|
2645 return (PyObject *)v; |
|
2646 |
|
2647 p = PyUnicode_AS_UNICODE(v); |
|
2648 end = s + size; |
|
2649 |
|
2650 while (s < end) { |
|
2651 unsigned char c; |
|
2652 Py_UNICODE x; |
|
2653 int digits; |
|
2654 |
|
2655 /* Non-escape characters are interpreted as Unicode ordinals */ |
|
2656 if (*s != '\\') { |
|
2657 *p++ = (unsigned char) *s++; |
|
2658 continue; |
|
2659 } |
|
2660 |
|
2661 startinpos = s-starts; |
|
2662 /* \ - Escapes */ |
|
2663 s++; |
|
2664 c = *s++; |
|
2665 if (s > end) |
|
2666 c = '\0'; /* Invalid after \ */ |
|
2667 switch (c) { |
|
2668 |
|
2669 /* \x escapes */ |
|
2670 case '\n': break; |
|
2671 case '\\': *p++ = '\\'; break; |
|
2672 case '\'': *p++ = '\''; break; |
|
2673 case '\"': *p++ = '\"'; break; |
|
2674 case 'b': *p++ = '\b'; break; |
|
2675 case 'f': *p++ = '\014'; break; /* FF */ |
|
2676 case 't': *p++ = '\t'; break; |
|
2677 case 'n': *p++ = '\n'; break; |
|
2678 case 'r': *p++ = '\r'; break; |
|
2679 case 'v': *p++ = '\013'; break; /* VT */ |
|
2680 case 'a': *p++ = '\007'; break; /* BEL, not classic C */ |
|
2681 |
|
2682 /* \OOO (octal) escapes */ |
|
2683 case '0': case '1': case '2': case '3': |
|
2684 case '4': case '5': case '6': case '7': |
|
2685 x = s[-1] - '0'; |
|
2686 if (s < end && '0' <= *s && *s <= '7') { |
|
2687 x = (x<<3) + *s++ - '0'; |
|
2688 if (s < end && '0' <= *s && *s <= '7') |
|
2689 x = (x<<3) + *s++ - '0'; |
|
2690 } |
|
2691 *p++ = x; |
|
2692 break; |
|
2693 |
|
2694 /* hex escapes */ |
|
2695 /* \xXX */ |
|
2696 case 'x': |
|
2697 digits = 2; |
|
2698 message = "truncated \\xXX escape"; |
|
2699 goto hexescape; |
|
2700 |
|
2701 /* \uXXXX */ |
|
2702 case 'u': |
|
2703 digits = 4; |
|
2704 message = "truncated \\uXXXX escape"; |
|
2705 goto hexescape; |
|
2706 |
|
2707 /* \UXXXXXXXX */ |
|
2708 case 'U': |
|
2709 digits = 8; |
|
2710 message = "truncated \\UXXXXXXXX escape"; |
|
2711 hexescape: |
|
2712 chr = 0; |
|
2713 outpos = p-PyUnicode_AS_UNICODE(v); |
|
2714 if (s+digits>end) { |
|
2715 endinpos = size; |
|
2716 if (unicode_decode_call_errorhandler( |
|
2717 errors, &errorHandler, |
|
2718 "unicodeescape", "end of string in escape sequence", |
|
2719 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2720 (PyObject **)&v, &outpos, &p)) |
|
2721 goto onError; |
|
2722 goto nextByte; |
|
2723 } |
|
2724 for (i = 0; i < digits; ++i) { |
|
2725 c = (unsigned char) s[i]; |
|
2726 if (!isxdigit(c)) { |
|
2727 endinpos = (s+i+1)-starts; |
|
2728 if (unicode_decode_call_errorhandler( |
|
2729 errors, &errorHandler, |
|
2730 "unicodeescape", message, |
|
2731 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2732 (PyObject **)&v, &outpos, &p)) |
|
2733 goto onError; |
|
2734 goto nextByte; |
|
2735 } |
|
2736 chr = (chr<<4) & ~0xF; |
|
2737 if (c >= '0' && c <= '9') |
|
2738 chr += c - '0'; |
|
2739 else if (c >= 'a' && c <= 'f') |
|
2740 chr += 10 + c - 'a'; |
|
2741 else |
|
2742 chr += 10 + c - 'A'; |
|
2743 } |
|
2744 s += i; |
|
2745 if (chr == 0xffffffff && PyErr_Occurred()) |
|
2746 /* _decoding_error will have already written into the |
|
2747 target buffer. */ |
|
2748 break; |
|
2749 store: |
|
2750 /* when we get here, chr is a 32-bit unicode character */ |
|
2751 if (chr <= 0xffff) |
|
2752 /* UCS-2 character */ |
|
2753 *p++ = (Py_UNICODE) chr; |
|
2754 else if (chr <= 0x10ffff) { |
|
2755 /* UCS-4 character. Either store directly, or as |
|
2756 surrogate pair. */ |
|
2757 #ifdef Py_UNICODE_WIDE |
|
2758 *p++ = chr; |
|
2759 #else |
|
2760 chr -= 0x10000L; |
|
2761 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10); |
|
2762 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF); |
|
2763 #endif |
|
2764 } else { |
|
2765 endinpos = s-starts; |
|
2766 outpos = p-PyUnicode_AS_UNICODE(v); |
|
2767 if (unicode_decode_call_errorhandler( |
|
2768 errors, &errorHandler, |
|
2769 "unicodeescape", "illegal Unicode character", |
|
2770 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2771 (PyObject **)&v, &outpos, &p)) |
|
2772 goto onError; |
|
2773 } |
|
2774 break; |
|
2775 |
|
2776 /* \N{name} */ |
|
2777 case 'N': |
|
2778 message = "malformed \\N character escape"; |
|
2779 if (ucnhash_CAPI == NULL) { |
|
2780 /* load the unicode data module */ |
|
2781 PyObject *m, *api; |
|
2782 m = PyImport_ImportModuleNoBlock("unicodedata"); |
|
2783 if (m == NULL) |
|
2784 goto ucnhashError; |
|
2785 api = PyObject_GetAttrString(m, "ucnhash_CAPI"); |
|
2786 Py_DECREF(m); |
|
2787 if (api == NULL) |
|
2788 goto ucnhashError; |
|
2789 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api); |
|
2790 Py_DECREF(api); |
|
2791 if (ucnhash_CAPI == NULL) |
|
2792 goto ucnhashError; |
|
2793 } |
|
2794 if (*s == '{') { |
|
2795 const char *start = s+1; |
|
2796 /* look for the closing brace */ |
|
2797 while (*s != '}' && s < end) |
|
2798 s++; |
|
2799 if (s > start && s < end && *s == '}') { |
|
2800 /* found a name. look it up in the unicode database */ |
|
2801 message = "unknown Unicode character name"; |
|
2802 s++; |
|
2803 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) |
|
2804 goto store; |
|
2805 } |
|
2806 } |
|
2807 endinpos = s-starts; |
|
2808 outpos = p-PyUnicode_AS_UNICODE(v); |
|
2809 if (unicode_decode_call_errorhandler( |
|
2810 errors, &errorHandler, |
|
2811 "unicodeescape", message, |
|
2812 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2813 (PyObject **)&v, &outpos, &p)) |
|
2814 goto onError; |
|
2815 break; |
|
2816 |
|
2817 default: |
|
2818 if (s > end) { |
|
2819 message = "\\ at end of string"; |
|
2820 s--; |
|
2821 endinpos = s-starts; |
|
2822 outpos = p-PyUnicode_AS_UNICODE(v); |
|
2823 if (unicode_decode_call_errorhandler( |
|
2824 errors, &errorHandler, |
|
2825 "unicodeescape", message, |
|
2826 starts, size, &startinpos, &endinpos, &exc, &s, |
|
2827 (PyObject **)&v, &outpos, &p)) |
|
2828 goto onError; |
|
2829 } |
|
2830 else { |
|
2831 *p++ = '\\'; |
|
2832 *p++ = (unsigned char)s[-1]; |
|
2833 } |
|
2834 break; |
|
2835 } |
|
2836 nextByte: |
|
2837 ; |
|
2838 } |
|
2839 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
|
2840 goto onError; |
|
2841 Py_XDECREF(errorHandler); |
|
2842 Py_XDECREF(exc); |
|
2843 return (PyObject *)v; |
|
2844 |
|
2845 ucnhashError: |
|
2846 PyErr_SetString( |
|
2847 PyExc_UnicodeError, |
|
2848 "\\N escapes not supported (can't load unicodedata module)" |
|
2849 ); |
|
2850 Py_XDECREF(v); |
|
2851 Py_XDECREF(errorHandler); |
|
2852 Py_XDECREF(exc); |
|
2853 return NULL; |
|
2854 |
|
2855 onError: |
|
2856 Py_XDECREF(v); |
|
2857 Py_XDECREF(errorHandler); |
|
2858 Py_XDECREF(exc); |
|
2859 return NULL; |
|
2860 } |
|
2861 |
|
2862 /* Return a Unicode-Escape string version of the Unicode object. |
|
2863 |
|
2864 If quotes is true, the string is enclosed in u"" or u'' quotes as |
|
2865 appropriate. |
|
2866 |
|
2867 */ |
|
2868 |
|
2869 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s, |
|
2870 Py_ssize_t size, |
|
2871 Py_UNICODE ch) |
|
2872 { |
|
2873 /* like wcschr, but doesn't stop at NULL characters */ |
|
2874 |
|
2875 while (size-- > 0) { |
|
2876 if (*s == ch) |
|
2877 return s; |
|
2878 s++; |
|
2879 } |
|
2880 |
|
2881 return NULL; |
|
2882 } |
|
2883 |
|
2884 static |
|
2885 PyObject *unicodeescape_string(const Py_UNICODE *s, |
|
2886 Py_ssize_t size, |
|
2887 int quotes) |
|
2888 { |
|
2889 PyObject *repr; |
|
2890 char *p; |
|
2891 |
|
2892 static const char *hexdigit = "0123456789abcdef"; |
|
2893 #ifdef Py_UNICODE_WIDE |
|
2894 const Py_ssize_t expandsize = 10; |
|
2895 #else |
|
2896 const Py_ssize_t expandsize = 6; |
|
2897 #endif |
|
2898 |
|
2899 /* XXX(nnorwitz): rather than over-allocating, it would be |
|
2900 better to choose a different scheme. Perhaps scan the |
|
2901 first N-chars of the string and allocate based on that size. |
|
2902 */ |
|
2903 /* Initial allocation is based on the longest-possible unichr |
|
2904 escape. |
|
2905 |
|
2906 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source |
|
2907 unichr, so in this case it's the longest unichr escape. In |
|
2908 narrow (UTF-16) builds this is five chars per source unichr |
|
2909 since there are two unichrs in the surrogate pair, so in narrow |
|
2910 (UTF-16) builds it's not the longest unichr escape. |
|
2911 |
|
2912 In wide or narrow builds '\uxxxx' is 6 chars per source unichr, |
|
2913 so in the narrow (UTF-16) build case it's the longest unichr |
|
2914 escape. |
|
2915 */ |
|
2916 |
|
2917 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) |
|
2918 return PyErr_NoMemory(); |
|
2919 |
|
2920 repr = PyString_FromStringAndSize(NULL, |
|
2921 2 |
|
2922 + expandsize*size |
|
2923 + 1); |
|
2924 if (repr == NULL) |
|
2925 return NULL; |
|
2926 |
|
2927 p = PyString_AS_STRING(repr); |
|
2928 |
|
2929 if (quotes) { |
|
2930 *p++ = 'u'; |
|
2931 *p++ = (findchar(s, size, '\'') && |
|
2932 !findchar(s, size, '"')) ? '"' : '\''; |
|
2933 } |
|
2934 while (size-- > 0) { |
|
2935 Py_UNICODE ch = *s++; |
|
2936 |
|
2937 /* Escape quotes and backslashes */ |
|
2938 if ((quotes && |
|
2939 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') { |
|
2940 *p++ = '\\'; |
|
2941 *p++ = (char) ch; |
|
2942 continue; |
|
2943 } |
|
2944 |
|
2945 #ifdef Py_UNICODE_WIDE |
|
2946 /* Map 21-bit characters to '\U00xxxxxx' */ |
|
2947 else if (ch >= 0x10000) { |
|
2948 *p++ = '\\'; |
|
2949 *p++ = 'U'; |
|
2950 *p++ = hexdigit[(ch >> 28) & 0x0000000F]; |
|
2951 *p++ = hexdigit[(ch >> 24) & 0x0000000F]; |
|
2952 *p++ = hexdigit[(ch >> 20) & 0x0000000F]; |
|
2953 *p++ = hexdigit[(ch >> 16) & 0x0000000F]; |
|
2954 *p++ = hexdigit[(ch >> 12) & 0x0000000F]; |
|
2955 *p++ = hexdigit[(ch >> 8) & 0x0000000F]; |
|
2956 *p++ = hexdigit[(ch >> 4) & 0x0000000F]; |
|
2957 *p++ = hexdigit[ch & 0x0000000F]; |
|
2958 continue; |
|
2959 } |
|
2960 #else |
|
2961 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ |
|
2962 else if (ch >= 0xD800 && ch < 0xDC00) { |
|
2963 Py_UNICODE ch2; |
|
2964 Py_UCS4 ucs; |
|
2965 |
|
2966 ch2 = *s++; |
|
2967 size--; |
|
2968 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
|
2969 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; |
|
2970 *p++ = '\\'; |
|
2971 *p++ = 'U'; |
|
2972 *p++ = hexdigit[(ucs >> 28) & 0x0000000F]; |
|
2973 *p++ = hexdigit[(ucs >> 24) & 0x0000000F]; |
|
2974 *p++ = hexdigit[(ucs >> 20) & 0x0000000F]; |
|
2975 *p++ = hexdigit[(ucs >> 16) & 0x0000000F]; |
|
2976 *p++ = hexdigit[(ucs >> 12) & 0x0000000F]; |
|
2977 *p++ = hexdigit[(ucs >> 8) & 0x0000000F]; |
|
2978 *p++ = hexdigit[(ucs >> 4) & 0x0000000F]; |
|
2979 *p++ = hexdigit[ucs & 0x0000000F]; |
|
2980 continue; |
|
2981 } |
|
2982 /* Fall through: isolated surrogates are copied as-is */ |
|
2983 s--; |
|
2984 size++; |
|
2985 } |
|
2986 #endif |
|
2987 |
|
2988 /* Map 16-bit characters to '\uxxxx' */ |
|
2989 if (ch >= 256) { |
|
2990 *p++ = '\\'; |
|
2991 *p++ = 'u'; |
|
2992 *p++ = hexdigit[(ch >> 12) & 0x000F]; |
|
2993 *p++ = hexdigit[(ch >> 8) & 0x000F]; |
|
2994 *p++ = hexdigit[(ch >> 4) & 0x000F]; |
|
2995 *p++ = hexdigit[ch & 0x000F]; |
|
2996 } |
|
2997 |
|
2998 /* Map special whitespace to '\t', \n', '\r' */ |
|
2999 else if (ch == '\t') { |
|
3000 *p++ = '\\'; |
|
3001 *p++ = 't'; |
|
3002 } |
|
3003 else if (ch == '\n') { |
|
3004 *p++ = '\\'; |
|
3005 *p++ = 'n'; |
|
3006 } |
|
3007 else if (ch == '\r') { |
|
3008 *p++ = '\\'; |
|
3009 *p++ = 'r'; |
|
3010 } |
|
3011 |
|
3012 /* Map non-printable US ASCII to '\xhh' */ |
|
3013 else if (ch < ' ' || ch >= 0x7F) { |
|
3014 *p++ = '\\'; |
|
3015 *p++ = 'x'; |
|
3016 *p++ = hexdigit[(ch >> 4) & 0x000F]; |
|
3017 *p++ = hexdigit[ch & 0x000F]; |
|
3018 } |
|
3019 |
|
3020 /* Copy everything else as-is */ |
|
3021 else |
|
3022 *p++ = (char) ch; |
|
3023 } |
|
3024 if (quotes) |
|
3025 *p++ = PyString_AS_STRING(repr)[1]; |
|
3026 |
|
3027 *p = '\0'; |
|
3028 _PyString_Resize(&repr, p - PyString_AS_STRING(repr)); |
|
3029 return repr; |
|
3030 } |
|
3031 |
|
3032 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, |
|
3033 Py_ssize_t size) |
|
3034 { |
|
3035 return unicodeescape_string(s, size, 0); |
|
3036 } |
|
3037 |
|
3038 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) |
|
3039 { |
|
3040 if (!PyUnicode_Check(unicode)) { |
|
3041 PyErr_BadArgument(); |
|
3042 return NULL; |
|
3043 } |
|
3044 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), |
|
3045 PyUnicode_GET_SIZE(unicode)); |
|
3046 } |
|
3047 |
|
3048 /* --- Raw Unicode Escape Codec ------------------------------------------- */ |
|
3049 |
|
3050 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, |
|
3051 Py_ssize_t size, |
|
3052 const char *errors) |
|
3053 { |
|
3054 const char *starts = s; |
|
3055 Py_ssize_t startinpos; |
|
3056 Py_ssize_t endinpos; |
|
3057 Py_ssize_t outpos; |
|
3058 PyUnicodeObject *v; |
|
3059 Py_UNICODE *p; |
|
3060 const char *end; |
|
3061 const char *bs; |
|
3062 PyObject *errorHandler = NULL; |
|
3063 PyObject *exc = NULL; |
|
3064 |
|
3065 /* Escaped strings will always be longer than the resulting |
|
3066 Unicode string, so we start with size here and then reduce the |
|
3067 length after conversion to the true value. (But decoding error |
|
3068 handler might have to resize the string) */ |
|
3069 v = _PyUnicode_New(size); |
|
3070 if (v == NULL) |
|
3071 goto onError; |
|
3072 if (size == 0) |
|
3073 return (PyObject *)v; |
|
3074 p = PyUnicode_AS_UNICODE(v); |
|
3075 end = s + size; |
|
3076 while (s < end) { |
|
3077 unsigned char c; |
|
3078 Py_UCS4 x; |
|
3079 int i; |
|
3080 int count; |
|
3081 |
|
3082 /* Non-escape characters are interpreted as Unicode ordinals */ |
|
3083 if (*s != '\\') { |
|
3084 *p++ = (unsigned char)*s++; |
|
3085 continue; |
|
3086 } |
|
3087 startinpos = s-starts; |
|
3088 |
|
3089 /* \u-escapes are only interpreted iff the number of leading |
|
3090 backslashes if odd */ |
|
3091 bs = s; |
|
3092 for (;s < end;) { |
|
3093 if (*s != '\\') |
|
3094 break; |
|
3095 *p++ = (unsigned char)*s++; |
|
3096 } |
|
3097 if (((s - bs) & 1) == 0 || |
|
3098 s >= end || |
|
3099 (*s != 'u' && *s != 'U')) { |
|
3100 continue; |
|
3101 } |
|
3102 p--; |
|
3103 count = *s=='u' ? 4 : 8; |
|
3104 s++; |
|
3105 |
|
3106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ |
|
3107 outpos = p-PyUnicode_AS_UNICODE(v); |
|
3108 for (x = 0, i = 0; i < count; ++i, ++s) { |
|
3109 c = (unsigned char)*s; |
|
3110 if (!isxdigit(c)) { |
|
3111 endinpos = s-starts; |
|
3112 if (unicode_decode_call_errorhandler( |
|
3113 errors, &errorHandler, |
|
3114 "rawunicodeescape", "truncated \\uXXXX", |
|
3115 starts, size, &startinpos, &endinpos, &exc, &s, |
|
3116 (PyObject **)&v, &outpos, &p)) |
|
3117 goto onError; |
|
3118 goto nextByte; |
|
3119 } |
|
3120 x = (x<<4) & ~0xF; |
|
3121 if (c >= '0' && c <= '9') |
|
3122 x += c - '0'; |
|
3123 else if (c >= 'a' && c <= 'f') |
|
3124 x += 10 + c - 'a'; |
|
3125 else |
|
3126 x += 10 + c - 'A'; |
|
3127 } |
|
3128 if (x <= 0xffff) |
|
3129 /* UCS-2 character */ |
|
3130 *p++ = (Py_UNICODE) x; |
|
3131 else if (x <= 0x10ffff) { |
|
3132 /* UCS-4 character. Either store directly, or as |
|
3133 surrogate pair. */ |
|
3134 #ifdef Py_UNICODE_WIDE |
|
3135 *p++ = (Py_UNICODE) x; |
|
3136 #else |
|
3137 x -= 0x10000L; |
|
3138 *p++ = 0xD800 + (Py_UNICODE) (x >> 10); |
|
3139 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF); |
|
3140 #endif |
|
3141 } else { |
|
3142 endinpos = s-starts; |
|
3143 outpos = p-PyUnicode_AS_UNICODE(v); |
|
3144 if (unicode_decode_call_errorhandler( |
|
3145 errors, &errorHandler, |
|
3146 "rawunicodeescape", "\\Uxxxxxxxx out of range", |
|
3147 starts, size, &startinpos, &endinpos, &exc, &s, |
|
3148 (PyObject **)&v, &outpos, &p)) |
|
3149 goto onError; |
|
3150 } |
|
3151 nextByte: |
|
3152 ; |
|
3153 } |
|
3154 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
|
3155 goto onError; |
|
3156 Py_XDECREF(errorHandler); |
|
3157 Py_XDECREF(exc); |
|
3158 return (PyObject *)v; |
|
3159 |
|
3160 onError: |
|
3161 Py_XDECREF(v); |
|
3162 Py_XDECREF(errorHandler); |
|
3163 Py_XDECREF(exc); |
|
3164 return NULL; |
|
3165 } |
|
3166 |
|
3167 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, |
|
3168 Py_ssize_t size) |
|
3169 { |
|
3170 PyObject *repr; |
|
3171 char *p; |
|
3172 char *q; |
|
3173 |
|
3174 static const char *hexdigit = "0123456789abcdef"; |
|
3175 #ifdef Py_UNICODE_WIDE |
|
3176 const Py_ssize_t expandsize = 10; |
|
3177 #else |
|
3178 const Py_ssize_t expandsize = 6; |
|
3179 #endif |
|
3180 |
|
3181 if (size > PY_SSIZE_T_MAX / expandsize) |
|
3182 return PyErr_NoMemory(); |
|
3183 |
|
3184 repr = PyString_FromStringAndSize(NULL, expandsize * size); |
|
3185 if (repr == NULL) |
|
3186 return NULL; |
|
3187 if (size == 0) |
|
3188 return repr; |
|
3189 |
|
3190 p = q = PyString_AS_STRING(repr); |
|
3191 while (size-- > 0) { |
|
3192 Py_UNICODE ch = *s++; |
|
3193 #ifdef Py_UNICODE_WIDE |
|
3194 /* Map 32-bit characters to '\Uxxxxxxxx' */ |
|
3195 if (ch >= 0x10000) { |
|
3196 *p++ = '\\'; |
|
3197 *p++ = 'U'; |
|
3198 *p++ = hexdigit[(ch >> 28) & 0xf]; |
|
3199 *p++ = hexdigit[(ch >> 24) & 0xf]; |
|
3200 *p++ = hexdigit[(ch >> 20) & 0xf]; |
|
3201 *p++ = hexdigit[(ch >> 16) & 0xf]; |
|
3202 *p++ = hexdigit[(ch >> 12) & 0xf]; |
|
3203 *p++ = hexdigit[(ch >> 8) & 0xf]; |
|
3204 *p++ = hexdigit[(ch >> 4) & 0xf]; |
|
3205 *p++ = hexdigit[ch & 15]; |
|
3206 } |
|
3207 else |
|
3208 #else |
|
3209 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ |
|
3210 if (ch >= 0xD800 && ch < 0xDC00) { |
|
3211 Py_UNICODE ch2; |
|
3212 Py_UCS4 ucs; |
|
3213 |
|
3214 ch2 = *s++; |
|
3215 size--; |
|
3216 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { |
|
3217 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; |
|
3218 *p++ = '\\'; |
|
3219 *p++ = 'U'; |
|
3220 *p++ = hexdigit[(ucs >> 28) & 0xf]; |
|
3221 *p++ = hexdigit[(ucs >> 24) & 0xf]; |
|
3222 *p++ = hexdigit[(ucs >> 20) & 0xf]; |
|
3223 *p++ = hexdigit[(ucs >> 16) & 0xf]; |
|
3224 *p++ = hexdigit[(ucs >> 12) & 0xf]; |
|
3225 *p++ = hexdigit[(ucs >> 8) & 0xf]; |
|
3226 *p++ = hexdigit[(ucs >> 4) & 0xf]; |
|
3227 *p++ = hexdigit[ucs & 0xf]; |
|
3228 continue; |
|
3229 } |
|
3230 /* Fall through: isolated surrogates are copied as-is */ |
|
3231 s--; |
|
3232 size++; |
|
3233 } |
|
3234 #endif |
|
3235 /* Map 16-bit characters to '\uxxxx' */ |
|
3236 if (ch >= 256) { |
|
3237 *p++ = '\\'; |
|
3238 *p++ = 'u'; |
|
3239 *p++ = hexdigit[(ch >> 12) & 0xf]; |
|
3240 *p++ = hexdigit[(ch >> 8) & 0xf]; |
|
3241 *p++ = hexdigit[(ch >> 4) & 0xf]; |
|
3242 *p++ = hexdigit[ch & 15]; |
|
3243 } |
|
3244 /* Copy everything else as-is */ |
|
3245 else |
|
3246 *p++ = (char) ch; |
|
3247 } |
|
3248 *p = '\0'; |
|
3249 _PyString_Resize(&repr, p - q); |
|
3250 return repr; |
|
3251 } |
|
3252 |
|
3253 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) |
|
3254 { |
|
3255 if (!PyUnicode_Check(unicode)) { |
|
3256 PyErr_BadArgument(); |
|
3257 return NULL; |
|
3258 } |
|
3259 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), |
|
3260 PyUnicode_GET_SIZE(unicode)); |
|
3261 } |
|
3262 |
|
3263 /* --- Unicode Internal Codec ------------------------------------------- */ |
|
3264 |
|
3265 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, |
|
3266 Py_ssize_t size, |
|
3267 const char *errors) |
|
3268 { |
|
3269 const char *starts = s; |
|
3270 Py_ssize_t startinpos; |
|
3271 Py_ssize_t endinpos; |
|
3272 Py_ssize_t outpos; |
|
3273 PyUnicodeObject *v; |
|
3274 Py_UNICODE *p; |
|
3275 const char *end; |
|
3276 const char *reason; |
|
3277 PyObject *errorHandler = NULL; |
|
3278 PyObject *exc = NULL; |
|
3279 |
|
3280 #ifdef Py_UNICODE_WIDE |
|
3281 Py_UNICODE unimax = PyUnicode_GetMax(); |
|
3282 #endif |
|
3283 |
|
3284 /* XXX overflow detection missing */ |
|
3285 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); |
|
3286 if (v == NULL) |
|
3287 goto onError; |
|
3288 if (PyUnicode_GetSize((PyObject *)v) == 0) |
|
3289 return (PyObject *)v; |
|
3290 p = PyUnicode_AS_UNICODE(v); |
|
3291 end = s + size; |
|
3292 |
|
3293 while (s < end) { |
|
3294 memcpy(p, s, sizeof(Py_UNICODE)); |
|
3295 /* We have to sanity check the raw data, otherwise doom looms for |
|
3296 some malformed UCS-4 data. */ |
|
3297 if ( |
|
3298 #ifdef Py_UNICODE_WIDE |
|
3299 *p > unimax || *p < 0 || |
|
3300 #endif |
|
3301 end-s < Py_UNICODE_SIZE |
|
3302 ) |
|
3303 { |
|
3304 startinpos = s - starts; |
|
3305 if (end-s < Py_UNICODE_SIZE) { |
|
3306 endinpos = end-starts; |
|
3307 reason = "truncated input"; |
|
3308 } |
|
3309 else { |
|
3310 endinpos = s - starts + Py_UNICODE_SIZE; |
|
3311 reason = "illegal code point (> 0x10FFFF)"; |
|
3312 } |
|
3313 outpos = p - PyUnicode_AS_UNICODE(v); |
|
3314 if (unicode_decode_call_errorhandler( |
|
3315 errors, &errorHandler, |
|
3316 "unicode_internal", reason, |
|
3317 starts, size, &startinpos, &endinpos, &exc, &s, |
|
3318 (PyObject **)&v, &outpos, &p)) { |
|
3319 goto onError; |
|
3320 } |
|
3321 } |
|
3322 else { |
|
3323 p++; |
|
3324 s += Py_UNICODE_SIZE; |
|
3325 } |
|
3326 } |
|
3327 |
|
3328 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
|
3329 goto onError; |
|
3330 Py_XDECREF(errorHandler); |
|
3331 Py_XDECREF(exc); |
|
3332 return (PyObject *)v; |
|
3333 |
|
3334 onError: |
|
3335 Py_XDECREF(v); |
|
3336 Py_XDECREF(errorHandler); |
|
3337 Py_XDECREF(exc); |
|
3338 return NULL; |
|
3339 } |
|
3340 |
|
3341 /* --- Latin-1 Codec ------------------------------------------------------ */ |
|
3342 |
|
3343 PyObject *PyUnicode_DecodeLatin1(const char *s, |
|
3344 Py_ssize_t size, |
|
3345 const char *errors) |
|
3346 { |
|
3347 PyUnicodeObject *v; |
|
3348 Py_UNICODE *p; |
|
3349 |
|
3350 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ |
|
3351 if (size == 1) { |
|
3352 Py_UNICODE r = *(unsigned char*)s; |
|
3353 return PyUnicode_FromUnicode(&r, 1); |
|
3354 } |
|
3355 |
|
3356 v = _PyUnicode_New(size); |
|
3357 if (v == NULL) |
|
3358 goto onError; |
|
3359 if (size == 0) |
|
3360 return (PyObject *)v; |
|
3361 p = PyUnicode_AS_UNICODE(v); |
|
3362 while (size-- > 0) |
|
3363 *p++ = (unsigned char)*s++; |
|
3364 return (PyObject *)v; |
|
3365 |
|
3366 onError: |
|
3367 Py_XDECREF(v); |
|
3368 return NULL; |
|
3369 } |
|
3370 |
|
3371 /* create or adjust a UnicodeEncodeError */ |
|
3372 static void make_encode_exception(PyObject **exceptionObject, |
|
3373 const char *encoding, |
|
3374 const Py_UNICODE *unicode, Py_ssize_t size, |
|
3375 Py_ssize_t startpos, Py_ssize_t endpos, |
|
3376 const char *reason) |
|
3377 { |
|
3378 if (*exceptionObject == NULL) { |
|
3379 *exceptionObject = PyUnicodeEncodeError_Create( |
|
3380 encoding, unicode, size, startpos, endpos, reason); |
|
3381 } |
|
3382 else { |
|
3383 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos)) |
|
3384 goto onError; |
|
3385 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos)) |
|
3386 goto onError; |
|
3387 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) |
|
3388 goto onError; |
|
3389 return; |
|
3390 onError: |
|
3391 Py_DECREF(*exceptionObject); |
|
3392 *exceptionObject = NULL; |
|
3393 } |
|
3394 } |
|
3395 |
|
3396 /* raises a UnicodeEncodeError */ |
|
3397 static void raise_encode_exception(PyObject **exceptionObject, |
|
3398 const char *encoding, |
|
3399 const Py_UNICODE *unicode, Py_ssize_t size, |
|
3400 Py_ssize_t startpos, Py_ssize_t endpos, |
|
3401 const char *reason) |
|
3402 { |
|
3403 make_encode_exception(exceptionObject, |
|
3404 encoding, unicode, size, startpos, endpos, reason); |
|
3405 if (*exceptionObject != NULL) |
|
3406 PyCodec_StrictErrors(*exceptionObject); |
|
3407 } |
|
3408 |
|
3409 /* error handling callback helper: |
|
3410 build arguments, call the callback and check the arguments, |
|
3411 put the result into newpos and return the replacement string, which |
|
3412 has to be freed by the caller */ |
|
3413 static PyObject *unicode_encode_call_errorhandler(const char *errors, |
|
3414 PyObject **errorHandler, |
|
3415 const char *encoding, const char *reason, |
|
3416 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, |
|
3417 Py_ssize_t startpos, Py_ssize_t endpos, |
|
3418 Py_ssize_t *newpos) |
|
3419 { |
|
3420 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple"; |
|
3421 |
|
3422 PyObject *restuple; |
|
3423 PyObject *resunicode; |
|
3424 |
|
3425 if (*errorHandler == NULL) { |
|
3426 *errorHandler = PyCodec_LookupError(errors); |
|
3427 if (*errorHandler == NULL) |
|
3428 return NULL; |
|
3429 } |
|
3430 |
|
3431 make_encode_exception(exceptionObject, |
|
3432 encoding, unicode, size, startpos, endpos, reason); |
|
3433 if (*exceptionObject == NULL) |
|
3434 return NULL; |
|
3435 |
|
3436 restuple = PyObject_CallFunctionObjArgs( |
|
3437 *errorHandler, *exceptionObject, NULL); |
|
3438 if (restuple == NULL) |
|
3439 return NULL; |
|
3440 if (!PyTuple_Check(restuple)) { |
|
3441 PyErr_Format(PyExc_TypeError, &argparse[4]); |
|
3442 Py_DECREF(restuple); |
|
3443 return NULL; |
|
3444 } |
|
3445 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, |
|
3446 &resunicode, newpos)) { |
|
3447 Py_DECREF(restuple); |
|
3448 return NULL; |
|
3449 } |
|
3450 if (*newpos<0) |
|
3451 *newpos = size+*newpos; |
|
3452 if (*newpos<0 || *newpos>size) { |
|
3453 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); |
|
3454 Py_DECREF(restuple); |
|
3455 return NULL; |
|
3456 } |
|
3457 Py_INCREF(resunicode); |
|
3458 Py_DECREF(restuple); |
|
3459 return resunicode; |
|
3460 } |
|
3461 |
|
3462 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, |
|
3463 Py_ssize_t size, |
|
3464 const char *errors, |
|
3465 int limit) |
|
3466 { |
|
3467 /* output object */ |
|
3468 PyObject *res; |
|
3469 /* pointers to the beginning and end+1 of input */ |
|
3470 const Py_UNICODE *startp = p; |
|
3471 const Py_UNICODE *endp = p + size; |
|
3472 /* pointer to the beginning of the unencodable characters */ |
|
3473 /* const Py_UNICODE *badp = NULL; */ |
|
3474 /* pointer into the output */ |
|
3475 char *str; |
|
3476 /* current output position */ |
|
3477 Py_ssize_t respos = 0; |
|
3478 Py_ssize_t ressize; |
|
3479 const char *encoding = (limit == 256) ? "latin-1" : "ascii"; |
|
3480 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; |
|
3481 PyObject *errorHandler = NULL; |
|
3482 PyObject *exc = NULL; |
|
3483 /* the following variable is used for caching string comparisons |
|
3484 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ |
|
3485 int known_errorHandler = -1; |
|
3486 |
|
3487 /* allocate enough for a simple encoding without |
|
3488 replacements, if we need more, we'll resize */ |
|
3489 res = PyString_FromStringAndSize(NULL, size); |
|
3490 if (res == NULL) |
|
3491 goto onError; |
|
3492 if (size == 0) |
|
3493 return res; |
|
3494 str = PyString_AS_STRING(res); |
|
3495 ressize = size; |
|
3496 |
|
3497 while (p<endp) { |
|
3498 Py_UNICODE c = *p; |
|
3499 |
|
3500 /* can we encode this? */ |
|
3501 if (c<limit) { |
|
3502 /* no overflow check, because we know that the space is enough */ |
|
3503 *str++ = (char)c; |
|
3504 ++p; |
|
3505 } |
|
3506 else { |
|
3507 Py_ssize_t unicodepos = p-startp; |
|
3508 Py_ssize_t requiredsize; |
|
3509 PyObject *repunicode; |
|
3510 Py_ssize_t repsize; |
|
3511 Py_ssize_t newpos; |
|
3512 Py_ssize_t respos; |
|
3513 Py_UNICODE *uni2; |
|
3514 /* startpos for collecting unencodable chars */ |
|
3515 const Py_UNICODE *collstart = p; |
|
3516 const Py_UNICODE *collend = p; |
|
3517 /* find all unecodable characters */ |
|
3518 while ((collend < endp) && ((*collend)>=limit)) |
|
3519 ++collend; |
|
3520 /* cache callback name lookup (if not done yet, i.e. it's the first error) */ |
|
3521 if (known_errorHandler==-1) { |
|
3522 if ((errors==NULL) || (!strcmp(errors, "strict"))) |
|
3523 known_errorHandler = 1; |
|
3524 else if (!strcmp(errors, "replace")) |
|
3525 known_errorHandler = 2; |
|
3526 else if (!strcmp(errors, "ignore")) |
|
3527 known_errorHandler = 3; |
|
3528 else if (!strcmp(errors, "xmlcharrefreplace")) |
|
3529 known_errorHandler = 4; |
|
3530 else |
|
3531 known_errorHandler = 0; |
|
3532 } |
|
3533 switch (known_errorHandler) { |
|
3534 case 1: /* strict */ |
|
3535 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason); |
|
3536 goto onError; |
|
3537 case 2: /* replace */ |
|
3538 while (collstart++<collend) |
|
3539 *str++ = '?'; /* fall through */ |
|
3540 case 3: /* ignore */ |
|
3541 p = collend; |
|
3542 break; |
|
3543 case 4: /* xmlcharrefreplace */ |
|
3544 respos = str-PyString_AS_STRING(res); |
|
3545 /* determine replacement size (temporarily (mis)uses p) */ |
|
3546 for (p = collstart, repsize = 0; p < collend; ++p) { |
|
3547 if (*p<10) |
|
3548 repsize += 2+1+1; |
|
3549 else if (*p<100) |
|
3550 repsize += 2+2+1; |
|
3551 else if (*p<1000) |
|
3552 repsize += 2+3+1; |
|
3553 else if (*p<10000) |
|
3554 repsize += 2+4+1; |
|
3555 #ifndef Py_UNICODE_WIDE |
|
3556 else |
|
3557 repsize += 2+5+1; |
|
3558 #else |
|
3559 else if (*p<100000) |
|
3560 repsize += 2+5+1; |
|
3561 else if (*p<1000000) |
|
3562 repsize += 2+6+1; |
|
3563 else |
|
3564 repsize += 2+7+1; |
|
3565 #endif |
|
3566 } |
|
3567 requiredsize = respos+repsize+(endp-collend); |
|
3568 if (requiredsize > ressize) { |
|
3569 if (requiredsize<2*ressize) |
|
3570 requiredsize = 2*ressize; |
|
3571 if (_PyString_Resize(&res, requiredsize)) |
|
3572 goto onError; |
|
3573 str = PyString_AS_STRING(res) + respos; |
|
3574 ressize = requiredsize; |
|
3575 } |
|
3576 /* generate replacement (temporarily (mis)uses p) */ |
|
3577 for (p = collstart; p < collend; ++p) { |
|
3578 str += sprintf(str, "&#%d;", (int)*p); |
|
3579 } |
|
3580 p = collend; |
|
3581 break; |
|
3582 default: |
|
3583 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, |
|
3584 encoding, reason, startp, size, &exc, |
|
3585 collstart-startp, collend-startp, &newpos); |
|
3586 if (repunicode == NULL) |
|
3587 goto onError; |
|
3588 /* need more space? (at least enough for what we |
|
3589 have+the replacement+the rest of the string, so |
|
3590 we won't have to check space for encodable characters) */ |
|
3591 respos = str-PyString_AS_STRING(res); |
|
3592 repsize = PyUnicode_GET_SIZE(repunicode); |
|
3593 requiredsize = respos+repsize+(endp-collend); |
|
3594 if (requiredsize > ressize) { |
|
3595 if (requiredsize<2*ressize) |
|
3596 requiredsize = 2*ressize; |
|
3597 if (_PyString_Resize(&res, requiredsize)) { |
|
3598 Py_DECREF(repunicode); |
|
3599 goto onError; |
|
3600 } |
|
3601 str = PyString_AS_STRING(res) + respos; |
|
3602 ressize = requiredsize; |
|
3603 } |
|
3604 /* check if there is anything unencodable in the replacement |
|
3605 and copy it to the output */ |
|
3606 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) { |
|
3607 c = *uni2; |
|
3608 if (c >= limit) { |
|
3609 raise_encode_exception(&exc, encoding, startp, size, |
|
3610 unicodepos, unicodepos+1, reason); |
|
3611 Py_DECREF(repunicode); |
|
3612 goto onError; |
|
3613 } |
|
3614 *str = (char)c; |
|
3615 } |
|
3616 p = startp + newpos; |
|
3617 Py_DECREF(repunicode); |
|
3618 } |
|
3619 } |
|
3620 } |
|
3621 /* Resize if we allocated to much */ |
|
3622 respos = str-PyString_AS_STRING(res); |
|
3623 if (respos<ressize) |
|
3624 /* If this falls res will be NULL */ |
|
3625 _PyString_Resize(&res, respos); |
|
3626 Py_XDECREF(errorHandler); |
|
3627 Py_XDECREF(exc); |
|
3628 return res; |
|
3629 |
|
3630 onError: |
|
3631 Py_XDECREF(res); |
|
3632 Py_XDECREF(errorHandler); |
|
3633 Py_XDECREF(exc); |
|
3634 return NULL; |
|
3635 } |
|
3636 |
|
3637 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, |
|
3638 Py_ssize_t size, |
|
3639 const char *errors) |
|
3640 { |
|
3641 return unicode_encode_ucs1(p, size, errors, 256); |
|
3642 } |
|
3643 |
|
3644 PyObject *PyUnicode_AsLatin1String(PyObject *unicode) |
|
3645 { |
|
3646 if (!PyUnicode_Check(unicode)) { |
|
3647 PyErr_BadArgument(); |
|
3648 return NULL; |
|
3649 } |
|
3650 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), |
|
3651 PyUnicode_GET_SIZE(unicode), |
|
3652 NULL); |
|
3653 } |
|
3654 |
|
3655 /* --- 7-bit ASCII Codec -------------------------------------------------- */ |
|
3656 |
|
3657 PyObject *PyUnicode_DecodeASCII(const char *s, |
|
3658 Py_ssize_t size, |
|
3659 const char *errors) |
|
3660 { |
|
3661 const char *starts = s; |
|
3662 PyUnicodeObject *v; |
|
3663 Py_UNICODE *p; |
|
3664 Py_ssize_t startinpos; |
|
3665 Py_ssize_t endinpos; |
|
3666 Py_ssize_t outpos; |
|
3667 const char *e; |
|
3668 PyObject *errorHandler = NULL; |
|
3669 PyObject *exc = NULL; |
|
3670 |
|
3671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */ |
|
3672 if (size == 1 && *(unsigned char*)s < 128) { |
|
3673 Py_UNICODE r = *(unsigned char*)s; |
|
3674 return PyUnicode_FromUnicode(&r, 1); |
|
3675 } |
|
3676 |
|
3677 v = _PyUnicode_New(size); |
|
3678 if (v == NULL) |
|
3679 goto onError; |
|
3680 if (size == 0) |
|
3681 return (PyObject *)v; |
|
3682 p = PyUnicode_AS_UNICODE(v); |
|
3683 e = s + size; |
|
3684 while (s < e) { |
|
3685 register unsigned char c = (unsigned char)*s; |
|
3686 if (c < 128) { |
|
3687 *p++ = c; |
|
3688 ++s; |
|
3689 } |
|
3690 else { |
|
3691 startinpos = s-starts; |
|
3692 endinpos = startinpos + 1; |
|
3693 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); |
|
3694 if (unicode_decode_call_errorhandler( |
|
3695 errors, &errorHandler, |
|
3696 "ascii", "ordinal not in range(128)", |
|
3697 starts, size, &startinpos, &endinpos, &exc, &s, |
|
3698 (PyObject **)&v, &outpos, &p)) |
|
3699 goto onError; |
|
3700 } |
|
3701 } |
|
3702 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v)) |
|
3703 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
|
3704 goto onError; |
|
3705 Py_XDECREF(errorHandler); |
|
3706 Py_XDECREF(exc); |
|
3707 return (PyObject *)v; |
|
3708 |
|
3709 onError: |
|
3710 Py_XDECREF(v); |
|
3711 Py_XDECREF(errorHandler); |
|
3712 Py_XDECREF(exc); |
|
3713 return NULL; |
|
3714 } |
|
3715 |
|
3716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p, |
|
3717 Py_ssize_t size, |
|
3718 const char *errors) |
|
3719 { |
|
3720 return unicode_encode_ucs1(p, size, errors, 128); |
|
3721 } |
|
3722 |
|
3723 PyObject *PyUnicode_AsASCIIString(PyObject *unicode) |
|
3724 { |
|
3725 if (!PyUnicode_Check(unicode)) { |
|
3726 PyErr_BadArgument(); |
|
3727 return NULL; |
|
3728 } |
|
3729 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), |
|
3730 PyUnicode_GET_SIZE(unicode), |
|
3731 NULL); |
|
3732 } |
|
3733 |
|
3734 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) |
|
3735 |
|
3736 /* --- MBCS codecs for Windows -------------------------------------------- */ |
|
3737 |
|
3738 #if SIZEOF_INT < SIZEOF_SSIZE_T |
|
3739 #define NEED_RETRY |
|
3740 #endif |
|
3741 |
|
3742 /* XXX This code is limited to "true" double-byte encodings, as |
|
3743 a) it assumes an incomplete character consists of a single byte, and |
|
3744 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte |
|
3745 encodings, see IsDBCSLeadByteEx documentation. */ |
|
3746 |
|
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  Besides asking IsDBCSLeadByte() about the byte itself, the
   character preceding it (found with CharPrev()) is inspected. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
|
3757 |
|
3758 /* |
|
3759 * Decode MBCS string into unicode object. If 'final' is set, converts |
|
3760 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise. |
|
3761 */ |
|
3762 static int decode_mbcs(PyUnicodeObject **v, |
|
3763 const char *s, /* MBCS string */ |
|
3764 int size, /* sizeof MBCS string */ |
|
3765 int final) |
|
3766 { |
|
3767 Py_UNICODE *p; |
|
3768 Py_ssize_t n = 0; |
|
3769 int usize = 0; |
|
3770 |
|
3771 assert(size >= 0); |
|
3772 |
|
3773 /* Skip trailing lead-byte unless 'final' is set */ |
|
3774 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) |
|
3775 --size; |
|
3776 |
|
3777 /* First get the size of the result */ |
|
3778 if (size > 0) { |
|
3779 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); |
|
3780 if (usize == 0) { |
|
3781 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
|
3782 return -1; |
|
3783 } |
|
3784 } |
|
3785 |
|
3786 if (*v == NULL) { |
|
3787 /* Create unicode object */ |
|
3788 *v = _PyUnicode_New(usize); |
|
3789 if (*v == NULL) |
|
3790 return -1; |
|
3791 } |
|
3792 else { |
|
3793 /* Extend unicode object */ |
|
3794 n = PyUnicode_GET_SIZE(*v); |
|
3795 if (_PyUnicode_Resize(v, n + usize) < 0) |
|
3796 return -1; |
|
3797 } |
|
3798 |
|
3799 /* Do the conversion */ |
|
3800 if (size > 0) { |
|
3801 p = PyUnicode_AS_UNICODE(*v) + n; |
|
3802 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { |
|
3803 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
|
3804 return -1; |
|
3805 } |
|
3806 } |
|
3807 |
|
3808 return size; |
|
3809 } |
|
3810 |
|
3811 PyObject *PyUnicode_DecodeMBCSStateful(const char *s, |
|
3812 Py_ssize_t size, |
|
3813 const char *errors, |
|
3814 Py_ssize_t *consumed) |
|
3815 { |
|
3816 PyUnicodeObject *v = NULL; |
|
3817 int done; |
|
3818 |
|
3819 if (consumed) |
|
3820 *consumed = 0; |
|
3821 |
|
3822 #ifdef NEED_RETRY |
|
3823 retry: |
|
3824 if (size > INT_MAX) |
|
3825 done = decode_mbcs(&v, s, INT_MAX, 0); |
|
3826 else |
|
3827 #endif |
|
3828 done = decode_mbcs(&v, s, (int)size, !consumed); |
|
3829 |
|
3830 if (done < 0) { |
|
3831 Py_XDECREF(v); |
|
3832 return NULL; |
|
3833 } |
|
3834 |
|
3835 if (consumed) |
|
3836 *consumed += done; |
|
3837 |
|
3838 #ifdef NEED_RETRY |
|
3839 if (size > INT_MAX) { |
|
3840 s += done; |
|
3841 size -= done; |
|
3842 goto retry; |
|
3843 } |
|
3844 #endif |
|
3845 |
|
3846 return (PyObject *)v; |
|
3847 } |
|
3848 |
|
3849 PyObject *PyUnicode_DecodeMBCS(const char *s, |
|
3850 Py_ssize_t size, |
|
3851 const char *errors) |
|
3852 { |
|
3853 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); |
|
3854 } |
|
3855 |
|
3856 /* |
|
3857 * Convert unicode into string object (MBCS). |
|
3858 * Returns 0 if succeed, -1 otherwise. |
|
3859 */ |
|
3860 static int encode_mbcs(PyObject **repr, |
|
3861 const Py_UNICODE *p, /* unicode */ |
|
3862 int size) /* size of unicode */ |
|
3863 { |
|
3864 int mbcssize = 0; |
|
3865 Py_ssize_t n = 0; |
|
3866 |
|
3867 assert(size >= 0); |
|
3868 |
|
3869 /* First get the size of the result */ |
|
3870 if (size > 0) { |
|
3871 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); |
|
3872 if (mbcssize == 0) { |
|
3873 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
|
3874 return -1; |
|
3875 } |
|
3876 } |
|
3877 |
|
3878 if (*repr == NULL) { |
|
3879 /* Create string object */ |
|
3880 *repr = PyString_FromStringAndSize(NULL, mbcssize); |
|
3881 if (*repr == NULL) |
|
3882 return -1; |
|
3883 } |
|
3884 else { |
|
3885 /* Extend string object */ |
|
3886 n = PyString_Size(*repr); |
|
3887 if (_PyString_Resize(repr, n + mbcssize) < 0) |
|
3888 return -1; |
|
3889 } |
|
3890 |
|
3891 /* Do the conversion */ |
|
3892 if (size > 0) { |
|
3893 char *s = PyString_AS_STRING(*repr) + n; |
|
3894 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { |
|
3895 PyErr_SetFromWindowsErrWithFilename(0, NULL); |
|
3896 return -1; |
|
3897 } |
|
3898 } |
|
3899 |
|
3900 return 0; |
|
3901 } |
|
3902 |
|
3903 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, |
|
3904 Py_ssize_t size, |
|
3905 const char *errors) |
|
3906 { |
|
3907 PyObject *repr = NULL; |
|
3908 int ret; |
|
3909 |
|
3910 #ifdef NEED_RETRY |
|
3911 retry: |
|
3912 if (size > INT_MAX) |
|
3913 ret = encode_mbcs(&repr, p, INT_MAX); |
|
3914 else |
|
3915 #endif |
|
3916 ret = encode_mbcs(&repr, p, (int)size); |
|
3917 |
|
3918 if (ret < 0) { |
|
3919 Py_XDECREF(repr); |
|
3920 return NULL; |
|
3921 } |
|
3922 |
|
3923 #ifdef NEED_RETRY |
|
3924 if (size > INT_MAX) { |
|
3925 p += INT_MAX; |
|
3926 size -= INT_MAX; |
|
3927 goto retry; |
|
3928 } |
|
3929 #endif |
|
3930 |
|
3931 return repr; |
|
3932 } |
|
3933 |
|
3934 PyObject *PyUnicode_AsMBCSString(PyObject *unicode) |
|
3935 { |
|
3936 if (!PyUnicode_Check(unicode)) { |
|
3937 PyErr_BadArgument(); |
|
3938 return NULL; |
|
3939 } |
|
3940 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), |
|
3941 PyUnicode_GET_SIZE(unicode), |
|
3942 NULL); |
|
3943 } |
|
3944 |
|
3945 #undef NEED_RETRY |
|
3946 |
|
3947 #endif /* MS_WINDOWS */ |
|
3948 |
|
3949 /* --- Character Mapping Codec -------------------------------------------- */ |
|
3950 |
|
3951 PyObject *PyUnicode_DecodeCharmap(const char *s, |
|
3952 Py_ssize_t size, |
|
3953 PyObject *mapping, |
|
3954 const char *errors) |
|
3955 { |
|
3956 const char *starts = s; |
|
3957 Py_ssize_t startinpos; |
|
3958 Py_ssize_t endinpos; |
|
3959 Py_ssize_t outpos; |
|
3960 const char *e; |
|
3961 PyUnicodeObject *v; |
|
3962 Py_UNICODE *p; |
|
3963 Py_ssize_t extrachars = 0; |
|
3964 PyObject *errorHandler = NULL; |
|
3965 PyObject *exc = NULL; |
|
3966 Py_UNICODE *mapstring = NULL; |
|
3967 Py_ssize_t maplen = 0; |
|
3968 |
|
3969 /* Default to Latin-1 */ |
|
3970 if (mapping == NULL) |
|
3971 return PyUnicode_DecodeLatin1(s, size, errors); |
|
3972 |
|
3973 v = _PyUnicode_New(size); |
|
3974 if (v == NULL) |
|
3975 goto onError; |
|
3976 if (size == 0) |
|
3977 return (PyObject *)v; |
|
3978 p = PyUnicode_AS_UNICODE(v); |
|
3979 e = s + size; |
|
3980 if (PyUnicode_CheckExact(mapping)) { |
|
3981 mapstring = PyUnicode_AS_UNICODE(mapping); |
|
3982 maplen = PyUnicode_GET_SIZE(mapping); |
|
3983 while (s < e) { |
|
3984 unsigned char ch = *s; |
|
3985 Py_UNICODE x = 0xfffe; /* illegal value */ |
|
3986 |
|
3987 if (ch < maplen) |
|
3988 x = mapstring[ch]; |
|
3989 |
|
3990 if (x == 0xfffe) { |
|
3991 /* undefined mapping */ |
|
3992 outpos = p-PyUnicode_AS_UNICODE(v); |
|
3993 startinpos = s-starts; |
|
3994 endinpos = startinpos+1; |
|
3995 if (unicode_decode_call_errorhandler( |
|
3996 errors, &errorHandler, |
|
3997 "charmap", "character maps to <undefined>", |
|
3998 starts, size, &startinpos, &endinpos, &exc, &s, |
|
3999 (PyObject **)&v, &outpos, &p)) { |
|
4000 goto onError; |
|
4001 } |
|
4002 continue; |
|
4003 } |
|
4004 *p++ = x; |
|
4005 ++s; |
|
4006 } |
|
4007 } |
|
4008 else { |
|
4009 while (s < e) { |
|
4010 unsigned char ch = *s; |
|
4011 PyObject *w, *x; |
|
4012 |
|
4013 /* Get mapping (char ordinal -> integer, Unicode char or None) */ |
|
4014 w = PyInt_FromLong((long)ch); |
|
4015 if (w == NULL) |
|
4016 goto onError; |
|
4017 x = PyObject_GetItem(mapping, w); |
|
4018 Py_DECREF(w); |
|
4019 if (x == NULL) { |
|
4020 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
|
4021 /* No mapping found means: mapping is undefined. */ |
|
4022 PyErr_Clear(); |
|
4023 x = Py_None; |
|
4024 Py_INCREF(x); |
|
4025 } else |
|
4026 goto onError; |
|
4027 } |
|
4028 |
|
4029 /* Apply mapping */ |
|
4030 if (PyInt_Check(x)) { |
|
4031 long value = PyInt_AS_LONG(x); |
|
4032 if (value < 0 || value > 65535) { |
|
4033 PyErr_SetString(PyExc_TypeError, |
|
4034 "character mapping must be in range(65536)"); |
|
4035 Py_DECREF(x); |
|
4036 goto onError; |
|
4037 } |
|
4038 *p++ = (Py_UNICODE)value; |
|
4039 } |
|
4040 else if (x == Py_None) { |
|
4041 /* undefined mapping */ |
|
4042 outpos = p-PyUnicode_AS_UNICODE(v); |
|
4043 startinpos = s-starts; |
|
4044 endinpos = startinpos+1; |
|
4045 if (unicode_decode_call_errorhandler( |
|
4046 errors, &errorHandler, |
|
4047 "charmap", "character maps to <undefined>", |
|
4048 starts, size, &startinpos, &endinpos, &exc, &s, |
|
4049 (PyObject **)&v, &outpos, &p)) { |
|
4050 Py_DECREF(x); |
|
4051 goto onError; |
|
4052 } |
|
4053 Py_DECREF(x); |
|
4054 continue; |
|
4055 } |
|
4056 else if (PyUnicode_Check(x)) { |
|
4057 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); |
|
4058 |
|
4059 if (targetsize == 1) |
|
4060 /* 1-1 mapping */ |
|
4061 *p++ = *PyUnicode_AS_UNICODE(x); |
|
4062 |
|
4063 else if (targetsize > 1) { |
|
4064 /* 1-n mapping */ |
|
4065 if (targetsize > extrachars) { |
|
4066 /* resize first */ |
|
4067 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v); |
|
4068 Py_ssize_t needed = (targetsize - extrachars) + \ |
|
4069 (targetsize << 2); |
|
4070 extrachars += needed; |
|
4071 /* XXX overflow detection missing */ |
|
4072 if (_PyUnicode_Resize(&v, |
|
4073 PyUnicode_GET_SIZE(v) + needed) < 0) { |
|
4074 Py_DECREF(x); |
|
4075 goto onError; |
|
4076 } |
|
4077 p = PyUnicode_AS_UNICODE(v) + oldpos; |
|
4078 } |
|
4079 Py_UNICODE_COPY(p, |
|
4080 PyUnicode_AS_UNICODE(x), |
|
4081 targetsize); |
|
4082 p += targetsize; |
|
4083 extrachars -= targetsize; |
|
4084 } |
|
4085 /* 1-0 mapping: skip the character */ |
|
4086 } |
|
4087 else { |
|
4088 /* wrong return value */ |
|
4089 PyErr_SetString(PyExc_TypeError, |
|
4090 "character mapping must return integer, None or unicode"); |
|
4091 Py_DECREF(x); |
|
4092 goto onError; |
|
4093 } |
|
4094 Py_DECREF(x); |
|
4095 ++s; |
|
4096 } |
|
4097 } |
|
4098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) |
|
4099 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) |
|
4100 goto onError; |
|
4101 Py_XDECREF(errorHandler); |
|
4102 Py_XDECREF(exc); |
|
4103 return (PyObject *)v; |
|
4104 |
|
4105 onError: |
|
4106 Py_XDECREF(errorHandler); |
|
4107 Py_XDECREF(exc); |
|
4108 Py_XDECREF(v); |
|
4109 return NULL; |
|
4110 } |
|
4111 |
|
4112 /* Charmap encoding: the lookup table */ |
|
4113 |
|
4114 struct encoding_map{ |
|
4115 PyObject_HEAD |
|
4116 unsigned char level1[32]; |
|
4117 int count2, count3; |
|
4118 unsigned char level23[1]; |
|
4119 }; |
|
4120 |
|
4121 static PyObject* |
|
4122 encoding_map_size(PyObject *obj, PyObject* args) |
|
4123 { |
|
4124 struct encoding_map *map = (struct encoding_map*)obj; |
|
4125 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + |
|
4126 128*map->count3); |
|
4127 } |
|
4128 |
|
4129 static PyMethodDef encoding_map_methods[] = { |
|
4130 {"size", encoding_map_size, METH_NOARGS, |
|
4131 PyDoc_STR("Return the size (in bytes) of this object") }, |
|
4132 { 0 } |
|
4133 }; |
|
4134 |
|
4135 static void |
|
4136 encoding_map_dealloc(PyObject* o) |
|
4137 { |
|
4138 PyObject_FREE(o); |
|
4139 } |
|
4140 |
|
4141 static PyTypeObject EncodingMapType = { |
|
4142 PyVarObject_HEAD_INIT(NULL, 0) |
|
4143 "EncodingMap", /*tp_name*/ |
|
4144 sizeof(struct encoding_map), /*tp_basicsize*/ |
|
4145 0, /*tp_itemsize*/ |
|
4146 /* methods */ |
|
4147 encoding_map_dealloc, /*tp_dealloc*/ |
|
4148 0, /*tp_print*/ |
|
4149 0, /*tp_getattr*/ |
|
4150 0, /*tp_setattr*/ |
|
4151 0, /*tp_compare*/ |
|
4152 0, /*tp_repr*/ |
|
4153 0, /*tp_as_number*/ |
|
4154 0, /*tp_as_sequence*/ |
|
4155 0, /*tp_as_mapping*/ |
|
4156 0, /*tp_hash*/ |
|
4157 0, /*tp_call*/ |
|
4158 0, /*tp_str*/ |
|
4159 0, /*tp_getattro*/ |
|
4160 0, /*tp_setattro*/ |
|
4161 0, /*tp_as_buffer*/ |
|
4162 Py_TPFLAGS_DEFAULT, /*tp_flags*/ |
|
4163 0, /*tp_doc*/ |
|
4164 0, /*tp_traverse*/ |
|
4165 0, /*tp_clear*/ |
|
4166 0, /*tp_richcompare*/ |
|
4167 0, /*tp_weaklistoffset*/ |
|
4168 0, /*tp_iter*/ |
|
4169 0, /*tp_iternext*/ |
|
4170 encoding_map_methods, /*tp_methods*/ |
|
4171 0, /*tp_members*/ |
|
4172 0, /*tp_getset*/ |
|
4173 0, /*tp_base*/ |
|
4174 0, /*tp_dict*/ |
|
4175 0, /*tp_descr_get*/ |
|
4176 0, /*tp_descr_set*/ |
|
4177 0, /*tp_dictoffset*/ |
|
4178 0, /*tp_init*/ |
|
4179 0, /*tp_alloc*/ |
|
4180 0, /*tp_new*/ |
|
4181 0, /*tp_free*/ |
|
4182 0, /*tp_is_gc*/ |
|
4183 }; |
|
4184 |
|
4185 PyObject* |
|
4186 PyUnicode_BuildEncodingMap(PyObject* string) |
|
4187 { |
|
4188 Py_UNICODE *decode; |
|
4189 PyObject *result; |
|
4190 struct encoding_map *mresult; |
|
4191 int i; |
|
4192 int need_dict = 0; |
|
4193 unsigned char level1[32]; |
|
4194 unsigned char level2[512]; |
|
4195 unsigned char *mlevel1, *mlevel2, *mlevel3; |
|
4196 int count2 = 0, count3 = 0; |
|
4197 |
|
4198 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) { |
|
4199 PyErr_BadArgument(); |
|
4200 return NULL; |
|
4201 } |
|
4202 decode = PyUnicode_AS_UNICODE(string); |
|
4203 memset(level1, 0xFF, sizeof level1); |
|
4204 memset(level2, 0xFF, sizeof level2); |
|
4205 |
|
4206 /* If there isn't a one-to-one mapping of NULL to \0, |
|
4207 or if there are non-BMP characters, we need to use |
|
4208 a mapping dictionary. */ |
|
4209 if (decode[0] != 0) |
|
4210 need_dict = 1; |
|
4211 for (i = 1; i < 256; i++) { |
|
4212 int l1, l2; |
|
4213 if (decode[i] == 0 |
|
4214 #ifdef Py_UNICODE_WIDE |
|
4215 || decode[i] > 0xFFFF |
|
4216 #endif |
|
4217 ) { |
|
4218 need_dict = 1; |
|
4219 break; |
|
4220 } |
|
4221 if (decode[i] == 0xFFFE) |
|
4222 /* unmapped character */ |
|
4223 continue; |
|
4224 l1 = decode[i] >> 11; |
|
4225 l2 = decode[i] >> 7; |
|
4226 if (level1[l1] == 0xFF) |
|
4227 level1[l1] = count2++; |
|
4228 if (level2[l2] == 0xFF) |
|
4229 level2[l2] = count3++; |
|
4230 } |
|
4231 |
|
4232 if (count2 >= 0xFF || count3 >= 0xFF) |
|
4233 need_dict = 1; |
|
4234 |
|
4235 if (need_dict) { |
|
4236 PyObject *result = PyDict_New(); |
|
4237 PyObject *key, *value; |
|
4238 if (!result) |
|
4239 return NULL; |
|
4240 for (i = 0; i < 256; i++) { |
|
4241 key = value = NULL; |
|
4242 key = PyInt_FromLong(decode[i]); |
|
4243 value = PyInt_FromLong(i); |
|
4244 if (!key || !value) |
|
4245 goto failed1; |
|
4246 if (PyDict_SetItem(result, key, value) == -1) |
|
4247 goto failed1; |
|
4248 Py_DECREF(key); |
|
4249 Py_DECREF(value); |
|
4250 } |
|
4251 return result; |
|
4252 failed1: |
|
4253 Py_XDECREF(key); |
|
4254 Py_XDECREF(value); |
|
4255 Py_DECREF(result); |
|
4256 return NULL; |
|
4257 } |
|
4258 |
|
4259 /* Create a three-level trie */ |
|
4260 result = PyObject_MALLOC(sizeof(struct encoding_map) + |
|
4261 16*count2 + 128*count3 - 1); |
|
4262 if (!result) |
|
4263 return PyErr_NoMemory(); |
|
4264 PyObject_Init(result, &EncodingMapType); |
|
4265 mresult = (struct encoding_map*)result; |
|
4266 mresult->count2 = count2; |
|
4267 mresult->count3 = count3; |
|
4268 mlevel1 = mresult->level1; |
|
4269 mlevel2 = mresult->level23; |
|
4270 mlevel3 = mresult->level23 + 16*count2; |
|
4271 memcpy(mlevel1, level1, 32); |
|
4272 memset(mlevel2, 0xFF, 16*count2); |
|
4273 memset(mlevel3, 0, 128*count3); |
|
4274 count3 = 0; |
|
4275 for (i = 1; i < 256; i++) { |
|
4276 int o1, o2, o3, i2, i3; |
|
4277 if (decode[i] == 0xFFFE) |
|
4278 /* unmapped character */ |
|
4279 continue; |
|
4280 o1 = decode[i]>>11; |
|
4281 o2 = (decode[i]>>7) & 0xF; |
|
4282 i2 = 16*mlevel1[o1] + o2; |
|
4283 if (mlevel2[i2] == 0xFF) |
|
4284 mlevel2[i2] = count3++; |
|
4285 o3 = decode[i] & 0x7F; |
|
4286 i3 = 128*mlevel2[i2] + o3; |
|
4287 mlevel3[i3] = i; |
|
4288 } |
|
4289 return result; |
|
4290 } |
|
4291 |
|
4292 static int |
|
4293 encoding_map_lookup(Py_UNICODE c, PyObject *mapping) |
|
4294 { |
|
4295 struct encoding_map *map = (struct encoding_map*)mapping; |
|
4296 int l1 = c>>11; |
|
4297 int l2 = (c>>7) & 0xF; |
|
4298 int l3 = c & 0x7F; |
|
4299 int i; |
|
4300 |
|
4301 #ifdef Py_UNICODE_WIDE |
|
4302 if (c > 0xFFFF) { |
|
4303 return -1; |
|
4304 } |
|
4305 #endif |
|
4306 if (c == 0) |
|
4307 return 0; |
|
4308 /* level 1*/ |
|
4309 i = map->level1[l1]; |
|
4310 if (i == 0xFF) { |
|
4311 return -1; |
|
4312 } |
|
4313 /* level 2*/ |
|
4314 i = map->level23[16*i+l2]; |
|
4315 if (i == 0xFF) { |
|
4316 return -1; |
|
4317 } |
|
4318 /* level 3 */ |
|
4319 i = map->level23[16*map->count2 + 128*i + l3]; |
|
4320 if (i == 0) { |
|
4321 return -1; |
|
4322 } |
|
4323 return i; |
|
4324 } |
|
4325 |
|
4326 /* Lookup the character ch in the mapping. If the character |
|
4327 can't be found, Py_None is returned (or NULL, if another |
|
4328 error occurred). */ |
|
4329 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) |
|
4330 { |
|
4331 PyObject *w = PyInt_FromLong((long)c); |
|
4332 PyObject *x; |
|
4333 |
|
4334 if (w == NULL) |
|
4335 return NULL; |
|
4336 x = PyObject_GetItem(mapping, w); |
|
4337 Py_DECREF(w); |
|
4338 if (x == NULL) { |
|
4339 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
|
4340 /* No mapping found means: mapping is undefined. */ |
|
4341 PyErr_Clear(); |
|
4342 x = Py_None; |
|
4343 Py_INCREF(x); |
|
4344 return x; |
|
4345 } else |
|
4346 return NULL; |
|
4347 } |
|
4348 else if (x == Py_None) |
|
4349 return x; |
|
4350 else if (PyInt_Check(x)) { |
|
4351 long value = PyInt_AS_LONG(x); |
|
4352 if (value < 0 || value > 255) { |
|
4353 PyErr_SetString(PyExc_TypeError, |
|
4354 "character mapping must be in range(256)"); |
|
4355 Py_DECREF(x); |
|
4356 return NULL; |
|
4357 } |
|
4358 return x; |
|
4359 } |
|
4360 else if (PyString_Check(x)) |
|
4361 return x; |
|
4362 else { |
|
4363 /* wrong return value */ |
|
4364 PyErr_SetString(PyExc_TypeError, |
|
4365 "character mapping must return integer, None or str"); |
|
4366 Py_DECREF(x); |
|
4367 return NULL; |
|
4368 } |
|
4369 } |
|
4370 |
|
4371 static int |
|
4372 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) |
|
4373 { |
|
4374 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); |
|
4375 /* exponentially overallocate to minimize reallocations */ |
|
4376 if (requiredsize < 2*outsize) |
|
4377 requiredsize = 2*outsize; |
|
4378 if (_PyString_Resize(outobj, requiredsize)) { |
|
4379 return 0; |
|
4380 } |
|
4381 return 1; |
|
4382 } |
|
4383 |
|
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping has no entry for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
|
4387 /* lookup the character, put the result in the output string and adjust |
|
4388 various state variables. Reallocate the output string if not enough |
|
4389 space is available. Return a new reference to the object that |
|
4390 was put in the output buffer, or Py_None, if the mapping was undefined |
|
4391 (in which case no character was written) or NULL, if a |
|
4392 reallocation error occurred. The caller must decref the result */ |
|
4393 static |
|
4394 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, |
|
4395 PyObject **outobj, Py_ssize_t *outpos) |
|
4396 { |
|
4397 PyObject *rep; |
|
4398 char *outstart; |
|
4399 Py_ssize_t outsize = PyString_GET_SIZE(*outobj); |
|
4400 |
|
4401 if (Py_TYPE(mapping) == &EncodingMapType) { |
|
4402 int res = encoding_map_lookup(c, mapping); |
|
4403 Py_ssize_t requiredsize = *outpos+1; |
|
4404 if (res == -1) |
|
4405 return enc_FAILED; |
|
4406 if (outsize<requiredsize) |
|
4407 if (!charmapencode_resize(outobj, outpos, requiredsize)) |
|
4408 return enc_EXCEPTION; |
|
4409 outstart = PyString_AS_STRING(*outobj); |
|
4410 outstart[(*outpos)++] = (char)res; |
|
4411 return enc_SUCCESS; |
|
4412 } |
|
4413 |
|
4414 rep = charmapencode_lookup(c, mapping); |
|
4415 if (rep==NULL) |
|
4416 return enc_EXCEPTION; |
|
4417 else if (rep==Py_None) { |
|
4418 Py_DECREF(rep); |
|
4419 return enc_FAILED; |
|
4420 } else { |
|
4421 if (PyInt_Check(rep)) { |
|
4422 Py_ssize_t requiredsize = *outpos+1; |
|
4423 if (outsize<requiredsize) |
|
4424 if (!charmapencode_resize(outobj, outpos, requiredsize)) { |
|
4425 Py_DECREF(rep); |
|
4426 return enc_EXCEPTION; |
|
4427 } |
|
4428 outstart = PyString_AS_STRING(*outobj); |
|
4429 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); |
|
4430 } |
|
4431 else { |
|
4432 const char *repchars = PyString_AS_STRING(rep); |
|
4433 Py_ssize_t repsize = PyString_GET_SIZE(rep); |
|
4434 Py_ssize_t requiredsize = *outpos+repsize; |
|
4435 if (outsize<requiredsize) |
|
4436 if (!charmapencode_resize(outobj, outpos, requiredsize)) { |
|
4437 Py_DECREF(rep); |
|
4438 return enc_EXCEPTION; |
|
4439 } |
|
4440 outstart = PyString_AS_STRING(*outobj); |
|
4441 memcpy(outstart + *outpos, repchars, repsize); |
|
4442 *outpos += repsize; |
|
4443 } |
|
4444 } |
|
4445 Py_DECREF(rep); |
|
4446 return enc_SUCCESS; |
|
4447 } |
|
4448 |
|
4449 /* handle an error in PyUnicode_EncodeCharmap |
|
4450 Return 0 on success, -1 on error */ |
|
4451 static |
|
4452 int charmap_encoding_error( |
|
4453 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, |
|
4454 PyObject **exceptionObject, |
|
4455 int *known_errorHandler, PyObject **errorHandler, const char *errors, |
|
4456 PyObject **res, Py_ssize_t *respos) |
|
4457 { |
|
4458 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ |
|
4459 Py_ssize_t repsize; |
|
4460 Py_ssize_t newpos; |
|
4461 Py_UNICODE *uni2; |
|
4462 /* startpos for collecting unencodable chars */ |
|
4463 Py_ssize_t collstartpos = *inpos; |
|
4464 Py_ssize_t collendpos = *inpos+1; |
|
4465 Py_ssize_t collpos; |
|
4466 char *encoding = "charmap"; |
|
4467 char *reason = "character maps to <undefined>"; |
|
4468 charmapencode_result x; |
|
4469 |
|
4470 /* find all unencodable characters */ |
|
4471 while (collendpos < size) { |
|
4472 PyObject *rep; |
|
4473 if (Py_TYPE(mapping) == &EncodingMapType) { |
|
4474 int res = encoding_map_lookup(p[collendpos], mapping); |
|
4475 if (res != -1) |
|
4476 break; |
|
4477 ++collendpos; |
|
4478 continue; |
|
4479 } |
|
4480 |
|
4481 rep = charmapencode_lookup(p[collendpos], mapping); |
|
4482 if (rep==NULL) |
|
4483 return -1; |
|
4484 else if (rep!=Py_None) { |
|
4485 Py_DECREF(rep); |
|
4486 break; |
|
4487 } |
|
4488 Py_DECREF(rep); |
|
4489 ++collendpos; |
|
4490 } |
|
4491 /* cache callback name lookup |
|
4492 * (if not done yet, i.e. it's the first error) */ |
|
4493 if (*known_errorHandler==-1) { |
|
4494 if ((errors==NULL) || (!strcmp(errors, "strict"))) |
|
4495 *known_errorHandler = 1; |
|
4496 else if (!strcmp(errors, "replace")) |
|
4497 *known_errorHandler = 2; |
|
4498 else if (!strcmp(errors, "ignore")) |
|
4499 *known_errorHandler = 3; |
|
4500 else if (!strcmp(errors, "xmlcharrefreplace")) |
|
4501 *known_errorHandler = 4; |
|
4502 else |
|
4503 *known_errorHandler = 0; |
|
4504 } |
|
4505 switch (*known_errorHandler) { |
|
4506 case 1: /* strict */ |
|
4507 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); |
|
4508 return -1; |
|
4509 case 2: /* replace */ |
|
4510 for (collpos = collstartpos; collpos<collendpos; ++collpos) { |
|
4511 x = charmapencode_output('?', mapping, res, respos); |
|
4512 if (x==enc_EXCEPTION) { |
|
4513 return -1; |
|
4514 } |
|
4515 else if (x==enc_FAILED) { |
|
4516 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); |
|
4517 return -1; |
|
4518 } |
|
4519 } |
|
4520 /* fall through */ |
|
4521 case 3: /* ignore */ |
|
4522 *inpos = collendpos; |
|
4523 break; |
|
4524 case 4: /* xmlcharrefreplace */ |
|
4525 /* generate replacement (temporarily (mis)uses p) */ |
|
4526 for (collpos = collstartpos; collpos < collendpos; ++collpos) { |
|
4527 char buffer[2+29+1+1]; |
|
4528 char *cp; |
|
4529 sprintf(buffer, "&#%d;", (int)p[collpos]); |
|
4530 for (cp = buffer; *cp; ++cp) { |
|
4531 x = charmapencode_output(*cp, mapping, res, respos); |
|
4532 if (x==enc_EXCEPTION) |
|
4533 return -1; |
|
4534 else if (x==enc_FAILED) { |
|
4535 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); |
|
4536 return -1; |
|
4537 } |
|
4538 } |
|
4539 } |
|
4540 *inpos = collendpos; |
|
4541 break; |
|
4542 default: |
|
4543 repunicode = unicode_encode_call_errorhandler(errors, errorHandler, |
|
4544 encoding, reason, p, size, exceptionObject, |
|
4545 collstartpos, collendpos, &newpos); |
|
4546 if (repunicode == NULL) |
|
4547 return -1; |
|
4548 /* generate replacement */ |
|
4549 repsize = PyUnicode_GET_SIZE(repunicode); |
|
4550 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { |
|
4551 x = charmapencode_output(*uni2, mapping, res, respos); |
|
4552 if (x==enc_EXCEPTION) { |
|
4553 return -1; |
|
4554 } |
|
4555 else if (x==enc_FAILED) { |
|
4556 Py_DECREF(repunicode); |
|
4557 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason); |
|
4558 return -1; |
|
4559 } |
|
4560 } |
|
4561 *inpos = newpos; |
|
4562 Py_DECREF(repunicode); |
|
4563 } |
|
4564 return 0; |
|
4565 } |
|
4566 |
|
4567 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, |
|
4568 Py_ssize_t size, |
|
4569 PyObject *mapping, |
|
4570 const char *errors) |
|
4571 { |
|
4572 /* output object */ |
|
4573 PyObject *res = NULL; |
|
4574 /* current input position */ |
|
4575 Py_ssize_t inpos = 0; |
|
4576 /* current output position */ |
|
4577 Py_ssize_t respos = 0; |
|
4578 PyObject *errorHandler = NULL; |
|
4579 PyObject *exc = NULL; |
|
4580 /* the following variable is used for caching string comparisons |
|
4581 * -1=not initialized, 0=unknown, 1=strict, 2=replace, |
|
4582 * 3=ignore, 4=xmlcharrefreplace */ |
|
4583 int known_errorHandler = -1; |
|
4584 |
|
4585 /* Default to Latin-1 */ |
|
4586 if (mapping == NULL) |
|
4587 return PyUnicode_EncodeLatin1(p, size, errors); |
|
4588 |
|
4589 /* allocate enough for a simple encoding without |
|
4590 replacements, if we need more, we'll resize */ |
|
4591 res = PyString_FromStringAndSize(NULL, size); |
|
4592 if (res == NULL) |
|
4593 goto onError; |
|
4594 if (size == 0) |
|
4595 return res; |
|
4596 |
|
4597 while (inpos<size) { |
|
4598 /* try to encode it */ |
|
4599 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); |
|
4600 if (x==enc_EXCEPTION) /* error */ |
|
4601 goto onError; |
|
4602 if (x==enc_FAILED) { /* unencodable character */ |
|
4603 if (charmap_encoding_error(p, size, &inpos, mapping, |
|
4604 &exc, |
|
4605 &known_errorHandler, &errorHandler, errors, |
|
4606 &res, &respos)) { |
|
4607 goto onError; |
|
4608 } |
|
4609 } |
|
4610 else |
|
4611 /* done with this character => adjust input position */ |
|
4612 ++inpos; |
|
4613 } |
|
4614 |
|
4615 /* Resize if we allocated to much */ |
|
4616 if (respos<PyString_GET_SIZE(res)) { |
|
4617 if (_PyString_Resize(&res, respos)) |
|
4618 goto onError; |
|
4619 } |
|
4620 Py_XDECREF(exc); |
|
4621 Py_XDECREF(errorHandler); |
|
4622 return res; |
|
4623 |
|
4624 onError: |
|
4625 Py_XDECREF(res); |
|
4626 Py_XDECREF(exc); |
|
4627 Py_XDECREF(errorHandler); |
|
4628 return NULL; |
|
4629 } |
|
4630 |
|
4631 PyObject *PyUnicode_AsCharmapString(PyObject *unicode, |
|
4632 PyObject *mapping) |
|
4633 { |
|
4634 if (!PyUnicode_Check(unicode) || mapping == NULL) { |
|
4635 PyErr_BadArgument(); |
|
4636 return NULL; |
|
4637 } |
|
4638 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode), |
|
4639 PyUnicode_GET_SIZE(unicode), |
|
4640 mapping, |
|
4641 NULL); |
|
4642 } |
|
4643 |
|
4644 /* create or adjust a UnicodeTranslateError */ |
|
4645 static void make_translate_exception(PyObject **exceptionObject, |
|
4646 const Py_UNICODE *unicode, Py_ssize_t size, |
|
4647 Py_ssize_t startpos, Py_ssize_t endpos, |
|
4648 const char *reason) |
|
4649 { |
|
4650 if (*exceptionObject == NULL) { |
|
4651 *exceptionObject = PyUnicodeTranslateError_Create( |
|
4652 unicode, size, startpos, endpos, reason); |
|
4653 } |
|
4654 else { |
|
4655 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos)) |
|
4656 goto onError; |
|
4657 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos)) |
|
4658 goto onError; |
|
4659 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason)) |
|
4660 goto onError; |
|
4661 return; |
|
4662 onError: |
|
4663 Py_DECREF(*exceptionObject); |
|
4664 *exceptionObject = NULL; |
|
4665 } |
|
4666 } |
|
4667 |
|
4668 /* raises a UnicodeTranslateError */ |
|
4669 static void raise_translate_exception(PyObject **exceptionObject, |
|
4670 const Py_UNICODE *unicode, Py_ssize_t size, |
|
4671 Py_ssize_t startpos, Py_ssize_t endpos, |
|
4672 const char *reason) |
|
4673 { |
|
4674 make_translate_exception(exceptionObject, |
|
4675 unicode, size, startpos, endpos, reason); |
|
4676 if (*exceptionObject != NULL) |
|
4677 PyCodec_StrictErrors(*exceptionObject); |
|
4678 } |
|
4679 |
|
4680 /* error handling callback helper: |
|
4681 build arguments, call the callback and check the arguments, |
|
4682 put the result into newpos and return the replacement string, which |
|
4683 has to be freed by the caller */ |
|
4684 static PyObject *unicode_translate_call_errorhandler(const char *errors, |
|
4685 PyObject **errorHandler, |
|
4686 const char *reason, |
|
4687 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject, |
|
4688 Py_ssize_t startpos, Py_ssize_t endpos, |
|
4689 Py_ssize_t *newpos) |
|
4690 { |
|
4691 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple"; |
|
4692 |
|
4693 Py_ssize_t i_newpos; |
|
4694 PyObject *restuple; |
|
4695 PyObject *resunicode; |
|
4696 |
|
4697 if (*errorHandler == NULL) { |
|
4698 *errorHandler = PyCodec_LookupError(errors); |
|
4699 if (*errorHandler == NULL) |
|
4700 return NULL; |
|
4701 } |
|
4702 |
|
4703 make_translate_exception(exceptionObject, |
|
4704 unicode, size, startpos, endpos, reason); |
|
4705 if (*exceptionObject == NULL) |
|
4706 return NULL; |
|
4707 |
|
4708 restuple = PyObject_CallFunctionObjArgs( |
|
4709 *errorHandler, *exceptionObject, NULL); |
|
4710 if (restuple == NULL) |
|
4711 return NULL; |
|
4712 if (!PyTuple_Check(restuple)) { |
|
4713 PyErr_Format(PyExc_TypeError, &argparse[4]); |
|
4714 Py_DECREF(restuple); |
|
4715 return NULL; |
|
4716 } |
|
4717 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, |
|
4718 &resunicode, &i_newpos)) { |
|
4719 Py_DECREF(restuple); |
|
4720 return NULL; |
|
4721 } |
|
4722 if (i_newpos<0) |
|
4723 *newpos = size+i_newpos; |
|
4724 else |
|
4725 *newpos = i_newpos; |
|
4726 if (*newpos<0 || *newpos>size) { |
|
4727 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos); |
|
4728 Py_DECREF(restuple); |
|
4729 return NULL; |
|
4730 } |
|
4731 Py_INCREF(resunicode); |
|
4732 Py_DECREF(restuple); |
|
4733 return resunicode; |
|
4734 } |
|
4735 |
|
4736 /* Lookup the character ch in the mapping and put the result in result, |
|
4737 which must be decrefed by the caller. |
|
4738 Return 0 on success, -1 on error */ |
|
4739 static |
|
4740 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result) |
|
4741 { |
|
4742 PyObject *w = PyInt_FromLong((long)c); |
|
4743 PyObject *x; |
|
4744 |
|
4745 if (w == NULL) |
|
4746 return -1; |
|
4747 x = PyObject_GetItem(mapping, w); |
|
4748 Py_DECREF(w); |
|
4749 if (x == NULL) { |
|
4750 if (PyErr_ExceptionMatches(PyExc_LookupError)) { |
|
4751 /* No mapping found means: use 1:1 mapping. */ |
|
4752 PyErr_Clear(); |
|
4753 *result = NULL; |
|
4754 return 0; |
|
4755 } else |
|
4756 return -1; |
|
4757 } |
|
4758 else if (x == Py_None) { |
|
4759 *result = x; |
|
4760 return 0; |
|
4761 } |
|
4762 else if (PyInt_Check(x)) { |
|
4763 long value = PyInt_AS_LONG(x); |
|
4764 long max = PyUnicode_GetMax(); |
|
4765 if (value < 0 || value > max) { |
|
4766 PyErr_Format(PyExc_TypeError, |
|
4767 "character mapping must be in range(0x%lx)", max+1); |
|
4768 Py_DECREF(x); |
|
4769 return -1; |
|
4770 } |
|
4771 *result = x; |
|
4772 return 0; |
|
4773 } |
|
4774 else if (PyUnicode_Check(x)) { |
|
4775 *result = x; |
|
4776 return 0; |
|
4777 } |
|
4778 else { |
|
4779 /* wrong return value */ |
|
4780 PyErr_SetString(PyExc_TypeError, |
|
4781 "character mapping must return integer, None or unicode"); |
|
4782 Py_DECREF(x); |
|
4783 return -1; |
|
4784 } |
|
4785 } |
|
4786 /* ensure that *outobj is at least requiredsize characters long, |
|
4787 if not reallocate and adjust various state variables. |
|
4788 Return 0 on success, -1 on error */ |
|
4789 static |
|
4790 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, |
|
4791 Py_ssize_t requiredsize) |
|
4792 { |
|
4793 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj); |
|
4794 if (requiredsize > oldsize) { |
|
4795 /* remember old output position */ |
|
4796 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj); |
|
4797 /* exponentially overallocate to minimize reallocations */ |
|
4798 if (requiredsize < 2 * oldsize) |
|
4799 requiredsize = 2 * oldsize; |
|
4800 if (_PyUnicode_Resize(outobj, requiredsize) < 0) |
|
4801 return -1; |
|
4802 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos; |
|
4803 } |
|
4804 return 0; |
|
4805 } |
|
4806 /* lookup the character, put the result in the output string and adjust |
|
4807 various state variables. Return a new reference to the object that |
|
4808 was put in the output buffer in *result, or Py_None, if the mapping was |
|
4809 undefined (in which case no character was written). |
|
4810 The called must decref result. |
|
4811 Return 0 on success, -1 on error. */ |
|
4812 static |
|
4813 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp, |
|
4814 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp, |
|
4815 PyObject **res) |
|
4816 { |
|
4817 if (charmaptranslate_lookup(*curinp, mapping, res)) |
|
4818 return -1; |
|
4819 if (*res==NULL) { |
|
4820 /* not found => default to 1:1 mapping */ |
|
4821 *(*outp)++ = *curinp; |
|
4822 } |
|
4823 else if (*res==Py_None) |
|
4824 ; |
|
4825 else if (PyInt_Check(*res)) { |
|
4826 /* no overflow check, because we know that the space is enough */ |
|
4827 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res); |
|
4828 } |
|
4829 else if (PyUnicode_Check(*res)) { |
|
4830 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res); |
|
4831 if (repsize==1) { |
|
4832 /* no overflow check, because we know that the space is enough */ |
|
4833 *(*outp)++ = *PyUnicode_AS_UNICODE(*res); |
|
4834 } |
|
4835 else if (repsize!=0) { |
|
4836 /* more than one character */ |
|
4837 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) + |
|
4838 (insize - (curinp-startinp)) + |
|
4839 repsize - 1; |
|
4840 if (charmaptranslate_makespace(outobj, outp, requiredsize)) |
|
4841 return -1; |
|
4842 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize); |
|
4843 *outp += repsize; |
|
4844 } |
|
4845 } |
|
4846 else |
|
4847 return -1; |
|
4848 return 0; |
|
4849 } |
|
4850 |
|
4851 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p, |
|
4852 Py_ssize_t size, |
|
4853 PyObject *mapping, |
|
4854 const char *errors) |
|
4855 { |
|
4856 /* output object */ |
|
4857 PyObject *res = NULL; |
|
4858 /* pointers to the beginning and end+1 of input */ |
|
4859 const Py_UNICODE *startp = p; |
|
4860 const Py_UNICODE *endp = p + size; |
|
4861 /* pointer into the output */ |
|
4862 Py_UNICODE *str; |
|
4863 /* current output position */ |
|
4864 Py_ssize_t respos = 0; |
|
4865 char *reason = "character maps to <undefined>"; |
|
4866 PyObject *errorHandler = NULL; |
|
4867 PyObject *exc = NULL; |
|
4868 /* the following variable is used for caching string comparisons |
|
4869 * -1=not initialized, 0=unknown, 1=strict, 2=replace, |
|
4870 * 3=ignore, 4=xmlcharrefreplace */ |
|
4871 int known_errorHandler = -1; |
|
4872 |
|
4873 if (mapping == NULL) { |
|
4874 PyErr_BadArgument(); |
|
4875 return NULL; |
|
4876 } |
|
4877 |
|
4878 /* allocate enough for a simple 1:1 translation without |
|
4879 replacements, if we need more, we'll resize */ |
|
4880 res = PyUnicode_FromUnicode(NULL, size); |
|
4881 if (res == NULL) |
|
4882 goto onError; |
|
4883 if (size == 0) |
|
4884 return res; |
|
4885 str = PyUnicode_AS_UNICODE(res); |
|
4886 |
|
4887 while (p<endp) { |
|
4888 /* try to encode it */ |
|
4889 PyObject *x = NULL; |
|
4890 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) { |
|
4891 Py_XDECREF(x); |
|
4892 goto onError; |
|
4893 } |
|
4894 Py_XDECREF(x); |
|
4895 if (x!=Py_None) /* it worked => adjust input pointer */ |
|
4896 ++p; |
|
4897 else { /* untranslatable character */ |
|
4898 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ |
|
4899 Py_ssize_t repsize; |
|
4900 Py_ssize_t newpos; |
|
4901 Py_UNICODE *uni2; |
|
4902 /* startpos for collecting untranslatable chars */ |
|
4903 const Py_UNICODE *collstart = p; |
|
4904 const Py_UNICODE *collend = p+1; |
|
4905 const Py_UNICODE *coll; |
|
4906 |
|
4907 /* find all untranslatable characters */ |
|
4908 while (collend < endp) { |
|
4909 if (charmaptranslate_lookup(*collend, mapping, &x)) |
|
4910 goto onError; |
|
4911 Py_XDECREF(x); |
|
4912 if (x!=Py_None) |
|
4913 break; |
|
4914 ++collend; |
|
4915 } |
|
4916 /* cache callback name lookup |
|
4917 * (if not done yet, i.e. it's the first error) */ |
|
4918 if (known_errorHandler==-1) { |
|
4919 if ((errors==NULL) || (!strcmp(errors, "strict"))) |
|
4920 known_errorHandler = 1; |
|
4921 else if (!strcmp(errors, "replace")) |
|
4922 known_errorHandler = 2; |
|
4923 else if (!strcmp(errors, "ignore")) |
|
4924 known_errorHandler = 3; |
|
4925 else if (!strcmp(errors, "xmlcharrefreplace")) |
|
4926 known_errorHandler = 4; |
|
4927 else |
|
4928 known_errorHandler = 0; |
|
4929 } |
|
4930 switch (known_errorHandler) { |
|
4931 case 1: /* strict */ |
|
4932 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason); |
|
4933 goto onError; |
|
4934 case 2: /* replace */ |
|
4935 /* No need to check for space, this is a 1:1 replacement */ |
|
4936 for (coll = collstart; coll<collend; ++coll) |
|
4937 *str++ = '?'; |
|
4938 /* fall through */ |
|
4939 case 3: /* ignore */ |
|
4940 p = collend; |
|
4941 break; |
|
4942 case 4: /* xmlcharrefreplace */ |
|
4943 /* generate replacement (temporarily (mis)uses p) */ |
|
4944 for (p = collstart; p < collend; ++p) { |
|
4945 char buffer[2+29+1+1]; |
|
4946 char *cp; |
|
4947 sprintf(buffer, "&#%d;", (int)*p); |
|
4948 if (charmaptranslate_makespace(&res, &str, |
|
4949 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend))) |
|
4950 goto onError; |
|
4951 for (cp = buffer; *cp; ++cp) |
|
4952 *str++ = *cp; |
|
4953 } |
|
4954 p = collend; |
|
4955 break; |
|
4956 default: |
|
4957 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, |
|
4958 reason, startp, size, &exc, |
|
4959 collstart-startp, collend-startp, &newpos); |
|
4960 if (repunicode == NULL) |
|
4961 goto onError; |
|
4962 /* generate replacement */ |
|
4963 repsize = PyUnicode_GET_SIZE(repunicode); |
|
4964 if (charmaptranslate_makespace(&res, &str, |
|
4965 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) { |
|
4966 Py_DECREF(repunicode); |
|
4967 goto onError; |
|
4968 } |
|
4969 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) |
|
4970 *str++ = *uni2; |
|
4971 p = startp + newpos; |
|
4972 Py_DECREF(repunicode); |
|
4973 } |
|
4974 } |
|
4975 } |
|
4976 /* Resize if we allocated to much */ |
|
4977 respos = str-PyUnicode_AS_UNICODE(res); |
|
4978 if (respos<PyUnicode_GET_SIZE(res)) { |
|
4979 if (_PyUnicode_Resize(&res, respos) < 0) |
|
4980 goto onError; |
|
4981 } |
|
4982 Py_XDECREF(exc); |
|
4983 Py_XDECREF(errorHandler); |
|
4984 return res; |
|
4985 |
|
4986 onError: |
|
4987 Py_XDECREF(res); |
|
4988 Py_XDECREF(exc); |
|
4989 Py_XDECREF(errorHandler); |
|
4990 return NULL; |
|
4991 } |
|
4992 |
|
4993 PyObject *PyUnicode_Translate(PyObject *str, |
|
4994 PyObject *mapping, |
|
4995 const char *errors) |
|
4996 { |
|
4997 PyObject *result; |
|
4998 |
|
4999 str = PyUnicode_FromObject(str); |
|
5000 if (str == NULL) |
|
5001 goto onError; |
|
5002 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str), |
|
5003 PyUnicode_GET_SIZE(str), |
|
5004 mapping, |
|
5005 errors); |
|
5006 Py_DECREF(str); |
|
5007 return result; |
|
5008 |
|
5009 onError: |
|
5010 Py_XDECREF(str); |
|
5011 return NULL; |
|
5012 } |
|
5013 |
|
5014 /* --- Decimal Encoder ---------------------------------------------------- */ |
|
5015 |
|
5016 int PyUnicode_EncodeDecimal(Py_UNICODE *s, |
|
5017 Py_ssize_t length, |
|
5018 char *output, |
|
5019 const char *errors) |
|
5020 { |
|
5021 Py_UNICODE *p, *end; |
|
5022 PyObject *errorHandler = NULL; |
|
5023 PyObject *exc = NULL; |
|
5024 const char *encoding = "decimal"; |
|
5025 const char *reason = "invalid decimal Unicode string"; |
|
5026 /* the following variable is used for caching string comparisons |
|
5027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ |
|
5028 int known_errorHandler = -1; |
|
5029 |
|
5030 if (output == NULL) { |
|
5031 PyErr_BadArgument(); |
|
5032 return -1; |
|
5033 } |
|
5034 |
|
5035 p = s; |
|
5036 end = s + length; |
|
5037 while (p < end) { |
|
5038 register Py_UNICODE ch = *p; |
|
5039 int decimal; |
|
5040 PyObject *repunicode; |
|
5041 Py_ssize_t repsize; |
|
5042 Py_ssize_t newpos; |
|
5043 Py_UNICODE *uni2; |
|
5044 Py_UNICODE *collstart; |
|
5045 Py_UNICODE *collend; |
|
5046 |
|
5047 if (Py_UNICODE_ISSPACE(ch)) { |
|
5048 *output++ = ' '; |
|
5049 ++p; |
|
5050 continue; |
|
5051 } |
|
5052 decimal = Py_UNICODE_TODECIMAL(ch); |
|
5053 if (decimal >= 0) { |
|
5054 *output++ = '0' + decimal; |
|
5055 ++p; |
|
5056 continue; |
|
5057 } |
|
5058 if (0 < ch && ch < 256) { |
|
5059 *output++ = (char)ch; |
|
5060 ++p; |
|
5061 continue; |
|
5062 } |
|
5063 /* All other characters are considered unencodable */ |
|
5064 collstart = p; |
|
5065 collend = p+1; |
|
5066 while (collend < end) { |
|
5067 if ((0 < *collend && *collend < 256) || |
|
5068 !Py_UNICODE_ISSPACE(*collend) || |
|
5069 Py_UNICODE_TODECIMAL(*collend)) |
|
5070 break; |
|
5071 } |
|
5072 /* cache callback name lookup |
|
5073 * (if not done yet, i.e. it's the first error) */ |
|
5074 if (known_errorHandler==-1) { |
|
5075 if ((errors==NULL) || (!strcmp(errors, "strict"))) |
|
5076 known_errorHandler = 1; |
|
5077 else if (!strcmp(errors, "replace")) |
|
5078 known_errorHandler = 2; |
|
5079 else if (!strcmp(errors, "ignore")) |
|
5080 known_errorHandler = 3; |
|
5081 else if (!strcmp(errors, "xmlcharrefreplace")) |
|
5082 known_errorHandler = 4; |
|
5083 else |
|
5084 known_errorHandler = 0; |
|
5085 } |
|
5086 switch (known_errorHandler) { |
|
5087 case 1: /* strict */ |
|
5088 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason); |
|
5089 goto onError; |
|
5090 case 2: /* replace */ |
|
5091 for (p = collstart; p < collend; ++p) |
|
5092 *output++ = '?'; |
|
5093 /* fall through */ |
|
5094 case 3: /* ignore */ |
|
5095 p = collend; |
|
5096 break; |
|
5097 case 4: /* xmlcharrefreplace */ |
|
5098 /* generate replacement (temporarily (mis)uses p) */ |
|
5099 for (p = collstart; p < collend; ++p) |
|
5100 output += sprintf(output, "&#%d;", (int)*p); |
|
5101 p = collend; |
|
5102 break; |
|
5103 default: |
|
5104 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, |
|
5105 encoding, reason, s, length, &exc, |
|
5106 collstart-s, collend-s, &newpos); |
|
5107 if (repunicode == NULL) |
|
5108 goto onError; |
|
5109 /* generate replacement */ |
|
5110 repsize = PyUnicode_GET_SIZE(repunicode); |
|
5111 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { |
|
5112 Py_UNICODE ch = *uni2; |
|
5113 if (Py_UNICODE_ISSPACE(ch)) |
|
5114 *output++ = ' '; |
|
5115 else { |
|
5116 decimal = Py_UNICODE_TODECIMAL(ch); |
|
5117 if (decimal >= 0) |
|
5118 *output++ = '0' + decimal; |
|
5119 else if (0 < ch && ch < 256) |
|
5120 *output++ = (char)ch; |
|
5121 else { |
|
5122 Py_DECREF(repunicode); |
|
5123 raise_encode_exception(&exc, encoding, |
|
5124 s, length, collstart-s, collend-s, reason); |
|
5125 goto onError; |
|
5126 } |
|
5127 } |
|
5128 } |
|
5129 p = s + newpos; |
|
5130 Py_DECREF(repunicode); |
|
5131 } |
|
5132 } |
|
5133 /* 0-terminate the output string */ |
|
5134 *output++ = '\0'; |
|
5135 Py_XDECREF(exc); |
|
5136 Py_XDECREF(errorHandler); |
|
5137 return 0; |
|
5138 |
|
5139 onError: |
|
5140 Py_XDECREF(exc); |
|
5141 Py_XDECREF(errorHandler); |
|
5142 return -1; |
|
5143 } |
|
5144 |
|
5145 /* --- Helpers ------------------------------------------------------------ */ |
|
5146 |
|
5147 #include "stringlib/unicodedefs.h" |
|
5148 |
|
5149 #define FROM_UNICODE |
|
5150 |
|
5151 #include "stringlib/fastsearch.h" |
|
5152 |
|
5153 #include "stringlib/count.h" |
|
5154 #include "stringlib/find.h" |
|
5155 #include "stringlib/partition.h" |
|
5156 |
|
/* Helper macro to fix up start/end slice values.

   Operates on the caller's local variables `start` and `end`, using
   (obj)->length as the sequence length:

   - a negative start has the length added, and is clamped to 0 if it is
     still negative;
   - an end larger than the length is clamped to the length; a negative
     end has the length added, and is clamped to 0 if still negative.

   start is never clamped from above, so start may still be greater than
   end (or than the length) afterwards.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to exactly one statement and can be used safely in an unbraced
   if/else.  Always invoke it with a trailing semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
|
5169 |
|
5170 Py_ssize_t PyUnicode_Count(PyObject *str, |
|
5171 PyObject *substr, |
|
5172 Py_ssize_t start, |
|
5173 Py_ssize_t end) |
|
5174 { |
|
5175 Py_ssize_t result; |
|
5176 PyUnicodeObject* str_obj; |
|
5177 PyUnicodeObject* sub_obj; |
|
5178 |
|
5179 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str); |
|
5180 if (!str_obj) |
|
5181 return -1; |
|
5182 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr); |
|
5183 if (!sub_obj) { |
|
5184 Py_DECREF(str_obj); |
|
5185 return -1; |
|
5186 } |
|
5187 |
|
5188 FIX_START_END(str_obj); |
|
5189 |
|
5190 result = stringlib_count( |
|
5191 str_obj->str + start, end - start, sub_obj->str, sub_obj->length |
|
5192 ); |
|
5193 |
|
5194 Py_DECREF(sub_obj); |
|
5195 Py_DECREF(str_obj); |
|
5196 |
|
5197 return result; |
|
5198 } |
|
5199 |
|
5200 Py_ssize_t PyUnicode_Find(PyObject *str, |
|
5201 PyObject *sub, |
|
5202 Py_ssize_t start, |
|
5203 Py_ssize_t end, |
|
5204 int direction) |
|
5205 { |
|
5206 Py_ssize_t result; |
|
5207 |
|
5208 str = PyUnicode_FromObject(str); |
|
5209 if (!str) |
|
5210 return -2; |
|
5211 sub = PyUnicode_FromObject(sub); |
|
5212 if (!sub) { |
|
5213 Py_DECREF(str); |
|
5214 return -2; |
|
5215 } |
|
5216 |
|
5217 if (direction > 0) |
|
5218 result = stringlib_find_slice( |
|
5219 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), |
|
5220 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), |
|
5221 start, end |
|
5222 ); |
|
5223 else |
|
5224 result = stringlib_rfind_slice( |
|
5225 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str), |
|
5226 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub), |
|
5227 start, end |
|
5228 ); |
|
5229 |
|
5230 Py_DECREF(str); |
|
5231 Py_DECREF(sub); |
|
5232 |
|
5233 return result; |
|
5234 } |
|
5235 |
|
5236 static |
|
5237 int tailmatch(PyUnicodeObject *self, |
|
5238 PyUnicodeObject *substring, |
|
5239 Py_ssize_t start, |
|
5240 Py_ssize_t end, |
|
5241 int direction) |
|
5242 { |
|
5243 if (substring->length == 0) |
|
5244 return 1; |
|
5245 |
|
5246 FIX_START_END(self); |
|
5247 |
|
5248 end -= substring->length; |
|
5249 if (end < start) |
|
5250 return 0; |
|
5251 |
|
5252 if (direction > 0) { |
|
5253 if (Py_UNICODE_MATCH(self, end, substring)) |
|
5254 return 1; |
|
5255 } else { |
|
5256 if (Py_UNICODE_MATCH(self, start, substring)) |
|
5257 return 1; |
|
5258 } |
|
5259 |
|
5260 return 0; |
|
5261 } |
|
5262 |
|
5263 Py_ssize_t PyUnicode_Tailmatch(PyObject *str, |
|
5264 PyObject *substr, |
|
5265 Py_ssize_t start, |
|
5266 Py_ssize_t end, |
|
5267 int direction) |
|
5268 { |
|
5269 Py_ssize_t result; |
|
5270 |
|
5271 str = PyUnicode_FromObject(str); |
|
5272 if (str == NULL) |
|
5273 return -1; |
|
5274 substr = PyUnicode_FromObject(substr); |
|
5275 if (substr == NULL) { |
|
5276 Py_DECREF(str); |
|
5277 return -1; |
|
5278 } |
|
5279 |
|
5280 result = tailmatch((PyUnicodeObject *)str, |
|
5281 (PyUnicodeObject *)substr, |
|
5282 start, end, direction); |
|
5283 Py_DECREF(str); |
|
5284 Py_DECREF(substr); |
|
5285 return result; |
|
5286 } |
|
5287 |
|
5288 /* Apply fixfct filter to the Unicode object self and return a |
|
5289 reference to the modified object */ |
|
5290 |
|
5291 static |
|
5292 PyObject *fixup(PyUnicodeObject *self, |
|
5293 int (*fixfct)(PyUnicodeObject *s)) |
|
5294 { |
|
5295 |
|
5296 PyUnicodeObject *u; |
|
5297 |
|
5298 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length); |
|
5299 if (u == NULL) |
|
5300 return NULL; |
|
5301 |
|
5302 Py_UNICODE_COPY(u->str, self->str, self->length); |
|
5303 |
|
5304 if (!fixfct(u) && PyUnicode_CheckExact(self)) { |
|
5305 /* fixfct should return TRUE if it modified the buffer. If |
|
5306 FALSE, return a reference to the original buffer instead |
|
5307 (to save space, not time) */ |
|
5308 Py_INCREF(self); |
|
5309 Py_DECREF(u); |
|
5310 return (PyObject*) self; |
|
5311 } |
|
5312 return (PyObject*) u; |
|
5313 } |
|
5314 |
|
5315 static |
|
5316 int fixupper(PyUnicodeObject *self) |
|
5317 { |
|
5318 Py_ssize_t len = self->length; |
|
5319 Py_UNICODE *s = self->str; |
|
5320 int status = 0; |
|
5321 |
|
5322 while (len-- > 0) { |
|
5323 register Py_UNICODE ch; |
|
5324 |
|
5325 ch = Py_UNICODE_TOUPPER(*s); |
|
5326 if (ch != *s) { |
|
5327 status = 1; |
|
5328 *s = ch; |
|
5329 } |
|
5330 s++; |
|
5331 } |
|
5332 |
|
5333 return status; |
|
5334 } |
|
5335 |
|
5336 static |
|
5337 int fixlower(PyUnicodeObject *self) |
|
5338 { |
|
5339 Py_ssize_t len = self->length; |
|
5340 Py_UNICODE *s = self->str; |
|
5341 int status = 0; |
|
5342 |
|
5343 while (len-- > 0) { |
|
5344 register Py_UNICODE ch; |
|
5345 |
|
5346 ch = Py_UNICODE_TOLOWER(*s); |
|
5347 if (ch != *s) { |
|
5348 status = 1; |
|
5349 *s = ch; |
|
5350 } |
|
5351 s++; |
|
5352 } |
|
5353 |
|
5354 return status; |
|
5355 } |
|
5356 |
|
5357 static |
|
5358 int fixswapcase(PyUnicodeObject *self) |
|
5359 { |
|
5360 Py_ssize_t len = self->length; |
|
5361 Py_UNICODE *s = self->str; |
|
5362 int status = 0; |
|
5363 |
|
5364 while (len-- > 0) { |
|
5365 if (Py_UNICODE_ISUPPER(*s)) { |
|
5366 *s = Py_UNICODE_TOLOWER(*s); |
|
5367 status = 1; |
|
5368 } else if (Py_UNICODE_ISLOWER(*s)) { |
|
5369 *s = Py_UNICODE_TOUPPER(*s); |
|
5370 status = 1; |
|
5371 } |
|
5372 s++; |
|
5373 } |
|
5374 |
|
5375 return status; |
|
5376 } |
|
5377 |
|
5378 static |
|
5379 int fixcapitalize(PyUnicodeObject *self) |
|
5380 { |
|
5381 Py_ssize_t len = self->length; |
|
5382 Py_UNICODE *s = self->str; |
|
5383 int status = 0; |
|
5384 |
|
5385 if (len == 0) |
|
5386 return 0; |
|
5387 if (Py_UNICODE_ISLOWER(*s)) { |
|
5388 *s = Py_UNICODE_TOUPPER(*s); |
|
5389 status = 1; |
|
5390 } |
|
5391 s++; |
|
5392 while (--len > 0) { |
|
5393 if (Py_UNICODE_ISUPPER(*s)) { |
|
5394 *s = Py_UNICODE_TOLOWER(*s); |
|
5395 status = 1; |
|
5396 } |
|
5397 s++; |
|
5398 } |
|
5399 return status; |
|
5400 } |
|
5401 |
|
5402 static |
|
5403 int fixtitle(PyUnicodeObject *self) |
|
5404 { |
|
5405 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
5406 register Py_UNICODE *e; |
|
5407 int previous_is_cased; |
|
5408 |
|
5409 /* Shortcut for single character strings */ |
|
5410 if (PyUnicode_GET_SIZE(self) == 1) { |
|
5411 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p); |
|
5412 if (*p != ch) { |
|
5413 *p = ch; |
|
5414 return 1; |
|
5415 } |
|
5416 else |
|
5417 return 0; |
|
5418 } |
|
5419 |
|
5420 e = p + PyUnicode_GET_SIZE(self); |
|
5421 previous_is_cased = 0; |
|
5422 for (; p < e; p++) { |
|
5423 register const Py_UNICODE ch = *p; |
|
5424 |
|
5425 if (previous_is_cased) |
|
5426 *p = Py_UNICODE_TOLOWER(ch); |
|
5427 else |
|
5428 *p = Py_UNICODE_TOTITLE(ch); |
|
5429 |
|
5430 if (Py_UNICODE_ISLOWER(ch) || |
|
5431 Py_UNICODE_ISUPPER(ch) || |
|
5432 Py_UNICODE_ISTITLE(ch)) |
|
5433 previous_is_cased = 1; |
|
5434 else |
|
5435 previous_is_cased = 0; |
|
5436 } |
|
5437 return 1; |
|
5438 } |
|
5439 |
|
5440 PyObject * |
|
5441 PyUnicode_Join(PyObject *separator, PyObject *seq) |
|
5442 { |
|
5443 PyObject *internal_separator = NULL; |
|
5444 const Py_UNICODE blank = ' '; |
|
5445 const Py_UNICODE *sep = ␣ |
|
5446 Py_ssize_t seplen = 1; |
|
5447 PyUnicodeObject *res = NULL; /* the result */ |
|
5448 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */ |
|
5449 Py_ssize_t res_used; /* # used bytes */ |
|
5450 Py_UNICODE *res_p; /* pointer to free byte in res's string area */ |
|
5451 PyObject *fseq; /* PySequence_Fast(seq) */ |
|
5452 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */ |
|
5453 PyObject *item; |
|
5454 Py_ssize_t i; |
|
5455 |
|
5456 fseq = PySequence_Fast(seq, ""); |
|
5457 if (fseq == NULL) { |
|
5458 return NULL; |
|
5459 } |
|
5460 |
|
5461 /* Grrrr. A codec may be invoked to convert str objects to |
|
5462 * Unicode, and so it's possible to call back into Python code |
|
5463 * during PyUnicode_FromObject(), and so it's possible for a sick |
|
5464 * codec to change the size of fseq (if seq is a list). Therefore |
|
5465 * we have to keep refetching the size -- can't assume seqlen |
|
5466 * is invariant. |
|
5467 */ |
|
5468 seqlen = PySequence_Fast_GET_SIZE(fseq); |
|
5469 /* If empty sequence, return u"". */ |
|
5470 if (seqlen == 0) { |
|
5471 res = _PyUnicode_New(0); /* empty sequence; return u"" */ |
|
5472 goto Done; |
|
5473 } |
|
5474 /* If singleton sequence with an exact Unicode, return that. */ |
|
5475 if (seqlen == 1) { |
|
5476 item = PySequence_Fast_GET_ITEM(fseq, 0); |
|
5477 if (PyUnicode_CheckExact(item)) { |
|
5478 Py_INCREF(item); |
|
5479 res = (PyUnicodeObject *)item; |
|
5480 goto Done; |
|
5481 } |
|
5482 } |
|
5483 |
|
5484 /* At least two items to join, or one that isn't exact Unicode. */ |
|
5485 if (seqlen > 1) { |
|
5486 /* Set up sep and seplen -- they're needed. */ |
|
5487 if (separator == NULL) { |
|
5488 sep = ␣ |
|
5489 seplen = 1; |
|
5490 } |
|
5491 else { |
|
5492 internal_separator = PyUnicode_FromObject(separator); |
|
5493 if (internal_separator == NULL) |
|
5494 goto onError; |
|
5495 sep = PyUnicode_AS_UNICODE(internal_separator); |
|
5496 seplen = PyUnicode_GET_SIZE(internal_separator); |
|
5497 /* In case PyUnicode_FromObject() mutated seq. */ |
|
5498 seqlen = PySequence_Fast_GET_SIZE(fseq); |
|
5499 } |
|
5500 } |
|
5501 |
|
5502 /* Get space. */ |
|
5503 res = _PyUnicode_New(res_alloc); |
|
5504 if (res == NULL) |
|
5505 goto onError; |
|
5506 res_p = PyUnicode_AS_UNICODE(res); |
|
5507 res_used = 0; |
|
5508 |
|
5509 for (i = 0; i < seqlen; ++i) { |
|
5510 Py_ssize_t itemlen; |
|
5511 Py_ssize_t new_res_used; |
|
5512 |
|
5513 item = PySequence_Fast_GET_ITEM(fseq, i); |
|
5514 /* Convert item to Unicode. */ |
|
5515 if (! PyUnicode_Check(item) && ! PyString_Check(item)) { |
|
5516 PyErr_Format(PyExc_TypeError, |
|
5517 "sequence item %zd: expected string or Unicode," |
|
5518 " %.80s found", |
|
5519 i, Py_TYPE(item)->tp_name); |
|
5520 goto onError; |
|
5521 } |
|
5522 item = PyUnicode_FromObject(item); |
|
5523 if (item == NULL) |
|
5524 goto onError; |
|
5525 /* We own a reference to item from here on. */ |
|
5526 |
|
5527 /* In case PyUnicode_FromObject() mutated seq. */ |
|
5528 seqlen = PySequence_Fast_GET_SIZE(fseq); |
|
5529 |
|
5530 /* Make sure we have enough space for the separator and the item. */ |
|
5531 itemlen = PyUnicode_GET_SIZE(item); |
|
5532 new_res_used = res_used + itemlen; |
|
5533 if (new_res_used < 0) |
|
5534 goto Overflow; |
|
5535 if (i < seqlen - 1) { |
|
5536 new_res_used += seplen; |
|
5537 if (new_res_used < 0) |
|
5538 goto Overflow; |
|
5539 } |
|
5540 if (new_res_used > res_alloc) { |
|
5541 /* double allocated size until it's big enough */ |
|
5542 do { |
|
5543 res_alloc += res_alloc; |
|
5544 if (res_alloc <= 0) |
|
5545 goto Overflow; |
|
5546 } while (new_res_used > res_alloc); |
|
5547 if (_PyUnicode_Resize(&res, res_alloc) < 0) { |
|
5548 Py_DECREF(item); |
|
5549 goto onError; |
|
5550 } |
|
5551 res_p = PyUnicode_AS_UNICODE(res) + res_used; |
|
5552 } |
|
5553 |
|
5554 /* Copy item, and maybe the separator. */ |
|
5555 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen); |
|
5556 res_p += itemlen; |
|
5557 if (i < seqlen - 1) { |
|
5558 Py_UNICODE_COPY(res_p, sep, seplen); |
|
5559 res_p += seplen; |
|
5560 } |
|
5561 Py_DECREF(item); |
|
5562 res_used = new_res_used; |
|
5563 } |
|
5564 |
|
5565 /* Shrink res to match the used area; this probably can't fail, |
|
5566 * but it's cheap to check. |
|
5567 */ |
|
5568 if (_PyUnicode_Resize(&res, res_used) < 0) |
|
5569 goto onError; |
|
5570 |
|
5571 Done: |
|
5572 Py_XDECREF(internal_separator); |
|
5573 Py_DECREF(fseq); |
|
5574 return (PyObject *)res; |
|
5575 |
|
5576 Overflow: |
|
5577 PyErr_SetString(PyExc_OverflowError, |
|
5578 "join() result is too long for a Python string"); |
|
5579 Py_DECREF(item); |
|
5580 /* fall through */ |
|
5581 |
|
5582 onError: |
|
5583 Py_XDECREF(internal_separator); |
|
5584 Py_DECREF(fseq); |
|
5585 Py_XDECREF(res); |
|
5586 return NULL; |
|
5587 } |
|
5588 |
|
5589 static |
|
5590 PyUnicodeObject *pad(PyUnicodeObject *self, |
|
5591 Py_ssize_t left, |
|
5592 Py_ssize_t right, |
|
5593 Py_UNICODE fill) |
|
5594 { |
|
5595 PyUnicodeObject *u; |
|
5596 |
|
5597 if (left < 0) |
|
5598 left = 0; |
|
5599 if (right < 0) |
|
5600 right = 0; |
|
5601 |
|
5602 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) { |
|
5603 Py_INCREF(self); |
|
5604 return self; |
|
5605 } |
|
5606 |
|
5607 if (left > PY_SSIZE_T_MAX - self->length || |
|
5608 right > PY_SSIZE_T_MAX - (left + self->length)) { |
|
5609 PyErr_SetString(PyExc_OverflowError, "padded string is too long"); |
|
5610 return NULL; |
|
5611 } |
|
5612 u = _PyUnicode_New(left + self->length + right); |
|
5613 if (u) { |
|
5614 if (left) |
|
5615 Py_UNICODE_FILL(u->str, fill, left); |
|
5616 Py_UNICODE_COPY(u->str + left, self->str, self->length); |
|
5617 if (right) |
|
5618 Py_UNICODE_FILL(u->str + left + self->length, fill, right); |
|
5619 } |
|
5620 |
|
5621 return u; |
|
5622 } |
|
5623 |
|
/* Append the substring data[left:right] to the list being built.

   Relies on names in the expanding function: a local `PyObject *str`
   (scratch), a `list` object, and an `onError` label that is jumped to
   if either the substring cannot be created or the append fails.  The
   temporary reference held in `str` is always released.

   Wrapped in do { ... } while (0) so the macro is a single statement
   (the previous form ended in a dangling if/else).  Always invoke it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    } while (0)
|
5634 |
|
5635 static |
|
5636 PyObject *split_whitespace(PyUnicodeObject *self, |
|
5637 PyObject *list, |
|
5638 Py_ssize_t maxcount) |
|
5639 { |
|
5640 register Py_ssize_t i; |
|
5641 register Py_ssize_t j; |
|
5642 Py_ssize_t len = self->length; |
|
5643 PyObject *str; |
|
5644 register const Py_UNICODE *buf = self->str; |
|
5645 |
|
5646 for (i = j = 0; i < len; ) { |
|
5647 /* find a token */ |
|
5648 while (i < len && Py_UNICODE_ISSPACE(buf[i])) |
|
5649 i++; |
|
5650 j = i; |
|
5651 while (i < len && !Py_UNICODE_ISSPACE(buf[i])) |
|
5652 i++; |
|
5653 if (j < i) { |
|
5654 if (maxcount-- <= 0) |
|
5655 break; |
|
5656 SPLIT_APPEND(buf, j, i); |
|
5657 while (i < len && Py_UNICODE_ISSPACE(buf[i])) |
|
5658 i++; |
|
5659 j = i; |
|
5660 } |
|
5661 } |
|
5662 if (j < len) { |
|
5663 SPLIT_APPEND(buf, j, len); |
|
5664 } |
|
5665 return list; |
|
5666 |
|
5667 onError: |
|
5668 Py_DECREF(list); |
|
5669 return NULL; |
|
5670 } |
|
5671 |
|
5672 PyObject *PyUnicode_Splitlines(PyObject *string, |
|
5673 int keepends) |
|
5674 { |
|
5675 register Py_ssize_t i; |
|
5676 register Py_ssize_t j; |
|
5677 Py_ssize_t len; |
|
5678 PyObject *list; |
|
5679 PyObject *str; |
|
5680 Py_UNICODE *data; |
|
5681 |
|
5682 string = PyUnicode_FromObject(string); |
|
5683 if (string == NULL) |
|
5684 return NULL; |
|
5685 data = PyUnicode_AS_UNICODE(string); |
|
5686 len = PyUnicode_GET_SIZE(string); |
|
5687 |
|
5688 list = PyList_New(0); |
|
5689 if (!list) |
|
5690 goto onError; |
|
5691 |
|
5692 for (i = j = 0; i < len; ) { |
|
5693 Py_ssize_t eol; |
|
5694 |
|
5695 /* Find a line and append it */ |
|
5696 while (i < len && !BLOOM_LINEBREAK(data[i])) |
|
5697 i++; |
|
5698 |
|
5699 /* Skip the line break reading CRLF as one line break */ |
|
5700 eol = i; |
|
5701 if (i < len) { |
|
5702 if (data[i] == '\r' && i + 1 < len && |
|
5703 data[i+1] == '\n') |
|
5704 i += 2; |
|
5705 else |
|
5706 i++; |
|
5707 if (keepends) |
|
5708 eol = i; |
|
5709 } |
|
5710 SPLIT_APPEND(data, j, eol); |
|
5711 j = i; |
|
5712 } |
|
5713 if (j < len) { |
|
5714 SPLIT_APPEND(data, j, len); |
|
5715 } |
|
5716 |
|
5717 Py_DECREF(string); |
|
5718 return list; |
|
5719 |
|
5720 onError: |
|
5721 Py_XDECREF(list); |
|
5722 Py_DECREF(string); |
|
5723 return NULL; |
|
5724 } |
|
5725 |
|
5726 static |
|
5727 PyObject *split_char(PyUnicodeObject *self, |
|
5728 PyObject *list, |
|
5729 Py_UNICODE ch, |
|
5730 Py_ssize_t maxcount) |
|
5731 { |
|
5732 register Py_ssize_t i; |
|
5733 register Py_ssize_t j; |
|
5734 Py_ssize_t len = self->length; |
|
5735 PyObject *str; |
|
5736 register const Py_UNICODE *buf = self->str; |
|
5737 |
|
5738 for (i = j = 0; i < len; ) { |
|
5739 if (buf[i] == ch) { |
|
5740 if (maxcount-- <= 0) |
|
5741 break; |
|
5742 SPLIT_APPEND(buf, j, i); |
|
5743 i = j = i + 1; |
|
5744 } else |
|
5745 i++; |
|
5746 } |
|
5747 if (j <= len) { |
|
5748 SPLIT_APPEND(buf, j, len); |
|
5749 } |
|
5750 return list; |
|
5751 |
|
5752 onError: |
|
5753 Py_DECREF(list); |
|
5754 return NULL; |
|
5755 } |
|
5756 |
|
5757 static |
|
5758 PyObject *split_substring(PyUnicodeObject *self, |
|
5759 PyObject *list, |
|
5760 PyUnicodeObject *substring, |
|
5761 Py_ssize_t maxcount) |
|
5762 { |
|
5763 register Py_ssize_t i; |
|
5764 register Py_ssize_t j; |
|
5765 Py_ssize_t len = self->length; |
|
5766 Py_ssize_t sublen = substring->length; |
|
5767 PyObject *str; |
|
5768 |
|
5769 for (i = j = 0; i <= len - sublen; ) { |
|
5770 if (Py_UNICODE_MATCH(self, i, substring)) { |
|
5771 if (maxcount-- <= 0) |
|
5772 break; |
|
5773 SPLIT_APPEND(self->str, j, i); |
|
5774 i = j = i + sublen; |
|
5775 } else |
|
5776 i++; |
|
5777 } |
|
5778 if (j <= len) { |
|
5779 SPLIT_APPEND(self->str, j, len); |
|
5780 } |
|
5781 return list; |
|
5782 |
|
5783 onError: |
|
5784 Py_DECREF(list); |
|
5785 return NULL; |
|
5786 } |
|
5787 |
|
5788 static |
|
5789 PyObject *rsplit_whitespace(PyUnicodeObject *self, |
|
5790 PyObject *list, |
|
5791 Py_ssize_t maxcount) |
|
5792 { |
|
5793 register Py_ssize_t i; |
|
5794 register Py_ssize_t j; |
|
5795 Py_ssize_t len = self->length; |
|
5796 PyObject *str; |
|
5797 register const Py_UNICODE *buf = self->str; |
|
5798 |
|
5799 for (i = j = len - 1; i >= 0; ) { |
|
5800 /* find a token */ |
|
5801 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) |
|
5802 i--; |
|
5803 j = i; |
|
5804 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i])) |
|
5805 i--; |
|
5806 if (j > i) { |
|
5807 if (maxcount-- <= 0) |
|
5808 break; |
|
5809 SPLIT_APPEND(buf, i + 1, j + 1); |
|
5810 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i])) |
|
5811 i--; |
|
5812 j = i; |
|
5813 } |
|
5814 } |
|
5815 if (j >= 0) { |
|
5816 SPLIT_APPEND(buf, 0, j + 1); |
|
5817 } |
|
5818 if (PyList_Reverse(list) < 0) |
|
5819 goto onError; |
|
5820 return list; |
|
5821 |
|
5822 onError: |
|
5823 Py_DECREF(list); |
|
5824 return NULL; |
|
5825 } |
|
5826 |
|
5827 static |
|
5828 PyObject *rsplit_char(PyUnicodeObject *self, |
|
5829 PyObject *list, |
|
5830 Py_UNICODE ch, |
|
5831 Py_ssize_t maxcount) |
|
5832 { |
|
5833 register Py_ssize_t i; |
|
5834 register Py_ssize_t j; |
|
5835 Py_ssize_t len = self->length; |
|
5836 PyObject *str; |
|
5837 register const Py_UNICODE *buf = self->str; |
|
5838 |
|
5839 for (i = j = len - 1; i >= 0; ) { |
|
5840 if (buf[i] == ch) { |
|
5841 if (maxcount-- <= 0) |
|
5842 break; |
|
5843 SPLIT_APPEND(buf, i + 1, j + 1); |
|
5844 j = i = i - 1; |
|
5845 } else |
|
5846 i--; |
|
5847 } |
|
5848 if (j >= -1) { |
|
5849 SPLIT_APPEND(buf, 0, j + 1); |
|
5850 } |
|
5851 if (PyList_Reverse(list) < 0) |
|
5852 goto onError; |
|
5853 return list; |
|
5854 |
|
5855 onError: |
|
5856 Py_DECREF(list); |
|
5857 return NULL; |
|
5858 } |
|
5859 |
|
5860 static |
|
5861 PyObject *rsplit_substring(PyUnicodeObject *self, |
|
5862 PyObject *list, |
|
5863 PyUnicodeObject *substring, |
|
5864 Py_ssize_t maxcount) |
|
5865 { |
|
5866 register Py_ssize_t i; |
|
5867 register Py_ssize_t j; |
|
5868 Py_ssize_t len = self->length; |
|
5869 Py_ssize_t sublen = substring->length; |
|
5870 PyObject *str; |
|
5871 |
|
5872 for (i = len - sublen, j = len; i >= 0; ) { |
|
5873 if (Py_UNICODE_MATCH(self, i, substring)) { |
|
5874 if (maxcount-- <= 0) |
|
5875 break; |
|
5876 SPLIT_APPEND(self->str, i + sublen, j); |
|
5877 j = i; |
|
5878 i -= sublen; |
|
5879 } else |
|
5880 i--; |
|
5881 } |
|
5882 if (j >= 0) { |
|
5883 SPLIT_APPEND(self->str, 0, j); |
|
5884 } |
|
5885 if (PyList_Reverse(list) < 0) |
|
5886 goto onError; |
|
5887 return list; |
|
5888 |
|
5889 onError: |
|
5890 Py_DECREF(list); |
|
5891 return NULL; |
|
5892 } |
|
5893 |
|
5894 #undef SPLIT_APPEND |
|
5895 |
|
5896 static |
|
5897 PyObject *split(PyUnicodeObject *self, |
|
5898 PyUnicodeObject *substring, |
|
5899 Py_ssize_t maxcount) |
|
5900 { |
|
5901 PyObject *list; |
|
5902 |
|
5903 if (maxcount < 0) |
|
5904 maxcount = PY_SSIZE_T_MAX; |
|
5905 |
|
5906 list = PyList_New(0); |
|
5907 if (!list) |
|
5908 return NULL; |
|
5909 |
|
5910 if (substring == NULL) |
|
5911 return split_whitespace(self,list,maxcount); |
|
5912 |
|
5913 else if (substring->length == 1) |
|
5914 return split_char(self,list,substring->str[0],maxcount); |
|
5915 |
|
5916 else if (substring->length == 0) { |
|
5917 Py_DECREF(list); |
|
5918 PyErr_SetString(PyExc_ValueError, "empty separator"); |
|
5919 return NULL; |
|
5920 } |
|
5921 else |
|
5922 return split_substring(self,list,substring,maxcount); |
|
5923 } |
|
5924 |
|
5925 static |
|
5926 PyObject *rsplit(PyUnicodeObject *self, |
|
5927 PyUnicodeObject *substring, |
|
5928 Py_ssize_t maxcount) |
|
5929 { |
|
5930 PyObject *list; |
|
5931 |
|
5932 if (maxcount < 0) |
|
5933 maxcount = PY_SSIZE_T_MAX; |
|
5934 |
|
5935 list = PyList_New(0); |
|
5936 if (!list) |
|
5937 return NULL; |
|
5938 |
|
5939 if (substring == NULL) |
|
5940 return rsplit_whitespace(self,list,maxcount); |
|
5941 |
|
5942 else if (substring->length == 1) |
|
5943 return rsplit_char(self,list,substring->str[0],maxcount); |
|
5944 |
|
5945 else if (substring->length == 0) { |
|
5946 Py_DECREF(list); |
|
5947 PyErr_SetString(PyExc_ValueError, "empty separator"); |
|
5948 return NULL; |
|
5949 } |
|
5950 else |
|
5951 return rsplit_substring(self,list,substring,maxcount); |
|
5952 } |
|
5953 |
|
/* Return a string in which occurrences of str1 in self are replaced by
   str2, performing at most maxcount replacements (a negative maxcount
   means PY_SSIZE_T_MAX, i.e. effectively unlimited).

   Three strategies are used:
   - str1 and str2 have the same length: copy self and overwrite the
     matches in place (with a dedicated loop for single characters);
   - lengths differ and str1 is non-empty: count the matches, allocate
     the exact result size, then copy unchanged stretches and
     substitutions alternately;
   - str1 is empty (and str2 is not): interleave str2 with the
     characters of self.

   If there is nothing to replace, a new reference to self is returned
   when self is an exact unicode object, otherwise a copy.  Returns NULL
   on allocation failure or with OverflowError set if the result size
   overflows. */
static
PyObject *replace(PyUnicodeObject *self,
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;

    if (str1->length == str2->length) {
        /* same length */
        Py_ssize_t i;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            /* Avoid the copy entirely if the character never occurs. */
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    /* Stop once the replacement budget is used up. */
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            /* i is the result of the initial search; a negative value
               means there is nothing to replace. */
            i = fastsearch(
                self->str, self->length, str1->str, str1->length, FAST_SEARCH
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            /* Matching is done against self (unmodified); the writes go
               to the copy u at the same offsets. */
            while (i <= self->length - str1->length)
                if (Py_UNICODE_MATCH(self, i, str1)) {
                    if (--maxcount < 0)
                        break;
                    Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                    i += str1->length;
                } else
                    i++;
        }
    } else {

        Py_ssize_t n, i, j, e;
        Py_ssize_t product, new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        /* n = number of replacements to perform, capped by maxcount */
        n = stringlib_count(self->str, self->length, str1->str, str1->length);
        if (n > maxcount)
            n = maxcount;
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        /* NOTE(review): delta cannot be 0 in this branch, because the
           enclosing else is only reached when the lengths differ. */
        if (delta == 0) {
            new_size = self->length;
        } else {
            /* Detect multiplication overflow by dividing back. */
            product = n * (str2->length - str1->length);
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;          /* read position in self */
        p = u->str;     /* write position in the result */
        e = self->length - str1->length;    /* last offset where str1 fits */
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = i;
                while (j <= e) {
                    if (Py_UNICODE_MATCH(self, j, str1))
                        break;
                    j++;
                }
                if (j > i) {
                    if (j > e)
                        break;
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self,
               repeated until n copies of str2 have been written. */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            /* copy whatever remains of self */
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}
|
6088 |
|
6089 /* --- Unicode Object Methods --------------------------------------------- */ |
|
6090 |
|
6091 PyDoc_STRVAR(title__doc__, |
|
6092 "S.title() -> unicode\n\ |
|
6093 \n\ |
|
6094 Return a titlecased version of S, i.e. words start with title case\n\ |
|
6095 characters, all remaining cased characters have lower case."); |
|
6096 |
|
6097 static PyObject* |
|
6098 unicode_title(PyUnicodeObject *self) |
|
6099 { |
|
6100 return fixup(self, fixtitle); |
|
6101 } |
|
6102 |
|
6103 PyDoc_STRVAR(capitalize__doc__, |
|
6104 "S.capitalize() -> unicode\n\ |
|
6105 \n\ |
|
6106 Return a capitalized version of S, i.e. make the first character\n\ |
|
6107 have upper case."); |
|
6108 |
|
6109 static PyObject* |
|
6110 unicode_capitalize(PyUnicodeObject *self) |
|
6111 { |
|
6112 return fixup(self, fixcapitalize); |
|
6113 } |
|
6114 |
|
/* NOTE: everything up to the matching #endif is compiled out. */
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word with fixcapitalize,
   and join the words again with a single blank.  Returns NULL if the
   split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* On failure item is NULL, which is what gets returned below. */
        if (item == NULL)
            goto onError;
        /* Replace the list slot: drop the old word, store the new one. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* Reached on success as well; the list is no longer needed. */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
|
6152 |
|
6153 /* Argument converter. Coerces to a single unicode character */ |
|
6154 |
|
6155 static int |
|
6156 convert_uc(PyObject *obj, void *addr) |
|
6157 { |
|
6158 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr; |
|
6159 PyObject *uniobj; |
|
6160 Py_UNICODE *unistr; |
|
6161 |
|
6162 uniobj = PyUnicode_FromObject(obj); |
|
6163 if (uniobj == NULL) { |
|
6164 PyErr_SetString(PyExc_TypeError, |
|
6165 "The fill character cannot be converted to Unicode"); |
|
6166 return 0; |
|
6167 } |
|
6168 if (PyUnicode_GET_SIZE(uniobj) != 1) { |
|
6169 PyErr_SetString(PyExc_TypeError, |
|
6170 "The fill character must be exactly one character long"); |
|
6171 Py_DECREF(uniobj); |
|
6172 return 0; |
|
6173 } |
|
6174 unistr = PyUnicode_AS_UNICODE(uniobj); |
|
6175 *fillcharloc = unistr[0]; |
|
6176 Py_DECREF(uniobj); |
|
6177 return 1; |
|
6178 } |
|
6179 |
|
6180 PyDoc_STRVAR(center__doc__, |
|
6181 "S.center(width[, fillchar]) -> unicode\n\ |
|
6182 \n\ |
|
6183 Return S centered in a Unicode string of length width. Padding is\n\ |
|
6184 done using the specified fill character (default is a space)"); |
|
6185 |
|
6186 static PyObject * |
|
6187 unicode_center(PyUnicodeObject *self, PyObject *args) |
|
6188 { |
|
6189 Py_ssize_t marg, left; |
|
6190 Py_ssize_t width; |
|
6191 Py_UNICODE fillchar = ' '; |
|
6192 |
|
6193 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar)) |
|
6194 return NULL; |
|
6195 |
|
6196 if (self->length >= width && PyUnicode_CheckExact(self)) { |
|
6197 Py_INCREF(self); |
|
6198 return (PyObject*) self; |
|
6199 } |
|
6200 |
|
6201 marg = width - self->length; |
|
6202 left = marg / 2 + (marg & width & 1); |
|
6203 |
|
6204 return (PyObject*) pad(self, left, marg - left, fillchar); |
|
6205 } |
|
6206 |
|
6207 #if 0 |
|
6208 |
|
6209 /* This code should go into some future Unicode collation support |
|
6210 module. The basic comparison should compare ordinals on a naive |
|
6211 basis (this is what Java does and thus JPython too). */ |
|
6212 |
|
6213 /* speedy UTF-16 code point order comparison */ |
|
6214 /* gleaned from: */ |
|
6215 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */ |
|
6216 |
|
/* Adjustment table indexed by the top five bits of a code unit
   (c >> 11).  Only the last five entries are non-zero.  This table and
   the function below sit in the disabled "#if 0" branch. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Compare two unicode objects code unit by code unit after applying the
   utf16Fixup adjustment to large values.  Returns -1, 0 or 1; when one
   string is a prefix of the other, the shorter one compares smaller. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* Only values above (1<<11) * 26 are looked up in the table. */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* Common prefix is equal: decide on the remaining lengths. */
    return (len1 < len2) ? -1 : (len1 != len2);
}
|
6256 |
|
6257 #else |
|
6258 |
|
6259 static int |
|
6260 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2) |
|
6261 { |
|
6262 register Py_ssize_t len1, len2; |
|
6263 |
|
6264 Py_UNICODE *s1 = str1->str; |
|
6265 Py_UNICODE *s2 = str2->str; |
|
6266 |
|
6267 len1 = str1->length; |
|
6268 len2 = str2->length; |
|
6269 |
|
6270 while (len1 > 0 && len2 > 0) { |
|
6271 Py_UNICODE c1, c2; |
|
6272 |
|
6273 c1 = *s1++; |
|
6274 c2 = *s2++; |
|
6275 |
|
6276 if (c1 != c2) |
|
6277 return (c1 < c2) ? -1 : 1; |
|
6278 |
|
6279 len1--; len2--; |
|
6280 } |
|
6281 |
|
6282 return (len1 < len2) ? -1 : (len1 != len2); |
|
6283 } |
|
6284 |
|
6285 #endif |
|
6286 |
|
6287 int PyUnicode_Compare(PyObject *left, |
|
6288 PyObject *right) |
|
6289 { |
|
6290 PyUnicodeObject *u = NULL, *v = NULL; |
|
6291 int result; |
|
6292 |
|
6293 /* Coerce the two arguments */ |
|
6294 u = (PyUnicodeObject *)PyUnicode_FromObject(left); |
|
6295 if (u == NULL) |
|
6296 goto onError; |
|
6297 v = (PyUnicodeObject *)PyUnicode_FromObject(right); |
|
6298 if (v == NULL) |
|
6299 goto onError; |
|
6300 |
|
6301 /* Shortcut for empty or interned objects */ |
|
6302 if (v == u) { |
|
6303 Py_DECREF(u); |
|
6304 Py_DECREF(v); |
|
6305 return 0; |
|
6306 } |
|
6307 |
|
6308 result = unicode_compare(u, v); |
|
6309 |
|
6310 Py_DECREF(u); |
|
6311 Py_DECREF(v); |
|
6312 return result; |
|
6313 |
|
6314 onError: |
|
6315 Py_XDECREF(u); |
|
6316 Py_XDECREF(v); |
|
6317 return -1; |
|
6318 } |
|
6319 |
|
6320 PyObject *PyUnicode_RichCompare(PyObject *left, |
|
6321 PyObject *right, |
|
6322 int op) |
|
6323 { |
|
6324 int result; |
|
6325 |
|
6326 result = PyUnicode_Compare(left, right); |
|
6327 if (result == -1 && PyErr_Occurred()) |
|
6328 goto onError; |
|
6329 |
|
6330 /* Convert the return value to a Boolean */ |
|
6331 switch (op) { |
|
6332 case Py_EQ: |
|
6333 result = (result == 0); |
|
6334 break; |
|
6335 case Py_NE: |
|
6336 result = (result != 0); |
|
6337 break; |
|
6338 case Py_LE: |
|
6339 result = (result <= 0); |
|
6340 break; |
|
6341 case Py_GE: |
|
6342 result = (result >= 0); |
|
6343 break; |
|
6344 case Py_LT: |
|
6345 result = (result == -1); |
|
6346 break; |
|
6347 case Py_GT: |
|
6348 result = (result == 1); |
|
6349 break; |
|
6350 } |
|
6351 return PyBool_FromLong(result); |
|
6352 |
|
6353 onError: |
|
6354 |
|
6355 /* Standard case |
|
6356 |
|
6357 Type errors mean that PyUnicode_FromObject() could not convert |
|
6358 one of the arguments (usually the right hand side) to Unicode, |
|
6359 ie. we can't handle the comparison request. However, it is |
|
6360 possible that the other object knows a comparison method, which |
|
6361 is why we return Py_NotImplemented to give the other object a |
|
6362 chance. |
|
6363 |
|
6364 */ |
|
6365 if (PyErr_ExceptionMatches(PyExc_TypeError)) { |
|
6366 PyErr_Clear(); |
|
6367 Py_INCREF(Py_NotImplemented); |
|
6368 return Py_NotImplemented; |
|
6369 } |
|
6370 if (op != Py_EQ && op != Py_NE) |
|
6371 return NULL; |
|
6372 |
|
6373 /* Equality comparison. |
|
6374 |
|
6375 This is a special case: we silence any PyExc_UnicodeDecodeError |
|
6376 and instead turn it into a PyErr_UnicodeWarning. |
|
6377 |
|
6378 */ |
|
6379 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) |
|
6380 return NULL; |
|
6381 PyErr_Clear(); |
|
6382 if (PyErr_Warn(PyExc_UnicodeWarning, |
|
6383 (op == Py_EQ) ? |
|
6384 "Unicode equal comparison " |
|
6385 "failed to convert both arguments to Unicode - " |
|
6386 "interpreting them as being unequal" : |
|
6387 "Unicode unequal comparison " |
|
6388 "failed to convert both arguments to Unicode - " |
|
6389 "interpreting them as being unequal" |
|
6390 ) < 0) |
|
6391 return NULL; |
|
6392 result = (op == Py_NE); |
|
6393 return PyBool_FromLong(result); |
|
6394 } |
|
6395 |
|
6396 int PyUnicode_Contains(PyObject *container, |
|
6397 PyObject *element) |
|
6398 { |
|
6399 PyObject *str, *sub; |
|
6400 int result; |
|
6401 |
|
6402 /* Coerce the two arguments */ |
|
6403 sub = PyUnicode_FromObject(element); |
|
6404 if (!sub) { |
|
6405 PyErr_SetString(PyExc_TypeError, |
|
6406 "'in <string>' requires string as left operand"); |
|
6407 return -1; |
|
6408 } |
|
6409 |
|
6410 str = PyUnicode_FromObject(container); |
|
6411 if (!str) { |
|
6412 Py_DECREF(sub); |
|
6413 return -1; |
|
6414 } |
|
6415 |
|
6416 result = stringlib_contains_obj(str, sub); |
|
6417 |
|
6418 Py_DECREF(str); |
|
6419 Py_DECREF(sub); |
|
6420 |
|
6421 return result; |
|
6422 } |
|
6423 |
|
6424 /* Concat to string or Unicode object giving a new Unicode object. */ |
|
6425 |
|
6426 PyObject *PyUnicode_Concat(PyObject *left, |
|
6427 PyObject *right) |
|
6428 { |
|
6429 PyUnicodeObject *u = NULL, *v = NULL, *w; |
|
6430 |
|
6431 /* Coerce the two arguments */ |
|
6432 u = (PyUnicodeObject *)PyUnicode_FromObject(left); |
|
6433 if (u == NULL) |
|
6434 goto onError; |
|
6435 v = (PyUnicodeObject *)PyUnicode_FromObject(right); |
|
6436 if (v == NULL) |
|
6437 goto onError; |
|
6438 |
|
6439 /* Shortcuts */ |
|
6440 if (v == unicode_empty) { |
|
6441 Py_DECREF(v); |
|
6442 return (PyObject *)u; |
|
6443 } |
|
6444 if (u == unicode_empty) { |
|
6445 Py_DECREF(u); |
|
6446 return (PyObject *)v; |
|
6447 } |
|
6448 |
|
6449 /* Concat the two Unicode strings */ |
|
6450 w = _PyUnicode_New(u->length + v->length); |
|
6451 if (w == NULL) |
|
6452 goto onError; |
|
6453 Py_UNICODE_COPY(w->str, u->str, u->length); |
|
6454 Py_UNICODE_COPY(w->str + u->length, v->str, v->length); |
|
6455 |
|
6456 Py_DECREF(u); |
|
6457 Py_DECREF(v); |
|
6458 return (PyObject *)w; |
|
6459 |
|
6460 onError: |
|
6461 Py_XDECREF(u); |
|
6462 Py_XDECREF(v); |
|
6463 return NULL; |
|
6464 } |
|
6465 |
|
6466 PyDoc_STRVAR(count__doc__, |
|
6467 "S.count(sub[, start[, end]]) -> int\n\ |
|
6468 \n\ |
|
6469 Return the number of non-overlapping occurrences of substring sub in\n\ |
|
6470 Unicode string S[start:end]. Optional arguments start and end are\n\ |
|
6471 interpreted as in slice notation."); |
|
6472 |
|
6473 static PyObject * |
|
6474 unicode_count(PyUnicodeObject *self, PyObject *args) |
|
6475 { |
|
6476 PyUnicodeObject *substring; |
|
6477 Py_ssize_t start = 0; |
|
6478 Py_ssize_t end = PY_SSIZE_T_MAX; |
|
6479 PyObject *result; |
|
6480 |
|
6481 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring, |
|
6482 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) |
|
6483 return NULL; |
|
6484 |
|
6485 substring = (PyUnicodeObject *)PyUnicode_FromObject( |
|
6486 (PyObject *)substring); |
|
6487 if (substring == NULL) |
|
6488 return NULL; |
|
6489 |
|
6490 FIX_START_END(self); |
|
6491 |
|
6492 result = PyInt_FromSsize_t( |
|
6493 stringlib_count(self->str + start, end - start, |
|
6494 substring->str, substring->length) |
|
6495 ); |
|
6496 |
|
6497 Py_DECREF(substring); |
|
6498 |
|
6499 return result; |
|
6500 } |
|
6501 |
|
6502 PyDoc_STRVAR(encode__doc__, |
|
6503 "S.encode([encoding[,errors]]) -> string or unicode\n\ |
|
6504 \n\ |
|
6505 Encodes S using the codec registered for encoding. encoding defaults\n\ |
|
6506 to the default encoding. errors may be given to set a different error\n\ |
|
6507 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ |
|
6508 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\ |
|
6509 'xmlcharrefreplace' as well as any other name registered with\n\ |
|
6510 codecs.register_error that can handle UnicodeEncodeErrors."); |
|
6511 |
|
6512 static PyObject * |
|
6513 unicode_encode(PyUnicodeObject *self, PyObject *args) |
|
6514 { |
|
6515 char *encoding = NULL; |
|
6516 char *errors = NULL; |
|
6517 PyObject *v; |
|
6518 |
|
6519 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) |
|
6520 return NULL; |
|
6521 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); |
|
6522 if (v == NULL) |
|
6523 goto onError; |
|
6524 if (!PyString_Check(v) && !PyUnicode_Check(v)) { |
|
6525 PyErr_Format(PyExc_TypeError, |
|
6526 "encoder did not return a string/unicode object " |
|
6527 "(type=%.400s)", |
|
6528 Py_TYPE(v)->tp_name); |
|
6529 Py_DECREF(v); |
|
6530 return NULL; |
|
6531 } |
|
6532 return v; |
|
6533 |
|
6534 onError: |
|
6535 return NULL; |
|
6536 } |
|
6537 |
|
6538 PyDoc_STRVAR(decode__doc__, |
|
6539 "S.decode([encoding[,errors]]) -> string or unicode\n\ |
|
6540 \n\ |
|
6541 Decodes S using the codec registered for encoding. encoding defaults\n\ |
|
6542 to the default encoding. errors may be given to set a different error\n\ |
|
6543 handling scheme. Default is 'strict' meaning that encoding errors raise\n\ |
|
6544 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\ |
|
6545 as well as any other name registerd with codecs.register_error that is\n\ |
|
6546 able to handle UnicodeDecodeErrors."); |
|
6547 |
|
6548 static PyObject * |
|
6549 unicode_decode(PyUnicodeObject *self, PyObject *args) |
|
6550 { |
|
6551 char *encoding = NULL; |
|
6552 char *errors = NULL; |
|
6553 PyObject *v; |
|
6554 |
|
6555 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors)) |
|
6556 return NULL; |
|
6557 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors); |
|
6558 if (v == NULL) |
|
6559 goto onError; |
|
6560 if (!PyString_Check(v) && !PyUnicode_Check(v)) { |
|
6561 PyErr_Format(PyExc_TypeError, |
|
6562 "decoder did not return a string/unicode object " |
|
6563 "(type=%.400s)", |
|
6564 Py_TYPE(v)->tp_name); |
|
6565 Py_DECREF(v); |
|
6566 return NULL; |
|
6567 } |
|
6568 return v; |
|
6569 |
|
6570 onError: |
|
6571 return NULL; |
|
6572 } |
|
6573 |
|
6574 PyDoc_STRVAR(expandtabs__doc__, |
|
6575 "S.expandtabs([tabsize]) -> unicode\n\ |
|
6576 \n\ |
|
6577 Return a copy of S where all tab characters are expanded using spaces.\n\ |
|
6578 If tabsize is not given, a tab size of 8 characters is assumed."); |
|
6579 |
|
6580 static PyObject* |
|
6581 unicode_expandtabs(PyUnicodeObject *self, PyObject *args) |
|
6582 { |
|
6583 Py_UNICODE *e; |
|
6584 Py_UNICODE *p; |
|
6585 Py_UNICODE *q; |
|
6586 Py_UNICODE *qe; |
|
6587 Py_ssize_t i, j, incr; |
|
6588 PyUnicodeObject *u; |
|
6589 int tabsize = 8; |
|
6590 |
|
6591 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) |
|
6592 return NULL; |
|
6593 |
|
6594 /* First pass: determine size of output string */ |
|
6595 i = 0; /* chars up to and including most recent \n or \r */ |
|
6596 j = 0; /* chars since most recent \n or \r (use in tab calculations) */ |
|
6597 e = self->str + self->length; /* end of input */ |
|
6598 for (p = self->str; p < e; p++) |
|
6599 if (*p == '\t') { |
|
6600 if (tabsize > 0) { |
|
6601 incr = tabsize - (j % tabsize); /* cannot overflow */ |
|
6602 if (j > PY_SSIZE_T_MAX - incr) |
|
6603 goto overflow1; |
|
6604 j += incr; |
|
6605 } |
|
6606 } |
|
6607 else { |
|
6608 if (j > PY_SSIZE_T_MAX - 1) |
|
6609 goto overflow1; |
|
6610 j++; |
|
6611 if (*p == '\n' || *p == '\r') { |
|
6612 if (i > PY_SSIZE_T_MAX - j) |
|
6613 goto overflow1; |
|
6614 i += j; |
|
6615 j = 0; |
|
6616 } |
|
6617 } |
|
6618 |
|
6619 if (i > PY_SSIZE_T_MAX - j) |
|
6620 goto overflow1; |
|
6621 |
|
6622 /* Second pass: create output string and fill it */ |
|
6623 u = _PyUnicode_New(i + j); |
|
6624 if (!u) |
|
6625 return NULL; |
|
6626 |
|
6627 j = 0; /* same as in first pass */ |
|
6628 q = u->str; /* next output char */ |
|
6629 qe = u->str + u->length; /* end of output */ |
|
6630 |
|
6631 for (p = self->str; p < e; p++) |
|
6632 if (*p == '\t') { |
|
6633 if (tabsize > 0) { |
|
6634 i = tabsize - (j % tabsize); |
|
6635 j += i; |
|
6636 while (i--) { |
|
6637 if (q >= qe) |
|
6638 goto overflow2; |
|
6639 *q++ = ' '; |
|
6640 } |
|
6641 } |
|
6642 } |
|
6643 else { |
|
6644 if (q >= qe) |
|
6645 goto overflow2; |
|
6646 *q++ = *p; |
|
6647 j++; |
|
6648 if (*p == '\n' || *p == '\r') |
|
6649 j = 0; |
|
6650 } |
|
6651 |
|
6652 return (PyObject*) u; |
|
6653 |
|
6654 overflow2: |
|
6655 Py_DECREF(u); |
|
6656 overflow1: |
|
6657 PyErr_SetString(PyExc_OverflowError, "new string is too long"); |
|
6658 return NULL; |
|
6659 } |
|
6660 |
|
6661 PyDoc_STRVAR(find__doc__, |
|
6662 "S.find(sub [,start [,end]]) -> int\n\ |
|
6663 \n\ |
|
6664 Return the lowest index in S where substring sub is found,\n\ |
|
6665 such that sub is contained within s[start:end]. Optional\n\ |
|
6666 arguments start and end are interpreted as in slice notation.\n\ |
|
6667 \n\ |
|
6668 Return -1 on failure."); |
|
6669 |
|
6670 static PyObject * |
|
6671 unicode_find(PyUnicodeObject *self, PyObject *args) |
|
6672 { |
|
6673 PyObject *substring; |
|
6674 Py_ssize_t start; |
|
6675 Py_ssize_t end; |
|
6676 Py_ssize_t result; |
|
6677 |
|
6678 if (!_ParseTupleFinds(args, &substring, &start, &end)) |
|
6679 return NULL; |
|
6680 |
|
6681 result = stringlib_find_slice( |
|
6682 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), |
|
6683 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), |
|
6684 start, end |
|
6685 ); |
|
6686 |
|
6687 Py_DECREF(substring); |
|
6688 |
|
6689 return PyInt_FromSsize_t(result); |
|
6690 } |
|
6691 |
|
6692 static PyObject * |
|
6693 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index) |
|
6694 { |
|
6695 if (index < 0 || index >= self->length) { |
|
6696 PyErr_SetString(PyExc_IndexError, "string index out of range"); |
|
6697 return NULL; |
|
6698 } |
|
6699 |
|
6700 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1); |
|
6701 } |
|
6702 |
|
6703 static long |
|
6704 unicode_hash(PyUnicodeObject *self) |
|
6705 { |
|
6706 /* Since Unicode objects compare equal to their ASCII string |
|
6707 counterparts, they should use the individual character values |
|
6708 as basis for their hash value. This is needed to assure that |
|
6709 strings and Unicode objects behave in the same way as |
|
6710 dictionary keys. */ |
|
6711 |
|
6712 register Py_ssize_t len; |
|
6713 register Py_UNICODE *p; |
|
6714 register long x; |
|
6715 |
|
6716 if (self->hash != -1) |
|
6717 return self->hash; |
|
6718 len = PyUnicode_GET_SIZE(self); |
|
6719 p = PyUnicode_AS_UNICODE(self); |
|
6720 x = *p << 7; |
|
6721 while (--len >= 0) |
|
6722 x = (1000003*x) ^ *p++; |
|
6723 x ^= PyUnicode_GET_SIZE(self); |
|
6724 if (x == -1) |
|
6725 x = -2; |
|
6726 self->hash = x; |
|
6727 return x; |
|
6728 } |
|
6729 |
|
6730 PyDoc_STRVAR(index__doc__, |
|
6731 "S.index(sub [,start [,end]]) -> int\n\ |
|
6732 \n\ |
|
6733 Like S.find() but raise ValueError when the substring is not found."); |
|
6734 |
|
6735 static PyObject * |
|
6736 unicode_index(PyUnicodeObject *self, PyObject *args) |
|
6737 { |
|
6738 Py_ssize_t result; |
|
6739 PyObject *substring; |
|
6740 Py_ssize_t start; |
|
6741 Py_ssize_t end; |
|
6742 |
|
6743 if (!_ParseTupleFinds(args, &substring, &start, &end)) |
|
6744 return NULL; |
|
6745 |
|
6746 result = stringlib_find_slice( |
|
6747 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), |
|
6748 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), |
|
6749 start, end |
|
6750 ); |
|
6751 |
|
6752 Py_DECREF(substring); |
|
6753 |
|
6754 if (result < 0) { |
|
6755 PyErr_SetString(PyExc_ValueError, "substring not found"); |
|
6756 return NULL; |
|
6757 } |
|
6758 |
|
6759 return PyInt_FromSsize_t(result); |
|
6760 } |
|
6761 |
|
6762 PyDoc_STRVAR(islower__doc__, |
|
6763 "S.islower() -> bool\n\ |
|
6764 \n\ |
|
6765 Return True if all cased characters in S are lowercase and there is\n\ |
|
6766 at least one cased character in S, False otherwise."); |
|
6767 |
|
6768 static PyObject* |
|
6769 unicode_islower(PyUnicodeObject *self) |
|
6770 { |
|
6771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6772 register const Py_UNICODE *e; |
|
6773 int cased; |
|
6774 |
|
6775 /* Shortcut for single character strings */ |
|
6776 if (PyUnicode_GET_SIZE(self) == 1) |
|
6777 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p)); |
|
6778 |
|
6779 /* Special case for empty strings */ |
|
6780 if (PyUnicode_GET_SIZE(self) == 0) |
|
6781 return PyBool_FromLong(0); |
|
6782 |
|
6783 e = p + PyUnicode_GET_SIZE(self); |
|
6784 cased = 0; |
|
6785 for (; p < e; p++) { |
|
6786 register const Py_UNICODE ch = *p; |
|
6787 |
|
6788 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) |
|
6789 return PyBool_FromLong(0); |
|
6790 else if (!cased && Py_UNICODE_ISLOWER(ch)) |
|
6791 cased = 1; |
|
6792 } |
|
6793 return PyBool_FromLong(cased); |
|
6794 } |
|
6795 |
|
6796 PyDoc_STRVAR(isupper__doc__, |
|
6797 "S.isupper() -> bool\n\ |
|
6798 \n\ |
|
6799 Return True if all cased characters in S are uppercase and there is\n\ |
|
6800 at least one cased character in S, False otherwise."); |
|
6801 |
|
6802 static PyObject* |
|
6803 unicode_isupper(PyUnicodeObject *self) |
|
6804 { |
|
6805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6806 register const Py_UNICODE *e; |
|
6807 int cased; |
|
6808 |
|
6809 /* Shortcut for single character strings */ |
|
6810 if (PyUnicode_GET_SIZE(self) == 1) |
|
6811 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0); |
|
6812 |
|
6813 /* Special case for empty strings */ |
|
6814 if (PyUnicode_GET_SIZE(self) == 0) |
|
6815 return PyBool_FromLong(0); |
|
6816 |
|
6817 e = p + PyUnicode_GET_SIZE(self); |
|
6818 cased = 0; |
|
6819 for (; p < e; p++) { |
|
6820 register const Py_UNICODE ch = *p; |
|
6821 |
|
6822 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) |
|
6823 return PyBool_FromLong(0); |
|
6824 else if (!cased && Py_UNICODE_ISUPPER(ch)) |
|
6825 cased = 1; |
|
6826 } |
|
6827 return PyBool_FromLong(cased); |
|
6828 } |
|
6829 |
|
6830 PyDoc_STRVAR(istitle__doc__, |
|
6831 "S.istitle() -> bool\n\ |
|
6832 \n\ |
|
6833 Return True if S is a titlecased string and there is at least one\n\ |
|
6834 character in S, i.e. upper- and titlecase characters may only\n\ |
|
6835 follow uncased characters and lowercase characters only cased ones.\n\ |
|
6836 Return False otherwise."); |
|
6837 |
|
6838 static PyObject* |
|
6839 unicode_istitle(PyUnicodeObject *self) |
|
6840 { |
|
6841 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6842 register const Py_UNICODE *e; |
|
6843 int cased, previous_is_cased; |
|
6844 |
|
6845 /* Shortcut for single character strings */ |
|
6846 if (PyUnicode_GET_SIZE(self) == 1) |
|
6847 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) || |
|
6848 (Py_UNICODE_ISUPPER(*p) != 0)); |
|
6849 |
|
6850 /* Special case for empty strings */ |
|
6851 if (PyUnicode_GET_SIZE(self) == 0) |
|
6852 return PyBool_FromLong(0); |
|
6853 |
|
6854 e = p + PyUnicode_GET_SIZE(self); |
|
6855 cased = 0; |
|
6856 previous_is_cased = 0; |
|
6857 for (; p < e; p++) { |
|
6858 register const Py_UNICODE ch = *p; |
|
6859 |
|
6860 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { |
|
6861 if (previous_is_cased) |
|
6862 return PyBool_FromLong(0); |
|
6863 previous_is_cased = 1; |
|
6864 cased = 1; |
|
6865 } |
|
6866 else if (Py_UNICODE_ISLOWER(ch)) { |
|
6867 if (!previous_is_cased) |
|
6868 return PyBool_FromLong(0); |
|
6869 previous_is_cased = 1; |
|
6870 cased = 1; |
|
6871 } |
|
6872 else |
|
6873 previous_is_cased = 0; |
|
6874 } |
|
6875 return PyBool_FromLong(cased); |
|
6876 } |
|
6877 |
|
6878 PyDoc_STRVAR(isspace__doc__, |
|
6879 "S.isspace() -> bool\n\ |
|
6880 \n\ |
|
6881 Return True if all characters in S are whitespace\n\ |
|
6882 and there is at least one character in S, False otherwise."); |
|
6883 |
|
6884 static PyObject* |
|
6885 unicode_isspace(PyUnicodeObject *self) |
|
6886 { |
|
6887 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6888 register const Py_UNICODE *e; |
|
6889 |
|
6890 /* Shortcut for single character strings */ |
|
6891 if (PyUnicode_GET_SIZE(self) == 1 && |
|
6892 Py_UNICODE_ISSPACE(*p)) |
|
6893 return PyBool_FromLong(1); |
|
6894 |
|
6895 /* Special case for empty strings */ |
|
6896 if (PyUnicode_GET_SIZE(self) == 0) |
|
6897 return PyBool_FromLong(0); |
|
6898 |
|
6899 e = p + PyUnicode_GET_SIZE(self); |
|
6900 for (; p < e; p++) { |
|
6901 if (!Py_UNICODE_ISSPACE(*p)) |
|
6902 return PyBool_FromLong(0); |
|
6903 } |
|
6904 return PyBool_FromLong(1); |
|
6905 } |
|
6906 |
|
6907 PyDoc_STRVAR(isalpha__doc__, |
|
6908 "S.isalpha() -> bool\n\ |
|
6909 \n\ |
|
6910 Return True if all characters in S are alphabetic\n\ |
|
6911 and there is at least one character in S, False otherwise."); |
|
6912 |
|
6913 static PyObject* |
|
6914 unicode_isalpha(PyUnicodeObject *self) |
|
6915 { |
|
6916 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6917 register const Py_UNICODE *e; |
|
6918 |
|
6919 /* Shortcut for single character strings */ |
|
6920 if (PyUnicode_GET_SIZE(self) == 1 && |
|
6921 Py_UNICODE_ISALPHA(*p)) |
|
6922 return PyBool_FromLong(1); |
|
6923 |
|
6924 /* Special case for empty strings */ |
|
6925 if (PyUnicode_GET_SIZE(self) == 0) |
|
6926 return PyBool_FromLong(0); |
|
6927 |
|
6928 e = p + PyUnicode_GET_SIZE(self); |
|
6929 for (; p < e; p++) { |
|
6930 if (!Py_UNICODE_ISALPHA(*p)) |
|
6931 return PyBool_FromLong(0); |
|
6932 } |
|
6933 return PyBool_FromLong(1); |
|
6934 } |
|
6935 |
|
6936 PyDoc_STRVAR(isalnum__doc__, |
|
6937 "S.isalnum() -> bool\n\ |
|
6938 \n\ |
|
6939 Return True if all characters in S are alphanumeric\n\ |
|
6940 and there is at least one character in S, False otherwise."); |
|
6941 |
|
6942 static PyObject* |
|
6943 unicode_isalnum(PyUnicodeObject *self) |
|
6944 { |
|
6945 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6946 register const Py_UNICODE *e; |
|
6947 |
|
6948 /* Shortcut for single character strings */ |
|
6949 if (PyUnicode_GET_SIZE(self) == 1 && |
|
6950 Py_UNICODE_ISALNUM(*p)) |
|
6951 return PyBool_FromLong(1); |
|
6952 |
|
6953 /* Special case for empty strings */ |
|
6954 if (PyUnicode_GET_SIZE(self) == 0) |
|
6955 return PyBool_FromLong(0); |
|
6956 |
|
6957 e = p + PyUnicode_GET_SIZE(self); |
|
6958 for (; p < e; p++) { |
|
6959 if (!Py_UNICODE_ISALNUM(*p)) |
|
6960 return PyBool_FromLong(0); |
|
6961 } |
|
6962 return PyBool_FromLong(1); |
|
6963 } |
|
6964 |
|
6965 PyDoc_STRVAR(isdecimal__doc__, |
|
6966 "S.isdecimal() -> bool\n\ |
|
6967 \n\ |
|
6968 Return True if there are only decimal characters in S,\n\ |
|
6969 False otherwise."); |
|
6970 |
|
6971 static PyObject* |
|
6972 unicode_isdecimal(PyUnicodeObject *self) |
|
6973 { |
|
6974 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
6975 register const Py_UNICODE *e; |
|
6976 |
|
6977 /* Shortcut for single character strings */ |
|
6978 if (PyUnicode_GET_SIZE(self) == 1 && |
|
6979 Py_UNICODE_ISDECIMAL(*p)) |
|
6980 return PyBool_FromLong(1); |
|
6981 |
|
6982 /* Special case for empty strings */ |
|
6983 if (PyUnicode_GET_SIZE(self) == 0) |
|
6984 return PyBool_FromLong(0); |
|
6985 |
|
6986 e = p + PyUnicode_GET_SIZE(self); |
|
6987 for (; p < e; p++) { |
|
6988 if (!Py_UNICODE_ISDECIMAL(*p)) |
|
6989 return PyBool_FromLong(0); |
|
6990 } |
|
6991 return PyBool_FromLong(1); |
|
6992 } |
|
6993 |
|
6994 PyDoc_STRVAR(isdigit__doc__, |
|
6995 "S.isdigit() -> bool\n\ |
|
6996 \n\ |
|
6997 Return True if all characters in S are digits\n\ |
|
6998 and there is at least one character in S, False otherwise."); |
|
6999 |
|
7000 static PyObject* |
|
7001 unicode_isdigit(PyUnicodeObject *self) |
|
7002 { |
|
7003 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
7004 register const Py_UNICODE *e; |
|
7005 |
|
7006 /* Shortcut for single character strings */ |
|
7007 if (PyUnicode_GET_SIZE(self) == 1 && |
|
7008 Py_UNICODE_ISDIGIT(*p)) |
|
7009 return PyBool_FromLong(1); |
|
7010 |
|
7011 /* Special case for empty strings */ |
|
7012 if (PyUnicode_GET_SIZE(self) == 0) |
|
7013 return PyBool_FromLong(0); |
|
7014 |
|
7015 e = p + PyUnicode_GET_SIZE(self); |
|
7016 for (; p < e; p++) { |
|
7017 if (!Py_UNICODE_ISDIGIT(*p)) |
|
7018 return PyBool_FromLong(0); |
|
7019 } |
|
7020 return PyBool_FromLong(1); |
|
7021 } |
|
7022 |
|
7023 PyDoc_STRVAR(isnumeric__doc__, |
|
7024 "S.isnumeric() -> bool\n\ |
|
7025 \n\ |
|
7026 Return True if there are only numeric characters in S,\n\ |
|
7027 False otherwise."); |
|
7028 |
|
7029 static PyObject* |
|
7030 unicode_isnumeric(PyUnicodeObject *self) |
|
7031 { |
|
7032 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self); |
|
7033 register const Py_UNICODE *e; |
|
7034 |
|
7035 /* Shortcut for single character strings */ |
|
7036 if (PyUnicode_GET_SIZE(self) == 1 && |
|
7037 Py_UNICODE_ISNUMERIC(*p)) |
|
7038 return PyBool_FromLong(1); |
|
7039 |
|
7040 /* Special case for empty strings */ |
|
7041 if (PyUnicode_GET_SIZE(self) == 0) |
|
7042 return PyBool_FromLong(0); |
|
7043 |
|
7044 e = p + PyUnicode_GET_SIZE(self); |
|
7045 for (; p < e; p++) { |
|
7046 if (!Py_UNICODE_ISNUMERIC(*p)) |
|
7047 return PyBool_FromLong(0); |
|
7048 } |
|
7049 return PyBool_FromLong(1); |
|
7050 } |
|
7051 |
|
7052 PyDoc_STRVAR(join__doc__, |
|
7053 "S.join(sequence) -> unicode\n\ |
|
7054 \n\ |
|
7055 Return a string which is the concatenation of the strings in the\n\ |
|
7056 sequence. The separator between elements is S."); |
|
7057 |
|
7058 static PyObject* |
|
7059 unicode_join(PyObject *self, PyObject *data) |
|
7060 { |
|
7061 return PyUnicode_Join(self, data); |
|
7062 } |
|
7063 |
|
7064 static Py_ssize_t |
|
7065 unicode_length(PyUnicodeObject *self) |
|
7066 { |
|
7067 return self->length; |
|
7068 } |
|
7069 |
|
7070 PyDoc_STRVAR(ljust__doc__, |
|
7071 "S.ljust(width[, fillchar]) -> int\n\ |
|
7072 \n\ |
|
7073 Return S left-justified in a Unicode string of length width. Padding is\n\ |
|
7074 done using the specified fill character (default is a space)."); |
|
7075 |
|
7076 static PyObject * |
|
7077 unicode_ljust(PyUnicodeObject *self, PyObject *args) |
|
7078 { |
|
7079 Py_ssize_t width; |
|
7080 Py_UNICODE fillchar = ' '; |
|
7081 |
|
7082 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar)) |
|
7083 return NULL; |
|
7084 |
|
7085 if (self->length >= width && PyUnicode_CheckExact(self)) { |
|
7086 Py_INCREF(self); |
|
7087 return (PyObject*) self; |
|
7088 } |
|
7089 |
|
7090 return (PyObject*) pad(self, 0, width - self->length, fillchar); |
|
7091 } |
|
7092 |
|
7093 PyDoc_STRVAR(lower__doc__, |
|
7094 "S.lower() -> unicode\n\ |
|
7095 \n\ |
|
7096 Return a copy of the string S converted to lowercase."); |
|
7097 |
|
7098 static PyObject* |
|
7099 unicode_lower(PyUnicodeObject *self) |
|
7100 { |
|
7101 return fixup(self, fixlower); |
|
7102 } |
|
7103 |
|
7104 #define LEFTSTRIP 0 |
|
7105 #define RIGHTSTRIP 1 |
|
7106 #define BOTHSTRIP 2 |
|
7107 |
|
7108 /* Arrays indexed by above */ |
|
7109 static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; |
|
7110 |
|
7111 #define STRIPNAME(i) (stripformat[i]+3) |
|
7112 |
|
7113 /* externally visible for str.strip(unicode) */ |
|
7114 PyObject * |
|
7115 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) |
|
7116 { |
|
7117 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); |
|
7118 Py_ssize_t len = PyUnicode_GET_SIZE(self); |
|
7119 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); |
|
7120 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); |
|
7121 Py_ssize_t i, j; |
|
7122 |
|
7123 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); |
|
7124 |
|
7125 i = 0; |
|
7126 if (striptype != RIGHTSTRIP) { |
|
7127 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) { |
|
7128 i++; |
|
7129 } |
|
7130 } |
|
7131 |
|
7132 j = len; |
|
7133 if (striptype != LEFTSTRIP) { |
|
7134 do { |
|
7135 j--; |
|
7136 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); |
|
7137 j++; |
|
7138 } |
|
7139 |
|
7140 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { |
|
7141 Py_INCREF(self); |
|
7142 return (PyObject*)self; |
|
7143 } |
|
7144 else |
|
7145 return PyUnicode_FromUnicode(s+i, j-i); |
|
7146 } |
|
7147 |
|
7148 |
|
7149 static PyObject * |
|
7150 do_strip(PyUnicodeObject *self, int striptype) |
|
7151 { |
|
7152 Py_UNICODE *s = PyUnicode_AS_UNICODE(self); |
|
7153 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; |
|
7154 |
|
7155 i = 0; |
|
7156 if (striptype != RIGHTSTRIP) { |
|
7157 while (i < len && Py_UNICODE_ISSPACE(s[i])) { |
|
7158 i++; |
|
7159 } |
|
7160 } |
|
7161 |
|
7162 j = len; |
|
7163 if (striptype != LEFTSTRIP) { |
|
7164 do { |
|
7165 j--; |
|
7166 } while (j >= i && Py_UNICODE_ISSPACE(s[j])); |
|
7167 j++; |
|
7168 } |
|
7169 |
|
7170 if (i == 0 && j == len && PyUnicode_CheckExact(self)) { |
|
7171 Py_INCREF(self); |
|
7172 return (PyObject*)self; |
|
7173 } |
|
7174 else |
|
7175 return PyUnicode_FromUnicode(s+i, j-i); |
|
7176 } |
|
7177 |
|
7178 |
|
7179 static PyObject * |
|
7180 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) |
|
7181 { |
|
7182 PyObject *sep = NULL; |
|
7183 |
|
7184 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) |
|
7185 return NULL; |
|
7186 |
|
7187 if (sep != NULL && sep != Py_None) { |
|
7188 if (PyUnicode_Check(sep)) |
|
7189 return _PyUnicode_XStrip(self, striptype, sep); |
|
7190 else if (PyString_Check(sep)) { |
|
7191 PyObject *res; |
|
7192 sep = PyUnicode_FromObject(sep); |
|
7193 if (sep==NULL) |
|
7194 return NULL; |
|
7195 res = _PyUnicode_XStrip(self, striptype, sep); |
|
7196 Py_DECREF(sep); |
|
7197 return res; |
|
7198 } |
|
7199 else { |
|
7200 PyErr_Format(PyExc_TypeError, |
|
7201 "%s arg must be None, unicode or str", |
|
7202 STRIPNAME(striptype)); |
|
7203 return NULL; |
|
7204 } |
|
7205 } |
|
7206 |
|
7207 return do_strip(self, striptype); |
|
7208 } |
|
7209 |
|
7210 |
|
7211 PyDoc_STRVAR(strip__doc__, |
|
7212 "S.strip([chars]) -> unicode\n\ |
|
7213 \n\ |
|
7214 Return a copy of the string S with leading and trailing\n\ |
|
7215 whitespace removed.\n\ |
|
7216 If chars is given and not None, remove characters in chars instead.\n\ |
|
7217 If chars is a str, it will be converted to unicode before stripping"); |
|
7218 |
|
7219 static PyObject * |
|
7220 unicode_strip(PyUnicodeObject *self, PyObject *args) |
|
7221 { |
|
7222 if (PyTuple_GET_SIZE(args) == 0) |
|
7223 return do_strip(self, BOTHSTRIP); /* Common case */ |
|
7224 else |
|
7225 return do_argstrip(self, BOTHSTRIP, args); |
|
7226 } |
|
7227 |
|
7228 |
|
7229 PyDoc_STRVAR(lstrip__doc__, |
|
7230 "S.lstrip([chars]) -> unicode\n\ |
|
7231 \n\ |
|
7232 Return a copy of the string S with leading whitespace removed.\n\ |
|
7233 If chars is given and not None, remove characters in chars instead.\n\ |
|
7234 If chars is a str, it will be converted to unicode before stripping"); |
|
7235 |
|
7236 static PyObject * |
|
7237 unicode_lstrip(PyUnicodeObject *self, PyObject *args) |
|
7238 { |
|
7239 if (PyTuple_GET_SIZE(args) == 0) |
|
7240 return do_strip(self, LEFTSTRIP); /* Common case */ |
|
7241 else |
|
7242 return do_argstrip(self, LEFTSTRIP, args); |
|
7243 } |
|
7244 |
|
7245 |
|
7246 PyDoc_STRVAR(rstrip__doc__, |
|
7247 "S.rstrip([chars]) -> unicode\n\ |
|
7248 \n\ |
|
7249 Return a copy of the string S with trailing whitespace removed.\n\ |
|
7250 If chars is given and not None, remove characters in chars instead.\n\ |
|
7251 If chars is a str, it will be converted to unicode before stripping"); |
|
7252 |
|
7253 static PyObject * |
|
7254 unicode_rstrip(PyUnicodeObject *self, PyObject *args) |
|
7255 { |
|
7256 if (PyTuple_GET_SIZE(args) == 0) |
|
7257 return do_strip(self, RIGHTSTRIP); /* Common case */ |
|
7258 else |
|
7259 return do_argstrip(self, RIGHTSTRIP, args); |
|
7260 } |
|
7261 |
|
7262 |
|
7263 static PyObject* |
|
7264 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) |
|
7265 { |
|
7266 PyUnicodeObject *u; |
|
7267 Py_UNICODE *p; |
|
7268 Py_ssize_t nchars; |
|
7269 size_t nbytes; |
|
7270 |
|
7271 if (len < 0) |
|
7272 len = 0; |
|
7273 |
|
7274 if (len == 1 && PyUnicode_CheckExact(str)) { |
|
7275 /* no repeat, return original string */ |
|
7276 Py_INCREF(str); |
|
7277 return (PyObject*) str; |
|
7278 } |
|
7279 |
|
7280 /* ensure # of chars needed doesn't overflow int and # of bytes |
|
7281 * needed doesn't overflow size_t |
|
7282 */ |
|
7283 nchars = len * str->length; |
|
7284 if (len && nchars / len != str->length) { |
|
7285 PyErr_SetString(PyExc_OverflowError, |
|
7286 "repeated string is too long"); |
|
7287 return NULL; |
|
7288 } |
|
7289 nbytes = (nchars + 1) * sizeof(Py_UNICODE); |
|
7290 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) { |
|
7291 PyErr_SetString(PyExc_OverflowError, |
|
7292 "repeated string is too long"); |
|
7293 return NULL; |
|
7294 } |
|
7295 u = _PyUnicode_New(nchars); |
|
7296 if (!u) |
|
7297 return NULL; |
|
7298 |
|
7299 p = u->str; |
|
7300 |
|
7301 if (str->length == 1 && len > 0) { |
|
7302 Py_UNICODE_FILL(p, str->str[0], len); |
|
7303 } else { |
|
7304 Py_ssize_t done = 0; /* number of characters copied this far */ |
|
7305 if (done < nchars) { |
|
7306 Py_UNICODE_COPY(p, str->str, str->length); |
|
7307 done = str->length; |
|
7308 } |
|
7309 while (done < nchars) { |
|
7310 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done; |
|
7311 Py_UNICODE_COPY(p+done, p, n); |
|
7312 done += n; |
|
7313 } |
|
7314 } |
|
7315 |
|
7316 return (PyObject*) u; |
|
7317 } |
|
7318 |
|
7319 PyObject *PyUnicode_Replace(PyObject *obj, |
|
7320 PyObject *subobj, |
|
7321 PyObject *replobj, |
|
7322 Py_ssize_t maxcount) |
|
7323 { |
|
7324 PyObject *self; |
|
7325 PyObject *str1; |
|
7326 PyObject *str2; |
|
7327 PyObject *result; |
|
7328 |
|
7329 self = PyUnicode_FromObject(obj); |
|
7330 if (self == NULL) |
|
7331 return NULL; |
|
7332 str1 = PyUnicode_FromObject(subobj); |
|
7333 if (str1 == NULL) { |
|
7334 Py_DECREF(self); |
|
7335 return NULL; |
|
7336 } |
|
7337 str2 = PyUnicode_FromObject(replobj); |
|
7338 if (str2 == NULL) { |
|
7339 Py_DECREF(self); |
|
7340 Py_DECREF(str1); |
|
7341 return NULL; |
|
7342 } |
|
7343 result = replace((PyUnicodeObject *)self, |
|
7344 (PyUnicodeObject *)str1, |
|
7345 (PyUnicodeObject *)str2, |
|
7346 maxcount); |
|
7347 Py_DECREF(self); |
|
7348 Py_DECREF(str1); |
|
7349 Py_DECREF(str2); |
|
7350 return result; |
|
7351 } |
|
7352 |
|
7353 PyDoc_STRVAR(replace__doc__, |
|
7354 "S.replace (old, new[, count]) -> unicode\n\ |
|
7355 \n\ |
|
7356 Return a copy of S with all occurrences of substring\n\ |
|
7357 old replaced by new. If the optional argument count is\n\ |
|
7358 given, only the first count occurrences are replaced."); |
|
7359 |
|
7360 static PyObject* |
|
7361 unicode_replace(PyUnicodeObject *self, PyObject *args) |
|
7362 { |
|
7363 PyUnicodeObject *str1; |
|
7364 PyUnicodeObject *str2; |
|
7365 Py_ssize_t maxcount = -1; |
|
7366 PyObject *result; |
|
7367 |
|
7368 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount)) |
|
7369 return NULL; |
|
7370 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1); |
|
7371 if (str1 == NULL) |
|
7372 return NULL; |
|
7373 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2); |
|
7374 if (str2 == NULL) { |
|
7375 Py_DECREF(str1); |
|
7376 return NULL; |
|
7377 } |
|
7378 |
|
7379 result = replace(self, str1, str2, maxcount); |
|
7380 |
|
7381 Py_DECREF(str1); |
|
7382 Py_DECREF(str2); |
|
7383 return result; |
|
7384 } |
|
7385 |
|
7386 static |
|
7387 PyObject *unicode_repr(PyObject *unicode) |
|
7388 { |
|
7389 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode), |
|
7390 PyUnicode_GET_SIZE(unicode), |
|
7391 1); |
|
7392 } |
|
7393 |
|
7394 PyDoc_STRVAR(rfind__doc__, |
|
7395 "S.rfind(sub [,start [,end]]) -> int\n\ |
|
7396 \n\ |
|
7397 Return the highest index in S where substring sub is found,\n\ |
|
7398 such that sub is contained within s[start:end]. Optional\n\ |
|
7399 arguments start and end are interpreted as in slice notation.\n\ |
|
7400 \n\ |
|
7401 Return -1 on failure."); |
|
7402 |
|
7403 static PyObject * |
|
7404 unicode_rfind(PyUnicodeObject *self, PyObject *args) |
|
7405 { |
|
7406 PyObject *substring; |
|
7407 Py_ssize_t start; |
|
7408 Py_ssize_t end; |
|
7409 Py_ssize_t result; |
|
7410 |
|
7411 if (!_ParseTupleFinds(args, &substring, &start, &end)) |
|
7412 return NULL; |
|
7413 |
|
7414 result = stringlib_rfind_slice( |
|
7415 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), |
|
7416 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), |
|
7417 start, end |
|
7418 ); |
|
7419 |
|
7420 Py_DECREF(substring); |
|
7421 |
|
7422 return PyInt_FromSsize_t(result); |
|
7423 } |
|
7424 |
|
7425 PyDoc_STRVAR(rindex__doc__, |
|
7426 "S.rindex(sub [,start [,end]]) -> int\n\ |
|
7427 \n\ |
|
7428 Like S.rfind() but raise ValueError when the substring is not found."); |
|
7429 |
|
7430 static PyObject * |
|
7431 unicode_rindex(PyUnicodeObject *self, PyObject *args) |
|
7432 { |
|
7433 PyObject *substring; |
|
7434 Py_ssize_t start; |
|
7435 Py_ssize_t end; |
|
7436 Py_ssize_t result; |
|
7437 |
|
7438 if (!_ParseTupleFinds(args, &substring, &start, &end)) |
|
7439 return NULL; |
|
7440 |
|
7441 result = stringlib_rfind_slice( |
|
7442 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self), |
|
7443 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring), |
|
7444 start, end |
|
7445 ); |
|
7446 |
|
7447 Py_DECREF(substring); |
|
7448 |
|
7449 if (result < 0) { |
|
7450 PyErr_SetString(PyExc_ValueError, "substring not found"); |
|
7451 return NULL; |
|
7452 } |
|
7453 return PyInt_FromSsize_t(result); |
|
7454 } |
|
7455 |
|
7456 PyDoc_STRVAR(rjust__doc__, |
|
7457 "S.rjust(width[, fillchar]) -> unicode\n\ |
|
7458 \n\ |
|
7459 Return S right-justified in a Unicode string of length width. Padding is\n\ |
|
7460 done using the specified fill character (default is a space)."); |
|
7461 |
|
7462 static PyObject * |
|
7463 unicode_rjust(PyUnicodeObject *self, PyObject *args) |
|
7464 { |
|
7465 Py_ssize_t width; |
|
7466 Py_UNICODE fillchar = ' '; |
|
7467 |
|
7468 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar)) |
|
7469 return NULL; |
|
7470 |
|
7471 if (self->length >= width && PyUnicode_CheckExact(self)) { |
|
7472 Py_INCREF(self); |
|
7473 return (PyObject*) self; |
|
7474 } |
|
7475 |
|
7476 return (PyObject*) pad(self, width - self->length, 0, fillchar); |
|
7477 } |
|
7478 |
|
7479 static PyObject* |
|
7480 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end) |
|
7481 { |
|
7482 /* standard clamping */ |
|
7483 if (start < 0) |
|
7484 start = 0; |
|
7485 if (end < 0) |
|
7486 end = 0; |
|
7487 if (end > self->length) |
|
7488 end = self->length; |
|
7489 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { |
|
7490 /* full slice, return original string */ |
|
7491 Py_INCREF(self); |
|
7492 return (PyObject*) self; |
|
7493 } |
|
7494 if (start > end) |
|
7495 start = end; |
|
7496 /* copy slice */ |
|
7497 return (PyObject*) PyUnicode_FromUnicode(self->str + start, |
|
7498 end - start); |
|
7499 } |
|
7500 |
|
7501 PyObject *PyUnicode_Split(PyObject *s, |
|
7502 PyObject *sep, |
|
7503 Py_ssize_t maxsplit) |
|
7504 { |
|
7505 PyObject *result; |
|
7506 |
|
7507 s = PyUnicode_FromObject(s); |
|
7508 if (s == NULL) |
|
7509 return NULL; |
|
7510 if (sep != NULL) { |
|
7511 sep = PyUnicode_FromObject(sep); |
|
7512 if (sep == NULL) { |
|
7513 Py_DECREF(s); |
|
7514 return NULL; |
|
7515 } |
|
7516 } |
|
7517 |
|
7518 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); |
|
7519 |
|
7520 Py_DECREF(s); |
|
7521 Py_XDECREF(sep); |
|
7522 return result; |
|
7523 } |
|
7524 |
|
7525 PyDoc_STRVAR(split__doc__, |
|
7526 "S.split([sep [,maxsplit]]) -> list of strings\n\ |
|
7527 \n\ |
|
7528 Return a list of the words in S, using sep as the\n\ |
|
7529 delimiter string. If maxsplit is given, at most maxsplit\n\ |
|
7530 splits are done. If sep is not specified or is None, any\n\ |
|
7531 whitespace string is a separator and empty strings are\n\ |
|
7532 removed from the result."); |
|
7533 |
|
7534 static PyObject* |
|
7535 unicode_split(PyUnicodeObject *self, PyObject *args) |
|
7536 { |
|
7537 PyObject *substring = Py_None; |
|
7538 Py_ssize_t maxcount = -1; |
|
7539 |
|
7540 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount)) |
|
7541 return NULL; |
|
7542 |
|
7543 if (substring == Py_None) |
|
7544 return split(self, NULL, maxcount); |
|
7545 else if (PyUnicode_Check(substring)) |
|
7546 return split(self, (PyUnicodeObject *)substring, maxcount); |
|
7547 else |
|
7548 return PyUnicode_Split((PyObject *)self, substring, maxcount); |
|
7549 } |
|
7550 |
|
7551 PyObject * |
|
7552 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) |
|
7553 { |
|
7554 PyObject* str_obj; |
|
7555 PyObject* sep_obj; |
|
7556 PyObject* out; |
|
7557 |
|
7558 str_obj = PyUnicode_FromObject(str_in); |
|
7559 if (!str_obj) |
|
7560 return NULL; |
|
7561 sep_obj = PyUnicode_FromObject(sep_in); |
|
7562 if (!sep_obj) { |
|
7563 Py_DECREF(str_obj); |
|
7564 return NULL; |
|
7565 } |
|
7566 |
|
7567 out = stringlib_partition( |
|
7568 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), |
|
7569 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) |
|
7570 ); |
|
7571 |
|
7572 Py_DECREF(sep_obj); |
|
7573 Py_DECREF(str_obj); |
|
7574 |
|
7575 return out; |
|
7576 } |
|
7577 |
|
7578 |
|
7579 PyObject * |
|
7580 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) |
|
7581 { |
|
7582 PyObject* str_obj; |
|
7583 PyObject* sep_obj; |
|
7584 PyObject* out; |
|
7585 |
|
7586 str_obj = PyUnicode_FromObject(str_in); |
|
7587 if (!str_obj) |
|
7588 return NULL; |
|
7589 sep_obj = PyUnicode_FromObject(sep_in); |
|
7590 if (!sep_obj) { |
|
7591 Py_DECREF(str_obj); |
|
7592 return NULL; |
|
7593 } |
|
7594 |
|
7595 out = stringlib_rpartition( |
|
7596 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj), |
|
7597 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj) |
|
7598 ); |
|
7599 |
|
7600 Py_DECREF(sep_obj); |
|
7601 Py_DECREF(str_obj); |
|
7602 |
|
7603 return out; |
|
7604 } |
|
7605 |
|
7606 PyDoc_STRVAR(partition__doc__, |
|
7607 "S.partition(sep) -> (head, sep, tail)\n\ |
|
7608 \n\ |
|
7609 Search for the separator sep in S, and return the part before it,\n\ |
|
7610 the separator itself, and the part after it. If the separator is not\n\ |
|
7611 found, return S and two empty strings."); |
|
7612 |
|
7613 static PyObject* |
|
7614 unicode_partition(PyUnicodeObject *self, PyObject *separator) |
|
7615 { |
|
7616 return PyUnicode_Partition((PyObject *)self, separator); |
|
7617 } |
|
7618 |
|
7619 PyDoc_STRVAR(rpartition__doc__, |
|
7620 "S.rpartition(sep) -> (tail, sep, head)\n\ |
|
7621 \n\ |
|
7622 Search for the separator sep in S, starting at the end of S, and return\n\ |
|
7623 the part before it, the separator itself, and the part after it. If the\n\ |
|
7624 separator is not found, return two empty strings and S."); |
|
7625 |
|
7626 static PyObject* |
|
7627 unicode_rpartition(PyUnicodeObject *self, PyObject *separator) |
|
7628 { |
|
7629 return PyUnicode_RPartition((PyObject *)self, separator); |
|
7630 } |
|
7631 |
|
7632 PyObject *PyUnicode_RSplit(PyObject *s, |
|
7633 PyObject *sep, |
|
7634 Py_ssize_t maxsplit) |
|
7635 { |
|
7636 PyObject *result; |
|
7637 |
|
7638 s = PyUnicode_FromObject(s); |
|
7639 if (s == NULL) |
|
7640 return NULL; |
|
7641 if (sep != NULL) { |
|
7642 sep = PyUnicode_FromObject(sep); |
|
7643 if (sep == NULL) { |
|
7644 Py_DECREF(s); |
|
7645 return NULL; |
|
7646 } |
|
7647 } |
|
7648 |
|
7649 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); |
|
7650 |
|
7651 Py_DECREF(s); |
|
7652 Py_XDECREF(sep); |
|
7653 return result; |
|
7654 } |
|
7655 |
|
7656 PyDoc_STRVAR(rsplit__doc__, |
|
7657 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\ |
|
7658 \n\ |
|
7659 Return a list of the words in S, using sep as the\n\ |
|
7660 delimiter string, starting at the end of the string and\n\ |
|
7661 working to the front. If maxsplit is given, at most maxsplit\n\ |
|
7662 splits are done. If sep is not specified, any whitespace string\n\ |
|
7663 is a separator."); |
|
7664 |
|
7665 static PyObject* |
|
7666 unicode_rsplit(PyUnicodeObject *self, PyObject *args) |
|
7667 { |
|
7668 PyObject *substring = Py_None; |
|
7669 Py_ssize_t maxcount = -1; |
|
7670 |
|
7671 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount)) |
|
7672 return NULL; |
|
7673 |
|
7674 if (substring == Py_None) |
|
7675 return rsplit(self, NULL, maxcount); |
|
7676 else if (PyUnicode_Check(substring)) |
|
7677 return rsplit(self, (PyUnicodeObject *)substring, maxcount); |
|
7678 else |
|
7679 return PyUnicode_RSplit((PyObject *)self, substring, maxcount); |
|
7680 } |
|
7681 |
|
7682 PyDoc_STRVAR(splitlines__doc__, |
|
7683 "S.splitlines([keepends]]) -> list of strings\n\ |
|
7684 \n\ |
|
7685 Return a list of the lines in S, breaking at line boundaries.\n\ |
|
7686 Line breaks are not included in the resulting list unless keepends\n\ |
|
7687 is given and true."); |
|
7688 |
|
7689 static PyObject* |
|
7690 unicode_splitlines(PyUnicodeObject *self, PyObject *args) |
|
7691 { |
|
7692 int keepends = 0; |
|
7693 |
|
7694 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends)) |
|
7695 return NULL; |
|
7696 |
|
7697 return PyUnicode_Splitlines((PyObject *)self, keepends); |
|
7698 } |
|
7699 |
|
7700 static |
|
7701 PyObject *unicode_str(PyUnicodeObject *self) |
|
7702 { |
|
7703 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); |
|
7704 } |
|
7705 |
|
7706 PyDoc_STRVAR(swapcase__doc__, |
|
7707 "S.swapcase() -> unicode\n\ |
|
7708 \n\ |
|
7709 Return a copy of S with uppercase characters converted to lowercase\n\ |
|
7710 and vice versa."); |
|
7711 |
|
7712 static PyObject* |
|
7713 unicode_swapcase(PyUnicodeObject *self) |
|
7714 { |
|
7715 return fixup(self, fixswapcase); |
|
7716 } |
|
7717 |
|
7718 PyDoc_STRVAR(translate__doc__, |
|
7719 "S.translate(table) -> unicode\n\ |
|
7720 \n\ |
|
7721 Return a copy of the string S, where all characters have been mapped\n\ |
|
7722 through the given translation table, which must be a mapping of\n\ |
|
7723 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\ |
|
7724 Unmapped characters are left untouched. Characters mapped to None\n\ |
|
7725 are deleted."); |
|
7726 |
|
7727 static PyObject* |
|
7728 unicode_translate(PyUnicodeObject *self, PyObject *table) |
|
7729 { |
|
7730 return PyUnicode_TranslateCharmap(self->str, |
|
7731 self->length, |
|
7732 table, |
|
7733 "ignore"); |
|
7734 } |
|
7735 |
|
7736 PyDoc_STRVAR(upper__doc__, |
|
7737 "S.upper() -> unicode\n\ |
|
7738 \n\ |
|
7739 Return a copy of S converted to uppercase."); |
|
7740 |
|
7741 static PyObject* |
|
7742 unicode_upper(PyUnicodeObject *self) |
|
7743 { |
|
7744 return fixup(self, fixupper); |
|
7745 } |
|
7746 |
|
7747 PyDoc_STRVAR(zfill__doc__, |
|
7748 "S.zfill(width) -> unicode\n\ |
|
7749 \n\ |
|
7750 Pad a numeric string S with zeros on the left, to fill a field\n\ |
|
7751 of the specified width. The string S is never truncated."); |
|
7752 |
|
7753 static PyObject * |
|
7754 unicode_zfill(PyUnicodeObject *self, PyObject *args) |
|
7755 { |
|
7756 Py_ssize_t fill; |
|
7757 PyUnicodeObject *u; |
|
7758 |
|
7759 Py_ssize_t width; |
|
7760 if (!PyArg_ParseTuple(args, "n:zfill", &width)) |
|
7761 return NULL; |
|
7762 |
|
7763 if (self->length >= width) { |
|
7764 if (PyUnicode_CheckExact(self)) { |
|
7765 Py_INCREF(self); |
|
7766 return (PyObject*) self; |
|
7767 } |
|
7768 else |
|
7769 return PyUnicode_FromUnicode( |
|
7770 PyUnicode_AS_UNICODE(self), |
|
7771 PyUnicode_GET_SIZE(self) |
|
7772 ); |
|
7773 } |
|
7774 |
|
7775 fill = width - self->length; |
|
7776 |
|
7777 u = pad(self, fill, 0, '0'); |
|
7778 |
|
7779 if (u == NULL) |
|
7780 return NULL; |
|
7781 |
|
7782 if (u->str[fill] == '+' || u->str[fill] == '-') { |
|
7783 /* move sign to beginning of string */ |
|
7784 u->str[0] = u->str[fill]; |
|
7785 u->str[fill] = '0'; |
|
7786 } |
|
7787 |
|
7788 return (PyObject*) u; |
|
7789 } |
|
7790 |
|
#if 0
/* Compiled-out helper: returns the current value of numfree as a Python
   int.  The method-table entry that would expose it is disabled as well. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
|
7798 |
|
7799 PyDoc_STRVAR(startswith__doc__, |
|
7800 "S.startswith(prefix[, start[, end]]) -> bool\n\ |
|
7801 \n\ |
|
7802 Return True if S starts with the specified prefix, False otherwise.\n\ |
|
7803 With optional start, test S beginning at that position.\n\ |
|
7804 With optional end, stop comparing S at that position.\n\ |
|
7805 prefix can also be a tuple of strings to try."); |
|
7806 |
|
7807 static PyObject * |
|
7808 unicode_startswith(PyUnicodeObject *self, |
|
7809 PyObject *args) |
|
7810 { |
|
7811 PyObject *subobj; |
|
7812 PyUnicodeObject *substring; |
|
7813 Py_ssize_t start = 0; |
|
7814 Py_ssize_t end = PY_SSIZE_T_MAX; |
|
7815 int result; |
|
7816 |
|
7817 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj, |
|
7818 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) |
|
7819 return NULL; |
|
7820 if (PyTuple_Check(subobj)) { |
|
7821 Py_ssize_t i; |
|
7822 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { |
|
7823 substring = (PyUnicodeObject *)PyUnicode_FromObject( |
|
7824 PyTuple_GET_ITEM(subobj, i)); |
|
7825 if (substring == NULL) |
|
7826 return NULL; |
|
7827 result = tailmatch(self, substring, start, end, -1); |
|
7828 Py_DECREF(substring); |
|
7829 if (result) { |
|
7830 Py_RETURN_TRUE; |
|
7831 } |
|
7832 } |
|
7833 /* nothing matched */ |
|
7834 Py_RETURN_FALSE; |
|
7835 } |
|
7836 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); |
|
7837 if (substring == NULL) |
|
7838 return NULL; |
|
7839 result = tailmatch(self, substring, start, end, -1); |
|
7840 Py_DECREF(substring); |
|
7841 return PyBool_FromLong(result); |
|
7842 } |
|
7843 |
|
7844 |
|
7845 PyDoc_STRVAR(endswith__doc__, |
|
7846 "S.endswith(suffix[, start[, end]]) -> bool\n\ |
|
7847 \n\ |
|
7848 Return True if S ends with the specified suffix, False otherwise.\n\ |
|
7849 With optional start, test S beginning at that position.\n\ |
|
7850 With optional end, stop comparing S at that position.\n\ |
|
7851 suffix can also be a tuple of strings to try."); |
|
7852 |
|
7853 static PyObject * |
|
7854 unicode_endswith(PyUnicodeObject *self, |
|
7855 PyObject *args) |
|
7856 { |
|
7857 PyObject *subobj; |
|
7858 PyUnicodeObject *substring; |
|
7859 Py_ssize_t start = 0; |
|
7860 Py_ssize_t end = PY_SSIZE_T_MAX; |
|
7861 int result; |
|
7862 |
|
7863 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj, |
|
7864 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end)) |
|
7865 return NULL; |
|
7866 if (PyTuple_Check(subobj)) { |
|
7867 Py_ssize_t i; |
|
7868 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) { |
|
7869 substring = (PyUnicodeObject *)PyUnicode_FromObject( |
|
7870 PyTuple_GET_ITEM(subobj, i)); |
|
7871 if (substring == NULL) |
|
7872 return NULL; |
|
7873 result = tailmatch(self, substring, start, end, +1); |
|
7874 Py_DECREF(substring); |
|
7875 if (result) { |
|
7876 Py_RETURN_TRUE; |
|
7877 } |
|
7878 } |
|
7879 Py_RETURN_FALSE; |
|
7880 } |
|
7881 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj); |
|
7882 if (substring == NULL) |
|
7883 return NULL; |
|
7884 |
|
7885 result = tailmatch(self, substring, start, end, +1); |
|
7886 Py_DECREF(substring); |
|
7887 return PyBool_FromLong(result); |
|
7888 } |
|
7889 |
|
7890 |
|
7891 /* Implements do_string_format, which is unicode because of stringlib */ |
|
7892 #include "stringlib/string_format.h" |
|
7893 |
|
7894 PyDoc_STRVAR(format__doc__, |
|
7895 "S.format(*args, **kwargs) -> unicode\n\ |
|
7896 \n\ |
|
7897 "); |
|
7898 |
|
7899 static PyObject * |
|
7900 unicode__format__(PyObject *self, PyObject *args) |
|
7901 { |
|
7902 PyObject *format_spec; |
|
7903 PyObject *result = NULL; |
|
7904 PyObject *tmp = NULL; |
|
7905 |
|
7906 /* If 2.x, convert format_spec to the same type as value */ |
|
7907 /* This is to allow things like u''.format('') */ |
|
7908 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec)) |
|
7909 goto done; |
|
7910 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) { |
|
7911 PyErr_Format(PyExc_TypeError, "__format__ arg must be str " |
|
7912 "or unicode, not %s", Py_TYPE(format_spec)->tp_name); |
|
7913 goto done; |
|
7914 } |
|
7915 tmp = PyObject_Unicode(format_spec); |
|
7916 if (tmp == NULL) |
|
7917 goto done; |
|
7918 format_spec = tmp; |
|
7919 |
|
7920 result = _PyUnicode_FormatAdvanced(self, |
|
7921 PyUnicode_AS_UNICODE(format_spec), |
|
7922 PyUnicode_GET_SIZE(format_spec)); |
|
7923 done: |
|
7924 Py_XDECREF(tmp); |
|
7925 return result; |
|
7926 } |
|
7927 |
|
7928 PyDoc_STRVAR(p_format__doc__, |
|
7929 "S.__format__(format_spec) -> unicode\n\ |
|
7930 \n\ |
|
7931 "); |
|
7932 |
|
7933 static PyObject * |
|
7934 unicode__sizeof__(PyUnicodeObject *v) |
|
7935 { |
|
7936 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) + |
|
7937 sizeof(Py_UNICODE) * (v->length + 1)); |
|
7938 } |
|
7939 |
|
7940 PyDoc_STRVAR(sizeof__doc__, |
|
7941 "S.__sizeof__() -> size of S in memory, in bytes\n\ |
|
7942 \n\ |
|
7943 "); |
|
7944 |
|
7945 static PyObject * |
|
7946 unicode_getnewargs(PyUnicodeObject *v) |
|
7947 { |
|
7948 return Py_BuildValue("(u#)", v->str, v->length); |
|
7949 } |
|
7950 |
|
7951 |
|
7952 static PyMethodDef unicode_methods[] = { |
|
7953 |
|
7954 /* Order is according to common usage: often used methods should |
|
7955 appear first, since lookup is done sequentially. */ |
|
7956 |
|
7957 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__}, |
|
7958 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__}, |
|
7959 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__}, |
|
7960 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__}, |
|
7961 {"join", (PyCFunction) unicode_join, METH_O, join__doc__}, |
|
7962 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__}, |
|
7963 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__}, |
|
7964 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__}, |
|
7965 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, |
|
7966 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, |
|
7967 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, |
|
7968 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__}, |
|
7969 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, |
|
7970 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, |
|
7971 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, |
|
7972 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, |
|
7973 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__}, |
|
7974 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ |
|
7975 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, |
|
7976 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, |
|
7977 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, |
|
7978 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, |
|
7979 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, |
|
7980 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, |
|
7981 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, |
|
7982 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, |
|
7983 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, |
|
7984 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__}, |
|
7985 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__}, |
|
7986 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__}, |
|
7987 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__}, |
|
7988 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__}, |
|
7989 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__}, |
|
7990 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__}, |
|
7991 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__}, |
|
7992 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__}, |
|
7993 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, |
|
7994 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, |
|
7995 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, |
|
7996 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, |
|
7997 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__}, |
|
7998 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__}, |
|
7999 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS}, |
|
8000 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS}, |
|
8001 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__}, |
|
8002 #if 0 |
|
8003 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, |
|
8004 #endif |
|
8005 |
|
8006 #if 0 |
|
8007 /* This one is just used for debugging the implementation. */ |
|
8008 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS}, |
|
8009 #endif |
|
8010 |
|
8011 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, |
|
8012 {NULL, NULL} |
|
8013 }; |
|
8014 |
|
8015 static PyObject * |
|
8016 unicode_mod(PyObject *v, PyObject *w) |
|
8017 { |
|
8018 if (!PyUnicode_Check(v)) { |
|
8019 Py_INCREF(Py_NotImplemented); |
|
8020 return Py_NotImplemented; |
|
8021 } |
|
8022 return PyUnicode_Format(v, w); |
|
8023 } |
|
8024 |
|
8025 static PyNumberMethods unicode_as_number = { |
|
8026 0, /*nb_add*/ |
|
8027 0, /*nb_subtract*/ |
|
8028 0, /*nb_multiply*/ |
|
8029 0, /*nb_divide*/ |
|
8030 unicode_mod, /*nb_remainder*/ |
|
8031 }; |
|
8032 |
|
8033 static PySequenceMethods unicode_as_sequence = { |
|
8034 (lenfunc) unicode_length, /* sq_length */ |
|
8035 PyUnicode_Concat, /* sq_concat */ |
|
8036 (ssizeargfunc) unicode_repeat, /* sq_repeat */ |
|
8037 (ssizeargfunc) unicode_getitem, /* sq_item */ |
|
8038 (ssizessizeargfunc) unicode_slice, /* sq_slice */ |
|
8039 0, /* sq_ass_item */ |
|
8040 0, /* sq_ass_slice */ |
|
8041 PyUnicode_Contains, /* sq_contains */ |
|
8042 }; |
|
8043 |
|
8044 static PyObject* |
|
8045 unicode_subscript(PyUnicodeObject* self, PyObject* item) |
|
8046 { |
|
8047 if (PyIndex_Check(item)) { |
|
8048 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); |
|
8049 if (i == -1 && PyErr_Occurred()) |
|
8050 return NULL; |
|
8051 if (i < 0) |
|
8052 i += PyUnicode_GET_SIZE(self); |
|
8053 return unicode_getitem(self, i); |
|
8054 } else if (PySlice_Check(item)) { |
|
8055 Py_ssize_t start, stop, step, slicelength, cur, i; |
|
8056 Py_UNICODE* source_buf; |
|
8057 Py_UNICODE* result_buf; |
|
8058 PyObject* result; |
|
8059 |
|
8060 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self), |
|
8061 &start, &stop, &step, &slicelength) < 0) { |
|
8062 return NULL; |
|
8063 } |
|
8064 |
|
8065 if (slicelength <= 0) { |
|
8066 return PyUnicode_FromUnicode(NULL, 0); |
|
8067 } else if (start == 0 && step == 1 && slicelength == self->length && |
|
8068 PyUnicode_CheckExact(self)) { |
|
8069 Py_INCREF(self); |
|
8070 return (PyObject *)self; |
|
8071 } else if (step == 1) { |
|
8072 return PyUnicode_FromUnicode(self->str + start, slicelength); |
|
8073 } else { |
|
8074 source_buf = PyUnicode_AS_UNICODE((PyObject*)self); |
|
8075 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* |
|
8076 sizeof(Py_UNICODE)); |
|
8077 |
|
8078 if (result_buf == NULL) |
|
8079 return PyErr_NoMemory(); |
|
8080 |
|
8081 for (cur = start, i = 0; i < slicelength; cur += step, i++) { |
|
8082 result_buf[i] = source_buf[cur]; |
|
8083 } |
|
8084 |
|
8085 result = PyUnicode_FromUnicode(result_buf, slicelength); |
|
8086 PyObject_FREE(result_buf); |
|
8087 return result; |
|
8088 } |
|
8089 } else { |
|
8090 PyErr_SetString(PyExc_TypeError, "string indices must be integers"); |
|
8091 return NULL; |
|
8092 } |
|
8093 } |
|
8094 |
|
8095 static PyMappingMethods unicode_as_mapping = { |
|
8096 (lenfunc)unicode_length, /* mp_length */ |
|
8097 (binaryfunc)unicode_subscript, /* mp_subscript */ |
|
8098 (objobjargproc)0, /* mp_ass_subscript */ |
|
8099 }; |
|
8100 |
|
8101 static Py_ssize_t |
|
8102 unicode_buffer_getreadbuf(PyUnicodeObject *self, |
|
8103 Py_ssize_t index, |
|
8104 const void **ptr) |
|
8105 { |
|
8106 if (index != 0) { |
|
8107 PyErr_SetString(PyExc_SystemError, |
|
8108 "accessing non-existent unicode segment"); |
|
8109 return -1; |
|
8110 } |
|
8111 *ptr = (void *) self->str; |
|
8112 return PyUnicode_GET_DATA_SIZE(self); |
|
8113 } |
|
8114 |
|
8115 static Py_ssize_t |
|
8116 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index, |
|
8117 const void **ptr) |
|
8118 { |
|
8119 PyErr_SetString(PyExc_TypeError, |
|
8120 "cannot use unicode as modifiable buffer"); |
|
8121 return -1; |
|
8122 } |
|
8123 |
|
8124 static int |
|
8125 unicode_buffer_getsegcount(PyUnicodeObject *self, |
|
8126 Py_ssize_t *lenp) |
|
8127 { |
|
8128 if (lenp) |
|
8129 *lenp = PyUnicode_GET_DATA_SIZE(self); |
|
8130 return 1; |
|
8131 } |
|
8132 |
|
8133 static Py_ssize_t |
|
8134 unicode_buffer_getcharbuf(PyUnicodeObject *self, |
|
8135 Py_ssize_t index, |
|
8136 const void **ptr) |
|
8137 { |
|
8138 PyObject *str; |
|
8139 |
|
8140 if (index != 0) { |
|
8141 PyErr_SetString(PyExc_SystemError, |
|
8142 "accessing non-existent unicode segment"); |
|
8143 return -1; |
|
8144 } |
|
8145 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL); |
|
8146 if (str == NULL) |
|
8147 return -1; |
|
8148 *ptr = (void *) PyString_AS_STRING(str); |
|
8149 return PyString_GET_SIZE(str); |
|
8150 } |
|
8151 |
|
8152 /* Helpers for PyUnicode_Format() */ |
|
8153 |
|
8154 static PyObject * |
|
8155 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) |
|
8156 { |
|
8157 Py_ssize_t argidx = *p_argidx; |
|
8158 if (argidx < arglen) { |
|
8159 (*p_argidx)++; |
|
8160 if (arglen < 0) |
|
8161 return args; |
|
8162 else |
|
8163 return PyTuple_GetItem(args, argidx); |
|
8164 } |
|
8165 PyErr_SetString(PyExc_TypeError, |
|
8166 "not enough arguments for format string"); |
|
8167 return NULL; |
|
8168 } |
|
8169 |
|
/* Single-bit flag values, one bit per flag so they can be OR-ed together.
   NOTE(review): presumably these track the '-', '+', ' ', '#' and '0'
   conversion flags of a format specifier; confirm in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
|
8175 |
|
8176 static Py_ssize_t |
|
8177 strtounicode(Py_UNICODE *buffer, const char *charbuffer) |
|
8178 { |
|
8179 register Py_ssize_t i; |
|
8180 Py_ssize_t len = strlen(charbuffer); |
|
8181 for (i = len - 1; i >= 0; i--) |
|
8182 buffer[i] = (Py_UNICODE) charbuffer[i]; |
|
8183 |
|
8184 return len; |
|
8185 } |
|
8186 |
|
8187 static int |
|
8188 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x) |
|
8189 { |
|
8190 Py_ssize_t result; |
|
8191 |
|
8192 PyOS_ascii_formatd((char *)buffer, len, format, x); |
|
8193 result = strtounicode(buffer, (char *)buffer); |
|
8194 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); |
|
8195 } |
|
8196 |
|
8197 static int |
|
8198 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x) |
|
8199 { |
|
8200 Py_ssize_t result; |
|
8201 |
|
8202 PyOS_snprintf((char *)buffer, len, format, x); |
|
8203 result = strtounicode(buffer, (char *)buffer); |
|
8204 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int); |
|
8205 } |
|
8206 |
|
8207 /* XXX To save some code duplication, formatfloat/long/int could have been |
|
8208 shared with stringobject.c, converting from 8-bit to Unicode after the |
|
8209 formatting is done. */ |
|
8210 |
|
8211 static int |
|
8212 formatfloat(Py_UNICODE *buf, |
|
8213 size_t buflen, |
|
8214 int flags, |
|
8215 int prec, |
|
8216 int type, |
|
8217 PyObject *v) |
|
8218 { |
|
8219 /* fmt = '%#.' + `prec` + `type` |
|
8220 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/ |
|
8221 char fmt[20]; |
|
8222 double x; |
|
8223 |
|
8224 x = PyFloat_AsDouble(v); |
|
8225 if (x == -1.0 && PyErr_Occurred()) |
|
8226 return -1; |
|
8227 if (prec < 0) |
|
8228 prec = 6; |
|
8229 if (type == 'f' && (fabs(x) / 1e25) >= 1e25) |
|
8230 type = 'g'; |
|
8231 /* Worst case length calc to ensure no buffer overrun: |
|
8232 |
|
8233 'g' formats: |
|
8234 fmt = %#.<prec>g |
|
8235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp |
|
8236 for any double rep.) |
|
8237 len = 1 + prec + 1 + 2 + 5 = 9 + prec |
|
8238 |
|
8239 'f' formats: |
|
8240 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50) |
|
8241 len = 1 + 50 + 1 + prec = 52 + prec |
|
8242 |
|
8243 If prec=0 the effective precision is 1 (the leading digit is |
|
8244 always given), therefore increase the length by one. |
|
8245 |
|
8246 */ |
|
8247 if (((type == 'g' || type == 'G') && |
|
8248 buflen <= (size_t)10 + (size_t)prec) || |
|
8249 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) { |
|
8250 PyErr_SetString(PyExc_OverflowError, |
|
8251 "formatted float is too long (precision too large?)"); |
|
8252 return -1; |
|
8253 } |
|
8254 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c", |
|
8255 (flags&F_ALT) ? "#" : "", |
|
8256 prec, type); |
|
8257 return doubletounicode(buf, buflen, fmt, x); |
|
8258 } |
|
8259 |
|
8260 static PyObject* |
|
8261 formatlong(PyObject *val, int flags, int prec, int type) |
|
8262 { |
|
8263 char *buf; |
|
8264 int i, len; |
|
8265 PyObject *str; /* temporary string object. */ |
|
8266 PyUnicodeObject *result; |
|
8267 |
|
8268 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len); |
|
8269 if (!str) |
|
8270 return NULL; |
|
8271 result = _PyUnicode_New(len); |
|
8272 if (!result) { |
|
8273 Py_DECREF(str); |
|
8274 return NULL; |
|
8275 } |
|
8276 for (i = 0; i < len; i++) |
|
8277 result->str[i] = buf[i]; |
|
8278 result->str[len] = 0; |
|
8279 Py_DECREF(str); |
|
8280 return (PyObject*)result; |
|
8281 } |
|
8282 |
|
8283 static int |
|
8284 formatint(Py_UNICODE *buf, |
|
8285 size_t buflen, |
|
8286 int flags, |
|
8287 int prec, |
|
8288 int type, |
|
8289 PyObject *v) |
|
8290 { |
|
8291 /* fmt = '%#.' + `prec` + 'l' + `type` |
|
8292 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine) |
|
8293 * + 1 + 1 |
|
8294 * = 24 |
|
8295 */ |
|
8296 char fmt[64]; /* plenty big enough! */ |
|
8297 char *sign; |
|
8298 long x; |
|
8299 |
|
8300 x = PyInt_AsLong(v); |
|
8301 if (x == -1 && PyErr_Occurred()) |
|
8302 return -1; |
|
8303 if (x < 0 && type == 'u') { |
|
8304 type = 'd'; |
|
8305 } |
|
8306 if (x < 0 && (type == 'x' || type == 'X' || type == 'o')) |
|
8307 sign = "-"; |
|
8308 else |
|
8309 sign = ""; |
|
8310 if (prec < 0) |
|
8311 prec = 1; |
|
8312 |
|
8313 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal)) |
|
8314 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11 |
|
8315 */ |
|
8316 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) { |
|
8317 PyErr_SetString(PyExc_OverflowError, |
|
8318 "formatted integer is too long (precision too large?)"); |
|
8319 return -1; |
|
8320 } |
|
8321 |
|
8322 if ((flags & F_ALT) && |
|
8323 (type == 'x' || type == 'X')) { |
|
8324 /* When converting under %#x or %#X, there are a number |
|
8325 * of issues that cause pain: |
|
8326 * - when 0 is being converted, the C standard leaves off |
|
8327 * the '0x' or '0X', which is inconsistent with other |
|
8328 * %#x/%#X conversions and inconsistent with Python's |
|
8329 * hex() function |
|
8330 * - there are platforms that violate the standard and |
|
8331 * convert 0 with the '0x' or '0X' |
|
8332 * (Metrowerks, Compaq Tru64) |
|
8333 * - there are platforms that give '0x' when converting |
|
8334 * under %#X, but convert 0 in accordance with the |
|
8335 * standard (OS/2 EMX) |
|
8336 * |
|
8337 * We can achieve the desired consistency by inserting our |
|
8338 * own '0x' or '0X' prefix, and substituting %x/%X in place |
|
8339 * of %#x/%#X. |
|
8340 * |
|
8341 * Note that this is the same approach as used in |
|
8342 * formatint() in stringobject.c |
|
8343 */ |
|
8344 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c", |
|
8345 sign, type, prec, type); |
|
8346 } |
|
8347 else { |
|
8348 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c", |
|
8349 sign, (flags&F_ALT) ? "#" : "", |
|
8350 prec, type); |
|
8351 } |
|
8352 if (sign[0]) |
|
8353 return longtounicode(buf, buflen, fmt, -x); |
|
8354 else |
|
8355 return longtounicode(buf, buflen, fmt, x); |
|
8356 } |
|
8357 |
|
8358 static int |
|
8359 formatchar(Py_UNICODE *buf, |
|
8360 size_t buflen, |
|
8361 PyObject *v) |
|
8362 { |
|
8363 /* presume that the buffer is at least 2 characters long */ |
|
8364 if (PyUnicode_Check(v)) { |
|
8365 if (PyUnicode_GET_SIZE(v) != 1) |
|
8366 goto onError; |
|
8367 buf[0] = PyUnicode_AS_UNICODE(v)[0]; |
|
8368 } |
|
8369 |
|
8370 else if (PyString_Check(v)) { |
|
8371 if (PyString_GET_SIZE(v) != 1) |
|
8372 goto onError; |
|
8373 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0]; |
|
8374 } |
|
8375 |
|
8376 else { |
|
8377 /* Integer input truncated to a character */ |
|
8378 long x; |
|
8379 x = PyInt_AsLong(v); |
|
8380 if (x == -1 && PyErr_Occurred()) |
|
8381 goto onError; |
|
8382 #ifdef Py_UNICODE_WIDE |
|
8383 if (x < 0 || x > 0x10ffff) { |
|
8384 PyErr_SetString(PyExc_OverflowError, |
|
8385 "%c arg not in range(0x110000) " |
|
8386 "(wide Python build)"); |
|
8387 return -1; |
|
8388 } |
|
8389 #else |
|
8390 if (x < 0 || x > 0xffff) { |
|
8391 PyErr_SetString(PyExc_OverflowError, |
|
8392 "%c arg not in range(0x10000) " |
|
8393 "(narrow Python build)"); |
|
8394 return -1; |
|
8395 } |
|
8396 #endif |
|
8397 buf[0] = (Py_UNICODE) x; |
|
8398 } |
|
8399 buf[1] = '\0'; |
|
8400 return 1; |
|
8401 |
|
8402 onError: |
|
8403 PyErr_SetString(PyExc_TypeError, |
|
8404 "%c requires int or char"); |
|
8405 return -1; |
|
8406 } |
|
8407 |
|
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is parenthesized so that the cast cannot bind to
   neighbouring tokens wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
|
8417 |
|
8418 PyObject *PyUnicode_Format(PyObject *format, |
|
8419 PyObject *args) |
|
8420 { |
|
8421 Py_UNICODE *fmt, *res; |
|
8422 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx; |
|
8423 int args_owned = 0; |
|
8424 PyUnicodeObject *result = NULL; |
|
8425 PyObject *dict = NULL; |
|
8426 PyObject *uformat; |
|
8427 |
|
8428 if (format == NULL || args == NULL) { |
|
8429 PyErr_BadInternalCall(); |
|
8430 return NULL; |
|
8431 } |
|
8432 uformat = PyUnicode_FromObject(format); |
|
8433 if (uformat == NULL) |
|
8434 return NULL; |
|
8435 fmt = PyUnicode_AS_UNICODE(uformat); |
|
8436 fmtcnt = PyUnicode_GET_SIZE(uformat); |
|
8437 |
|
8438 reslen = rescnt = fmtcnt + 100; |
|
8439 result = _PyUnicode_New(reslen); |
|
8440 if (result == NULL) |
|
8441 goto onError; |
|
8442 res = PyUnicode_AS_UNICODE(result); |
|
8443 |
|
8444 if (PyTuple_Check(args)) { |
|
8445 arglen = PyTuple_Size(args); |
|
8446 argidx = 0; |
|
8447 } |
|
8448 else { |
|
8449 arglen = -1; |
|
8450 argidx = -2; |
|
8451 } |
|
8452 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) && |
|
8453 !PyObject_TypeCheck(args, &PyBaseString_Type)) |
|
8454 dict = args; |
|
8455 |
|
8456 while (--fmtcnt >= 0) { |
|
8457 if (*fmt != '%') { |
|
8458 if (--rescnt < 0) { |
|
8459 rescnt = fmtcnt + 100; |
|
8460 reslen += rescnt; |
|
8461 if (_PyUnicode_Resize(&result, reslen) < 0) |
|
8462 goto onError; |
|
8463 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt; |
|
8464 --rescnt; |
|
8465 } |
|
8466 *res++ = *fmt++; |
|
8467 } |
|
8468 else { |
|
8469 /* Got a format specifier */ |
|
8470 int flags = 0; |
|
8471 Py_ssize_t width = -1; |
|
8472 int prec = -1; |
|
8473 Py_UNICODE c = '\0'; |
|
8474 Py_UNICODE fill; |
|
8475 int isnumok; |
|
8476 PyObject *v = NULL; |
|
8477 PyObject *temp = NULL; |
|
8478 Py_UNICODE *pbuf; |
|
8479 Py_UNICODE sign; |
|
8480 Py_ssize_t len; |
|
8481 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */ |
|
8482 |
|
8483 fmt++; |
|
8484 if (*fmt == '(') { |
|
8485 Py_UNICODE *keystart; |
|
8486 Py_ssize_t keylen; |
|
8487 PyObject *key; |
|
8488 int pcount = 1; |
|
8489 |
|
8490 if (dict == NULL) { |
|
8491 PyErr_SetString(PyExc_TypeError, |
|
8492 "format requires a mapping"); |
|
8493 goto onError; |
|
8494 } |
|
8495 ++fmt; |
|
8496 --fmtcnt; |
|
8497 keystart = fmt; |
|
8498 /* Skip over balanced parentheses */ |
|
8499 while (pcount > 0 && --fmtcnt >= 0) { |
|
8500 if (*fmt == ')') |
|
8501 --pcount; |
|
8502 else if (*fmt == '(') |
|
8503 ++pcount; |
|
8504 fmt++; |
|
8505 } |
|
8506 keylen = fmt - keystart - 1; |
|
8507 if (fmtcnt < 0 || pcount > 0) { |
|
8508 PyErr_SetString(PyExc_ValueError, |
|
8509 "incomplete format key"); |
|
8510 goto onError; |
|
8511 } |
|
8512 #if 0 |
|
8513 /* keys are converted to strings using UTF-8 and |
|
8514 then looked up since Python uses strings to hold |
|
8515 variables names etc. in its namespaces and we |
|
8516 wouldn't want to break common idioms. */ |
|
8517 key = PyUnicode_EncodeUTF8(keystart, |
|
8518 keylen, |
|
8519 NULL); |
|
8520 #else |
|
8521 key = PyUnicode_FromUnicode(keystart, keylen); |
|
8522 #endif |
|
8523 if (key == NULL) |
|
8524 goto onError; |
|
8525 if (args_owned) { |
|
8526 Py_DECREF(args); |
|
8527 args_owned = 0; |
|
8528 } |
|
8529 args = PyObject_GetItem(dict, key); |
|
8530 Py_DECREF(key); |
|
8531 if (args == NULL) { |
|
8532 goto onError; |
|
8533 } |
|
8534 args_owned = 1; |
|
8535 arglen = -1; |
|
8536 argidx = -2; |
|
8537 } |
|
8538 while (--fmtcnt >= 0) { |
|
8539 switch (c = *fmt++) { |
|
8540 case '-': flags |= F_LJUST; continue; |
|
8541 case '+': flags |= F_SIGN; continue; |
|
8542 case ' ': flags |= F_BLANK; continue; |
|
8543 case '#': flags |= F_ALT; continue; |
|
8544 case '0': flags |= F_ZERO; continue; |
|
8545 } |
|
8546 break; |
|
8547 } |
|
8548 if (c == '*') { |
|
8549 v = getnextarg(args, arglen, &argidx); |
|
8550 if (v == NULL) |
|
8551 goto onError; |
|
8552 if (!PyInt_Check(v)) { |
|
8553 PyErr_SetString(PyExc_TypeError, |
|
8554 "* wants int"); |
|
8555 goto onError; |
|
8556 } |
|
8557 width = PyInt_AsLong(v); |
|
8558 if (width < 0) { |
|
8559 flags |= F_LJUST; |
|
8560 width = -width; |
|
8561 } |
|
8562 if (--fmtcnt >= 0) |
|
8563 c = *fmt++; |
|
8564 } |
|
8565 else if (c >= '0' && c <= '9') { |
|
8566 width = c - '0'; |
|
8567 while (--fmtcnt >= 0) { |
|
8568 c = *fmt++; |
|
8569 if (c < '0' || c > '9') |
|
8570 break; |
|
8571 if ((width*10) / 10 != width) { |
|
8572 PyErr_SetString(PyExc_ValueError, |
|
8573 "width too big"); |
|
8574 goto onError; |
|
8575 } |
|
8576 width = width*10 + (c - '0'); |
|
8577 } |
|
8578 } |
|
8579 if (c == '.') { |
|
8580 prec = 0; |
|
8581 if (--fmtcnt >= 0) |
|
8582 c = *fmt++; |
|
8583 if (c == '*') { |
|
8584 v = getnextarg(args, arglen, &argidx); |
|
8585 if (v == NULL) |
|
8586 goto onError; |
|
8587 if (!PyInt_Check(v)) { |
|
8588 PyErr_SetString(PyExc_TypeError, |
|
8589 "* wants int"); |
|
8590 goto onError; |
|
8591 } |
|
8592 prec = PyInt_AsLong(v); |
|
8593 if (prec < 0) |
|
8594 prec = 0; |
|
8595 if (--fmtcnt >= 0) |
|
8596 c = *fmt++; |
|
8597 } |
|
8598 else if (c >= '0' && c <= '9') { |
|
8599 prec = c - '0'; |
|
8600 while (--fmtcnt >= 0) { |
|
8601 c = Py_CHARMASK(*fmt++); |
|
8602 if (c < '0' || c > '9') |
|
8603 break; |
|
8604 if ((prec*10) / 10 != prec) { |
|
8605 PyErr_SetString(PyExc_ValueError, |
|
8606 "prec too big"); |
|
8607 goto onError; |
|
8608 } |
|
8609 prec = prec*10 + (c - '0'); |
|
8610 } |
|
8611 } |
|
8612 } /* prec */ |
|
8613 if (fmtcnt >= 0) { |
|
8614 if (c == 'h' || c == 'l' || c == 'L') { |
|
8615 if (--fmtcnt >= 0) |
|
8616 c = *fmt++; |
|
8617 } |
|
8618 } |
|
8619 if (fmtcnt < 0) { |
|
8620 PyErr_SetString(PyExc_ValueError, |
|
8621 "incomplete format"); |
|
8622 goto onError; |
|
8623 } |
|
8624 if (c != '%') { |
|
8625 v = getnextarg(args, arglen, &argidx); |
|
8626 if (v == NULL) |
|
8627 goto onError; |
|
8628 } |
|
8629 sign = 0; |
|
8630 fill = ' '; |
|
8631 switch (c) { |
|
8632 |
|
8633 case '%': |
|
8634 pbuf = formatbuf; |
|
8635 /* presume that buffer length is at least 1 */ |
|
8636 pbuf[0] = '%'; |
|
8637 len = 1; |
|
8638 break; |
|
8639 |
|
8640 case 's': |
|
8641 case 'r': |
|
8642 if (PyUnicode_Check(v) && c == 's') { |
|
8643 temp = v; |
|
8644 Py_INCREF(temp); |
|
8645 } |
|
8646 else { |
|
8647 PyObject *unicode; |
|
8648 if (c == 's') |
|
8649 temp = PyObject_Unicode(v); |
|
8650 else |
|
8651 temp = PyObject_Repr(v); |
|
8652 if (temp == NULL) |
|
8653 goto onError; |
|
8654 if (PyUnicode_Check(temp)) |
|
8655 /* nothing to do */; |
|
8656 else if (PyString_Check(temp)) { |
|
8657 /* convert to string to Unicode */ |
|
8658 unicode = PyUnicode_Decode(PyString_AS_STRING(temp), |
|
8659 PyString_GET_SIZE(temp), |
|
8660 NULL, |
|
8661 "strict"); |
|
8662 Py_DECREF(temp); |
|
8663 temp = unicode; |
|
8664 if (temp == NULL) |
|
8665 goto onError; |
|
8666 } |
|
8667 else { |
|
8668 Py_DECREF(temp); |
|
8669 PyErr_SetString(PyExc_TypeError, |
|
8670 "%s argument has non-string str()"); |
|
8671 goto onError; |
|
8672 } |
|
8673 } |
|
8674 pbuf = PyUnicode_AS_UNICODE(temp); |
|
8675 len = PyUnicode_GET_SIZE(temp); |
|
8676 if (prec >= 0 && len > prec) |
|
8677 len = prec; |
|
8678 break; |
|
8679 |
|
8680 case 'i': |
|
8681 case 'd': |
|
8682 case 'u': |
|
8683 case 'o': |
|
8684 case 'x': |
|
8685 case 'X': |
|
8686 if (c == 'i') |
|
8687 c = 'd'; |
|
8688 isnumok = 0; |
|
8689 if (PyNumber_Check(v)) { |
|
8690 PyObject *iobj=NULL; |
|
8691 |
|
8692 if (PyInt_Check(v) || (PyLong_Check(v))) { |
|
8693 iobj = v; |
|
8694 Py_INCREF(iobj); |
|
8695 } |
|
8696 else { |
|
8697 iobj = PyNumber_Int(v); |
|
8698 if (iobj==NULL) iobj = PyNumber_Long(v); |
|
8699 } |
|
8700 if (iobj!=NULL) { |
|
8701 if (PyInt_Check(iobj)) { |
|
8702 isnumok = 1; |
|
8703 pbuf = formatbuf; |
|
8704 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), |
|
8705 flags, prec, c, iobj); |
|
8706 Py_DECREF(iobj); |
|
8707 if (len < 0) |
|
8708 goto onError; |
|
8709 sign = 1; |
|
8710 } |
|
8711 else if (PyLong_Check(iobj)) { |
|
8712 isnumok = 1; |
|
8713 temp = formatlong(iobj, flags, prec, c); |
|
8714 Py_DECREF(iobj); |
|
8715 if (!temp) |
|
8716 goto onError; |
|
8717 pbuf = PyUnicode_AS_UNICODE(temp); |
|
8718 len = PyUnicode_GET_SIZE(temp); |
|
8719 sign = 1; |
|
8720 } |
|
8721 else { |
|
8722 Py_DECREF(iobj); |
|
8723 } |
|
8724 } |
|
8725 } |
|
8726 if (!isnumok) { |
|
8727 PyErr_Format(PyExc_TypeError, |
|
8728 "%%%c format: a number is required, " |
|
8729 "not %.200s", (char)c, Py_TYPE(v)->tp_name); |
|
8730 goto onError; |
|
8731 } |
|
8732 if (flags & F_ZERO) |
|
8733 fill = '0'; |
|
8734 break; |
|
8735 |
|
8736 case 'e': |
|
8737 case 'E': |
|
8738 case 'f': |
|
8739 case 'F': |
|
8740 case 'g': |
|
8741 case 'G': |
|
8742 if (c == 'F') |
|
8743 c = 'f'; |
|
8744 pbuf = formatbuf; |
|
8745 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), |
|
8746 flags, prec, c, v); |
|
8747 if (len < 0) |
|
8748 goto onError; |
|
8749 sign = 1; |
|
8750 if (flags & F_ZERO) |
|
8751 fill = '0'; |
|
8752 break; |
|
8753 |
|
8754 case 'c': |
|
8755 pbuf = formatbuf; |
|
8756 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v); |
|
8757 if (len < 0) |
|
8758 goto onError; |
|
8759 break; |
|
8760 |
|
8761 default: |
|
8762 PyErr_Format(PyExc_ValueError, |
|
8763 "unsupported format character '%c' (0x%x) " |
|
8764 "at index %zd", |
|
8765 (31<=c && c<=126) ? (char)c : '?', |
|
8766 (int)c, |
|
8767 (Py_ssize_t)(fmt - 1 - |
|
8768 PyUnicode_AS_UNICODE(uformat))); |
|
8769 goto onError; |
|
8770 } |
|
8771 if (sign) { |
|
8772 if (*pbuf == '-' || *pbuf == '+') { |
|
8773 sign = *pbuf++; |
|
8774 len--; |
|
8775 } |
|
8776 else if (flags & F_SIGN) |
|
8777 sign = '+'; |
|
8778 else if (flags & F_BLANK) |
|
8779 sign = ' '; |
|
8780 else |
|
8781 sign = 0; |
|
8782 } |
|
8783 if (width < len) |
|
8784 width = len; |
|
8785 if (rescnt - (sign != 0) < width) { |
|
8786 reslen -= rescnt; |
|
8787 rescnt = width + fmtcnt + 100; |
|
8788 reslen += rescnt; |
|
8789 if (reslen < 0) { |
|
8790 Py_XDECREF(temp); |
|
8791 PyErr_NoMemory(); |
|
8792 goto onError; |
|
8793 } |
|
8794 if (_PyUnicode_Resize(&result, reslen) < 0) { |
|
8795 Py_XDECREF(temp); |
|
8796 goto onError; |
|
8797 } |
|
8798 res = PyUnicode_AS_UNICODE(result) |
|
8799 + reslen - rescnt; |
|
8800 } |
|
8801 if (sign) { |
|
8802 if (fill != ' ') |
|
8803 *res++ = sign; |
|
8804 rescnt--; |
|
8805 if (width > len) |
|
8806 width--; |
|
8807 } |
|
8808 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { |
|
8809 assert(pbuf[0] == '0'); |
|
8810 assert(pbuf[1] == c); |
|
8811 if (fill != ' ') { |
|
8812 *res++ = *pbuf++; |
|
8813 *res++ = *pbuf++; |
|
8814 } |
|
8815 rescnt -= 2; |
|
8816 width -= 2; |
|
8817 if (width < 0) |
|
8818 width = 0; |
|
8819 len -= 2; |
|
8820 } |
|
8821 if (width > len && !(flags & F_LJUST)) { |
|
8822 do { |
|
8823 --rescnt; |
|
8824 *res++ = fill; |
|
8825 } while (--width > len); |
|
8826 } |
|
8827 if (fill == ' ') { |
|
8828 if (sign) |
|
8829 *res++ = sign; |
|
8830 if ((flags & F_ALT) && (c == 'x' || c == 'X')) { |
|
8831 assert(pbuf[0] == '0'); |
|
8832 assert(pbuf[1] == c); |
|
8833 *res++ = *pbuf++; |
|
8834 *res++ = *pbuf++; |
|
8835 } |
|
8836 } |
|
8837 Py_UNICODE_COPY(res, pbuf, len); |
|
8838 res += len; |
|
8839 rescnt -= len; |
|
8840 while (--width >= len) { |
|
8841 --rescnt; |
|
8842 *res++ = ' '; |
|
8843 } |
|
8844 if (dict && (argidx < arglen) && c != '%') { |
|
8845 PyErr_SetString(PyExc_TypeError, |
|
8846 "not all arguments converted during string formatting"); |
|
8847 Py_XDECREF(temp); |
|
8848 goto onError; |
|
8849 } |
|
8850 Py_XDECREF(temp); |
|
8851 } /* '%' */ |
|
8852 } /* until end */ |
|
8853 if (argidx < arglen && !dict) { |
|
8854 PyErr_SetString(PyExc_TypeError, |
|
8855 "not all arguments converted during string formatting"); |
|
8856 goto onError; |
|
8857 } |
|
8858 |
|
8859 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0) |
|
8860 goto onError; |
|
8861 if (args_owned) { |
|
8862 Py_DECREF(args); |
|
8863 } |
|
8864 Py_DECREF(uformat); |
|
8865 return (PyObject *)result; |
|
8866 |
|
8867 onError: |
|
8868 Py_XDECREF(result); |
|
8869 Py_DECREF(uformat); |
|
8870 if (args_owned) { |
|
8871 Py_DECREF(args); |
|
8872 } |
|
8873 return NULL; |
|
8874 } |
|
8875 |
|
8876 static PyBufferProcs unicode_as_buffer = { |
|
8877 (readbufferproc) unicode_buffer_getreadbuf, |
|
8878 (writebufferproc) unicode_buffer_getwritebuf, |
|
8879 (segcountproc) unicode_buffer_getsegcount, |
|
8880 (charbufferproc) unicode_buffer_getcharbuf, |
|
8881 }; |
|
8882 |
|
8883 static PyObject * |
|
8884 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds); |
|
8885 |
|
8886 static PyObject * |
|
8887 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds) |
|
8888 { |
|
8889 PyObject *x = NULL; |
|
8890 static char *kwlist[] = {"string", "encoding", "errors", 0}; |
|
8891 char *encoding = NULL; |
|
8892 char *errors = NULL; |
|
8893 |
|
8894 if (type != &PyUnicode_Type) |
|
8895 return unicode_subtype_new(type, args, kwds); |
|
8896 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode", |
|
8897 kwlist, &x, &encoding, &errors)) |
|
8898 return NULL; |
|
8899 if (x == NULL) |
|
8900 return (PyObject *)_PyUnicode_New(0); |
|
8901 if (encoding == NULL && errors == NULL) |
|
8902 return PyObject_Unicode(x); |
|
8903 else |
|
8904 return PyUnicode_FromEncodedObject(x, encoding, errors); |
|
8905 } |
|
8906 |
|
8907 static PyObject * |
|
8908 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) |
|
8909 { |
|
8910 PyUnicodeObject *tmp, *pnew; |
|
8911 Py_ssize_t n; |
|
8912 |
|
8913 assert(PyType_IsSubtype(type, &PyUnicode_Type)); |
|
8914 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); |
|
8915 if (tmp == NULL) |
|
8916 return NULL; |
|
8917 assert(PyUnicode_Check(tmp)); |
|
8918 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length); |
|
8919 if (pnew == NULL) { |
|
8920 Py_DECREF(tmp); |
|
8921 return NULL; |
|
8922 } |
|
8923 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1)); |
|
8924 if (pnew->str == NULL) { |
|
8925 _Py_ForgetReference((PyObject *)pnew); |
|
8926 PyObject_Del(pnew); |
|
8927 Py_DECREF(tmp); |
|
8928 return PyErr_NoMemory(); |
|
8929 } |
|
8930 Py_UNICODE_COPY(pnew->str, tmp->str, n+1); |
|
8931 pnew->length = n; |
|
8932 pnew->hash = tmp->hash; |
|
8933 Py_DECREF(tmp); |
|
8934 return (PyObject *)pnew; |
|
8935 } |
|
8936 |
|
8937 PyDoc_STRVAR(unicode_doc, |
|
8938 "unicode(string [, encoding[, errors]]) -> object\n\ |
|
8939 \n\ |
|
8940 Create a new Unicode object from the given encoded string.\n\ |
|
8941 encoding defaults to the current default string encoding.\n\ |
|
8942 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'."); |
|
8943 |
|
8944 PyTypeObject PyUnicode_Type = { |
|
8945 PyVarObject_HEAD_INIT(&PyType_Type, 0) |
|
8946 "unicode", /* tp_name */ |
|
8947 sizeof(PyUnicodeObject), /* tp_size */ |
|
8948 0, /* tp_itemsize */ |
|
8949 /* Slots */ |
|
8950 (destructor)unicode_dealloc, /* tp_dealloc */ |
|
8951 0, /* tp_print */ |
|
8952 0, /* tp_getattr */ |
|
8953 0, /* tp_setattr */ |
|
8954 0, /* tp_compare */ |
|
8955 unicode_repr, /* tp_repr */ |
|
8956 &unicode_as_number, /* tp_as_number */ |
|
8957 &unicode_as_sequence, /* tp_as_sequence */ |
|
8958 &unicode_as_mapping, /* tp_as_mapping */ |
|
8959 (hashfunc) unicode_hash, /* tp_hash*/ |
|
8960 0, /* tp_call*/ |
|
8961 (reprfunc) unicode_str, /* tp_str */ |
|
8962 PyObject_GenericGetAttr, /* tp_getattro */ |
|
8963 0, /* tp_setattro */ |
|
8964 &unicode_as_buffer, /* tp_as_buffer */ |
|
8965 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES | |
|
8966 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */ |
|
8967 unicode_doc, /* tp_doc */ |
|
8968 0, /* tp_traverse */ |
|
8969 0, /* tp_clear */ |
|
8970 PyUnicode_RichCompare, /* tp_richcompare */ |
|
8971 0, /* tp_weaklistoffset */ |
|
8972 0, /* tp_iter */ |
|
8973 0, /* tp_iternext */ |
|
8974 unicode_methods, /* tp_methods */ |
|
8975 0, /* tp_members */ |
|
8976 0, /* tp_getset */ |
|
8977 &PyBaseString_Type, /* tp_base */ |
|
8978 0, /* tp_dict */ |
|
8979 0, /* tp_descr_get */ |
|
8980 0, /* tp_descr_set */ |
|
8981 0, /* tp_dictoffset */ |
|
8982 0, /* tp_init */ |
|
8983 0, /* tp_alloc */ |
|
8984 unicode_new, /* tp_new */ |
|
8985 PyObject_Del, /* tp_free */ |
|
8986 }; |
|
8987 |
|
8988 /* Initialize the Unicode implementation */ |
|
8989 |
|
8990 void _PyUnicode_Init(void) |
|
8991 { |
|
8992 int i; |
|
8993 |
|
8994 /* XXX - move this array to unicodectype.c ? */ |
|
8995 Py_UNICODE linebreak[] = { |
|
8996 0x000A, /* LINE FEED */ |
|
8997 0x000D, /* CARRIAGE RETURN */ |
|
8998 0x001C, /* FILE SEPARATOR */ |
|
8999 0x001D, /* GROUP SEPARATOR */ |
|
9000 0x001E, /* RECORD SEPARATOR */ |
|
9001 0x0085, /* NEXT LINE */ |
|
9002 0x2028, /* LINE SEPARATOR */ |
|
9003 0x2029, /* PARAGRAPH SEPARATOR */ |
|
9004 }; |
|
9005 |
|
9006 /* Init the implementation */ |
|
9007 free_list = NULL; |
|
9008 numfree = 0; |
|
9009 unicode_empty = _PyUnicode_New(0); |
|
9010 if (!unicode_empty) |
|
9011 return; |
|
9012 |
|
9013 strcpy(unicode_default_encoding, "ascii"); |
|
9014 for (i = 0; i < 256; i++) |
|
9015 unicode_latin1[i] = NULL; |
|
9016 if (PyType_Ready(&PyUnicode_Type) < 0) |
|
9017 Py_FatalError("Can't initialize 'unicode'"); |
|
9018 |
|
9019 /* initialize the linebreak bloom filter */ |
|
9020 bloom_linebreak = make_bloom_mask( |
|
9021 linebreak, sizeof(linebreak) / sizeof(linebreak[0]) |
|
9022 ); |
|
9023 |
|
9024 PyType_Ready(&EncodingMapType); |
|
9025 } |
|
9026 |
|
9027 /* Finalize the Unicode implementation */ |
|
9028 |
|
9029 int |
|
9030 PyUnicode_ClearFreeList(void) |
|
9031 { |
|
9032 int freelist_size = numfree; |
|
9033 PyUnicodeObject *u; |
|
9034 |
|
9035 for (u = free_list; u != NULL;) { |
|
9036 PyUnicodeObject *v = u; |
|
9037 u = *(PyUnicodeObject **)u; |
|
9038 if (v->str) |
|
9039 PyObject_DEL(v->str); |
|
9040 Py_XDECREF(v->defenc); |
|
9041 PyObject_Del(v); |
|
9042 numfree--; |
|
9043 } |
|
9044 free_list = NULL; |
|
9045 assert(numfree == 0); |
|
9046 return freelist_size; |
|
9047 } |
|
9048 |
|
9049 void |
|
9050 _PyUnicode_Fini(void) |
|
9051 { |
|
9052 int i; |
|
9053 |
|
9054 Py_XDECREF(unicode_empty); |
|
9055 unicode_empty = NULL; |
|
9056 |
|
9057 for (i = 0; i < 256; i++) { |
|
9058 if (unicode_latin1[i]) { |
|
9059 Py_DECREF(unicode_latin1[i]); |
|
9060 unicode_latin1[i] = NULL; |
|
9061 } |
|
9062 } |
|
9063 (void)PyUnicode_ClearFreeList(); |
|
9064 } |
|
9065 |
|
9066 #ifdef __cplusplus |
|
9067 } |
|
9068 #endif |
|
9069 |
|
9070 |
|
9071 /* |
|
9072 Local variables: |
|
9073 c-basic-offset: 4 |
|
9074 indent-tabs-mode: nil |
|
9075 End: |
|
9076 */ |