symbian-qemu-0.9.1-12/python-2.6.1/Objects/unicodeobject.c
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 /*
       
     2 
       
     3 Unicode implementation based on original code by Fredrik Lundh,
       
     4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
       
     5 Unicode Integration Proposal (see file Misc/unicode.txt).
       
     6 
       
     7 Major speed upgrades to the method implementations at the Reykjavik
       
     8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
       
     9 
       
    10 Copyright (c) Corporation for National Research Initiatives.
       
    11 
       
    12 --------------------------------------------------------------------
       
    13 The original string type implementation is:
       
    14 
       
    15     Copyright (c) 1999 by Secret Labs AB
       
    16     Copyright (c) 1999 by Fredrik Lundh
       
    17 
       
    18 By obtaining, using, and/or copying this software and/or its
       
    19 associated documentation, you agree that you have read, understood,
       
    20 and will comply with the following terms and conditions:
       
    21 
       
    22 Permission to use, copy, modify, and distribute this software and its
       
    23 associated documentation for any purpose and without fee is hereby
       
    24 granted, provided that the above copyright notice appears in all
       
    25 copies, and that both that copyright notice and this permission notice
       
    26 appear in supporting documentation, and that the name of Secret Labs
       
    27 AB or the author not be used in advertising or publicity pertaining to
       
    28 distribution of the software without specific, written prior
       
    29 permission.
       
    30 
       
    31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
       
    32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
       
    33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
       
    34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
       
    35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
       
    36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
       
    37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
       
    38 --------------------------------------------------------------------
       
    39 
       
    40 */
       
    41 
       
    42 #define PY_SSIZE_T_CLEAN
       
    43 #include "Python.h"
       
    44 
       
    45 #include "unicodeobject.h"
       
    46 #include "ucnhash.h"
       
    47 
       
    48 #ifdef MS_WINDOWS
       
    49 #include <windows.h>
       
    50 #endif
       
    51 
       
    52 /* Limit for the Unicode object free list */
       
    53 
       
    54 #define PyUnicode_MAXFREELIST       1024
       
    55 
       
    56 /* Limit for the Unicode object free list stay alive optimization.
       
    57 
       
    58    The implementation will keep allocated Unicode memory intact for
       
    59    all objects on the free list having a size less than this
       
    60    limit. This reduces malloc() overhead for small Unicode objects.
       
    61 
       
    62    At worst this will result in PyUnicode_MAXFREELIST *
       
    63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
       
    64    malloc()-overhead) bytes of unused garbage.
       
    65 
       
    66    Setting the limit to 0 effectively turns the feature off.
       
    67 
       
    68    Note: This is an experimental feature ! If you get core dumps when
       
    69    using Unicode objects, turn this feature off.
       
    70 
       
    71 */
       
    72 
       
    73 #define KEEPALIVE_SIZE_LIMIT       9
       
    74 
       
    75 /* Endianness switches; defaults to little endian */
       
    76 
       
    77 #ifdef WORDS_BIGENDIAN
       
    78 # define BYTEORDER_IS_BIG_ENDIAN
       
    79 #else
       
    80 # define BYTEORDER_IS_LITTLE_ENDIAN
       
    81 #endif
       
    82 
       
    83 /* --- Globals ------------------------------------------------------------
       
    84 
       
    85    The globals are initialized by the _PyUnicode_Init() API and should
       
    86    not be used before calling that API.
       
    87 
       
    88 */
       
    89 
       
    90 
       
    91 #ifdef __cplusplus
       
    92 extern "C" {
       
    93 #endif
       
    94 
       
    95 /* Free list for Unicode objects */
       
    96 static PyUnicodeObject *free_list;
       
    97 static int numfree;
       
    98 
       
    99 /* The empty Unicode object is shared to improve performance. */
       
   100 static PyUnicodeObject *unicode_empty;
       
   101 
       
   102 /* Single character Unicode strings in the Latin-1 range are being
       
   103    shared as well. */
       
   104 static PyUnicodeObject *unicode_latin1[256];
       
   105 
       
   106 /* Default encoding to use and assume when NULL is passed as encoding
       
   107    parameter; it is initialized by _PyUnicode_Init().
       
   108 
       
   109    Always use the PyUnicode_SetDefaultEncoding() and
       
   110    PyUnicode_GetDefaultEncoding() APIs to access this global.
       
   111 
       
   112 */
       
   113 static char unicode_default_encoding[100];
       
   114 
       
   115 /* Fast detection of the most frequent whitespace characters */
       
   116 const unsigned char _Py_ascii_whitespace[] = {
       
   117 	0, 0, 0, 0, 0, 0, 0, 0,
       
   118 /*     case 0x0009: * HORIZONTAL TABULATION */
       
   119 /*     case 0x000A: * LINE FEED */
       
   120 /*     case 0x000B: * VERTICAL TABULATION */
       
   121 /*     case 0x000C: * FORM FEED */
       
   122 /*     case 0x000D: * CARRIAGE RETURN */
       
   123 	0, 1, 1, 1, 1, 1, 0, 0,
       
   124 	0, 0, 0, 0, 0, 0, 0, 0,
       
   125 /*     case 0x001C: * FILE SEPARATOR */
       
   126 /*     case 0x001D: * GROUP SEPARATOR */
       
   127 /*     case 0x001E: * RECORD SEPARATOR */
       
   128 /*     case 0x001F: * UNIT SEPARATOR */
       
   129 	0, 0, 0, 0, 1, 1, 1, 1,
       
   130 /*     case 0x0020: * SPACE */
       
   131 	1, 0, 0, 0, 0, 0, 0, 0,
       
   132 	0, 0, 0, 0, 0, 0, 0, 0,
       
   133 	0, 0, 0, 0, 0, 0, 0, 0,
       
   134 	0, 0, 0, 0, 0, 0, 0, 0,
       
   135 
       
   136 	0, 0, 0, 0, 0, 0, 0, 0,
       
   137 	0, 0, 0, 0, 0, 0, 0, 0,
       
   138 	0, 0, 0, 0, 0, 0, 0, 0,
       
   139 	0, 0, 0, 0, 0, 0, 0, 0,
       
   140 	0, 0, 0, 0, 0, 0, 0, 0,
       
   141 	0, 0, 0, 0, 0, 0, 0, 0,
       
   142 	0, 0, 0, 0, 0, 0, 0, 0,
       
   143 	0, 0, 0, 0, 0, 0, 0, 0
       
   144 };
       
   145 
       
   146 /* Same for linebreaks */
       
   147 static unsigned char ascii_linebreak[] = {
       
   148 	0, 0, 0, 0, 0, 0, 0, 0,
       
   149 /*         0x000A, * LINE FEED */
       
   150 /*         0x000D, * CARRIAGE RETURN */
       
   151 	0, 0, 1, 0, 0, 1, 0, 0,
       
   152 	0, 0, 0, 0, 0, 0, 0, 0,
       
   153 /*         0x001C, * FILE SEPARATOR */
       
   154 /*         0x001D, * GROUP SEPARATOR */
       
   155 /*         0x001E, * RECORD SEPARATOR */
       
   156 	0, 0, 0, 0, 1, 1, 1, 0,
       
   157 	0, 0, 0, 0, 0, 0, 0, 0,
       
   158 	0, 0, 0, 0, 0, 0, 0, 0,
       
   159 	0, 0, 0, 0, 0, 0, 0, 0,
       
   160 	0, 0, 0, 0, 0, 0, 0, 0,
       
   161 
       
   162 	0, 0, 0, 0, 0, 0, 0, 0,
       
   163 	0, 0, 0, 0, 0, 0, 0, 0,
       
   164 	0, 0, 0, 0, 0, 0, 0, 0,
       
   165 	0, 0, 0, 0, 0, 0, 0, 0,
       
   166 	0, 0, 0, 0, 0, 0, 0, 0,
       
   167 	0, 0, 0, 0, 0, 0, 0, 0,
       
   168 	0, 0, 0, 0, 0, 0, 0, 0,
       
   169 	0, 0, 0, 0, 0, 0, 0, 0
       
   170 };
       
   171 
       
   172 
       
   173 Py_UNICODE
       
   174 PyUnicode_GetMax(void)
       
   175 {
       
   176 #ifdef Py_UNICODE_WIDE
       
   177 	return 0x10FFFF;
       
   178 #else
       
   179 	/* This is actually an illegal character, so it should
       
   180 	   not be passed to unichr. */
       
   181 	return 0xFFFF;
       
   182 #endif
       
   183 }
       
   184 
       
   185 /* --- Bloom Filters ----------------------------------------------------- */
       
   186 
       
   187 /* stuff to implement simple "bloom filters" for Unicode characters.
       
   188    to keep things simple, we use a single bitmask, using the least 5
       
   189    bits from each unicode characters as the bit index. */
       
   190 
       
   191 /* the linebreak mask is set up by Unicode_Init below */
       
   192 
       
   193 #define BLOOM_MASK unsigned long
       
   194 
       
   195 static BLOOM_MASK bloom_linebreak;
       
   196 
       
   197 #define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
       
   198 
       
   199 #define BLOOM_LINEBREAK(ch) \
       
   200     ((ch) < 128U ? ascii_linebreak[(ch)] : \
       
   201     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
       
   202 
       
   203 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
       
   204 {
       
   205     /* calculate simple bloom-style bitmask for a given unicode string */
       
   206 
       
   207     long mask;
       
   208     Py_ssize_t i;
       
   209 
       
   210     mask = 0;
       
   211     for (i = 0; i < len; i++)
       
   212         mask |= (1 << (ptr[i] & 0x1F));
       
   213 
       
   214     return mask;
       
   215 }
       
   216 
       
   217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
       
   218 {
       
   219     Py_ssize_t i;
       
   220 
       
   221     for (i = 0; i < setlen; i++)
       
   222         if (set[i] == chr)
       
   223             return 1;
       
   224 
       
   225     return 0;
       
   226 }
       
   227 
       
   228 #define BLOOM_MEMBER(mask, chr, set, setlen)\
       
   229     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
       
   230 
       
   231 /* --- Unicode Object ----------------------------------------------------- */
       
   232 
       
   233 static
       
   234 int unicode_resize(register PyUnicodeObject *unicode,
       
   235                       Py_ssize_t length)
       
   236 {
       
   237     void *oldstr;
       
   238 
       
   239     /* Shortcut if there's nothing much to do. */
       
   240     if (unicode->length == length)
       
   241 	goto reset;
       
   242 
       
   243     /* Resizing shared object (unicode_empty or single character
       
   244        objects) in-place is not allowed. Use PyUnicode_Resize()
       
   245        instead ! */
       
   246 
       
   247     if (unicode == unicode_empty || 
       
   248 	(unicode->length == 1 && 
       
   249 	 unicode->str[0] < 256U &&
       
   250 	 unicode_latin1[unicode->str[0]] == unicode)) {
       
   251         PyErr_SetString(PyExc_SystemError,
       
   252                         "can't resize shared unicode objects");
       
   253         return -1;
       
   254     }
       
   255 
       
   256     /* We allocate one more byte to make sure the string is Ux0000 terminated.
       
   257        The overallocation is also used by fastsearch, which assumes that it's
       
   258        safe to look at str[length] (without making any assumptions about what
       
   259        it contains). */
       
   260 
       
   261     oldstr = unicode->str;
       
   262     unicode->str = PyObject_REALLOC(unicode->str,
       
   263 				    sizeof(Py_UNICODE) * (length + 1));
       
   264     if (!unicode->str) {
       
   265 	unicode->str = (Py_UNICODE *)oldstr;
       
   266         PyErr_NoMemory();
       
   267         return -1;
       
   268     }
       
   269     unicode->str[length] = 0;
       
   270     unicode->length = length;
       
   271 
       
   272  reset:
       
   273     /* Reset the object caches */
       
   274     if (unicode->defenc) {
       
   275         Py_DECREF(unicode->defenc);
       
   276         unicode->defenc = NULL;
       
   277     }
       
   278     unicode->hash = -1;
       
   279 
       
   280     return 0;
       
   281 }
       
   282 
       
   283 /* We allocate one more byte to make sure the string is
       
   284    Ux0000 terminated -- XXX is this needed ?
       
   285 
       
   286    XXX This allocator could further be enhanced by assuring that the
       
   287        free list never reduces its size below 1.
       
   288 
       
   289 */
       
   290 
       
   291 static
       
   292 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
       
   293 {
       
   294     register PyUnicodeObject *unicode;
       
   295 
       
   296     /* Optimization for empty strings */
       
   297     if (length == 0 && unicode_empty != NULL) {
       
   298         Py_INCREF(unicode_empty);
       
   299         return unicode_empty;
       
   300     }
       
   301 
       
   302     /* Ensure we won't overflow the size. */
       
   303     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
       
   304         return (PyUnicodeObject *)PyErr_NoMemory();
       
   305     }
       
   306 
       
   307     /* Unicode freelist & memory allocation */
       
   308     if (free_list) {
       
   309         unicode = free_list;
       
   310         free_list = *(PyUnicodeObject **)unicode;
       
   311         numfree--;
       
   312 	if (unicode->str) {
       
   313 	    /* Keep-Alive optimization: we only upsize the buffer,
       
   314 	       never downsize it. */
       
   315 	    if ((unicode->length < length) &&
       
   316                 unicode_resize(unicode, length) < 0) {
       
   317 		PyObject_DEL(unicode->str);
       
   318 		unicode->str = NULL;
       
   319 	    }
       
   320 	}
       
   321         else {
       
   322 	    size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
       
   323 	    unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
       
   324         }
       
   325         PyObject_INIT(unicode, &PyUnicode_Type);
       
   326     }
       
   327     else {
       
   328 	size_t new_size;
       
   329         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
       
   330         if (unicode == NULL)
       
   331             return NULL;
       
   332 	new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
       
   333 	unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
       
   334     }
       
   335 
       
   336     if (!unicode->str) {
       
   337 	PyErr_NoMemory();
       
   338 	goto onError;
       
   339     }
       
   340     /* Initialize the first element to guard against cases where
       
   341      * the caller fails before initializing str -- unicode_resize()
       
   342      * reads str[0], and the Keep-Alive optimization can keep memory
       
   343      * allocated for str alive across a call to unicode_dealloc(unicode).
       
   344      * We don't want unicode_resize to read uninitialized memory in
       
   345      * that case.
       
   346      */
       
   347     unicode->str[0] = 0;
       
   348     unicode->str[length] = 0;
       
   349     unicode->length = length;
       
   350     unicode->hash = -1;
       
   351     unicode->defenc = NULL;
       
   352     return unicode;
       
   353 
       
   354  onError:
       
   355     /* XXX UNREF/NEWREF interface should be more symmetrical */
       
   356     _Py_DEC_REFTOTAL;
       
   357     _Py_ForgetReference((PyObject *)unicode);
       
   358     PyObject_Del(unicode);
       
   359     return NULL;
       
   360 }
       
   361 
       
   362 static
       
   363 void unicode_dealloc(register PyUnicodeObject *unicode)
       
   364 {
       
   365     if (PyUnicode_CheckExact(unicode) &&
       
   366 	numfree < PyUnicode_MAXFREELIST) {
       
   367         /* Keep-Alive optimization */
       
   368 	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
       
   369 	    PyObject_DEL(unicode->str);
       
   370 	    unicode->str = NULL;
       
   371 	    unicode->length = 0;
       
   372 	}
       
   373 	if (unicode->defenc) {
       
   374 	    Py_DECREF(unicode->defenc);
       
   375 	    unicode->defenc = NULL;
       
   376 	}
       
   377 	/* Add to free list */
       
   378         *(PyUnicodeObject **)unicode = free_list;
       
   379         free_list = unicode;
       
   380         numfree++;
       
   381     }
       
   382     else {
       
   383 	PyObject_DEL(unicode->str);
       
   384 	Py_XDECREF(unicode->defenc);
       
   385 	Py_TYPE(unicode)->tp_free((PyObject *)unicode);
       
   386     }
       
   387 }
       
   388 
       
   389 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
       
   390 {
       
   391     register PyUnicodeObject *v;
       
   392 
       
   393     /* Argument checks */
       
   394     if (unicode == NULL) {
       
   395 	PyErr_BadInternalCall();
       
   396 	return -1;
       
   397     }
       
   398     v = (PyUnicodeObject *)*unicode;
       
   399     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
       
   400 	PyErr_BadInternalCall();
       
   401 	return -1;
       
   402     }
       
   403 
       
   404     /* Resizing unicode_empty and single character objects is not
       
   405        possible since these are being shared. We simply return a fresh
       
   406        copy with the same Unicode content. */
       
   407     if (v->length != length &&
       
   408 	(v == unicode_empty || v->length == 1)) {
       
   409 	PyUnicodeObject *w = _PyUnicode_New(length);
       
   410 	if (w == NULL)
       
   411 	    return -1;
       
   412 	Py_UNICODE_COPY(w->str, v->str,
       
   413 			length < v->length ? length : v->length);
       
   414 	Py_DECREF(*unicode);
       
   415 	*unicode = (PyObject *)w;
       
   416 	return 0;
       
   417     }
       
   418 
       
   419     /* Note that we don't have to modify *unicode for unshared Unicode
       
   420        objects, since we can modify them in-place. */
       
   421     return unicode_resize(v, length);
       
   422 }
       
   423 
       
   424 /* Internal API for use in unicodeobject.c only ! */
       
   425 #define _PyUnicode_Resize(unicodevar, length) \
       
   426         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
       
   427 
       
   428 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
       
   429 				Py_ssize_t size)
       
   430 {
       
   431     PyUnicodeObject *unicode;
       
   432 
       
   433     /* If the Unicode data is known at construction time, we can apply
       
   434        some optimizations which share commonly used objects. */
       
   435     if (u != NULL) {
       
   436 
       
   437 	/* Optimization for empty strings */
       
   438 	if (size == 0 && unicode_empty != NULL) {
       
   439 	    Py_INCREF(unicode_empty);
       
   440 	    return (PyObject *)unicode_empty;
       
   441 	}
       
   442 
       
   443 	/* Single character Unicode objects in the Latin-1 range are
       
   444 	   shared when using this constructor */
       
   445 	if (size == 1 && *u < 256) {
       
   446 	    unicode = unicode_latin1[*u];
       
   447 	    if (!unicode) {
       
   448 		unicode = _PyUnicode_New(1);
       
   449 		if (!unicode)
       
   450 		    return NULL;
       
   451 		unicode->str[0] = *u;
       
   452 		unicode_latin1[*u] = unicode;
       
   453 	    }
       
   454 	    Py_INCREF(unicode);
       
   455 	    return (PyObject *)unicode;
       
   456 	}
       
   457     }
       
   458 
       
   459     unicode = _PyUnicode_New(size);
       
   460     if (!unicode)
       
   461         return NULL;
       
   462 
       
   463     /* Copy the Unicode data into the new object */
       
   464     if (u != NULL)
       
   465 	Py_UNICODE_COPY(unicode->str, u, size);
       
   466 
       
   467     return (PyObject *)unicode;
       
   468 }
       
   469 
       
   470 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
       
   471 {
       
   472     PyUnicodeObject *unicode;
       
   473 
       
   474 	if (size < 0) {
       
   475 		PyErr_SetString(PyExc_SystemError,
       
   476 		    "Negative size passed to PyUnicode_FromStringAndSize");
       
   477 		return NULL;
       
   478 	}
       
   479 
       
   480     /* If the Unicode data is known at construction time, we can apply
       
   481        some optimizations which share commonly used objects.
       
   482        Also, this means the input must be UTF-8, so fall back to the
       
   483        UTF-8 decoder at the end. */
       
   484     if (u != NULL) {
       
   485 
       
   486 	/* Optimization for empty strings */
       
   487 	if (size == 0 && unicode_empty != NULL) {
       
   488 	    Py_INCREF(unicode_empty);
       
   489 	    return (PyObject *)unicode_empty;
       
   490 	}
       
   491 
       
   492 	/* Single characters are shared when using this constructor.
       
   493            Restrict to ASCII, since the input must be UTF-8. */
       
   494 	if (size == 1 && Py_CHARMASK(*u) < 128) {
       
   495 	    unicode = unicode_latin1[Py_CHARMASK(*u)];
       
   496 	    if (!unicode) {
       
   497 		unicode = _PyUnicode_New(1);
       
   498 		if (!unicode)
       
   499 		    return NULL;
       
   500 		unicode->str[0] = Py_CHARMASK(*u);
       
   501 		unicode_latin1[Py_CHARMASK(*u)] = unicode;
       
   502 	    }
       
   503 	    Py_INCREF(unicode);
       
   504 	    return (PyObject *)unicode;
       
   505 	}
       
   506 
       
   507         return PyUnicode_DecodeUTF8(u, size, NULL);
       
   508     }
       
   509 
       
   510     unicode = _PyUnicode_New(size);
       
   511     if (!unicode)
       
   512         return NULL;
       
   513 
       
   514     return (PyObject *)unicode;
       
   515 }
       
   516 
       
   517 PyObject *PyUnicode_FromString(const char *u)
       
   518 {
       
   519     size_t size = strlen(u);
       
   520     if (size > PY_SSIZE_T_MAX) {
       
   521         PyErr_SetString(PyExc_OverflowError, "input too long");
       
   522         return NULL;
       
   523     }
       
   524 
       
   525     return PyUnicode_FromStringAndSize(u, size);
       
   526 }
       
   527 
       
   528 #ifdef HAVE_WCHAR_H
       
   529 
       
   530 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
       
   531 				 Py_ssize_t size)
       
   532 {
       
   533     PyUnicodeObject *unicode;
       
   534 
       
   535     if (w == NULL) {
       
   536 	PyErr_BadInternalCall();
       
   537 	return NULL;
       
   538     }
       
   539 
       
   540     unicode = _PyUnicode_New(size);
       
   541     if (!unicode)
       
   542         return NULL;
       
   543 
       
   544     /* Copy the wchar_t data into the new object */
       
   545 #ifdef HAVE_USABLE_WCHAR_T
       
   546     memcpy(unicode->str, w, size * sizeof(wchar_t));
       
   547 #else
       
   548     {
       
   549 	register Py_UNICODE *u;
       
   550 	register Py_ssize_t i;
       
   551 	u = PyUnicode_AS_UNICODE(unicode);
       
   552 	for (i = size; i > 0; i--)
       
   553 	    *u++ = *w++;
       
   554     }
       
   555 #endif
       
   556 
       
   557     return (PyObject *)unicode;
       
   558 }
       
   559 
       
   560 static void
       
   561 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
       
   562 {
       
   563 	*fmt++ = '%';
       
   564 	if (width) {
       
   565 		if (zeropad)
       
   566 			*fmt++ = '0';
       
   567 		fmt += sprintf(fmt, "%d", width);
       
   568 	}
       
   569 	if (precision)
       
   570 		fmt += sprintf(fmt, ".%d", precision);
       
   571 	if (longflag)
       
   572 		*fmt++ = 'l';
       
   573 	else if (size_tflag) {
       
   574 		char *f = PY_FORMAT_SIZE_T;
       
   575 		while (*f)
       
   576 			*fmt++ = *f++;
       
   577 	}
       
   578 	*fmt++ = c;
       
   579 	*fmt = '\0';
       
   580 }
       
   581 
       
   582 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
       
   583 
       
   584 PyObject *
       
   585 PyUnicode_FromFormatV(const char *format, va_list vargs)
       
   586 {
       
   587 	va_list count;
       
   588 	Py_ssize_t callcount = 0;
       
   589 	PyObject **callresults = NULL;
       
   590 	PyObject **callresult = NULL;
       
   591 	Py_ssize_t n = 0;
       
   592 	int width = 0;
       
   593 	int precision = 0;
       
   594 	int zeropad;
       
   595 	const char* f;
       
   596 	Py_UNICODE *s;
       
   597 	PyObject *string;
       
   598 	/* used by sprintf */
       
   599 	char buffer[21];
       
   600 	/* use abuffer instead of buffer, if we need more space
       
   601 	 * (which can happen if there's a format specifier with width). */
       
   602 	char *abuffer = NULL;
       
   603 	char *realbuffer;
       
   604 	Py_ssize_t abuffersize = 0;
       
   605 	char fmt[60]; /* should be enough for %0width.precisionld */
       
   606 	const char *copy;
       
   607 
       
   608 #ifdef VA_LIST_IS_ARRAY
       
   609 	Py_MEMCPY(count, vargs, sizeof(va_list));
       
   610 #else
       
   611 #ifdef  __va_copy
       
   612 	__va_copy(count, vargs);
       
   613 #else
       
   614 	count = vargs;
       
   615 #endif
       
   616 #endif
       
   617 	/* step 1: count the number of %S/%R format specifications
       
   618 	 * (we call PyObject_Str()/PyObject_Repr() for these objects
       
   619 	 * once during step 3 and put the result in an array) */
       
   620 	for (f = format; *f; f++) {
       
   621 		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
       
   622 			++callcount;
       
   623 	}
       
   624 	/* step 2: allocate memory for the results of
       
   625 	 * PyObject_Str()/PyObject_Repr() calls */
       
   626 	if (callcount) {
       
   627 		callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
       
   628 		if (!callresults) {
       
   629 			PyErr_NoMemory();
       
   630 			return NULL;
       
   631 		}
       
   632 		callresult = callresults;
       
   633 	}
       
   634 	/* step 3: figure out how large a buffer we need */
       
   635 	for (f = format; *f; f++) {
       
   636 		if (*f == '%') {
       
   637 			const char* p = f;
       
   638 			width = 0;
       
   639 			while (isdigit((unsigned)*f))
       
   640 				width = (width*10) + *f++ - '0';
       
   641 			while (*++f && *f != '%' && !isalpha((unsigned)*f))
       
   642 				;
       
   643 
       
   644 			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
       
   645 			 * they don't affect the amount of space we reserve.
       
   646 			 */
       
   647 			if ((*f == 'l' || *f == 'z') &&
       
   648 					(f[1] == 'd' || f[1] == 'u'))
       
   649                                 ++f;
       
   650 
       
   651 			switch (*f) {
       
   652 			case 'c':
       
   653 				(void)va_arg(count, int);
       
   654 				/* fall through... */
       
   655 			case '%':
       
   656 				n++;
       
   657 				break;
       
   658 			case 'd': case 'u': case 'i': case 'x':
       
   659 				(void) va_arg(count, int);
       
   660 				/* 20 bytes is enough to hold a 64-bit
       
   661 				   integer.  Decimal takes the most space.
       
   662 				   This isn't enough for octal.
       
   663 				   If a width is specified we need more
       
   664 				   (which we allocate later). */
       
   665 				if (width < 20)
       
   666 					width = 20;
       
   667 				n += width;
       
   668 				if (abuffersize < width)
       
   669 					abuffersize = width;
       
   670 				break;
       
   671 			case 's':
       
   672 			{
       
   673 				/* UTF-8 */
       
   674 				unsigned char*s;
       
   675 				s = va_arg(count, unsigned char*);
       
   676 				while (*s) {
       
   677 					if (*s < 128) {
       
   678 						n++; s++;
       
   679 					} else if (*s < 0xc0) {
       
   680 						/* invalid UTF-8 */
       
   681 						n++; s++;
       
   682 					} else if (*s < 0xc0) {
       
   683 						n++;
       
   684 						s++; if(!*s)break;
       
   685 						s++;
       
   686 					} else if (*s < 0xe0) {
       
   687 						n++;
       
   688 						s++; if(!*s)break;
       
   689 						s++; if(!*s)break;
       
   690 						s++;
       
   691 					} else {
       
   692 						#ifdef Py_UNICODE_WIDE
       
   693 						n++;
       
   694 						#else
       
   695 						n+=2;
       
   696 						#endif
       
   697 						s++; if(!*s)break;
       
   698 						s++; if(!*s)break;
       
   699 						s++; if(!*s)break;
       
   700 						s++;
       
   701 					}
       
   702 				}
       
   703 				break;
       
   704 			}
       
   705 			case 'U':
       
   706 			{
       
   707 				PyObject *obj = va_arg(count, PyObject *);
       
   708 				assert(obj && PyUnicode_Check(obj));
       
   709 				n += PyUnicode_GET_SIZE(obj);
       
   710 				break;
       
   711 			}
       
   712 			case 'V':
       
   713 			{
       
   714 				PyObject *obj = va_arg(count, PyObject *);
       
   715 				const char *str = va_arg(count, const char *);
       
   716 				assert(obj || str);
       
   717 				assert(!obj || PyUnicode_Check(obj));
       
   718 				if (obj)
       
   719 					n += PyUnicode_GET_SIZE(obj);
       
   720 				else
       
   721 					n += strlen(str);
       
   722 				break;
       
   723 			}
       
   724 			case 'S':
       
   725 			{
       
   726 				PyObject *obj = va_arg(count, PyObject *);
       
   727 				PyObject *str;
       
   728 				assert(obj);
       
   729 				str = PyObject_Str(obj);
       
   730 				if (!str)
       
   731 					goto fail;
       
   732 				n += PyUnicode_GET_SIZE(str);
       
   733 				/* Remember the str and switch to the next slot */
       
   734 				*callresult++ = str;
       
   735 				break;
       
   736 			}
       
   737 			case 'R':
       
   738 			{
       
   739 				PyObject *obj = va_arg(count, PyObject *);
       
   740 				PyObject *repr;
       
   741 				assert(obj);
       
   742 				repr = PyObject_Repr(obj);
       
   743 				if (!repr)
       
   744 					goto fail;
       
   745 				n += PyUnicode_GET_SIZE(repr);
       
   746 				/* Remember the repr and switch to the next slot */
       
   747 				*callresult++ = repr;
       
   748 				break;
       
   749 			}
       
   750 			case 'p':
       
   751 				(void) va_arg(count, int);
       
   752 				/* maximum 64-bit pointer representation:
       
   753 				 * 0xffffffffffffffff
       
   754 				 * so 19 characters is enough.
       
   755 				 * XXX I count 18 -- what's the extra for?
       
   756 				 */
       
   757 				n += 19;
       
   758 				break;
       
   759 			default:
       
   760 				/* if we stumble upon an unknown
       
   761 				   formatting code, copy the rest of
       
   762 				   the format string to the output
       
   763 				   string. (we cannot just skip the
       
   764 				   code, since there's no way to know
       
   765 				   what's in the argument list) */
       
   766 				n += strlen(p);
       
   767 				goto expand;
       
   768 			}
       
   769 		} else
       
   770 			n++;
       
   771 	}
       
   772  expand:
       
   773 	if (abuffersize > 20) {
       
   774 		abuffer = PyObject_Malloc(abuffersize);
       
   775 		if (!abuffer) {
       
   776 			PyErr_NoMemory();
       
   777 			goto fail;
       
   778 		}
       
   779 		realbuffer = abuffer;
       
   780 	}
       
   781 	else
       
   782 		realbuffer = buffer;
       
   783 	/* step 4: fill the buffer */
       
   784 	/* Since we've analyzed how much space we need for the worst case,
       
   785 	   we don't have to resize the string.
       
   786 	   There can be no errors beyond this point. */
       
   787 	string = PyUnicode_FromUnicode(NULL, n);
       
   788 	if (!string)
       
   789 		goto fail;
       
   790 
       
   791 	s = PyUnicode_AS_UNICODE(string);
       
   792 	callresult = callresults;
       
   793 
       
   794 	for (f = format; *f; f++) {
       
   795 		if (*f == '%') {
       
   796 			const char* p = f++;
       
   797 			int longflag = 0;
       
   798 			int size_tflag = 0;
       
   799 			zeropad = (*f == '0');
       
   800 			/* parse the width.precision part */
       
   801 			width = 0;
       
   802 			while (isdigit((unsigned)*f))
       
   803 				width = (width*10) + *f++ - '0';
       
   804 			precision = 0;
       
   805 			if (*f == '.') {
       
   806 				f++;
       
   807 				while (isdigit((unsigned)*f))
       
   808 					precision = (precision*10) + *f++ - '0';
       
   809 			}
       
   810 			/* handle the long flag, but only for %ld and %lu.
       
   811 			   others can be added when necessary. */
       
   812 			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
       
   813 				longflag = 1;
       
   814 				++f;
       
   815 			}
       
   816 			/* handle the size_t flag. */
       
   817 			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
       
   818 				size_tflag = 1;
       
   819 				++f;
       
   820 			}
       
   821 
       
   822 			switch (*f) {
       
   823 			case 'c':
       
   824 				*s++ = va_arg(vargs, int);
       
   825 				break;
       
   826 			case 'd':
       
   827 				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
       
   828 				if (longflag)
       
   829 					sprintf(realbuffer, fmt, va_arg(vargs, long));
       
   830 				else if (size_tflag)
       
   831 					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
       
   832 				else
       
   833 					sprintf(realbuffer, fmt, va_arg(vargs, int));
       
   834 				appendstring(realbuffer);
       
   835 				break;
       
   836 			case 'u':
       
   837 				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
       
   838 				if (longflag)
       
   839 					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
       
   840 				else if (size_tflag)
       
   841 					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
       
   842 				else
       
   843 					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
       
   844 				appendstring(realbuffer);
       
   845 				break;
       
   846 			case 'i':
       
   847 				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
       
   848 				sprintf(realbuffer, fmt, va_arg(vargs, int));
       
   849 				appendstring(realbuffer);
       
   850 				break;
       
   851 			case 'x':
       
   852 				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
       
   853 				sprintf(realbuffer, fmt, va_arg(vargs, int));
       
   854 				appendstring(realbuffer);
       
   855 				break;
       
   856 			case 's':
       
   857 			{
       
   858 				/* Parameter must be UTF-8 encoded.
       
   859 				   In case of encoding errors, use
       
   860 				   the replacement character. */
       
   861 				PyObject *u;
       
   862 				p = va_arg(vargs, char*);
       
   863 				u = PyUnicode_DecodeUTF8(p, strlen(p), 
       
   864 							 "replace");
       
   865 				if (!u)
       
   866 					goto fail;
       
   867 				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
       
   868 						PyUnicode_GET_SIZE(u));
       
   869 				s += PyUnicode_GET_SIZE(u);
       
   870 				Py_DECREF(u);
       
   871 				break;
       
   872 			}
       
   873 			case 'U':
       
   874 			{
       
   875 				PyObject *obj = va_arg(vargs, PyObject *);
       
   876 				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
       
   877 				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
       
   878 				s += size;
       
   879 				break;
       
   880 			}
       
   881 			case 'V':
       
   882 			{
       
   883 				PyObject *obj = va_arg(vargs, PyObject *);
       
   884 				const char *str = va_arg(vargs, const char *);
       
   885 				if (obj) {
       
   886 					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
       
   887 					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
       
   888 					s += size;
       
   889 				} else {
       
   890 					appendstring(str);
       
   891 				}
       
   892 				break;
       
   893 			}
       
   894 			case 'S':
       
   895 			case 'R':
       
   896 			{
       
   897 				Py_UNICODE *ucopy;
       
   898 				Py_ssize_t usize;
       
   899 				Py_ssize_t upos;
       
   900 				/* unused, since we already have the result */
       
   901 				(void) va_arg(vargs, PyObject *);
       
   902 				ucopy = PyUnicode_AS_UNICODE(*callresult);
       
   903 				usize = PyUnicode_GET_SIZE(*callresult);
       
   904 				for (upos = 0; upos<usize;)
       
   905 					*s++ = ucopy[upos++];
       
   906 				/* We're done with the unicode()/repr() => forget it */
       
   907 				Py_DECREF(*callresult);
       
   908 				/* switch to next unicode()/repr() result */
       
   909 				++callresult;
       
   910 				break;
       
   911 			}
       
   912 			case 'p':
       
   913 				sprintf(buffer, "%p", va_arg(vargs, void*));
       
   914 				/* %p is ill-defined:  ensure leading 0x. */
       
   915 				if (buffer[1] == 'X')
       
   916 					buffer[1] = 'x';
       
   917 				else if (buffer[1] != 'x') {
       
   918 					memmove(buffer+2, buffer, strlen(buffer)+1);
       
   919 					buffer[0] = '0';
       
   920 					buffer[1] = 'x';
       
   921 				}
       
   922 				appendstring(buffer);
       
   923 				break;
       
   924 			case '%':
       
   925 				*s++ = '%';
       
   926 				break;
       
   927 			default:
       
   928 				appendstring(p);
       
   929 				goto end;
       
   930 			}
       
   931 		} else
       
   932 			*s++ = *f;
       
   933 	}
       
   934 
       
   935  end:
       
   936 	if (callresults)
       
   937 		PyObject_Free(callresults);
       
   938 	if (abuffer)
       
   939 		PyObject_Free(abuffer);
       
   940 	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
       
   941 	return string;
       
   942  fail:
       
   943 	if (callresults) {
       
   944 		PyObject **callresult2 = callresults;
       
   945 		while (callresult2 < callresult) {
       
   946 			Py_DECREF(*callresult2);
       
   947 			++callresult2;
       
   948 		}
       
   949 		PyObject_Free(callresults);
       
   950 	}
       
   951 	if (abuffer)
       
   952 		PyObject_Free(abuffer);
       
   953 	return NULL;
       
   954 }
       
   955 
       
   956 #undef appendstring
       
   957 
       
   958 PyObject *
       
   959 PyUnicode_FromFormat(const char *format, ...)
       
   960 {
       
   961 	PyObject* ret;
       
   962 	va_list vargs;
       
   963 
       
   964 #ifdef HAVE_STDARG_PROTOTYPES
       
   965 	va_start(vargs, format);
       
   966 #else
       
   967 	va_start(vargs);
       
   968 #endif
       
   969 	ret = PyUnicode_FromFormatV(format, vargs);
       
   970 	va_end(vargs);
       
   971 	return ret;
       
   972 }
       
   973 
       
   974 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
       
   975 				wchar_t *w,
       
   976 				Py_ssize_t size)
       
   977 {
       
   978     if (unicode == NULL) {
       
   979 	PyErr_BadInternalCall();
       
   980 	return -1;
       
   981     }
       
   982 
       
   983     /* If possible, try to copy the 0-termination as well */
       
   984     if (size > PyUnicode_GET_SIZE(unicode))
       
   985 	size = PyUnicode_GET_SIZE(unicode) + 1;
       
   986 
       
   987 #ifdef HAVE_USABLE_WCHAR_T
       
   988     memcpy(w, unicode->str, size * sizeof(wchar_t));
       
   989 #else
       
   990     {
       
   991 	register Py_UNICODE *u;
       
   992 	register Py_ssize_t i;
       
   993 	u = PyUnicode_AS_UNICODE(unicode);
       
   994 	for (i = size; i > 0; i--)
       
   995 	    *w++ = *u++;
       
   996     }
       
   997 #endif
       
   998 
       
   999     if (size > PyUnicode_GET_SIZE(unicode))
       
  1000         return PyUnicode_GET_SIZE(unicode);
       
  1001     else
       
  1002     return size;
       
  1003 }
       
  1004 
       
  1005 #endif
       
  1006 
       
  1007 PyObject *PyUnicode_FromOrdinal(int ordinal)
       
  1008 {
       
  1009     Py_UNICODE s[1];
       
  1010 
       
  1011 #ifdef Py_UNICODE_WIDE
       
  1012     if (ordinal < 0 || ordinal > 0x10ffff) {
       
  1013 	PyErr_SetString(PyExc_ValueError,
       
  1014 			"unichr() arg not in range(0x110000) "
       
  1015 			"(wide Python build)");
       
  1016 	return NULL;
       
  1017     }
       
  1018 #else
       
  1019     if (ordinal < 0 || ordinal > 0xffff) {
       
  1020 	PyErr_SetString(PyExc_ValueError,
       
  1021 			"unichr() arg not in range(0x10000) "
       
  1022 			"(narrow Python build)");
       
  1023 	return NULL;
       
  1024     }
       
  1025 #endif
       
  1026 
       
  1027     s[0] = (Py_UNICODE)ordinal;
       
  1028     return PyUnicode_FromUnicode(s, 1);
       
  1029 }
       
  1030 
       
  1031 PyObject *PyUnicode_FromObject(register PyObject *obj)
       
  1032 {
       
  1033     /* XXX Perhaps we should make this API an alias of
       
  1034            PyObject_Unicode() instead ?! */
       
  1035     if (PyUnicode_CheckExact(obj)) {
       
  1036 	Py_INCREF(obj);
       
  1037 	return obj;
       
  1038     }
       
  1039     if (PyUnicode_Check(obj)) {
       
  1040 	/* For a Unicode subtype that's not a Unicode object,
       
  1041 	   return a true Unicode object with the same data. */
       
  1042 	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
       
  1043 				     PyUnicode_GET_SIZE(obj));
       
  1044     }
       
  1045     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
       
  1046 }
       
  1047 
       
  1048 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
       
  1049 				      const char *encoding,
       
  1050 				      const char *errors)
       
  1051 {
       
  1052     const char *s = NULL;
       
  1053     Py_ssize_t len;
       
  1054     PyObject *v;
       
  1055 
       
  1056     if (obj == NULL) {
       
  1057 	PyErr_BadInternalCall();
       
  1058 	return NULL;
       
  1059     }
       
  1060 
       
  1061 #if 0
       
  1062     /* For b/w compatibility we also accept Unicode objects provided
       
  1063        that no encodings is given and then redirect to
       
  1064        PyObject_Unicode() which then applies the additional logic for
       
  1065        Unicode subclasses.
       
  1066 
       
  1067        NOTE: This API should really only be used for object which
       
  1068              represent *encoded* Unicode !
       
  1069 
       
  1070     */
       
  1071 	if (PyUnicode_Check(obj)) {
       
  1072 	    if (encoding) {
       
  1073 		PyErr_SetString(PyExc_TypeError,
       
  1074 				"decoding Unicode is not supported");
       
  1075 	    return NULL;
       
  1076 	    }
       
  1077 	return PyObject_Unicode(obj);
       
  1078 	    }
       
  1079 #else
       
  1080     if (PyUnicode_Check(obj)) {
       
  1081 	PyErr_SetString(PyExc_TypeError,
       
  1082 			"decoding Unicode is not supported");
       
  1083 	return NULL;
       
  1084 	}
       
  1085 #endif
       
  1086 
       
  1087     /* Coerce object */
       
  1088     if (PyString_Check(obj)) {
       
  1089 	    s = PyString_AS_STRING(obj);
       
  1090 	    len = PyString_GET_SIZE(obj);
       
  1091     }
       
  1092     else if (PyByteArray_Check(obj)) {
       
  1093         /* Python 2.x specific */
       
  1094         PyErr_Format(PyExc_TypeError,
       
  1095                      "decoding bytearray is not supported");
       
  1096         return NULL;
       
  1097     }
       
  1098     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
       
  1099 	/* Overwrite the error message with something more useful in
       
  1100 	   case of a TypeError. */
       
  1101 	if (PyErr_ExceptionMatches(PyExc_TypeError))
       
  1102 	PyErr_Format(PyExc_TypeError,
       
  1103 			 "coercing to Unicode: need string or buffer, "
       
  1104 			 "%.80s found",
       
  1105 		     Py_TYPE(obj)->tp_name);
       
  1106 	goto onError;
       
  1107     }
       
  1108 
       
  1109     /* Convert to Unicode */
       
  1110     if (len == 0) {
       
  1111 	Py_INCREF(unicode_empty);
       
  1112 	v = (PyObject *)unicode_empty;
       
  1113     }
       
  1114     else
       
  1115 	v = PyUnicode_Decode(s, len, encoding, errors);
       
  1116 
       
  1117     return v;
       
  1118 
       
  1119  onError:
       
  1120     return NULL;
       
  1121 }
       
  1122 
       
  1123 PyObject *PyUnicode_Decode(const char *s,
       
  1124 			   Py_ssize_t size,
       
  1125 			   const char *encoding,
       
  1126 			   const char *errors)
       
  1127 {
       
  1128     PyObject *buffer = NULL, *unicode;
       
  1129 
       
  1130     if (encoding == NULL)
       
  1131 	encoding = PyUnicode_GetDefaultEncoding();
       
  1132 
       
  1133     /* Shortcuts for common default encodings */
       
  1134     if (strcmp(encoding, "utf-8") == 0)
       
  1135         return PyUnicode_DecodeUTF8(s, size, errors);
       
  1136     else if (strcmp(encoding, "latin-1") == 0)
       
  1137         return PyUnicode_DecodeLatin1(s, size, errors);
       
  1138 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
       
  1139     else if (strcmp(encoding, "mbcs") == 0)
       
  1140         return PyUnicode_DecodeMBCS(s, size, errors);
       
  1141 #endif
       
  1142     else if (strcmp(encoding, "ascii") == 0)
       
  1143         return PyUnicode_DecodeASCII(s, size, errors);
       
  1144 
       
  1145     /* Decode via the codec registry */
       
  1146     buffer = PyBuffer_FromMemory((void *)s, size);
       
  1147     if (buffer == NULL)
       
  1148         goto onError;
       
  1149     unicode = PyCodec_Decode(buffer, encoding, errors);
       
  1150     if (unicode == NULL)
       
  1151         goto onError;
       
  1152     if (!PyUnicode_Check(unicode)) {
       
  1153         PyErr_Format(PyExc_TypeError,
       
  1154                      "decoder did not return an unicode object (type=%.400s)",
       
  1155                      Py_TYPE(unicode)->tp_name);
       
  1156         Py_DECREF(unicode);
       
  1157         goto onError;
       
  1158     }
       
  1159     Py_DECREF(buffer);
       
  1160     return unicode;
       
  1161 
       
  1162  onError:
       
  1163     Py_XDECREF(buffer);
       
  1164     return NULL;
       
  1165 }
       
  1166 
       
  1167 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
       
  1168                                     const char *encoding,
       
  1169                                     const char *errors)
       
  1170 {
       
  1171     PyObject *v;
       
  1172 
       
  1173     if (!PyUnicode_Check(unicode)) {
       
  1174         PyErr_BadArgument();
       
  1175         goto onError;
       
  1176     }
       
  1177 
       
  1178     if (encoding == NULL)
       
  1179 	encoding = PyUnicode_GetDefaultEncoding();
       
  1180 
       
  1181     /* Decode via the codec registry */
       
  1182     v = PyCodec_Decode(unicode, encoding, errors);
       
  1183     if (v == NULL)
       
  1184         goto onError;
       
  1185     return v;
       
  1186 
       
  1187  onError:
       
  1188     return NULL;
       
  1189 }
       
  1190 
       
  1191 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
       
  1192 			   Py_ssize_t size,
       
  1193 			   const char *encoding,
       
  1194 			   const char *errors)
       
  1195 {
       
  1196     PyObject *v, *unicode;
       
  1197 
       
  1198     unicode = PyUnicode_FromUnicode(s, size);
       
  1199     if (unicode == NULL)
       
  1200 	return NULL;
       
  1201     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
       
  1202     Py_DECREF(unicode);
       
  1203     return v;
       
  1204 }
       
  1205 
       
  1206 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
       
  1207                                     const char *encoding,
       
  1208                                     const char *errors)
       
  1209 {
       
  1210     PyObject *v;
       
  1211 
       
  1212     if (!PyUnicode_Check(unicode)) {
       
  1213         PyErr_BadArgument();
       
  1214         goto onError;
       
  1215     }
       
  1216 
       
  1217     if (encoding == NULL)
       
  1218 	encoding = PyUnicode_GetDefaultEncoding();
       
  1219 
       
  1220     /* Encode via the codec registry */
       
  1221     v = PyCodec_Encode(unicode, encoding, errors);
       
  1222     if (v == NULL)
       
  1223         goto onError;
       
  1224     return v;
       
  1225 
       
  1226  onError:
       
  1227     return NULL;
       
  1228 }
       
  1229 
       
  1230 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
       
  1231                                     const char *encoding,
       
  1232                                     const char *errors)
       
  1233 {
       
  1234     PyObject *v;
       
  1235 
       
  1236     if (!PyUnicode_Check(unicode)) {
       
  1237         PyErr_BadArgument();
       
  1238         goto onError;
       
  1239     }
       
  1240 
       
  1241     if (encoding == NULL)
       
  1242 	encoding = PyUnicode_GetDefaultEncoding();
       
  1243 
       
  1244     /* Shortcuts for common default encodings */
       
  1245     if (errors == NULL) {
       
  1246 	if (strcmp(encoding, "utf-8") == 0)
       
  1247 	    return PyUnicode_AsUTF8String(unicode);
       
  1248 	else if (strcmp(encoding, "latin-1") == 0)
       
  1249 	    return PyUnicode_AsLatin1String(unicode);
       
  1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
       
  1251 	else if (strcmp(encoding, "mbcs") == 0)
       
  1252 	    return PyUnicode_AsMBCSString(unicode);
       
  1253 #endif
       
  1254 	else if (strcmp(encoding, "ascii") == 0)
       
  1255 	    return PyUnicode_AsASCIIString(unicode);
       
  1256     }
       
  1257 
       
  1258     /* Encode via the codec registry */
       
  1259     v = PyCodec_Encode(unicode, encoding, errors);
       
  1260     if (v == NULL)
       
  1261         goto onError;
       
  1262     if (!PyString_Check(v)) {
       
  1263         PyErr_Format(PyExc_TypeError,
       
  1264                      "encoder did not return a string object (type=%.400s)",
       
  1265                      Py_TYPE(v)->tp_name);
       
  1266         Py_DECREF(v);
       
  1267         goto onError;
       
  1268     }
       
  1269     return v;
       
  1270 
       
  1271  onError:
       
  1272     return NULL;
       
  1273 }
       
  1274 
       
  1275 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
       
  1276 					    const char *errors)
       
  1277 {
       
  1278     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
       
  1279 
       
  1280     if (v)
       
  1281         return v;
       
  1282     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
       
  1283     if (v && errors == NULL)
       
  1284         ((PyUnicodeObject *)unicode)->defenc = v;
       
  1285     return v;
       
  1286 }
       
  1287 
       
  1288 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
       
  1289 {
       
  1290     if (!PyUnicode_Check(unicode)) {
       
  1291         PyErr_BadArgument();
       
  1292         goto onError;
       
  1293     }
       
  1294     return PyUnicode_AS_UNICODE(unicode);
       
  1295 
       
  1296  onError:
       
  1297     return NULL;
       
  1298 }
       
  1299 
       
  1300 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
       
  1301 {
       
  1302     if (!PyUnicode_Check(unicode)) {
       
  1303         PyErr_BadArgument();
       
  1304         goto onError;
       
  1305     }
       
  1306     return PyUnicode_GET_SIZE(unicode);
       
  1307 
       
  1308  onError:
       
  1309     return -1;
       
  1310 }
       
  1311 
       
  1312 const char *PyUnicode_GetDefaultEncoding(void)
       
  1313 {
       
  1314     return unicode_default_encoding;
       
  1315 }
       
  1316 
       
  1317 int PyUnicode_SetDefaultEncoding(const char *encoding)
       
  1318 {
       
  1319     PyObject *v;
       
  1320 
       
  1321     /* Make sure the encoding is valid. As side effect, this also
       
  1322        loads the encoding into the codec registry cache. */
       
  1323     v = _PyCodec_Lookup(encoding);
       
  1324     if (v == NULL)
       
  1325 	goto onError;
       
  1326     Py_DECREF(v);
       
  1327     strncpy(unicode_default_encoding,
       
  1328 	    encoding,
       
  1329 	    sizeof(unicode_default_encoding));
       
  1330     return 0;
       
  1331 
       
  1332  onError:
       
  1333     return -1;
       
  1334 }
       
  1335 
       
  1336 /* error handling callback helper:
       
  1337    build arguments, call the callback and check the arguments,
       
  1338    if no exception occurred, copy the replacement to the output
       
  1339    and adjust various state variables.
       
  1340    return 0 on success, -1 on error
       
  1341 */
       
  1342 
       
  1343 static
       
  1344 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
       
  1345                  const char *encoding, const char *reason,
       
  1346                  const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
       
  1347                  Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
       
  1348                  PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
       
  1349 {
       
  1350     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
       
  1351 
       
  1352     PyObject *restuple = NULL;
       
  1353     PyObject *repunicode = NULL;
       
  1354     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
       
  1355     Py_ssize_t requiredsize;
       
  1356     Py_ssize_t newpos;
       
  1357     Py_UNICODE *repptr;
       
  1358     Py_ssize_t repsize;
       
  1359     int res = -1;
       
  1360 
       
  1361     if (*errorHandler == NULL) {
       
  1362 	*errorHandler = PyCodec_LookupError(errors);
       
  1363 	if (*errorHandler == NULL)
       
  1364 	   goto onError;
       
  1365     }
       
  1366 
       
  1367     if (*exceptionObject == NULL) {
       
  1368     	*exceptionObject = PyUnicodeDecodeError_Create(
       
  1369 	    encoding, input, insize, *startinpos, *endinpos, reason);
       
  1370 	if (*exceptionObject == NULL)
       
  1371 	   goto onError;
       
  1372     }
       
  1373     else {
       
  1374 	if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
       
  1375 	    goto onError;
       
  1376 	if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
       
  1377 	    goto onError;
       
  1378 	if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
       
  1379 	    goto onError;
       
  1380     }
       
  1381 
       
  1382     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
       
  1383     if (restuple == NULL)
       
  1384 	goto onError;
       
  1385     if (!PyTuple_Check(restuple)) {
       
  1386 	PyErr_Format(PyExc_TypeError, &argparse[4]);
       
  1387 	goto onError;
       
  1388     }
       
  1389     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
       
  1390 	goto onError;
       
  1391     if (newpos<0)
       
  1392 	newpos = insize+newpos;
       
  1393     if (newpos<0 || newpos>insize) {
       
  1394 	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
       
  1395 	goto onError;
       
  1396     }
       
  1397 
       
  1398     /* need more space? (at least enough for what we
       
  1399        have+the replacement+the rest of the string (starting
       
  1400        at the new input position), so we won't have to check space
       
  1401        when there are no errors in the rest of the string) */
       
  1402     repptr = PyUnicode_AS_UNICODE(repunicode);
       
  1403     repsize = PyUnicode_GET_SIZE(repunicode);
       
  1404     requiredsize = *outpos + repsize + insize-newpos;
       
  1405     if (requiredsize > outsize) {
       
  1406 	if (requiredsize<2*outsize)
       
  1407 	    requiredsize = 2*outsize;
       
  1408 	if (PyUnicode_Resize(output, requiredsize) < 0)
       
  1409 	    goto onError;
       
  1410 	*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
       
  1411     }
       
  1412     *endinpos = newpos;
       
  1413     *inptr = input + newpos;
       
  1414     Py_UNICODE_COPY(*outptr, repptr, repsize);
       
  1415     *outptr += repsize;
       
  1416     *outpos += repsize;
       
  1417     /* we made it! */
       
  1418     res = 0;
       
  1419 
       
  1420     onError:
       
  1421     Py_XDECREF(restuple);
       
  1422     return res;
       
  1423 }
       
  1424 
       
  1425 /* --- UTF-7 Codec -------------------------------------------------------- */
       
  1426 
       
  1427 /* see RFC2152 for details */
       
  1428 
       
  1429 static
       
  1430 char utf7_special[128] = {
       
  1431     /* indicate whether a UTF-7 character is special i.e. cannot be directly
       
  1432        encoded:
       
  1433 	   0 - not special
       
  1434 	   1 - special
       
  1435 	   2 - whitespace (optional)
       
  1436 	   3 - RFC2152 Set O (optional) */
       
  1437     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
       
  1438     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1439     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
       
  1440     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
       
  1441     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1442     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
       
  1443     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1444     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
       
  1445 
       
  1446 };
       
  1447 
       
  1448 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
       
  1449    warnings about the comparison always being false; since
       
  1450    utf7_special[0] is 1, we can safely make that one comparison
       
  1451    true  */
       
  1452 
       
  1453 #define SPECIAL(c, encodeO, encodeWS) \
       
  1454     ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
       
  1455      (encodeWS && (utf7_special[(c)] == 2)) || \
       
  1456      (encodeO && (utf7_special[(c)] == 3)))
       
  1457 
       
  1458 #define B64(n)  \
       
  1459     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
       
  1460 #define B64CHAR(c) \
       
  1461     (isalnum(c) || (c) == '+' || (c) == '/')
       
  1462 #define UB64(c) \
       
  1463     ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
       
  1464      (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
       
  1465 
       
  1466 #define ENCODE(out, ch, bits)                   \
       
  1467     while (bits >= 6) {                         \
       
  1468         *out++ = B64(ch >> (bits-6));           \
       
  1469         bits -= 6;                              \
       
  1470     }
       
  1471 
       
  1472 #define DECODE(out, ch, bits, surrogate)                                \
       
  1473     while (bits >= 16) {                                                \
       
  1474         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
       
  1475         bits -= 16;                                                     \
       
  1476         if (surrogate) {                                                \
       
  1477             /* We have already generated an error for the high surrogate \
       
  1478                so let's not bother seeing if the low surrogate is correct or not */ \
       
  1479             surrogate = 0;                                              \
       
  1480         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
       
  1481             /* This is a surrogate pair. Unfortunately we can't represent \
       
  1482                it in a 16-bit character */                              \
       
  1483             surrogate = 1;                                              \
       
  1484             errmsg = "code pairs are not supported";                    \
       
  1485             goto utf7Error;                                             \
       
  1486         } else {                                                        \
       
  1487             *out++ = outCh;                                             \
       
  1488         }                                                               \
       
  1489     }
       
  1490 
       
  1491 PyObject *PyUnicode_DecodeUTF7(const char *s,
       
  1492 			       Py_ssize_t size,
       
  1493 			       const char *errors)
       
  1494 {
       
  1495     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
       
  1496 }
       
  1497 
       
  1498 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
       
  1499 			       Py_ssize_t size,
       
  1500 			       const char *errors,
       
  1501 			       Py_ssize_t *consumed)
       
  1502 {
       /* Decode `size` bytes of UTF-7 data at `s` into a new Unicode object.

          `errors` is forwarded to unicode_decode_call_errorhandler() whenever
          malformed input is detected.  If `consumed` is NULL, input that ends
          inside a shift sequence is reported as an "unterminated shift
          sequence" error.  If `consumed` is non-NULL that error is not
          raised; *consumed is set to startinpos when the input ends inside a
          shift sequence (startinpos is recorded when a '+' is seen) and to
          the number of bytes processed otherwise.

          Returns the new object, or NULL when the allocation, the error
          handler or the final resize fails. */
       
  1503     const char *starts = s;
       
  1504     Py_ssize_t startinpos;
       
  1505     Py_ssize_t endinpos;
       
  1506     Py_ssize_t outpos;
       
  1507     const char *e;
       
  1508     PyUnicodeObject *unicode;
       
  1509     Py_UNICODE *p;
       
  1510     const char *errmsg = "";
       
  1511     int inShift = 0;                /* non-zero while inside a '+' shift sequence */
       
  1512     unsigned int bitsleft = 0;      /* bits accumulated in charsleft, not yet consumed by DECODE */
       
  1513     unsigned long charsleft = 0;    /* accumulator for the 6-bit base64 values */
       
  1514     int surrogate = 0;
       
  1515     PyObject *errorHandler = NULL;
       
  1516     PyObject *exc = NULL;
       
  1517 
       
  1518     unicode = _PyUnicode_New(size);
       
  1519     if (!unicode)
       
  1520         return NULL;
       
  1521     if (size == 0) {
       
  1522         if (consumed)
       
  1523             *consumed = 0;
       
  1524         return (PyObject *)unicode;
       
  1525     }
       
  1526 
       
  1527     p = unicode->str;
       
  1528     e = s + size;
       
  1529 
       
  1530     while (s < e) {
       
  1531         Py_UNICODE ch;
       
       /* Also entered via `goto restart` from the unterminated-sequence
          handling after the loop, when the error handler left s < e. */
       
  1532         restart:
       
  1533         ch = (unsigned char) *s;
       
  1534 
       
  1535         if (inShift) {
       
  1536             if ((ch == '-') || !B64CHAR(ch)) {
       
       /* '-' or any non-base64 character ends the shift sequence. */
       
  1537                 inShift = 0;
       
  1538                 s++;
       
  1539 
       
  1540                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
       
  1541                 if (bitsleft >= 6) {
       
  1542                     /* The shift sequence has a partial character in it. If
       
  1543                        bitsleft < 6 then we could just classify it as padding
       
  1544                        but that is not the case here */
       
  1545 
       
  1546                     errmsg = "partial character in shift sequence";
       
  1547                     goto utf7Error;
       
  1548                 }
       
  1549                 /* According to RFC2152 the remaining bits should be zero. We
       
  1550                    choose to signal an error/insert a replacement character
       
  1551                    here to indicate the potential of a misencoded character. */
       
  1552 
       
  1553                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
       
  1554                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
       
  1555                     errmsg = "non-zero padding bits in shift sequence";
       
  1556                     goto utf7Error;
       
  1557                 }
       
  1558 
       
  1559                 if (ch == '-') {
       
       /* A '-' directly after the terminating '-' is emitted as a literal.
          s is not advanced here; inShift is set again so that the next
          iteration consumes that second '-' through this same branch. */
       
  1560                     if ((s < e) && (*(s) == '-')) {
       
  1561                         *p++ = '-';
       
  1562                         inShift = 1;
       
  1563                     }
       
  1564                 } else if (SPECIAL(ch,0,0)) {
       
  1565                     errmsg = "unexpected special character";
       
  1566 	                goto utf7Error;
       
  1567                 } else  {
       
       /* The non-base64 character that ended the sequence is copied through. */
       
  1568                     *p++ = ch;
       
  1569                 }
       
  1570             } else {
       
       /* Base64 character: append its 6 bits to the accumulator. */
       
  1571                 charsleft = (charsleft << 6) | UB64(ch);
       
  1572                 bitsleft += 6;
       
  1573                 s++;
       
  1574                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
       
  1575             }
       
  1576         }
       
  1577         else if ( ch == '+' ) {
       
       /* Remember where the sequence starts; "+-" is a literal '+',
          anything else opens a shift sequence. */
       
  1578             startinpos = s-starts;
       
  1579             s++;
       
  1580             if (s < e && *s == '-') {
       
  1581                 s++;
       
  1582                 *p++ = '+';
       
  1583             } else
       
  1584             {
       
  1585                 inShift = 1;
       
  1586                 bitsleft = 0;
       
  1587             }
       
  1588         }
       
  1589         else if (SPECIAL(ch,0,0)) {
       
  1590             startinpos = s-starts;
       
  1591             errmsg = "unexpected special character";
       
  1592             s++;
       
  1593 	        goto utf7Error;
       
  1594         }
       
  1595         else {
       
       /* Ordinary character outside a shift sequence: copy it unchanged. */
       
  1596             *p++ = ch;
       
  1597             s++;
       
  1598         }
       
  1599         continue;
       
       /* Error path, also reached from the DECODE macro.  Reports errmsg for
          the input range startinpos..endinpos.  The handler is given &s and
          &p -- presumably so it can reposition the input cursor and the
          output pointer; confirm against unicode_decode_call_errorhandler. */
       
  1600     utf7Error:
       
  1601         outpos = p-PyUnicode_AS_UNICODE(unicode);
       
  1602         endinpos = s-starts;
       
  1603         if (unicode_decode_call_errorhandler(
       
  1604              errors, &errorHandler,
       
  1605              "utf7", errmsg,
       
  1606              starts, size, &startinpos, &endinpos, &exc, &s,
       
  1607              (PyObject **)&unicode, &outpos, &p))
       
  1608         goto onError;
       
  1609     }
       
  1610 
       
       /* Input ended inside a shift sequence: an error only when the caller
          did not ask for the consumed count. */
       
  1611     if (inShift && !consumed) {
       
  1612         outpos = p-PyUnicode_AS_UNICODE(unicode);
       
  1613         endinpos = size;
       
  1614         if (unicode_decode_call_errorhandler(
       
  1615              errors, &errorHandler,
       
  1616              "utf7", "unterminated shift sequence",
       
  1617              starts, size, &startinpos, &endinpos, &exc, &s,
       
  1618              (PyObject **)&unicode, &outpos, &p))
       
  1619             goto onError;
       
       /* If s is before the end of the input after the handler ran, jump
          back into the decoding loop. */
       
  1620         if (s < e)
       
  1621            goto restart;
       
  1622     }
       
  1623     if (consumed) {
       
  1624         if(inShift)
       
  1625             *consumed = startinpos;
       
  1626         else
       
  1627             *consumed = s-starts;
       
  1628     }
       
  1629 
       
       /* Shrink the result to the number of code units actually written. */
       
  1630     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
       
  1631         goto onError;
       
  1632 
       
  1633     Py_XDECREF(errorHandler);
       
  1634     Py_XDECREF(exc);
       
  1635     return (PyObject *)unicode;
       
  1636 
       
  1637 onError:
       
  1638     Py_XDECREF(errorHandler);
       
  1639     Py_XDECREF(exc);
       
  1640     Py_DECREF(unicode);
       
  1641     return NULL;
       
  1642 }
       
  1643 
       
  1644 
       
  1645 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
       
  1646                    Py_ssize_t size,
       
  1647                    int encodeSetO,
       
  1648                    int encodeWhiteSpace,
       
  1649                    const char *errors)
       
  1650 {
       
  1651     PyObject *v;
       
  1652     /* It might be possible to tighten this worst case */
       
  1653     Py_ssize_t cbAllocated = 5 * size;
       
  1654     int inShift = 0;
       
  1655     Py_ssize_t i = 0;
       
  1656     unsigned int bitsleft = 0;
       
  1657     unsigned long charsleft = 0;
       
  1658     char * out;
       
  1659     char * start;
       
  1660 
       
  1661     if (cbAllocated / 5 != size)
       
  1662         return PyErr_NoMemory();
       
  1663 
       
  1664     if (size == 0)
       
  1665 		return PyString_FromStringAndSize(NULL, 0);
       
  1666 
       
  1667     v = PyString_FromStringAndSize(NULL, cbAllocated);
       
  1668     if (v == NULL)
       
  1669         return NULL;
       
  1670 
       
  1671     start = out = PyString_AS_STRING(v);
       
  1672     for (;i < size; ++i) {
       
  1673         Py_UNICODE ch = s[i];
       
  1674 
       
  1675         if (!inShift) {
       
  1676             if (ch == '+') {
       
  1677                 *out++ = '+';
       
  1678                 *out++ = '-';
       
  1679             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
       
  1680                 charsleft = ch;
       
  1681                 bitsleft = 16;
       
  1682                 *out++ = '+';
       
  1683                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
       
  1684                 inShift = bitsleft > 0;
       
  1685             } else {
       
  1686                 *out++ = (char) ch;
       
  1687             }
       
  1688         } else {
       
  1689             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
       
  1690                 *out++ = B64(charsleft << (6-bitsleft));
       
  1691                 charsleft = 0;
       
  1692                 bitsleft = 0;
       
  1693                 /* Characters not in the BASE64 set implicitly unshift the sequence
       
  1694                    so no '-' is required, except if the character is itself a '-' */
       
  1695                 if (B64CHAR(ch) || ch == '-') {
       
  1696                     *out++ = '-';
       
  1697                 }
       
  1698                 inShift = 0;
       
  1699                 *out++ = (char) ch;
       
  1700             } else {
       
  1701                 bitsleft += 16;
       
  1702                 charsleft = (charsleft << 16) | ch;
       
  1703                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
       
  1704 
       
  1705                 /* If the next character is special then we dont' need to terminate
       
  1706                    the shift sequence. If the next character is not a BASE64 character
       
  1707                    or '-' then the shift sequence will be terminated implicitly and we
       
  1708                    don't have to insert a '-'. */
       
  1709 
       
  1710                 if (bitsleft == 0) {
       
  1711                     if (i + 1 < size) {
       
  1712                         Py_UNICODE ch2 = s[i+1];
       
  1713 
       
  1714                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
       
  1715 
       
  1716                         } else if (B64CHAR(ch2) || ch2 == '-') {
       
  1717                             *out++ = '-';
       
  1718                             inShift = 0;
       
  1719                         } else {
       
  1720                             inShift = 0;
       
  1721                         }
       
  1722 
       
  1723                     }
       
  1724                     else {
       
  1725                         *out++ = '-';
       
  1726                         inShift = 0;
       
  1727                     }
       
  1728                 }
       
  1729             }
       
  1730         }
       
  1731     }
       
  1732     if (bitsleft) {
       
  1733         *out++= B64(charsleft << (6-bitsleft) );
       
  1734         *out++ = '-';
       
  1735     }
       
  1736 
       
  1737     _PyString_Resize(&v, out - start);
       
  1738     return v;
       
  1739 }
       
  1740 
       
  1741 #undef SPECIAL   /* the macros used by the UTF-7 functions above are undefined from here on */
       
  1742 #undef B64
       
  1743 #undef B64CHAR
       
  1744 #undef UB64
       
  1745 #undef ENCODE
       
  1746 #undef DECODE
       
  1747 
       
  1748 /* --- UTF-8 Codec -------------------------------------------------------- */
       
  1749 
       
  1750 static
       
  1751 char utf8_code_length[256] = {
       
  1752     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       
  1753        illegal prefix.  see RFC 2279 for details */
       
  1754     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1755     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1756     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1757     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1758     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1759     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1760     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1761     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       
  1762     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1763     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1764     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1765     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
  1766     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       
  1767     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       
  1768     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       
  1769     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
       
  1770 };
       
  1771 
       
  1772 PyObject *PyUnicode_DecodeUTF8(const char *s,
       
  1773 			       Py_ssize_t size,
       
  1774 			       const char *errors)
       
  1775 {
       
  1776     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
       
  1777 }
       
  1778 
       
  1779 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
       
  1780 			                Py_ssize_t size,
       
  1781 			                const char *errors,
       
  1782 			                Py_ssize_t *consumed)
       
  1783 {
       
  1784     const char *starts = s;
       
  1785     int n;
       
  1786     Py_ssize_t startinpos;
       
  1787     Py_ssize_t endinpos;
       
  1788     Py_ssize_t outpos;
       
  1789     const char *e;
       
  1790     PyUnicodeObject *unicode;
       
  1791     Py_UNICODE *p;
       
  1792     const char *errmsg = "";
       
  1793     PyObject *errorHandler = NULL;
       
  1794     PyObject *exc = NULL;
       
  1795 
       
  1796     /* Note: size will always be longer than the resulting Unicode
       
  1797        character count */
       
  1798     unicode = _PyUnicode_New(size);
       
  1799     if (!unicode)
       
  1800         return NULL;
       
  1801     if (size == 0) {
       
  1802         if (consumed)
       
  1803             *consumed = 0;
       
  1804         return (PyObject *)unicode;
       
  1805     }
       
  1806 
       
  1807     /* Unpack UTF-8 encoded data */
       
  1808     p = unicode->str;
       
  1809     e = s + size;
       
  1810 
       
  1811     while (s < e) {
       
  1812         Py_UCS4 ch = (unsigned char)*s;
       
  1813 
       
  1814         if (ch < 0x80) {
       
  1815             *p++ = (Py_UNICODE)ch;
       
  1816             s++;
       
  1817             continue;
       
  1818         }
       
  1819 
       
  1820         n = utf8_code_length[ch];
       
  1821 
       
  1822         if (s + n > e) {
       
  1823 	    if (consumed)
       
  1824 		break;
       
  1825 	    else {
       
  1826 		errmsg = "unexpected end of data";
       
  1827 		startinpos = s-starts;
       
  1828 		endinpos = size;
       
  1829 		goto utf8Error;
       
  1830 	    }
       
  1831 	}
       
  1832 
       
  1833         switch (n) {
       
  1834 
       
  1835         case 0:
       
  1836             errmsg = "unexpected code byte";
       
  1837 	    startinpos = s-starts;
       
  1838 	    endinpos = startinpos+1;
       
  1839 	    goto utf8Error;
       
  1840 
       
  1841         case 1:
       
  1842             errmsg = "internal error";
       
  1843 	    startinpos = s-starts;
       
  1844 	    endinpos = startinpos+1;
       
  1845 	    goto utf8Error;
       
  1846 
       
  1847         case 2:
       
  1848             if ((s[1] & 0xc0) != 0x80) {
       
  1849                 errmsg = "invalid data";
       
  1850 		startinpos = s-starts;
       
  1851 		endinpos = startinpos+2;
       
  1852 		goto utf8Error;
       
  1853 	    }
       
  1854             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
       
  1855             if (ch < 0x80) {
       
  1856 		startinpos = s-starts;
       
  1857 		endinpos = startinpos+2;
       
  1858                 errmsg = "illegal encoding";
       
  1859 		goto utf8Error;
       
  1860 	    }
       
  1861 	    else
       
  1862 		*p++ = (Py_UNICODE)ch;
       
  1863             break;
       
  1864 
       
  1865         case 3:
       
  1866             if ((s[1] & 0xc0) != 0x80 ||
       
  1867                 (s[2] & 0xc0) != 0x80) {
       
  1868                 errmsg = "invalid data";
       
  1869 		startinpos = s-starts;
       
  1870 		endinpos = startinpos+3;
       
  1871 		goto utf8Error;
       
  1872 	    }
       
  1873             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
       
  1874             if (ch < 0x0800) {
       
  1875 		/* Note: UTF-8 encodings of surrogates are considered
       
  1876 		   legal UTF-8 sequences;
       
  1877 
       
  1878 		   XXX For wide builds (UCS-4) we should probably try
       
  1879 		       to recombine the surrogates into a single code
       
  1880 		       unit.
       
  1881 		*/
       
  1882                 errmsg = "illegal encoding";
       
  1883 		startinpos = s-starts;
       
  1884 		endinpos = startinpos+3;
       
  1885 		goto utf8Error;
       
  1886 	    }
       
  1887 	    else
       
  1888 		*p++ = (Py_UNICODE)ch;
       
  1889             break;
       
  1890 
       
  1891         case 4:
       
  1892             if ((s[1] & 0xc0) != 0x80 ||
       
  1893                 (s[2] & 0xc0) != 0x80 ||
       
  1894                 (s[3] & 0xc0) != 0x80) {
       
  1895                 errmsg = "invalid data";
       
  1896 		startinpos = s-starts;
       
  1897 		endinpos = startinpos+4;
       
  1898 		goto utf8Error;
       
  1899 	    }
       
  1900             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
       
  1901                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
       
  1902             /* validate and convert to UTF-16 */
       
  1903             if ((ch < 0x10000)        /* minimum value allowed for 4
       
  1904 					 byte encoding */
       
  1905                 || (ch > 0x10ffff))   /* maximum value allowed for
       
  1906 					 UTF-16 */
       
  1907 	    {
       
  1908                 errmsg = "illegal encoding";
       
  1909 		startinpos = s-starts;
       
  1910 		endinpos = startinpos+4;
       
  1911 		goto utf8Error;
       
  1912 	    }
       
  1913 #ifdef Py_UNICODE_WIDE
       
  1914 	    *p++ = (Py_UNICODE)ch;
       
  1915 #else
       
  1916             /*  compute and append the two surrogates: */
       
  1917 
       
  1918             /*  translate from 10000..10FFFF to 0..FFFF */
       
  1919             ch -= 0x10000;
       
  1920 
       
  1921             /*  high surrogate = top 10 bits added to D800 */
       
  1922             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
       
  1923 
       
  1924             /*  low surrogate = bottom 10 bits added to DC00 */
       
  1925             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
       
  1926 #endif
       
  1927             break;
       
  1928 
       
  1929         default:
       
  1930             /* Other sizes are only needed for UCS-4 */
       
  1931             errmsg = "unsupported Unicode code range";
       
  1932 	    startinpos = s-starts;
       
  1933 	    endinpos = startinpos+n;
       
  1934 	    goto utf8Error;
       
  1935         }
       
  1936         s += n;
       
  1937 	continue;
       
  1938 
       
  1939     utf8Error:
       
  1940     outpos = p-PyUnicode_AS_UNICODE(unicode);
       
  1941     if (unicode_decode_call_errorhandler(
       
  1942 	     errors, &errorHandler,
       
  1943 	     "utf8", errmsg,
       
  1944 	     starts, size, &startinpos, &endinpos, &exc, &s,
       
  1945 	     (PyObject **)&unicode, &outpos, &p))
       
  1946 	goto onError;
       
  1947     }
       
  1948     if (consumed)
       
  1949 	*consumed = s-starts;
       
  1950 
       
  1951     /* Adjust length */
       
  1952     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
       
  1953         goto onError;
       
  1954 
       
  1955     Py_XDECREF(errorHandler);
       
  1956     Py_XDECREF(exc);
       
  1957     return (PyObject *)unicode;
       
  1958 
       
  1959 onError:
       
  1960     Py_XDECREF(errorHandler);
       
  1961     Py_XDECREF(exc);
       
  1962     Py_DECREF(unicode);
       
  1963     return NULL;
       
  1964 }
       
  1965 
       
  1966 /* Allocation strategy:  if the string is short, convert into a stack buffer
       
  1967    and allocate exactly as much space needed at the end.  Else allocate the
       
  1968    maximum possible needed (4 result bytes per Unicode character), and return
       
  1969    the excess memory at the end.
       
  1970 */
       
  1971 PyObject *
       
  1972 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
       
  1973 		     Py_ssize_t size,
       
  1974 		     const char *errors)
       
  1975 {
       
  1976 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
       
  1977 
       
  1978     Py_ssize_t i;           /* index into s of next input byte */
       
  1979     PyObject *v;        /* result string object */
       
  1980     char *p;            /* next free byte in output buffer */
       
  1981     Py_ssize_t nallocated;  /* number of result bytes allocated */
       
  1982     Py_ssize_t nneeded;        /* number of result bytes needed */
       
  1983     char stackbuf[MAX_SHORT_UNICHARS * 4];
       
  1984 
       
  1985     assert(s != NULL);
       
  1986     assert(size >= 0);
       
  1987 
       
  1988     if (size <= MAX_SHORT_UNICHARS) {
       
  1989         /* Write into the stack buffer; nallocated can't overflow.
       
  1990          * At the end, we'll allocate exactly as much heap space as it
       
  1991          * turns out we need.
       
  1992          */
       
  1993         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
       
  1994         v = NULL;   /* will allocate after we're done */
       
  1995         p = stackbuf;
       
  1996     }
       
  1997     else {
       
  1998         /* Overallocate on the heap, and give the excess back at the end. */
       
  1999         nallocated = size * 4;
       
  2000         if (nallocated / 4 != size)  /* overflow! */
       
  2001             return PyErr_NoMemory();
       
  2002         v = PyString_FromStringAndSize(NULL, nallocated);
       
  2003         if (v == NULL)
       
  2004             return NULL;
       
  2005         p = PyString_AS_STRING(v);
       
  2006     }
       
  2007 
       
  2008     for (i = 0; i < size;) {
       
  2009         Py_UCS4 ch = s[i++];
       
  2010 
       
  2011         if (ch < 0x80)
       
  2012             /* Encode ASCII */
       
  2013             *p++ = (char) ch;
       
  2014 
       
  2015         else if (ch < 0x0800) {
       
  2016             /* Encode Latin-1 */
       
  2017             *p++ = (char)(0xc0 | (ch >> 6));
       
  2018             *p++ = (char)(0x80 | (ch & 0x3f));
       
  2019         }
       
  2020         else {
       
  2021             /* Encode UCS2 Unicode ordinals */
       
  2022             if (ch < 0x10000) {
       
  2023                 /* Special case: check for high surrogate */
       
  2024                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
       
  2025                     Py_UCS4 ch2 = s[i];
       
  2026                     /* Check for low surrogate and combine the two to
       
  2027                        form a UCS4 value */
       
  2028                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
       
  2029                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
       
  2030                         i++;
       
  2031                         goto encodeUCS4;
       
  2032                     }
       
  2033                     /* Fall through: handles isolated high surrogates */
       
  2034                 }
       
  2035                 *p++ = (char)(0xe0 | (ch >> 12));
       
  2036                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
       
  2037                 *p++ = (char)(0x80 | (ch & 0x3f));
       
  2038                 continue;
       
  2039     	    }
       
  2040 encodeUCS4:
       
  2041             /* Encode UCS4 Unicode ordinals */
       
  2042             *p++ = (char)(0xf0 | (ch >> 18));
       
  2043             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
       
  2044             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
       
  2045             *p++ = (char)(0x80 | (ch & 0x3f));
       
  2046         }
       
  2047     }
       
  2048 
       
  2049     if (v == NULL) {
       
  2050         /* This was stack allocated. */
       
  2051         nneeded = p - stackbuf;
       
  2052         assert(nneeded <= nallocated);
       
  2053         v = PyString_FromStringAndSize(stackbuf, nneeded);
       
  2054     }
       
  2055     else {
       
  2056     	/* Cut back to size actually needed. */
       
  2057         nneeded = p - PyString_AS_STRING(v);
       
  2058         assert(nneeded <= nallocated);
       
  2059         _PyString_Resize(&v, nneeded);
       
  2060     }
       
  2061     return v;
       
  2062 
       
  2063 #undef MAX_SHORT_UNICHARS
       
  2064 }
       
  2065 
       
  2066 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
       
  2067 {
       
  2068     if (!PyUnicode_Check(unicode)) {
       
  2069         PyErr_BadArgument();
       
  2070         return NULL;
       
  2071     }
       
  2072     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
       
  2073 				PyUnicode_GET_SIZE(unicode),
       
  2074 				NULL);
       
  2075 }
       
  2076 
       
  2077 /* --- UTF-32 Codec ------------------------------------------------------- */
       
  2078 
       
  2079 PyObject *
       
  2080 PyUnicode_DecodeUTF32(const char *s,
       
  2081 		      Py_ssize_t size,
       
  2082 		      const char *errors,
       
  2083 		      int *byteorder)
       
  2084 {
       
  2085     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
       
  2086 }
       
  2087 
       
  2088 PyObject *
       
  2089 PyUnicode_DecodeUTF32Stateful(const char *s,
       
  2090 			      Py_ssize_t size,
       
  2091 			      const char *errors,
       
  2092 			      int *byteorder,
       
  2093 			      Py_ssize_t *consumed)
       
  2094 {
       
  2095     const char *starts = s;
       
  2096     Py_ssize_t startinpos;
       
  2097     Py_ssize_t endinpos;
       
  2098     Py_ssize_t outpos;
       
  2099     PyUnicodeObject *unicode;
       
  2100     Py_UNICODE *p;
       
  2101 #ifndef Py_UNICODE_WIDE
       
  2102     int i, pairs;
       
  2103 #else
       
  2104     const int pairs = 0;
       
  2105 #endif
       
  2106     const unsigned char *q, *e;
       
  2107     int bo = 0;       /* assume native ordering by default */
       
  2108     const char *errmsg = "";
       
  2109     /* Offsets from q for retrieving bytes in the right order. */
       
  2110 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2111     int iorder[] = {0, 1, 2, 3};
       
  2112 #else
       
  2113     int iorder[] = {3, 2, 1, 0};
       
  2114 #endif
       
  2115     PyObject *errorHandler = NULL;
       
  2116     PyObject *exc = NULL;
       
  2117     /* On narrow builds we split characters outside the BMP into two
       
  2118        codepoints => count how much extra space we need. */
       
  2119 #ifndef Py_UNICODE_WIDE
       
  2120     for (i = pairs = 0; i < size/4; i++)
       
  2121 	if (((Py_UCS4 *)s)[i] >= 0x10000)
       
  2122 	    pairs++;
       
  2123 #endif
       
  2124 
       
  2125     /* This might be one to much, because of a BOM */
       
  2126     unicode = _PyUnicode_New((size+3)/4+pairs);
       
  2127     if (!unicode)
       
  2128         return NULL;
       
  2129     if (size == 0)
       
  2130         return (PyObject *)unicode;
       
  2131 
       
  2132     /* Unpack UTF-32 encoded data */
       
  2133     p = unicode->str;
       
  2134     q = (unsigned char *)s;
       
  2135     e = q + size;
       
  2136 
       
  2137     if (byteorder)
       
  2138         bo = *byteorder;
       
  2139 
       
  2140     /* Check for BOM marks (U+FEFF) in the input and adjust current
       
  2141        byte order setting accordingly. In native mode, the leading BOM
       
  2142        mark is skipped, in all other modes, it is copied to the output
       
  2143        stream as-is (giving a ZWNBSP character). */
       
  2144     if (bo == 0) {
       
  2145         if (size >= 4) {
       
  2146             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
       
  2147                                 (q[iorder[1]] << 8) | q[iorder[0]];
       
  2148 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2149 	    if (bom == 0x0000FEFF) {
       
  2150 		q += 4;
       
  2151 		bo = -1;
       
  2152 	    }
       
  2153 	    else if (bom == 0xFFFE0000) {
       
  2154 		q += 4;
       
  2155 		bo = 1;
       
  2156 	    }
       
  2157 #else
       
  2158 	    if (bom == 0x0000FEFF) {
       
  2159 		q += 4;
       
  2160 		bo = 1;
       
  2161 	    }
       
  2162 	    else if (bom == 0xFFFE0000) {
       
  2163 		q += 4;
       
  2164 		bo = -1;
       
  2165 	    }
       
  2166 #endif
       
  2167 	}
       
  2168     }
       
  2169 
       
  2170     if (bo == -1) {
       
  2171         /* force LE */
       
  2172         iorder[0] = 0;
       
  2173         iorder[1] = 1;
       
  2174         iorder[2] = 2;
       
  2175         iorder[3] = 3;
       
  2176     }
       
  2177     else if (bo == 1) {
       
  2178         /* force BE */
       
  2179         iorder[0] = 3;
       
  2180         iorder[1] = 2;
       
  2181         iorder[2] = 1;
       
  2182         iorder[3] = 0;
       
  2183     }
       
  2184 
       
  2185     while (q < e) {
       
  2186 	Py_UCS4 ch;
       
  2187 	/* remaining bytes at the end? (size should be divisible by 4) */
       
  2188 	if (e-q<4) {
       
  2189 	    if (consumed)
       
  2190 		break;
       
  2191 	    errmsg = "truncated data";
       
  2192 	    startinpos = ((const char *)q)-starts;
       
  2193 	    endinpos = ((const char *)e)-starts;
       
  2194 	    goto utf32Error;
       
  2195 	    /* The remaining input chars are ignored if the callback
       
  2196 	       chooses to skip the input */
       
  2197 	}
       
  2198 	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
       
  2199 	     (q[iorder[1]] << 8) | q[iorder[0]];
       
  2200 
       
  2201 	if (ch >= 0x110000)
       
  2202 	{
       
  2203 	    errmsg = "codepoint not in range(0x110000)";
       
  2204 	    startinpos = ((const char *)q)-starts;
       
  2205 	    endinpos = startinpos+4;
       
  2206 	    goto utf32Error;
       
  2207 	}
       
  2208 #ifndef Py_UNICODE_WIDE
       
  2209 	if (ch >= 0x10000)
       
  2210 	{
       
  2211 	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
       
  2212 	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
       
  2213 	}
       
  2214 	else
       
  2215 #endif
       
  2216 	    *p++ = ch;
       
  2217 	q += 4;
       
  2218 	continue;
       
  2219     utf32Error:
       
  2220 	outpos = p-PyUnicode_AS_UNICODE(unicode);
       
  2221     if (unicode_decode_call_errorhandler(
       
  2222          errors, &errorHandler,
       
  2223          "utf32", errmsg,
       
  2224          starts, size, &startinpos, &endinpos, &exc, &s,
       
  2225          (PyObject **)&unicode, &outpos, &p))
       
  2226 	    goto onError;
       
  2227     }
       
  2228 
       
  2229     if (byteorder)
       
  2230         *byteorder = bo;
       
  2231 
       
  2232     if (consumed)
       
  2233 	*consumed = (const char *)q-starts;
       
  2234 
       
  2235     /* Adjust length */
       
  2236     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
       
  2237         goto onError;
       
  2238 
       
  2239     Py_XDECREF(errorHandler);
       
  2240     Py_XDECREF(exc);
       
  2241     return (PyObject *)unicode;
       
  2242 
       
  2243 onError:
       
  2244     Py_DECREF(unicode);
       
  2245     Py_XDECREF(errorHandler);
       
  2246     Py_XDECREF(exc);
       
  2247     return NULL;
       
  2248 }
       
  2249 
       
  2250 PyObject *
       
  2251 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
       
  2252 		      Py_ssize_t size,
       
  2253 		      const char *errors,
       
  2254 		      int byteorder)
       
  2255 {
       
  2256     PyObject *v;
       
  2257     unsigned char *p;
       
  2258     Py_ssize_t nsize, bytesize;
       
  2259 #ifndef Py_UNICODE_WIDE
       
  2260     Py_ssize_t i, pairs;
       
  2261 #else
       
  2262     const int pairs = 0;
       
  2263 #endif
       
  2264     /* Offsets from p for storing byte pairs in the right order. */
       
  2265 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2266     int iorder[] = {0, 1, 2, 3};
       
  2267 #else
       
  2268     int iorder[] = {3, 2, 1, 0};
       
  2269 #endif
       
  2270 
       
  2271 #define STORECHAR(CH)                       \
       
  2272     do {                                    \
       
  2273         p[iorder[3]] = ((CH) >> 24) & 0xff; \
       
  2274         p[iorder[2]] = ((CH) >> 16) & 0xff; \
       
  2275         p[iorder[1]] = ((CH) >> 8) & 0xff;  \
       
  2276         p[iorder[0]] = (CH) & 0xff;         \
       
  2277         p += 4;                             \
       
  2278     } while(0)
       
  2279 
       
  2280     /* In narrow builds we can output surrogate pairs as one codepoint,
       
  2281        so we need less space. */
       
  2282 #ifndef Py_UNICODE_WIDE
       
  2283     for (i = pairs = 0; i < size-1; i++)
       
  2284 	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
       
  2285 	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
       
  2286 	    pairs++;
       
  2287 #endif
       
  2288     nsize = (size - pairs + (byteorder == 0));
       
  2289     bytesize = nsize * 4;
       
  2290     if (bytesize / 4 != nsize)
       
  2291 	return PyErr_NoMemory();
       
  2292     v = PyString_FromStringAndSize(NULL, bytesize);
       
  2293     if (v == NULL)
       
  2294         return NULL;
       
  2295 
       
  2296     p = (unsigned char *)PyString_AS_STRING(v);
       
  2297     if (byteorder == 0)
       
  2298 	STORECHAR(0xFEFF);
       
  2299     if (size == 0)
       
  2300         return v;
       
  2301 
       
  2302     if (byteorder == -1) {
       
  2303         /* force LE */
       
  2304         iorder[0] = 0;
       
  2305         iorder[1] = 1;
       
  2306         iorder[2] = 2;
       
  2307         iorder[3] = 3;
       
  2308     }
       
  2309     else if (byteorder == 1) {
       
  2310         /* force BE */
       
  2311         iorder[0] = 3;
       
  2312         iorder[1] = 2;
       
  2313         iorder[2] = 1;
       
  2314         iorder[3] = 0;
       
  2315     }
       
  2316 
       
  2317     while (size-- > 0) {
       
  2318 	Py_UCS4 ch = *s++;
       
  2319 #ifndef Py_UNICODE_WIDE
       
  2320 	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
       
  2321 	    Py_UCS4 ch2 = *s;
       
  2322 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
       
  2323 		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
       
  2324 		s++;
       
  2325 		size--;
       
  2326 	    }
       
  2327 	}
       
  2328 #endif
       
  2329         STORECHAR(ch);
       
  2330     }
       
  2331     return v;
       
  2332 #undef STORECHAR
       
  2333 }
       
  2334 
       
  2335 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
       
  2336 {
       
  2337     if (!PyUnicode_Check(unicode)) {
       
  2338         PyErr_BadArgument();
       
  2339         return NULL;
       
  2340     }
       
  2341     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
       
  2342 				 PyUnicode_GET_SIZE(unicode),
       
  2343 				 NULL,
       
  2344 				 0);
       
  2345 }
       
  2346 
       
  2347 /* --- UTF-16 Codec ------------------------------------------------------- */
       
  2348 
       
  2349 PyObject *
       
  2350 PyUnicode_DecodeUTF16(const char *s,
       
  2351 		      Py_ssize_t size,
       
  2352 		      const char *errors,
       
  2353 		      int *byteorder)
       
  2354 {
       
  2355     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
       
  2356 }
       
  2357 
       
  2358 PyObject *
       
  2359 PyUnicode_DecodeUTF16Stateful(const char *s,
       
  2360 			      Py_ssize_t size,
       
  2361 			      const char *errors,
       
  2362 			      int *byteorder,
       
  2363 			      Py_ssize_t *consumed)
       
  2364 {
       
  2365     const char *starts = s;
       
  2366     Py_ssize_t startinpos;
       
  2367     Py_ssize_t endinpos;
       
  2368     Py_ssize_t outpos;
       
  2369     PyUnicodeObject *unicode;
       
  2370     Py_UNICODE *p;
       
  2371     const unsigned char *q, *e;
       
  2372     int bo = 0;       /* assume native ordering by default */
       
  2373     const char *errmsg = "";
       
  2374     /* Offsets from q for retrieving byte pairs in the right order. */
       
  2375 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2376     int ihi = 1, ilo = 0;
       
  2377 #else
       
  2378     int ihi = 0, ilo = 1;
       
  2379 #endif
       
  2380     PyObject *errorHandler = NULL;
       
  2381     PyObject *exc = NULL;
       
  2382 
       
  2383     /* Note: size will always be longer than the resulting Unicode
       
  2384        character count */
       
  2385     unicode = _PyUnicode_New(size);
       
  2386     if (!unicode)
       
  2387         return NULL;
       
  2388     if (size == 0)
       
  2389         return (PyObject *)unicode;
       
  2390 
       
  2391     /* Unpack UTF-16 encoded data */
       
  2392     p = unicode->str;
       
  2393     q = (unsigned char *)s;
       
  2394     e = q + size;
       
  2395 
       
  2396     if (byteorder)
       
  2397         bo = *byteorder;
       
  2398 
       
  2399     /* Check for BOM marks (U+FEFF) in the input and adjust current
       
  2400        byte order setting accordingly. In native mode, the leading BOM
       
  2401        mark is skipped, in all other modes, it is copied to the output
       
  2402        stream as-is (giving a ZWNBSP character). */
       
  2403     if (bo == 0) {
       
  2404         if (size >= 2) {
       
  2405             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
       
  2406 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2407 	    if (bom == 0xFEFF) {
       
  2408 		q += 2;
       
  2409 		bo = -1;
       
  2410 	    }
       
  2411 	    else if (bom == 0xFFFE) {
       
  2412 		q += 2;
       
  2413 		bo = 1;
       
  2414 	    }
       
  2415 #else
       
  2416 	    if (bom == 0xFEFF) {
       
  2417 		q += 2;
       
  2418 		bo = 1;
       
  2419 	    }
       
  2420 	    else if (bom == 0xFFFE) {
       
  2421 		q += 2;
       
  2422 		bo = -1;
       
  2423 	    }
       
  2424 #endif
       
  2425 	}
       
  2426     }
       
  2427 
       
  2428     if (bo == -1) {
       
  2429         /* force LE */
       
  2430         ihi = 1;
       
  2431         ilo = 0;
       
  2432     }
       
  2433     else if (bo == 1) {
       
  2434         /* force BE */
       
  2435         ihi = 0;
       
  2436         ilo = 1;
       
  2437     }
       
  2438 
       
  2439     while (q < e) {
       
  2440 	Py_UNICODE ch;
       
  2441 	/* remaining bytes at the end? (size should be even) */
       
  2442 	if (e-q<2) {
       
  2443 	    if (consumed)
       
  2444 		break;
       
  2445 	    errmsg = "truncated data";
       
  2446 	    startinpos = ((const char *)q)-starts;
       
  2447 	    endinpos = ((const char *)e)-starts;
       
  2448 	    goto utf16Error;
       
  2449 	    /* The remaining input chars are ignored if the callback
       
  2450 	       chooses to skip the input */
       
  2451 	}
       
  2452 	ch = (q[ihi] << 8) | q[ilo];
       
  2453 
       
  2454 	q += 2;
       
  2455 
       
  2456 	if (ch < 0xD800 || ch > 0xDFFF) {
       
  2457 	    *p++ = ch;
       
  2458 	    continue;
       
  2459 	}
       
  2460 
       
  2461 	/* UTF-16 code pair: */
       
  2462 	if (q >= e) {
       
  2463 	    errmsg = "unexpected end of data";
       
  2464 	    startinpos = (((const char *)q)-2)-starts;
       
  2465 	    endinpos = ((const char *)e)-starts;
       
  2466 	    goto utf16Error;
       
  2467 	}
       
  2468 	if (0xD800 <= ch && ch <= 0xDBFF) {
       
  2469 	    Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
       
  2470 	    q += 2;
       
  2471 	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
       
  2472 #ifndef Py_UNICODE_WIDE
       
  2473 		*p++ = ch;
       
  2474 		*p++ = ch2;
       
  2475 #else
       
  2476 		*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
       
  2477 #endif
       
  2478 		continue;
       
  2479 	    }
       
  2480 	    else {
       
  2481                 errmsg = "illegal UTF-16 surrogate";
       
  2482 		startinpos = (((const char *)q)-4)-starts;
       
  2483 		endinpos = startinpos+2;
       
  2484 		goto utf16Error;
       
  2485 	    }
       
  2486 
       
  2487 	}
       
  2488 	errmsg = "illegal encoding";
       
  2489 	startinpos = (((const char *)q)-2)-starts;
       
  2490 	endinpos = startinpos+2;
       
  2491 	/* Fall through to report the error */
       
  2492 
       
  2493     utf16Error:
       
  2494 	outpos = p-PyUnicode_AS_UNICODE(unicode);
       
  2495 	if (unicode_decode_call_errorhandler(
       
  2496 	         errors, &errorHandler,
       
  2497 	         "utf16", errmsg,
       
  2498 	         starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
       
  2499 	         (PyObject **)&unicode, &outpos, &p))
       
  2500 	    goto onError;
       
  2501     }
       
  2502 
       
  2503     if (byteorder)
       
  2504         *byteorder = bo;
       
  2505 
       
  2506     if (consumed)
       
  2507 	*consumed = (const char *)q-starts;
       
  2508 
       
  2509     /* Adjust length */
       
  2510     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
       
  2511         goto onError;
       
  2512 
       
  2513     Py_XDECREF(errorHandler);
       
  2514     Py_XDECREF(exc);
       
  2515     return (PyObject *)unicode;
       
  2516 
       
  2517 onError:
       
  2518     Py_DECREF(unicode);
       
  2519     Py_XDECREF(errorHandler);
       
  2520     Py_XDECREF(exc);
       
  2521     return NULL;
       
  2522 }
       
  2523 
       
  2524 PyObject *
       
  2525 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
       
  2526 		      Py_ssize_t size,
       
  2527 		      const char *errors,
       
  2528 		      int byteorder)
       
  2529 {
       
  2530     PyObject *v;
       
  2531     unsigned char *p;
       
  2532     Py_ssize_t nsize, bytesize;
       
  2533 #ifdef Py_UNICODE_WIDE
       
  2534     Py_ssize_t i, pairs;
       
  2535 #else
       
  2536     const int pairs = 0;
       
  2537 #endif
       
  2538     /* Offsets from p for storing byte pairs in the right order. */
       
  2539 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
       
  2540     int ihi = 1, ilo = 0;
       
  2541 #else
       
  2542     int ihi = 0, ilo = 1;
       
  2543 #endif
       
  2544 
       
  2545 #define STORECHAR(CH)                   \
       
  2546     do {                                \
       
  2547         p[ihi] = ((CH) >> 8) & 0xff;    \
       
  2548         p[ilo] = (CH) & 0xff;           \
       
  2549         p += 2;                         \
       
  2550     } while(0)
       
  2551 
       
  2552 #ifdef Py_UNICODE_WIDE
       
  2553     for (i = pairs = 0; i < size; i++)
       
  2554 	if (s[i] >= 0x10000)
       
  2555 	    pairs++;
       
  2556 #endif
       
  2557     /* 2 * (size + pairs + (byteorder == 0)) */
       
  2558     if (size > PY_SSIZE_T_MAX ||
       
  2559         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
       
  2560 	return PyErr_NoMemory();
       
  2561     nsize = size + pairs + (byteorder == 0);
       
  2562     bytesize = nsize * 2;
       
  2563     if (bytesize / 2 != nsize)
       
  2564 	return PyErr_NoMemory();
       
  2565     v = PyString_FromStringAndSize(NULL, bytesize);
       
  2566     if (v == NULL)
       
  2567         return NULL;
       
  2568 
       
  2569     p = (unsigned char *)PyString_AS_STRING(v);
       
  2570     if (byteorder == 0)
       
  2571 	STORECHAR(0xFEFF);
       
  2572     if (size == 0)
       
  2573         return v;
       
  2574 
       
  2575     if (byteorder == -1) {
       
  2576         /* force LE */
       
  2577         ihi = 1;
       
  2578         ilo = 0;
       
  2579     }
       
  2580     else if (byteorder == 1) {
       
  2581         /* force BE */
       
  2582         ihi = 0;
       
  2583         ilo = 1;
       
  2584     }
       
  2585 
       
  2586     while (size-- > 0) {
       
  2587 	Py_UNICODE ch = *s++;
       
  2588 	Py_UNICODE ch2 = 0;
       
  2589 #ifdef Py_UNICODE_WIDE
       
  2590 	if (ch >= 0x10000) {
       
  2591 	    ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
       
  2592 	    ch  = 0xD800 | ((ch-0x10000) >> 10);
       
  2593 	}
       
  2594 #endif
       
  2595         STORECHAR(ch);
       
  2596         if (ch2)
       
  2597             STORECHAR(ch2);
       
  2598     }
       
  2599     return v;
       
  2600 #undef STORECHAR
       
  2601 }
       
  2602 
       
  2603 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
       
  2604 {
       
  2605     if (!PyUnicode_Check(unicode)) {
       
  2606         PyErr_BadArgument();
       
  2607         return NULL;
       
  2608     }
       
  2609     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
       
  2610 				 PyUnicode_GET_SIZE(unicode),
       
  2611 				 NULL,
       
  2612 				 0);
       
  2613 }
       
  2614 
       
  2615 /* --- Unicode Escape Codec ----------------------------------------------- */
       
  2616 
       
  2617 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
       
  2618 
       
  2619 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
       
  2620 					Py_ssize_t size,
       
  2621 					const char *errors)
       
  2622 {
       
  2623     const char *starts = s;
       
  2624     Py_ssize_t startinpos;
       
  2625     Py_ssize_t endinpos;
       
  2626     Py_ssize_t outpos;
       
  2627     int i;
       
  2628     PyUnicodeObject *v;
       
  2629     Py_UNICODE *p;
       
  2630     const char *end;
       
  2631     char* message;
       
  2632     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
       
  2633     PyObject *errorHandler = NULL;
       
  2634     PyObject *exc = NULL;
       
  2635 
       
  2636     /* Escaped strings will always be longer than the resulting
       
  2637        Unicode string, so we start with size here and then reduce the
       
  2638        length after conversion to the true value.
       
  2639        (but if the error callback returns a long replacement string
       
  2640        we'll have to allocate more space) */
       
  2641     v = _PyUnicode_New(size);
       
  2642     if (v == NULL)
       
  2643         goto onError;
       
  2644     if (size == 0)
       
  2645         return (PyObject *)v;
       
  2646 
       
  2647     p = PyUnicode_AS_UNICODE(v);
       
  2648     end = s + size;
       
  2649 
       
  2650     while (s < end) {
       
  2651         unsigned char c;
       
  2652         Py_UNICODE x;
       
  2653         int digits;
       
  2654 
       
  2655         /* Non-escape characters are interpreted as Unicode ordinals */
       
  2656         if (*s != '\\') {
       
  2657             *p++ = (unsigned char) *s++;
       
  2658             continue;
       
  2659         }
       
  2660 
       
  2661         startinpos = s-starts;
       
  2662         /* \ - Escapes */
       
  2663         s++;
       
  2664         c = *s++;
       
  2665         if (s > end)
       
  2666             c = '\0'; /* Invalid after \ */
       
  2667         switch (c) {
       
  2668 
       
  2669         /* \x escapes */
       
  2670         case '\n': break;
       
  2671         case '\\': *p++ = '\\'; break;
       
  2672         case '\'': *p++ = '\''; break;
       
  2673         case '\"': *p++ = '\"'; break;
       
  2674         case 'b': *p++ = '\b'; break;
       
  2675         case 'f': *p++ = '\014'; break; /* FF */
       
  2676         case 't': *p++ = '\t'; break;
       
  2677         case 'n': *p++ = '\n'; break;
       
  2678         case 'r': *p++ = '\r'; break;
       
  2679         case 'v': *p++ = '\013'; break; /* VT */
       
  2680         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
       
  2681 
       
  2682         /* \OOO (octal) escapes */
       
  2683         case '0': case '1': case '2': case '3':
       
  2684         case '4': case '5': case '6': case '7':
       
  2685             x = s[-1] - '0';
       
  2686             if (s < end && '0' <= *s && *s <= '7') {
       
  2687                 x = (x<<3) + *s++ - '0';
       
  2688                 if (s < end && '0' <= *s && *s <= '7')
       
  2689                     x = (x<<3) + *s++ - '0';
       
  2690             }
       
  2691             *p++ = x;
       
  2692             break;
       
  2693 
       
  2694         /* hex escapes */
       
  2695         /* \xXX */
       
  2696         case 'x':
       
  2697             digits = 2;
       
  2698             message = "truncated \\xXX escape";
       
  2699             goto hexescape;
       
  2700 
       
  2701         /* \uXXXX */
       
  2702         case 'u':
       
  2703             digits = 4;
       
  2704             message = "truncated \\uXXXX escape";
       
  2705             goto hexescape;
       
  2706 
       
  2707         /* \UXXXXXXXX */
       
  2708         case 'U':
       
  2709             digits = 8;
       
  2710             message = "truncated \\UXXXXXXXX escape";
       
  2711         hexescape:
       
  2712             chr = 0;
       
  2713             outpos = p-PyUnicode_AS_UNICODE(v);
       
  2714             if (s+digits>end) {
       
  2715                 endinpos = size;
       
  2716                 if (unicode_decode_call_errorhandler(
       
  2717                     errors, &errorHandler,
       
  2718                     "unicodeescape", "end of string in escape sequence",
       
  2719                     starts, size, &startinpos, &endinpos, &exc, &s,
       
  2720                     (PyObject **)&v, &outpos, &p))
       
  2721                     goto onError;
       
  2722                 goto nextByte;
       
  2723             }
       
  2724             for (i = 0; i < digits; ++i) {
       
  2725                 c = (unsigned char) s[i];
       
  2726                 if (!isxdigit(c)) {
       
  2727                     endinpos = (s+i+1)-starts;
       
  2728                     if (unicode_decode_call_errorhandler(
       
  2729                         errors, &errorHandler,
       
  2730                         "unicodeescape", message,
       
  2731                         starts, size, &startinpos, &endinpos, &exc, &s,
       
  2732                         (PyObject **)&v, &outpos, &p))
       
  2733                         goto onError;
       
  2734                     goto nextByte;
       
  2735                 }
       
  2736                 chr = (chr<<4) & ~0xF;
       
  2737                 if (c >= '0' && c <= '9')
       
  2738                     chr += c - '0';
       
  2739                 else if (c >= 'a' && c <= 'f')
       
  2740                     chr += 10 + c - 'a';
       
  2741                 else
       
  2742                     chr += 10 + c - 'A';
       
  2743             }
       
  2744             s += i;
       
  2745             if (chr == 0xffffffff && PyErr_Occurred())
       
  2746                 /* _decoding_error will have already written into the
       
  2747                    target buffer. */
       
  2748                 break;
       
  2749         store:
       
  2750             /* when we get here, chr is a 32-bit unicode character */
       
  2751             if (chr <= 0xffff)
       
  2752                 /* UCS-2 character */
       
  2753                 *p++ = (Py_UNICODE) chr;
       
  2754             else if (chr <= 0x10ffff) {
       
  2755                 /* UCS-4 character. Either store directly, or as
       
  2756                    surrogate pair. */
       
  2757 #ifdef Py_UNICODE_WIDE
       
  2758                 *p++ = chr;
       
  2759 #else
       
  2760                 chr -= 0x10000L;
       
  2761                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
       
  2762                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
       
  2763 #endif
       
  2764             } else {
       
  2765                 endinpos = s-starts;
       
  2766                 outpos = p-PyUnicode_AS_UNICODE(v);
       
  2767                 if (unicode_decode_call_errorhandler(
       
  2768                     errors, &errorHandler,
       
  2769                     "unicodeescape", "illegal Unicode character",
       
  2770                     starts, size, &startinpos, &endinpos, &exc, &s,
       
  2771                     (PyObject **)&v, &outpos, &p))
       
  2772                     goto onError;
       
  2773             }
       
  2774             break;
       
  2775 
       
  2776         /* \N{name} */
       
  2777         case 'N':
       
  2778             message = "malformed \\N character escape";
       
  2779             if (ucnhash_CAPI == NULL) {
       
  2780                 /* load the unicode data module */
       
  2781                 PyObject *m, *api;
       
  2782                 m = PyImport_ImportModuleNoBlock("unicodedata");
       
  2783                 if (m == NULL)
       
  2784                     goto ucnhashError;
       
  2785                 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
       
  2786                 Py_DECREF(m);
       
  2787                 if (api == NULL)
       
  2788                     goto ucnhashError;
       
  2789                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
       
  2790                 Py_DECREF(api);
       
  2791                 if (ucnhash_CAPI == NULL)
       
  2792                     goto ucnhashError;
       
  2793             }
       
  2794             if (*s == '{') {
       
  2795                 const char *start = s+1;
       
  2796                 /* look for the closing brace */
       
  2797                 while (*s != '}' && s < end)
       
  2798                     s++;
       
  2799                 if (s > start && s < end && *s == '}') {
       
  2800                     /* found a name.  look it up in the unicode database */
       
  2801                     message = "unknown Unicode character name";
       
  2802                     s++;
       
  2803                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
       
  2804                         goto store;
       
  2805                 }
       
  2806             }
       
  2807             endinpos = s-starts;
       
  2808             outpos = p-PyUnicode_AS_UNICODE(v);
       
  2809             if (unicode_decode_call_errorhandler(
       
  2810                 errors, &errorHandler,
       
  2811                 "unicodeescape", message,
       
  2812                 starts, size, &startinpos, &endinpos, &exc, &s,
       
  2813                 (PyObject **)&v, &outpos, &p))
       
  2814                 goto onError;
       
  2815             break;
       
  2816 
       
  2817         default:
       
  2818             if (s > end) {
       
  2819                 message = "\\ at end of string";
       
  2820                 s--;
       
  2821                 endinpos = s-starts;
       
  2822                 outpos = p-PyUnicode_AS_UNICODE(v);
       
  2823                 if (unicode_decode_call_errorhandler(
       
  2824                     errors, &errorHandler,
       
  2825                     "unicodeescape", message,
       
  2826                     starts, size, &startinpos, &endinpos, &exc, &s,
       
  2827                     (PyObject **)&v, &outpos, &p))
       
  2828                     goto onError;
       
  2829             }
       
  2830             else {
       
  2831                 *p++ = '\\';
       
  2832                 *p++ = (unsigned char)s[-1];
       
  2833             }
       
  2834             break;
       
  2835         }
       
  2836         nextByte:
       
  2837         ;
       
  2838     }
       
  2839     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
       
  2840         goto onError;
       
  2841     Py_XDECREF(errorHandler);
       
  2842     Py_XDECREF(exc);
       
  2843     return (PyObject *)v;
       
  2844 
       
  2845 ucnhashError:
       
  2846     PyErr_SetString(
       
  2847         PyExc_UnicodeError,
       
  2848         "\\N escapes not supported (can't load unicodedata module)"
       
  2849         );
       
  2850     Py_XDECREF(v);
       
  2851     Py_XDECREF(errorHandler);
       
  2852     Py_XDECREF(exc);
       
  2853     return NULL;
       
  2854 
       
  2855 onError:
       
  2856     Py_XDECREF(v);
       
  2857     Py_XDECREF(errorHandler);
       
  2858     Py_XDECREF(exc);
       
  2859     return NULL;
       
  2860 }
       
  2861 
       
  2862 /* Return a Unicode-Escape string version of the Unicode object.
       
  2863 
       
  2864    If quotes is true, the string is enclosed in u"" or u'' quotes as
       
  2865    appropriate.
       
  2866 
       
  2867 */
       
  2868 
       
  2869 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
       
  2870                                       Py_ssize_t size,
       
  2871                                       Py_UNICODE ch)
       
  2872 {
       
  2873     /* like wcschr, but doesn't stop at NULL characters */
       
  2874 
       
  2875     while (size-- > 0) {
       
  2876         if (*s == ch)
       
  2877             return s;
       
  2878         s++;
       
  2879     }
       
  2880 
       
  2881     return NULL;
       
  2882 }
       
  2883 
       
  2884 static
       
  2885 PyObject *unicodeescape_string(const Py_UNICODE *s,
       
  2886                                Py_ssize_t size,
       
  2887                                int quotes)
       
  2888 {
       
  2889     PyObject *repr;
       
  2890     char *p;
       
  2891 
       
  2892     static const char *hexdigit = "0123456789abcdef";
       
  2893 #ifdef Py_UNICODE_WIDE
       
  2894     const Py_ssize_t expandsize = 10;
       
  2895 #else
       
  2896     const Py_ssize_t expandsize = 6;
       
  2897 #endif
       
  2898 
       
  2899     /* XXX(nnorwitz): rather than over-allocating, it would be
       
  2900        better to choose a different scheme.  Perhaps scan the
       
  2901        first N-chars of the string and allocate based on that size.
       
  2902     */
       
  2903     /* Initial allocation is based on the longest-possible unichr
       
  2904        escape.
       
  2905 
       
  2906        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       
  2907        unichr, so in this case it's the longest unichr escape. In
       
  2908        narrow (UTF-16) builds this is five chars per source unichr
       
  2909        since there are two unichrs in the surrogate pair, so in narrow
       
  2910        (UTF-16) builds it's not the longest unichr escape.
       
  2911 
       
  2912        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       
  2913        so in the narrow (UTF-16) build case it's the longest unichr
       
  2914        escape.
       
  2915     */
       
  2916 
       
  2917     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
       
  2918 	return PyErr_NoMemory();
       
  2919 
       
  2920     repr = PyString_FromStringAndSize(NULL,
       
  2921         2
       
  2922         + expandsize*size
       
  2923         + 1);
       
  2924     if (repr == NULL)
       
  2925         return NULL;
       
  2926 
       
  2927     p = PyString_AS_STRING(repr);
       
  2928 
       
  2929     if (quotes) {
       
  2930         *p++ = 'u';
       
  2931         *p++ = (findchar(s, size, '\'') &&
       
  2932                 !findchar(s, size, '"')) ? '"' : '\'';
       
  2933     }
       
  2934     while (size-- > 0) {
       
  2935         Py_UNICODE ch = *s++;
       
  2936 
       
  2937         /* Escape quotes and backslashes */
       
  2938         if ((quotes &&
       
  2939 	     ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
       
  2940             *p++ = '\\';
       
  2941             *p++ = (char) ch;
       
  2942 	    continue;
       
  2943         }
       
  2944 
       
  2945 #ifdef Py_UNICODE_WIDE
       
  2946         /* Map 21-bit characters to '\U00xxxxxx' */
       
  2947         else if (ch >= 0x10000) {
       
  2948             *p++ = '\\';
       
  2949             *p++ = 'U';
       
  2950             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
       
  2951             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
       
  2952             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
       
  2953             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
       
  2954             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
       
  2955             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
       
  2956             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
       
  2957             *p++ = hexdigit[ch & 0x0000000F];
       
  2958 	    continue;
       
  2959         }
       
  2960 #else
       
  2961 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
       
  2962 	else if (ch >= 0xD800 && ch < 0xDC00) {
       
  2963 	    Py_UNICODE ch2;
       
  2964 	    Py_UCS4 ucs;
       
  2965 
       
  2966 	    ch2 = *s++;
       
  2967 	    size--;
       
  2968 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
       
  2969 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
       
  2970 		*p++ = '\\';
       
  2971 		*p++ = 'U';
       
  2972 		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
       
  2973 		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
       
  2974 		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
       
  2975 		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
       
  2976 		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
       
  2977 		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
       
  2978 		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
       
  2979 		*p++ = hexdigit[ucs & 0x0000000F];
       
  2980 		continue;
       
  2981 	    }
       
  2982 	    /* Fall through: isolated surrogates are copied as-is */
       
  2983 	    s--;
       
  2984 	    size++;
       
  2985 	}
       
  2986 #endif
       
  2987 
       
  2988         /* Map 16-bit characters to '\uxxxx' */
       
  2989         if (ch >= 256) {
       
  2990             *p++ = '\\';
       
  2991             *p++ = 'u';
       
  2992             *p++ = hexdigit[(ch >> 12) & 0x000F];
       
  2993             *p++ = hexdigit[(ch >> 8) & 0x000F];
       
  2994             *p++ = hexdigit[(ch >> 4) & 0x000F];
       
  2995             *p++ = hexdigit[ch & 0x000F];
       
  2996         }
       
  2997 
       
  2998         /* Map special whitespace to '\t', \n', '\r' */
       
  2999         else if (ch == '\t') {
       
  3000             *p++ = '\\';
       
  3001             *p++ = 't';
       
  3002         }
       
  3003         else if (ch == '\n') {
       
  3004             *p++ = '\\';
       
  3005             *p++ = 'n';
       
  3006         }
       
  3007         else if (ch == '\r') {
       
  3008             *p++ = '\\';
       
  3009             *p++ = 'r';
       
  3010         }
       
  3011 
       
  3012         /* Map non-printable US ASCII to '\xhh' */
       
  3013         else if (ch < ' ' || ch >= 0x7F) {
       
  3014             *p++ = '\\';
       
  3015             *p++ = 'x';
       
  3016             *p++ = hexdigit[(ch >> 4) & 0x000F];
       
  3017             *p++ = hexdigit[ch & 0x000F];
       
  3018         }
       
  3019 
       
  3020         /* Copy everything else as-is */
       
  3021         else
       
  3022             *p++ = (char) ch;
       
  3023     }
       
  3024     if (quotes)
       
  3025         *p++ = PyString_AS_STRING(repr)[1];
       
  3026 
       
  3027     *p = '\0';
       
  3028     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
       
  3029     return repr;
       
  3030 }
       
  3031 
       
  3032 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
       
  3033 					Py_ssize_t size)
       
  3034 {
       
  3035     return unicodeescape_string(s, size, 0);
       
  3036 }
       
  3037 
       
  3038 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
       
  3039 {
       
  3040     if (!PyUnicode_Check(unicode)) {
       
  3041         PyErr_BadArgument();
       
  3042         return NULL;
       
  3043     }
       
  3044     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
       
  3045 					 PyUnicode_GET_SIZE(unicode));
       
  3046 }
       
  3047 
       
  3048 /* --- Raw Unicode Escape Codec ------------------------------------------- */
       
  3049 
       
  3050 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
       
  3051 					   Py_ssize_t size,
       
  3052 					   const char *errors)
       
  3053 {
       
  3054     const char *starts = s;
       
  3055     Py_ssize_t startinpos;
       
  3056     Py_ssize_t endinpos;
       
  3057     Py_ssize_t outpos;
       
  3058     PyUnicodeObject *v;
       
  3059     Py_UNICODE *p;
       
  3060     const char *end;
       
  3061     const char *bs;
       
  3062     PyObject *errorHandler = NULL;
       
  3063     PyObject *exc = NULL;
       
  3064 
       
  3065     /* Escaped strings will always be longer than the resulting
       
  3066        Unicode string, so we start with size here and then reduce the
       
  3067        length after conversion to the true value. (But decoding error
       
  3068        handler might have to resize the string) */
       
  3069     v = _PyUnicode_New(size);
       
  3070     if (v == NULL)
       
  3071 	goto onError;
       
  3072     if (size == 0)
       
  3073 	return (PyObject *)v;
       
  3074     p = PyUnicode_AS_UNICODE(v);
       
  3075     end = s + size;
       
  3076     while (s < end) {
       
  3077 	unsigned char c;
       
  3078 	Py_UCS4 x;
       
  3079 	int i;
       
  3080         int count;
       
  3081 
       
  3082 	/* Non-escape characters are interpreted as Unicode ordinals */
       
  3083 	if (*s != '\\') {
       
  3084 	    *p++ = (unsigned char)*s++;
       
  3085 	    continue;
       
  3086 	}
       
  3087 	startinpos = s-starts;
       
  3088 
       
  3089 	/* \u-escapes are only interpreted iff the number of leading
       
  3090 	   backslashes if odd */
       
  3091 	bs = s;
       
  3092 	for (;s < end;) {
       
  3093 	    if (*s != '\\')
       
  3094 		break;
       
  3095 	    *p++ = (unsigned char)*s++;
       
  3096 	}
       
  3097 	if (((s - bs) & 1) == 0 ||
       
  3098 	    s >= end ||
       
  3099 	    (*s != 'u' && *s != 'U')) {
       
  3100 	    continue;
       
  3101 	}
       
  3102 	p--;
       
  3103         count = *s=='u' ? 4 : 8;
       
  3104 	s++;
       
  3105 
       
  3106 	/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
       
  3107 	outpos = p-PyUnicode_AS_UNICODE(v);
       
  3108 	for (x = 0, i = 0; i < count; ++i, ++s) {
       
  3109 	    c = (unsigned char)*s;
       
  3110 	    if (!isxdigit(c)) {
       
  3111 		endinpos = s-starts;
       
  3112 		if (unicode_decode_call_errorhandler(
       
  3113 		    errors, &errorHandler,
       
  3114 		    "rawunicodeescape", "truncated \\uXXXX",
       
  3115 		    starts, size, &startinpos, &endinpos, &exc, &s,
       
  3116 		    (PyObject **)&v, &outpos, &p))
       
  3117 		    goto onError;
       
  3118 		goto nextByte;
       
  3119 	    }
       
  3120 	    x = (x<<4) & ~0xF;
       
  3121 	    if (c >= '0' && c <= '9')
       
  3122 		x += c - '0';
       
  3123 	    else if (c >= 'a' && c <= 'f')
       
  3124 		x += 10 + c - 'a';
       
  3125 	    else
       
  3126 		x += 10 + c - 'A';
       
  3127 	}
       
  3128         if (x <= 0xffff)
       
  3129                 /* UCS-2 character */
       
  3130                 *p++ = (Py_UNICODE) x;
       
  3131         else if (x <= 0x10ffff) {
       
  3132                 /* UCS-4 character. Either store directly, or as
       
  3133                    surrogate pair. */
       
  3134 #ifdef Py_UNICODE_WIDE
       
  3135                 *p++ = (Py_UNICODE) x;
       
  3136 #else
       
  3137                 x -= 0x10000L;
       
  3138                 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
       
  3139                 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
       
  3140 #endif
       
  3141         } else {
       
  3142             endinpos = s-starts;
       
  3143             outpos = p-PyUnicode_AS_UNICODE(v);
       
  3144             if (unicode_decode_call_errorhandler(
       
  3145                     errors, &errorHandler,
       
  3146                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
       
  3147 		    starts, size, &startinpos, &endinpos, &exc, &s,
       
  3148 		    (PyObject **)&v, &outpos, &p))
       
  3149 		    goto onError;
       
  3150         }
       
  3151 	nextByte:
       
  3152 	;
       
  3153     }
       
  3154     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
       
  3155 	goto onError;
       
  3156     Py_XDECREF(errorHandler);
       
  3157     Py_XDECREF(exc);
       
  3158     return (PyObject *)v;
       
  3159 
       
  3160  onError:
       
  3161     Py_XDECREF(v);
       
  3162     Py_XDECREF(errorHandler);
       
  3163     Py_XDECREF(exc);
       
  3164     return NULL;
       
  3165 }
       
  3166 
       
  3167 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
       
  3168 					   Py_ssize_t size)
       
  3169 {
       
  3170     PyObject *repr;
       
  3171     char *p;
       
  3172     char *q;
       
  3173 
       
  3174     static const char *hexdigit = "0123456789abcdef";
       
  3175 #ifdef Py_UNICODE_WIDE
       
  3176     const Py_ssize_t expandsize = 10;
       
  3177 #else
       
  3178     const Py_ssize_t expandsize = 6;
       
  3179 #endif
       
  3180     
       
  3181     if (size > PY_SSIZE_T_MAX / expandsize)
       
  3182 	return PyErr_NoMemory();
       
  3183     
       
  3184     repr = PyString_FromStringAndSize(NULL, expandsize * size);
       
  3185     if (repr == NULL)
       
  3186         return NULL;
       
  3187     if (size == 0)
       
  3188 	return repr;
       
  3189 
       
  3190     p = q = PyString_AS_STRING(repr);
       
  3191     while (size-- > 0) {
       
  3192         Py_UNICODE ch = *s++;
       
  3193 #ifdef Py_UNICODE_WIDE
       
  3194 	/* Map 32-bit characters to '\Uxxxxxxxx' */
       
  3195 	if (ch >= 0x10000) {
       
  3196             *p++ = '\\';
       
  3197             *p++ = 'U';
       
  3198             *p++ = hexdigit[(ch >> 28) & 0xf];
       
  3199             *p++ = hexdigit[(ch >> 24) & 0xf];
       
  3200             *p++ = hexdigit[(ch >> 20) & 0xf];
       
  3201             *p++ = hexdigit[(ch >> 16) & 0xf];
       
  3202             *p++ = hexdigit[(ch >> 12) & 0xf];
       
  3203             *p++ = hexdigit[(ch >> 8) & 0xf];
       
  3204             *p++ = hexdigit[(ch >> 4) & 0xf];
       
  3205             *p++ = hexdigit[ch & 15];
       
  3206         }
       
  3207         else
       
  3208 #else
       
  3209 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
       
  3210 	if (ch >= 0xD800 && ch < 0xDC00) {
       
  3211 	    Py_UNICODE ch2;
       
  3212 	    Py_UCS4 ucs;
       
  3213 
       
  3214 	    ch2 = *s++;
       
  3215 	    size--;
       
  3216 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
       
  3217 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
       
  3218 		*p++ = '\\';
       
  3219 		*p++ = 'U';
       
  3220 		*p++ = hexdigit[(ucs >> 28) & 0xf];
       
  3221 		*p++ = hexdigit[(ucs >> 24) & 0xf];
       
  3222 		*p++ = hexdigit[(ucs >> 20) & 0xf];
       
  3223 		*p++ = hexdigit[(ucs >> 16) & 0xf];
       
  3224 		*p++ = hexdigit[(ucs >> 12) & 0xf];
       
  3225 		*p++ = hexdigit[(ucs >> 8) & 0xf];
       
  3226 		*p++ = hexdigit[(ucs >> 4) & 0xf];
       
  3227 		*p++ = hexdigit[ucs & 0xf];
       
  3228 		continue;
       
  3229 	    }
       
  3230 	    /* Fall through: isolated surrogates are copied as-is */
       
  3231 	    s--;
       
  3232 	    size++;
       
  3233 	}
       
  3234 #endif
       
  3235 	/* Map 16-bit characters to '\uxxxx' */
       
  3236 	if (ch >= 256) {
       
  3237             *p++ = '\\';
       
  3238             *p++ = 'u';
       
  3239             *p++ = hexdigit[(ch >> 12) & 0xf];
       
  3240             *p++ = hexdigit[(ch >> 8) & 0xf];
       
  3241             *p++ = hexdigit[(ch >> 4) & 0xf];
       
  3242             *p++ = hexdigit[ch & 15];
       
  3243         }
       
  3244 	/* Copy everything else as-is */
       
  3245 	else
       
  3246             *p++ = (char) ch;
       
  3247     }
       
  3248     *p = '\0';
       
  3249     _PyString_Resize(&repr, p - q);
       
  3250     return repr;
       
  3251 }
       
  3252 
       
  3253 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
       
  3254 {
       
  3255     if (!PyUnicode_Check(unicode)) {
       
  3256 	PyErr_BadArgument();
       
  3257 	return NULL;
       
  3258     }
       
  3259     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
       
  3260 					    PyUnicode_GET_SIZE(unicode));
       
  3261 }
       
  3262 
       
  3263 /* --- Unicode Internal Codec ------------------------------------------- */
       
  3264 
       
  3265 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
       
  3266 					   Py_ssize_t size,
       
  3267 					   const char *errors)
       
  3268 {
       
  3269     const char *starts = s;
       
  3270     Py_ssize_t startinpos;
       
  3271     Py_ssize_t endinpos;
       
  3272     Py_ssize_t outpos;
       
  3273     PyUnicodeObject *v;
       
  3274     Py_UNICODE *p;
       
  3275     const char *end;
       
  3276     const char *reason;
       
  3277     PyObject *errorHandler = NULL;
       
  3278     PyObject *exc = NULL;
       
  3279 
       
  3280 #ifdef Py_UNICODE_WIDE
       
  3281     Py_UNICODE unimax = PyUnicode_GetMax();
       
  3282 #endif
       
  3283 
       
  3284     /* XXX overflow detection missing */
       
  3285     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
       
  3286     if (v == NULL)
       
  3287 	goto onError;
       
  3288     if (PyUnicode_GetSize((PyObject *)v) == 0)
       
  3289 	return (PyObject *)v;
       
  3290     p = PyUnicode_AS_UNICODE(v);
       
  3291     end = s + size;
       
  3292 
       
  3293     while (s < end) {
       
  3294         memcpy(p, s, sizeof(Py_UNICODE));
       
  3295         /* We have to sanity check the raw data, otherwise doom looms for
       
  3296            some malformed UCS-4 data. */
       
  3297         if (
       
  3298             #ifdef Py_UNICODE_WIDE
       
  3299             *p > unimax || *p < 0 ||
       
  3300             #endif
       
  3301             end-s < Py_UNICODE_SIZE
       
  3302             )
       
  3303             {
       
  3304             startinpos = s - starts;
       
  3305             if (end-s < Py_UNICODE_SIZE) {
       
  3306                 endinpos = end-starts;
       
  3307                 reason = "truncated input";
       
  3308             }
       
  3309             else {
       
  3310                 endinpos = s - starts + Py_UNICODE_SIZE;
       
  3311                 reason = "illegal code point (> 0x10FFFF)";
       
  3312             }
       
  3313             outpos = p - PyUnicode_AS_UNICODE(v);
       
  3314             if (unicode_decode_call_errorhandler(
       
  3315                     errors, &errorHandler,
       
  3316                     "unicode_internal", reason,
       
  3317                     starts, size, &startinpos, &endinpos, &exc, &s,
       
  3318                     (PyObject **)&v, &outpos, &p)) {
       
  3319                 goto onError;
       
  3320             }
       
  3321         }
       
  3322         else {
       
  3323             p++;
       
  3324             s += Py_UNICODE_SIZE;
       
  3325         }
       
  3326     }
       
  3327 
       
  3328     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
       
  3329         goto onError;
       
  3330     Py_XDECREF(errorHandler);
       
  3331     Py_XDECREF(exc);
       
  3332     return (PyObject *)v;
       
  3333 
       
  3334  onError:
       
  3335     Py_XDECREF(v);
       
  3336     Py_XDECREF(errorHandler);
       
  3337     Py_XDECREF(exc);
       
  3338     return NULL;
       
  3339 }
       
  3340 
       
  3341 /* --- Latin-1 Codec ------------------------------------------------------ */
       
  3342 
       
  3343 PyObject *PyUnicode_DecodeLatin1(const char *s,
       
  3344 				 Py_ssize_t size,
       
  3345 				 const char *errors)
       
  3346 {
       
  3347     PyUnicodeObject *v;
       
  3348     Py_UNICODE *p;
       
  3349 
       
  3350     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
       
  3351     if (size == 1) {
       
  3352 	Py_UNICODE r = *(unsigned char*)s;
       
  3353 	return PyUnicode_FromUnicode(&r, 1);
       
  3354     }
       
  3355 
       
  3356     v = _PyUnicode_New(size);
       
  3357     if (v == NULL)
       
  3358 	goto onError;
       
  3359     if (size == 0)
       
  3360 	return (PyObject *)v;
       
  3361     p = PyUnicode_AS_UNICODE(v);
       
  3362     while (size-- > 0)
       
  3363 	*p++ = (unsigned char)*s++;
       
  3364     return (PyObject *)v;
       
  3365 
       
  3366  onError:
       
  3367     Py_XDECREF(v);
       
  3368     return NULL;
       
  3369 }
       
  3370 
       
  3371 /* create or adjust a UnicodeEncodeError */
       
  3372 static void make_encode_exception(PyObject **exceptionObject,
       
  3373     const char *encoding,
       
  3374     const Py_UNICODE *unicode, Py_ssize_t size,
       
  3375     Py_ssize_t startpos, Py_ssize_t endpos,
       
  3376     const char *reason)
       
  3377 {
       
  3378     if (*exceptionObject == NULL) {
       
  3379 	*exceptionObject = PyUnicodeEncodeError_Create(
       
  3380 	    encoding, unicode, size, startpos, endpos, reason);
       
  3381     }
       
  3382     else {
       
  3383 	if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
       
  3384 	    goto onError;
       
  3385 	if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
       
  3386 	    goto onError;
       
  3387 	if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
       
  3388 	    goto onError;
       
  3389 	return;
       
  3390 	onError:
       
  3391 	Py_DECREF(*exceptionObject);
       
  3392 	*exceptionObject = NULL;
       
  3393     }
       
  3394 }
       
  3395 
       
  3396 /* raises a UnicodeEncodeError */
       
  3397 static void raise_encode_exception(PyObject **exceptionObject,
       
  3398     const char *encoding,
       
  3399     const Py_UNICODE *unicode, Py_ssize_t size,
       
  3400     Py_ssize_t startpos, Py_ssize_t endpos,
       
  3401     const char *reason)
       
  3402 {
       
  3403     make_encode_exception(exceptionObject,
       
  3404 	encoding, unicode, size, startpos, endpos, reason);
       
  3405     if (*exceptionObject != NULL)
       
  3406 	PyCodec_StrictErrors(*exceptionObject);
       
  3407 }
       
  3408 
       
  3409 /* error handling callback helper:
       
  3410    build arguments, call the callback and check the arguments,
       
  3411    put the result into newpos and return the replacement string, which
       
  3412    has to be freed by the caller */
       
  3413 static PyObject *unicode_encode_call_errorhandler(const char *errors,
       
  3414     PyObject **errorHandler,
       
  3415     const char *encoding, const char *reason,
       
  3416     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       
  3417     Py_ssize_t startpos, Py_ssize_t endpos,
       
  3418     Py_ssize_t *newpos)
       
  3419 {
       
  3420     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
       
  3421 
       
  3422     PyObject *restuple;
       
  3423     PyObject *resunicode;
       
  3424 
       
  3425     if (*errorHandler == NULL) {
       
  3426 	*errorHandler = PyCodec_LookupError(errors);
       
  3427         if (*errorHandler == NULL)
       
  3428 	    return NULL;
       
  3429     }
       
  3430 
       
  3431     make_encode_exception(exceptionObject,
       
  3432 	encoding, unicode, size, startpos, endpos, reason);
       
  3433     if (*exceptionObject == NULL)
       
  3434 	return NULL;
       
  3435 
       
  3436     restuple = PyObject_CallFunctionObjArgs(
       
  3437 	*errorHandler, *exceptionObject, NULL);
       
  3438     if (restuple == NULL)
       
  3439 	return NULL;
       
  3440     if (!PyTuple_Check(restuple)) {
       
  3441 	PyErr_Format(PyExc_TypeError, &argparse[4]);
       
  3442 	Py_DECREF(restuple);
       
  3443 	return NULL;
       
  3444     }
       
  3445     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
       
  3446 	&resunicode, newpos)) {
       
  3447 	Py_DECREF(restuple);
       
  3448 	return NULL;
       
  3449     }
       
  3450     if (*newpos<0)
       
  3451 	*newpos = size+*newpos;
       
  3452     if (*newpos<0 || *newpos>size) {
       
  3453 	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
       
  3454 	Py_DECREF(restuple);
       
  3455 	return NULL;
       
  3456     }
       
  3457     Py_INCREF(resunicode);
       
  3458     Py_DECREF(restuple);
       
  3459     return resunicode;
       
  3460 }
       
  3461 
       
  3462 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
       
  3463 				 Py_ssize_t size,
       
  3464 				 const char *errors,
       
  3465 				 int limit)
       
  3466 {
       
  3467     /* output object */
       
  3468     PyObject *res;
       
  3469     /* pointers to the beginning and end+1 of input */
       
  3470     const Py_UNICODE *startp = p;
       
  3471     const Py_UNICODE *endp = p + size;
       
  3472     /* pointer to the beginning of the unencodable characters */
       
  3473     /* const Py_UNICODE *badp = NULL; */
       
  3474     /* pointer into the output */
       
  3475     char *str;
       
  3476     /* current output position */
       
  3477     Py_ssize_t respos = 0;
       
  3478     Py_ssize_t ressize;
       
  3479     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
       
  3480     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
       
  3481     PyObject *errorHandler = NULL;
       
  3482     PyObject *exc = NULL;
       
  3483     /* the following variable is used for caching string comparisons
       
  3484      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
       
  3485     int known_errorHandler = -1;
       
  3486 
       
  3487     /* allocate enough for a simple encoding without
       
  3488        replacements, if we need more, we'll resize */
       
  3489     res = PyString_FromStringAndSize(NULL, size);
       
  3490     if (res == NULL)
       
  3491         goto onError;
       
  3492     if (size == 0)
       
  3493 	return res;
       
  3494     str = PyString_AS_STRING(res);
       
  3495     ressize = size;
       
  3496 
       
  3497     while (p<endp) {
       
  3498 	Py_UNICODE c = *p;
       
  3499 
       
  3500 	/* can we encode this? */
       
  3501 	if (c<limit) {
       
  3502 	    /* no overflow check, because we know that the space is enough */
       
  3503 	    *str++ = (char)c;
       
  3504 	    ++p;
       
  3505 	}
       
  3506 	else {
       
  3507 	    Py_ssize_t unicodepos = p-startp;
       
  3508 	    Py_ssize_t requiredsize;
       
  3509 	    PyObject *repunicode;
       
  3510 	    Py_ssize_t repsize;
       
  3511 	    Py_ssize_t newpos;
       
  3512 	    Py_ssize_t respos;
       
  3513 	    Py_UNICODE *uni2;
       
  3514 	    /* startpos for collecting unencodable chars */
       
  3515 	    const Py_UNICODE *collstart = p;
       
  3516 	    const Py_UNICODE *collend = p;
       
  3517 	    /* find all unecodable characters */
       
  3518 	    while ((collend < endp) && ((*collend)>=limit))
       
  3519 		++collend;
       
  3520 	    /* cache callback name lookup (if not done yet, i.e. it's the first error) */
       
  3521 	    if (known_errorHandler==-1) {
       
  3522 		if ((errors==NULL) || (!strcmp(errors, "strict")))
       
  3523 		    known_errorHandler = 1;
       
  3524 		else if (!strcmp(errors, "replace"))
       
  3525 		    known_errorHandler = 2;
       
  3526 		else if (!strcmp(errors, "ignore"))
       
  3527 		    known_errorHandler = 3;
       
  3528 		else if (!strcmp(errors, "xmlcharrefreplace"))
       
  3529 		    known_errorHandler = 4;
       
  3530 		else
       
  3531 		    known_errorHandler = 0;
       
  3532 	    }
       
  3533 	    switch (known_errorHandler) {
       
  3534 		case 1: /* strict */
       
  3535 		    raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
       
  3536 		    goto onError;
       
  3537 		case 2: /* replace */
       
  3538 		    while (collstart++<collend)
       
  3539 			*str++ = '?'; /* fall through */
       
  3540 		case 3: /* ignore */
       
  3541 		    p = collend;
       
  3542 		    break;
       
  3543 		case 4: /* xmlcharrefreplace */
       
  3544 		    respos = str-PyString_AS_STRING(res);
       
  3545 		    /* determine replacement size (temporarily (mis)uses p) */
       
  3546 		    for (p = collstart, repsize = 0; p < collend; ++p) {
       
  3547 			if (*p<10)
       
  3548 			    repsize += 2+1+1;
       
  3549 			else if (*p<100)
       
  3550 			    repsize += 2+2+1;
       
  3551 			else if (*p<1000)
       
  3552 			    repsize += 2+3+1;
       
  3553 			else if (*p<10000)
       
  3554 			    repsize += 2+4+1;
       
  3555 #ifndef Py_UNICODE_WIDE
       
  3556 			else
       
  3557 			    repsize += 2+5+1;
       
  3558 #else
       
  3559 			else if (*p<100000)
       
  3560 			    repsize += 2+5+1;
       
  3561 			else if (*p<1000000)
       
  3562 			    repsize += 2+6+1;
       
  3563 			else
       
  3564 			    repsize += 2+7+1;
       
  3565 #endif
       
  3566 		    }
       
  3567 		    requiredsize = respos+repsize+(endp-collend);
       
  3568 		    if (requiredsize > ressize) {
       
  3569 			if (requiredsize<2*ressize)
       
  3570 			    requiredsize = 2*ressize;
       
  3571 			if (_PyString_Resize(&res, requiredsize))
       
  3572 			    goto onError;
       
  3573 			str = PyString_AS_STRING(res) + respos;
       
  3574 			ressize = requiredsize;
       
  3575 		    }
       
  3576 		    /* generate replacement (temporarily (mis)uses p) */
       
  3577 		    for (p = collstart; p < collend; ++p) {
       
  3578 			str += sprintf(str, "&#%d;", (int)*p);
       
  3579 		    }
       
  3580 		    p = collend;
       
  3581 		    break;
       
  3582 		default:
       
  3583 		    repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
       
  3584 			encoding, reason, startp, size, &exc,
       
  3585 			collstart-startp, collend-startp, &newpos);
       
  3586 		    if (repunicode == NULL)
       
  3587 			goto onError;
       
  3588 		    /* need more space? (at least enough for what we
       
  3589 		       have+the replacement+the rest of the string, so
       
  3590 		       we won't have to check space for encodable characters) */
       
  3591 		    respos = str-PyString_AS_STRING(res);
       
  3592 		    repsize = PyUnicode_GET_SIZE(repunicode);
       
  3593 		    requiredsize = respos+repsize+(endp-collend);
       
  3594 		    if (requiredsize > ressize) {
       
  3595 			if (requiredsize<2*ressize)
       
  3596 			    requiredsize = 2*ressize;
       
  3597 			if (_PyString_Resize(&res, requiredsize)) {
       
  3598 			    Py_DECREF(repunicode);
       
  3599 			    goto onError;
       
  3600 			}
       
  3601 			str = PyString_AS_STRING(res) + respos;
       
  3602 			ressize = requiredsize;
       
  3603 		    }
       
  3604 		    /* check if there is anything unencodable in the replacement
       
  3605 		       and copy it to the output */
       
  3606 		    for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
       
  3607 			c = *uni2;
       
  3608 			if (c >= limit) {
       
  3609 			    raise_encode_exception(&exc, encoding, startp, size,
       
  3610 				unicodepos, unicodepos+1, reason);
       
  3611 			    Py_DECREF(repunicode);
       
  3612 			    goto onError;
       
  3613 			}
       
  3614 			*str = (char)c;
       
  3615 		    }
       
  3616 		    p = startp + newpos;
       
  3617 		    Py_DECREF(repunicode);
       
  3618 	    }
       
  3619 	}
       
  3620     }
       
  3621     /* Resize if we allocated to much */
       
  3622     respos = str-PyString_AS_STRING(res);
       
  3623     if (respos<ressize)
       
  3624        /* If this falls res will be NULL */
       
  3625 	_PyString_Resize(&res, respos);
       
  3626     Py_XDECREF(errorHandler);
       
  3627     Py_XDECREF(exc);
       
  3628     return res;
       
  3629 
       
  3630     onError:
       
  3631     Py_XDECREF(res);
       
  3632     Py_XDECREF(errorHandler);
       
  3633     Py_XDECREF(exc);
       
  3634     return NULL;
       
  3635 }
       
  3636 
       
  3637 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
       
  3638 				 Py_ssize_t size,
       
  3639 				 const char *errors)
       
  3640 {
       
  3641     return unicode_encode_ucs1(p, size, errors, 256);
       
  3642 }
       
  3643 
       
  3644 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
       
  3645 {
       
  3646     if (!PyUnicode_Check(unicode)) {
       
  3647 	PyErr_BadArgument();
       
  3648 	return NULL;
       
  3649     }
       
  3650     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
       
  3651 				  PyUnicode_GET_SIZE(unicode),
       
  3652 				  NULL);
       
  3653 }
       
  3654 
       
  3655 /* --- 7-bit ASCII Codec -------------------------------------------------- */
       
  3656 
       
  3657 PyObject *PyUnicode_DecodeASCII(const char *s,
       
  3658 				Py_ssize_t size,
       
  3659 				const char *errors)
       
  3660 {
       
  3661     const char *starts = s;
       
  3662     PyUnicodeObject *v;
       
  3663     Py_UNICODE *p;
       
  3664     Py_ssize_t startinpos;
       
  3665     Py_ssize_t endinpos;
       
  3666     Py_ssize_t outpos;
       
  3667     const char *e;
       
  3668     PyObject *errorHandler = NULL;
       
  3669     PyObject *exc = NULL;
       
  3670 
       
  3671     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
       
  3672     if (size == 1 && *(unsigned char*)s < 128) {
       
  3673 	Py_UNICODE r = *(unsigned char*)s;
       
  3674 	return PyUnicode_FromUnicode(&r, 1);
       
  3675     }
       
  3676 
       
  3677     v = _PyUnicode_New(size);
       
  3678     if (v == NULL)
       
  3679 	goto onError;
       
  3680     if (size == 0)
       
  3681 	return (PyObject *)v;
       
  3682     p = PyUnicode_AS_UNICODE(v);
       
  3683     e = s + size;
       
  3684     while (s < e) {
       
  3685 	register unsigned char c = (unsigned char)*s;
       
  3686 	if (c < 128) {
       
  3687 	    *p++ = c;
       
  3688 	    ++s;
       
  3689 	}
       
  3690 	else {
       
  3691 	    startinpos = s-starts;
       
  3692 	    endinpos = startinpos + 1;
       
  3693 	    outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
       
  3694 	    if (unicode_decode_call_errorhandler(
       
  3695 		 errors, &errorHandler,
       
  3696 		 "ascii", "ordinal not in range(128)",
       
  3697 		 starts, size, &startinpos, &endinpos, &exc, &s,
       
  3698 		 (PyObject **)&v, &outpos, &p))
       
  3699 		goto onError;
       
  3700 	}
       
  3701     }
       
  3702     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
       
  3703 	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
       
  3704 	    goto onError;
       
  3705     Py_XDECREF(errorHandler);
       
  3706     Py_XDECREF(exc);
       
  3707     return (PyObject *)v;
       
  3708 
       
  3709  onError:
       
  3710     Py_XDECREF(v);
       
  3711     Py_XDECREF(errorHandler);
       
  3712     Py_XDECREF(exc);
       
  3713     return NULL;
       
  3714 }
       
  3715 
       
  3716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
       
  3717 				Py_ssize_t size,
       
  3718 				const char *errors)
       
  3719 {
       
  3720     return unicode_encode_ucs1(p, size, errors, 128);
       
  3721 }
       
  3722 
       
  3723 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
       
  3724 {
       
  3725     if (!PyUnicode_Check(unicode)) {
       
  3726 	PyErr_BadArgument();
       
  3727 	return NULL;
       
  3728     }
       
  3729     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
       
  3730 				 PyUnicode_GET_SIZE(unicode),
       
  3731 				 NULL);
       
  3732 }
       
  3733 
       
  3734 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
       
  3735 
       
  3736 /* --- MBCS codecs for Windows -------------------------------------------- */
       
  3737 
       
  3738 #if SIZEOF_INT < SIZEOF_SSIZE_T
       
  3739 #define NEED_RETRY
       
  3740 #endif
       
  3741 
       
  3742 /* XXX This code is limited to "true" double-byte encodings, as
       
  3743    a) it assumes an incomplete character consists of a single byte, and
       
  3744    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
       
  3745       encodings, see IsDBCSLeadByteEx documentation. */
       
  3746 
       
  3747 static int is_dbcs_lead_byte(const char *s, int offset)
       
  3748 {
       
  3749     const char *curr = s + offset;
       
  3750 
       
  3751     if (IsDBCSLeadByte(*curr)) {
       
  3752 	const char *prev = CharPrev(s, curr);
       
  3753 	return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
       
  3754     }
       
  3755     return 0;
       
  3756 }
       
  3757 
       
  3758 /*
       
  3759  * Decode MBCS string into unicode object. If 'final' is set, converts
       
  3760  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
       
  3761  */
       
  3762 static int decode_mbcs(PyUnicodeObject **v,
       
  3763 			const char *s, /* MBCS string */
       
  3764 			int size, /* sizeof MBCS string */
       
  3765 			int final)
       
  3766 {
       
  3767     Py_UNICODE *p;
       
  3768     Py_ssize_t n = 0;
       
  3769     int usize = 0;
       
  3770 
       
  3771     assert(size >= 0);
       
  3772 
       
  3773     /* Skip trailing lead-byte unless 'final' is set */
       
  3774     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
       
  3775 	--size;
       
  3776 
       
  3777     /* First get the size of the result */
       
  3778     if (size > 0) {
       
  3779 	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
       
  3780 	if (usize == 0) {
       
  3781 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
       
  3782 	    return -1;
       
  3783 	}
       
  3784     }
       
  3785 
       
  3786     if (*v == NULL) {
       
  3787 	/* Create unicode object */
       
  3788 	*v = _PyUnicode_New(usize);
       
  3789 	if (*v == NULL)
       
  3790 	    return -1;
       
  3791     }
       
  3792     else {
       
  3793 	/* Extend unicode object */
       
  3794 	n = PyUnicode_GET_SIZE(*v);
       
  3795 	if (_PyUnicode_Resize(v, n + usize) < 0)
       
  3796 	    return -1;
       
  3797     }
       
  3798 
       
  3799     /* Do the conversion */
       
  3800     if (size > 0) {
       
  3801 	p = PyUnicode_AS_UNICODE(*v) + n;
       
  3802 	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
       
  3803 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
       
  3804 	    return -1;
       
  3805 	}
       
  3806     }
       
  3807 
       
  3808     return size;
       
  3809 }
       
  3810 
       
  3811 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
       
  3812 					Py_ssize_t size,
       
  3813 					const char *errors,
       
  3814 					Py_ssize_t *consumed)
       
  3815 {
       
  3816     PyUnicodeObject *v = NULL;
       
  3817     int done;
       
  3818 
       
  3819     if (consumed)
       
  3820 	*consumed = 0;
       
  3821 
       
  3822 #ifdef NEED_RETRY
       
  3823   retry:
       
  3824     if (size > INT_MAX)
       
  3825 	done = decode_mbcs(&v, s, INT_MAX, 0);
       
  3826     else
       
  3827 #endif
       
  3828 	done = decode_mbcs(&v, s, (int)size, !consumed);
       
  3829 
       
  3830     if (done < 0) {
       
  3831         Py_XDECREF(v);
       
  3832 	return NULL;
       
  3833     }
       
  3834 
       
  3835     if (consumed)
       
  3836 	*consumed += done;
       
  3837 
       
  3838 #ifdef NEED_RETRY
       
  3839     if (size > INT_MAX) {
       
  3840 	s += done;
       
  3841 	size -= done;
       
  3842 	goto retry;
       
  3843     }
       
  3844 #endif
       
  3845 
       
  3846     return (PyObject *)v;
       
  3847 }
       
  3848 
       
  3849 PyObject *PyUnicode_DecodeMBCS(const char *s,
       
  3850 				Py_ssize_t size,
       
  3851 				const char *errors)
       
  3852 {
       
  3853     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
       
  3854 }
       
  3855 
       
  3856 /*
       
  3857  * Convert unicode into string object (MBCS).
       
  3858  * Returns 0 if succeed, -1 otherwise.
       
  3859  */
       
  3860 static int encode_mbcs(PyObject **repr,
       
  3861 			const Py_UNICODE *p, /* unicode */
       
  3862 			int size) /* size of unicode */
       
  3863 {
       
  3864     int mbcssize = 0;
       
  3865     Py_ssize_t n = 0;
       
  3866 
       
  3867     assert(size >= 0);
       
  3868 
       
  3869     /* First get the size of the result */
       
  3870     if (size > 0) {
       
  3871 	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
       
  3872 	if (mbcssize == 0) {
       
  3873 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
       
  3874 	    return -1;
       
  3875 	}
       
  3876     }
       
  3877 
       
  3878     if (*repr == NULL) {
       
  3879 	/* Create string object */
       
  3880 	*repr = PyString_FromStringAndSize(NULL, mbcssize);
       
  3881 	if (*repr == NULL)
       
  3882 	    return -1;
       
  3883     }
       
  3884     else {
       
  3885 	/* Extend string object */
       
  3886 	n = PyString_Size(*repr);
       
  3887 	if (_PyString_Resize(repr, n + mbcssize) < 0)
       
  3888 	    return -1;
       
  3889     }
       
  3890 
       
  3891     /* Do the conversion */
       
  3892     if (size > 0) {
       
  3893 	char *s = PyString_AS_STRING(*repr) + n;
       
  3894 	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
       
  3895 	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
       
  3896 	    return -1;
       
  3897 	}
       
  3898     }
       
  3899 
       
  3900     return 0;
       
  3901 }
       
  3902 
       
  3903 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
       
  3904 				Py_ssize_t size,
       
  3905 				const char *errors)
       
  3906 {
       
  3907     PyObject *repr = NULL;
       
  3908     int ret;
       
  3909 
       
  3910 #ifdef NEED_RETRY
       
  3911  retry:
       
  3912     if (size > INT_MAX)
       
  3913 	ret = encode_mbcs(&repr, p, INT_MAX);
       
  3914     else
       
  3915 #endif
       
  3916 	ret = encode_mbcs(&repr, p, (int)size);
       
  3917 
       
  3918     if (ret < 0) {
       
  3919 	Py_XDECREF(repr);
       
  3920 	return NULL;
       
  3921     }
       
  3922 
       
  3923 #ifdef NEED_RETRY
       
  3924     if (size > INT_MAX) {
       
  3925 	p += INT_MAX;
       
  3926 	size -= INT_MAX;
       
  3927 	goto retry;
       
  3928     }
       
  3929 #endif
       
  3930 
       
  3931     return repr;
       
  3932 }
       
  3933 
       
  3934 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
       
  3935 {
       
  3936     if (!PyUnicode_Check(unicode)) {
       
  3937         PyErr_BadArgument();
       
  3938         return NULL;
       
  3939     }
       
  3940     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
       
  3941 				PyUnicode_GET_SIZE(unicode),
       
  3942 				NULL);
       
  3943 }
       
  3944 
       
  3945 #undef NEED_RETRY
       
  3946 
       
  3947 #endif /* MS_WINDOWS */
       
  3948 
       
  3949 /* --- Character Mapping Codec -------------------------------------------- */
       
  3950 
       
  3951 PyObject *PyUnicode_DecodeCharmap(const char *s,
       
  3952 				  Py_ssize_t size,
       
  3953 				  PyObject *mapping,
       
  3954 				  const char *errors)
       
  3955 {
       
  3956     const char *starts = s;
       
  3957     Py_ssize_t startinpos;
       
  3958     Py_ssize_t endinpos;
       
  3959     Py_ssize_t outpos;
       
  3960     const char *e;
       
  3961     PyUnicodeObject *v;
       
  3962     Py_UNICODE *p;
       
  3963     Py_ssize_t extrachars = 0;
       
  3964     PyObject *errorHandler = NULL;
       
  3965     PyObject *exc = NULL;
       
  3966     Py_UNICODE *mapstring = NULL;
       
  3967     Py_ssize_t maplen = 0;
       
  3968 
       
  3969     /* Default to Latin-1 */
       
  3970     if (mapping == NULL)
       
  3971 	return PyUnicode_DecodeLatin1(s, size, errors);
       
  3972 
       
  3973     v = _PyUnicode_New(size);
       
  3974     if (v == NULL)
       
  3975 	goto onError;
       
  3976     if (size == 0)
       
  3977 	return (PyObject *)v;
       
  3978     p = PyUnicode_AS_UNICODE(v);
       
  3979     e = s + size;
       
  3980     if (PyUnicode_CheckExact(mapping)) {
       
  3981 	mapstring = PyUnicode_AS_UNICODE(mapping);
       
  3982 	maplen = PyUnicode_GET_SIZE(mapping);
       
  3983 	while (s < e) {
       
  3984 	    unsigned char ch = *s;
       
  3985 	    Py_UNICODE x = 0xfffe; /* illegal value */
       
  3986 
       
  3987 	    if (ch < maplen)
       
  3988 		x = mapstring[ch];
       
  3989 
       
  3990 	    if (x == 0xfffe) {
       
  3991 		/* undefined mapping */
       
  3992 		outpos = p-PyUnicode_AS_UNICODE(v);
       
  3993 		startinpos = s-starts;
       
  3994 		endinpos = startinpos+1;
       
  3995 		if (unicode_decode_call_errorhandler(
       
  3996 		     errors, &errorHandler,
       
  3997 		     "charmap", "character maps to <undefined>",
       
  3998 		     starts, size, &startinpos, &endinpos, &exc, &s,
       
  3999 		     (PyObject **)&v, &outpos, &p)) {
       
  4000 		    goto onError;
       
  4001 		}
       
  4002 		continue;
       
  4003 	    }
       
  4004 	    *p++ = x;
       
  4005 	    ++s;
       
  4006 	}
       
  4007     }
       
  4008     else {
       
  4009 	while (s < e) {
       
  4010 	    unsigned char ch = *s;
       
  4011 	    PyObject *w, *x;
       
  4012 
       
  4013 	    /* Get mapping (char ordinal -> integer, Unicode char or None) */
       
  4014 	    w = PyInt_FromLong((long)ch);
       
  4015 	    if (w == NULL)
       
  4016 		goto onError;
       
  4017 	    x = PyObject_GetItem(mapping, w);
       
  4018 	    Py_DECREF(w);
       
  4019 	    if (x == NULL) {
       
  4020 		if (PyErr_ExceptionMatches(PyExc_LookupError)) {
       
  4021 		    /* No mapping found means: mapping is undefined. */
       
  4022 		    PyErr_Clear();
       
  4023 		    x = Py_None;
       
  4024 		    Py_INCREF(x);
       
  4025 		} else
       
  4026 		    goto onError;
       
  4027 	    }
       
  4028     
       
  4029 	    /* Apply mapping */
       
  4030 	    if (PyInt_Check(x)) {
       
  4031 		long value = PyInt_AS_LONG(x);
       
  4032 		if (value < 0 || value > 65535) {
       
  4033 		    PyErr_SetString(PyExc_TypeError,
       
  4034 				    "character mapping must be in range(65536)");
       
  4035 		    Py_DECREF(x);
       
  4036 		    goto onError;
       
  4037 		}
       
  4038 		*p++ = (Py_UNICODE)value;
       
  4039 	    }
       
  4040 	    else if (x == Py_None) {
       
  4041 		/* undefined mapping */
       
  4042 		outpos = p-PyUnicode_AS_UNICODE(v);
       
  4043 		startinpos = s-starts;
       
  4044 		endinpos = startinpos+1;
       
  4045 		if (unicode_decode_call_errorhandler(
       
  4046 		     errors, &errorHandler,
       
  4047 		     "charmap", "character maps to <undefined>",
       
  4048 		     starts, size, &startinpos, &endinpos, &exc, &s,
       
  4049 		     (PyObject **)&v, &outpos, &p)) {
       
  4050 		    Py_DECREF(x);
       
  4051 		    goto onError;
       
  4052 		}
       
  4053 		Py_DECREF(x);
       
  4054 		continue;
       
  4055 	    }
       
  4056 	    else if (PyUnicode_Check(x)) {
       
  4057 		Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
       
  4058     
       
  4059 		if (targetsize == 1)
       
  4060 		    /* 1-1 mapping */
       
  4061 		    *p++ = *PyUnicode_AS_UNICODE(x);
       
  4062     
       
  4063 		else if (targetsize > 1) {
       
  4064 		    /* 1-n mapping */
       
  4065 		    if (targetsize > extrachars) {
       
  4066 			/* resize first */
       
  4067 			Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
       
  4068 			Py_ssize_t needed = (targetsize - extrachars) + \
       
  4069 				     (targetsize << 2);
       
  4070 			extrachars += needed;
       
  4071 			/* XXX overflow detection missing */
       
  4072 			if (_PyUnicode_Resize(&v,
       
  4073 					     PyUnicode_GET_SIZE(v) + needed) < 0) {
       
  4074 			    Py_DECREF(x);
       
  4075 			    goto onError;
       
  4076 			}
       
  4077 			p = PyUnicode_AS_UNICODE(v) + oldpos;
       
  4078 		    }
       
  4079 		    Py_UNICODE_COPY(p,
       
  4080 				    PyUnicode_AS_UNICODE(x),
       
  4081 				    targetsize);
       
  4082 		    p += targetsize;
       
  4083 		    extrachars -= targetsize;
       
  4084 		}
       
  4085 		/* 1-0 mapping: skip the character */
       
  4086 	    }
       
  4087 	    else {
       
  4088 		/* wrong return value */
       
  4089 		PyErr_SetString(PyExc_TypeError,
       
  4090 		      "character mapping must return integer, None or unicode");
       
  4091 		Py_DECREF(x);
       
  4092 		goto onError;
       
  4093 	    }
       
  4094 	    Py_DECREF(x);
       
  4095 	    ++s;
       
  4096 	}
       
  4097     }
       
  4098     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
       
  4099 	if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
       
  4100 	    goto onError;
       
  4101     Py_XDECREF(errorHandler);
       
  4102     Py_XDECREF(exc);
       
  4103     return (PyObject *)v;
       
  4104 
       
  4105  onError:
       
  4106     Py_XDECREF(errorHandler);
       
  4107     Py_XDECREF(exc);
       
  4108     Py_XDECREF(v);
       
  4109     return NULL;
       
  4110 }
       
  4111 
       
  4112 /* Charmap encoding: the lookup table */
       
  4113 
       
  4114 struct encoding_map{
       
  4115   PyObject_HEAD
       
  4116   unsigned char level1[32];
       
  4117   int count2, count3;
       
  4118   unsigned char level23[1];
       
  4119 };
       
  4120 
       
  4121 static PyObject*
       
  4122 encoding_map_size(PyObject *obj, PyObject* args)
       
  4123 {
       
  4124     struct encoding_map *map = (struct encoding_map*)obj;
       
  4125     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 + 
       
  4126                           128*map->count3);
       
  4127 }
       
  4128 
       
  4129 static PyMethodDef encoding_map_methods[] = {
       
  4130 	{"size", encoding_map_size, METH_NOARGS, 
       
  4131          PyDoc_STR("Return the size (in bytes) of this object") },
       
  4132         { 0 }
       
  4133 };
       
  4134 
       
  4135 static void
       
  4136 encoding_map_dealloc(PyObject* o)
       
  4137 {
       
  4138 	PyObject_FREE(o);
       
  4139 }
       
  4140 
       
  4141 static PyTypeObject EncodingMapType = {
       
  4142 	PyVarObject_HEAD_INIT(NULL, 0)
       
  4143         "EncodingMap",          /*tp_name*/
       
  4144         sizeof(struct encoding_map),   /*tp_basicsize*/
       
  4145         0,                      /*tp_itemsize*/
       
  4146         /* methods */
       
  4147         encoding_map_dealloc,   /*tp_dealloc*/
       
  4148         0,                      /*tp_print*/
       
  4149         0,                      /*tp_getattr*/
       
  4150         0,                      /*tp_setattr*/
       
  4151         0,                      /*tp_compare*/
       
  4152         0,                      /*tp_repr*/
       
  4153         0,                      /*tp_as_number*/
       
  4154         0,                      /*tp_as_sequence*/
       
  4155         0,                      /*tp_as_mapping*/
       
  4156         0,                      /*tp_hash*/
       
  4157         0,                      /*tp_call*/
       
  4158         0,                      /*tp_str*/
       
  4159         0,                      /*tp_getattro*/
       
  4160         0,                      /*tp_setattro*/
       
  4161         0,                      /*tp_as_buffer*/
       
  4162         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
       
  4163         0,                      /*tp_doc*/
       
  4164         0,                      /*tp_traverse*/
       
  4165         0,                      /*tp_clear*/
       
  4166         0,                      /*tp_richcompare*/
       
  4167         0,                      /*tp_weaklistoffset*/
       
  4168         0,                      /*tp_iter*/
       
  4169         0,                      /*tp_iternext*/
       
  4170         encoding_map_methods,   /*tp_methods*/
       
  4171         0,                      /*tp_members*/
       
  4172         0,                      /*tp_getset*/
       
  4173         0,                      /*tp_base*/
       
  4174         0,                      /*tp_dict*/
       
  4175         0,                      /*tp_descr_get*/
       
  4176         0,                      /*tp_descr_set*/
       
  4177         0,                      /*tp_dictoffset*/
       
  4178         0,                      /*tp_init*/
       
  4179         0,                      /*tp_alloc*/
       
  4180         0,                      /*tp_new*/
       
  4181         0,                      /*tp_free*/
       
  4182         0,                      /*tp_is_gc*/
       
  4183 };
       
  4184 
       
  4185 PyObject*
       
  4186 PyUnicode_BuildEncodingMap(PyObject* string)
       
  4187 {
       
  4188     Py_UNICODE *decode;
       
  4189     PyObject *result;
       
  4190     struct encoding_map *mresult;
       
  4191     int i;
       
  4192     int need_dict = 0;
       
  4193     unsigned char level1[32];
       
  4194     unsigned char level2[512];
       
  4195     unsigned char *mlevel1, *mlevel2, *mlevel3;
       
  4196     int count2 = 0, count3 = 0;
       
  4197 
       
  4198     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
       
  4199         PyErr_BadArgument();
       
  4200         return NULL;
       
  4201     }
       
  4202     decode = PyUnicode_AS_UNICODE(string);
       
  4203     memset(level1, 0xFF, sizeof level1);
       
  4204     memset(level2, 0xFF, sizeof level2);
       
  4205 
       
  4206     /* If there isn't a one-to-one mapping of NULL to \0,
       
  4207        or if there are non-BMP characters, we need to use
       
  4208        a mapping dictionary. */
       
  4209     if (decode[0] != 0)
       
  4210         need_dict = 1;
       
  4211     for (i = 1; i < 256; i++) {
       
  4212         int l1, l2;
       
  4213         if (decode[i] == 0
       
  4214             #ifdef Py_UNICODE_WIDE
       
  4215             || decode[i] > 0xFFFF
       
  4216             #endif
       
  4217         ) {
       
  4218             need_dict = 1;
       
  4219             break;
       
  4220         }
       
  4221         if (decode[i] == 0xFFFE)
       
  4222             /* unmapped character */
       
  4223             continue;
       
  4224         l1 = decode[i] >> 11;
       
  4225         l2 = decode[i] >> 7;
       
  4226         if (level1[l1] == 0xFF)
       
  4227             level1[l1] = count2++;
       
  4228         if (level2[l2] == 0xFF)
       
  4229             level2[l2] = count3++; 
       
  4230     }
       
  4231 
       
  4232     if (count2 >= 0xFF || count3 >= 0xFF)
       
  4233         need_dict = 1;
       
  4234 
       
  4235     if (need_dict) {
       
  4236         PyObject *result = PyDict_New();
       
  4237         PyObject *key, *value;
       
  4238         if (!result)
       
  4239             return NULL;
       
  4240         for (i = 0; i < 256; i++) {
       
  4241             key = value = NULL;
       
  4242             key = PyInt_FromLong(decode[i]);
       
  4243             value = PyInt_FromLong(i);
       
  4244             if (!key || !value)
       
  4245                 goto failed1;
       
  4246             if (PyDict_SetItem(result, key, value) == -1)
       
  4247                 goto failed1;
       
  4248             Py_DECREF(key);
       
  4249             Py_DECREF(value);
       
  4250         }
       
  4251         return result;
       
  4252       failed1:
       
  4253         Py_XDECREF(key);
       
  4254         Py_XDECREF(value);
       
  4255         Py_DECREF(result);
       
  4256         return NULL;
       
  4257     }
       
  4258 
       
  4259     /* Create a three-level trie */
       
  4260     result = PyObject_MALLOC(sizeof(struct encoding_map) +
       
  4261                              16*count2 + 128*count3 - 1);
       
  4262     if (!result)
       
  4263         return PyErr_NoMemory();
       
  4264     PyObject_Init(result, &EncodingMapType);
       
  4265     mresult = (struct encoding_map*)result;
       
  4266     mresult->count2 = count2;
       
  4267     mresult->count3 = count3;
       
  4268     mlevel1 = mresult->level1;
       
  4269     mlevel2 = mresult->level23;
       
  4270     mlevel3 = mresult->level23 + 16*count2;
       
  4271     memcpy(mlevel1, level1, 32);
       
  4272     memset(mlevel2, 0xFF, 16*count2);
       
  4273     memset(mlevel3, 0, 128*count3);
       
  4274     count3 = 0;
       
  4275     for (i = 1; i < 256; i++) {
       
  4276         int o1, o2, o3, i2, i3;
       
  4277         if (decode[i] == 0xFFFE)
       
  4278             /* unmapped character */
       
  4279             continue;
       
  4280         o1 = decode[i]>>11;
       
  4281         o2 = (decode[i]>>7) & 0xF;
       
  4282         i2 = 16*mlevel1[o1] + o2;
       
  4283         if (mlevel2[i2] == 0xFF)
       
  4284             mlevel2[i2] = count3++;
       
  4285         o3 = decode[i] & 0x7F;
       
  4286         i3 = 128*mlevel2[i2] + o3;
       
  4287         mlevel3[i3] = i;
       
  4288     }
       
  4289     return result;
       
  4290 }
       
  4291 
       
  4292 static int
       
  4293 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
       
  4294 {
       
  4295     struct encoding_map *map = (struct encoding_map*)mapping;
       
  4296     int l1 = c>>11;
       
  4297     int l2 = (c>>7) & 0xF;
       
  4298     int l3 = c & 0x7F;
       
  4299     int i;
       
  4300 
       
  4301 #ifdef Py_UNICODE_WIDE
       
  4302     if (c > 0xFFFF) {
       
  4303 	return -1;
       
  4304     }
       
  4305 #endif
       
  4306     if (c == 0)
       
  4307         return 0;
       
  4308     /* level 1*/
       
  4309     i = map->level1[l1];
       
  4310     if (i == 0xFF) {
       
  4311         return -1;
       
  4312     }
       
  4313     /* level 2*/
       
  4314     i = map->level23[16*i+l2];
       
  4315     if (i == 0xFF) {
       
  4316         return -1;
       
  4317     }
       
  4318     /* level 3 */
       
  4319     i = map->level23[16*map->count2 + 128*i + l3];
       
  4320     if (i == 0) {
       
  4321         return -1;
       
  4322     }
       
  4323     return i;
       
  4324 }
       
  4325 
       
  4326 /* Lookup the character ch in the mapping. If the character
       
  4327    can't be found, Py_None is returned (or NULL, if another
       
  4328    error occurred). */
       
  4329 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
       
  4330 {
       
  4331     PyObject *w = PyInt_FromLong((long)c);
       
  4332     PyObject *x;
       
  4333 
       
  4334     if (w == NULL)
       
  4335 	 return NULL;
       
  4336     x = PyObject_GetItem(mapping, w);
       
  4337     Py_DECREF(w);
       
  4338     if (x == NULL) {
       
  4339 	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
       
  4340 	    /* No mapping found means: mapping is undefined. */
       
  4341 	    PyErr_Clear();
       
  4342 	    x = Py_None;
       
  4343 	    Py_INCREF(x);
       
  4344 	    return x;
       
  4345 	} else
       
  4346 	    return NULL;
       
  4347     }
       
  4348     else if (x == Py_None)
       
  4349 	return x;
       
  4350     else if (PyInt_Check(x)) {
       
  4351 	long value = PyInt_AS_LONG(x);
       
  4352 	if (value < 0 || value > 255) {
       
  4353 	    PyErr_SetString(PyExc_TypeError,
       
  4354 			     "character mapping must be in range(256)");
       
  4355 	    Py_DECREF(x);
       
  4356 	    return NULL;
       
  4357 	}
       
  4358 	return x;
       
  4359     }
       
  4360     else if (PyString_Check(x))
       
  4361 	return x;
       
  4362     else {
       
  4363 	/* wrong return value */
       
  4364 	PyErr_SetString(PyExc_TypeError,
       
  4365 	      "character mapping must return integer, None or str");
       
  4366 	Py_DECREF(x);
       
  4367 	return NULL;
       
  4368     }
       
  4369 }
       
  4370 
       
  4371 static int
       
  4372 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
       
  4373 {
       
  4374 	Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
       
  4375 	/* exponentially overallocate to minimize reallocations */
       
  4376 	if (requiredsize < 2*outsize)
       
  4377 	    requiredsize = 2*outsize;
       
  4378 	if (_PyString_Resize(outobj, requiredsize)) {
       
  4379 	    return 0;
       
  4380 	}
       
  4381 	return 1;
       
  4382 }
       
  4383 
       
  /* Outcome of charmapencode_output(). */
  typedef enum charmapencode_result {
      enc_SUCCESS,    /* the character was encoded and written */
      enc_FAILED,     /* the character has no mapping; nothing was written */
      enc_EXCEPTION   /* a lookup or resize failed */
  } charmapencode_result;
       
  4387 /* lookup the character, put the result in the output string and adjust
       
  4388    various state variables. Reallocate the output string if not enough
       
  4389    space is available. Return a new reference to the object that
       
  4390    was put in the output buffer, or Py_None, if the mapping was undefined
       
  4391    (in which case no character was written) or NULL, if a
       
  4392    reallocation error occurred. The caller must decref the result */
       
  4393 static
       
  4394 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
       
  4395     PyObject **outobj, Py_ssize_t *outpos)
       
  4396 {
       
  4397     PyObject *rep;
       
  4398     char *outstart;
       
  4399     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
       
  4400 
       
  4401     if (Py_TYPE(mapping) == &EncodingMapType) {
       
  4402         int res = encoding_map_lookup(c, mapping);
       
  4403 	Py_ssize_t requiredsize = *outpos+1;
       
  4404         if (res == -1)
       
  4405             return enc_FAILED;
       
  4406 	if (outsize<requiredsize) 
       
  4407 	    if (!charmapencode_resize(outobj, outpos, requiredsize))
       
  4408 		return enc_EXCEPTION;
       
  4409         outstart = PyString_AS_STRING(*outobj);
       
  4410 	outstart[(*outpos)++] = (char)res;
       
  4411 	return enc_SUCCESS;
       
  4412     }
       
  4413 
       
  4414     rep = charmapencode_lookup(c, mapping);
       
  4415     if (rep==NULL)
       
  4416 	return enc_EXCEPTION;
       
  4417     else if (rep==Py_None) {
       
  4418 	Py_DECREF(rep);
       
  4419 	return enc_FAILED;
       
  4420     } else {
       
  4421 	if (PyInt_Check(rep)) {
       
  4422 	    Py_ssize_t requiredsize = *outpos+1;
       
  4423 	    if (outsize<requiredsize)
       
  4424 		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
       
  4425 		    Py_DECREF(rep);
       
  4426 		    return enc_EXCEPTION;
       
  4427 		}
       
  4428             outstart = PyString_AS_STRING(*outobj);
       
  4429 	    outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
       
  4430 	}
       
  4431 	else {
       
  4432 	    const char *repchars = PyString_AS_STRING(rep);
       
  4433 	    Py_ssize_t repsize = PyString_GET_SIZE(rep);
       
  4434 	    Py_ssize_t requiredsize = *outpos+repsize;
       
  4435 	    if (outsize<requiredsize)
       
  4436 		if (!charmapencode_resize(outobj, outpos, requiredsize)) {
       
  4437 		    Py_DECREF(rep);
       
  4438 		    return enc_EXCEPTION;
       
  4439 		}
       
  4440             outstart = PyString_AS_STRING(*outobj);
       
  4441 	    memcpy(outstart + *outpos, repchars, repsize);
       
  4442 	    *outpos += repsize;
       
  4443 	}
       
  4444     }
       
  4445     Py_DECREF(rep);
       
  4446     return enc_SUCCESS;
       
  4447 }
       
  4448 
       
  4449 /* handle an error in PyUnicode_EncodeCharmap
       
  4450    Return 0 on success, -1 on error */
       
  4451 static
       
  4452 int charmap_encoding_error(
       
  4453     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
       
  4454     PyObject **exceptionObject,
       
  4455     int *known_errorHandler, PyObject **errorHandler, const char *errors,
       
  4456     PyObject **res, Py_ssize_t *respos)
       
  4457 {
       
  4458     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
       
  4459     Py_ssize_t repsize;
       
  4460     Py_ssize_t newpos;
       
  4461     Py_UNICODE *uni2;
       
  4462     /* startpos for collecting unencodable chars */
       
  4463     Py_ssize_t collstartpos = *inpos;
       
  4464     Py_ssize_t collendpos = *inpos+1;
       
  4465     Py_ssize_t collpos;
       
  4466     char *encoding = "charmap";
       
  4467     char *reason = "character maps to <undefined>";
       
  4468     charmapencode_result x;
       
  4469 
       
  4470     /* find all unencodable characters */
       
  4471     while (collendpos < size) {
       
  4472         PyObject *rep;
       
  4473         if (Py_TYPE(mapping) == &EncodingMapType) {
       
  4474 	    int res = encoding_map_lookup(p[collendpos], mapping);
       
  4475 	    if (res != -1)
       
  4476 		break;
       
  4477 	    ++collendpos;
       
  4478 	    continue;
       
  4479 	}
       
  4480             
       
  4481 	rep = charmapencode_lookup(p[collendpos], mapping);
       
  4482 	if (rep==NULL)
       
  4483 	    return -1;
       
  4484 	else if (rep!=Py_None) {
       
  4485 	    Py_DECREF(rep);
       
  4486 	    break;
       
  4487 	}
       
  4488 	Py_DECREF(rep);
       
  4489 	++collendpos;
       
  4490     }
       
  4491     /* cache callback name lookup
       
  4492      * (if not done yet, i.e. it's the first error) */
       
  4493     if (*known_errorHandler==-1) {
       
  4494 	if ((errors==NULL) || (!strcmp(errors, "strict")))
       
  4495 	    *known_errorHandler = 1;
       
  4496 	else if (!strcmp(errors, "replace"))
       
  4497 	    *known_errorHandler = 2;
       
  4498 	else if (!strcmp(errors, "ignore"))
       
  4499 	    *known_errorHandler = 3;
       
  4500 	else if (!strcmp(errors, "xmlcharrefreplace"))
       
  4501 	    *known_errorHandler = 4;
       
  4502 	else
       
  4503 	    *known_errorHandler = 0;
       
  4504     }
       
  4505     switch (*known_errorHandler) {
       
  4506 	case 1: /* strict */
       
  4507 	    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
       
  4508 	    return -1;
       
  4509 	case 2: /* replace */
       
  4510 	    for (collpos = collstartpos; collpos<collendpos; ++collpos) {
       
  4511 		x = charmapencode_output('?', mapping, res, respos);
       
  4512 		if (x==enc_EXCEPTION) {
       
  4513 		    return -1;
       
  4514 		}
       
  4515 		else if (x==enc_FAILED) {
       
  4516 		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
       
  4517 		    return -1;
       
  4518 		}
       
  4519 	    }
       
  4520 	    /* fall through */
       
  4521 	case 3: /* ignore */
       
  4522 	    *inpos = collendpos;
       
  4523 	    break;
       
  4524 	case 4: /* xmlcharrefreplace */
       
  4525 	    /* generate replacement (temporarily (mis)uses p) */
       
  4526 	    for (collpos = collstartpos; collpos < collendpos; ++collpos) {
       
  4527 		char buffer[2+29+1+1];
       
  4528 		char *cp;
       
  4529 		sprintf(buffer, "&#%d;", (int)p[collpos]);
       
  4530 		for (cp = buffer; *cp; ++cp) {
       
  4531 		    x = charmapencode_output(*cp, mapping, res, respos);
       
  4532 		    if (x==enc_EXCEPTION)
       
  4533 			return -1;
       
  4534 		    else if (x==enc_FAILED) {
       
  4535 			raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
       
  4536 			return -1;
       
  4537 		    }
       
  4538 		}
       
  4539 	    }
       
  4540 	    *inpos = collendpos;
       
  4541 	    break;
       
  4542 	default:
       
  4543 	    repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
       
  4544 		encoding, reason, p, size, exceptionObject,
       
  4545 		collstartpos, collendpos, &newpos);
       
  4546 	    if (repunicode == NULL)
       
  4547 		return -1;
       
  4548 	    /* generate replacement  */
       
  4549 	    repsize = PyUnicode_GET_SIZE(repunicode);
       
  4550 	    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
       
  4551 		x = charmapencode_output(*uni2, mapping, res, respos);
       
  4552 		if (x==enc_EXCEPTION) {
       
  4553 		    return -1;
       
  4554 		}
       
  4555 		else if (x==enc_FAILED) {
       
  4556 		    Py_DECREF(repunicode);
       
  4557 		    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
       
  4558 		    return -1;
       
  4559 		}
       
  4560 	    }
       
  4561 	    *inpos = newpos;
       
  4562 	    Py_DECREF(repunicode);
       
  4563     }
       
  4564     return 0;
       
  4565 }
       
  4566 
       
  4567 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
       
  4568 				  Py_ssize_t size,
       
  4569 				  PyObject *mapping,
       
  4570 				  const char *errors)
       
  4571 {
       
  4572     /* output object */
       
  4573     PyObject *res = NULL;
       
  4574     /* current input position */
       
  4575     Py_ssize_t inpos = 0;
       
  4576     /* current output position */
       
  4577     Py_ssize_t respos = 0;
       
  4578     PyObject *errorHandler = NULL;
       
  4579     PyObject *exc = NULL;
       
  4580     /* the following variable is used for caching string comparisons
       
  4581      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
       
  4582      * 3=ignore, 4=xmlcharrefreplace */
       
  4583     int known_errorHandler = -1;
       
  4584 
       
  4585     /* Default to Latin-1 */
       
  4586     if (mapping == NULL)
       
  4587 	return PyUnicode_EncodeLatin1(p, size, errors);
       
  4588 
       
  4589     /* allocate enough for a simple encoding without
       
  4590        replacements, if we need more, we'll resize */
       
  4591     res = PyString_FromStringAndSize(NULL, size);
       
  4592     if (res == NULL)
       
  4593         goto onError;
       
  4594     if (size == 0)
       
  4595 	return res;
       
  4596 
       
  4597     while (inpos<size) {
       
  4598 	/* try to encode it */
       
  4599 	charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
       
  4600 	if (x==enc_EXCEPTION) /* error */
       
  4601 	    goto onError;
       
  4602 	if (x==enc_FAILED) { /* unencodable character */
       
  4603 	    if (charmap_encoding_error(p, size, &inpos, mapping,
       
  4604 		&exc,
       
  4605 		&known_errorHandler, &errorHandler, errors,
       
  4606 		&res, &respos)) {
       
  4607 		goto onError;
       
  4608 	    }
       
  4609 	}
       
  4610 	else
       
  4611 	    /* done with this character => adjust input position */
       
  4612 	    ++inpos;
       
  4613     }
       
  4614 
       
  4615     /* Resize if we allocated to much */
       
  4616     if (respos<PyString_GET_SIZE(res)) {
       
  4617 	if (_PyString_Resize(&res, respos))
       
  4618 	    goto onError;
       
  4619     }
       
  4620     Py_XDECREF(exc);
       
  4621     Py_XDECREF(errorHandler);
       
  4622     return res;
       
  4623 
       
  4624     onError:
       
  4625     Py_XDECREF(res);
       
  4626     Py_XDECREF(exc);
       
  4627     Py_XDECREF(errorHandler);
       
  4628     return NULL;
       
  4629 }
       
  4630 
       
  4631 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
       
  4632 				    PyObject *mapping)
       
  4633 {
       
  4634     if (!PyUnicode_Check(unicode) || mapping == NULL) {
       
  4635 	PyErr_BadArgument();
       
  4636 	return NULL;
       
  4637     }
       
  4638     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
       
  4639 				   PyUnicode_GET_SIZE(unicode),
       
  4640 				   mapping,
       
  4641 				   NULL);
       
  4642 }
       
  4643 
       
  4644 /* create or adjust a UnicodeTranslateError */
       
  4645 static void make_translate_exception(PyObject **exceptionObject,
       
  4646     const Py_UNICODE *unicode, Py_ssize_t size,
       
  4647     Py_ssize_t startpos, Py_ssize_t endpos,
       
  4648     const char *reason)
       
  4649 {
       
  4650     if (*exceptionObject == NULL) {
       
  4651     	*exceptionObject = PyUnicodeTranslateError_Create(
       
  4652 	    unicode, size, startpos, endpos, reason);
       
  4653     }
       
  4654     else {
       
  4655 	if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
       
  4656 	    goto onError;
       
  4657 	if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
       
  4658 	    goto onError;
       
  4659 	if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
       
  4660 	    goto onError;
       
  4661 	return;
       
  4662 	onError:
       
  4663 	Py_DECREF(*exceptionObject);
       
  4664 	*exceptionObject = NULL;
       
  4665     }
       
  4666 }
       
  4667 
       
  4668 /* raises a UnicodeTranslateError */
       
  4669 static void raise_translate_exception(PyObject **exceptionObject,
       
  4670     const Py_UNICODE *unicode, Py_ssize_t size,
       
  4671     Py_ssize_t startpos, Py_ssize_t endpos,
       
  4672     const char *reason)
       
  4673 {
       
  4674     make_translate_exception(exceptionObject,
       
  4675 	unicode, size, startpos, endpos, reason);
       
  4676     if (*exceptionObject != NULL)
       
  4677 	PyCodec_StrictErrors(*exceptionObject);
       
  4678 }
       
  4679 
       
  4680 /* error handling callback helper:
       
  4681    build arguments, call the callback and check the arguments,
       
  4682    put the result into newpos and return the replacement string, which
       
  4683    has to be freed by the caller */
       
  4684 static PyObject *unicode_translate_call_errorhandler(const char *errors,
       
  4685     PyObject **errorHandler,
       
  4686     const char *reason,
       
  4687     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       
  4688     Py_ssize_t startpos, Py_ssize_t endpos,
       
  4689     Py_ssize_t *newpos)
       
  4690 {
       
  4691     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
       
  4692 
       
  4693     Py_ssize_t i_newpos;
       
  4694     PyObject *restuple;
       
  4695     PyObject *resunicode;
       
  4696 
       
  4697     if (*errorHandler == NULL) {
       
  4698 	*errorHandler = PyCodec_LookupError(errors);
       
  4699         if (*errorHandler == NULL)
       
  4700 	    return NULL;
       
  4701     }
       
  4702 
       
  4703     make_translate_exception(exceptionObject,
       
  4704 	unicode, size, startpos, endpos, reason);
       
  4705     if (*exceptionObject == NULL)
       
  4706 	return NULL;
       
  4707 
       
  4708     restuple = PyObject_CallFunctionObjArgs(
       
  4709 	*errorHandler, *exceptionObject, NULL);
       
  4710     if (restuple == NULL)
       
  4711 	return NULL;
       
  4712     if (!PyTuple_Check(restuple)) {
       
  4713 	PyErr_Format(PyExc_TypeError, &argparse[4]);
       
  4714 	Py_DECREF(restuple);
       
  4715 	return NULL;
       
  4716     }
       
  4717     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
       
  4718 	&resunicode, &i_newpos)) {
       
  4719 	Py_DECREF(restuple);
       
  4720 	return NULL;
       
  4721     }
       
  4722     if (i_newpos<0)
       
  4723 	*newpos = size+i_newpos;
       
  4724     else
       
  4725         *newpos = i_newpos;
       
  4726     if (*newpos<0 || *newpos>size) {
       
  4727 	PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
       
  4728 	Py_DECREF(restuple);
       
  4729 	return NULL;
       
  4730     }
       
  4731     Py_INCREF(resunicode);
       
  4732     Py_DECREF(restuple);
       
  4733     return resunicode;
       
  4734 }
       
  4735 
       
  4736 /* Lookup the character ch in the mapping and put the result in result,
       
  4737    which must be decrefed by the caller.
       
  4738    Return 0 on success, -1 on error */
       
  4739 static
       
  4740 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
       
  4741 {
       
  4742     PyObject *w = PyInt_FromLong((long)c);
       
  4743     PyObject *x;
       
  4744 
       
  4745     if (w == NULL)
       
  4746 	 return -1;
       
  4747     x = PyObject_GetItem(mapping, w);
       
  4748     Py_DECREF(w);
       
  4749     if (x == NULL) {
       
  4750 	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
       
  4751 	    /* No mapping found means: use 1:1 mapping. */
       
  4752 	    PyErr_Clear();
       
  4753 	    *result = NULL;
       
  4754 	    return 0;
       
  4755 	} else
       
  4756 	    return -1;
       
  4757     }
       
  4758     else if (x == Py_None) {
       
  4759 	*result = x;
       
  4760 	return 0;
       
  4761     }
       
  4762     else if (PyInt_Check(x)) {
       
  4763 	long value = PyInt_AS_LONG(x);
       
  4764 	long max = PyUnicode_GetMax();
       
  4765 	if (value < 0 || value > max) {
       
  4766 	    PyErr_Format(PyExc_TypeError,
       
  4767 			     "character mapping must be in range(0x%lx)", max+1);
       
  4768 	    Py_DECREF(x);
       
  4769 	    return -1;
       
  4770 	}
       
  4771 	*result = x;
       
  4772 	return 0;
       
  4773     }
       
  4774     else if (PyUnicode_Check(x)) {
       
  4775 	*result = x;
       
  4776 	return 0;
       
  4777     }
       
  4778     else {
       
  4779 	/* wrong return value */
       
  4780 	PyErr_SetString(PyExc_TypeError,
       
  4781 	      "character mapping must return integer, None or unicode");
       
  4782 	Py_DECREF(x);
       
  4783 	return -1;
       
  4784     }
       
  4785 }
       
  4786 /* ensure that *outobj is at least requiredsize characters long,
       
  4787 if not reallocate and adjust various state variables.
       
  4788 Return 0 on success, -1 on error */
       
  4789 static
       
  4790 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
       
  4791     Py_ssize_t requiredsize)
       
  4792 {
       
  4793     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
       
  4794     if (requiredsize > oldsize) {
       
  4795 	/* remember old output position */
       
  4796 	Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
       
  4797 	/* exponentially overallocate to minimize reallocations */
       
  4798 	if (requiredsize < 2 * oldsize)
       
  4799 	    requiredsize = 2 * oldsize;
       
  4800 	if (_PyUnicode_Resize(outobj, requiredsize) < 0)
       
  4801 	    return -1;
       
  4802 	*outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
       
  4803     }
       
  4804     return 0;
       
  4805 }
       
  4806 /* lookup the character, put the result in the output string and adjust
       
  4807    various state variables. Return a new reference to the object that
       
  4808    was put in the output buffer in *result, or Py_None, if the mapping was
       
  4809    undefined (in which case no character was written).
       
  4810    The called must decref result.
       
  4811    Return 0 on success, -1 on error. */
       
  4812 static
       
  4813 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
       
  4814     Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
       
  4815     PyObject **res)
       
  4816 {
       
  4817     if (charmaptranslate_lookup(*curinp, mapping, res))
       
  4818 	return -1;
       
  4819     if (*res==NULL) {
       
  4820 	/* not found => default to 1:1 mapping */
       
  4821 	*(*outp)++ = *curinp;
       
  4822     }
       
  4823     else if (*res==Py_None)
       
  4824 	;
       
  4825     else if (PyInt_Check(*res)) {
       
  4826 	/* no overflow check, because we know that the space is enough */
       
  4827 	*(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
       
  4828     }
       
  4829     else if (PyUnicode_Check(*res)) {
       
  4830 	Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
       
  4831 	if (repsize==1) {
       
  4832 	    /* no overflow check, because we know that the space is enough */
       
  4833 	    *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
       
  4834 	}
       
  4835 	else if (repsize!=0) {
       
  4836 	    /* more than one character */
       
  4837 	    Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
       
  4838 		(insize - (curinp-startinp)) +
       
  4839 		repsize - 1;
       
  4840 	    if (charmaptranslate_makespace(outobj, outp, requiredsize))
       
  4841 		return -1;
       
  4842 	    memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
       
  4843 	    *outp += repsize;
       
  4844 	}
       
  4845     }
       
  4846     else
       
  4847 	return -1;
       
  4848     return 0;
       
  4849 }
       
  4850 
       
  4851 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
       
  4852 				     Py_ssize_t size,
       
  4853 				     PyObject *mapping,
       
  4854 				     const char *errors)
       
  4855 {
       
  4856     /* output object */
       
  4857     PyObject *res = NULL;
       
  4858     /* pointers to the beginning and end+1 of input */
       
  4859     const Py_UNICODE *startp = p;
       
  4860     const Py_UNICODE *endp = p + size;
       
  4861     /* pointer into the output */
       
  4862     Py_UNICODE *str;
       
  4863     /* current output position */
       
  4864     Py_ssize_t respos = 0;
       
  4865     char *reason = "character maps to <undefined>";
       
  4866     PyObject *errorHandler = NULL;
       
  4867     PyObject *exc = NULL;
       
  4868     /* the following variable is used for caching string comparisons
       
  4869      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
       
  4870      * 3=ignore, 4=xmlcharrefreplace */
       
  4871     int known_errorHandler = -1;
       
  4872 
       
  4873     if (mapping == NULL) {
       
  4874 	PyErr_BadArgument();
       
  4875 	return NULL;
       
  4876     }
       
  4877 
       
  4878     /* allocate enough for a simple 1:1 translation without
       
  4879        replacements, if we need more, we'll resize */
       
  4880     res = PyUnicode_FromUnicode(NULL, size);
       
  4881     if (res == NULL)
       
  4882 	goto onError;
       
  4883     if (size == 0)
       
  4884 	return res;
       
  4885     str = PyUnicode_AS_UNICODE(res);
       
  4886 
       
  4887     while (p<endp) {
       
  4888 	/* try to encode it */
       
  4889 	PyObject *x = NULL;
       
  4890 	if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
       
  4891 	    Py_XDECREF(x);
       
  4892 	    goto onError;
       
  4893 	}
       
  4894 	Py_XDECREF(x);
       
  4895 	if (x!=Py_None) /* it worked => adjust input pointer */
       
  4896 	    ++p;
       
  4897 	else { /* untranslatable character */
       
  4898 	    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
       
  4899 	    Py_ssize_t repsize;
       
  4900 	    Py_ssize_t newpos;
       
  4901 	    Py_UNICODE *uni2;
       
  4902 	    /* startpos for collecting untranslatable chars */
       
  4903 	    const Py_UNICODE *collstart = p;
       
  4904 	    const Py_UNICODE *collend = p+1;
       
  4905 	    const Py_UNICODE *coll;
       
  4906 
       
  4907 	    /* find all untranslatable characters */
       
  4908 	    while (collend < endp) {
       
  4909 		if (charmaptranslate_lookup(*collend, mapping, &x))
       
  4910 		    goto onError;
       
  4911 		Py_XDECREF(x);
       
  4912 		if (x!=Py_None)
       
  4913 		    break;
       
  4914 		++collend;
       
  4915 	    }
       
  4916 	    /* cache callback name lookup
       
  4917 	     * (if not done yet, i.e. it's the first error) */
       
  4918 	    if (known_errorHandler==-1) {
       
  4919 		if ((errors==NULL) || (!strcmp(errors, "strict")))
       
  4920 		    known_errorHandler = 1;
       
  4921 		else if (!strcmp(errors, "replace"))
       
  4922 		    known_errorHandler = 2;
       
  4923 		else if (!strcmp(errors, "ignore"))
       
  4924 		    known_errorHandler = 3;
       
  4925 		else if (!strcmp(errors, "xmlcharrefreplace"))
       
  4926 		    known_errorHandler = 4;
       
  4927 		else
       
  4928 		    known_errorHandler = 0;
       
  4929 	    }
       
  4930 	    switch (known_errorHandler) {
       
  4931 		case 1: /* strict */
       
  4932 		    raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
       
  4933 		    goto onError;
       
  4934 		case 2: /* replace */
       
  4935 		    /* No need to check for space, this is a 1:1 replacement */
       
  4936 		    for (coll = collstart; coll<collend; ++coll)
       
  4937 			*str++ = '?';
       
  4938 		    /* fall through */
       
  4939 		case 3: /* ignore */
       
  4940 		    p = collend;
       
  4941 		    break;
       
  4942 		case 4: /* xmlcharrefreplace */
       
  4943 		    /* generate replacement (temporarily (mis)uses p) */
       
  4944 		    for (p = collstart; p < collend; ++p) {
       
  4945 			char buffer[2+29+1+1];
       
  4946 			char *cp;
       
  4947 			sprintf(buffer, "&#%d;", (int)*p);
       
  4948 			if (charmaptranslate_makespace(&res, &str,
       
  4949 			    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
       
  4950 			    goto onError;
       
  4951 			for (cp = buffer; *cp; ++cp)
       
  4952 			    *str++ = *cp;
       
  4953 		    }
       
  4954 		    p = collend;
       
  4955 		    break;
       
  4956 		default:
       
  4957 		    repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
       
  4958 			reason, startp, size, &exc,
       
  4959 			collstart-startp, collend-startp, &newpos);
       
  4960 		    if (repunicode == NULL)
       
  4961 			goto onError;
       
  4962 		    /* generate replacement  */
       
  4963 		    repsize = PyUnicode_GET_SIZE(repunicode);
       
  4964 		    if (charmaptranslate_makespace(&res, &str,
       
  4965 			(str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
       
  4966 			Py_DECREF(repunicode);
       
  4967 			goto onError;
       
  4968 		    }
       
  4969 		    for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
       
  4970 			*str++ = *uni2;
       
  4971 		    p = startp + newpos;
       
  4972 		    Py_DECREF(repunicode);
       
  4973 	    }
       
  4974 	}
       
  4975     }
       
  4976     /* Resize if we allocated to much */
       
  4977     respos = str-PyUnicode_AS_UNICODE(res);
       
  4978     if (respos<PyUnicode_GET_SIZE(res)) {
       
  4979 	if (_PyUnicode_Resize(&res, respos) < 0)
       
  4980 	    goto onError;
       
  4981     }
       
  4982     Py_XDECREF(exc);
       
  4983     Py_XDECREF(errorHandler);
       
  4984     return res;
       
  4985 
       
  4986     onError:
       
  4987     Py_XDECREF(res);
       
  4988     Py_XDECREF(exc);
       
  4989     Py_XDECREF(errorHandler);
       
  4990     return NULL;
       
  4991 }
       
  4992 
       
  4993 PyObject *PyUnicode_Translate(PyObject *str,
       
  4994 			      PyObject *mapping,
       
  4995 			      const char *errors)
       
  4996 {
       
  4997     PyObject *result;
       
  4998 
       
  4999     str = PyUnicode_FromObject(str);
       
  5000     if (str == NULL)
       
  5001 	goto onError;
       
  5002     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
       
  5003 					PyUnicode_GET_SIZE(str),
       
  5004 					mapping,
       
  5005 					errors);
       
  5006     Py_DECREF(str);
       
  5007     return result;
       
  5008 
       
  5009  onError:
       
  5010     Py_XDECREF(str);
       
  5011     return NULL;
       
  5012 }
       
  5013 
       
  5014 /* --- Decimal Encoder ---------------------------------------------------- */
       
  5015 
       
  5016 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
       
  5017 			    Py_ssize_t length,
       
  5018 			    char *output,
       
  5019 			    const char *errors)
       
  5020 {
       
  5021     Py_UNICODE *p, *end;
       
  5022     PyObject *errorHandler = NULL;
       
  5023     PyObject *exc = NULL;
       
  5024     const char *encoding = "decimal";
       
  5025     const char *reason = "invalid decimal Unicode string";
       
  5026     /* the following variable is used for caching string comparisons
       
  5027      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
       
  5028     int known_errorHandler = -1;
       
  5029 
       
  5030     if (output == NULL) {
       
  5031 	PyErr_BadArgument();
       
  5032 	return -1;
       
  5033     }
       
  5034 
       
  5035     p = s;
       
  5036     end = s + length;
       
  5037     while (p < end) {
       
  5038 	register Py_UNICODE ch = *p;
       
  5039 	int decimal;
       
  5040 	PyObject *repunicode;
       
  5041 	Py_ssize_t repsize;
       
  5042 	Py_ssize_t newpos;
       
  5043 	Py_UNICODE *uni2;
       
  5044 	Py_UNICODE *collstart;
       
  5045 	Py_UNICODE *collend;
       
  5046 
       
  5047 	if (Py_UNICODE_ISSPACE(ch)) {
       
  5048 	    *output++ = ' ';
       
  5049 	    ++p;
       
  5050 	    continue;
       
  5051 	}
       
  5052 	decimal = Py_UNICODE_TODECIMAL(ch);
       
  5053 	if (decimal >= 0) {
       
  5054 	    *output++ = '0' + decimal;
       
  5055 	    ++p;
       
  5056 	    continue;
       
  5057 	}
       
  5058 	if (0 < ch && ch < 256) {
       
  5059 	    *output++ = (char)ch;
       
  5060 	    ++p;
       
  5061 	    continue;
       
  5062 	}
       
  5063 	/* All other characters are considered unencodable */
       
  5064 	collstart = p;
       
  5065 	collend = p+1;
       
  5066 	while (collend < end) {
       
  5067 	    if ((0 < *collend && *collend < 256) ||
       
  5068 	        !Py_UNICODE_ISSPACE(*collend) ||
       
  5069 	        Py_UNICODE_TODECIMAL(*collend))
       
  5070 		break;
       
  5071 	}
       
  5072 	/* cache callback name lookup
       
  5073 	 * (if not done yet, i.e. it's the first error) */
       
  5074 	if (known_errorHandler==-1) {
       
  5075 	    if ((errors==NULL) || (!strcmp(errors, "strict")))
       
  5076 		known_errorHandler = 1;
       
  5077 	    else if (!strcmp(errors, "replace"))
       
  5078 		known_errorHandler = 2;
       
  5079 	    else if (!strcmp(errors, "ignore"))
       
  5080 		known_errorHandler = 3;
       
  5081 	    else if (!strcmp(errors, "xmlcharrefreplace"))
       
  5082 		known_errorHandler = 4;
       
  5083 	    else
       
  5084 		known_errorHandler = 0;
       
  5085 	}
       
  5086 	switch (known_errorHandler) {
       
  5087 	    case 1: /* strict */
       
  5088 		raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
       
  5089 		goto onError;
       
  5090 	    case 2: /* replace */
       
  5091 		for (p = collstart; p < collend; ++p)
       
  5092 		    *output++ = '?';
       
  5093 		/* fall through */
       
  5094 	    case 3: /* ignore */
       
  5095 		p = collend;
       
  5096 		break;
       
  5097 	    case 4: /* xmlcharrefreplace */
       
  5098 		/* generate replacement (temporarily (mis)uses p) */
       
  5099 		for (p = collstart; p < collend; ++p)
       
  5100 		    output += sprintf(output, "&#%d;", (int)*p);
       
  5101 		p = collend;
       
  5102 		break;
       
  5103 	    default:
       
  5104 		repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
       
  5105 		    encoding, reason, s, length, &exc,
       
  5106 		    collstart-s, collend-s, &newpos);
       
  5107 		if (repunicode == NULL)
       
  5108 		    goto onError;
       
  5109 		/* generate replacement  */
       
  5110 		repsize = PyUnicode_GET_SIZE(repunicode);
       
  5111 		for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
       
  5112 		    Py_UNICODE ch = *uni2;
       
  5113 		    if (Py_UNICODE_ISSPACE(ch))
       
  5114 			*output++ = ' ';
       
  5115 		    else {
       
  5116 			decimal = Py_UNICODE_TODECIMAL(ch);
       
  5117 			if (decimal >= 0)
       
  5118 			    *output++ = '0' + decimal;
       
  5119 			else if (0 < ch && ch < 256)
       
  5120 			    *output++ = (char)ch;
       
  5121 			else {
       
  5122 			    Py_DECREF(repunicode);
       
  5123 			    raise_encode_exception(&exc, encoding,
       
  5124 				s, length, collstart-s, collend-s, reason);
       
  5125 			    goto onError;
       
  5126 			}
       
  5127 		    }
       
  5128 		}
       
  5129 		p = s + newpos;
       
  5130 		Py_DECREF(repunicode);
       
  5131 	}
       
  5132     }
       
  5133     /* 0-terminate the output string */
       
  5134     *output++ = '\0';
       
  5135     Py_XDECREF(exc);
       
  5136     Py_XDECREF(errorHandler);
       
  5137     return 0;
       
  5138 
       
  5139  onError:
       
  5140     Py_XDECREF(exc);
       
  5141     Py_XDECREF(errorHandler);
       
  5142     return -1;
       
  5143 }
       
  5144 
       
  5145 /* --- Helpers ------------------------------------------------------------ */
       
  5146 
       
  5147 #include "stringlib/unicodedefs.h"
       
  5148 
       
  5149 #define FROM_UNICODE
       
  5150 
       
  5151 #include "stringlib/fastsearch.h"
       
  5152 
       
  5153 #include "stringlib/count.h"
       
  5154 #include "stringlib/find.h"
       
  5155 #include "stringlib/partition.h"
       
  5156 
       
  5157 /* Helper macro to fix up start/end slice values.
          It reads and writes variables named `start` and `end` in the
          scope where it is expanded: a negative start has (obj)->length
          added and is then clamped to 0; an end larger than (obj)->length
          is clamped to the length; a negative end has the length added
          and is then clamped to 0.
          The expansion is a plain sequence of `if` statements (it is not
          wrapped in do { } while (0)) and evaluates (obj) several times,
          so use it only as a statement of its own with a side-effect-free
          argument. */
       
  5158 #define FIX_START_END(obj)                      \
       
  5159     if (start < 0)                              \
       
  5160         start += (obj)->length;                 \
       
  5161     if (start < 0)                              \
       
  5162         start = 0;                              \
       
  5163     if (end > (obj)->length)                    \
       
  5164         end = (obj)->length;                    \
       
  5165     if (end < 0)                                \
       
  5166         end += (obj)->length;                   \
       
  5167     if (end < 0)                                \
       
  5168         end = 0;
       
  5169 
       
  5170 Py_ssize_t PyUnicode_Count(PyObject *str,
       
  5171                            PyObject *substr,
       
  5172                            Py_ssize_t start,
       
  5173                            Py_ssize_t end)
       
  5174 {
       
  5175     Py_ssize_t result;
       
  5176     PyUnicodeObject* str_obj;
       
  5177     PyUnicodeObject* sub_obj;
       
  5178 
       
  5179     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
       
  5180     if (!str_obj)
       
  5181 	return -1;
       
  5182     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
       
  5183     if (!sub_obj) {
       
  5184 	Py_DECREF(str_obj);
       
  5185 	return -1;
       
  5186     }
       
  5187 
       
  5188     FIX_START_END(str_obj);
       
  5189 
       
  5190     result = stringlib_count(
       
  5191         str_obj->str + start, end - start, sub_obj->str, sub_obj->length
       
  5192         );
       
  5193 
       
  5194     Py_DECREF(sub_obj);
       
  5195     Py_DECREF(str_obj);
       
  5196 
       
  5197     return result;
       
  5198 }
       
  5199 
       
  5200 Py_ssize_t PyUnicode_Find(PyObject *str,
       
  5201                           PyObject *sub,
       
  5202                           Py_ssize_t start,
       
  5203                           Py_ssize_t end,
       
  5204                           int direction)
       
  5205 {
       
  5206     Py_ssize_t result;
       
  5207 
       
  5208     str = PyUnicode_FromObject(str);
       
  5209     if (!str)
       
  5210 	return -2;
       
  5211     sub = PyUnicode_FromObject(sub);
       
  5212     if (!sub) {
       
  5213 	Py_DECREF(str);
       
  5214 	return -2;
       
  5215     }
       
  5216 
       
  5217     if (direction > 0)
       
  5218         result = stringlib_find_slice(
       
  5219             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
       
  5220             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
       
  5221             start, end
       
  5222             );
       
  5223     else
       
  5224         result = stringlib_rfind_slice(
       
  5225             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
       
  5226             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
       
  5227             start, end
       
  5228             );
       
  5229 
       
  5230     Py_DECREF(str);
       
  5231     Py_DECREF(sub);
       
  5232 
       
  5233     return result;
       
  5234 }
       
  5235 
       
  5236 static
       
  5237 int tailmatch(PyUnicodeObject *self,
       
  5238 	      PyUnicodeObject *substring,
       
  5239 	      Py_ssize_t start,
       
  5240 	      Py_ssize_t end,
       
  5241 	      int direction)
       
  5242 {
       
  5243     if (substring->length == 0)
       
  5244         return 1;
       
  5245 
       
  5246     FIX_START_END(self);
       
  5247 
       
  5248     end -= substring->length;
       
  5249     if (end < start)
       
  5250 	return 0;
       
  5251 
       
  5252     if (direction > 0) {
       
  5253 	if (Py_UNICODE_MATCH(self, end, substring))
       
  5254 	    return 1;
       
  5255     } else {
       
  5256         if (Py_UNICODE_MATCH(self, start, substring))
       
  5257 	    return 1;
       
  5258     }
       
  5259 
       
  5260     return 0;
       
  5261 }
       
  5262 
       
  5263 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
       
  5264 			PyObject *substr,
       
  5265 			Py_ssize_t start,
       
  5266 			Py_ssize_t end,
       
  5267 			int direction)
       
  5268 {
       
  5269     Py_ssize_t result;
       
  5270 
       
  5271     str = PyUnicode_FromObject(str);
       
  5272     if (str == NULL)
       
  5273 	return -1;
       
  5274     substr = PyUnicode_FromObject(substr);
       
  5275     if (substr == NULL) {
       
  5276 	Py_DECREF(str);
       
  5277 	return -1;
       
  5278     }
       
  5279 
       
  5280     result = tailmatch((PyUnicodeObject *)str,
       
  5281 		       (PyUnicodeObject *)substr,
       
  5282 		       start, end, direction);
       
  5283     Py_DECREF(str);
       
  5284     Py_DECREF(substr);
       
  5285     return result;
       
  5286 }
       
  5287 
       
  5288 /* Apply fixfct filter to the Unicode object self and return a
       
  5289    reference to the modified object */
       
  5290 
       
  5291 static
       
  5292 PyObject *fixup(PyUnicodeObject *self,
       
  5293 		int (*fixfct)(PyUnicodeObject *s))
       
  5294 {
       
  5295 
       
  5296     PyUnicodeObject *u;
       
  5297 
       
  5298     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
       
  5299     if (u == NULL)
       
  5300 	return NULL;
       
  5301 
       
  5302     Py_UNICODE_COPY(u->str, self->str, self->length);
       
  5303 
       
  5304     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
       
  5305 	/* fixfct should return TRUE if it modified the buffer. If
       
  5306 	   FALSE, return a reference to the original buffer instead
       
  5307 	   (to save space, not time) */
       
  5308 	Py_INCREF(self);
       
  5309 	Py_DECREF(u);
       
  5310 	return (PyObject*) self;
       
  5311     }
       
  5312     return (PyObject*) u;
       
  5313 }
       
  5314 
       
  5315 static
       
  5316 int fixupper(PyUnicodeObject *self)
       
  5317 {
       
  5318     Py_ssize_t len = self->length;
       
  5319     Py_UNICODE *s = self->str;
       
  5320     int status = 0;
       
  5321 
       
  5322     while (len-- > 0) {
       
  5323 	register Py_UNICODE ch;
       
  5324 
       
  5325 	ch = Py_UNICODE_TOUPPER(*s);
       
  5326 	if (ch != *s) {
       
  5327             status = 1;
       
  5328 	    *s = ch;
       
  5329 	}
       
  5330         s++;
       
  5331     }
       
  5332 
       
  5333     return status;
       
  5334 }
       
  5335 
       
  5336 static
       
  5337 int fixlower(PyUnicodeObject *self)
       
  5338 {
       
  5339     Py_ssize_t len = self->length;
       
  5340     Py_UNICODE *s = self->str;
       
  5341     int status = 0;
       
  5342 
       
  5343     while (len-- > 0) {
       
  5344 	register Py_UNICODE ch;
       
  5345 
       
  5346 	ch = Py_UNICODE_TOLOWER(*s);
       
  5347 	if (ch != *s) {
       
  5348             status = 1;
       
  5349 	    *s = ch;
       
  5350 	}
       
  5351         s++;
       
  5352     }
       
  5353 
       
  5354     return status;
       
  5355 }
       
  5356 
       
  5357 static
       
  5358 int fixswapcase(PyUnicodeObject *self)
       
  5359 {
       
  5360     Py_ssize_t len = self->length;
       
  5361     Py_UNICODE *s = self->str;
       
  5362     int status = 0;
       
  5363 
       
  5364     while (len-- > 0) {
       
  5365         if (Py_UNICODE_ISUPPER(*s)) {
       
  5366             *s = Py_UNICODE_TOLOWER(*s);
       
  5367             status = 1;
       
  5368         } else if (Py_UNICODE_ISLOWER(*s)) {
       
  5369             *s = Py_UNICODE_TOUPPER(*s);
       
  5370             status = 1;
       
  5371         }
       
  5372         s++;
       
  5373     }
       
  5374 
       
  5375     return status;
       
  5376 }
       
  5377 
       
  5378 static
       
  5379 int fixcapitalize(PyUnicodeObject *self)
       
  5380 {
       
  5381     Py_ssize_t len = self->length;
       
  5382     Py_UNICODE *s = self->str;
       
  5383     int status = 0;
       
  5384 
       
  5385     if (len == 0)
       
  5386 	return 0;
       
  5387     if (Py_UNICODE_ISLOWER(*s)) {
       
  5388 	*s = Py_UNICODE_TOUPPER(*s);
       
  5389 	status = 1;
       
  5390     }
       
  5391     s++;
       
  5392     while (--len > 0) {
       
  5393         if (Py_UNICODE_ISUPPER(*s)) {
       
  5394             *s = Py_UNICODE_TOLOWER(*s);
       
  5395             status = 1;
       
  5396         }
       
  5397         s++;
       
  5398     }
       
  5399     return status;
       
  5400 }
       
  5401 
       
  5402 static
       
  5403 int fixtitle(PyUnicodeObject *self)
       
  5404 {
       
  5405     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  5406     register Py_UNICODE *e;
       
  5407     int previous_is_cased;
       
  5408 
       
  5409     /* Shortcut for single character strings */
       
  5410     if (PyUnicode_GET_SIZE(self) == 1) {
       
  5411 	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
       
  5412 	if (*p != ch) {
       
  5413 	    *p = ch;
       
  5414 	    return 1;
       
  5415 	}
       
  5416 	else
       
  5417 	    return 0;
       
  5418     }
       
  5419 
       
  5420     e = p + PyUnicode_GET_SIZE(self);
       
  5421     previous_is_cased = 0;
       
  5422     for (; p < e; p++) {
       
  5423 	register const Py_UNICODE ch = *p;
       
  5424 
       
  5425 	if (previous_is_cased)
       
  5426 	    *p = Py_UNICODE_TOLOWER(ch);
       
  5427 	else
       
  5428 	    *p = Py_UNICODE_TOTITLE(ch);
       
  5429 
       
  5430 	if (Py_UNICODE_ISLOWER(ch) ||
       
  5431 	    Py_UNICODE_ISUPPER(ch) ||
       
  5432 	    Py_UNICODE_ISTITLE(ch))
       
  5433 	    previous_is_cased = 1;
       
  5434 	else
       
  5435 	    previous_is_cased = 0;
       
  5436     }
       
  5437     return 1;
       
  5438 }
       
  5439 
       
  5440 PyObject *
       
  5441 PyUnicode_Join(PyObject *separator, PyObject *seq)
       
  5442 {
       
  5443     PyObject *internal_separator = NULL;
       
  5444     const Py_UNICODE blank = ' ';
       
  5445     const Py_UNICODE *sep = &blank;
       
  5446     Py_ssize_t seplen = 1;
       
  5447     PyUnicodeObject *res = NULL; /* the result */
       
  5448     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
       
  5449     Py_ssize_t res_used;         /* # used bytes */
       
  5450     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
       
  5451     PyObject *fseq;          /* PySequence_Fast(seq) */
       
  5452     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
       
  5453     PyObject *item;
       
  5454     Py_ssize_t i;
       
  5455 
       
  5456     fseq = PySequence_Fast(seq, "");
       
  5457     if (fseq == NULL) {
       
  5458     	return NULL;
       
  5459     }
       
  5460 
       
  5461     /* Grrrr.  A codec may be invoked to convert str objects to
       
  5462      * Unicode, and so it's possible to call back into Python code
       
  5463      * during PyUnicode_FromObject(), and so it's possible for a sick
       
  5464      * codec to change the size of fseq (if seq is a list).  Therefore
       
  5465      * we have to keep refetching the size -- can't assume seqlen
       
  5466      * is invariant.
       
  5467      */
       
  5468     seqlen = PySequence_Fast_GET_SIZE(fseq);
       
  5469     /* If empty sequence, return u"". */
       
  5470     if (seqlen == 0) {
       
  5471     	res = _PyUnicode_New(0);  /* empty sequence; return u"" */
       
  5472     	goto Done;
       
  5473     }
       
  5474     /* If singleton sequence with an exact Unicode, return that. */
       
  5475     if (seqlen == 1) {
       
  5476 	item = PySequence_Fast_GET_ITEM(fseq, 0);
       
  5477 	if (PyUnicode_CheckExact(item)) {
       
  5478 	    Py_INCREF(item);
       
  5479 	    res = (PyUnicodeObject *)item;
       
  5480 	    goto Done;
       
  5481 	}
       
  5482     }
       
  5483 
       
  5484     /* At least two items to join, or one that isn't exact Unicode. */
       
  5485     if (seqlen > 1) {
       
  5486         /* Set up sep and seplen -- they're needed. */
       
  5487     	if (separator == NULL) {
       
  5488 	    sep = &blank;
       
  5489 	    seplen = 1;
       
  5490         }
       
  5491     	else {
       
  5492 	    internal_separator = PyUnicode_FromObject(separator);
       
  5493 	    if (internal_separator == NULL)
       
  5494 	        goto onError;
       
  5495 	    sep = PyUnicode_AS_UNICODE(internal_separator);
       
  5496 	    seplen = PyUnicode_GET_SIZE(internal_separator);
       
  5497 	    /* In case PyUnicode_FromObject() mutated seq. */
       
  5498 	    seqlen = PySequence_Fast_GET_SIZE(fseq);
       
  5499         }
       
  5500     }
       
  5501 
       
  5502     /* Get space. */
       
  5503     res = _PyUnicode_New(res_alloc);
       
  5504     if (res == NULL)
       
  5505         goto onError;
       
  5506     res_p = PyUnicode_AS_UNICODE(res);
       
  5507     res_used = 0;
       
  5508 
       
  5509     for (i = 0; i < seqlen; ++i) {
       
  5510 	Py_ssize_t itemlen;
       
  5511 	Py_ssize_t new_res_used;
       
  5512 
       
  5513 	item = PySequence_Fast_GET_ITEM(fseq, i);
       
  5514 	/* Convert item to Unicode. */
       
  5515 	if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
       
  5516 	    PyErr_Format(PyExc_TypeError,
       
  5517 			 "sequence item %zd: expected string or Unicode,"
       
  5518 			 " %.80s found",
       
  5519 			 i, Py_TYPE(item)->tp_name);
       
  5520 	    goto onError;
       
  5521 	}
       
  5522 	item = PyUnicode_FromObject(item);
       
  5523 	if (item == NULL)
       
  5524 	    goto onError;
       
  5525 	/* We own a reference to item from here on. */
       
  5526 
       
  5527 	/* In case PyUnicode_FromObject() mutated seq. */
       
  5528 	seqlen = PySequence_Fast_GET_SIZE(fseq);
       
  5529 
       
  5530         /* Make sure we have enough space for the separator and the item. */
       
  5531 	itemlen = PyUnicode_GET_SIZE(item);
       
  5532 	new_res_used = res_used + itemlen;
       
  5533 	if (new_res_used < 0)
       
  5534 	    goto Overflow;
       
  5535 	if (i < seqlen - 1) {
       
  5536 	    new_res_used += seplen;
       
  5537 	    if (new_res_used < 0)
       
  5538 		goto Overflow;
       
  5539 	}
       
  5540 	if (new_res_used > res_alloc) {
       
  5541 	    /* double allocated size until it's big enough */
       
  5542 	    do {
       
  5543 	        res_alloc += res_alloc;
       
  5544 	        if (res_alloc <= 0)
       
  5545 	            goto Overflow;
       
  5546 	    } while (new_res_used > res_alloc);
       
  5547 	    if (_PyUnicode_Resize(&res, res_alloc) < 0) {
       
  5548 		Py_DECREF(item);
       
  5549 		goto onError;
       
  5550 	    }
       
  5551             res_p = PyUnicode_AS_UNICODE(res) + res_used;
       
  5552 	}
       
  5553 
       
  5554 	/* Copy item, and maybe the separator. */
       
  5555 	Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
       
  5556 	res_p += itemlen;
       
  5557 	if (i < seqlen - 1) {
       
  5558 	    Py_UNICODE_COPY(res_p, sep, seplen);
       
  5559 	    res_p += seplen;
       
  5560 	}
       
  5561 	Py_DECREF(item);
       
  5562 	res_used = new_res_used;
       
  5563     }
       
  5564 
       
  5565     /* Shrink res to match the used area; this probably can't fail,
       
  5566      * but it's cheap to check.
       
  5567      */
       
  5568     if (_PyUnicode_Resize(&res, res_used) < 0)
       
  5569 	goto onError;
       
  5570 
       
  5571  Done:
       
  5572     Py_XDECREF(internal_separator);
       
  5573     Py_DECREF(fseq);
       
  5574     return (PyObject *)res;
       
  5575 
       
  5576  Overflow:
       
  5577     PyErr_SetString(PyExc_OverflowError,
       
  5578                     "join() result is too long for a Python string");
       
  5579     Py_DECREF(item);
       
  5580     /* fall through */
       
  5581 
       
  5582  onError:
       
  5583     Py_XDECREF(internal_separator);
       
  5584     Py_DECREF(fseq);
       
  5585     Py_XDECREF(res);
       
  5586     return NULL;
       
  5587 }
       
  5588 
       
  5589 static
       
  5590 PyUnicodeObject *pad(PyUnicodeObject *self,
       
  5591 		     Py_ssize_t left,
       
  5592 		     Py_ssize_t right,
       
  5593 		     Py_UNICODE fill)
       
  5594 {
       
  5595     PyUnicodeObject *u;
       
  5596 
       
  5597     if (left < 0)
       
  5598         left = 0;
       
  5599     if (right < 0)
       
  5600         right = 0;
       
  5601 
       
  5602     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
       
  5603         Py_INCREF(self);
       
  5604         return self;
       
  5605     }
       
  5606 
       
  5607     if (left > PY_SSIZE_T_MAX - self->length ||
       
  5608         right > PY_SSIZE_T_MAX - (left + self->length)) {
       
  5609         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
       
  5610         return NULL;
       
  5611     }
       
  5612     u = _PyUnicode_New(left + self->length + right);
       
  5613     if (u) {
       
  5614         if (left)
       
  5615             Py_UNICODE_FILL(u->str, fill, left);
       
  5616         Py_UNICODE_COPY(u->str + left, self->str, self->length);
       
  5617         if (right)
       
  5618             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
       
  5619     }
       
  5620 
       
  5621     return u;
       
  5622 }
       
  5623 
       
  5624 #define SPLIT_APPEND(data, left, right)					\
       
  5625 	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
       
  5626 	if (!str)							\
       
  5627 	    goto onError;						\
       
  5628 	if (PyList_Append(list, str)) {					\
       
  5629 	    Py_DECREF(str);						\
       
  5630 	    goto onError;						\
       
  5631 	}								\
       
  5632         else								\
       
  5633             Py_DECREF(str);
       
  5634 
       
  5635 static
       
  5636 PyObject *split_whitespace(PyUnicodeObject *self,
       
  5637 			   PyObject *list,
       
  5638 			   Py_ssize_t maxcount)
       
  5639 {
       
  5640     register Py_ssize_t i;
       
  5641     register Py_ssize_t j;
       
  5642     Py_ssize_t len = self->length;
       
  5643     PyObject *str;
       
  5644     register const Py_UNICODE *buf = self->str;
       
  5645 
       
  5646     for (i = j = 0; i < len; ) {
       
  5647 	/* find a token */
       
  5648 	while (i < len && Py_UNICODE_ISSPACE(buf[i]))
       
  5649 	    i++;
       
  5650 	j = i;
       
  5651 	while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
       
  5652 	    i++;
       
  5653 	if (j < i) {
       
  5654 	    if (maxcount-- <= 0)
       
  5655 		break;
       
  5656 	    SPLIT_APPEND(buf, j, i);
       
  5657 	    while (i < len && Py_UNICODE_ISSPACE(buf[i]))
       
  5658 		i++;
       
  5659 	    j = i;
       
  5660 	}
       
  5661     }
       
  5662     if (j < len) {
       
  5663 	SPLIT_APPEND(buf, j, len);
       
  5664     }
       
  5665     return list;
       
  5666 
       
  5667  onError:
       
  5668     Py_DECREF(list);
       
  5669     return NULL;
       
  5670 }
       
  5671 
       
  5672 PyObject *PyUnicode_Splitlines(PyObject *string,
       
  5673 			       int keepends)
       
  5674 {
       
  5675     register Py_ssize_t i;
       
  5676     register Py_ssize_t j;
       
  5677     Py_ssize_t len;
       
  5678     PyObject *list;
       
  5679     PyObject *str;
       
  5680     Py_UNICODE *data;
       
  5681 
       
  5682     string = PyUnicode_FromObject(string);
       
  5683     if (string == NULL)
       
  5684 	return NULL;
       
  5685     data = PyUnicode_AS_UNICODE(string);
       
  5686     len = PyUnicode_GET_SIZE(string);
       
  5687 
       
  5688     list = PyList_New(0);
       
  5689     if (!list)
       
  5690         goto onError;
       
  5691 
       
  5692     for (i = j = 0; i < len; ) {
       
  5693 	Py_ssize_t eol;
       
  5694 
       
  5695 	/* Find a line and append it */
       
  5696 	while (i < len && !BLOOM_LINEBREAK(data[i]))
       
  5697 	    i++;
       
  5698 
       
  5699 	/* Skip the line break reading CRLF as one line break */
       
  5700 	eol = i;
       
  5701 	if (i < len) {
       
  5702 	    if (data[i] == '\r' && i + 1 < len &&
       
  5703 		data[i+1] == '\n')
       
  5704 		i += 2;
       
  5705 	    else
       
  5706 		i++;
       
  5707 	    if (keepends)
       
  5708 		eol = i;
       
  5709 	}
       
  5710 	SPLIT_APPEND(data, j, eol);
       
  5711 	j = i;
       
  5712     }
       
  5713     if (j < len) {
       
  5714 	SPLIT_APPEND(data, j, len);
       
  5715     }
       
  5716 
       
  5717     Py_DECREF(string);
       
  5718     return list;
       
  5719 
       
  5720  onError:
       
  5721     Py_XDECREF(list);
       
  5722     Py_DECREF(string);
       
  5723     return NULL;
       
  5724 }
       
  5725 
       
  5726 static
       
  5727 PyObject *split_char(PyUnicodeObject *self,
       
  5728 		     PyObject *list,
       
  5729 		     Py_UNICODE ch,
       
  5730 		     Py_ssize_t maxcount)
       
  5731 {
       
  5732     register Py_ssize_t i;
       
  5733     register Py_ssize_t j;
       
  5734     Py_ssize_t len = self->length;
       
  5735     PyObject *str;
       
  5736     register const Py_UNICODE *buf = self->str;
       
  5737 
       
  5738     for (i = j = 0; i < len; ) {
       
  5739 	if (buf[i] == ch) {
       
  5740 	    if (maxcount-- <= 0)
       
  5741 		break;
       
  5742 	    SPLIT_APPEND(buf, j, i);
       
  5743 	    i = j = i + 1;
       
  5744 	} else
       
  5745 	    i++;
       
  5746     }
       
  5747     if (j <= len) {
       
  5748 	SPLIT_APPEND(buf, j, len);
       
  5749     }
       
  5750     return list;
       
  5751 
       
  5752  onError:
       
  5753     Py_DECREF(list);
       
  5754     return NULL;
       
  5755 }
       
  5756 
       
  5757 static
       
  5758 PyObject *split_substring(PyUnicodeObject *self,
       
  5759 			  PyObject *list,
       
  5760 			  PyUnicodeObject *substring,
       
  5761 			  Py_ssize_t maxcount)
       
  5762 {
       
  5763     register Py_ssize_t i;
       
  5764     register Py_ssize_t j;
       
  5765     Py_ssize_t len = self->length;
       
  5766     Py_ssize_t sublen = substring->length;
       
  5767     PyObject *str;
       
  5768 
       
  5769     for (i = j = 0; i <= len - sublen; ) {
       
  5770 	if (Py_UNICODE_MATCH(self, i, substring)) {
       
  5771 	    if (maxcount-- <= 0)
       
  5772 		break;
       
  5773 	    SPLIT_APPEND(self->str, j, i);
       
  5774 	    i = j = i + sublen;
       
  5775 	} else
       
  5776 	    i++;
       
  5777     }
       
  5778     if (j <= len) {
       
  5779 	SPLIT_APPEND(self->str, j, len);
       
  5780     }
       
  5781     return list;
       
  5782 
       
  5783  onError:
       
  5784     Py_DECREF(list);
       
  5785     return NULL;
       
  5786 }
       
  5787 
       
  5788 static
       
  5789 PyObject *rsplit_whitespace(PyUnicodeObject *self,
       
  5790 			    PyObject *list,
       
  5791 			    Py_ssize_t maxcount)
       
  5792 {
       
  5793     register Py_ssize_t i;
       
  5794     register Py_ssize_t j;
       
  5795     Py_ssize_t len = self->length;
       
  5796     PyObject *str;
       
  5797     register const Py_UNICODE *buf = self->str;
       
  5798 
       
  5799     for (i = j = len - 1; i >= 0; ) {
       
  5800 	/* find a token */
       
  5801 	while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
       
  5802 	    i--;
       
  5803 	j = i;
       
  5804 	while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
       
  5805 	    i--;
       
  5806 	if (j > i) {
       
  5807 	    if (maxcount-- <= 0)
       
  5808 		break;
       
  5809 	    SPLIT_APPEND(buf, i + 1, j + 1);
       
  5810 	    while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
       
  5811 		i--;
       
  5812 	    j = i;
       
  5813 	}
       
  5814     }
       
  5815     if (j >= 0) {
       
  5816 	SPLIT_APPEND(buf, 0, j + 1);
       
  5817     }
       
  5818     if (PyList_Reverse(list) < 0)
       
  5819         goto onError;
       
  5820     return list;
       
  5821 
       
  5822  onError:
       
  5823     Py_DECREF(list);
       
  5824     return NULL;
       
  5825 }
       
  5826 
       
  5827 static 
       
  5828 PyObject *rsplit_char(PyUnicodeObject *self,
       
  5829 		      PyObject *list,
       
  5830 		      Py_UNICODE ch,
       
  5831 		      Py_ssize_t maxcount)
       
  5832 {
       
  5833     register Py_ssize_t i;
       
  5834     register Py_ssize_t j;
       
  5835     Py_ssize_t len = self->length;
       
  5836     PyObject *str;
       
  5837     register const Py_UNICODE *buf = self->str;
       
  5838 
       
  5839     for (i = j = len - 1; i >= 0; ) {
       
  5840 	if (buf[i] == ch) {
       
  5841 	    if (maxcount-- <= 0)
       
  5842 		break;
       
  5843 	    SPLIT_APPEND(buf, i + 1, j + 1);
       
  5844 	    j = i = i - 1;
       
  5845 	} else
       
  5846 	    i--;
       
  5847     }
       
  5848     if (j >= -1) {
       
  5849 	SPLIT_APPEND(buf, 0, j + 1);
       
  5850     }
       
  5851     if (PyList_Reverse(list) < 0)
       
  5852         goto onError;
       
  5853     return list;
       
  5854 
       
  5855  onError:
       
  5856     Py_DECREF(list);
       
  5857     return NULL;
       
  5858 }
       
  5859 
       
  5860 static 
       
  5861 PyObject *rsplit_substring(PyUnicodeObject *self,
       
  5862 			   PyObject *list,
       
  5863 			   PyUnicodeObject *substring,
       
  5864 			   Py_ssize_t maxcount)
       
  5865 {
       
  5866     register Py_ssize_t i;
       
  5867     register Py_ssize_t j;
       
  5868     Py_ssize_t len = self->length;
       
  5869     Py_ssize_t sublen = substring->length;
       
  5870     PyObject *str;
       
  5871 
       
  5872     for (i = len - sublen, j = len; i >= 0; ) {
       
  5873 	if (Py_UNICODE_MATCH(self, i, substring)) {
       
  5874 	    if (maxcount-- <= 0)
       
  5875 		break;
       
  5876 	    SPLIT_APPEND(self->str, i + sublen, j);
       
  5877 	    j = i;
       
  5878 	    i -= sublen;
       
  5879 	} else
       
  5880 	    i--;
       
  5881     }
       
  5882     if (j >= 0) {
       
  5883 	SPLIT_APPEND(self->str, 0, j);
       
  5884     }
       
  5885     if (PyList_Reverse(list) < 0)
       
  5886         goto onError;
       
  5887     return list;
       
  5888 
       
  5889  onError:
       
  5890     Py_DECREF(list);
       
  5891     return NULL;
       
  5892 }
       
  5893 
       
  5894 #undef SPLIT_APPEND
       
  5895 
       
  5896 static
       
  5897 PyObject *split(PyUnicodeObject *self,
       
  5898 		PyUnicodeObject *substring,
       
  5899 		Py_ssize_t maxcount)
       
  5900 {
       
  5901     PyObject *list;
       
  5902 
       
  5903     if (maxcount < 0)
       
  5904         maxcount = PY_SSIZE_T_MAX;
       
  5905 
       
  5906     list = PyList_New(0);
       
  5907     if (!list)
       
  5908         return NULL;
       
  5909 
       
  5910     if (substring == NULL)
       
  5911 	return split_whitespace(self,list,maxcount);
       
  5912 
       
  5913     else if (substring->length == 1)
       
  5914 	return split_char(self,list,substring->str[0],maxcount);
       
  5915 
       
  5916     else if (substring->length == 0) {
       
  5917 	Py_DECREF(list);
       
  5918 	PyErr_SetString(PyExc_ValueError, "empty separator");
       
  5919 	return NULL;
       
  5920     }
       
  5921     else
       
  5922 	return split_substring(self,list,substring,maxcount);
       
  5923 }
       
  5924 
       
  5925 static
       
  5926 PyObject *rsplit(PyUnicodeObject *self,
       
  5927 		 PyUnicodeObject *substring,
       
  5928 		 Py_ssize_t maxcount)
       
  5929 {
       
  5930     PyObject *list;
       
  5931 
       
  5932     if (maxcount < 0)
       
  5933         maxcount = PY_SSIZE_T_MAX;
       
  5934 
       
  5935     list = PyList_New(0);
       
  5936     if (!list)
       
  5937         return NULL;
       
  5938 
       
  5939     if (substring == NULL)
       
  5940 	return rsplit_whitespace(self,list,maxcount);
       
  5941 
       
  5942     else if (substring->length == 1)
       
  5943 	return rsplit_char(self,list,substring->str[0],maxcount);
       
  5944 
       
  5945     else if (substring->length == 0) {
       
  5946 	Py_DECREF(list);
       
  5947 	PyErr_SetString(PyExc_ValueError, "empty separator");
       
  5948 	return NULL;
       
  5949     }
       
  5950     else
       
  5951 	return rsplit_substring(self,list,substring,maxcount);
       
  5952 }
       
  5953 
       
  5954 static
       
  5955 PyObject *replace(PyUnicodeObject *self,
       
  5956 		  PyUnicodeObject *str1,
       
  5957 		  PyUnicodeObject *str2,
       
  5958 		  Py_ssize_t maxcount)
       
  5959 {
       
  5960     PyUnicodeObject *u;
       
  5961 
       
  5962     if (maxcount < 0)
       
  5963 	maxcount = PY_SSIZE_T_MAX;
       
  5964 
       
  5965     if (str1->length == str2->length) {
       
  5966         /* same length */
       
  5967         Py_ssize_t i;
       
  5968         if (str1->length == 1) {
       
  5969             /* replace characters */
       
  5970             Py_UNICODE u1, u2;
       
  5971             if (!findchar(self->str, self->length, str1->str[0]))
       
  5972                 goto nothing;
       
  5973             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
       
  5974             if (!u)
       
  5975                 return NULL;
       
  5976             Py_UNICODE_COPY(u->str, self->str, self->length);
       
  5977             u1 = str1->str[0];
       
  5978             u2 = str2->str[0];
       
  5979             for (i = 0; i < u->length; i++)
       
  5980                 if (u->str[i] == u1) {
       
  5981                     if (--maxcount < 0)
       
  5982                         break;
       
  5983                     u->str[i] = u2;
       
  5984                 }
       
  5985         } else {
       
  5986             i = fastsearch(
       
  5987                 self->str, self->length, str1->str, str1->length, FAST_SEARCH
       
  5988                 );
       
  5989             if (i < 0)
       
  5990                 goto nothing;
       
  5991             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
       
  5992             if (!u)
       
  5993                 return NULL;
       
  5994             Py_UNICODE_COPY(u->str, self->str, self->length);
       
  5995             while (i <= self->length - str1->length)
       
  5996                 if (Py_UNICODE_MATCH(self, i, str1)) {
       
  5997                     if (--maxcount < 0)
       
  5998                         break;
       
  5999                     Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
       
  6000                     i += str1->length;
       
  6001                 } else
       
  6002                     i++;
       
  6003         }
       
  6004     } else {
       
  6005 
       
  6006         Py_ssize_t n, i, j, e;
       
  6007         Py_ssize_t product, new_size, delta;
       
  6008         Py_UNICODE *p;
       
  6009 
       
  6010         /* replace strings */
       
  6011         n = stringlib_count(self->str, self->length, str1->str, str1->length);
       
  6012         if (n > maxcount)
       
  6013             n = maxcount;
       
  6014         if (n == 0)
       
  6015             goto nothing;
       
  6016         /* new_size = self->length + n * (str2->length - str1->length)); */
       
  6017         delta = (str2->length - str1->length);
       
  6018         if (delta == 0) {
       
  6019             new_size = self->length;
       
  6020         } else {
       
  6021             product = n * (str2->length - str1->length);
       
  6022             if ((product / (str2->length - str1->length)) != n) {
       
  6023                 PyErr_SetString(PyExc_OverflowError,
       
  6024                                 "replace string is too long");
       
  6025                 return NULL;
       
  6026             }
       
  6027             new_size = self->length + product;
       
  6028             if (new_size < 0) {
       
  6029                 PyErr_SetString(PyExc_OverflowError,
       
  6030                                 "replace string is too long");
       
  6031                 return NULL;
       
  6032             }
       
  6033         }
       
  6034         u = _PyUnicode_New(new_size);
       
  6035         if (!u)
       
  6036             return NULL;
       
  6037         i = 0;
       
  6038         p = u->str;
       
  6039         e = self->length - str1->length;
       
  6040         if (str1->length > 0) {
       
  6041             while (n-- > 0) {
       
  6042                 /* look for next match */
       
  6043                 j = i;
       
  6044                 while (j <= e) {
       
  6045                     if (Py_UNICODE_MATCH(self, j, str1))
       
  6046                         break;
       
  6047                     j++;
       
  6048                 }
       
  6049 		if (j > i) {
       
  6050                     if (j > e)
       
  6051                         break;
       
  6052                     /* copy unchanged part [i:j] */
       
  6053                     Py_UNICODE_COPY(p, self->str+i, j-i);
       
  6054                     p += j - i;
       
  6055                 }
       
  6056                 /* copy substitution string */
       
  6057                 if (str2->length > 0) {
       
  6058                     Py_UNICODE_COPY(p, str2->str, str2->length);
       
  6059                     p += str2->length;
       
  6060                 }
       
  6061                 i = j + str1->length;
       
  6062             }
       
  6063             if (i < self->length)
       
  6064                 /* copy tail [i:] */
       
  6065                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
       
  6066         } else {
       
  6067             /* interleave */
       
  6068             while (n > 0) {
       
  6069                 Py_UNICODE_COPY(p, str2->str, str2->length);
       
  6070                 p += str2->length;
       
  6071                 if (--n <= 0)
       
  6072                     break;
       
  6073                 *p++ = self->str[i++];
       
  6074             }
       
  6075             Py_UNICODE_COPY(p, self->str+i, self->length-i);
       
  6076         }
       
  6077     }
       
  6078     return (PyObject *) u;
       
  6079 
       
  6080 nothing:
       
  6081     /* nothing to replace; return original string (when possible) */
       
  6082     if (PyUnicode_CheckExact(self)) {
       
  6083         Py_INCREF(self);
       
  6084         return (PyObject *) self;
       
  6085     }
       
  6086     return PyUnicode_FromUnicode(self->str, self->length);
       
  6087 }
       
  6088 
       
  6089 /* --- Unicode Object Methods --------------------------------------------- */
       
  6090 
       
  6091 PyDoc_STRVAR(title__doc__,
       
  6092 "S.title() -> unicode\n\
       
  6093 \n\
       
  6094 Return a titlecased version of S, i.e. words start with title case\n\
       
  6095 characters, all remaining cased characters have lower case.");
       
  6096 
       
  6097 static PyObject*
       
  6098 unicode_title(PyUnicodeObject *self)
       
  6099 {
       
  6100     return fixup(self, fixtitle);
       
  6101 }
       
  6102 
       
  6103 PyDoc_STRVAR(capitalize__doc__,
       
  6104 "S.capitalize() -> unicode\n\
       
  6105 \n\
       
  6106 Return a capitalized version of S, i.e. make the first character\n\
       
  6107 have upper case.");
       
  6108 
       
  6109 static PyObject*
       
  6110 unicode_capitalize(PyUnicodeObject *self)
       
  6111 {
       
  6112     return fixup(self, fixcapitalize);
       
  6113 }
       
  6114 
       
  #if 0
  PyDoc_STRVAR(capwords__doc__,
      "S.capwords() -> unicode\n"
      "\n"
      "Apply .capitalize() to all words in S and return the result with\n"
      "normalized whitespace (all whitespace strings are replaced by ' ').");

  /* Compiled out.  Splits self with split(self, NULL, -1), replaces every
     list entry with its fixup(..., fixcapitalize) result, and joins the list
     again with PyUnicode_Join(NULL, list).  Returns NULL when the split or
     any fixup() call fails. */
  static PyObject *
  unicode_capwords(PyUnicodeObject *self)
  {
      PyObject *words;
      PyObject *result;
      Py_ssize_t pos;

      words = split(self, NULL, -1);
      if (words == NULL)
          return NULL;

      pos = 0;
      while (pos < PyList_GET_SIZE(words)) {
          result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                         fixcapitalize);
          if (result == NULL)
              goto done;      /* result is NULL: that is what gets returned */

          /* Drop the old entry, then store the new reference in its slot. */
          Py_DECREF(PyList_GET_ITEM(words, pos));
          PyList_SET_ITEM(words, pos, result);
          pos++;
      }

      result = PyUnicode_Join(NULL, words);

  done:
      Py_DECREF(words);
      return (PyObject *)result;
  }
  #endif
       
  6152 
       
  6153 /* Argument converter.  Coerces to a single unicode character */
       
  6154 
       
  6155 static int
       
  6156 convert_uc(PyObject *obj, void *addr)
       
  6157 {
       
  6158 	Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
       
  6159 	PyObject *uniobj;
       
  6160 	Py_UNICODE *unistr;
       
  6161 
       
  6162 	uniobj = PyUnicode_FromObject(obj);
       
  6163 	if (uniobj == NULL) {
       
  6164 		PyErr_SetString(PyExc_TypeError,
       
  6165 			"The fill character cannot be converted to Unicode");
       
  6166 		return 0;
       
  6167 	}
       
  6168 	if (PyUnicode_GET_SIZE(uniobj) != 1) {
       
  6169 		PyErr_SetString(PyExc_TypeError,
       
  6170 			"The fill character must be exactly one character long");
       
  6171 		Py_DECREF(uniobj);
       
  6172 		return 0;
       
  6173 	}
       
  6174 	unistr = PyUnicode_AS_UNICODE(uniobj);
       
  6175 	*fillcharloc = unistr[0];
       
  6176 	Py_DECREF(uniobj);
       
  6177 	return 1;
       
  6178 }
       
  6179 
       
  6180 PyDoc_STRVAR(center__doc__,
       
  6181 "S.center(width[, fillchar]) -> unicode\n\
       
  6182 \n\
       
  6183 Return S centered in a Unicode string of length width. Padding is\n\
       
  6184 done using the specified fill character (default is a space)");
       
  6185 
       
  6186 static PyObject *
       
  6187 unicode_center(PyUnicodeObject *self, PyObject *args)
       
  6188 {
       
  6189     Py_ssize_t marg, left;
       
  6190     Py_ssize_t width;
       
  6191     Py_UNICODE fillchar = ' ';
       
  6192 
       
  6193     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
       
  6194         return NULL;
       
  6195 
       
  6196     if (self->length >= width && PyUnicode_CheckExact(self)) {
       
  6197         Py_INCREF(self);
       
  6198         return (PyObject*) self;
       
  6199     }
       
  6200 
       
  6201     marg = width - self->length;
       
  6202     left = marg / 2 + (marg & width & 1);
       
  6203 
       
  6204     return (PyObject*) pad(self, left, marg - left, fillchar);
       
  6205 }
       
  6206 
       
  6207 #if 0
       
  6208 
       
  6209 /* This code should go into some future Unicode collation support
       
  6210    module. The basic comparison should compare ordinals on a naive
       
  6211    basis (this is what Java does and thus JPython too). */
       
  6212 
       
  6213 /* speedy UTF-16 code point order comparison */
       
  6214 /* gleaned from: */
       
  6215 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
       
  6216 
       
  6217 static short utf16Fixup[32] =
       
  6218 {
       
  6219     0, 0, 0, 0, 0, 0, 0, 0,
       
  6220     0, 0, 0, 0, 0, 0, 0, 0,
       
  6221     0, 0, 0, 0, 0, 0, 0, 0,
       
  6222     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
       
  6223 };
       
  6224 
       
  6225 static int
       
  6226 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
       
  6227 {
       
  6228     Py_ssize_t len1, len2;
       
  6229 
       
  6230     Py_UNICODE *s1 = str1->str;
       
  6231     Py_UNICODE *s2 = str2->str;
       
  6232 
       
  6233     len1 = str1->length;
       
  6234     len2 = str2->length;
       
  6235 
       
  6236     while (len1 > 0 && len2 > 0) {
       
  6237         Py_UNICODE c1, c2;
       
  6238 
       
  6239         c1 = *s1++;
       
  6240         c2 = *s2++;
       
  6241 
       
  6242 	if (c1 > (1<<11) * 26)
       
  6243 	    c1 += utf16Fixup[c1>>11];
       
  6244 	if (c2 > (1<<11) * 26)
       
  6245             c2 += utf16Fixup[c2>>11];
       
  6246         /* now c1 and c2 are in UTF-32-compatible order */
       
  6247 
       
  6248         if (c1 != c2)
       
  6249             return (c1 < c2) ? -1 : 1;
       
  6250 
       
  6251         len1--; len2--;
       
  6252     }
       
  6253 
       
  6254     return (len1 < len2) ? -1 : (len1 != len2);
       
  6255 }
       
  6256 
       
  6257 #else
       
  6258 
       
  6259 static int
       
  6260 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
       
  6261 {
       
  6262     register Py_ssize_t len1, len2;
       
  6263 
       
  6264     Py_UNICODE *s1 = str1->str;
       
  6265     Py_UNICODE *s2 = str2->str;
       
  6266 
       
  6267     len1 = str1->length;
       
  6268     len2 = str2->length;
       
  6269 
       
  6270     while (len1 > 0 && len2 > 0) {
       
  6271         Py_UNICODE c1, c2;
       
  6272 
       
  6273         c1 = *s1++;
       
  6274         c2 = *s2++;
       
  6275 
       
  6276         if (c1 != c2)
       
  6277             return (c1 < c2) ? -1 : 1;
       
  6278 
       
  6279         len1--; len2--;
       
  6280     }
       
  6281 
       
  6282     return (len1 < len2) ? -1 : (len1 != len2);
       
  6283 }
       
  6284 
       
  6285 #endif
       
  6286 
       
  6287 int PyUnicode_Compare(PyObject *left,
       
  6288 		      PyObject *right)
       
  6289 {
       
  6290     PyUnicodeObject *u = NULL, *v = NULL;
       
  6291     int result;
       
  6292 
       
  6293     /* Coerce the two arguments */
       
  6294     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
       
  6295     if (u == NULL)
       
  6296 	goto onError;
       
  6297     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
       
  6298     if (v == NULL)
       
  6299 	goto onError;
       
  6300 
       
  6301     /* Shortcut for empty or interned objects */
       
  6302     if (v == u) {
       
  6303 	Py_DECREF(u);
       
  6304 	Py_DECREF(v);
       
  6305 	return 0;
       
  6306     }
       
  6307 
       
  6308     result = unicode_compare(u, v);
       
  6309 
       
  6310     Py_DECREF(u);
       
  6311     Py_DECREF(v);
       
  6312     return result;
       
  6313 
       
  6314 onError:
       
  6315     Py_XDECREF(u);
       
  6316     Py_XDECREF(v);
       
  6317     return -1;
       
  6318 }
       
  6319 
       
  6320 PyObject *PyUnicode_RichCompare(PyObject *left,
       
  6321                                 PyObject *right,
       
  6322                                 int op)
       
  6323 {
       
  6324     int result;
       
  6325 
       
  6326     result = PyUnicode_Compare(left, right);
       
  6327     if (result == -1 && PyErr_Occurred())
       
  6328         goto onError;
       
  6329 
       
  6330     /* Convert the return value to a Boolean */
       
  6331     switch (op) {
       
  6332     case Py_EQ:
       
  6333         result = (result == 0);
       
  6334         break;
       
  6335     case Py_NE:
       
  6336         result = (result != 0);
       
  6337         break;
       
  6338     case Py_LE:
       
  6339         result = (result <= 0);
       
  6340         break;
       
  6341     case Py_GE:
       
  6342         result = (result >= 0);
       
  6343         break;
       
  6344     case Py_LT:
       
  6345         result = (result == -1);
       
  6346         break;
       
  6347     case Py_GT:
       
  6348         result = (result == 1);
       
  6349         break;
       
  6350     }
       
  6351     return PyBool_FromLong(result);
       
  6352 
       
  6353  onError:
       
  6354 
       
  6355     /* Standard case
       
  6356 
       
  6357        Type errors mean that PyUnicode_FromObject() could not convert
       
  6358        one of the arguments (usually the right hand side) to Unicode,
       
  6359        ie. we can't handle the comparison request. However, it is
       
  6360        possible that the other object knows a comparison method, which
       
  6361        is why we return Py_NotImplemented to give the other object a
       
  6362        chance.
       
  6363 
       
  6364     */
       
  6365     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
       
  6366         PyErr_Clear();
       
  6367         Py_INCREF(Py_NotImplemented);
       
  6368         return Py_NotImplemented;
       
  6369     }
       
  6370     if (op != Py_EQ && op != Py_NE)
       
  6371         return NULL;
       
  6372 
       
  6373     /* Equality comparison.
       
  6374 
       
  6375        This is a special case: we silence any PyExc_UnicodeDecodeError
       
  6376        and instead turn it into a PyErr_UnicodeWarning.
       
  6377 
       
  6378     */
       
  6379     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
       
  6380         return NULL;
       
  6381     PyErr_Clear();
       
  6382     if (PyErr_Warn(PyExc_UnicodeWarning, 
       
  6383                    (op == Py_EQ) ? 
       
  6384                    "Unicode equal comparison "
       
  6385                    "failed to convert both arguments to Unicode - "
       
  6386                    "interpreting them as being unequal" :
       
  6387                    "Unicode unequal comparison "
       
  6388                    "failed to convert both arguments to Unicode - "
       
  6389                    "interpreting them as being unequal"
       
  6390                    ) < 0)
       
  6391         return NULL;
       
  6392     result = (op == Py_NE);
       
  6393     return PyBool_FromLong(result);
       
  6394 }
       
  6395 
       
  6396 int PyUnicode_Contains(PyObject *container,
       
  6397 		       PyObject *element)
       
  6398 {
       
  6399     PyObject *str, *sub;
       
  6400     int result;
       
  6401 
       
  6402     /* Coerce the two arguments */
       
  6403     sub = PyUnicode_FromObject(element);
       
  6404     if (!sub) {
       
  6405 	PyErr_SetString(PyExc_TypeError,
       
  6406 	    "'in <string>' requires string as left operand");
       
  6407         return -1;
       
  6408     }
       
  6409 
       
  6410     str = PyUnicode_FromObject(container);
       
  6411     if (!str) {
       
  6412         Py_DECREF(sub);
       
  6413         return -1;
       
  6414     }
       
  6415 
       
  6416     result = stringlib_contains_obj(str, sub);
       
  6417 
       
  6418     Py_DECREF(str);
       
  6419     Py_DECREF(sub);
       
  6420 
       
  6421     return result;
       
  6422 }
       
  6423 
       
  6424 /* Concat to string or Unicode object giving a new Unicode object. */
       
  6425 
       
  6426 PyObject *PyUnicode_Concat(PyObject *left,
       
  6427 			   PyObject *right)
       
  6428 {
       
  6429     PyUnicodeObject *u = NULL, *v = NULL, *w;
       
  6430 
       
  6431     /* Coerce the two arguments */
       
  6432     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
       
  6433     if (u == NULL)
       
  6434 	goto onError;
       
  6435     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
       
  6436     if (v == NULL)
       
  6437 	goto onError;
       
  6438 
       
  6439     /* Shortcuts */
       
  6440     if (v == unicode_empty) {
       
  6441 	Py_DECREF(v);
       
  6442 	return (PyObject *)u;
       
  6443     }
       
  6444     if (u == unicode_empty) {
       
  6445 	Py_DECREF(u);
       
  6446 	return (PyObject *)v;
       
  6447     }
       
  6448 
       
  6449     /* Concat the two Unicode strings */
       
  6450     w = _PyUnicode_New(u->length + v->length);
       
  6451     if (w == NULL)
       
  6452 	goto onError;
       
  6453     Py_UNICODE_COPY(w->str, u->str, u->length);
       
  6454     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
       
  6455 
       
  6456     Py_DECREF(u);
       
  6457     Py_DECREF(v);
       
  6458     return (PyObject *)w;
       
  6459 
       
  6460 onError:
       
  6461     Py_XDECREF(u);
       
  6462     Py_XDECREF(v);
       
  6463     return NULL;
       
  6464 }
       
  6465 
       
  6466 PyDoc_STRVAR(count__doc__,
       
  6467 "S.count(sub[, start[, end]]) -> int\n\
       
  6468 \n\
       
  6469 Return the number of non-overlapping occurrences of substring sub in\n\
       
  6470 Unicode string S[start:end].  Optional arguments start and end are\n\
       
  6471 interpreted as in slice notation.");
       
  6472 
       
  6473 static PyObject *
       
  6474 unicode_count(PyUnicodeObject *self, PyObject *args)
       
  6475 {
       
  6476     PyUnicodeObject *substring;
       
  6477     Py_ssize_t start = 0;
       
  6478     Py_ssize_t end = PY_SSIZE_T_MAX;
       
  6479     PyObject *result;
       
  6480 
       
  6481     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
       
  6482 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
       
  6483         return NULL;
       
  6484 
       
  6485     substring = (PyUnicodeObject *)PyUnicode_FromObject(
       
  6486         (PyObject *)substring);
       
  6487     if (substring == NULL)
       
  6488 	return NULL;
       
  6489 
       
  6490     FIX_START_END(self);
       
  6491 
       
  6492     result = PyInt_FromSsize_t(
       
  6493         stringlib_count(self->str + start, end - start,
       
  6494                         substring->str, substring->length)
       
  6495         );
       
  6496 
       
  6497     Py_DECREF(substring);
       
  6498 
       
  6499     return result;
       
  6500 }
       
  6501 
       
  6502 PyDoc_STRVAR(encode__doc__,
       
  6503 "S.encode([encoding[,errors]]) -> string or unicode\n\
       
  6504 \n\
       
  6505 Encodes S using the codec registered for encoding. encoding defaults\n\
       
  6506 to the default encoding. errors may be given to set a different error\n\
       
  6507 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
       
  6508 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
       
  6509 'xmlcharrefreplace' as well as any other name registered with\n\
       
  6510 codecs.register_error that can handle UnicodeEncodeErrors.");
       
  6511 
       
  6512 static PyObject *
       
  6513 unicode_encode(PyUnicodeObject *self, PyObject *args)
       
  6514 {
       
  6515     char *encoding = NULL;
       
  6516     char *errors = NULL;
       
  6517     PyObject *v;
       
  6518     
       
  6519     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
       
  6520         return NULL;
       
  6521     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
       
  6522     if (v == NULL)
       
  6523         goto onError;
       
  6524     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
       
  6525         PyErr_Format(PyExc_TypeError,
       
  6526                      "encoder did not return a string/unicode object "
       
  6527                      "(type=%.400s)",
       
  6528                      Py_TYPE(v)->tp_name);
       
  6529         Py_DECREF(v);
       
  6530         return NULL;
       
  6531     }
       
  6532     return v;
       
  6533 
       
  6534  onError:
       
  6535     return NULL;
       
  6536 }
       
  6537 
       
  6538 PyDoc_STRVAR(decode__doc__,
       
  6539 "S.decode([encoding[,errors]]) -> string or unicode\n\
       
  6540 \n\
       
  6541 Decodes S using the codec registered for encoding. encoding defaults\n\
       
  6542 to the default encoding. errors may be given to set a different error\n\
       
  6543 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
       
  6544 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
       
  6545 as well as any other name registerd with codecs.register_error that is\n\
       
  6546 able to handle UnicodeDecodeErrors.");
       
  6547 
       
  6548 static PyObject *
       
  6549 unicode_decode(PyUnicodeObject *self, PyObject *args)
       
  6550 {
       
  6551     char *encoding = NULL;
       
  6552     char *errors = NULL;
       
  6553     PyObject *v;
       
  6554     
       
  6555     if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
       
  6556         return NULL;
       
  6557     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
       
  6558     if (v == NULL)
       
  6559         goto onError;
       
  6560     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
       
  6561         PyErr_Format(PyExc_TypeError,
       
  6562                      "decoder did not return a string/unicode object "
       
  6563                      "(type=%.400s)",
       
  6564                      Py_TYPE(v)->tp_name);
       
  6565         Py_DECREF(v);
       
  6566         return NULL;
       
  6567     }
       
  6568     return v;
       
  6569 
       
  6570  onError:
       
  6571     return NULL;
       
  6572 }
       
  6573 
       
  6574 PyDoc_STRVAR(expandtabs__doc__,
       
  6575 "S.expandtabs([tabsize]) -> unicode\n\
       
  6576 \n\
       
  6577 Return a copy of S where all tab characters are expanded using spaces.\n\
       
  6578 If tabsize is not given, a tab size of 8 characters is assumed.");
       
  6579 
       
  6580 static PyObject*
       
  6581 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
       
  6582 {
       
  6583     Py_UNICODE *e;
       
  6584     Py_UNICODE *p;
       
  6585     Py_UNICODE *q;
       
  6586     Py_UNICODE *qe;
       
  6587     Py_ssize_t i, j, incr;
       
  6588     PyUnicodeObject *u;
       
  6589     int tabsize = 8;
       
  6590 
       
  6591     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
       
  6592 	return NULL;
       
  6593 
       
  6594     /* First pass: determine size of output string */
       
  6595     i = 0; /* chars up to and including most recent \n or \r */
       
  6596     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
       
  6597     e = self->str + self->length; /* end of input */
       
  6598     for (p = self->str; p < e; p++)
       
  6599         if (*p == '\t') {
       
  6600 	    if (tabsize > 0) {
       
  6601 		incr = tabsize - (j % tabsize); /* cannot overflow */
       
  6602 		if (j > PY_SSIZE_T_MAX - incr)
       
  6603 		    goto overflow1;
       
  6604 		j += incr;
       
  6605             }
       
  6606 	}
       
  6607         else {
       
  6608 	    if (j > PY_SSIZE_T_MAX - 1)
       
  6609 		goto overflow1;
       
  6610             j++;
       
  6611             if (*p == '\n' || *p == '\r') {
       
  6612 		if (i > PY_SSIZE_T_MAX - j)
       
  6613 		    goto overflow1;
       
  6614                 i += j;
       
  6615                 j = 0;
       
  6616             }
       
  6617         }
       
  6618 
       
  6619     if (i > PY_SSIZE_T_MAX - j)
       
  6620 	goto overflow1;
       
  6621 
       
  6622     /* Second pass: create output string and fill it */
       
  6623     u = _PyUnicode_New(i + j);
       
  6624     if (!u)
       
  6625         return NULL;
       
  6626 
       
  6627     j = 0; /* same as in first pass */
       
  6628     q = u->str; /* next output char */
       
  6629     qe = u->str + u->length; /* end of output */
       
  6630 
       
  6631     for (p = self->str; p < e; p++)
       
  6632         if (*p == '\t') {
       
  6633 	    if (tabsize > 0) {
       
  6634 		i = tabsize - (j % tabsize);
       
  6635 		j += i;
       
  6636 		while (i--) {
       
  6637 		    if (q >= qe)
       
  6638 			goto overflow2;
       
  6639 		    *q++ = ' ';
       
  6640                 }
       
  6641 	    }
       
  6642 	}
       
  6643 	else {
       
  6644 	    if (q >= qe)
       
  6645 		goto overflow2;
       
  6646 	    *q++ = *p;
       
  6647             j++;
       
  6648             if (*p == '\n' || *p == '\r')
       
  6649                 j = 0;
       
  6650         }
       
  6651 
       
  6652     return (PyObject*) u;
       
  6653 
       
  6654   overflow2:
       
  6655     Py_DECREF(u);
       
  6656   overflow1:
       
  6657     PyErr_SetString(PyExc_OverflowError, "new string is too long");
       
  6658     return NULL;
       
  6659 }
       
  6660 
       
  6661 PyDoc_STRVAR(find__doc__,
       
  6662 "S.find(sub [,start [,end]]) -> int\n\
       
  6663 \n\
       
  6664 Return the lowest index in S where substring sub is found,\n\
       
  6665 such that sub is contained within s[start:end].  Optional\n\
       
  6666 arguments start and end are interpreted as in slice notation.\n\
       
  6667 \n\
       
  6668 Return -1 on failure.");
       
  6669 
       
  6670 static PyObject *
       
  6671 unicode_find(PyUnicodeObject *self, PyObject *args)
       
  6672 {
       
  6673     PyObject *substring;
       
  6674     Py_ssize_t start;
       
  6675     Py_ssize_t end;
       
  6676     Py_ssize_t result;
       
  6677 
       
  6678     if (!_ParseTupleFinds(args, &substring, &start, &end))
       
  6679         return NULL;
       
  6680 
       
  6681     result = stringlib_find_slice(
       
  6682         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
       
  6683         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
       
  6684         start, end
       
  6685         );
       
  6686 
       
  6687     Py_DECREF(substring);
       
  6688 
       
  6689     return PyInt_FromSsize_t(result);
       
  6690 }
       
  6691 
       
  6692 static PyObject *
       
  6693 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
       
  6694 {
       
  6695     if (index < 0 || index >= self->length) {
       
  6696         PyErr_SetString(PyExc_IndexError, "string index out of range");
       
  6697         return NULL;
       
  6698     }
       
  6699 
       
  6700     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
       
  6701 }
       
  6702 
       
  6703 static long
       
  6704 unicode_hash(PyUnicodeObject *self)
       
  6705 {
       
  6706     /* Since Unicode objects compare equal to their ASCII string
       
  6707        counterparts, they should use the individual character values
       
  6708        as basis for their hash value.  This is needed to assure that
       
  6709        strings and Unicode objects behave in the same way as
       
  6710        dictionary keys. */
       
  6711 
       
  6712     register Py_ssize_t len;
       
  6713     register Py_UNICODE *p;
       
  6714     register long x;
       
  6715 
       
  6716     if (self->hash != -1)
       
  6717 	return self->hash;
       
  6718     len = PyUnicode_GET_SIZE(self);
       
  6719     p = PyUnicode_AS_UNICODE(self);
       
  6720     x = *p << 7;
       
  6721     while (--len >= 0)
       
  6722 	x = (1000003*x) ^ *p++;
       
  6723     x ^= PyUnicode_GET_SIZE(self);
       
  6724     if (x == -1)
       
  6725 	x = -2;
       
  6726     self->hash = x;
       
  6727     return x;
       
  6728 }
       
  6729 
       
  6730 PyDoc_STRVAR(index__doc__,
       
  6731 "S.index(sub [,start [,end]]) -> int\n\
       
  6732 \n\
       
  6733 Like S.find() but raise ValueError when the substring is not found.");
       
  6734 
       
  6735 static PyObject *
       
  6736 unicode_index(PyUnicodeObject *self, PyObject *args)
       
  6737 {
       
  6738     Py_ssize_t result;
       
  6739     PyObject *substring;
       
  6740     Py_ssize_t start;
       
  6741     Py_ssize_t end;
       
  6742 
       
  6743     if (!_ParseTupleFinds(args, &substring, &start, &end))
       
  6744         return NULL;
       
  6745 
       
  6746     result = stringlib_find_slice(
       
  6747         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
       
  6748         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
       
  6749         start, end
       
  6750         );
       
  6751 
       
  6752     Py_DECREF(substring);
       
  6753 
       
  6754     if (result < 0) {
       
  6755         PyErr_SetString(PyExc_ValueError, "substring not found");
       
  6756         return NULL;
       
  6757     }
       
  6758 
       
  6759     return PyInt_FromSsize_t(result);
       
  6760 }
       
  6761 
       
  6762 PyDoc_STRVAR(islower__doc__,
       
  6763 "S.islower() -> bool\n\
       
  6764 \n\
       
  6765 Return True if all cased characters in S are lowercase and there is\n\
       
  6766 at least one cased character in S, False otherwise.");
       
  6767 
       
  6768 static PyObject*
       
  6769 unicode_islower(PyUnicodeObject *self)
       
  6770 {
       
  6771     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6772     register const Py_UNICODE *e;
       
  6773     int cased;
       
  6774 
       
  6775     /* Shortcut for single character strings */
       
  6776     if (PyUnicode_GET_SIZE(self) == 1)
       
  6777 	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
       
  6778 
       
  6779     /* Special case for empty strings */
       
  6780     if (PyUnicode_GET_SIZE(self) == 0)
       
  6781 	return PyBool_FromLong(0);
       
  6782 
       
  6783     e = p + PyUnicode_GET_SIZE(self);
       
  6784     cased = 0;
       
  6785     for (; p < e; p++) {
       
  6786 	register const Py_UNICODE ch = *p;
       
  6787 
       
  6788 	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
       
  6789 	    return PyBool_FromLong(0);
       
  6790 	else if (!cased && Py_UNICODE_ISLOWER(ch))
       
  6791 	    cased = 1;
       
  6792     }
       
  6793     return PyBool_FromLong(cased);
       
  6794 }
       
  6795 
       
  6796 PyDoc_STRVAR(isupper__doc__,
       
  6797 "S.isupper() -> bool\n\
       
  6798 \n\
       
  6799 Return True if all cased characters in S are uppercase and there is\n\
       
  6800 at least one cased character in S, False otherwise.");
       
  6801 
       
  6802 static PyObject*
       
  6803 unicode_isupper(PyUnicodeObject *self)
       
  6804 {
       
  6805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6806     register const Py_UNICODE *e;
       
  6807     int cased;
       
  6808 
       
  6809     /* Shortcut for single character strings */
       
  6810     if (PyUnicode_GET_SIZE(self) == 1)
       
  6811 	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
       
  6812 
       
  6813     /* Special case for empty strings */
       
  6814     if (PyUnicode_GET_SIZE(self) == 0)
       
  6815 	return PyBool_FromLong(0);
       
  6816 
       
  6817     e = p + PyUnicode_GET_SIZE(self);
       
  6818     cased = 0;
       
  6819     for (; p < e; p++) {
       
  6820 	register const Py_UNICODE ch = *p;
       
  6821 
       
  6822 	if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
       
  6823 	    return PyBool_FromLong(0);
       
  6824 	else if (!cased && Py_UNICODE_ISUPPER(ch))
       
  6825 	    cased = 1;
       
  6826     }
       
  6827     return PyBool_FromLong(cased);
       
  6828 }
       
  6829 
       
  6830 PyDoc_STRVAR(istitle__doc__,
       
  6831 "S.istitle() -> bool\n\
       
  6832 \n\
       
  6833 Return True if S is a titlecased string and there is at least one\n\
       
  6834 character in S, i.e. upper- and titlecase characters may only\n\
       
  6835 follow uncased characters and lowercase characters only cased ones.\n\
       
  6836 Return False otherwise.");
       
  6837 
       
  6838 static PyObject*
       
  6839 unicode_istitle(PyUnicodeObject *self)
       
  6840 {
       
  6841     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6842     register const Py_UNICODE *e;
       
  6843     int cased, previous_is_cased;
       
  6844 
       
  6845     /* Shortcut for single character strings */
       
  6846     if (PyUnicode_GET_SIZE(self) == 1)
       
  6847 	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
       
  6848 			       (Py_UNICODE_ISUPPER(*p) != 0));
       
  6849 
       
  6850     /* Special case for empty strings */
       
  6851     if (PyUnicode_GET_SIZE(self) == 0)
       
  6852 	return PyBool_FromLong(0);
       
  6853 
       
  6854     e = p + PyUnicode_GET_SIZE(self);
       
  6855     cased = 0;
       
  6856     previous_is_cased = 0;
       
  6857     for (; p < e; p++) {
       
  6858 	register const Py_UNICODE ch = *p;
       
  6859 
       
  6860 	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
       
  6861 	    if (previous_is_cased)
       
  6862 		return PyBool_FromLong(0);
       
  6863 	    previous_is_cased = 1;
       
  6864 	    cased = 1;
       
  6865 	}
       
  6866 	else if (Py_UNICODE_ISLOWER(ch)) {
       
  6867 	    if (!previous_is_cased)
       
  6868 		return PyBool_FromLong(0);
       
  6869 	    previous_is_cased = 1;
       
  6870 	    cased = 1;
       
  6871 	}
       
  6872 	else
       
  6873 	    previous_is_cased = 0;
       
  6874     }
       
  6875     return PyBool_FromLong(cased);
       
  6876 }
       
  6877 
       
  6878 PyDoc_STRVAR(isspace__doc__,
       
  6879 "S.isspace() -> bool\n\
       
  6880 \n\
       
  6881 Return True if all characters in S are whitespace\n\
       
  6882 and there is at least one character in S, False otherwise.");
       
  6883 
       
  6884 static PyObject*
       
  6885 unicode_isspace(PyUnicodeObject *self)
       
  6886 {
       
  6887     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6888     register const Py_UNICODE *e;
       
  6889 
       
  6890     /* Shortcut for single character strings */
       
  6891     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  6892 	Py_UNICODE_ISSPACE(*p))
       
  6893 	return PyBool_FromLong(1);
       
  6894 
       
  6895     /* Special case for empty strings */
       
  6896     if (PyUnicode_GET_SIZE(self) == 0)
       
  6897 	return PyBool_FromLong(0);
       
  6898 
       
  6899     e = p + PyUnicode_GET_SIZE(self);
       
  6900     for (; p < e; p++) {
       
  6901 	if (!Py_UNICODE_ISSPACE(*p))
       
  6902 	    return PyBool_FromLong(0);
       
  6903     }
       
  6904     return PyBool_FromLong(1);
       
  6905 }
       
  6906 
       
  6907 PyDoc_STRVAR(isalpha__doc__,
       
  6908 "S.isalpha() -> bool\n\
       
  6909 \n\
       
  6910 Return True if all characters in S are alphabetic\n\
       
  6911 and there is at least one character in S, False otherwise.");
       
  6912 
       
  6913 static PyObject*
       
  6914 unicode_isalpha(PyUnicodeObject *self)
       
  6915 {
       
  6916     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6917     register const Py_UNICODE *e;
       
  6918 
       
  6919     /* Shortcut for single character strings */
       
  6920     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  6921 	Py_UNICODE_ISALPHA(*p))
       
  6922 	return PyBool_FromLong(1);
       
  6923 
       
  6924     /* Special case for empty strings */
       
  6925     if (PyUnicode_GET_SIZE(self) == 0)
       
  6926 	return PyBool_FromLong(0);
       
  6927 
       
  6928     e = p + PyUnicode_GET_SIZE(self);
       
  6929     for (; p < e; p++) {
       
  6930 	if (!Py_UNICODE_ISALPHA(*p))
       
  6931 	    return PyBool_FromLong(0);
       
  6932     }
       
  6933     return PyBool_FromLong(1);
       
  6934 }
       
  6935 
       
  6936 PyDoc_STRVAR(isalnum__doc__,
       
  6937 "S.isalnum() -> bool\n\
       
  6938 \n\
       
  6939 Return True if all characters in S are alphanumeric\n\
       
  6940 and there is at least one character in S, False otherwise.");
       
  6941 
       
  6942 static PyObject*
       
  6943 unicode_isalnum(PyUnicodeObject *self)
       
  6944 {
       
  6945     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6946     register const Py_UNICODE *e;
       
  6947 
       
  6948     /* Shortcut for single character strings */
       
  6949     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  6950 	Py_UNICODE_ISALNUM(*p))
       
  6951 	return PyBool_FromLong(1);
       
  6952 
       
  6953     /* Special case for empty strings */
       
  6954     if (PyUnicode_GET_SIZE(self) == 0)
       
  6955 	return PyBool_FromLong(0);
       
  6956 
       
  6957     e = p + PyUnicode_GET_SIZE(self);
       
  6958     for (; p < e; p++) {
       
  6959 	if (!Py_UNICODE_ISALNUM(*p))
       
  6960 	    return PyBool_FromLong(0);
       
  6961     }
       
  6962     return PyBool_FromLong(1);
       
  6963 }
       
  6964 
       
  6965 PyDoc_STRVAR(isdecimal__doc__,
       
  6966 "S.isdecimal() -> bool\n\
       
  6967 \n\
       
  6968 Return True if there are only decimal characters in S,\n\
       
  6969 False otherwise.");
       
  6970 
       
  6971 static PyObject*
       
  6972 unicode_isdecimal(PyUnicodeObject *self)
       
  6973 {
       
  6974     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  6975     register const Py_UNICODE *e;
       
  6976 
       
  6977     /* Shortcut for single character strings */
       
  6978     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  6979 	Py_UNICODE_ISDECIMAL(*p))
       
  6980 	return PyBool_FromLong(1);
       
  6981 
       
  6982     /* Special case for empty strings */
       
  6983     if (PyUnicode_GET_SIZE(self) == 0)
       
  6984 	return PyBool_FromLong(0);
       
  6985 
       
  6986     e = p + PyUnicode_GET_SIZE(self);
       
  6987     for (; p < e; p++) {
       
  6988 	if (!Py_UNICODE_ISDECIMAL(*p))
       
  6989 	    return PyBool_FromLong(0);
       
  6990     }
       
  6991     return PyBool_FromLong(1);
       
  6992 }
       
  6993 
       
  6994 PyDoc_STRVAR(isdigit__doc__,
       
  6995 "S.isdigit() -> bool\n\
       
  6996 \n\
       
  6997 Return True if all characters in S are digits\n\
       
  6998 and there is at least one character in S, False otherwise.");
       
  6999 
       
  7000 static PyObject*
       
  7001 unicode_isdigit(PyUnicodeObject *self)
       
  7002 {
       
  7003     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  7004     register const Py_UNICODE *e;
       
  7005 
       
  7006     /* Shortcut for single character strings */
       
  7007     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  7008 	Py_UNICODE_ISDIGIT(*p))
       
  7009 	return PyBool_FromLong(1);
       
  7010 
       
  7011     /* Special case for empty strings */
       
  7012     if (PyUnicode_GET_SIZE(self) == 0)
       
  7013 	return PyBool_FromLong(0);
       
  7014 
       
  7015     e = p + PyUnicode_GET_SIZE(self);
       
  7016     for (; p < e; p++) {
       
  7017 	if (!Py_UNICODE_ISDIGIT(*p))
       
  7018 	    return PyBool_FromLong(0);
       
  7019     }
       
  7020     return PyBool_FromLong(1);
       
  7021 }
       
  7022 
       
  7023 PyDoc_STRVAR(isnumeric__doc__,
       
  7024 "S.isnumeric() -> bool\n\
       
  7025 \n\
       
  7026 Return True if there are only numeric characters in S,\n\
       
  7027 False otherwise.");
       
  7028 
       
  7029 static PyObject*
       
  7030 unicode_isnumeric(PyUnicodeObject *self)
       
  7031 {
       
  7032     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
       
  7033     register const Py_UNICODE *e;
       
  7034 
       
  7035     /* Shortcut for single character strings */
       
  7036     if (PyUnicode_GET_SIZE(self) == 1 &&
       
  7037 	Py_UNICODE_ISNUMERIC(*p))
       
  7038 	return PyBool_FromLong(1);
       
  7039 
       
  7040     /* Special case for empty strings */
       
  7041     if (PyUnicode_GET_SIZE(self) == 0)
       
  7042 	return PyBool_FromLong(0);
       
  7043 
       
  7044     e = p + PyUnicode_GET_SIZE(self);
       
  7045     for (; p < e; p++) {
       
  7046 	if (!Py_UNICODE_ISNUMERIC(*p))
       
  7047 	    return PyBool_FromLong(0);
       
  7048     }
       
  7049     return PyBool_FromLong(1);
       
  7050 }
       
  7051 
       
  7052 PyDoc_STRVAR(join__doc__,
       
  7053 "S.join(sequence) -> unicode\n\
       
  7054 \n\
       
  7055 Return a string which is the concatenation of the strings in the\n\
       
  7056 sequence.  The separator between elements is S.");
       
  7057 
       
  7058 static PyObject*
       
  7059 unicode_join(PyObject *self, PyObject *data)
       
  7060 {
       
  7061     return PyUnicode_Join(self, data);
       
  7062 }
       
  7063 
       
  7064 static Py_ssize_t
       
  7065 unicode_length(PyUnicodeObject *self)
       
  7066 {
       
  7067     return self->length;
       
  7068 }
       
  7069 
       
  7070 PyDoc_STRVAR(ljust__doc__,
       
  7071 "S.ljust(width[, fillchar]) -> int\n\
       
  7072 \n\
       
  7073 Return S left-justified in a Unicode string of length width. Padding is\n\
       
  7074 done using the specified fill character (default is a space).");
       
  7075 
       
  7076 static PyObject *
       
  7077 unicode_ljust(PyUnicodeObject *self, PyObject *args)
       
  7078 {
       
  7079     Py_ssize_t width;
       
  7080     Py_UNICODE fillchar = ' ';
       
  7081 
       
  7082     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
       
  7083         return NULL;
       
  7084 
       
  7085     if (self->length >= width && PyUnicode_CheckExact(self)) {
       
  7086         Py_INCREF(self);
       
  7087         return (PyObject*) self;
       
  7088     }
       
  7089 
       
  7090     return (PyObject*) pad(self, 0, width - self->length, fillchar);
       
  7091 }
       
  7092 
       
  7093 PyDoc_STRVAR(lower__doc__,
       
  7094 "S.lower() -> unicode\n\
       
  7095 \n\
       
  7096 Return a copy of the string S converted to lowercase.");
       
  7097 
       
  7098 static PyObject*
       
  7099 unicode_lower(PyUnicodeObject *self)
       
  7100 {
       
  7101     return fixup(self, fixlower);
       
  7102 }
       
  7103 
       
  /* Strip modes shared by the strip()/lstrip()/rstrip() helpers.  The
     values double as indices into stripformat[] below. */
  #define LEFTSTRIP 0
  #define RIGHTSTRIP 1
  #define BOTHSTRIP 2

  /* PyArg_ParseTuple format strings, one per strip mode */
  static const char *stripformat[] = {
      "|O:lstrip",    /* LEFTSTRIP */
      "|O:rstrip",    /* RIGHTSTRIP */
      "|O:strip"      /* BOTHSTRIP */
  };

  /* Method name for a strip mode: the format string minus its 3-character
     "|O:" prefix. */
  #define STRIPNAME(i) (&stripformat[i][3])
       
  7112 
       
  7113 /* externally visible for str.strip(unicode) */
       
  7114 PyObject *
       
  7115 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
       
  7116 {
       
  7117 	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
       
  7118 	Py_ssize_t len = PyUnicode_GET_SIZE(self);
       
  7119 	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
       
  7120 	Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
       
  7121 	Py_ssize_t i, j;
       
  7122 
       
  7123         BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
       
  7124 
       
  7125 	i = 0;
       
  7126 	if (striptype != RIGHTSTRIP) {
       
  7127             while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
       
  7128                 i++;
       
  7129             }
       
  7130 	}
       
  7131 
       
  7132 	j = len;
       
  7133 	if (striptype != LEFTSTRIP) {
       
  7134             do {
       
  7135                 j--;
       
  7136             } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
       
  7137             j++;
       
  7138 	}
       
  7139 
       
  7140 	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
       
  7141             Py_INCREF(self);
       
  7142             return (PyObject*)self;
       
  7143 	}
       
  7144 	else
       
  7145             return PyUnicode_FromUnicode(s+i, j-i);
       
  7146 }
       
  7147 
       
  7148 
       
  7149 static PyObject *
       
  7150 do_strip(PyUnicodeObject *self, int striptype)
       
  7151 {
       
  7152 	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
       
  7153 	Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
       
  7154 
       
  7155 	i = 0;
       
  7156 	if (striptype != RIGHTSTRIP) {
       
  7157 		while (i < len && Py_UNICODE_ISSPACE(s[i])) {
       
  7158 			i++;
       
  7159 		}
       
  7160 	}
       
  7161 
       
  7162 	j = len;
       
  7163 	if (striptype != LEFTSTRIP) {
       
  7164 		do {
       
  7165 			j--;
       
  7166 		} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
       
  7167 		j++;
       
  7168 	}
       
  7169 
       
  7170 	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
       
  7171 		Py_INCREF(self);
       
  7172 		return (PyObject*)self;
       
  7173 	}
       
  7174 	else
       
  7175 		return PyUnicode_FromUnicode(s+i, j-i);
       
  7176 }
       
  7177 
       
  7178 
       
  7179 static PyObject *
       
  7180 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
       
  7181 {
       
  7182 	PyObject *sep = NULL;
       
  7183 
       
  7184 	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
       
  7185 		return NULL;
       
  7186 
       
  7187 	if (sep != NULL && sep != Py_None) {
       
  7188 		if (PyUnicode_Check(sep))
       
  7189 			return _PyUnicode_XStrip(self, striptype, sep);
       
  7190 		else if (PyString_Check(sep)) {
       
  7191 			PyObject *res;
       
  7192 			sep = PyUnicode_FromObject(sep);
       
  7193 			if (sep==NULL)
       
  7194 				return NULL;
       
  7195 			res = _PyUnicode_XStrip(self, striptype, sep);
       
  7196 			Py_DECREF(sep);
       
  7197 			return res;
       
  7198 		}
       
  7199 		else {
       
  7200 			PyErr_Format(PyExc_TypeError,
       
  7201 				     "%s arg must be None, unicode or str",
       
  7202 				     STRIPNAME(striptype));
       
  7203 			return NULL;
       
  7204 		}
       
  7205 	}
       
  7206 
       
  7207 	return do_strip(self, striptype);
       
  7208 }
       
  7209 
       
  7210 
       
  7211 PyDoc_STRVAR(strip__doc__,
       
  7212 "S.strip([chars]) -> unicode\n\
       
  7213 \n\
       
  7214 Return a copy of the string S with leading and trailing\n\
       
  7215 whitespace removed.\n\
       
  7216 If chars is given and not None, remove characters in chars instead.\n\
       
  7217 If chars is a str, it will be converted to unicode before stripping");
       
  7218 
       
  7219 static PyObject *
       
  7220 unicode_strip(PyUnicodeObject *self, PyObject *args)
       
  7221 {
       
  7222 	if (PyTuple_GET_SIZE(args) == 0)
       
  7223 		return do_strip(self, BOTHSTRIP); /* Common case */
       
  7224 	else
       
  7225 		return do_argstrip(self, BOTHSTRIP, args);
       
  7226 }
       
  7227 
       
  7228 
       
  7229 PyDoc_STRVAR(lstrip__doc__,
       
  7230 "S.lstrip([chars]) -> unicode\n\
       
  7231 \n\
       
  7232 Return a copy of the string S with leading whitespace removed.\n\
       
  7233 If chars is given and not None, remove characters in chars instead.\n\
       
  7234 If chars is a str, it will be converted to unicode before stripping");
       
  7235 
       
  7236 static PyObject *
       
  7237 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
       
  7238 {
       
  7239 	if (PyTuple_GET_SIZE(args) == 0)
       
  7240 		return do_strip(self, LEFTSTRIP); /* Common case */
       
  7241 	else
       
  7242 		return do_argstrip(self, LEFTSTRIP, args);
       
  7243 }
       
  7244 
       
  7245 
       
  7246 PyDoc_STRVAR(rstrip__doc__,
       
  7247 "S.rstrip([chars]) -> unicode\n\
       
  7248 \n\
       
  7249 Return a copy of the string S with trailing whitespace removed.\n\
       
  7250 If chars is given and not None, remove characters in chars instead.\n\
       
  7251 If chars is a str, it will be converted to unicode before stripping");
       
  7252 
       
  7253 static PyObject *
       
  7254 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
       
  7255 {
       
  7256 	if (PyTuple_GET_SIZE(args) == 0)
       
  7257 		return do_strip(self, RIGHTSTRIP); /* Common case */
       
  7258 	else
       
  7259 		return do_argstrip(self, RIGHTSTRIP, args);
       
  7260 }
       
  7261 
       
  7262 
       
  7263 static PyObject*
       
  7264 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
       
  7265 {
       
  7266     PyUnicodeObject *u;
       
  7267     Py_UNICODE *p;
       
  7268     Py_ssize_t nchars;
       
  7269     size_t nbytes;
       
  7270 
       
  7271     if (len < 0)
       
  7272         len = 0;
       
  7273 
       
  7274     if (len == 1 && PyUnicode_CheckExact(str)) {
       
  7275         /* no repeat, return original string */
       
  7276         Py_INCREF(str);
       
  7277         return (PyObject*) str;
       
  7278     }
       
  7279 
       
  7280     /* ensure # of chars needed doesn't overflow int and # of bytes
       
  7281      * needed doesn't overflow size_t
       
  7282      */
       
  7283     nchars = len * str->length;
       
  7284     if (len && nchars / len != str->length) {
       
  7285         PyErr_SetString(PyExc_OverflowError,
       
  7286                         "repeated string is too long");
       
  7287         return NULL;
       
  7288     }
       
  7289     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
       
  7290     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
       
  7291         PyErr_SetString(PyExc_OverflowError,
       
  7292                         "repeated string is too long");
       
  7293         return NULL;
       
  7294     }
       
  7295     u = _PyUnicode_New(nchars);
       
  7296     if (!u)
       
  7297         return NULL;
       
  7298 
       
  7299     p = u->str;
       
  7300 
       
  7301     if (str->length == 1 && len > 0) {
       
  7302         Py_UNICODE_FILL(p, str->str[0], len);
       
  7303     } else {
       
  7304 	Py_ssize_t done = 0; /* number of characters copied this far */
       
  7305 	if (done < nchars) {
       
  7306             Py_UNICODE_COPY(p, str->str, str->length);
       
  7307             done = str->length;
       
  7308 	}
       
  7309 	while (done < nchars) {
       
  7310             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
       
  7311             Py_UNICODE_COPY(p+done, p, n);
       
  7312             done += n;
       
  7313 	}
       
  7314     }
       
  7315 
       
  7316     return (PyObject*) u;
       
  7317 }
       
  7318 
       
  7319 PyObject *PyUnicode_Replace(PyObject *obj,
       
  7320 			    PyObject *subobj,
       
  7321 			    PyObject *replobj,
       
  7322 			    Py_ssize_t maxcount)
       
  7323 {
       
  7324     PyObject *self;
       
  7325     PyObject *str1;
       
  7326     PyObject *str2;
       
  7327     PyObject *result;
       
  7328 
       
  7329     self = PyUnicode_FromObject(obj);
       
  7330     if (self == NULL)
       
  7331 	return NULL;
       
  7332     str1 = PyUnicode_FromObject(subobj);
       
  7333     if (str1 == NULL) {
       
  7334 	Py_DECREF(self);
       
  7335 	return NULL;
       
  7336     }
       
  7337     str2 = PyUnicode_FromObject(replobj);
       
  7338     if (str2 == NULL) {
       
  7339 	Py_DECREF(self);
       
  7340 	Py_DECREF(str1);
       
  7341 	return NULL;
       
  7342     }
       
  7343     result = replace((PyUnicodeObject *)self,
       
  7344 		     (PyUnicodeObject *)str1,
       
  7345 		     (PyUnicodeObject *)str2,
       
  7346 		     maxcount);
       
  7347     Py_DECREF(self);
       
  7348     Py_DECREF(str1);
       
  7349     Py_DECREF(str2);
       
  7350     return result;
       
  7351 }
       
  7352 
       
  7353 PyDoc_STRVAR(replace__doc__,
       
  7354 "S.replace (old, new[, count]) -> unicode\n\
       
  7355 \n\
       
  7356 Return a copy of S with all occurrences of substring\n\
       
  7357 old replaced by new.  If the optional argument count is\n\
       
  7358 given, only the first count occurrences are replaced.");
       
  7359 
       
  7360 static PyObject*
       
  7361 unicode_replace(PyUnicodeObject *self, PyObject *args)
       
  7362 {
       
  7363     PyUnicodeObject *str1;
       
  7364     PyUnicodeObject *str2;
       
  7365     Py_ssize_t maxcount = -1;
       
  7366     PyObject *result;
       
  7367 
       
  7368     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
       
  7369         return NULL;
       
  7370     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
       
  7371     if (str1 == NULL)
       
  7372 	return NULL;
       
  7373     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
       
  7374     if (str2 == NULL) {
       
  7375 	Py_DECREF(str1);
       
  7376 	return NULL;
       
  7377     }
       
  7378 
       
  7379     result = replace(self, str1, str2, maxcount);
       
  7380 
       
  7381     Py_DECREF(str1);
       
  7382     Py_DECREF(str2);
       
  7383     return result;
       
  7384 }
       
  7385 
       
  7386 static
       
  7387 PyObject *unicode_repr(PyObject *unicode)
       
  7388 {
       
  7389     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
       
  7390 				PyUnicode_GET_SIZE(unicode),
       
  7391 				1);
       
  7392 }
       
  7393 
       
  7394 PyDoc_STRVAR(rfind__doc__,
       
  7395 "S.rfind(sub [,start [,end]]) -> int\n\
       
  7396 \n\
       
  7397 Return the highest index in S where substring sub is found,\n\
       
  7398 such that sub is contained within s[start:end].  Optional\n\
       
  7399 arguments start and end are interpreted as in slice notation.\n\
       
  7400 \n\
       
  7401 Return -1 on failure.");
       
  7402 
       
  7403 static PyObject *
       
  7404 unicode_rfind(PyUnicodeObject *self, PyObject *args)
       
  7405 {
       
  7406     PyObject *substring;
       
  7407     Py_ssize_t start;
       
  7408     Py_ssize_t end;
       
  7409     Py_ssize_t result;
       
  7410 
       
  7411     if (!_ParseTupleFinds(args, &substring, &start, &end))
       
  7412 	    return NULL;
       
  7413 
       
  7414     result = stringlib_rfind_slice(
       
  7415         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
       
  7416         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
       
  7417         start, end
       
  7418         );
       
  7419 
       
  7420     Py_DECREF(substring);
       
  7421 
       
  7422     return PyInt_FromSsize_t(result);
       
  7423 }
       
  7424 
       
  7425 PyDoc_STRVAR(rindex__doc__,
       
  7426 "S.rindex(sub [,start [,end]]) -> int\n\
       
  7427 \n\
       
  7428 Like S.rfind() but raise ValueError when the substring is not found.");
       
  7429 
       
  7430 static PyObject *
       
  7431 unicode_rindex(PyUnicodeObject *self, PyObject *args)
       
  7432 {
       
  7433     PyObject *substring;
       
  7434     Py_ssize_t start;
       
  7435     Py_ssize_t end;
       
  7436     Py_ssize_t result;
       
  7437 
       
  7438     if (!_ParseTupleFinds(args, &substring, &start, &end))
       
  7439 	    return NULL;
       
  7440 
       
  7441     result = stringlib_rfind_slice(
       
  7442         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
       
  7443         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
       
  7444         start, end
       
  7445         );
       
  7446 
       
  7447     Py_DECREF(substring);
       
  7448 
       
  7449     if (result < 0) {
       
  7450         PyErr_SetString(PyExc_ValueError, "substring not found");
       
  7451         return NULL;
       
  7452     }
       
  7453     return PyInt_FromSsize_t(result);
       
  7454 }
       
  7455 
       
  7456 PyDoc_STRVAR(rjust__doc__,
       
  7457 "S.rjust(width[, fillchar]) -> unicode\n\
       
  7458 \n\
       
  7459 Return S right-justified in a Unicode string of length width. Padding is\n\
       
  7460 done using the specified fill character (default is a space).");
       
  7461 
       
  7462 static PyObject *
       
  7463 unicode_rjust(PyUnicodeObject *self, PyObject *args)
       
  7464 {
       
  7465     Py_ssize_t width;
       
  7466     Py_UNICODE fillchar = ' ';
       
  7467 
       
  7468     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
       
  7469         return NULL;
       
  7470 
       
  7471     if (self->length >= width && PyUnicode_CheckExact(self)) {
       
  7472         Py_INCREF(self);
       
  7473         return (PyObject*) self;
       
  7474     }
       
  7475 
       
  7476     return (PyObject*) pad(self, width - self->length, 0, fillchar);
       
  7477 }
       
  7478 
       
  7479 static PyObject*
       
  7480 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
       
  7481 {
       
  7482     /* standard clamping */
       
  7483     if (start < 0)
       
  7484         start = 0;
       
  7485     if (end < 0)
       
  7486         end = 0;
       
  7487     if (end > self->length)
       
  7488         end = self->length;
       
  7489     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
       
  7490         /* full slice, return original string */
       
  7491         Py_INCREF(self);
       
  7492         return (PyObject*) self;
       
  7493     }
       
  7494     if (start > end)
       
  7495         start = end;
       
  7496     /* copy slice */
       
  7497     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
       
  7498 					     end - start);
       
  7499 }
       
  7500 
       
  7501 PyObject *PyUnicode_Split(PyObject *s,
       
  7502 			  PyObject *sep,
       
  7503 			  Py_ssize_t maxsplit)
       
  7504 {
       
  7505     PyObject *result;
       
  7506 
       
  7507     s = PyUnicode_FromObject(s);
       
  7508     if (s == NULL)
       
  7509 	return NULL;
       
  7510     if (sep != NULL) {
       
  7511 	sep = PyUnicode_FromObject(sep);
       
  7512 	if (sep == NULL) {
       
  7513 	    Py_DECREF(s);
       
  7514 	    return NULL;
       
  7515 	}
       
  7516     }
       
  7517 
       
  7518     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
       
  7519 
       
  7520     Py_DECREF(s);
       
  7521     Py_XDECREF(sep);
       
  7522     return result;
       
  7523 }
       
  7524 
       
  7525 PyDoc_STRVAR(split__doc__,
       
  7526 "S.split([sep [,maxsplit]]) -> list of strings\n\
       
  7527 \n\
       
  7528 Return a list of the words in S, using sep as the\n\
       
  7529 delimiter string.  If maxsplit is given, at most maxsplit\n\
       
  7530 splits are done. If sep is not specified or is None, any\n\
       
  7531 whitespace string is a separator and empty strings are\n\
       
  7532 removed from the result.");
       
  7533 
       
  7534 static PyObject*
       
  7535 unicode_split(PyUnicodeObject *self, PyObject *args)
       
  7536 {
       
  7537     PyObject *substring = Py_None;
       
  7538     Py_ssize_t maxcount = -1;
       
  7539 
       
  7540     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
       
  7541         return NULL;
       
  7542 
       
  7543     if (substring == Py_None)
       
  7544 	return split(self, NULL, maxcount);
       
  7545     else if (PyUnicode_Check(substring))
       
  7546 	return split(self, (PyUnicodeObject *)substring, maxcount);
       
  7547     else
       
  7548 	return PyUnicode_Split((PyObject *)self, substring, maxcount);
       
  7549 }
       
  7550 
       
  7551 PyObject *
       
  7552 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
       
  7553 {
       
  7554     PyObject* str_obj;
       
  7555     PyObject* sep_obj;
       
  7556     PyObject* out;
       
  7557 
       
  7558     str_obj = PyUnicode_FromObject(str_in);
       
  7559     if (!str_obj)
       
  7560 	return NULL;
       
  7561     sep_obj = PyUnicode_FromObject(sep_in);
       
  7562     if (!sep_obj) {
       
  7563         Py_DECREF(str_obj);
       
  7564         return NULL;
       
  7565     }
       
  7566 
       
  7567     out = stringlib_partition(
       
  7568         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
       
  7569         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
       
  7570         );
       
  7571 
       
  7572     Py_DECREF(sep_obj);
       
  7573     Py_DECREF(str_obj);
       
  7574 
       
  7575     return out;
       
  7576 }
       
  7577 
       
  7578 
       
  7579 PyObject *
       
  7580 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
       
  7581 {
       
  7582     PyObject* str_obj;
       
  7583     PyObject* sep_obj;
       
  7584     PyObject* out;
       
  7585 
       
  7586     str_obj = PyUnicode_FromObject(str_in);
       
  7587     if (!str_obj)
       
  7588 	return NULL;
       
  7589     sep_obj = PyUnicode_FromObject(sep_in);
       
  7590     if (!sep_obj) {
       
  7591         Py_DECREF(str_obj);
       
  7592         return NULL;
       
  7593     }
       
  7594 
       
  7595     out = stringlib_rpartition(
       
  7596         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
       
  7597         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
       
  7598         );
       
  7599 
       
  7600     Py_DECREF(sep_obj);
       
  7601     Py_DECREF(str_obj);
       
  7602 
       
  7603     return out;
       
  7604 }
       
  7605 
       
  7606 PyDoc_STRVAR(partition__doc__,
       
  7607 "S.partition(sep) -> (head, sep, tail)\n\
       
  7608 \n\
       
  7609 Search for the separator sep in S, and return the part before it,\n\
       
  7610 the separator itself, and the part after it.  If the separator is not\n\
       
  7611 found, return S and two empty strings.");
       
  7612 
       
  7613 static PyObject*
       
  7614 unicode_partition(PyUnicodeObject *self, PyObject *separator)
       
  7615 {
       
  7616     return PyUnicode_Partition((PyObject *)self, separator);
       
  7617 }
       
  7618 
       
  7619 PyDoc_STRVAR(rpartition__doc__,
       
  7620 "S.rpartition(sep) -> (tail, sep, head)\n\
       
  7621 \n\
       
  7622 Search for the separator sep in S, starting at the end of S, and return\n\
       
  7623 the part before it, the separator itself, and the part after it.  If the\n\
       
  7624 separator is not found, return two empty strings and S.");
       
  7625 
       
  7626 static PyObject*
       
  7627 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
       
  7628 {
       
  7629     return PyUnicode_RPartition((PyObject *)self, separator);
       
  7630 }
       
  7631 
       
  7632 PyObject *PyUnicode_RSplit(PyObject *s,
       
  7633 			   PyObject *sep,
       
  7634 			   Py_ssize_t maxsplit)
       
  7635 {
       
  7636     PyObject *result;
       
  7637     
       
  7638     s = PyUnicode_FromObject(s);
       
  7639     if (s == NULL)
       
  7640 	return NULL;
       
  7641     if (sep != NULL) {
       
  7642 	sep = PyUnicode_FromObject(sep);
       
  7643 	if (sep == NULL) {
       
  7644 	    Py_DECREF(s);
       
  7645 	    return NULL;
       
  7646 	}
       
  7647     }
       
  7648 
       
  7649     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
       
  7650 
       
  7651     Py_DECREF(s);
       
  7652     Py_XDECREF(sep);
       
  7653     return result;
       
  7654 }
       
  7655 
       
  7656 PyDoc_STRVAR(rsplit__doc__,
       
  7657 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
       
  7658 \n\
       
  7659 Return a list of the words in S, using sep as the\n\
       
  7660 delimiter string, starting at the end of the string and\n\
       
  7661 working to the front.  If maxsplit is given, at most maxsplit\n\
       
  7662 splits are done. If sep is not specified, any whitespace string\n\
       
  7663 is a separator.");
       
  7664 
       
  7665 static PyObject*
       
  7666 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
       
  7667 {
       
  7668     PyObject *substring = Py_None;
       
  7669     Py_ssize_t maxcount = -1;
       
  7670 
       
  7671     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
       
  7672         return NULL;
       
  7673 
       
  7674     if (substring == Py_None)
       
  7675 	return rsplit(self, NULL, maxcount);
       
  7676     else if (PyUnicode_Check(substring))
       
  7677 	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
       
  7678     else
       
  7679 	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
       
  7680 }
       
  7681 
       
  7682 PyDoc_STRVAR(splitlines__doc__,
       
  7683 "S.splitlines([keepends]]) -> list of strings\n\
       
  7684 \n\
       
  7685 Return a list of the lines in S, breaking at line boundaries.\n\
       
  7686 Line breaks are not included in the resulting list unless keepends\n\
       
  7687 is given and true.");
       
  7688 
       
  7689 static PyObject*
       
  7690 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
       
  7691 {
       
  7692     int keepends = 0;
       
  7693 
       
  7694     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
       
  7695         return NULL;
       
  7696 
       
  7697     return PyUnicode_Splitlines((PyObject *)self, keepends);
       
  7698 }
       
  7699 
       
  7700 static
       
  7701 PyObject *unicode_str(PyUnicodeObject *self)
       
  7702 {
       
  7703     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
       
  7704 }
       
  7705 
       
  7706 PyDoc_STRVAR(swapcase__doc__,
       
  7707 "S.swapcase() -> unicode\n\
       
  7708 \n\
       
  7709 Return a copy of S with uppercase characters converted to lowercase\n\
       
  7710 and vice versa.");
       
  7711 
       
  7712 static PyObject*
       
  7713 unicode_swapcase(PyUnicodeObject *self)
       
  7714 {
       
  7715     return fixup(self, fixswapcase);
       
  7716 }
       
  7717 
       
  7718 PyDoc_STRVAR(translate__doc__,
       
  7719 "S.translate(table) -> unicode\n\
       
  7720 \n\
       
  7721 Return a copy of the string S, where all characters have been mapped\n\
       
  7722 through the given translation table, which must be a mapping of\n\
       
  7723 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
       
  7724 Unmapped characters are left untouched. Characters mapped to None\n\
       
  7725 are deleted.");
       
  7726 
       
  7727 static PyObject*
       
  7728 unicode_translate(PyUnicodeObject *self, PyObject *table)
       
  7729 {
       
  7730     return PyUnicode_TranslateCharmap(self->str,
       
  7731 				      self->length,
       
  7732 				      table,
       
  7733 				      "ignore");
       
  7734 }
       
  7735 
       
  7736 PyDoc_STRVAR(upper__doc__,
       
  7737 "S.upper() -> unicode\n\
       
  7738 \n\
       
  7739 Return a copy of S converted to uppercase.");
       
  7740 
       
  7741 static PyObject*
       
  7742 unicode_upper(PyUnicodeObject *self)
       
  7743 {
       
  7744     return fixup(self, fixupper);
       
  7745 }
       
  7746 
       
  7747 PyDoc_STRVAR(zfill__doc__,
       
  7748 "S.zfill(width) -> unicode\n\
       
  7749 \n\
       
  7750 Pad a numeric string S with zeros on the left, to fill a field\n\
       
  7751 of the specified width. The string S is never truncated.");
       
  7752 
       
  7753 static PyObject *
       
  7754 unicode_zfill(PyUnicodeObject *self, PyObject *args)
       
  7755 {
       
  7756     Py_ssize_t fill;
       
  7757     PyUnicodeObject *u;
       
  7758 
       
  7759     Py_ssize_t width;
       
  7760     if (!PyArg_ParseTuple(args, "n:zfill", &width))
       
  7761         return NULL;
       
  7762 
       
  7763     if (self->length >= width) {
       
  7764         if (PyUnicode_CheckExact(self)) {
       
  7765             Py_INCREF(self);
       
  7766             return (PyObject*) self;
       
  7767         }
       
  7768         else
       
  7769             return PyUnicode_FromUnicode(
       
  7770                 PyUnicode_AS_UNICODE(self),
       
  7771                 PyUnicode_GET_SIZE(self)
       
  7772             );
       
  7773     }
       
  7774 
       
  7775     fill = width - self->length;
       
  7776 
       
  7777     u = pad(self, fill, 0, '0');
       
  7778 
       
  7779     if (u == NULL)
       
  7780         return NULL;
       
  7781 
       
  7782     if (u->str[fill] == '+' || u->str[fill] == '-') {
       
  7783         /* move sign to beginning of string */
       
  7784         u->str[0] = u->str[fill];
       
  7785         u->str[fill] = '0';
       
  7786     }
       
  7787 
       
  7788     return (PyObject*) u;
       
  7789 }
       
  7790 
       
  #if 0
  /* Compiled-out helper: wrap the file-level `numfree` counter in a
     Python int. */
  static PyObject *
  free_listsize(PyUnicodeObject *self)
  {
      long count = numfree;

      return PyInt_FromLong(count);
  }
  #endif
       
  7798 
       
  7799 PyDoc_STRVAR(startswith__doc__,
       
  7800 "S.startswith(prefix[, start[, end]]) -> bool\n\
       
  7801 \n\
       
  7802 Return True if S starts with the specified prefix, False otherwise.\n\
       
  7803 With optional start, test S beginning at that position.\n\
       
  7804 With optional end, stop comparing S at that position.\n\
       
  7805 prefix can also be a tuple of strings to try.");
       
  7806 
       
  7807 static PyObject *
       
  7808 unicode_startswith(PyUnicodeObject *self,
       
  7809 		   PyObject *args)
       
  7810 {
       
  7811     PyObject *subobj;
       
  7812     PyUnicodeObject *substring;
       
  7813     Py_ssize_t start = 0;
       
  7814     Py_ssize_t end = PY_SSIZE_T_MAX;
       
  7815     int result;
       
  7816 
       
  7817     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
       
  7818 		_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
       
  7819 	return NULL;
       
  7820     if (PyTuple_Check(subobj)) {
       
  7821         Py_ssize_t i;
       
  7822         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
       
  7823             substring = (PyUnicodeObject *)PyUnicode_FromObject(
       
  7824                             PyTuple_GET_ITEM(subobj, i));
       
  7825             if (substring == NULL)
       
  7826                 return NULL;
       
  7827             result = tailmatch(self, substring, start, end, -1);
       
  7828             Py_DECREF(substring);
       
  7829             if (result) {
       
  7830                 Py_RETURN_TRUE;
       
  7831             }
       
  7832         }
       
  7833         /* nothing matched */
       
  7834         Py_RETURN_FALSE;
       
  7835     }
       
  7836     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
       
  7837     if (substring == NULL)
       
  7838          return NULL;
       
  7839     result = tailmatch(self, substring, start, end, -1);
       
  7840     Py_DECREF(substring);
       
  7841     return PyBool_FromLong(result);
       
  7842 }
       
  7843 
       
  7844 
       
  7845 PyDoc_STRVAR(endswith__doc__,
       
  7846 "S.endswith(suffix[, start[, end]]) -> bool\n\
       
  7847 \n\
       
  7848 Return True if S ends with the specified suffix, False otherwise.\n\
       
  7849 With optional start, test S beginning at that position.\n\
       
  7850 With optional end, stop comparing S at that position.\n\
       
  7851 suffix can also be a tuple of strings to try.");
       
  7852 
       
  7853 static PyObject *
       
  7854 unicode_endswith(PyUnicodeObject *self,
       
  7855 		 PyObject *args)
       
  7856 {
       
  7857     PyObject *subobj;
       
  7858     PyUnicodeObject *substring;
       
  7859     Py_ssize_t start = 0;
       
  7860     Py_ssize_t end = PY_SSIZE_T_MAX;
       
  7861     int result;
       
  7862 
       
  7863     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
       
  7864         _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
       
  7865 	return NULL;
       
  7866     if (PyTuple_Check(subobj)) {
       
  7867         Py_ssize_t i;
       
  7868         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
       
  7869             substring = (PyUnicodeObject *)PyUnicode_FromObject(
       
  7870                             PyTuple_GET_ITEM(subobj, i));
       
  7871             if (substring == NULL)
       
  7872             return NULL;
       
  7873             result = tailmatch(self, substring, start, end, +1);
       
  7874             Py_DECREF(substring);
       
  7875             if (result) {
       
  7876                 Py_RETURN_TRUE;
       
  7877             }
       
  7878         }
       
  7879         Py_RETURN_FALSE;
       
  7880     }
       
  7881     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
       
  7882     if (substring == NULL)
       
  7883     return NULL;
       
  7884 
       
  7885     result = tailmatch(self, substring, start, end, +1);
       
  7886     Py_DECREF(substring);
       
  7887     return PyBool_FromLong(result);
       
  7888 }
       
  7889 
       
  7890 
       
  7891 /* Implements do_string_format, which is unicode because of stringlib */
       
  7892 #include "stringlib/string_format.h"
       
  7893 
       
  7894 PyDoc_STRVAR(format__doc__,
       
  7895 "S.format(*args, **kwargs) -> unicode\n\
       
  7896 \n\
       
  7897 ");
       
  7898 
       
  7899 static PyObject *
       
  7900 unicode__format__(PyObject *self, PyObject *args)
       
  7901 {
       
  7902     PyObject *format_spec;
       
  7903     PyObject *result = NULL;
       
  7904     PyObject *tmp = NULL;
       
  7905 
       
  7906     /* If 2.x, convert format_spec to the same type as value */
       
  7907     /* This is to allow things like u''.format('') */
       
  7908     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
       
  7909         goto done;
       
  7910     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
       
  7911         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
       
  7912 		     "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
       
  7913         goto done;
       
  7914     }
       
  7915     tmp = PyObject_Unicode(format_spec);
       
  7916     if (tmp == NULL)
       
  7917         goto done;
       
  7918     format_spec = tmp;
       
  7919 
       
  7920     result = _PyUnicode_FormatAdvanced(self,
       
  7921                                        PyUnicode_AS_UNICODE(format_spec),
       
  7922                                        PyUnicode_GET_SIZE(format_spec));
       
  7923 done:
       
  7924     Py_XDECREF(tmp);
       
  7925     return result;
       
  7926 }
       
  7927 
       
  7928 PyDoc_STRVAR(p_format__doc__,
       
  7929 "S.__format__(format_spec) -> unicode\n\
       
  7930 \n\
       
  7931 ");
       
  7932 
       
  7933 static PyObject *
       
  7934 unicode__sizeof__(PyUnicodeObject *v)
       
  7935 {
       
  7936     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
       
  7937                              sizeof(Py_UNICODE) * (v->length + 1));
       
  7938 }
       
  7939 
       
  7940 PyDoc_STRVAR(sizeof__doc__,
       
  7941 "S.__sizeof__() -> size of S in memory, in bytes\n\
       
  7942 \n\
       
  7943 ");
       
  7944 
       
  7945 static PyObject *
       
  7946 unicode_getnewargs(PyUnicodeObject *v)
       
  7947 {
       
  7948 	return Py_BuildValue("(u#)", v->str, v->length);
       
  7949 }
       
  7950 
       
  7951 
       
  7952 static PyMethodDef unicode_methods[] = {
       
  7953 
       
  7954     /* Order is according to common usage: often used methods should
       
  7955        appear first, since lookup is done sequentially. */
       
  7956 
       
  7957     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
       
  7958     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
       
  7959     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
       
  7960     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
       
  7961     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
       
  7962     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
       
  7963     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
       
  7964     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
       
  7965     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
       
  7966     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
       
  7967     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
       
  7968     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
       
  7969     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
       
  7970     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
       
  7971     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
       
  7972     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
       
  7973     {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
       
  7974 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
       
  7975     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
       
  7976     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
       
  7977     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
       
  7978     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
       
  7979     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
       
  7980     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
       
  7981     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
       
  7982     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
       
  7983     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
       
  7984     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
       
  7985     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
       
  7986     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
       
  7987     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
       
  7988     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
       
  7989     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
       
  7990     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
       
  7991     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
       
  7992     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
       
  7993     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
       
  7994     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
       
  7995     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
       
  7996     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
       
  7997     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
       
  7998     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
       
  7999     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
       
  8000     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
       
  8001     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
       
  8002 #if 0
       
  8003     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
       
  8004 #endif
       
  8005 
       
  8006 #if 0
       
  8007     /* This one is just used for debugging the implementation. */
       
  8008     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
       
  8009 #endif
       
  8010 
       
  8011     {"__getnewargs__",	(PyCFunction)unicode_getnewargs, METH_NOARGS},
       
  8012     {NULL, NULL}
       
  8013 };
       
  8014 
       
  8015 static PyObject *
       
  8016 unicode_mod(PyObject *v, PyObject *w)
       
  8017 {
       
  8018        if (!PyUnicode_Check(v)) {
       
  8019                Py_INCREF(Py_NotImplemented);
       
  8020                return Py_NotImplemented;
       
  8021        }
       
  8022        return PyUnicode_Format(v, w);
       
  8023 }
       
  8024 
       
  8025 static PyNumberMethods unicode_as_number = {
       
  8026 	0,				/*nb_add*/
       
  8027 	0,				/*nb_subtract*/
       
  8028 	0,				/*nb_multiply*/
       
  8029 	0,				/*nb_divide*/
       
  8030 	unicode_mod,			/*nb_remainder*/
       
  8031 };
       
  8032 
       
  8033 static PySequenceMethods unicode_as_sequence = {
       
  8034     (lenfunc) unicode_length, 		/* sq_length */
       
  8035     PyUnicode_Concat,		 	/* sq_concat */
       
  8036     (ssizeargfunc) unicode_repeat, 	/* sq_repeat */
       
  8037     (ssizeargfunc) unicode_getitem, 	/* sq_item */
       
  8038     (ssizessizeargfunc) unicode_slice, 	/* sq_slice */
       
  8039     0, 					/* sq_ass_item */
       
  8040     0, 					/* sq_ass_slice */
       
  8041     PyUnicode_Contains, 		/* sq_contains */
       
  8042 };
       
  8043 
       
  8044 static PyObject*
       
  8045 unicode_subscript(PyUnicodeObject* self, PyObject* item)
       
  8046 {
       
  8047     if (PyIndex_Check(item)) {
       
  8048         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
       
  8049         if (i == -1 && PyErr_Occurred())
       
  8050             return NULL;
       
  8051         if (i < 0)
       
  8052             i += PyUnicode_GET_SIZE(self);
       
  8053         return unicode_getitem(self, i);
       
  8054     } else if (PySlice_Check(item)) {
       
  8055         Py_ssize_t start, stop, step, slicelength, cur, i;
       
  8056         Py_UNICODE* source_buf;
       
  8057         Py_UNICODE* result_buf;
       
  8058         PyObject* result;
       
  8059 
       
  8060         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
       
  8061 				 &start, &stop, &step, &slicelength) < 0) {
       
  8062             return NULL;
       
  8063         }
       
  8064 
       
  8065         if (slicelength <= 0) {
       
  8066             return PyUnicode_FromUnicode(NULL, 0);
       
  8067         } else if (start == 0 && step == 1 && slicelength == self->length &&
       
  8068                    PyUnicode_CheckExact(self)) {
       
  8069             Py_INCREF(self);
       
  8070             return (PyObject *)self;
       
  8071         } else if (step == 1) {
       
  8072             return PyUnicode_FromUnicode(self->str + start, slicelength);
       
  8073         } else {
       
  8074             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
       
  8075             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
       
  8076                                                        sizeof(Py_UNICODE));
       
  8077 	    
       
  8078 	    if (result_buf == NULL)
       
  8079 		    return PyErr_NoMemory();
       
  8080 
       
  8081             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
       
  8082                 result_buf[i] = source_buf[cur];
       
  8083             }
       
  8084 
       
  8085             result = PyUnicode_FromUnicode(result_buf, slicelength);
       
  8086             PyObject_FREE(result_buf);
       
  8087             return result;
       
  8088         }
       
  8089     } else {
       
  8090         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
       
  8091         return NULL;
       
  8092     }
       
  8093 }
       
  8094 
       
  8095 static PyMappingMethods unicode_as_mapping = {
       
  8096     (lenfunc)unicode_length,		/* mp_length */
       
  8097     (binaryfunc)unicode_subscript,	/* mp_subscript */
       
  8098     (objobjargproc)0,			/* mp_ass_subscript */
       
  8099 };
       
  8100 
       
  8101 static Py_ssize_t
       
  8102 unicode_buffer_getreadbuf(PyUnicodeObject *self,
       
  8103 			  Py_ssize_t index,
       
  8104 			  const void **ptr)
       
  8105 {
       
  8106     if (index != 0) {
       
  8107         PyErr_SetString(PyExc_SystemError,
       
  8108 			"accessing non-existent unicode segment");
       
  8109         return -1;
       
  8110     }
       
  8111     *ptr = (void *) self->str;
       
  8112     return PyUnicode_GET_DATA_SIZE(self);
       
  8113 }
       
  8114 
       
  8115 static Py_ssize_t
       
  8116 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
       
  8117 			   const void **ptr)
       
  8118 {
       
  8119     PyErr_SetString(PyExc_TypeError,
       
  8120 		    "cannot use unicode as modifiable buffer");
       
  8121     return -1;
       
  8122 }
       
  8123 
       
  8124 static int
       
  8125 unicode_buffer_getsegcount(PyUnicodeObject *self,
       
  8126 			   Py_ssize_t *lenp)
       
  8127 {
       
  8128     if (lenp)
       
  8129         *lenp = PyUnicode_GET_DATA_SIZE(self);
       
  8130     return 1;
       
  8131 }
       
  8132 
       
  8133 static Py_ssize_t
       
  8134 unicode_buffer_getcharbuf(PyUnicodeObject *self,
       
  8135 			  Py_ssize_t index,
       
  8136 			  const void **ptr)
       
  8137 {
       
  8138     PyObject *str;
       
  8139 
       
  8140     if (index != 0) {
       
  8141         PyErr_SetString(PyExc_SystemError,
       
  8142 			"accessing non-existent unicode segment");
       
  8143         return -1;
       
  8144     }
       
  8145     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
       
  8146     if (str == NULL)
       
  8147 	return -1;
       
  8148     *ptr = (void *) PyString_AS_STRING(str);
       
  8149     return PyString_GET_SIZE(str);
       
  8150 }
       
  8151 
       
  8152 /* Helpers for PyUnicode_Format() */
       
  8153 
       
  8154 static PyObject *
       
  8155 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
       
  8156 {
       
  8157     Py_ssize_t argidx = *p_argidx;
       
  8158     if (argidx < arglen) {
       
  8159 	(*p_argidx)++;
       
  8160 	if (arglen < 0)
       
  8161 	    return args;
       
  8162 	else
       
  8163 	    return PyTuple_GetItem(args, argidx);
       
  8164     }
       
  8165     PyErr_SetString(PyExc_TypeError,
       
  8166 		    "not enough arguments for format string");
       
  8167     return NULL;
       
  8168 }
       
  8169 
       
  /* Bit flags for the `flags` argument of the format helpers below.
     F_ALT selects the "#" alternate form in formatfloat()/formatint();
     the others presumably mirror the printf '-', '+', ' ' and '0' flags
     (set by the format parser, which is outside this excerpt). */
  #define F_LJUST (1 << 0)
  #define F_SIGN  (1 << 1)
  #define F_BLANK (1 << 2)
  #define F_ALT   (1 << 3)
  #define F_ZERO  (1 << 4)
       
  8175 
       
  8176 static Py_ssize_t
       
  8177 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
       
  8178 {
       
  8179     register Py_ssize_t i;
       
  8180     Py_ssize_t len = strlen(charbuffer);
       
  8181     for (i = len - 1; i >= 0; i--)
       
  8182 	buffer[i] = (Py_UNICODE) charbuffer[i];
       
  8183 
       
  8184     return len;
       
  8185 }
       
  8186 
       
  8187 static int
       
  8188 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
       
  8189 {
       
  8190     Py_ssize_t result;
       
  8191 
       
  8192     PyOS_ascii_formatd((char *)buffer, len, format, x);
       
  8193     result = strtounicode(buffer, (char *)buffer);
       
  8194     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
       
  8195 }
       
  8196 
       
  8197 static int
       
  8198 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
       
  8199 {
       
  8200     Py_ssize_t result;
       
  8201 
       
  8202     PyOS_snprintf((char *)buffer, len, format, x);
       
  8203     result = strtounicode(buffer, (char *)buffer);
       
  8204     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
       
  8205 }
       
  8206 
       
  8207 /* XXX To save some code duplication, formatfloat/long/int could have been
       
  8208    shared with stringobject.c, converting from 8-bit to Unicode after the
       
  8209    formatting is done. */
       
  8210 
       
  8211 static int
       
  8212 formatfloat(Py_UNICODE *buf,
       
  8213 	    size_t buflen,
       
  8214 	    int flags,
       
  8215 	    int prec,
       
  8216 	    int type,
       
  8217 	    PyObject *v)
       
  8218 {
       
  8219     /* fmt = '%#.' + `prec` + `type`
       
  8220        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
       
  8221     char fmt[20];
       
  8222     double x;
       
  8223 
       
  8224     x = PyFloat_AsDouble(v);
       
  8225     if (x == -1.0 && PyErr_Occurred())
       
  8226 	return -1;
       
  8227     if (prec < 0)
       
  8228 	prec = 6;
       
  8229     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
       
  8230 	type = 'g';
       
  8231     /* Worst case length calc to ensure no buffer overrun:
       
  8232 
       
  8233        'g' formats:
       
  8234 	 fmt = %#.<prec>g
       
  8235 	 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
       
  8236 	    for any double rep.)
       
  8237 	 len = 1 + prec + 1 + 2 + 5 = 9 + prec
       
  8238 
       
  8239        'f' formats:
       
  8240 	 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
       
  8241 	 len = 1 + 50 + 1 + prec = 52 + prec
       
  8242 
       
  8243        If prec=0 the effective precision is 1 (the leading digit is
       
  8244        always given), therefore increase the length by one.
       
  8245 
       
  8246     */
       
  8247     if (((type == 'g' || type == 'G') && 
       
  8248           buflen <= (size_t)10 + (size_t)prec) ||
       
  8249 	(type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
       
  8250 	PyErr_SetString(PyExc_OverflowError,
       
  8251 			"formatted float is too long (precision too large?)");
       
  8252 	return -1;
       
  8253     }
       
  8254     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
       
  8255 		  (flags&F_ALT) ? "#" : "",
       
  8256 		  prec, type);
       
  8257     return doubletounicode(buf, buflen, fmt, x);
       
  8258 }
       
  8259 
       
  8260 static PyObject*
       
  8261 formatlong(PyObject *val, int flags, int prec, int type)
       
  8262 {
       
  8263 	char *buf;
       
  8264 	int i, len;
       
  8265 	PyObject *str; /* temporary string object. */
       
  8266 	PyUnicodeObject *result;
       
  8267 
       
  8268 	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
       
  8269 	if (!str)
       
  8270 		return NULL;
       
  8271 	result = _PyUnicode_New(len);
       
  8272 	if (!result) {
       
  8273 		Py_DECREF(str);
       
  8274 		return NULL;
       
  8275 	}
       
  8276 	for (i = 0; i < len; i++)
       
  8277 		result->str[i] = buf[i];
       
  8278 	result->str[len] = 0;
       
  8279 	Py_DECREF(str);
       
  8280 	return (PyObject*)result;
       
  8281 }
       
  8282 
       
  8283 static int
       
  8284 formatint(Py_UNICODE *buf,
       
  8285 	  size_t buflen,
       
  8286 	  int flags,
       
  8287 	  int prec,
       
  8288 	  int type,
       
  8289 	  PyObject *v)
       
  8290 {
       
  8291     /* fmt = '%#.' + `prec` + 'l' + `type`
       
  8292      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
       
  8293      *                     + 1 + 1
       
  8294      *                   = 24
       
  8295      */
       
  8296     char fmt[64]; /* plenty big enough! */
       
  8297     char *sign;
       
  8298     long x;
       
  8299 
       
  8300     x = PyInt_AsLong(v);
       
  8301     if (x == -1 && PyErr_Occurred())
       
  8302         return -1;
       
  8303     if (x < 0 && type == 'u') {
       
  8304         type = 'd';
       
  8305     }
       
  8306     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
       
  8307         sign = "-";
       
  8308     else
       
  8309         sign = "";
       
  8310     if (prec < 0)
       
  8311         prec = 1;
       
  8312 
       
  8313     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
       
  8314      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
       
  8315      */
       
  8316     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
       
  8317         PyErr_SetString(PyExc_OverflowError,
       
  8318     	        "formatted integer is too long (precision too large?)");
       
  8319         return -1;
       
  8320     }
       
  8321 
       
  8322     if ((flags & F_ALT) &&
       
  8323         (type == 'x' || type == 'X')) {
       
  8324         /* When converting under %#x or %#X, there are a number
       
  8325          * of issues that cause pain:
       
  8326          * - when 0 is being converted, the C standard leaves off
       
  8327          *   the '0x' or '0X', which is inconsistent with other
       
  8328          *   %#x/%#X conversions and inconsistent with Python's
       
  8329          *   hex() function
       
  8330          * - there are platforms that violate the standard and
       
  8331          *   convert 0 with the '0x' or '0X'
       
  8332          *   (Metrowerks, Compaq Tru64)
       
  8333          * - there are platforms that give '0x' when converting
       
  8334          *   under %#X, but convert 0 in accordance with the
       
  8335          *   standard (OS/2 EMX)
       
  8336          *
       
  8337          * We can achieve the desired consistency by inserting our
       
  8338          * own '0x' or '0X' prefix, and substituting %x/%X in place
       
  8339          * of %#x/%#X.
       
  8340          *
       
  8341          * Note that this is the same approach as used in
       
  8342          * formatint() in stringobject.c
       
  8343          */
       
  8344         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
       
  8345                       sign, type, prec, type);
       
  8346     }
       
  8347     else {
       
  8348         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
       
  8349                       sign, (flags&F_ALT) ? "#" : "",
       
  8350                       prec, type);
       
  8351     }
       
  8352     if (sign[0])
       
  8353         return longtounicode(buf, buflen, fmt, -x);
       
  8354     else
       
  8355         return longtounicode(buf, buflen, fmt, x);
       
  8356 }
       
  8357 
       
  8358 static int
       
  8359 formatchar(Py_UNICODE *buf,
       
  8360            size_t buflen,
       
  8361            PyObject *v)
       
  8362 {
       
  8363     /* presume that the buffer is at least 2 characters long */
       
  8364     if (PyUnicode_Check(v)) {
       
  8365 	if (PyUnicode_GET_SIZE(v) != 1)
       
  8366 	    goto onError;
       
  8367 	buf[0] = PyUnicode_AS_UNICODE(v)[0];
       
  8368     }
       
  8369 
       
  8370     else if (PyString_Check(v)) {
       
  8371 	if (PyString_GET_SIZE(v) != 1)
       
  8372 	    goto onError;
       
  8373 	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
       
  8374     }
       
  8375 
       
  8376     else {
       
  8377 	/* Integer input truncated to a character */
       
  8378         long x;
       
  8379 	x = PyInt_AsLong(v);
       
  8380 	if (x == -1 && PyErr_Occurred())
       
  8381 	    goto onError;
       
  8382 #ifdef Py_UNICODE_WIDE
       
  8383 	if (x < 0 || x > 0x10ffff) {
       
  8384 	    PyErr_SetString(PyExc_OverflowError,
       
  8385 			    "%c arg not in range(0x110000) "
       
  8386 			    "(wide Python build)");
       
  8387 	    return -1;
       
  8388 	}
       
  8389 #else
       
  8390 	if (x < 0 || x > 0xffff) {
       
  8391 	    PyErr_SetString(PyExc_OverflowError,
       
  8392 			    "%c arg not in range(0x10000) "
       
  8393 			    "(narrow Python build)");
       
  8394 	    return -1;
       
  8395 	}
       
  8396 #endif
       
  8397 	buf[0] = (Py_UNICODE) x;
       
  8398     }
       
  8399     buf[1] = '\0';
       
  8400     return 1;
       
  8401 
       
  8402  onError:
       
  8403     PyErr_SetString(PyExc_TypeError,
       
  8404 		    "%c requires int or char");
       
  8405     return -1;
       
  8406 }
       
  8407 
       
  8408 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
       
  8409 
       
  8410    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
       
  8411    chars are formatted. XXX This is a magic number. Each formatting
       
  8412    routine does bounds checking to ensure no overflow, but a better
       
  8413    solution may be to malloc a buffer of appropriate size for each
       
  8414    format. For now, the current solution is sufficient.
       
  8415 */
       
  #define FORMATBUFLEN ((size_t)120)  /* parenthesised so the cast binds as one unit */
       
  8417 
       
  8418 PyObject *PyUnicode_Format(PyObject *format,
       
  8419 			   PyObject *args)
       
  8420 {
       
  8421     Py_UNICODE *fmt, *res;
       
  8422     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
       
  8423     int args_owned = 0;
       
  8424     PyUnicodeObject *result = NULL;
       
  8425     PyObject *dict = NULL;
       
  8426     PyObject *uformat;
       
  8427 
       
  8428     if (format == NULL || args == NULL) {
       
  8429 	PyErr_BadInternalCall();
       
  8430 	return NULL;
       
  8431     }
       
  8432     uformat = PyUnicode_FromObject(format);
       
  8433     if (uformat == NULL)
       
  8434 	return NULL;
       
  8435     fmt = PyUnicode_AS_UNICODE(uformat);
       
  8436     fmtcnt = PyUnicode_GET_SIZE(uformat);
       
  8437 
       
  8438     reslen = rescnt = fmtcnt + 100;
       
  8439     result = _PyUnicode_New(reslen);
       
  8440     if (result == NULL)
       
  8441 	goto onError;
       
  8442     res = PyUnicode_AS_UNICODE(result);
       
  8443 
       
  8444     if (PyTuple_Check(args)) {
       
  8445 	arglen = PyTuple_Size(args);
       
  8446 	argidx = 0;
       
  8447     }
       
  8448     else {
       
  8449 	arglen = -1;
       
  8450 	argidx = -2;
       
  8451     }
       
  8452     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
       
  8453         !PyObject_TypeCheck(args, &PyBaseString_Type))
       
  8454 	dict = args;
       
  8455 
       
  8456     while (--fmtcnt >= 0) {
       
  8457 	if (*fmt != '%') {
       
  8458 	    if (--rescnt < 0) {
       
  8459 		rescnt = fmtcnt + 100;
       
  8460 		reslen += rescnt;
       
  8461 		if (_PyUnicode_Resize(&result, reslen) < 0)
       
  8462 		    goto onError;
       
  8463 		res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
       
  8464 		--rescnt;
       
  8465 	    }
       
  8466 	    *res++ = *fmt++;
       
  8467 	}
       
  8468 	else {
       
  8469 	    /* Got a format specifier */
       
  8470 	    int flags = 0;
       
  8471 	    Py_ssize_t width = -1;
       
  8472 	    int prec = -1;
       
  8473 	    Py_UNICODE c = '\0';
       
  8474 	    Py_UNICODE fill;
       
  8475 	    int isnumok;
       
  8476 	    PyObject *v = NULL;
       
  8477 	    PyObject *temp = NULL;
       
  8478 	    Py_UNICODE *pbuf;
       
  8479 	    Py_UNICODE sign;
       
  8480 	    Py_ssize_t len;
       
  8481 	    Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
       
  8482 
       
  8483 	    fmt++;
       
  8484 	    if (*fmt == '(') {
       
  8485 		Py_UNICODE *keystart;
       
  8486 		Py_ssize_t keylen;
       
  8487 		PyObject *key;
       
  8488 		int pcount = 1;
       
  8489 
       
  8490 		if (dict == NULL) {
       
  8491 		    PyErr_SetString(PyExc_TypeError,
       
  8492 				    "format requires a mapping");
       
  8493 		    goto onError;
       
  8494 		}
       
  8495 		++fmt;
       
  8496 		--fmtcnt;
       
  8497 		keystart = fmt;
       
  8498 		/* Skip over balanced parentheses */
       
  8499 		while (pcount > 0 && --fmtcnt >= 0) {
       
  8500 		    if (*fmt == ')')
       
  8501 			--pcount;
       
  8502 		    else if (*fmt == '(')
       
  8503 			++pcount;
       
  8504 		    fmt++;
       
  8505 		}
       
  8506 		keylen = fmt - keystart - 1;
       
  8507 		if (fmtcnt < 0 || pcount > 0) {
       
  8508 		    PyErr_SetString(PyExc_ValueError,
       
  8509 				    "incomplete format key");
       
  8510 		    goto onError;
       
  8511 		}
       
  8512 #if 0
       
  8513 		/* keys are converted to strings using UTF-8 and
       
  8514 		   then looked up since Python uses strings to hold
       
  8515 		   variables names etc. in its namespaces and we
       
  8516 		   wouldn't want to break common idioms. */
       
  8517 		key = PyUnicode_EncodeUTF8(keystart,
       
  8518 					   keylen,
       
  8519 					   NULL);
       
  8520 #else
       
  8521 		key = PyUnicode_FromUnicode(keystart, keylen);
       
  8522 #endif
       
  8523 		if (key == NULL)
       
  8524 		    goto onError;
       
  8525 		if (args_owned) {
       
  8526 		    Py_DECREF(args);
       
  8527 		    args_owned = 0;
       
  8528 		}
       
  8529 		args = PyObject_GetItem(dict, key);
       
  8530 		Py_DECREF(key);
       
  8531 		if (args == NULL) {
       
  8532 		    goto onError;
       
  8533 		}
       
  8534 		args_owned = 1;
       
  8535 		arglen = -1;
       
  8536 		argidx = -2;
       
  8537 	    }
       
  8538 	    while (--fmtcnt >= 0) {
       
  8539 		switch (c = *fmt++) {
       
  8540 		case '-': flags |= F_LJUST; continue;
       
  8541 		case '+': flags |= F_SIGN; continue;
       
  8542 		case ' ': flags |= F_BLANK; continue;
       
  8543 		case '#': flags |= F_ALT; continue;
       
  8544 		case '0': flags |= F_ZERO; continue;
       
  8545 		}
       
  8546 		break;
       
  8547 	    }
       
  8548 	    if (c == '*') {
       
  8549 		v = getnextarg(args, arglen, &argidx);
       
  8550 		if (v == NULL)
       
  8551 		    goto onError;
       
  8552 		if (!PyInt_Check(v)) {
       
  8553 		    PyErr_SetString(PyExc_TypeError,
       
  8554 				    "* wants int");
       
  8555 		    goto onError;
       
  8556 		}
       
  8557 		width = PyInt_AsLong(v);
       
  8558 		if (width < 0) {
       
  8559 		    flags |= F_LJUST;
       
  8560 		    width = -width;
       
  8561 		}
       
  8562 		if (--fmtcnt >= 0)
       
  8563 		    c = *fmt++;
       
  8564 	    }
       
  8565 	    else if (c >= '0' && c <= '9') {
       
  8566 		width = c - '0';
       
  8567 		while (--fmtcnt >= 0) {
       
  8568 		    c = *fmt++;
       
  8569 		    if (c < '0' || c > '9')
       
  8570 			break;
       
  8571 		    if ((width*10) / 10 != width) {
       
  8572 			PyErr_SetString(PyExc_ValueError,
       
  8573 					"width too big");
       
  8574 			goto onError;
       
  8575 		    }
       
  8576 		    width = width*10 + (c - '0');
       
  8577 		}
       
  8578 	    }
       
  8579 	    if (c == '.') {
       
  8580 		prec = 0;
       
  8581 		if (--fmtcnt >= 0)
       
  8582 		    c = *fmt++;
       
  8583 		if (c == '*') {
       
  8584 		    v = getnextarg(args, arglen, &argidx);
       
  8585 		    if (v == NULL)
       
  8586 			goto onError;
       
  8587 		    if (!PyInt_Check(v)) {
       
  8588 			PyErr_SetString(PyExc_TypeError,
       
  8589 					"* wants int");
       
  8590 			goto onError;
       
  8591 		    }
       
  8592 		    prec = PyInt_AsLong(v);
       
  8593 		    if (prec < 0)
       
  8594 			prec = 0;
       
  8595 		    if (--fmtcnt >= 0)
       
  8596 			c = *fmt++;
       
  8597 		}
       
  8598 		else if (c >= '0' && c <= '9') {
       
  8599 		    prec = c - '0';
       
  8600 		    while (--fmtcnt >= 0) {
       
  8601 			c = Py_CHARMASK(*fmt++);
       
  8602 			if (c < '0' || c > '9')
       
  8603 			    break;
       
  8604 			if ((prec*10) / 10 != prec) {
       
  8605 			    PyErr_SetString(PyExc_ValueError,
       
  8606 					    "prec too big");
       
  8607 			    goto onError;
       
  8608 			}
       
  8609 			prec = prec*10 + (c - '0');
       
  8610 		    }
       
  8611 		}
       
  8612 	    } /* prec */
       
  8613 	    if (fmtcnt >= 0) {
       
  8614 		if (c == 'h' || c == 'l' || c == 'L') {
       
  8615 		    if (--fmtcnt >= 0)
       
  8616 			c = *fmt++;
       
  8617 		}
       
  8618 	    }
       
  8619 	    if (fmtcnt < 0) {
       
  8620 		PyErr_SetString(PyExc_ValueError,
       
  8621 				"incomplete format");
       
  8622 		goto onError;
       
  8623 	    }
       
  8624 	    if (c != '%') {
       
  8625 		v = getnextarg(args, arglen, &argidx);
       
  8626 		if (v == NULL)
       
  8627 		    goto onError;
       
  8628 	    }
       
  8629 	    sign = 0;
       
  8630 	    fill = ' ';
       
  8631 	    switch (c) {
       
  8632 
       
  8633 	    case '%':
       
  8634 		pbuf = formatbuf;
       
  8635 		/* presume that buffer length is at least 1 */
       
  8636 		pbuf[0] = '%';
       
  8637 		len = 1;
       
  8638 		break;
       
  8639 
       
  8640 	    case 's':
       
  8641 	    case 'r':
       
  8642 		if (PyUnicode_Check(v) && c == 's') {
       
  8643 		    temp = v;
       
  8644 		    Py_INCREF(temp);
       
  8645 		}
       
  8646 		else {
       
  8647 		    PyObject *unicode;
       
  8648 		    if (c == 's')
       
  8649 			temp = PyObject_Unicode(v);
       
  8650 		    else
       
  8651 			temp = PyObject_Repr(v);
       
  8652 		    if (temp == NULL)
       
  8653 			goto onError;
       
  8654                     if (PyUnicode_Check(temp))
       
  8655                         /* nothing to do */;
       
  8656                     else if (PyString_Check(temp)) {
       
  8657                         /* convert to string to Unicode */
       
  8658 		        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
       
  8659 						   PyString_GET_SIZE(temp),
       
  8660 						   NULL,
       
  8661 						   "strict");
       
  8662 		        Py_DECREF(temp);
       
  8663 		        temp = unicode;
       
  8664 		        if (temp == NULL)
       
  8665 			    goto onError;
       
  8666 		    }
       
  8667 		    else {
       
  8668 			Py_DECREF(temp);
       
  8669 			PyErr_SetString(PyExc_TypeError,
       
  8670 					"%s argument has non-string str()");
       
  8671 			goto onError;
       
  8672 		    }
       
  8673 		}
       
  8674 		pbuf = PyUnicode_AS_UNICODE(temp);
       
  8675 		len = PyUnicode_GET_SIZE(temp);
       
  8676 		if (prec >= 0 && len > prec)
       
  8677 		    len = prec;
       
  8678 		break;
       
  8679 
       
  8680 	    case 'i':
       
  8681 	    case 'd':
       
  8682 	    case 'u':
       
  8683 	    case 'o':
       
  8684 	    case 'x':
       
  8685 	    case 'X':
       
  8686 		if (c == 'i')
       
  8687 		    c = 'd';
       
  8688 		isnumok = 0;
       
  8689 		if (PyNumber_Check(v)) {
       
  8690 			PyObject *iobj=NULL;
       
  8691 
       
  8692 			if (PyInt_Check(v) || (PyLong_Check(v))) {
       
  8693 				iobj = v;
       
  8694 				Py_INCREF(iobj);
       
  8695 			}
       
  8696 			else {
       
  8697 				iobj = PyNumber_Int(v);
       
  8698 				if (iobj==NULL) iobj = PyNumber_Long(v);
       
  8699 			}
       
  8700 			if (iobj!=NULL) {
       
  8701 				if (PyInt_Check(iobj)) {
       
  8702 					isnumok = 1;
       
  8703 					pbuf = formatbuf;
       
  8704 					len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
       
  8705 						    flags, prec, c, iobj);
       
  8706 					Py_DECREF(iobj);
       
  8707 					if (len < 0)
       
  8708 					    goto onError;
       
  8709 					sign = 1;
       
  8710 				} 
       
  8711 				else if (PyLong_Check(iobj)) {
       
  8712 					isnumok = 1;
       
  8713 					temp = formatlong(iobj, flags, prec, c);
       
  8714 					Py_DECREF(iobj);
       
  8715 					if (!temp)
       
  8716 					    goto onError;
       
  8717 					pbuf = PyUnicode_AS_UNICODE(temp);
       
  8718 					len = PyUnicode_GET_SIZE(temp);
       
  8719 					sign = 1;
       
  8720 				}
       
  8721 				else {
       
  8722 					Py_DECREF(iobj);
       
  8723 				}
       
  8724 			}
       
  8725 		}
       
  8726 		if (!isnumok) {
       
  8727 			PyErr_Format(PyExc_TypeError, 
       
  8728 			    "%%%c format: a number is required, "
       
  8729                                      "not %.200s", (char)c, Py_TYPE(v)->tp_name);
       
  8730 			goto onError;
       
  8731 		}
       
  8732 		if (flags & F_ZERO)
       
  8733 		    fill = '0';
       
  8734 		break;
       
  8735 
       
  8736 	    case 'e':
       
  8737 	    case 'E':
       
  8738 	    case 'f':
       
  8739 	    case 'F':
       
  8740 	    case 'g':
       
  8741 	    case 'G':
       
  8742 		if (c == 'F')
       
  8743 			c = 'f';
       
  8744 		pbuf = formatbuf;
       
  8745 		len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
       
  8746 			flags, prec, c, v);
       
  8747 		if (len < 0)
       
  8748 		    goto onError;
       
  8749 		sign = 1;
       
  8750 		if (flags & F_ZERO)
       
  8751 		    fill = '0';
       
  8752 		break;
       
  8753 
       
  8754 	    case 'c':
       
  8755 		pbuf = formatbuf;
       
  8756 		len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
       
  8757 		if (len < 0)
       
  8758 		    goto onError;
       
  8759 		break;
       
  8760 
       
  8761 	    default:
       
  8762 		PyErr_Format(PyExc_ValueError,
       
  8763 			     "unsupported format character '%c' (0x%x) "
       
  8764 			     "at index %zd",
       
  8765 			     (31<=c && c<=126) ? (char)c : '?',
       
  8766                              (int)c,
       
  8767 			     (Py_ssize_t)(fmt - 1 -
       
  8768 					  PyUnicode_AS_UNICODE(uformat)));
       
  8769 		goto onError;
       
  8770 	    }
       
  8771 	    if (sign) {
       
  8772 		if (*pbuf == '-' || *pbuf == '+') {
       
  8773 		    sign = *pbuf++;
       
  8774 		    len--;
       
  8775 		}
       
  8776 		else if (flags & F_SIGN)
       
  8777 		    sign = '+';
       
  8778 		else if (flags & F_BLANK)
       
  8779 		    sign = ' ';
       
  8780 		else
       
  8781 		    sign = 0;
       
  8782 	    }
       
  8783 	    if (width < len)
       
  8784 		width = len;
       
  8785 	    if (rescnt - (sign != 0) < width) {
       
  8786 		reslen -= rescnt;
       
  8787 		rescnt = width + fmtcnt + 100;
       
  8788 		reslen += rescnt;
       
  8789 		if (reslen < 0) {
       
  8790 		    Py_XDECREF(temp);
       
  8791 		    PyErr_NoMemory();
       
  8792 		    goto onError;
       
  8793 		}
       
  8794 		if (_PyUnicode_Resize(&result, reslen) < 0) {
       
  8795 		    Py_XDECREF(temp);
       
  8796 		    goto onError;
       
  8797 		}
       
  8798 		res = PyUnicode_AS_UNICODE(result)
       
  8799 		    + reslen - rescnt;
       
  8800 	    }
       
  8801 	    if (sign) {
       
  8802 		if (fill != ' ')
       
  8803 		    *res++ = sign;
       
  8804 		rescnt--;
       
  8805 		if (width > len)
       
  8806 		    width--;
       
  8807 	    }
       
  8808 	    if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
       
  8809 		assert(pbuf[0] == '0');
       
  8810 		assert(pbuf[1] == c);
       
  8811 		if (fill != ' ') {
       
  8812 		    *res++ = *pbuf++;
       
  8813 		    *res++ = *pbuf++;
       
  8814 		}
       
  8815 		rescnt -= 2;
       
  8816 		width -= 2;
       
  8817 		if (width < 0)
       
  8818 		    width = 0;
       
  8819 		len -= 2;
       
  8820 	    }
       
  8821 	    if (width > len && !(flags & F_LJUST)) {
       
  8822 		do {
       
  8823 		    --rescnt;
       
  8824 		    *res++ = fill;
       
  8825 		} while (--width > len);
       
  8826 	    }
       
  8827 	    if (fill == ' ') {
       
  8828 		if (sign)
       
  8829 		    *res++ = sign;
       
  8830 		if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
       
  8831 		    assert(pbuf[0] == '0');
       
  8832 		    assert(pbuf[1] == c);
       
  8833 		    *res++ = *pbuf++;
       
  8834 		    *res++ = *pbuf++;
       
  8835 		}
       
  8836 	    }
       
  8837 	    Py_UNICODE_COPY(res, pbuf, len);
       
  8838 	    res += len;
       
  8839 	    rescnt -= len;
       
  8840 	    while (--width >= len) {
       
  8841 		--rescnt;
       
  8842 		*res++ = ' ';
       
  8843 	    }
       
  8844 	    if (dict && (argidx < arglen) && c != '%') {
       
  8845 		PyErr_SetString(PyExc_TypeError,
       
  8846 				"not all arguments converted during string formatting");
       
  8847                 Py_XDECREF(temp);
       
  8848 		goto onError;
       
  8849 	    }
       
  8850 	    Py_XDECREF(temp);
       
  8851 	} /* '%' */
       
  8852     } /* until end */
       
  8853     if (argidx < arglen && !dict) {
       
  8854 	PyErr_SetString(PyExc_TypeError,
       
  8855 			"not all arguments converted during string formatting");
       
  8856 	goto onError;
       
  8857     }
       
  8858 
       
  8859     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
       
  8860 	goto onError;
       
  8861     if (args_owned) {
       
  8862 	Py_DECREF(args);
       
  8863     }
       
  8864     Py_DECREF(uformat);
       
  8865     return (PyObject *)result;
       
  8866 
       
  8867  onError:
       
  8868     Py_XDECREF(result);
       
  8869     Py_DECREF(uformat);
       
  8870     if (args_owned) {
       
  8871 	Py_DECREF(args);
       
  8872     }
       
  8873     return NULL;
       
  8874 }
       
  8875 
       
/* Buffer interface procedures for unicode objects; referenced by the
   tp_as_buffer slot of PyUnicode_Type.  The entries are positional, in
   the field order of PyBufferProcs. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read buffer */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write buffer */
    (segcountproc) unicode_buffer_getsegcount,      /* segment count */
    (charbufferproc) unicode_buffer_getcharbuf,     /* character buffer */
};
       
  8882 
       
  8883 static PyObject *
       
  8884 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
       
  8885 
       
  8886 static PyObject *
       
  8887 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
       
  8888 {
       
  8889         PyObject *x = NULL;
       
  8890 	static char *kwlist[] = {"string", "encoding", "errors", 0};
       
  8891 	char *encoding = NULL;
       
  8892 	char *errors = NULL;
       
  8893 
       
  8894 	if (type != &PyUnicode_Type)
       
  8895 		return unicode_subtype_new(type, args, kwds);
       
  8896 	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
       
  8897 					  kwlist, &x, &encoding, &errors))
       
  8898 	    return NULL;
       
  8899 	if (x == NULL)
       
  8900 		return (PyObject *)_PyUnicode_New(0);
       
  8901 	if (encoding == NULL && errors == NULL)
       
  8902 	    return PyObject_Unicode(x);
       
  8903 	else
       
  8904 	return PyUnicode_FromEncodedObject(x, encoding, errors);
       
  8905 }
       
  8906 
       
  8907 static PyObject *
       
  8908 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
       
  8909 {
       
  8910 	PyUnicodeObject *tmp, *pnew;
       
  8911 	Py_ssize_t n;
       
  8912 
       
  8913 	assert(PyType_IsSubtype(type, &PyUnicode_Type));
       
  8914 	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
       
  8915 	if (tmp == NULL)
       
  8916 		return NULL;
       
  8917 	assert(PyUnicode_Check(tmp));
       
  8918 	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
       
  8919 	if (pnew == NULL) {
       
  8920 		Py_DECREF(tmp);
       
  8921 		return NULL;
       
  8922 	}
       
  8923 	pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
       
  8924 	if (pnew->str == NULL) {
       
  8925 		_Py_ForgetReference((PyObject *)pnew);
       
  8926 		PyObject_Del(pnew);
       
  8927 		Py_DECREF(tmp);
       
  8928 		return PyErr_NoMemory();
       
  8929 	}
       
  8930 	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
       
  8931 	pnew->length = n;
       
  8932 	pnew->hash = tmp->hash;
       
  8933 	Py_DECREF(tmp);
       
  8934 	return (PyObject *)pnew;
       
  8935 }
       
  8936 
       
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
       
  8943 
       
/* Type object of the built-in unicode type.

   The type derives from PyBaseString_Type (tp_base), may itself be
   subclassed (Py_TPFLAGS_BASETYPE) and creates instances through
   unicode_new().  Slots initialised to 0 are left unset in this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
       
  8987 
       
  8988 /* Initialize the Unicode implementation */
       
  8989 
       
  8990 void _PyUnicode_Init(void)
       
  8991 {
       
  8992     int i;
       
  8993 
       
  8994     /* XXX - move this array to unicodectype.c ? */
       
  8995     Py_UNICODE linebreak[] = {
       
  8996         0x000A, /* LINE FEED */
       
  8997         0x000D, /* CARRIAGE RETURN */
       
  8998         0x001C, /* FILE SEPARATOR */
       
  8999         0x001D, /* GROUP SEPARATOR */
       
  9000         0x001E, /* RECORD SEPARATOR */
       
  9001         0x0085, /* NEXT LINE */
       
  9002         0x2028, /* LINE SEPARATOR */
       
  9003         0x2029, /* PARAGRAPH SEPARATOR */
       
  9004     };
       
  9005 
       
  9006     /* Init the implementation */
       
  9007     free_list = NULL;
       
  9008     numfree = 0;
       
  9009     unicode_empty = _PyUnicode_New(0);
       
  9010     if (!unicode_empty)
       
  9011 	return;
       
  9012 
       
  9013     strcpy(unicode_default_encoding, "ascii");
       
  9014     for (i = 0; i < 256; i++)
       
  9015 	unicode_latin1[i] = NULL;
       
  9016     if (PyType_Ready(&PyUnicode_Type) < 0)
       
  9017 	Py_FatalError("Can't initialize 'unicode'");
       
  9018 
       
  9019     /* initialize the linebreak bloom filter */
       
  9020     bloom_linebreak = make_bloom_mask(
       
  9021         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
       
  9022         );
       
  9023 
       
  9024     PyType_Ready(&EncodingMapType);
       
  9025 }
       
  9026 
       
  9027 /* Finalize the Unicode implementation */
       
  9028 
       
  9029 int
       
  9030 PyUnicode_ClearFreeList(void)
       
  9031 {
       
  9032     int freelist_size = numfree;
       
  9033     PyUnicodeObject *u;
       
  9034 
       
  9035     for (u = free_list; u != NULL;) {
       
  9036 	PyUnicodeObject *v = u;
       
  9037 	u = *(PyUnicodeObject **)u;
       
  9038 	if (v->str)
       
  9039 	    PyObject_DEL(v->str);
       
  9040 	Py_XDECREF(v->defenc);
       
  9041 	PyObject_Del(v);
       
  9042 	numfree--;
       
  9043     }
       
  9044     free_list = NULL;
       
  9045     assert(numfree == 0);
       
  9046     return freelist_size;
       
  9047 }
       
  9048 
       
  9049 void
       
  9050 _PyUnicode_Fini(void)
       
  9051 {
       
  9052     int i;
       
  9053 
       
  9054     Py_XDECREF(unicode_empty);
       
  9055     unicode_empty = NULL;
       
  9056 
       
  9057     for (i = 0; i < 256; i++) {
       
  9058 	if (unicode_latin1[i]) {
       
  9059 	    Py_DECREF(unicode_latin1[i]);
       
  9060 	    unicode_latin1[i] = NULL;
       
  9061 	}
       
  9062     }
       
  9063     (void)PyUnicode_ClearFreeList();
       
  9064 }
       
  9065 
       
  9066 #ifdef __cplusplus
       
  9067 }
       
  9068 #endif
       
  9069 
       
  9070 
       
  9071 /*
       
  9072 Local variables:
       
  9073 c-basic-offset: 4
       
  9074 indent-tabs-mode: nil
       
  9075 End:
       
  9076 */