|
1 /************************************************* |
|
2 * Perl-Compatible Regular Expressions * |
|
3 *************************************************/ |
|
4 |
|
5 /* PCRE is a library of functions to support regular expressions whose syntax |
|
6 and semantics are as close as possible to those of the Perl 5 language. |
|
7 |
|
8 Written by Philip Hazel |
|
9 Copyright (c) 1997-2008 University of Cambridge |
|
10 |
|
11 ----------------------------------------------------------------------------- |
|
12 Redistribution and use in source and binary forms, with or without |
|
13 modification, are permitted provided that the following conditions are met: |
|
14 |
|
15 * Redistributions of source code must retain the above copyright notice, |
|
16 this list of conditions and the following disclaimer. |
|
17 |
|
18 * Redistributions in binary form must reproduce the above copyright |
|
19 notice, this list of conditions and the following disclaimer in the |
|
20 documentation and/or other materials provided with the distribution. |
|
21 |
|
22 * Neither the name of the University of Cambridge nor the names of its |
|
23 contributors may be used to endorse or promote products derived from |
|
24 this software without specific prior written permission. |
|
25 |
|
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|
36 POSSIBILITY OF SUCH DAMAGE. |
|
37 ----------------------------------------------------------------------------- |
|
38 */ |
|
39 |
|
40 |
|
41 /* This module contains an internal function for validating UTF-8 character |
|
42 strings. */ |
|
43 |
|
44 |
|
45 #ifdef HAVE_CONFIG_H |
|
46 #include "config.h" |
|
47 #endif |
|
48 |
|
49 #include "pcre_internal.h" |
|
50 |
|
51 |
|
52 /************************************************* |
|
53 * Validate a UTF-8 string * |
|
54 *************************************************/ |
|
55 |
|
56 /* This function is called (optionally) at the start of compile or match, to |
|
57 validate that a supposed UTF-8 string is actually valid. The early check means |
|
58 that subsequent code can assume it is dealing with a valid string. The check |
|
59 can be turned off for maximum performance, but the consequences of supplying |
|
60 an invalid string are then undefined. |
|
61 |
|
62 Originally, this function checked according to RFC 2279, allowing for values in |
|
63 the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in |
|
64 the canonical format. Once somebody had pointed out RFC 3629 to me (it |
|
65 obsoletes 2279), additional restrictions were applied. The values are now |
|
66 limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the |
|
67 subrange 0xd800 to 0xdfff is excluded. |
|
68 |
|
69 Arguments: |
|
70 string points to the string |
|
71 length length of string, or -1 if the string is zero-terminated |
|
72 |
|
73 Returns: < 0 if the string is a valid UTF-8 string |
|
74 >= 0 otherwise; the value is the offset of the bad byte |
|
75 */ |
|
76 |
|
77 int |
|
78 _pcre_valid_utf8(const uschar *string, int length) |
|
79 { |
|
80 #ifdef SUPPORT_UTF8 |
|
81 register const uschar *p; |
|
82 |
|
83 if (length < 0) |
|
84 { |
|
85 for (p = string; *p != 0; p++); |
|
86 length = p - string; |
|
87 } |
|
88 |
|
89 for (p = string; length-- > 0; p++) |
|
90 { |
|
91 register int ab; |
|
92 register int c = *p; |
|
93 if (c < 128) continue; |
|
94 if (c < 0xc0) return p - string; |
|
95 ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
|
96 if (length < ab || ab > 3) return p - string; |
|
97 length -= ab; |
|
98 |
|
99 /* Check top bits in the second byte */ |
|
100 if ((*(++p) & 0xc0) != 0x80) return p - string; |
|
101 |
|
102 /* Check for overlong sequences for each different length, and for the |
|
103 excluded range 0xd000 to 0xdfff. */ |
|
104 |
|
105 switch (ab) |
|
106 { |
|
107 /* Check for xx00 000x (overlong sequence) */ |
|
108 |
|
109 case 1: |
|
110 if ((c & 0x3e) == 0) return p - string; |
|
111 continue; /* We know there aren't any more bytes to check */ |
|
112 |
|
113 /* Check for 1110 0000, xx0x xxxx (overlong sequence) or |
|
114 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ |
|
115 |
|
116 case 2: |
|
117 if ((c == 0xe0 && (*p & 0x20) == 0) || |
|
118 (c == 0xed && *p >= 0xa0)) |
|
119 return p - string; |
|
120 break; |
|
121 |
|
122 /* Check for 1111 0000, xx00 xxxx (overlong sequence) or |
|
123 greater than 0x0010ffff (f4 8f bf bf) */ |
|
124 |
|
125 case 3: |
|
126 if ((c == 0xf0 && (*p & 0x30) == 0) || |
|
127 (c > 0xf4 ) || |
|
128 (c == 0xf4 && *p > 0x8f)) |
|
129 return p - string; |
|
130 break; |
|
131 |
|
132 #if 0 |
|
133 /* These cases can no longer occur, as we restrict to a maximum of four |
|
134 bytes nowadays. Leave the code here in case we ever want to add an option |
|
135 for longer sequences. */ |
|
136 |
|
137 /* Check for 1111 1000, xx00 0xxx */ |
|
138 case 4: |
|
139 if (c == 0xf8 && (*p & 0x38) == 0) return p - string; |
|
140 break; |
|
141 |
|
142 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */ |
|
143 case 5: |
|
144 if (c == 0xfe || c == 0xff || |
|
145 (c == 0xfc && (*p & 0x3c) == 0)) return p - string; |
|
146 break; |
|
147 #endif |
|
148 |
|
149 } |
|
150 |
|
151 /* Check for valid bytes after the 2nd, if any; all must start 10 */ |
|
152 while (--ab > 0) |
|
153 { |
|
154 if ((*(++p) & 0xc0) != 0x80) return p - string; |
|
155 } |
|
156 } |
|
157 #else |
|
158 (void)(string); /* Keep picky compilers happy */ |
|
159 (void)(length); |
|
160 #endif |
|
161 |
|
162 return -1; |
|
163 } |
|
164 |
|
165 /* End of pcre_valid_utf8.c */ |