|
1 /* |
|
2 * Copyright (C) 2004, 2007, 2008 Apple Inc. All rights reserved. |
|
3 * |
|
4 * Redistribution and use in source and binary forms, with or without |
|
5 * modification, are permitted provided that the following conditions |
|
6 * are met: |
|
7 * 1. Redistributions of source code must retain the above copyright |
|
8 * notice, this list of conditions and the following disclaimer. |
|
9 * 2. Redistributions in binary form must reproduce the above copyright |
|
10 * notice, this list of conditions and the following disclaimer in the |
|
11 * documentation and/or other materials provided with the distribution. |
|
12 * |
|
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR |
|
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
24 */ |
|
25 |
|
26 #include "config.h" |
|
27 |
|
28 #if !USE(GOOGLEURL) |
|
29 |
|
30 #include "KURL.h" |
|
31 |
|
32 #include "StringHash.h" |
|
33 #include "TextEncoding.h" |
|
34 #include <wtf/text/CString.h> |
|
35 #include <wtf/HashMap.h> |
|
36 #include <wtf/StdLibExtras.h> |
|
37 |
|
38 #if USE(ICU_UNICODE) |
|
39 #include <unicode/uidna.h> |
|
40 #elif USE(QT4_UNICODE) |
|
41 #include <QUrl> |
|
42 #elif USE(GLIB_UNICODE) |
|
43 #include <glib.h> |
|
44 #include "GOwnPtr.h" |
|
45 #endif |
|
46 |
|
47 #include <stdio.h> |
|
48 |
|
49 using namespace std; |
|
50 using namespace WTF; |
|
51 |
|
52 namespace WebCore { |
|
53 |
|
54 typedef Vector<char, 512> CharBuffer; |
|
55 typedef Vector<UChar, 512> UCharBuffer; |
|
56 |
|
57 // FIXME: This file makes too much use of the + operator on String. |
|
58 // We either have to optimize that operator so it doesn't involve |
|
59 // so many allocations, or change this to use Vector<UChar> instead. |
|
60 |
|
// Bit flags describing which URL components a byte may appear in. The flags for
// each byte value are combined in characterClassTable.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 0x01,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 0x02,

    // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 0x04,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 0x08,

    // hexdigit | ":" | "%"
    IPv6Char = 0x10,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 0x20,

    // not allowed in path
    BadChar = 0x40
};
|
88 |
|
// Upper-case hexadecimal digits indexed by nibble value (0-15); the array also
// holds the string literal's terminating nul, for 17 bytes in total.
static const char hexDigits[] = "0123456789ABCDEF";
|
90 |
|
91 static const unsigned char characterClassTable[256] = { |
|
92 /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar, |
|
93 /* 2 stx */ BadChar, /* 3 etx */ BadChar, |
|
94 /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar, |
|
95 /* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar, |
|
96 /* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar, |
|
97 /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar, |
|
98 /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar, |
|
99 /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar, |
|
100 /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar, |
|
101 /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar, |
|
102 /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar, |
|
103 /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar, |
|
104 /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar, |
|
105 /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar, |
|
106 /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar, |
|
107 /* 44 , */ UserInfoChar, |
|
108 /* 45 - */ SchemeChar | UserInfoChar | HostnameChar, |
|
109 /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
110 /* 47 / */ PathSegmentEndChar, |
|
111 /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
112 /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
113 /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
114 /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
115 /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
116 /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
117 /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
118 /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
119 /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
120 /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
121 /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar, |
|
122 /* 60 < */ BadChar, /* 61 = */ UserInfoChar, |
|
123 /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar, |
|
124 /* 64 @ */ 0, |
|
125 /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
126 /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
127 /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
128 /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
129 /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
130 /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
131 /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
132 /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
133 /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
134 /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
135 /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
136 /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
137 /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
138 /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
139 /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
140 /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
141 /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
142 /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
143 /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
144 /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
145 /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
146 /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
147 /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
148 /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
149 /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
150 /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
151 /* 91 [ */ 0, |
|
152 /* 92 \ */ 0, /* 93 ] */ 0, |
|
153 /* 94 ^ */ 0, |
|
154 /* 95 _ */ UserInfoChar | HostnameChar, |
|
155 /* 96 ` */ 0, |
|
156 /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
157 /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
158 /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
159 /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
160 /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
161 /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, |
|
162 /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
163 /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
164 /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
165 /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
166 /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
167 /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
168 /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
169 /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
170 /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
171 /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
172 /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
173 /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
174 /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
175 /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
176 /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
177 /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
178 /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
179 /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
180 /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
181 /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, |
|
182 /* 123 { */ 0, |
|
183 /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar, |
|
184 /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar, |
|
185 /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar, |
|
186 /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar, |
|
187 /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar, |
|
188 /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar, |
|
189 /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar, |
|
190 /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar, |
|
191 /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar, |
|
192 /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar, |
|
193 /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar, |
|
194 /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar, |
|
195 /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar, |
|
196 /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar, |
|
197 /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar, |
|
198 /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar, |
|
199 /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar, |
|
200 /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar, |
|
201 /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar, |
|
202 /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar, |
|
203 /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar, |
|
204 /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar, |
|
205 /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar, |
|
206 /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar, |
|
207 /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar, |
|
208 /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar, |
|
209 /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar, |
|
210 /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar, |
|
211 /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar, |
|
212 /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar, |
|
213 /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar, |
|
214 /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar, |
|
215 /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar |
|
216 }; |
|
217 |
|
// Port numbers: 0xFFFF is the sentinel for an unparsable or out-of-range port,
// so the largest port treated as valid is one below it (0xFFFE).
static const unsigned invalidPortNumber = 0xFFFF;
static const unsigned maximumValidPortNumber = invalidPortNumber - 1;
|
220 |
|
221 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd); |
|
222 static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput); |
|
223 static String substituteBackslashes(const String&); |
|
224 static bool isValidProtocol(const String&); |
|
225 |
|
226 static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; } |
|
227 static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); } |
|
228 static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; } |
|
229 static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); } |
|
230 static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; } |
|
231 static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; } |
|
232 static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; } |
|
233 static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; } |
|
234 static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); } |
|
235 static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; } |
|
236 |
|
237 static inline int hexDigitValue(UChar c) |
|
238 { |
|
239 ASSERT(isASCIIHexDigit(c)); |
|
240 if (c < 'A') |
|
241 return c - '0'; |
|
242 return (c - 'A' + 10) & 0xF; // handle both upper and lower case without a branch |
|
243 } |
|
244 |
|
245 // Copies the source to the destination, assuming all the source characters are |
|
246 // ASCII. The destination buffer must be large enough. Null characters are allowed |
|
247 // in the source string, and no attempt is made to null-terminate the result. |
|
248 static void copyASCII(const UChar* src, int length, char* dest) |
|
249 { |
|
250 for (int i = 0; i < length; i++) |
|
251 dest[i] = static_cast<char>(src[i]); |
|
252 } |
|
253 |
|
254 static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) |
|
255 { |
|
256 buffer.resize(base.length() + len + 1); |
|
257 copyASCII(base.characters(), base.length(), buffer.data()); |
|
258 memcpy(buffer.data() + base.length(), rel, len); |
|
259 buffer[buffer.size() - 1] = '\0'; |
|
260 } |
|
261 |
|
262 // FIXME: Move to PlatformString.h eventually. |
|
263 // Returns the index of the first index in string |s| of any of the characters |
|
264 // in |toFind|. |toFind| should be a null-terminated string, all characters up |
|
265 // to the null will be searched. Returns int if not found. |
|
266 static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind) |
|
267 { |
|
268 for (int i = startPos; i < sLen; i++) { |
|
269 const char* cur = toFind; |
|
270 while (*cur) { |
|
271 if (s[i] == *(cur++)) |
|
272 return i; |
|
273 } |
|
274 } |
|
275 return -1; |
|
276 } |
|
277 |
|
278 #ifndef NDEBUG |
|
279 static void checkEncodedString(const String& url) |
|
280 { |
|
281 for (unsigned i = 0; i < url.length(); ++i) |
|
282 ASSERT(!(url[i] & ~0x7F)); |
|
283 |
|
284 ASSERT(!url.length() || isSchemeFirstChar(url[0])); |
|
285 } |
|
286 #else |
|
287 static inline void checkEncodedString(const String&) |
|
288 { |
|
289 } |
|
290 #endif |
|
291 |
|
292 inline bool KURL::protocolIs(const String& string, const char* protocol) |
|
293 { |
|
294 return WebCore::protocolIs(string, protocol); |
|
295 } |
|
296 |
|
297 void KURL::invalidate() |
|
298 { |
|
299 m_isValid = false; |
|
300 m_protocolInHTTPFamily = false; |
|
301 m_schemeEnd = 0; |
|
302 m_userStart = 0; |
|
303 m_userEnd = 0; |
|
304 m_passwordEnd = 0; |
|
305 m_hostEnd = 0; |
|
306 m_portEnd = 0; |
|
307 m_pathEnd = 0; |
|
308 m_pathAfterLastSlash = 0; |
|
309 m_queryEnd = 0; |
|
310 m_fragmentEnd = 0; |
|
311 } |
|
312 |
|
313 KURL::KURL(ParsedURLStringTag, const char* url) |
|
314 { |
|
315 parse(url, 0); |
|
316 ASSERT(url == m_string); |
|
317 } |
|
318 |
|
319 KURL::KURL(ParsedURLStringTag, const String& url) |
|
320 { |
|
321 parse(url); |
|
322 ASSERT(url == m_string); |
|
323 } |
|
324 |
|
325 KURL::KURL(const KURL& base, const String& relative) |
|
326 { |
|
327 init(base, relative, UTF8Encoding()); |
|
328 } |
|
329 |
|
330 KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding) |
|
331 { |
|
332 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as |
|
333 // we do when submitting a form. A form with GET method |
|
334 // has its contents added to a URL as query params and it makes sense |
|
335 // to be consistent. |
|
336 init(base, relative, encoding.encodingForFormSubmission()); |
|
337 } |
|
338 |
|
339 void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding) |
|
340 { |
|
341 // Allow resolutions with a null or empty base URL, but not with any other invalid one. |
|
342 // FIXME: Is this a good rule? |
|
343 if (!base.m_isValid && !base.isEmpty()) { |
|
344 m_string = relative; |
|
345 invalidate(); |
|
346 return; |
|
347 } |
|
348 |
|
349 // For compatibility with Win IE, treat backslashes as if they were slashes, |
|
350 // as long as we're not dealing with javascript: or data: URLs. |
|
351 String rel = relative; |
|
352 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) |
|
353 rel = substituteBackslashes(rel); |
|
354 |
|
355 String* originalString = &rel; |
|
356 |
|
357 bool allASCII = charactersAreAllASCII(rel.characters(), rel.length()); |
|
358 CharBuffer strBuffer; |
|
359 char* str; |
|
360 size_t len; |
|
361 if (allASCII) { |
|
362 len = rel.length(); |
|
363 strBuffer.resize(len + 1); |
|
364 copyASCII(rel.characters(), len, strBuffer.data()); |
|
365 strBuffer[len] = 0; |
|
366 str = strBuffer.data(); |
|
367 } else { |
|
368 originalString = 0; |
|
369 encodeRelativeString(rel, encoding, strBuffer); |
|
370 str = strBuffer.data(); |
|
371 len = strlen(str); |
|
372 } |
|
373 |
|
374 // Get rid of leading whitespace. |
|
375 while (*str == ' ') { |
|
376 originalString = 0; |
|
377 str++; |
|
378 --len; |
|
379 } |
|
380 |
|
381 // Get rid of trailing whitespace. |
|
382 while (len && str[len - 1] == ' ') { |
|
383 originalString = 0; |
|
384 str[--len] = '\0'; |
|
385 } |
|
386 |
|
387 // According to the RFC, the reference should be interpreted as an |
|
388 // absolute URI if possible, using the "leftmost, longest" |
|
389 // algorithm. If the URI reference is absolute it will have a |
|
390 // scheme, meaning that it will have a colon before the first |
|
391 // non-scheme element. |
|
392 bool absolute = false; |
|
393 char* p = str; |
|
394 if (isSchemeFirstChar(*p)) { |
|
395 ++p; |
|
396 while (isSchemeChar(*p)) { |
|
397 ++p; |
|
398 } |
|
399 if (*p == ':') { |
|
400 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) { |
|
401 str = p + 1; |
|
402 originalString = 0; |
|
403 } else |
|
404 absolute = true; |
|
405 } |
|
406 } |
|
407 |
|
408 CharBuffer parseBuffer; |
|
409 |
|
410 if (absolute) { |
|
411 parse(str, originalString); |
|
412 } else { |
|
413 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid |
|
414 // unless the relative URL is a single fragment. |
|
415 if (!base.isHierarchical()) { |
|
416 if (str[0] == '#') { |
|
417 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); |
|
418 parse(parseBuffer.data(), 0); |
|
419 } else { |
|
420 m_string = relative; |
|
421 invalidate(); |
|
422 } |
|
423 return; |
|
424 } |
|
425 |
|
426 switch (str[0]) { |
|
427 case '\0': |
|
428 // The reference is empty, so this is a reference to the same document with any fragment identifier removed. |
|
429 *this = base; |
|
430 removeFragmentIdentifier(); |
|
431 break; |
|
432 case '#': { |
|
433 // must be fragment-only reference |
|
434 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); |
|
435 parse(parseBuffer.data(), 0); |
|
436 break; |
|
437 } |
|
438 case '?': { |
|
439 // query-only reference, special case needed for non-URL results |
|
440 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); |
|
441 parse(parseBuffer.data(), 0); |
|
442 break; |
|
443 } |
|
444 case '/': |
|
445 // must be net-path or absolute-path reference |
|
446 if (str[1] == '/') { |
|
447 // net-path |
|
448 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); |
|
449 parse(parseBuffer.data(), 0); |
|
450 } else { |
|
451 // abs-path |
|
452 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); |
|
453 parse(parseBuffer.data(), 0); |
|
454 } |
|
455 break; |
|
456 default: |
|
457 { |
|
458 // must be relative-path reference |
|
459 |
|
460 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. |
|
461 parseBuffer.resize(base.m_pathEnd + 1 + len + 1); |
|
462 |
|
463 char* bufferPos = parseBuffer.data(); |
|
464 |
|
465 // first copy everything before the path from the base |
|
466 unsigned baseLength = base.m_string.length(); |
|
467 const UChar* baseCharacters = base.m_string.characters(); |
|
468 CharBuffer baseStringBuffer(baseLength); |
|
469 copyASCII(baseCharacters, baseLength, baseStringBuffer.data()); |
|
470 const char* baseString = baseStringBuffer.data(); |
|
471 const char* baseStringStart = baseString; |
|
472 const char* pathStart = baseStringStart + base.m_portEnd; |
|
473 while (baseStringStart < pathStart) |
|
474 *bufferPos++ = *baseStringStart++; |
|
475 char* bufferPathStart = bufferPos; |
|
476 |
|
477 // now copy the base path |
|
478 const char* baseStringEnd = baseString + base.m_pathEnd; |
|
479 |
|
480 // go back to the last slash |
|
481 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') |
|
482 baseStringEnd--; |
|
483 |
|
484 if (baseStringEnd == baseStringStart) { |
|
485 // no path in base, add a path separator if necessary |
|
486 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') |
|
487 *bufferPos++ = '/'; |
|
488 } else { |
|
489 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); |
|
490 } |
|
491 |
|
492 const char* relStringStart = str; |
|
493 const char* relStringPos = relStringStart; |
|
494 |
|
495 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { |
|
496 if (relStringPos[0] == '.' && bufferPos[-1] == '/') { |
|
497 if (isPathSegmentEndChar(relStringPos[1])) { |
|
498 // skip over "." segment |
|
499 relStringPos += 1; |
|
500 if (relStringPos[0] == '/') |
|
501 relStringPos++; |
|
502 continue; |
|
503 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { |
|
504 // skip over ".." segment and rewind the last segment |
|
505 // the RFC leaves it up to the app to decide what to do with excess |
|
506 // ".." segments - we choose to drop them since some web content |
|
507 // relies on this. |
|
508 relStringPos += 2; |
|
509 if (relStringPos[0] == '/') |
|
510 relStringPos++; |
|
511 if (bufferPos > bufferPathStart + 1) |
|
512 bufferPos--; |
|
513 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') |
|
514 bufferPos--; |
|
515 continue; |
|
516 } |
|
517 } |
|
518 |
|
519 *bufferPos = *relStringPos; |
|
520 relStringPos++; |
|
521 bufferPos++; |
|
522 } |
|
523 |
|
524 // all done with the path work, now copy any remainder |
|
525 // of the relative reference; this will also add a null terminator |
|
526 strcpy(bufferPos, relStringPos); |
|
527 |
|
528 parse(parseBuffer.data(), 0); |
|
529 |
|
530 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); |
|
531 break; |
|
532 } |
|
533 } |
|
534 } |
|
535 } |
|
536 |
|
537 KURL KURL::copy() const |
|
538 { |
|
539 KURL result = *this; |
|
540 result.m_string = result.m_string.crossThreadString(); |
|
541 return result; |
|
542 } |
|
543 |
|
544 bool KURL::hasPath() const |
|
545 { |
|
546 return m_pathEnd != m_portEnd; |
|
547 } |
|
548 |
|
549 String KURL::lastPathComponent() const |
|
550 { |
|
551 if (!hasPath()) |
|
552 return String(); |
|
553 |
|
554 int end = m_pathEnd - 1; |
|
555 if (m_string[end] == '/') |
|
556 --end; |
|
557 |
|
558 int start = m_string.reverseFind('/', end); |
|
559 if (start < m_portEnd) |
|
560 return String(); |
|
561 ++start; |
|
562 |
|
563 return m_string.substring(start, end - start + 1); |
|
564 } |
|
565 |
|
566 String KURL::protocol() const |
|
567 { |
|
568 return m_string.left(m_schemeEnd); |
|
569 } |
|
570 |
|
571 String KURL::host() const |
|
572 { |
|
573 int start = hostStart(); |
|
574 return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start)); |
|
575 } |
|
576 |
|
577 unsigned short KURL::port() const |
|
578 { |
|
579 // We return a port of 0 if there is no port specified. This can happen in two situations: |
|
580 // 1) The URL contains no colon after the host name and before the path component of the URL. |
|
581 // 2) The URL contains a colon but there's no port number before the path component of the URL begins. |
|
582 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) |
|
583 return 0; |
|
584 |
|
585 const UChar* stringData = m_string.characters(); |
|
586 bool ok = false; |
|
587 unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); |
|
588 if (!ok || number > maximumValidPortNumber) |
|
589 return invalidPortNumber; |
|
590 return number; |
|
591 } |
|
592 |
|
593 String KURL::pass() const |
|
594 { |
|
595 if (m_passwordEnd == m_userEnd) |
|
596 return String(); |
|
597 |
|
598 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); |
|
599 } |
|
600 |
|
601 String KURL::user() const |
|
602 { |
|
603 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); |
|
604 } |
|
605 |
|
606 String KURL::fragmentIdentifier() const |
|
607 { |
|
608 if (m_fragmentEnd == m_queryEnd) |
|
609 return String(); |
|
610 |
|
611 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); |
|
612 } |
|
613 |
|
614 bool KURL::hasFragmentIdentifier() const |
|
615 { |
|
616 return m_fragmentEnd != m_queryEnd; |
|
617 } |
|
618 |
|
619 String KURL::baseAsString() const |
|
620 { |
|
621 return m_string.left(m_pathAfterLastSlash); |
|
622 } |
|
623 |
|
624 #ifdef NDEBUG |
|
625 |
|
626 static inline void assertProtocolIsGood(const char*) |
|
627 { |
|
628 } |
|
629 |
|
630 #else |
|
631 |
|
632 static void assertProtocolIsGood(const char* protocol) |
|
633 { |
|
634 const char* p = protocol; |
|
635 while (*p) { |
|
636 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); |
|
637 ++p; |
|
638 } |
|
639 } |
|
640 |
|
641 #endif |
|
642 |
|
643 bool KURL::protocolIs(const char* protocol) const |
|
644 { |
|
645 assertProtocolIsGood(protocol); |
|
646 |
|
647 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. |
|
648 // The free function protocolIsJavaScript() should be used instead. |
|
649 ASSERT(!equalIgnoringCase(protocol, String("javascript"))); |
|
650 |
|
651 if (!m_isValid) |
|
652 return false; |
|
653 |
|
654 // Do the comparison without making a new string object. |
|
655 for (int i = 0; i < m_schemeEnd; ++i) { |
|
656 if (!protocol[i] || toASCIILower(m_string[i]) != protocol[i]) |
|
657 return false; |
|
658 } |
|
659 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. |
|
660 } |
|
661 |
|
662 String KURL::query() const |
|
663 { |
|
664 if (m_queryEnd == m_pathEnd) |
|
665 return String(); |
|
666 |
|
667 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); |
|
668 } |
|
669 |
|
670 String KURL::path() const |
|
671 { |
|
672 return decodeURLEscapeSequences(m_string.substring(m_portEnd, m_pathEnd - m_portEnd)); |
|
673 } |
|
674 |
|
675 bool KURL::setProtocol(const String& s) |
|
676 { |
|
677 // Firefox and IE remove everything after the first ':'. |
|
678 int separatorPosition = s.find(':'); |
|
679 String newProtocol = s.substring(0, separatorPosition); |
|
680 |
|
681 if (!isValidProtocol(newProtocol)) |
|
682 return false; |
|
683 |
|
684 if (!m_isValid) { |
|
685 parse(newProtocol + ":" + m_string); |
|
686 return true; |
|
687 } |
|
688 |
|
689 parse(newProtocol + m_string.substring(m_schemeEnd)); |
|
690 return true; |
|
691 } |
|
692 |
|
693 void KURL::setHost(const String& s) |
|
694 { |
|
695 if (!m_isValid) |
|
696 return; |
|
697 |
|
698 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, |
|
699 // and to avoid changing more than just the host. |
|
700 |
|
701 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; |
|
702 |
|
703 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); |
|
704 } |
|
705 |
|
706 void KURL::removePort() |
|
707 { |
|
708 if (m_hostEnd == m_portEnd) |
|
709 return; |
|
710 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); |
|
711 } |
|
712 |
|
713 void KURL::setPort(unsigned short i) |
|
714 { |
|
715 if (!m_isValid) |
|
716 return; |
|
717 |
|
718 bool colonNeeded = m_portEnd == m_hostEnd; |
|
719 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); |
|
720 |
|
721 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd)); |
|
722 } |
|
723 |
|
724 void KURL::setHostAndPort(const String& hostAndPort) |
|
725 { |
|
726 if (!m_isValid) |
|
727 return; |
|
728 |
|
729 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, |
|
730 // and to avoid changing more than just host and port. |
|
731 |
|
732 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; |
|
733 |
|
734 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd)); |
|
735 } |
|
736 |
|
737 void KURL::setUser(const String& user) |
|
738 { |
|
739 if (!m_isValid) |
|
740 return; |
|
741 |
|
742 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, |
|
743 // and to avoid changing more than just the user login. |
|
744 String u; |
|
745 int end = m_userEnd; |
|
746 if (!user.isEmpty()) { |
|
747 u = user; |
|
748 if (m_userStart == m_schemeEnd + 1) |
|
749 u = "//" + u; |
|
750 // Add '@' if we didn't have one before. |
|
751 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@')) |
|
752 u.append('@'); |
|
753 } else { |
|
754 // Remove '@' if we now have neither user nor password. |
|
755 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@') |
|
756 end += 1; |
|
757 } |
|
758 parse(m_string.left(m_userStart) + u + m_string.substring(end)); |
|
759 } |
|
760 |
|
761 void KURL::setPass(const String& password) |
|
762 { |
|
763 if (!m_isValid) |
|
764 return; |
|
765 |
|
766 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, |
|
767 // and to avoid changing more than just the user password. |
|
768 String p; |
|
769 int end = m_passwordEnd; |
|
770 if (!password.isEmpty()) { |
|
771 p = ":" + password + "@"; |
|
772 if (m_userEnd == m_schemeEnd + 1) |
|
773 p = "//" + p; |
|
774 // Eat the existing '@' since we are going to add our own. |
|
775 if (end != m_hostEnd && m_string[end] == '@') |
|
776 end += 1; |
|
777 } else { |
|
778 // Remove '@' if we now have neither user nor password. |
|
779 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@') |
|
780 end += 1; |
|
781 } |
|
782 parse(m_string.left(m_userEnd) + p + m_string.substring(end)); |
|
783 } |
|
784 |
|
785 void KURL::setFragmentIdentifier(const String& s) |
|
786 { |
|
787 if (!m_isValid) |
|
788 return; |
|
789 |
|
790 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations. |
|
791 parse(m_string.left(m_queryEnd) + "#" + s); |
|
792 } |
|
793 |
|
794 void KURL::removeFragmentIdentifier() |
|
795 { |
|
796 if (!m_isValid) |
|
797 return; |
|
798 parse(m_string.left(m_queryEnd)); |
|
799 } |
|
800 |
|
801 void KURL::setQuery(const String& query) |
|
802 { |
|
803 if (!m_isValid) |
|
804 return; |
|
805 |
|
806 // FIXME: '#' and non-ASCII characters must be encoded and escaped. |
|
807 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have |
|
808 // access to the document in this function. |
|
809 if ((query.isEmpty() || query[0] != '?') && !query.isNull()) |
|
810 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd)); |
|
811 else |
|
812 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd)); |
|
813 |
|
814 } |
|
815 |
|
816 void KURL::setPath(const String& s) |
|
817 { |
|
818 if (!m_isValid) |
|
819 return; |
|
820 |
|
821 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts |
|
822 // may be inadvertently affected. |
|
823 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(s) + m_string.substring(m_pathEnd)); |
|
824 } |
|
825 |
|
826 String KURL::prettyURL() const |
|
827 { |
|
828 if (!m_isValid) |
|
829 return m_string; |
|
830 |
|
831 Vector<UChar> result; |
|
832 |
|
833 append(result, protocol()); |
|
834 result.append(':'); |
|
835 |
|
836 Vector<UChar> authority; |
|
837 |
|
838 if (m_hostEnd != m_passwordEnd) { |
|
839 if (m_userEnd != m_userStart) { |
|
840 append(authority, user()); |
|
841 authority.append('@'); |
|
842 } |
|
843 append(authority, host()); |
|
844 if (hasPort()) { |
|
845 authority.append(':'); |
|
846 append(authority, String::number(port())); |
|
847 } |
|
848 } |
|
849 |
|
850 if (!authority.isEmpty()) { |
|
851 result.append('/'); |
|
852 result.append('/'); |
|
853 result.append(authority); |
|
854 } else if (protocolIs("file")) { |
|
855 result.append('/'); |
|
856 result.append('/'); |
|
857 } |
|
858 |
|
859 append(result, path()); |
|
860 |
|
861 if (m_pathEnd != m_queryEnd) { |
|
862 result.append('?'); |
|
863 append(result, query()); |
|
864 } |
|
865 |
|
866 if (m_fragmentEnd != m_queryEnd) { |
|
867 result.append('#'); |
|
868 append(result, fragmentIdentifier()); |
|
869 } |
|
870 |
|
871 return String::adopt(result); |
|
872 } |
|
873 |
|
874 String decodeURLEscapeSequences(const String& str) |
|
875 { |
|
876 return decodeURLEscapeSequences(str, UTF8Encoding()); |
|
877 } |
|
878 |
|
879 String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) |
|
880 { |
|
881 Vector<UChar> result; |
|
882 |
|
883 CharBuffer buffer; |
|
884 |
|
885 int length = str.length(); |
|
886 int decodedPosition = 0; |
|
887 int searchPosition = 0; |
|
888 int encodedRunPosition; |
|
889 while ((encodedRunPosition = str.find('%', searchPosition)) >= 0) { |
|
890 // Find the sequence of %-escape codes. |
|
891 int encodedRunEnd = encodedRunPosition; |
|
892 while (length - encodedRunEnd >= 3 |
|
893 && str[encodedRunEnd] == '%' |
|
894 && isASCIIHexDigit(str[encodedRunEnd + 1]) |
|
895 && isASCIIHexDigit(str[encodedRunEnd + 2])) |
|
896 encodedRunEnd += 3; |
|
897 if (encodedRunEnd == encodedRunPosition) { |
|
898 ++searchPosition; |
|
899 continue; |
|
900 } |
|
901 searchPosition = encodedRunEnd; |
|
902 |
|
903 // Decode the %-escapes into bytes. |
|
904 unsigned runLength = (encodedRunEnd - encodedRunPosition) / 3; |
|
905 buffer.resize(runLength); |
|
906 char* p = buffer.data(); |
|
907 const UChar* q = str.characters() + encodedRunPosition; |
|
908 for (unsigned i = 0; i < runLength; ++i) { |
|
909 *p++ = (hexDigitValue(q[1]) << 4) | hexDigitValue(q[2]); |
|
910 q += 3; |
|
911 } |
|
912 |
|
913 // Decode the bytes into Unicode characters. |
|
914 String decoded = (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); |
|
915 if (decoded.isEmpty()) |
|
916 continue; |
|
917 |
|
918 // Build up the string with what we just skipped and what we just decoded. |
|
919 result.append(str.characters() + decodedPosition, encodedRunPosition - decodedPosition); |
|
920 result.append(decoded.characters(), decoded.length()); |
|
921 decodedPosition = encodedRunEnd; |
|
922 } |
|
923 |
|
924 result.append(str.characters() + decodedPosition, length - decodedPosition); |
|
925 |
|
926 return String::adopt(result); |
|
927 } |
|
928 |
|
929 bool KURL::isLocalFile() const |
|
930 { |
|
931 // Including feed here might be a bad idea since drag and drop uses this check |
|
932 // and including feed would allow feeds to potentially let someone's blog |
|
933 // read the contents of the clipboard on a drag, even without a drop. |
|
934 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. |
|
935 return protocolIs("file"); |
|
936 } |
|
937 |
|
938 static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) |
|
939 { |
|
940 char* p = buffer; |
|
941 |
|
942 const char* str = strStart; |
|
943 const char* strEnd = strStart + length; |
|
944 while (str < strEnd) { |
|
945 unsigned char c = *str++; |
|
946 if (isBadChar(c)) { |
|
947 if (c == '%' || c == '?') { |
|
948 *p++ = c; |
|
949 } else if (c != 0x09 && c != 0x0a && c != 0x0d) { |
|
950 *p++ = '%'; |
|
951 *p++ = hexDigits[c >> 4]; |
|
952 *p++ = hexDigits[c & 0xF]; |
|
953 } |
|
954 } else { |
|
955 *p++ = c; |
|
956 } |
|
957 } |
|
958 |
|
959 buffer = p; |
|
960 } |
|
961 |
|
962 // copy a path, accounting for "." and ".." segments |
|
963 static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd) |
|
964 { |
|
965 char* bufferPathStart = dst; |
|
966 |
|
967 // empty path is a special case, and need not have a leading slash |
|
968 if (srcStart != srcEnd) { |
|
969 const char* baseStringStart = src + srcStart; |
|
970 const char* baseStringEnd = src + srcEnd; |
|
971 const char* baseStringPos = baseStringStart; |
|
972 |
|
973 // this code is unprepared for paths that do not begin with a |
|
974 // slash and we should always have one in the source string |
|
975 ASSERT(baseStringPos[0] == '/'); |
|
976 |
|
977 // copy the leading slash into the destination |
|
978 *dst = *baseStringPos; |
|
979 baseStringPos++; |
|
980 dst++; |
|
981 |
|
982 while (baseStringPos < baseStringEnd) { |
|
983 if (baseStringPos[0] == '.' && dst[-1] == '/') { |
|
984 if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) { |
|
985 // skip over "." segment |
|
986 baseStringPos += 2; |
|
987 continue; |
|
988 } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' || |
|
989 baseStringPos + 2 == baseStringEnd)) { |
|
990 // skip over ".." segment and rewind the last segment |
|
991 // the RFC leaves it up to the app to decide what to do with excess |
|
992 // ".." segments - we choose to drop them since some web content |
|
993 // relies on this. |
|
994 baseStringPos += 3; |
|
995 if (dst > bufferPathStart + 1) |
|
996 dst--; |
|
997 // Note that these two while blocks differ subtly. |
|
998 // The first helps to remove multiple adjoining slashes as we rewind. |
|
999 // The +1 to bufferPathStart in the first while block prevents eating a leading slash |
|
1000 while (dst > bufferPathStart + 1 && dst[-1] == '/') |
|
1001 dst--; |
|
1002 while (dst > bufferPathStart && dst[-1] != '/') |
|
1003 dst--; |
|
1004 continue; |
|
1005 } |
|
1006 } |
|
1007 |
|
1008 *dst = *baseStringPos; |
|
1009 baseStringPos++; |
|
1010 dst++; |
|
1011 } |
|
1012 } |
|
1013 *dst = '\0'; |
|
1014 return dst - bufferPathStart; |
|
1015 } |
|
1016 |
|
// Returns true when the NUL-terminated |str| contains a '.' that immediately
// follows a '/' or another '.'. The first character is never reported, since
// nothing precedes it.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str);
    if (bytes[0] == '\0')
        return false;

    for (const unsigned char* cur = bytes + 1; *cur; ++cur) {
        if (*cur != '.')
            continue;
        const unsigned char before = cur[-1];
        if (before == '/' || before == '.')
            return true;
    }
    return false;
}
|
1030 |
|
// Compares |c| with |lowercaseLetter| after setting bit 0x20 in |c|, which maps
// an uppercase ASCII letter onto its lowercase form.
static inline bool matchLetter(char c, char lowercaseLetter)
{
    const int folded = c | 0x20;
    return folded == lowercaseLetter;
}
|
1035 |
|
1036 void KURL::parse(const String& string) |
|
1037 { |
|
1038 checkEncodedString(string); |
|
1039 |
|
1040 CharBuffer buffer(string.length() + 1); |
|
1041 copyASCII(string.characters(), string.length(), buffer.data()); |
|
1042 buffer[string.length()] = '\0'; |
|
1043 parse(buffer.data(), &string); |
|
1044 } |
|
1045 |
|
// Parses the NUL-terminated 8-bit string |url| into this KURL.
//
// On success, a normalized copy of the URL is stored in m_string, the component
// offsets (m_schemeEnd, m_userStart, m_userEnd, m_passwordEnd, m_hostEnd,
// m_portEnd, m_pathEnd, m_pathAfterLastSlash, m_queryEnd, m_fragmentEnd) are set
// to positions within that normalized string, and m_isValid is set to true.
// On failure, m_string is set to |*originalString| when it is non-null (otherwise
// to |url|) and invalidate() is called.
//
// |originalString|, when non-null, is the String that |url| was derived from; it
// is reused for m_string when normalization did not change anything.
void KURL::parse(const char* url, const String* originalString)
{
    if (!url || url[0] == '\0') {
        // valid URL must be non-empty
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    if (!isSchemeFirstChar(url[0])) {
        // scheme must start with an alphabetic character
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    // Scan the scheme; it must be terminated by ':'.
    int schemeEnd = 0;
    while (isSchemeChar(url[schemeEnd]))
        schemeEnd++;

    if (url[schemeEnd] != ':') {
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    // Offsets of the authority components within |url| (not within the normalized output).
    int userStart = schemeEnd + 1;
    int userEnd;
    int passwordStart;
    int passwordEnd;
    int hostStart;
    int hostEnd;
    int portStart;
    int portEnd;

    // A '/' directly after the scheme's ':' marks the URL as hierarchical.
    bool hierarchical = url[schemeEnd + 1] == '/';

    bool isFile = schemeEnd == 4
        && matchLetter(url[0], 'f')
        && matchLetter(url[1], 'i')
        && matchLetter(url[2], 'l')
        && matchLetter(url[3], 'e');

    // True for "http:" and "https:" (case-insensitive).
    m_protocolInHTTPFamily = matchLetter(url[0], 'h')
        && matchLetter(url[1], 't')
        && matchLetter(url[2], 't')
        && matchLetter(url[3], 'p')
        && (url[4] == ':' || (matchLetter(url[4], 's') && url[5] == ':'));

    if (hierarchical && url[schemeEnd + 2] == '/') {
        // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
        // Attempt to find an authority.

        // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.
        userStart += 2;
        userEnd = userStart;

        // Scan what might be user info, remembering the first ':' (user/password separator).
        int colonPos = 0;
        while (isUserInfoChar(url[userEnd])) {
            if (url[userEnd] == ':' && colonPos == 0)
                colonPos = userEnd;
            userEnd++;
        }

        if (url[userEnd] == '@') {
            // actual end of the userinfo, start on the host
            if (colonPos != 0) {
                passwordEnd = userEnd;
                userEnd = colonPos;
                passwordStart = colonPos + 1;
            } else
                passwordStart = passwordEnd = userEnd;

            hostStart = passwordEnd + 1;
        } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
            // hit the end of the authority, must have been no user
            // or looks like an IPv6 hostname
            // either way, try to parse it as a hostname
            userEnd = userStart;
            passwordStart = passwordEnd = userEnd;
            hostStart = userStart;
        } else {
            // invalid character
            m_string = originalString ? *originalString : url;
            invalidate();
            return;
        }

        hostEnd = hostStart;

        // IPV6 IP address
        if (url[hostEnd] == '[') {
            hostEnd++;
            while (isIPv6Char(url[hostEnd]))
                hostEnd++;
            if (url[hostEnd] == ']')
                hostEnd++;
            else {
                // invalid character
                m_string = originalString ? *originalString : url;
                invalidate();
                return;
            }
        } else {
            while (isHostnameChar(url[hostEnd]))
                hostEnd++;
        }

        if (url[hostEnd] == ':') {
            portStart = portEnd = hostEnd + 1;

            // possible start of port
            portEnd = portStart;
            while (isASCIIDigit(url[portEnd]))
                portEnd++;
        } else
            portStart = portEnd = hostEnd;

        // The authority must be followed by something that ends a path segment.
        if (!isPathSegmentEndChar(url[portEnd])) {
            // invalid character
            m_string = originalString ? *originalString : url;
            invalidate();
            return;
        }

        if (userStart == portEnd && !m_protocolInHTTPFamily && !isFile) {
            // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
            // path segments are empty. For file, http and https only, an empty authority is allowed.
            userStart -= 2;
            userEnd = userStart;
            passwordStart = userEnd;
            passwordEnd = passwordStart;
            hostStart = passwordEnd;
            hostEnd = hostStart;
            portStart = hostEnd;
            portEnd = hostEnd;
        }
    } else {
        // the part after the scheme must be an opaque_part or an abs_path
        userEnd = userStart;
        passwordStart = passwordEnd = userEnd;
        hostStart = hostEnd = passwordEnd;
        portStart = portEnd = hostEnd;
    }

    // The path runs from the end of the authority up to '?', '#' or the end of the string.
    int pathStart = portEnd;
    int pathEnd = pathStart;
    while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
        pathEnd++;

    // The query range, when present, includes its leading '?'.
    int queryStart = pathEnd;
    int queryEnd = queryStart;
    if (url[queryStart] == '?') {
        while (url[queryEnd] && url[queryEnd] != '#')
            queryEnd++;
    }

    // The fragment range, when present, excludes its leading '#'.
    int fragmentStart = queryEnd;
    int fragmentEnd = fragmentStart;
    if (url[fragmentStart] == '#') {
        fragmentStart++;
        fragmentEnd = fragmentStart;
        while (url[fragmentEnd])
            fragmentEnd++;
    }

    // assemble it all, remembering the real ranges

    // Sized at three output characters per input character plus one.
    // NOTE(review): presumably this covers the worst case of every character
    // becoming a %XX escape - confirm against appendEscapingBadChars().
    Vector<char, 4096> buffer(fragmentEnd * 3 + 1);

    char *p = buffer.data();
    const char *strPtr = url;

    // copy in the scheme
    const char *schemeEndPtr = url + schemeEnd;
    while (strPtr < schemeEndPtr)
        *p++ = *strPtr++;
    m_schemeEnd = p - buffer.data();

    // True when the whole authority is exactly "localhost" (case-insensitive).
    bool hostIsLocalHost = portEnd - userStart == 9
        && matchLetter(url[userStart], 'l')
        && matchLetter(url[userStart+1], 'o')
        && matchLetter(url[userStart+2], 'c')
        && matchLetter(url[userStart+3], 'a')
        && matchLetter(url[userStart+4], 'l')
        && matchLetter(url[userStart+5], 'h')
        && matchLetter(url[userStart+6], 'o')
        && matchLetter(url[userStart+7], 's')
        && matchLetter(url[userStart+8], 't');

    // File URLs need a host part unless it is just file:// or file://localhost
    bool degenFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);

    bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd;

    // add ":" after scheme
    *p++ = ':';

    // if we have at least one authority part or a file URL - add "//" and authority
    if (isFile ? !degenFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
        *p++ = '/';
        *p++ = '/';

        m_userStart = p - buffer.data();

        // copy in the user
        strPtr = url + userStart;
        const char* userEndPtr = url + userEnd;
        while (strPtr < userEndPtr)
            *p++ = *strPtr++;
        m_userEnd = p - buffer.data();

        // copy in the password
        if (passwordEnd != passwordStart) {
            *p++ = ':';
            strPtr = url + passwordStart;
            const char* passwordEndPtr = url + passwordEnd;
            while (strPtr < passwordEndPtr)
                *p++ = *strPtr++;
        }
        m_passwordEnd = p - buffer.data();

        // If we had any user info, add "@"
        if (p - buffer.data() != m_userStart)
            *p++ = '@';

        // copy in the host, except in the case of a file URL with authority="localhost"
        if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
            strPtr = url + hostStart;
            const char* hostEndPtr = url + hostEnd;
            while (strPtr < hostEndPtr)
                *p++ = *strPtr++;
        }
        m_hostEnd = p - buffer.data();

        // copy in the port
        if (hostEnd != portStart) {
            *p++ = ':';
            strPtr = url + portStart;
            const char *portEndPtr = url + portEnd;
            while (strPtr < portEndPtr)
                *p++ = *strPtr++;
        }
        m_portEnd = p - buffer.data();
    } else
        // No authority is written: every authority offset collapses to the position after "scheme:".
        m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();

    // For canonicalization, ensure we have a '/' for no path.
    // Do this only for hierarchical URL with protocol http or https.
    if (m_protocolInHTTPFamily && hierarchical && pathEnd == pathStart)
        *p++ = '/';

    // add path, escaping bad characters
    // Dot-segment removal is attempted only for hierarchical URLs in which
    // hasSlashDotOrDotDot() finds a candidate sequence.
    if (!hierarchical || !hasSlashDotOrDotDot(url))
        appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
    else {
        CharBuffer pathBuffer(pathEnd - pathStart + 1);
        size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
        appendEscapingBadChars(p, pathBuffer.data(), length);
    }

    m_pathEnd = p - buffer.data();

    // Find the position after the last slash in the path, or
    // the position before the path if there are no slashes in it.
    int i;
    for (i = m_pathEnd; i > m_portEnd; --i) {
        if (buffer[i - 1] == '/')
            break;
    }
    m_pathAfterLastSlash = i;

    // add query, escaping bad characters
    appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
    m_queryEnd = p - buffer.data();

    // add fragment, escaping bad characters
    if (fragmentEnd != queryEnd) {
        *p++ = '#';
        appendEscapingBadChars(p, url + fragmentStart, fragmentEnd - fragmentStart);
    }
    m_fragmentEnd = p - buffer.data();

    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));

    // If we didn't end up actually changing the original string and
    // it was already in a String, reuse it to avoid extra allocation.
    if (originalString && originalString->length() == static_cast<unsigned>(m_fragmentEnd) && strncmp(buffer.data(), url, m_fragmentEnd) == 0)
        m_string = *originalString;
    else
        m_string = String(buffer.data(), m_fragmentEnd);

    m_isValid = true;
}
|
1340 |
|
1341 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) |
|
1342 { |
|
1343 if (a.m_queryEnd != b.m_queryEnd) |
|
1344 return false; |
|
1345 unsigned queryLength = a.m_queryEnd; |
|
1346 for (unsigned i = 0; i < queryLength; ++i) |
|
1347 if (a.string()[i] != b.string()[i]) |
|
1348 return false; |
|
1349 return true; |
|
1350 } |
|
1351 |
|
1352 bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) |
|
1353 { |
|
1354 if (a.m_schemeEnd != b.m_schemeEnd) |
|
1355 return false; |
|
1356 |
|
1357 int hostStartA = a.hostStart(); |
|
1358 int hostLengthA = a.hostEnd() - hostStartA; |
|
1359 int hostStartB = b.hostStart(); |
|
1360 int hostLengthB = b.hostEnd() - b.hostStart(); |
|
1361 if (hostLengthA != hostLengthB) |
|
1362 return false; |
|
1363 |
|
1364 // Check the scheme |
|
1365 for (int i = 0; i < a.m_schemeEnd; ++i) |
|
1366 if (a.string()[i] != b.string()[i]) |
|
1367 return false; |
|
1368 |
|
1369 // And the host |
|
1370 for (int i = 0; i < hostLengthA; ++i) |
|
1371 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) |
|
1372 return false; |
|
1373 |
|
1374 if (a.port() != b.port()) |
|
1375 return false; |
|
1376 |
|
1377 return true; |
|
1378 } |
|
1379 |
|
1380 String encodeWithURLEscapeSequences(const String& notEncodedString) |
|
1381 { |
|
1382 CString asUTF8 = notEncodedString.utf8(); |
|
1383 |
|
1384 CharBuffer buffer(asUTF8.length() * 3 + 1); |
|
1385 char* p = buffer.data(); |
|
1386 |
|
1387 const char* str = asUTF8.data(); |
|
1388 const char* strEnd = str + asUTF8.length(); |
|
1389 while (str < strEnd) { |
|
1390 unsigned char c = *str++; |
|
1391 if (isBadChar(c)) { |
|
1392 *p++ = '%'; |
|
1393 *p++ = hexDigits[c >> 4]; |
|
1394 *p++ = hexDigits[c & 0xF]; |
|
1395 } else |
|
1396 *p++ = c; |
|
1397 } |
|
1398 |
|
1399 ASSERT(p - buffer.data() <= static_cast<int>(buffer.size())); |
|
1400 |
|
1401 return String(buffer.data(), p - buffer.data()); |
|
1402 } |
|
1403 |
|
1404 // Appends the punycoded hostname identified by the given string and length to |
|
1405 // the output buffer. The result will not be null terminated. |
|
1406 static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen) |
|
1407 { |
|
1408 // Needs to be big enough to hold an IDN-encoded name. |
|
1409 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. |
|
1410 const unsigned hostnameBufferLength = 2048; |
|
1411 |
|
1412 if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) { |
|
1413 buffer.append(str, strLen); |
|
1414 return; |
|
1415 } |
|
1416 |
|
1417 #if USE(ICU_UNICODE) |
|
1418 UChar hostnameBuffer[hostnameBufferLength]; |
|
1419 UErrorCode error = U_ZERO_ERROR; |
|
1420 int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer, |
|
1421 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); |
|
1422 if (error == U_ZERO_ERROR) |
|
1423 buffer.append(hostnameBuffer, numCharactersConverted); |
|
1424 #elif USE(QT4_UNICODE) |
|
1425 QByteArray result = QUrl::toAce(String(str, strLen)); |
|
1426 buffer.append(result.constData(), result.length()); |
|
1427 #elif USE(GLIB_UNICODE) |
|
1428 GOwnPtr<gchar> utf8Hostname; |
|
1429 GOwnPtr<GError> utf8Err; |
|
1430 utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr())); |
|
1431 if (utf8Err) |
|
1432 return; |
|
1433 |
|
1434 GOwnPtr<gchar> encodedHostname; |
|
1435 encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get())); |
|
1436 if (!encodedHostname) |
|
1437 return; |
|
1438 |
|
1439 buffer.append(encodedHostname.get(), strlen(encodedHostname.get())); |
|
1440 #endif |
|
1441 } |
|
1442 |
|
1443 static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges) |
|
1444 { |
|
1445 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. |
|
1446 // Skip quoted strings so that characters in them don't confuse us. |
|
1447 // When we find a '?' character, we are past the part of the URL that contains host names. |
|
1448 |
|
1449 nameRanges.clear(); |
|
1450 |
|
1451 int p = 0; |
|
1452 while (1) { |
|
1453 // Find start of host name or of quoted string. |
|
1454 int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?"); |
|
1455 if (hostnameOrStringStart == -1) |
|
1456 return; |
|
1457 UChar c = str[hostnameOrStringStart]; |
|
1458 p = hostnameOrStringStart + 1; |
|
1459 |
|
1460 if (c == '?') |
|
1461 return; |
|
1462 |
|
1463 if (c == '@') { |
|
1464 // Find end of host name. |
|
1465 int hostnameStart = p; |
|
1466 int hostnameEnd = findFirstOf(str, strLen, p, ">,?"); |
|
1467 bool done; |
|
1468 if (hostnameEnd == -1) { |
|
1469 hostnameEnd = strLen; |
|
1470 done = true; |
|
1471 } else { |
|
1472 p = hostnameEnd; |
|
1473 done = false; |
|
1474 } |
|
1475 |
|
1476 nameRanges.append(make_pair(hostnameStart, hostnameEnd)); |
|
1477 |
|
1478 if (done) |
|
1479 return; |
|
1480 } else { |
|
1481 // Skip quoted string. |
|
1482 ASSERT(c == '"'); |
|
1483 while (1) { |
|
1484 int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\"); |
|
1485 if (escapedCharacterOrStringEnd == -1) |
|
1486 return; |
|
1487 |
|
1488 c = str[escapedCharacterOrStringEnd]; |
|
1489 p = escapedCharacterOrStringEnd + 1; |
|
1490 |
|
1491 // If we are the end of the string, then break from the string loop back to the host name loop. |
|
1492 if (c == '"') |
|
1493 break; |
|
1494 |
|
1495 // Skip escaped character. |
|
1496 ASSERT(c == '\\'); |
|
1497 if (p == strLen) |
|
1498 return; |
|
1499 |
|
1500 ++p; |
|
1501 } |
|
1502 } |
|
1503 } |
|
1504 } |
|
1505 |
|
1506 static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset) |
|
1507 { |
|
1508 // Find the host name in a hierarchical URL. |
|
1509 // It comes after a "://" sequence, with scheme characters preceding, and |
|
1510 // this should be the first colon in the string. |
|
1511 // It ends with the end of the string or a ":" or a path segment ending character. |
|
1512 // If there is a "@" character, the host part is just the part after the "@". |
|
1513 int separator = findFirstOf(str, strLen, 0, ":"); |
|
1514 if (separator == -1 || separator + 2 >= strLen || |
|
1515 str[separator + 1] != '/' || str[separator + 2] != '/') |
|
1516 return false; |
|
1517 |
|
1518 // Check that all characters before the :// are valid scheme characters. |
|
1519 if (!isSchemeFirstChar(str[0])) |
|
1520 return false; |
|
1521 for (int i = 1; i < separator; ++i) { |
|
1522 if (!isSchemeChar(str[i])) |
|
1523 return false; |
|
1524 } |
|
1525 |
|
1526 // Start after the separator. |
|
1527 int authorityStart = separator + 3; |
|
1528 |
|
1529 // Find terminating character. |
|
1530 int hostnameEnd = strLen; |
|
1531 for (int i = authorityStart; i < strLen; ++i) { |
|
1532 UChar c = str[i]; |
|
1533 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { |
|
1534 hostnameEnd = i; |
|
1535 break; |
|
1536 } |
|
1537 } |
|
1538 |
|
1539 // Find "@" for the start of the host name. |
|
1540 int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@"); |
|
1541 int hostnameStart; |
|
1542 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) |
|
1543 hostnameStart = authorityStart; |
|
1544 else |
|
1545 hostnameStart = userInfoTerminator + 1; |
|
1546 |
|
1547 startOffset = hostnameStart; |
|
1548 endOffset = hostnameEnd; |
|
1549 return true; |
|
1550 } |
|
1551 |
|
1552 // Converts all hostnames found in the given input to punycode, preserving the |
|
1553 // rest of the URL unchanged. The output will NOT be null-terminated. |
|
1554 static void encodeHostnames(const String& str, UCharBuffer& output) |
|
1555 { |
|
1556 output.clear(); |
|
1557 |
|
1558 if (protocolIs(str, "mailto")) { |
|
1559 Vector<pair<int, int> > hostnameRanges; |
|
1560 findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges); |
|
1561 int n = hostnameRanges.size(); |
|
1562 int p = 0; |
|
1563 for (int i = 0; i < n; ++i) { |
|
1564 const pair<int, int>& r = hostnameRanges[i]; |
|
1565 output.append(&str.characters()[p], r.first - p); |
|
1566 appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first); |
|
1567 p = r.second; |
|
1568 } |
|
1569 // This will copy either everything after the last hostname, or the |
|
1570 // whole thing if there is no hostname. |
|
1571 output.append(&str.characters()[p], str.length() - p); |
|
1572 } else { |
|
1573 int hostStart, hostEnd; |
|
1574 if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) { |
|
1575 output.append(str.characters(), hostStart); // Before hostname. |
|
1576 appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart); |
|
1577 output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname. |
|
1578 } else { |
|
1579 // No hostname to encode, return the input. |
|
1580 output.append(str.characters(), str.length()); |
|
1581 } |
|
1582 } |
|
1583 } |
|
1584 |
|
1585 static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) |
|
1586 { |
|
1587 UCharBuffer s; |
|
1588 encodeHostnames(rel, s); |
|
1589 |
|
1590 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. |
|
1591 |
|
1592 int pathEnd = -1; |
|
1593 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { |
|
1594 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. |
|
1595 pathEnd = findFirstOf(s.data(), s.size(), 0, "#?"); |
|
1596 } |
|
1597 |
|
1598 if (pathEnd == -1) { |
|
1599 CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables); |
|
1600 output.resize(decoded.length()); |
|
1601 memcpy(output.data(), decoded.data(), decoded.length()); |
|
1602 } else { |
|
1603 CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables); |
|
1604 // Unencodable characters in URLs are represented by converting |
|
1605 // them to XML entities and escaping non-alphanumeric characters. |
|
1606 CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables); |
|
1607 |
|
1608 output.resize(pathDecoded.length() + otherDecoded.length()); |
|
1609 memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); |
|
1610 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); |
|
1611 } |
|
1612 output.append('\0'); // null-terminate the output. |
|
1613 } |
|
1614 |
|
1615 static String substituteBackslashes(const String& string) |
|
1616 { |
|
1617 int questionPos = string.find('?'); |
|
1618 int hashPos = string.find('#'); |
|
1619 int pathEnd; |
|
1620 |
|
1621 if (hashPos >= 0 && (questionPos < 0 || questionPos > hashPos)) |
|
1622 pathEnd = hashPos; |
|
1623 else if (questionPos >= 0) |
|
1624 pathEnd = questionPos; |
|
1625 else |
|
1626 pathEnd = string.length(); |
|
1627 |
|
1628 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); |
|
1629 } |
|
1630 |
|
1631 bool KURL::isHierarchical() const |
|
1632 { |
|
1633 if (!m_isValid) |
|
1634 return false; |
|
1635 ASSERT(m_string[m_schemeEnd] == ':'); |
|
1636 return m_string[m_schemeEnd + 1] == '/'; |
|
1637 } |
|
1638 |
|
1639 void KURL::copyToBuffer(CharBuffer& buffer) const |
|
1640 { |
|
1641 // FIXME: This throws away the high bytes of all the characters in the string! |
|
1642 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. |
|
1643 buffer.resize(m_string.length()); |
|
1644 copyASCII(m_string.characters(), m_string.length(), buffer.data()); |
|
1645 } |
|
1646 |
|
1647 bool protocolIs(const String& url, const char* protocol) |
|
1648 { |
|
1649 // Do the comparison without making a new string object. |
|
1650 assertProtocolIsGood(protocol); |
|
1651 for (int i = 0; ; ++i) { |
|
1652 if (!protocol[i]) |
|
1653 return url[i] == ':'; |
|
1654 if (toASCIILower(url[i]) != protocol[i]) |
|
1655 return false; |
|
1656 } |
|
1657 } |
|
1658 |
|
1659 bool protocolIsJavaScript(const String& url) |
|
1660 { |
|
1661 return protocolIs(url, "javascript"); |
|
1662 } |
|
1663 |
|
1664 bool isValidProtocol(const String& protocol) |
|
1665 { |
|
1666 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) |
|
1667 if (protocol.isEmpty()) |
|
1668 return false; |
|
1669 if (!isSchemeFirstChar(protocol[0])) |
|
1670 return false; |
|
1671 unsigned protocolLength = protocol.length(); |
|
1672 for (unsigned i = 1; i < protocolLength; i++) { |
|
1673 if (!isSchemeChar(protocol[i])) |
|
1674 return false; |
|
1675 } |
|
1676 return true; |
|
1677 } |
|
1678 |
|
1679 bool isDefaultPortForProtocol(unsigned short port, const String& protocol) |
|
1680 { |
|
1681 if (protocol.isEmpty()) |
|
1682 return false; |
|
1683 |
|
1684 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; |
|
1685 DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); |
|
1686 if (defaultPorts.isEmpty()) { |
|
1687 defaultPorts.set("http", 80); |
|
1688 defaultPorts.set("https", 443); |
|
1689 defaultPorts.set("ftp", 21); |
|
1690 defaultPorts.set("ftps", 990); |
|
1691 } |
|
1692 return defaultPorts.get(protocol) == port; |
|
1693 } |
|
1694 |
|
1695 bool portAllowed(const KURL& url) |
|
1696 { |
|
1697 unsigned short port = url.port(); |
|
1698 |
|
1699 // Since most URLs don't have a port, return early for the "no port" case. |
|
1700 if (!port) |
|
1701 return true; |
|
1702 |
|
1703 // This blocked port list matches the port blocking that Mozilla implements. |
|
1704 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. |
|
1705 static const unsigned short blockedPortList[] = { |
|
1706 1, // tcpmux |
|
1707 7, // echo |
|
1708 9, // discard |
|
1709 11, // systat |
|
1710 13, // daytime |
|
1711 15, // netstat |
|
1712 17, // qotd |
|
1713 19, // chargen |
|
1714 20, // FTP-data |
|
1715 21, // FTP-control |
|
1716 22, // SSH |
|
1717 23, // telnet |
|
1718 25, // SMTP |
|
1719 37, // time |
|
1720 42, // name |
|
1721 43, // nicname |
|
1722 53, // domain |
|
1723 77, // priv-rjs |
|
1724 79, // finger |
|
1725 87, // ttylink |
|
1726 95, // supdup |
|
1727 101, // hostriame |
|
1728 102, // iso-tsap |
|
1729 103, // gppitnp |
|
1730 104, // acr-nema |
|
1731 109, // POP2 |
|
1732 110, // POP3 |
|
1733 111, // sunrpc |
|
1734 113, // auth |
|
1735 115, // SFTP |
|
1736 117, // uucp-path |
|
1737 119, // nntp |
|
1738 123, // NTP |
|
1739 135, // loc-srv / epmap |
|
1740 139, // netbios |
|
1741 143, // IMAP2 |
|
1742 179, // BGP |
|
1743 389, // LDAP |
|
1744 465, // SMTP+SSL |
|
1745 512, // print / exec |
|
1746 513, // login |
|
1747 514, // shell |
|
1748 515, // printer |
|
1749 526, // tempo |
|
1750 530, // courier |
|
1751 531, // Chat |
|
1752 532, // netnews |
|
1753 540, // UUCP |
|
1754 556, // remotefs |
|
1755 563, // NNTP+SSL |
|
1756 587, // ESMTP |
|
1757 601, // syslog-conn |
|
1758 636, // LDAP+SSL |
|
1759 993, // IMAP+SSL |
|
1760 995, // POP3+SSL |
|
1761 2049, // NFS |
|
1762 3659, // apple-sasl / PasswordServer [Apple addition] |
|
1763 4045, // lockd |
|
1764 6000, // X11 |
|
1765 6665, // Alternate IRC [Apple addition] |
|
1766 6666, // Alternate IRC [Apple addition] |
|
1767 6667, // Standard IRC [Apple addition] |
|
1768 6668, // Alternate IRC [Apple addition] |
|
1769 6669, // Alternate IRC [Apple addition] |
|
1770 invalidPortNumber, // Used to block all invalid port numbers |
|
1771 }; |
|
1772 const unsigned short* const blockedPortListEnd = blockedPortList + sizeof(blockedPortList) / sizeof(blockedPortList[0]); |
|
1773 |
|
1774 #ifndef NDEBUG |
|
1775 // The port list must be sorted for binary_search to work. |
|
1776 static bool checkedPortList = false; |
|
1777 if (!checkedPortList) { |
|
1778 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) |
|
1779 ASSERT(*p < *(p + 1)); |
|
1780 checkedPortList = true; |
|
1781 } |
|
1782 #endif |
|
1783 |
|
1784 // If the port is not in the blocked port list, allow it. |
|
1785 if (!binary_search(blockedPortList, blockedPortListEnd, port)) |
|
1786 return true; |
|
1787 |
|
1788 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. |
|
1789 if ((port == 21 || port == 22) && url.protocolIs("ftp")) |
|
1790 return true; |
|
1791 |
|
1792 // Allow any port number in a file URL, since the port number is ignored. |
|
1793 if (url.protocolIs("file")) |
|
1794 return true; |
|
1795 |
|
1796 return false; |
|
1797 } |
|
1798 |
|
1799 String mimeTypeFromDataURL(const String& url) |
|
1800 { |
|
1801 ASSERT(protocolIs(url, "data")); |
|
1802 int index = url.find(';'); |
|
1803 if (index == -1) |
|
1804 index = url.find(','); |
|
1805 if (index != -1) { |
|
1806 int len = index - 5; |
|
1807 if (len > 0) |
|
1808 return url.substring(5, len); |
|
1809 return "text/plain"; // Data URLs with no MIME type are considered text/plain. |
|
1810 } |
|
1811 return ""; |
|
1812 } |
|
1813 |
|
1814 const KURL& blankURL() |
|
1815 { |
|
1816 DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank")); |
|
1817 return staticBlankURL; |
|
1818 } |
|
1819 |
|
1820 #ifndef NDEBUG |
|
1821 void KURL::print() const |
|
1822 { |
|
1823 printf("%s\n", m_string.utf8().data()); |
|
1824 } |
|
1825 #endif |
|
1826 |
|
1827 } |
|
1828 |
|
1829 #endif // !USE(GOOGLEURL) |