diff -r 000000000000 -r 4f2f89ce4247 WebCore/platform/KURL.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/WebCore/platform/KURL.cpp Fri Sep 17 09:02:29 2010 +0300 @@ -0,0 +1,1829 @@ +/* + * Copyright (C) 2004, 2007, 2008 Apple Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#if !USE(GOOGLEURL) + +#include "KURL.h" + +#include "StringHash.h" +#include "TextEncoding.h" +#include +#include +#include + +#if USE(ICU_UNICODE) +#include +#elif USE(QT4_UNICODE) +#include +#elif USE(GLIB_UNICODE) +#include +#include "GOwnPtr.h" +#endif + +#include + +using namespace std; +using namespace WTF; + +namespace WebCore { + +typedef Vector CharBuffer; +typedef Vector UCharBuffer; + +// FIXME: This file makes too much use of the + operator on String. +// We either have to optimize that operator so it doesn't involve +// so many allocations, or change this to use Vector instead. + +enum URLCharacterClasses { + // alpha + SchemeFirstChar = 1 << 0, + + // ( alpha | digit | "+" | "-" | "." ) + SchemeChar = 1 << 1, + + // mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" + // unreserved = alphanum | mark + // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," ) + UserInfoChar = 1 << 2, + + // alnum | "." | "-" | "%" + // The above is what the specification says, but we are lenient to + // match existing practice and also allow: + // "_" + HostnameChar = 1 << 3, + + // hexdigit | ":" | "%" + IPv6Char = 1 << 4, + + // "#" | "?" | "/" | nul + PathSegmentEndChar = 1 << 5, + + // not allowed in path + BadChar = 1 << 6 +}; + +static const char hexDigits[17] = "0123456789ABCDEF"; + +static const unsigned char characterClassTable[256] = { + /* 0 nul */ PathSegmentEndChar, /* 1 soh */ BadChar, + /* 2 stx */ BadChar, /* 3 etx */ BadChar, + /* 4 eot */ BadChar, /* 5 enq */ BadChar, /* 6 ack */ BadChar, /* 7 bel */ BadChar, + /* 8 bs */ BadChar, /* 9 ht */ BadChar, /* 10 nl */ BadChar, /* 11 vt */ BadChar, + /* 12 np */ BadChar, /* 13 cr */ BadChar, /* 14 so */ BadChar, /* 15 si */ BadChar, + /* 16 dle */ BadChar, /* 17 dc1 */ BadChar, /* 18 dc2 */ BadChar, /* 19 dc3 */ BadChar, + /* 20 dc4 */ BadChar, /* 21 nak */ BadChar, /* 22 syn */ BadChar, /* 23 etb */ BadChar, + /* 24 can */ BadChar, /* 25 em */ BadChar, /* 26 sub */ BadChar, /* 27 esc */ BadChar, + /* 28 fs */ BadChar, /* 29 gs */ BadChar, /* 30 rs */ BadChar, /* 31 us */ BadChar, + /* 32 sp */ BadChar, /* 33 ! */ UserInfoChar, + /* 34 " */ BadChar, /* 35 # */ PathSegmentEndChar | BadChar, + /* 36 $ */ UserInfoChar, /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar, + /* 38 & */ UserInfoChar, /* 39 ' */ UserInfoChar, + /* 40 ( */ UserInfoChar, /* 41 ) */ UserInfoChar, + /* 42 * */ UserInfoChar, /* 43 + */ SchemeChar | UserInfoChar, + /* 44 , */ UserInfoChar, + /* 45 - */ SchemeChar | UserInfoChar | HostnameChar, + /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 47 / */ PathSegmentEndChar, + /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 58 : */ UserInfoChar | IPv6Char, /* 59 ; */ UserInfoChar, + /* 60 < */ BadChar, /* 61 = */ UserInfoChar, + /* 62 > */ BadChar, /* 63 ? */ PathSegmentEndChar | BadChar, + /* 64 @ */ 0, + /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 91 [ */ 0, + /* 92 \ */ 0, /* 93 ] */ 0, + /* 94 ^ */ 0, + /* 95 _ */ UserInfoChar | HostnameChar, + /* 96 ` */ 0, + /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char, + /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar, + /* 123 { */ 0, + /* 124 | */ 0, /* 125 } */ 0, /* 126 ~ */ UserInfoChar, /* 127 del */ BadChar, + /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar, + /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar, + /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar, + /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar, + /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar, + /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar, + /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar, + /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar, + /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar, + /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar, + /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar, + /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar, + /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar, + /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar, + /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar, + /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar, + /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar, + /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar, + /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar, + /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar, + /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar, + /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar, + /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar, + /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar, + /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar, + /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar, + /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar, + /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar, + /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar, + /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar, + /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar, + /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar +}; + +static const unsigned maximumValidPortNumber = 0xFFFE; +static const unsigned invalidPortNumber = 0xFFFF; + +static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd); +static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput); +static String substituteBackslashes(const String&); +static bool isValidProtocol(const String&); + +static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast(c)] & SchemeFirstChar; } +static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); } +static inline bool isSchemeChar(char c) { return characterClassTable[static_cast(c)] & SchemeChar; } +static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); } +static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; } +static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; } +static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; } +static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast(c)] & PathSegmentEndChar; } +static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); } +static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; } + +static inline int hexDigitValue(UChar c) +{ + ASSERT(isASCIIHexDigit(c)); + if (c < 'A') + return c - '0'; + return (c - 'A' + 10) & 0xF; // handle both upper and lower case without a branch +} + +// Copies the source to the destination, assuming all the source characters are +// ASCII. The destination buffer must be large enough. Null characters are allowed +// in the source string, and no attempt is made to null-terminate the result. +static void copyASCII(const UChar* src, int length, char* dest) +{ + for (int i = 0; i < length; i++) + dest[i] = static_cast(src[i]); +} + +static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) +{ + buffer.resize(base.length() + len + 1); + copyASCII(base.characters(), base.length(), buffer.data()); + memcpy(buffer.data() + base.length(), rel, len); + buffer[buffer.size() - 1] = '\0'; +} + +// FIXME: Move to PlatformString.h eventually. +// Returns the index of the first index in string |s| of any of the characters +// in |toFind|. |toFind| should be a null-terminated string, all characters up +// to the null will be searched. Returns int if not found. +static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind) +{ + for (int i = startPos; i < sLen; i++) { + const char* cur = toFind; + while (*cur) { + if (s[i] == *(cur++)) + return i; + } + } + return -1; +} + +#ifndef NDEBUG +static void checkEncodedString(const String& url) +{ + for (unsigned i = 0; i < url.length(); ++i) + ASSERT(!(url[i] & ~0x7F)); + + ASSERT(!url.length() || isSchemeFirstChar(url[0])); +} +#else +static inline void checkEncodedString(const String&) +{ +} +#endif + +inline bool KURL::protocolIs(const String& string, const char* protocol) +{ + return WebCore::protocolIs(string, protocol); +} + +void KURL::invalidate() +{ + m_isValid = false; + m_protocolInHTTPFamily = false; + m_schemeEnd = 0; + m_userStart = 0; + m_userEnd = 0; + m_passwordEnd = 0; + m_hostEnd = 0; + m_portEnd = 0; + m_pathEnd = 0; + m_pathAfterLastSlash = 0; + m_queryEnd = 0; + m_fragmentEnd = 0; +} + +KURL::KURL(ParsedURLStringTag, const char* url) +{ + parse(url, 0); + ASSERT(url == m_string); +} + +KURL::KURL(ParsedURLStringTag, const String& url) +{ + parse(url); + ASSERT(url == m_string); +} + +KURL::KURL(const KURL& base, const String& relative) +{ + init(base, relative, UTF8Encoding()); +} + +KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding) +{ + // For UTF-{7,16,32}, we want to use UTF-8 for the query part as + // we do when submitting a form. A form with GET method + // has its contents added to a URL as query params and it makes sense + // to be consistent. + init(base, relative, encoding.encodingForFormSubmission()); +} + +void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding) +{ + // Allow resolutions with a null or empty base URL, but not with any other invalid one. + // FIXME: Is this a good rule? + if (!base.m_isValid && !base.isEmpty()) { + m_string = relative; + invalidate(); + return; + } + + // For compatibility with Win IE, treat backslashes as if they were slashes, + // as long as we're not dealing with javascript: or data: URLs. + String rel = relative; + if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) + rel = substituteBackslashes(rel); + + String* originalString = &rel; + + bool allASCII = charactersAreAllASCII(rel.characters(), rel.length()); + CharBuffer strBuffer; + char* str; + size_t len; + if (allASCII) { + len = rel.length(); + strBuffer.resize(len + 1); + copyASCII(rel.characters(), len, strBuffer.data()); + strBuffer[len] = 0; + str = strBuffer.data(); + } else { + originalString = 0; + encodeRelativeString(rel, encoding, strBuffer); + str = strBuffer.data(); + len = strlen(str); + } + + // Get rid of leading whitespace. + while (*str == ' ') { + originalString = 0; + str++; + --len; + } + + // Get rid of trailing whitespace. + while (len && str[len - 1] == ' ') { + originalString = 0; + str[--len] = '\0'; + } + + // According to the RFC, the reference should be interpreted as an + // absolute URI if possible, using the "leftmost, longest" + // algorithm. If the URI reference is absolute it will have a + // scheme, meaning that it will have a colon before the first + // non-scheme element. + bool absolute = false; + char* p = str; + if (isSchemeFirstChar(*p)) { + ++p; + while (isSchemeChar(*p)) { + ++p; + } + if (*p == ':') { + if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) { + str = p + 1; + originalString = 0; + } else + absolute = true; + } + } + + CharBuffer parseBuffer; + + if (absolute) { + parse(str, originalString); + } else { + // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid + // unless the relative URL is a single fragment. + if (!base.isHierarchical()) { + if (str[0] == '#') { + appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); + parse(parseBuffer.data(), 0); + } else { + m_string = relative; + invalidate(); + } + return; + } + + switch (str[0]) { + case '\0': + // The reference is empty, so this is a reference to the same document with any fragment identifier removed. + *this = base; + removeFragmentIdentifier(); + break; + case '#': { + // must be fragment-only reference + appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); + parse(parseBuffer.data(), 0); + break; + } + case '?': { + // query-only reference, special case needed for non-URL results + appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); + parse(parseBuffer.data(), 0); + break; + } + case '/': + // must be net-path or absolute-path reference + if (str[1] == '/') { + // net-path + appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); + parse(parseBuffer.data(), 0); + } else { + // abs-path + appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); + parse(parseBuffer.data(), 0); + } + break; + default: + { + // must be relative-path reference + + // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. + parseBuffer.resize(base.m_pathEnd + 1 + len + 1); + + char* bufferPos = parseBuffer.data(); + + // first copy everything before the path from the base + unsigned baseLength = base.m_string.length(); + const UChar* baseCharacters = base.m_string.characters(); + CharBuffer baseStringBuffer(baseLength); + copyASCII(baseCharacters, baseLength, baseStringBuffer.data()); + const char* baseString = baseStringBuffer.data(); + const char* baseStringStart = baseString; + const char* pathStart = baseStringStart + base.m_portEnd; + while (baseStringStart < pathStart) + *bufferPos++ = *baseStringStart++; + char* bufferPathStart = bufferPos; + + // now copy the base path + const char* baseStringEnd = baseString + base.m_pathEnd; + + // go back to the last slash + while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') + baseStringEnd--; + + if (baseStringEnd == baseStringStart) { + // no path in base, add a path separator if necessary + if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') + *bufferPos++ = '/'; + } else { + bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); + } + + const char* relStringStart = str; + const char* relStringPos = relStringStart; + + while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { + if (relStringPos[0] == '.' && bufferPos[-1] == '/') { + if (isPathSegmentEndChar(relStringPos[1])) { + // skip over "." segment + relStringPos += 1; + if (relStringPos[0] == '/') + relStringPos++; + continue; + } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { + // skip over ".." segment and rewind the last segment + // the RFC leaves it up to the app to decide what to do with excess + // ".." segments - we choose to drop them since some web content + // relies on this. + relStringPos += 2; + if (relStringPos[0] == '/') + relStringPos++; + if (bufferPos > bufferPathStart + 1) + bufferPos--; + while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') + bufferPos--; + continue; + } + } + + *bufferPos = *relStringPos; + relStringPos++; + bufferPos++; + } + + // all done with the path work, now copy any remainder + // of the relative reference; this will also add a null terminator + strcpy(bufferPos, relStringPos); + + parse(parseBuffer.data(), 0); + + ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); + break; + } + } + } +} + +KURL KURL::copy() const +{ + KURL result = *this; + result.m_string = result.m_string.crossThreadString(); + return result; +} + +bool KURL::hasPath() const +{ + return m_pathEnd != m_portEnd; +} + +String KURL::lastPathComponent() const +{ + if (!hasPath()) + return String(); + + int end = m_pathEnd - 1; + if (m_string[end] == '/') + --end; + + int start = m_string.reverseFind('/', end); + if (start < m_portEnd) + return String(); + ++start; + + return m_string.substring(start, end - start + 1); +} + +String KURL::protocol() const +{ + return m_string.left(m_schemeEnd); +} + +String KURL::host() const +{ + int start = hostStart(); + return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start)); +} + +unsigned short KURL::port() const +{ + // We return a port of 0 if there is no port specified. This can happen in two situations: + // 1) The URL contains no colon after the host name and before the path component of the URL. + // 2) The URL contains a colon but there's no port number before the path component of the URL begins. + if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) + return 0; + + const UChar* stringData = m_string.characters(); + bool ok = false; + unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); + if (!ok || number > maximumValidPortNumber) + return invalidPortNumber; + return number; +} + +String KURL::pass() const +{ + if (m_passwordEnd == m_userEnd) + return String(); + + return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); +} + +String KURL::user() const +{ + return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); +} + +String KURL::fragmentIdentifier() const +{ + if (m_fragmentEnd == m_queryEnd) + return String(); + + return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); +} + +bool KURL::hasFragmentIdentifier() const +{ + return m_fragmentEnd != m_queryEnd; +} + +String KURL::baseAsString() const +{ + return m_string.left(m_pathAfterLastSlash); +} + +#ifdef NDEBUG + +static inline void assertProtocolIsGood(const char*) +{ +} + +#else + +static void assertProtocolIsGood(const char* protocol) +{ + const char* p = protocol; + while (*p) { + ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); + ++p; + } +} + +#endif + +bool KURL::protocolIs(const char* protocol) const +{ + assertProtocolIsGood(protocol); + + // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. + // The free function protocolIsJavaScript() should be used instead. + ASSERT(!equalIgnoringCase(protocol, String("javascript"))); + + if (!m_isValid) + return false; + + // Do the comparison without making a new string object. + for (int i = 0; i < m_schemeEnd; ++i) { + if (!protocol[i] || toASCIILower(m_string[i]) != protocol[i]) + return false; + } + return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. +} + +String KURL::query() const +{ + if (m_queryEnd == m_pathEnd) + return String(); + + return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); +} + +String KURL::path() const +{ + return decodeURLEscapeSequences(m_string.substring(m_portEnd, m_pathEnd - m_portEnd)); +} + +bool KURL::setProtocol(const String& s) +{ + // Firefox and IE remove everything after the first ':'. + int separatorPosition = s.find(':'); + String newProtocol = s.substring(0, separatorPosition); + + if (!isValidProtocol(newProtocol)) + return false; + + if (!m_isValid) { + parse(newProtocol + ":" + m_string); + return true; + } + + parse(newProtocol + m_string.substring(m_schemeEnd)); + return true; +} + +void KURL::setHost(const String& s) +{ + if (!m_isValid) + return; + + // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, + // and to avoid changing more than just the host. + + bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; + + parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); +} + +void KURL::removePort() +{ + if (m_hostEnd == m_portEnd) + return; + parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); +} + +void KURL::setPort(unsigned short i) +{ + if (!m_isValid) + return; + + bool colonNeeded = m_portEnd == m_hostEnd; + int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); + + parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd)); +} + +void KURL::setHostAndPort(const String& hostAndPort) +{ + if (!m_isValid) + return; + + // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, + // and to avoid changing more than just host and port. + + bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; + + parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd)); +} + +void KURL::setUser(const String& user) +{ + if (!m_isValid) + return; + + // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, + // and to avoid changing more than just the user login. + String u; + int end = m_userEnd; + if (!user.isEmpty()) { + u = user; + if (m_userStart == m_schemeEnd + 1) + u = "//" + u; + // Add '@' if we didn't have one before. + if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@')) + u.append('@'); + } else { + // Remove '@' if we now have neither user nor password. + if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@') + end += 1; + } + parse(m_string.left(m_userStart) + u + m_string.substring(end)); +} + +void KURL::setPass(const String& password) +{ + if (!m_isValid) + return; + + // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, + // and to avoid changing more than just the user password. + String p; + int end = m_passwordEnd; + if (!password.isEmpty()) { + p = ":" + password + "@"; + if (m_userEnd == m_schemeEnd + 1) + p = "//" + p; + // Eat the existing '@' since we are going to add our own. + if (end != m_hostEnd && m_string[end] == '@') + end += 1; + } else { + // Remove '@' if we now have neither user nor password. + if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@') + end += 1; + } + parse(m_string.left(m_userEnd) + p + m_string.substring(end)); +} + +void KURL::setFragmentIdentifier(const String& s) +{ + if (!m_isValid) + return; + + // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations. + parse(m_string.left(m_queryEnd) + "#" + s); +} + +void KURL::removeFragmentIdentifier() +{ + if (!m_isValid) + return; + parse(m_string.left(m_queryEnd)); +} + +void KURL::setQuery(const String& query) +{ + if (!m_isValid) + return; + + // FIXME: '#' and non-ASCII characters must be encoded and escaped. + // Usually, the query is encoded using document encoding, not UTF-8, but we don't have + // access to the document in this function. + if ((query.isEmpty() || query[0] != '?') && !query.isNull()) + parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd)); + else + parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd)); + +} + +void KURL::setPath(const String& s) +{ + if (!m_isValid) + return; + + // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts + // may be inadvertently affected. + parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(s) + m_string.substring(m_pathEnd)); +} + +String KURL::prettyURL() const +{ + if (!m_isValid) + return m_string; + + Vector result; + + append(result, protocol()); + result.append(':'); + + Vector authority; + + if (m_hostEnd != m_passwordEnd) { + if (m_userEnd != m_userStart) { + append(authority, user()); + authority.append('@'); + } + append(authority, host()); + if (hasPort()) { + authority.append(':'); + append(authority, String::number(port())); + } + } + + if (!authority.isEmpty()) { + result.append('/'); + result.append('/'); + result.append(authority); + } else if (protocolIs("file")) { + result.append('/'); + result.append('/'); + } + + append(result, path()); + + if (m_pathEnd != m_queryEnd) { + result.append('?'); + append(result, query()); + } + + if (m_fragmentEnd != m_queryEnd) { + result.append('#'); + append(result, fragmentIdentifier()); + } + + return String::adopt(result); +} + +String decodeURLEscapeSequences(const String& str) +{ + return decodeURLEscapeSequences(str, UTF8Encoding()); +} + +String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) +{ + Vector result; + + CharBuffer buffer; + + int length = str.length(); + int decodedPosition = 0; + int searchPosition = 0; + int encodedRunPosition; + while ((encodedRunPosition = str.find('%', searchPosition)) >= 0) { + // Find the sequence of %-escape codes. + int encodedRunEnd = encodedRunPosition; + while (length - encodedRunEnd >= 3 + && str[encodedRunEnd] == '%' + && isASCIIHexDigit(str[encodedRunEnd + 1]) + && isASCIIHexDigit(str[encodedRunEnd + 2])) + encodedRunEnd += 3; + if (encodedRunEnd == encodedRunPosition) { + ++searchPosition; + continue; + } + searchPosition = encodedRunEnd; + + // Decode the %-escapes into bytes. + unsigned runLength = (encodedRunEnd - encodedRunPosition) / 3; + buffer.resize(runLength); + char* p = buffer.data(); + const UChar* q = str.characters() + encodedRunPosition; + for (unsigned i = 0; i < runLength; ++i) { + *p++ = (hexDigitValue(q[1]) << 4) | hexDigitValue(q[2]); + q += 3; + } + + // Decode the bytes into Unicode characters. + String decoded = (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); + if (decoded.isEmpty()) + continue; + + // Build up the string with what we just skipped and what we just decoded. + result.append(str.characters() + decodedPosition, encodedRunPosition - decodedPosition); + result.append(decoded.characters(), decoded.length()); + decodedPosition = encodedRunEnd; + } + + result.append(str.characters() + decodedPosition, length - decodedPosition); + + return String::adopt(result); +} + +bool KURL::isLocalFile() const +{ + // Including feed here might be a bad idea since drag and drop uses this check + // and including feed would allow feeds to potentially let someone's blog + // read the contents of the clipboard on a drag, even without a drop. + // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. + return protocolIs("file"); +} + +static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) +{ + char* p = buffer; + + const char* str = strStart; + const char* strEnd = strStart + length; + while (str < strEnd) { + unsigned char c = *str++; + if (isBadChar(c)) { + if (c == '%' || c == '?') { + *p++ = c; + } else if (c != 0x09 && c != 0x0a && c != 0x0d) { + *p++ = '%'; + *p++ = hexDigits[c >> 4]; + *p++ = hexDigits[c & 0xF]; + } + } else { + *p++ = c; + } + } + + buffer = p; +} + +// copy a path, accounting for "." and ".." segments +static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd) +{ + char* bufferPathStart = dst; + + // empty path is a special case, and need not have a leading slash + if (srcStart != srcEnd) { + const char* baseStringStart = src + srcStart; + const char* baseStringEnd = src + srcEnd; + const char* baseStringPos = baseStringStart; + + // this code is unprepared for paths that do not begin with a + // slash and we should always have one in the source string + ASSERT(baseStringPos[0] == '/'); + + // copy the leading slash into the destination + *dst = *baseStringPos; + baseStringPos++; + dst++; + + while (baseStringPos < baseStringEnd) { + if (baseStringPos[0] == '.' && dst[-1] == '/') { + if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) { + // skip over "." segment + baseStringPos += 2; + continue; + } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' || + baseStringPos + 2 == baseStringEnd)) { + // skip over ".." segment and rewind the last segment + // the RFC leaves it up to the app to decide what to do with excess + // ".." segments - we choose to drop them since some web content + // relies on this. + baseStringPos += 3; + if (dst > bufferPathStart + 1) + dst--; + // Note that these two while blocks differ subtly. + // The first helps to remove multiple adjoining slashes as we rewind. + // The +1 to bufferPathStart in the first while block prevents eating a leading slash + while (dst > bufferPathStart + 1 && dst[-1] == '/') + dst--; + while (dst > bufferPathStart && dst[-1] != '/') + dst--; + continue; + } + } + + *dst = *baseStringPos; + baseStringPos++; + dst++; + } + } + *dst = '\0'; + return dst - bufferPathStart; +} + +static inline bool hasSlashDotOrDotDot(const char* str) +{ + const unsigned char* p = reinterpret_cast(str); + if (!*p) + return false; + unsigned char pc = *p; + while (unsigned char c = *++p) { + if (c == '.' && (pc == '/' || pc == '.')) + return true; + pc = c; + } + return false; +} + +static inline bool matchLetter(char c, char lowercaseLetter) +{ + return (c | 0x20) == lowercaseLetter; +} + +void KURL::parse(const String& string) +{ + checkEncodedString(string); + + CharBuffer buffer(string.length() + 1); + copyASCII(string.characters(), string.length(), buffer.data()); + buffer[string.length()] = '\0'; + parse(buffer.data(), &string); +} + +void KURL::parse(const char* url, const String* originalString) +{ + if (!url || url[0] == '\0') { + // valid URL must be non-empty + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + + if (!isSchemeFirstChar(url[0])) { + // scheme must start with an alphabetic character + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + + int schemeEnd = 0; + while (isSchemeChar(url[schemeEnd])) + schemeEnd++; + + if (url[schemeEnd] != ':') { + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + + int userStart = schemeEnd + 1; + int userEnd; + int passwordStart; + int passwordEnd; + int hostStart; + int hostEnd; + int portStart; + int portEnd; + + bool hierarchical = url[schemeEnd + 1] == '/'; + + bool isFile = schemeEnd == 4 + && matchLetter(url[0], 'f') + && matchLetter(url[1], 'i') + && matchLetter(url[2], 'l') + && matchLetter(url[3], 'e'); + + m_protocolInHTTPFamily = matchLetter(url[0], 'h') + && matchLetter(url[1], 't') + && matchLetter(url[2], 't') + && matchLetter(url[3], 'p') + && (url[4] == ':' || (matchLetter(url[4], 's') && url[5] == ':')); + + if (hierarchical && url[schemeEnd + 2] == '/') { + // The part after the scheme is either a net_path or an abs_path whose first path segment is empty. + // Attempt to find an authority. + + // FIXME: Authority characters may be scanned twice, and it would be nice to be faster. + userStart += 2; + userEnd = userStart; + + int colonPos = 0; + while (isUserInfoChar(url[userEnd])) { + if (url[userEnd] == ':' && colonPos == 0) + colonPos = userEnd; + userEnd++; + } + + if (url[userEnd] == '@') { + // actual end of the userinfo, start on the host + if (colonPos != 0) { + passwordEnd = userEnd; + userEnd = colonPos; + passwordStart = colonPos + 1; + } else + passwordStart = passwordEnd = userEnd; + + hostStart = passwordEnd + 1; + } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) { + // hit the end of the authority, must have been no user + // or looks like an IPv6 hostname + // either way, try to parse it as a hostname + userEnd = userStart; + passwordStart = passwordEnd = userEnd; + hostStart = userStart; + } else { + // invalid character + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + + hostEnd = hostStart; + + // IPV6 IP address + if (url[hostEnd] == '[') { + hostEnd++; + while (isIPv6Char(url[hostEnd])) + hostEnd++; + if (url[hostEnd] == ']') + hostEnd++; + else { + // invalid character + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + } else { + while (isHostnameChar(url[hostEnd])) + hostEnd++; + } + + if (url[hostEnd] == ':') { + portStart = portEnd = hostEnd + 1; + + // possible start of port + portEnd = portStart; + while (isASCIIDigit(url[portEnd])) + portEnd++; + } else + portStart = portEnd = hostEnd; + + if (!isPathSegmentEndChar(url[portEnd])) { + // invalid character + m_string = originalString ? *originalString : url; + invalidate(); + return; + } + + if (userStart == portEnd && !m_protocolInHTTPFamily && !isFile) { + // No authority found, which means that this is not a net_path, but rather an abs_path whose first two + // path segments are empty. For file, http and https only, an empty authority is allowed. + userStart -= 2; + userEnd = userStart; + passwordStart = userEnd; + passwordEnd = passwordStart; + hostStart = passwordEnd; + hostEnd = hostStart; + portStart = hostEnd; + portEnd = hostEnd; + } + } else { + // the part after the scheme must be an opaque_part or an abs_path + userEnd = userStart; + passwordStart = passwordEnd = userEnd; + hostStart = hostEnd = passwordEnd; + portStart = portEnd = hostEnd; + } + + int pathStart = portEnd; + int pathEnd = pathStart; + while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#') + pathEnd++; + + int queryStart = pathEnd; + int queryEnd = queryStart; + if (url[queryStart] == '?') { + while (url[queryEnd] && url[queryEnd] != '#') + queryEnd++; + } + + int fragmentStart = queryEnd; + int fragmentEnd = fragmentStart; + if (url[fragmentStart] == '#') { + fragmentStart++; + fragmentEnd = fragmentStart; + while (url[fragmentEnd]) + fragmentEnd++; + } + + // assemble it all, remembering the real ranges + + Vector buffer(fragmentEnd * 3 + 1); + + char *p = buffer.data(); + const char *strPtr = url; + + // copy in the scheme + const char *schemeEndPtr = url + schemeEnd; + while (strPtr < schemeEndPtr) + *p++ = *strPtr++; + m_schemeEnd = p - buffer.data(); + + bool hostIsLocalHost = portEnd - userStart == 9 + && matchLetter(url[userStart], 'l') + && matchLetter(url[userStart+1], 'o') + && matchLetter(url[userStart+2], 'c') + && matchLetter(url[userStart+3], 'a') + && matchLetter(url[userStart+4], 'l') + && matchLetter(url[userStart+5], 'h') + && matchLetter(url[userStart+6], 'o') + && matchLetter(url[userStart+7], 's') + && matchLetter(url[userStart+8], 't'); + + // File URLs need a host part unless it is just file:// or file://localhost + bool degenFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost); + + bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd; + + // add ":" after scheme + *p++ = ':'; + + // if we have at least one authority part or a file URL - add "//" and authority + if (isFile ? !degenFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) { + *p++ = '/'; + *p++ = '/'; + + m_userStart = p - buffer.data(); + + // copy in the user + strPtr = url + userStart; + const char* userEndPtr = url + userEnd; + while (strPtr < userEndPtr) + *p++ = *strPtr++; + m_userEnd = p - buffer.data(); + + // copy in the password + if (passwordEnd != passwordStart) { + *p++ = ':'; + strPtr = url + passwordStart; + const char* passwordEndPtr = url + passwordEnd; + while (strPtr < passwordEndPtr) + *p++ = *strPtr++; + } + m_passwordEnd = p - buffer.data(); + + // If we had any user info, add "@" + if (p - buffer.data() != m_userStart) + *p++ = '@'; + + // copy in the host, except in the case of a file URL with authority="localhost" + if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) { + strPtr = url + hostStart; + const char* hostEndPtr = url + hostEnd; + while (strPtr < hostEndPtr) + *p++ = *strPtr++; + } + m_hostEnd = p - buffer.data(); + + // copy in the port + if (hostEnd != portStart) { + *p++ = ':'; + strPtr = url + portStart; + const char *portEndPtr = url + portEnd; + while (strPtr < portEndPtr) + *p++ = *strPtr++; + } + m_portEnd = p - buffer.data(); + } else + m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data(); + + // For canonicalization, ensure we have a '/' for no path. + // Do this only for hierarchical URL with protocol http or https. + if (m_protocolInHTTPFamily && hierarchical && pathEnd == pathStart) + *p++ = '/'; + + // add path, escaping bad characters + if (!hierarchical || !hasSlashDotOrDotDot(url)) + appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart); + else { + CharBuffer pathBuffer(pathEnd - pathStart + 1); + size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd); + appendEscapingBadChars(p, pathBuffer.data(), length); + } + + m_pathEnd = p - buffer.data(); + + // Find the position after the last slash in the path, or + // the position before the path if there are no slashes in it. + int i; + for (i = m_pathEnd; i > m_portEnd; --i) { + if (buffer[i - 1] == '/') + break; + } + m_pathAfterLastSlash = i; + + // add query, escaping bad characters + appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart); + m_queryEnd = p - buffer.data(); + + // add fragment, escaping bad characters + if (fragmentEnd != queryEnd) { + *p++ = '#'; + appendEscapingBadChars(p, url + fragmentStart, fragmentEnd - fragmentStart); + } + m_fragmentEnd = p - buffer.data(); + + ASSERT(p - buffer.data() <= static_cast(buffer.size())); + + // If we didn't end up actually changing the original string and + // it was already in a String, reuse it to avoid extra allocation. + if (originalString && originalString->length() == static_cast(m_fragmentEnd) && strncmp(buffer.data(), url, m_fragmentEnd) == 0) + m_string = *originalString; + else + m_string = String(buffer.data(), m_fragmentEnd); + + m_isValid = true; +} + +bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) +{ + if (a.m_queryEnd != b.m_queryEnd) + return false; + unsigned queryLength = a.m_queryEnd; + for (unsigned i = 0; i < queryLength; ++i) + if (a.string()[i] != b.string()[i]) + return false; + return true; +} + +bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) +{ + if (a.m_schemeEnd != b.m_schemeEnd) + return false; + + int hostStartA = a.hostStart(); + int hostLengthA = a.hostEnd() - hostStartA; + int hostStartB = b.hostStart(); + int hostLengthB = b.hostEnd() - b.hostStart(); + if (hostLengthA != hostLengthB) + return false; + + // Check the scheme + for (int i = 0; i < a.m_schemeEnd; ++i) + if (a.string()[i] != b.string()[i]) + return false; + + // And the host + for (int i = 0; i < hostLengthA; ++i) + if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) + return false; + + if (a.port() != b.port()) + return false; + + return true; +} + +String encodeWithURLEscapeSequences(const String& notEncodedString) +{ + CString asUTF8 = notEncodedString.utf8(); + + CharBuffer buffer(asUTF8.length() * 3 + 1); + char* p = buffer.data(); + + const char* str = asUTF8.data(); + const char* strEnd = str + asUTF8.length(); + while (str < strEnd) { + unsigned char c = *str++; + if (isBadChar(c)) { + *p++ = '%'; + *p++ = hexDigits[c >> 4]; + *p++ = hexDigits[c & 0xF]; + } else + *p++ = c; + } + + ASSERT(p - buffer.data() <= static_cast(buffer.size())); + + return String(buffer.data(), p - buffer.data()); +} + +// Appends the punycoded hostname identified by the given string and length to +// the output buffer. The result will not be null terminated. +static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen) +{ + // Needs to be big enough to hold an IDN-encoded name. + // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. + const unsigned hostnameBufferLength = 2048; + + if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) { + buffer.append(str, strLen); + return; + } + +#if USE(ICU_UNICODE) + UChar hostnameBuffer[hostnameBufferLength]; + UErrorCode error = U_ZERO_ERROR; + int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer, + hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); + if (error == U_ZERO_ERROR) + buffer.append(hostnameBuffer, numCharactersConverted); +#elif USE(QT4_UNICODE) + QByteArray result = QUrl::toAce(String(str, strLen)); + buffer.append(result.constData(), result.length()); +#elif USE(GLIB_UNICODE) + GOwnPtr utf8Hostname; + GOwnPtr utf8Err; + utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr())); + if (utf8Err) + return; + + GOwnPtr encodedHostname; + encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get())); + if (!encodedHostname) + return; + + buffer.append(encodedHostname.get(), strlen(encodedHostname.get())); +#endif +} + +static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector >& nameRanges) +{ + // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. + // Skip quoted strings so that characters in them don't confuse us. + // When we find a '?' character, we are past the part of the URL that contains host names. + + nameRanges.clear(); + + int p = 0; + while (1) { + // Find start of host name or of quoted string. + int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?"); + if (hostnameOrStringStart == -1) + return; + UChar c = str[hostnameOrStringStart]; + p = hostnameOrStringStart + 1; + + if (c == '?') + return; + + if (c == '@') { + // Find end of host name. + int hostnameStart = p; + int hostnameEnd = findFirstOf(str, strLen, p, ">,?"); + bool done; + if (hostnameEnd == -1) { + hostnameEnd = strLen; + done = true; + } else { + p = hostnameEnd; + done = false; + } + + nameRanges.append(make_pair(hostnameStart, hostnameEnd)); + + if (done) + return; + } else { + // Skip quoted string. + ASSERT(c == '"'); + while (1) { + int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\"); + if (escapedCharacterOrStringEnd == -1) + return; + + c = str[escapedCharacterOrStringEnd]; + p = escapedCharacterOrStringEnd + 1; + + // If we are the end of the string, then break from the string loop back to the host name loop. + if (c == '"') + break; + + // Skip escaped character. + ASSERT(c == '\\'); + if (p == strLen) + return; + + ++p; + } + } + } +} + +static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset) +{ + // Find the host name in a hierarchical URL. + // It comes after a "://" sequence, with scheme characters preceding, and + // this should be the first colon in the string. + // It ends with the end of the string or a ":" or a path segment ending character. + // If there is a "@" character, the host part is just the part after the "@". + int separator = findFirstOf(str, strLen, 0, ":"); + if (separator == -1 || separator + 2 >= strLen || + str[separator + 1] != '/' || str[separator + 2] != '/') + return false; + + // Check that all characters before the :// are valid scheme characters. + if (!isSchemeFirstChar(str[0])) + return false; + for (int i = 1; i < separator; ++i) { + if (!isSchemeChar(str[i])) + return false; + } + + // Start after the separator. + int authorityStart = separator + 3; + + // Find terminating character. + int hostnameEnd = strLen; + for (int i = authorityStart; i < strLen; ++i) { + UChar c = str[i]; + if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { + hostnameEnd = i; + break; + } + } + + // Find "@" for the start of the host name. + int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@"); + int hostnameStart; + if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) + hostnameStart = authorityStart; + else + hostnameStart = userInfoTerminator + 1; + + startOffset = hostnameStart; + endOffset = hostnameEnd; + return true; +} + +// Converts all hostnames found in the given input to punycode, preserving the +// rest of the URL unchanged. The output will NOT be null-terminated. +static void encodeHostnames(const String& str, UCharBuffer& output) +{ + output.clear(); + + if (protocolIs(str, "mailto")) { + Vector > hostnameRanges; + findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges); + int n = hostnameRanges.size(); + int p = 0; + for (int i = 0; i < n; ++i) { + const pair& r = hostnameRanges[i]; + output.append(&str.characters()[p], r.first - p); + appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first); + p = r.second; + } + // This will copy either everything after the last hostname, or the + // whole thing if there is no hostname. + output.append(&str.characters()[p], str.length() - p); + } else { + int hostStart, hostEnd; + if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) { + output.append(str.characters(), hostStart); // Before hostname. + appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart); + output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname. + } else { + // No hostname to encode, return the input. + output.append(str.characters(), str.length()); + } + } +} + +static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) +{ + UCharBuffer s; + encodeHostnames(rel, s); + + TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. + + int pathEnd = -1; + if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { + // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. + pathEnd = findFirstOf(s.data(), s.size(), 0, "#?"); + } + + if (pathEnd == -1) { + CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables); + output.resize(decoded.length()); + memcpy(output.data(), decoded.data(), decoded.length()); + } else { + CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables); + // Unencodable characters in URLs are represented by converting + // them to XML entities and escaping non-alphanumeric characters. + CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables); + + output.resize(pathDecoded.length() + otherDecoded.length()); + memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); + memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); + } + output.append('\0'); // null-terminate the output. +} + +static String substituteBackslashes(const String& string) +{ + int questionPos = string.find('?'); + int hashPos = string.find('#'); + int pathEnd; + + if (hashPos >= 0 && (questionPos < 0 || questionPos > hashPos)) + pathEnd = hashPos; + else if (questionPos >= 0) + pathEnd = questionPos; + else + pathEnd = string.length(); + + return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); +} + +bool KURL::isHierarchical() const +{ + if (!m_isValid) + return false; + ASSERT(m_string[m_schemeEnd] == ':'); + return m_string[m_schemeEnd + 1] == '/'; +} + +void KURL::copyToBuffer(CharBuffer& buffer) const +{ + // FIXME: This throws away the high bytes of all the characters in the string! + // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. + buffer.resize(m_string.length()); + copyASCII(m_string.characters(), m_string.length(), buffer.data()); +} + +bool protocolIs(const String& url, const char* protocol) +{ + // Do the comparison without making a new string object. + assertProtocolIsGood(protocol); + for (int i = 0; ; ++i) { + if (!protocol[i]) + return url[i] == ':'; + if (toASCIILower(url[i]) != protocol[i]) + return false; + } +} + +bool protocolIsJavaScript(const String& url) +{ + return protocolIs(url, "javascript"); +} + +bool isValidProtocol(const String& protocol) +{ + // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + if (protocol.isEmpty()) + return false; + if (!isSchemeFirstChar(protocol[0])) + return false; + unsigned protocolLength = protocol.length(); + for (unsigned i = 1; i < protocolLength; i++) { + if (!isSchemeChar(protocol[i])) + return false; + } + return true; +} + +bool isDefaultPortForProtocol(unsigned short port, const String& protocol) +{ + if (protocol.isEmpty()) + return false; + + typedef HashMap DefaultPortsMap; + DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); + if (defaultPorts.isEmpty()) { + defaultPorts.set("http", 80); + defaultPorts.set("https", 443); + defaultPorts.set("ftp", 21); + defaultPorts.set("ftps", 990); + } + return defaultPorts.get(protocol) == port; +} + +bool portAllowed(const KURL& url) +{ + unsigned short port = url.port(); + + // Since most URLs don't have a port, return early for the "no port" case. + if (!port) + return true; + + // This blocked port list matches the port blocking that Mozilla implements. + // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. + static const unsigned short blockedPortList[] = { + 1, // tcpmux + 7, // echo + 9, // discard + 11, // systat + 13, // daytime + 15, // netstat + 17, // qotd + 19, // chargen + 20, // FTP-data + 21, // FTP-control + 22, // SSH + 23, // telnet + 25, // SMTP + 37, // time + 42, // name + 43, // nicname + 53, // domain + 77, // priv-rjs + 79, // finger + 87, // ttylink + 95, // supdup + 101, // hostriame + 102, // iso-tsap + 103, // gppitnp + 104, // acr-nema + 109, // POP2 + 110, // POP3 + 111, // sunrpc + 113, // auth + 115, // SFTP + 117, // uucp-path + 119, // nntp + 123, // NTP + 135, // loc-srv / epmap + 139, // netbios + 143, // IMAP2 + 179, // BGP + 389, // LDAP + 465, // SMTP+SSL + 512, // print / exec + 513, // login + 514, // shell + 515, // printer + 526, // tempo + 530, // courier + 531, // Chat + 532, // netnews + 540, // UUCP + 556, // remotefs + 563, // NNTP+SSL + 587, // ESMTP + 601, // syslog-conn + 636, // LDAP+SSL + 993, // IMAP+SSL + 995, // POP3+SSL + 2049, // NFS + 3659, // apple-sasl / PasswordServer [Apple addition] + 4045, // lockd + 6000, // X11 + 6665, // Alternate IRC [Apple addition] + 6666, // Alternate IRC [Apple addition] + 6667, // Standard IRC [Apple addition] + 6668, // Alternate IRC [Apple addition] + 6669, // Alternate IRC [Apple addition] + invalidPortNumber, // Used to block all invalid port numbers + }; + const unsigned short* const blockedPortListEnd = blockedPortList + sizeof(blockedPortList) / sizeof(blockedPortList[0]); + +#ifndef NDEBUG + // The port list must be sorted for binary_search to work. + static bool checkedPortList = false; + if (!checkedPortList) { + for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) + ASSERT(*p < *(p + 1)); + checkedPortList = true; + } +#endif + + // If the port is not in the blocked port list, allow it. + if (!binary_search(blockedPortList, blockedPortListEnd, port)) + return true; + + // Allow ports 21 and 22 for FTP URLs, as Mozilla does. + if ((port == 21 || port == 22) && url.protocolIs("ftp")) + return true; + + // Allow any port number in a file URL, since the port number is ignored. + if (url.protocolIs("file")) + return true; + + return false; +} + +String mimeTypeFromDataURL(const String& url) +{ + ASSERT(protocolIs(url, "data")); + int index = url.find(';'); + if (index == -1) + index = url.find(','); + if (index != -1) { + int len = index - 5; + if (len > 0) + return url.substring(5, len); + return "text/plain"; // Data URLs with no MIME type are considered text/plain. + } + return ""; +} + +const KURL& blankURL() +{ + DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank")); + return staticBlankURL; +} + +#ifndef NDEBUG +void KURL::print() const +{ + printf("%s\n", m_string.utf8().data()); +} +#endif + +} + +#endif // !USE(GOOGLEURL)