|
1 // -*- c-basic-offset: 2 -*- |
|
2 /* |
|
3 * This file is part of the KDE libraries |
|
4 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org) |
|
5 * Copyright (C) 2004 Apple Computer, Inc. |
|
6 * |
|
7 * This library is free software; you can redistribute it and/or |
|
8 * modify it under the terms of the GNU Library General Public |
|
9 * License as published by the Free Software Foundation; either |
|
10 * version 2 of the License, or (at your option) any later version. |
|
11 * |
|
12 * This library is distributed in the hope that it will be useful, |
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 * Library General Public License for more details. |
|
16 * |
|
17 * You should have received a copy of the GNU Library General Public License |
|
18 * along with this library; see the file COPYING.LIB. If not, write to |
|
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |
|
20 * Boston, MA 02110-1301, USA. |
|
21 * |
|
22 */ |
|
23 |
|
24 #ifndef _KJS_USTRING_H_ |
|
25 #define _KJS_USTRING_H_ |
|
26 |
|
27 #include "JSLock.h" |
|
28 #include "collector.h" |
|
29 #include <stdint.h> |
|
30 #include <wtf/Assertions.h> |
|
31 #include <wtf/FastMalloc.h> |
|
32 #include <wtf/PassRefPtr.h> |
|
33 #include <wtf/RefPtr.h> |
|
34 |
|
35 /* On some ARM platforms GCC won't pack structures by default so sizeof(UChar) |
|
36 will end up being != 2 which causes crashes since the code depends on that. */ |
|
37 #if COMPILER(GCC) && PLATFORM(FORCE_PACK) |
|
38 #define PACK_STRUCT __attribute__((packed)) |
|
39 #else |
|
40 #define PACK_STRUCT |
|
41 #endif |
|
42 |
|
43 /** |
|
44 * @internal |
|
45 */ |
|
46 namespace DOM { |
|
47 class DOMString; |
|
48 class AtomicString; |
|
49 } |
|
50 class KJScript; |
|
51 |
|
52 namespace KJS { |
|
53 |
|
54 class UString; |
|
55 |
|
/**
 * @short Unicode character.
 *
 * UChar holds one Unicode code unit in an unsigned short. Its internal data
 * representation is compatible with XChar2b and QChar, so data can be
 * exchanged with X and Qt using shallow copies.
 */
struct UChar {
  /**
   * Constructs a character whose value is left uninitialized.
   */
  UChar() { }

  /**
   * Constructs a character from two separate bytes.
   * @param h higher byte (ends up in bits 8-15)
   * @param l lower byte (ends up in bits 0-7)
   */
  UChar(unsigned char h, unsigned char l)
    : uc(static_cast<unsigned short>((h << 8) | l))
  {
  }

  /**
   * Constructs a character from a plain char. The char is reinterpreted as
   * an unsigned char first, so the result is always in the range 0-255
   * regardless of whether char is signed.
   * @param u character value
   */
  UChar(char u)
    : uc(static_cast<unsigned char>(u))
  {
  }

  /**
   * Constructs a character with the given value.
   * @param u value in the range 0-255
   */
  UChar(unsigned char u)
    : uc(u)
  {
  }

  /**
   * Constructs a character with the given value.
   * @param u 16 bit Unicode value
   */
  UChar(unsigned short u)
    : uc(u)
  {
  }

  /**
   * @return The higher byte of the character.
   */
  unsigned char high() const
  {
    return static_cast<unsigned char>(uc >> 8);
  }

  /**
   * @return The lower byte of the character.
   */
  unsigned char low() const
  {
    return static_cast<unsigned char>(uc);
  }

  /**
   * @return The 16 bit Unicode value of the character.
   */
  unsigned short unicode() const
  {
    return uc;
  }

  // Raw storage; public so that other code can read it directly.
  unsigned short uc;
} PACK_STRUCT;
|
102 |
|
103 /** |
|
104 * @short 8 bit char based string class |
|
105 */ |
|
106 class CString { |
|
107 public: |
|
108 CString() : data(0), length(0) { } |
|
109 CString(const char *c); |
|
110 CString(const char *c, size_t len); |
|
111 CString(const CString &); |
|
112 |
|
113 IMPORT ~CString(); |
|
114 |
|
115 IMPORT CString &append(const CString &); |
|
116 CString &operator=(const char *c); |
|
117 CString &operator=(const CString &); |
|
118 CString &operator+=(const CString &c) { return append(c); } |
|
119 |
|
120 size_t size() const { return length; } |
|
121 const char *c_str() const { return data; } |
|
122 private: |
|
123 char *data; |
|
124 size_t length; |
|
125 }; |
|
126 |
|
127 /** |
|
128 * @short Unicode string class |
|
129 */ |
|
130 class UString { |
|
131 friend bool operator==(const UString&, const UString&); |
|
132 |
|
133 public: |
|
134 /** |
|
135 * @internal |
|
136 */ |
|
137 struct Rep { |
|
138 |
|
139 static PassRefPtr<Rep> create(UChar *d, int l); |
|
140 static PassRefPtr<Rep> createCopying(const UChar *d, int l); |
|
141 static PassRefPtr<Rep> create(PassRefPtr<Rep> base, int offset, int length); |
|
142 |
|
143 IMPORT void destroy(); |
|
144 |
|
145 bool baseIsSelf() const { return baseString == this; } |
|
146 UChar* data() const { return baseString->buf + baseString->preCapacity + offset; } |
|
147 int size() const { return len; } |
|
148 |
|
149 unsigned hash() const { if (_hash == 0) _hash = computeHash(data(), len); return _hash; } |
|
150 static unsigned computeHash(const UChar *, int length); |
|
151 static unsigned computeHash(const char *); |
|
152 |
|
153 Rep* ref() { ASSERT(JSLock::lockCount() > 0); ++rc; return this; } |
|
154 void deref() { ASSERT(JSLock::lockCount() > 0); if (--rc == 0) destroy(); } |
|
155 |
|
156 // unshared data |
|
157 int offset; |
|
158 int len; |
|
159 int rc; |
|
160 mutable unsigned _hash; |
|
161 bool isIdentifier; |
|
162 UString::Rep* baseString; |
|
163 |
|
164 // potentially shared data |
|
165 UChar *buf; |
|
166 int usedCapacity; |
|
167 int capacity; |
|
168 int usedPreCapacity; |
|
169 int preCapacity; |
|
170 |
|
171 static Rep null; |
|
172 static Rep empty; |
|
173 }; |
|
174 |
|
175 public: |
|
176 |
|
177 /** |
|
178 * Constructs a null string. |
|
179 */ |
|
180 IMPORT UString(); |
|
181 /** |
|
182 * Constructs a string from a classical zero-terminated char string. |
|
183 */ |
|
184 IMPORT UString(const char *c); |
|
185 /** |
|
186 * Constructs a string from an array of Unicode characters of the specified |
|
187 * length. |
|
188 */ |
|
189 IMPORT UString(const UChar *c, int length); |
|
190 /** |
|
191 * If copy is false the string data will be adopted. |
|
192 * That means that the data will NOT be copied and the pointer will |
|
193 * be deleted when the UString object is modified or destroyed. |
|
194 * Behaviour defaults to a deep copy if copy is true. |
|
195 */ |
|
196 UString(UChar *c, int length, bool copy); |
|
197 /** |
|
198 * Copy constructor. Makes a shallow copy only. |
|
199 */ |
|
200 UString(const UString &s) : m_rep(s.m_rep) {} |
|
201 /** |
|
202 * Convenience declaration only ! You'll be on your own to write the |
|
203 * implementation for a construction from DOM::DOMString. |
|
204 * |
|
205 * Note: feel free to contact me if you want to see a dummy header for |
|
206 * your favorite FooString class here ! |
|
207 */ |
|
208 UString(const DOM::DOMString&); |
|
209 /** |
|
210 * Convenience declaration only ! See UString(const DOM::DOMString&). |
|
211 */ |
|
212 UString(const DOM::AtomicString&); |
|
213 |
|
214 /** |
|
215 * Concatenation constructor. Makes operator+ more efficient. |
|
216 */ |
|
217 IMPORT UString(const UString &, const UString &); |
|
218 /** |
|
219 * Destructor. |
|
220 */ |
|
221 ~UString() {} |
|
222 |
|
223 /** |
|
224 * Constructs a string from an int. |
|
225 */ |
|
226 IMPORT static UString from(int i); |
|
227 /** |
|
228 * Constructs a string from an unsigned int. |
|
229 */ |
|
230 IMPORT static UString from(unsigned int u); |
|
231 /** |
|
232 * Constructs a string from a long int. |
|
233 */ |
|
234 static UString from(long u); |
|
235 /** |
|
236 * Constructs a string from a double. |
|
237 */ |
|
238 static UString from(double d); |
|
239 |
|
240 struct Range { |
|
241 public: |
|
242 Range(int pos, int len) : position(pos), length(len) {} |
|
243 Range() {} |
|
244 int position; |
|
245 int length; |
|
246 }; |
|
247 |
|
248 UString spliceSubstringsWithSeparators(const Range *substringRanges, int rangeCount, const UString *separators, int separatorCount) const; |
|
249 |
|
250 /** |
|
251 * Append another string. |
|
252 */ |
|
253 IMPORT UString &append(const UString &); |
|
254 IMPORT UString &append(const char *); |
|
255 UString &append(unsigned short); |
|
256 UString &append(char c) { return append(static_cast<unsigned short>(static_cast<unsigned char>(c))); } |
|
257 UString &append(UChar c) { return append(c.uc); } |
|
258 |
|
259 /** |
|
260 * @return The string converted to the 8-bit string type CString(). |
|
261 */ |
|
262 CString cstring() const; |
|
263 /** |
|
264 * Convert the Unicode string to plain ASCII chars chopping of any higher |
|
265 * bytes. This method should only be used for *debugging* purposes as it |
|
266 * is neither Unicode safe nor free from side effects. In order not to |
|
267 * waste any memory the char buffer is static and *shared* by all UString |
|
268 * instances. |
|
269 */ |
|
270 char *ascii() const; |
|
271 |
|
272 /** |
|
273 * Convert the string to UTF-8, assuming it is UTF-16 encoded. |
|
274 * Since this function is tolerant of badly formed UTF-16, it can create UTF-8 |
|
275 * strings that are invalid because they have characters in the range |
|
276 * U+D800-U+DDFF, U+FFFE, or U+FFFF, but the UTF-8 string is guaranteed to |
|
277 * be otherwise valid. |
|
278 */ |
|
279 IMPORT CString UTF8String() const; |
|
280 |
|
281 /** |
|
282 * @see UString(const DOM::DOMString&). |
|
283 */ |
|
284 DOM::DOMString domString() const; |
|
285 |
|
286 /** |
|
287 * Assignment operator. |
|
288 */ |
|
289 IMPORT UString &operator=(const char *c); |
|
290 /** |
|
291 * Appends the specified string. |
|
292 */ |
|
293 UString &operator+=(const UString &s) { return append(s); } |
|
294 UString &operator+=(const char *s) { return append(s); } |
|
295 |
|
296 /** |
|
297 * @return A pointer to the internal Unicode data. |
|
298 */ |
|
299 const UChar* data() const { return m_rep->data(); } |
|
300 /** |
|
301 * @return True if null. |
|
302 */ |
|
303 #if PLATFORM(SYMBIAN) |
|
304 // compiler issue |
|
305 IMPORT bool isNull() const; |
|
306 #else |
|
307 bool isNull() const { return (m_rep == &Rep::null); } |
|
308 #endif |
|
309 /** |
|
310 * @return True if null or zero length. |
|
311 */ |
|
312 bool isEmpty() const { return (!m_rep->len); } |
|
313 /** |
|
314 * Use this if you want to make sure that this string is a plain ASCII |
|
315 * string. For example, if you don't want to lose any information when |
|
316 * using cstring() or ascii(). |
|
317 * |
|
318 * @return True if the string doesn't contain any non-ASCII characters. |
|
319 */ |
|
320 IMPORT bool is8Bit() const; |
|
321 /** |
|
322 * @return The length of the string. |
|
323 */ |
|
324 int size() const { return m_rep->size(); } |
|
325 /** |
|
326 * Const character at specified position. |
|
327 */ |
|
328 const UChar operator[](int pos) const; |
|
329 |
|
330 /** |
|
331 * Attempts an conversion to a number. Apart from floating point numbers, |
|
332 * the algorithm will recognize hexadecimal representations (as |
|
333 * indicated by a 0x or 0X prefix) and +/- Infinity. |
|
334 * Returns NaN if the conversion failed. |
|
335 * @param tolerateTrailingJunk if true, toDouble can tolerate garbage after the number. |
|
336 * @param tolerateEmptyString if false, toDouble will turn an empty string into NaN rather than 0. |
|
337 */ |
|
338 double toDouble(bool tolerateTrailingJunk, bool tolerateEmptyString) const; |
|
339 double toDouble(bool tolerateTrailingJunk) const; |
|
340 double toDouble() const; |
|
341 |
|
342 /** |
|
343 * Attempts an conversion to a 32-bit integer. ok will be set |
|
344 * according to the success. |
|
345 * @param tolerateEmptyString if false, toUInt32 will return false for *ok for an empty string. |
|
346 */ |
|
347 IMPORT uint32_t toUInt32(bool *ok = 0) const; |
|
348 IMPORT uint32_t toUInt32(bool *ok, bool tolerateEmptyString) const; |
|
349 IMPORT uint32_t toStrictUInt32(bool *ok = 0) const; |
|
350 |
|
351 /** |
|
352 * Attempts an conversion to an array index. The "ok" boolean will be set |
|
353 * to true if it is a valid array index according to the rule from |
|
354 * ECMA 15.2 about what an array index is. It must exactly match the string |
|
355 * form of an unsigned integer, and be less than 2^32 - 1. |
|
356 */ |
|
357 unsigned toArrayIndex(bool *ok = 0) const; |
|
358 |
|
359 /** |
|
360 * @return Position of first occurrence of f starting at position pos. |
|
361 * -1 if the search was not successful. |
|
362 */ |
|
363 IMPORT int find(const UString &f, int pos = 0) const; |
|
364 int find(UChar, int pos = 0) const; |
|
365 /** |
|
366 * @return Position of first occurrence of f searching backwards from |
|
367 * position pos. |
|
368 * -1 if the search was not successful. |
|
369 */ |
|
370 int rfind(const UString &f, int pos) const; |
|
371 int rfind(UChar, int pos) const; |
|
372 /** |
|
373 * @return The sub string starting at position pos and length len. |
|
374 */ |
|
375 UString substr(int pos = 0, int len = -1) const; |
|
376 /** |
|
377 * Static instance of a null string. |
|
378 */ |
|
379 static const UString &null(); |
|
380 #ifdef KJS_DEBUG_MEM |
|
381 /** |
|
382 * Clear statically allocated resources. |
|
383 */ |
|
384 static void globalClear(); |
|
385 #endif |
|
386 |
|
387 Rep* rep() const { return m_rep.get(); } |
|
388 UString(PassRefPtr<Rep> r) : m_rep(r) { ASSERT(m_rep); } |
|
389 |
|
390 size_t cost() const; |
|
391 |
|
392 private: |
|
393 size_t expandedSize(size_t size, size_t otherSize) const; |
|
394 int usedCapacity() const; |
|
395 int usedPreCapacity() const; |
|
396 void expandCapacity(int requiredLength); |
|
397 void expandPreCapacity(int requiredPreCap); |
|
398 |
|
399 RefPtr<Rep> m_rep; |
|
400 }; |
|
401 |
|
402 inline bool operator==(const UChar &c1, const UChar &c2) { |
|
403 return (c1.uc == c2.uc); |
|
404 } |
|
405 bool operator==(const UString& s1, const UString& s2); |
|
406 inline bool operator!=(const UString& s1, const UString& s2) { |
|
407 return !KJS::operator==(s1, s2); |
|
408 } |
|
409 bool operator<(const UString& s1, const UString& s2); |
|
410 bool operator==(const UString& s1, const char *s2); |
|
411 inline bool operator!=(const UString& s1, const char *s2) { |
|
412 return !KJS::operator==(s1, s2); |
|
413 } |
|
414 inline bool operator==(const char *s1, const UString& s2) { |
|
415 return operator==(s2, s1); |
|
416 } |
|
417 inline bool operator!=(const char *s1, const UString& s2) { |
|
418 return !KJS::operator==(s1, s2); |
|
419 } |
|
420 bool operator==(const CString& s1, const CString& s2); |
|
421 inline UString operator+(const UString& s1, const UString& s2) { |
|
422 return UString(s1, s2); |
|
423 } |
|
424 |
|
425 int compare(const UString &, const UString &); |
|
426 |
|
427 // Given a first byte, gives the length of the UTF-8 sequence it begins. |
|
428 // Returns 0 for bytes that are not legal starts of UTF-8 sequences. |
|
429 // Only allows sequences of up to 4 bytes, since that works for all Unicode characters (U-00000000 to U-0010FFFF). |
|
430 int UTF8SequenceLength(char); |
|
431 |
|
432 // Takes a null-terminated C-style string with a UTF-8 sequence in it and converts it to a character. |
|
433 // Only allows Unicode characters (U-00000000 to U-0010FFFF). |
|
434 // Returns -1 if the sequence is not valid (including presence of extra bytes). |
|
435 int decodeUTF8Sequence(const char *); |
|
436 |
|
437 #if !PLATFORM(SYMBIAN) |
|
438 inline UString::UString() |
|
439 : m_rep(&Rep::null) |
|
440 { |
|
441 } |
|
442 #endif |
|
443 |
|
444 // Rule from ECMA 15.2 about what an array index is. |
|
445 // Must exactly match string form of an unsigned integer, and be less than 2^32 - 1. |
|
446 inline unsigned UString::toArrayIndex(bool *ok) const |
|
447 { |
|
448 unsigned i = toStrictUInt32(ok); |
|
449 if (ok && i >= 0xFFFFFFFFU) |
|
450 *ok = false; |
|
451 return i; |
|
452 } |
|
453 |
|
454 inline size_t UString::cost() const |
|
455 { |
|
456 // If this string is sharing with a base, then don't count any cost. We will never share |
|
457 // with a base that wasn't already big enough to register extra cost, so a string holding that |
|
458 // buffer has already paid extra cost at some point; and if we just |
|
459 // enlarged it by a huge amount, it must have been by appending a string |
|
460 // that itself paid extra cost, or a huge number of small strings. Either way, GC will come |
|
461 // relatively soon. |
|
462 |
|
463 // If we didn't do this, the shared substring optimization would result |
|
464 // in constantly garbage collecting when sharing with one big string. |
|
465 |
|
466 if (!m_rep->baseIsSelf()) |
|
467 return 0; |
|
468 |
|
469 return (m_rep->capacity + m_rep->preCapacity) * sizeof(UChar); |
|
470 } |
|
471 |
|
472 } // namespace |
|
473 |
|
474 #endif |