|
1 /* |
|
2 * |
|
3 * Copyright (c) 2004 |
|
4 * John Maddock |
|
5 * |
|
6 * Use, modification and distribution are subject to the |
|
7 * Boost Software License, Version 1.0. (See accompanying file |
|
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
|
9 * |
|
10 */ |
|
11 |
|
12 /* |
|
13 * LOCATION: see http://www.boost.org for most recent version. |
|
14 * FILE unicode_iterator.hpp |
|
15 * VERSION see <boost/version.hpp> |
|
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings. |
|
17 */ |
|
18 |
|
19 /**************************************************************************** |
|
20 |
|
21 Contents: |
|
22 ~~~~~~~~~ |
|
23 |
|
24 1) Read Only, Input Adapters: |
|
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
|
26 |
|
27 template <class BaseIterator, class U8Type = ::boost::uint8_t> |
|
28 class u32_to_u8_iterator; |
|
29 |
|
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8. |
|
31 |
|
32 template <class BaseIterator, class U32Type = ::boost::uint32_t> |
|
33 class u8_to_u32_iterator; |
|
34 |
|
35 Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32. |
|
36 |
|
37 template <class BaseIterator, class U16Type = ::boost::uint16_t> |
|
38 class u32_to_u16_iterator; |
|
39 |
|
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16. |
|
41 |
|
42 template <class BaseIterator, class U32Type = ::boost::uint32_t> |
|
43 class u16_to_u32_iterator; |
|
44 |
|
45 Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32. |
|
46 |
|
47 2) Single pass output iterator adapters: |
|
48 |
|
49 template <class BaseIterator> |
|
50 class utf8_output_iterator; |
|
51 |
|
52 Accepts UTF-32 code points and forwards them on as UTF-8 code points. |
|
53 |
|
54 template <class BaseIterator> |
|
55 class utf16_output_iterator; |
|
56 |
|
57 Accepts UTF-32 code points and forwards them on as UTF-16 code points. |
|
58 |
|
59 ****************************************************************************/ |
|
60 |
|
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP |
|
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP |
|
63 #include <boost/cstdint.hpp> |
|
64 #include <boost/assert.hpp> |
|
65 #include <boost/iterator/iterator_facade.hpp> |
|
66 #include <boost/static_assert.hpp> |
|
67 #include <boost/throw_exception.hpp> |
|
68 #include <stdexcept> |
|
69 #ifndef BOOST_NO_STD_LOCALE |
|
70 #include <sstream> |
|
71 #include <ios> |
|
72 #endif |
|
73 #include <limits.h> // CHAR_BIT |
|
74 |
|
75 namespace boost{ |
|
76 |
|
77 namespace detail{ |
|
78 |
|
79 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u; |
|
80 static const ::boost::uint16_t low_surrogate_base = 0xDC00u; |
|
81 static const ::boost::uint32_t ten_bit_mask = 0x3FFu; |
|
82 |
|
83 inline bool is_high_surrogate(::boost::uint16_t v) |
|
84 { |
|
85 return (v & 0xFC00u) == 0xd800u; |
|
86 } |
|
87 inline bool is_low_surrogate(::boost::uint16_t v) |
|
88 { |
|
89 return (v & 0xFC00u) == 0xdc00u; |
|
90 } |
|
// Returns true when v is a UTF-16 surrogate value (0xD800..0xDFFF).
//
// T may be a 16-bit code unit or a 32-bit code point. The mask covers all
// 32 bits: masking only the low 16 bits would wrongly classify code points
// such as U+1D800 or U+10DC00 as surrogates.
template <class T>
inline bool is_surrogate(T v)
{
   return (v & 0xFFFFF800u) == 0xD800u;
}
|
96 |
|
97 inline unsigned utf8_byte_count(boost::uint8_t c) |
|
98 { |
|
99 // if the most significant bit with a zero in it is in position |
|
100 // 8-N then there are N bytes in this UTF-8 sequence: |
|
101 boost::uint8_t mask = 0x80u; |
|
102 unsigned result = 0; |
|
103 while(c & mask) |
|
104 { |
|
105 ++result; |
|
106 mask >>= 1; |
|
107 } |
|
108 return (result == 0) ? 1 : ((result > 4) ? 4 : result); |
|
109 } |
|
110 |
|
111 inline unsigned utf8_trailing_byte_count(boost::uint8_t c) |
|
112 { |
|
113 return utf8_byte_count(c) - 1; |
|
114 } |
|
115 |
|
116 inline void invalid_utf32_code_point(::boost::uint32_t val) |
|
117 { |
|
118 #ifndef BOOST_NO_STD_LOCALE |
|
119 std::stringstream ss; |
|
120 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence"; |
|
121 std::out_of_range e(ss.str()); |
|
122 #else |
|
123 std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence"); |
|
124 #endif |
|
125 boost::throw_exception(e); |
|
126 } |
|
127 |
|
128 |
|
129 } // namespace detail |
|
130 |
|
131 template <class BaseIterator, class U16Type = ::boost::uint16_t> |
|
132 class u32_to_u16_iterator |
|
133 : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> |
|
134 { |
|
135 typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type; |
|
136 |
|
137 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) |
|
138 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type; |
|
139 |
|
140 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); |
|
141 BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16); |
|
142 #endif |
|
143 |
|
144 public: |
|
145 typename base_type::reference |
|
146 dereference()const |
|
147 { |
|
148 if(m_current == 2) |
|
149 extract_current(); |
|
150 return m_values[m_current]; |
|
151 } |
|
152 bool equal(const u32_to_u16_iterator& that)const |
|
153 { |
|
154 if(m_position == that.m_position) |
|
155 { |
|
156 // Both m_currents must be equal, or both even |
|
157 // this is the same as saying their sum must be even: |
|
158 return (m_current + that.m_current) & 1u ? false : true; |
|
159 } |
|
160 return false; |
|
161 } |
|
162 void increment() |
|
163 { |
|
164 // if we have a pending read then read now, so that we know whether |
|
165 // to skip a position, or move to a low-surrogate: |
|
166 if(m_current == 2) |
|
167 { |
|
168 // pending read: |
|
169 extract_current(); |
|
170 } |
|
171 // move to the next surrogate position: |
|
172 ++m_current; |
|
173 // if we've reached the end skip a position: |
|
174 if(m_values[m_current] == 0) |
|
175 { |
|
176 m_current = 2; |
|
177 ++m_position; |
|
178 } |
|
179 } |
|
180 void decrement() |
|
181 { |
|
182 if(m_current != 1) |
|
183 { |
|
184 // decrementing an iterator always leads to a valid position: |
|
185 --m_position; |
|
186 extract_current(); |
|
187 m_current = m_values[1] ? 1 : 0; |
|
188 } |
|
189 else |
|
190 { |
|
191 m_current = 0; |
|
192 } |
|
193 } |
|
194 BaseIterator base()const |
|
195 { |
|
196 return m_position; |
|
197 } |
|
198 // construct: |
|
199 u32_to_u16_iterator() : m_position(), m_current(0) |
|
200 { |
|
201 m_values[0] = 0; |
|
202 m_values[1] = 0; |
|
203 m_values[2] = 0; |
|
204 } |
|
205 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2) |
|
206 { |
|
207 m_values[0] = 0; |
|
208 m_values[1] = 0; |
|
209 m_values[2] = 0; |
|
210 } |
|
211 private: |
|
212 |
|
213 void extract_current()const |
|
214 { |
|
215 // begin by checking for a code point out of range: |
|
216 ::boost::uint32_t v = *m_position; |
|
217 if(v >= 0x10000u) |
|
218 { |
|
219 if(v > 0x10FFFFu) |
|
220 detail::invalid_utf32_code_point(*m_position); |
|
221 // split into two surrogates: |
|
222 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base; |
|
223 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base; |
|
224 m_current = 0; |
|
225 BOOST_ASSERT(detail::is_high_surrogate(m_values[0])); |
|
226 BOOST_ASSERT(detail::is_low_surrogate(m_values[1])); |
|
227 } |
|
228 else |
|
229 { |
|
230 // 16-bit code point: |
|
231 m_values[0] = static_cast<U16Type>(*m_position); |
|
232 m_values[1] = 0; |
|
233 m_current = 0; |
|
234 // value must not be a surrogate: |
|
235 if(detail::is_surrogate(m_values[0])) |
|
236 detail::invalid_utf32_code_point(*m_position); |
|
237 } |
|
238 } |
|
239 BaseIterator m_position; |
|
240 mutable U16Type m_values[3]; |
|
241 mutable unsigned m_current; |
|
242 }; |
|
243 |
|
244 template <class BaseIterator, class U32Type = ::boost::uint32_t> |
|
245 class u16_to_u32_iterator |
|
246 : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> |
|
247 { |
|
248 typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; |
|
249 // special values for pending iterator reads: |
|
250 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu); |
|
251 |
|
252 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) |
|
253 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type; |
|
254 |
|
255 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16); |
|
256 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); |
|
257 #endif |
|
258 |
|
259 public: |
|
260 typename base_type::reference |
|
261 dereference()const |
|
262 { |
|
263 if(m_value == pending_read) |
|
264 extract_current(); |
|
265 return m_value; |
|
266 } |
|
267 bool equal(const u16_to_u32_iterator& that)const |
|
268 { |
|
269 return m_position == that.m_position; |
|
270 } |
|
271 void increment() |
|
272 { |
|
273 // skip high surrogate first if there is one: |
|
274 if(detail::is_high_surrogate(*m_position)) ++m_position; |
|
275 ++m_position; |
|
276 m_value = pending_read; |
|
277 } |
|
278 void decrement() |
|
279 { |
|
280 --m_position; |
|
281 // if we have a low surrogate then go back one more: |
|
282 if(detail::is_low_surrogate(*m_position)) |
|
283 --m_position; |
|
284 m_value = pending_read; |
|
285 } |
|
286 BaseIterator base()const |
|
287 { |
|
288 return m_position; |
|
289 } |
|
290 // construct: |
|
291 u16_to_u32_iterator() : m_position() |
|
292 { |
|
293 m_value = pending_read; |
|
294 } |
|
295 u16_to_u32_iterator(BaseIterator b) : m_position(b) |
|
296 { |
|
297 m_value = pending_read; |
|
298 } |
|
299 private: |
|
300 static void invalid_code_point(::boost::uint16_t val) |
|
301 { |
|
302 #ifndef BOOST_NO_STD_LOCALE |
|
303 std::stringstream ss; |
|
304 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence"; |
|
305 std::out_of_range e(ss.str()); |
|
306 #else |
|
307 std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence"); |
|
308 #endif |
|
309 boost::throw_exception(e); |
|
310 } |
|
311 void extract_current()const |
|
312 { |
|
313 m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position)); |
|
314 // if the last value is a high surrogate then adjust m_position and m_value as needed: |
|
315 if(detail::is_high_surrogate(*m_position)) |
|
316 { |
|
317 // precondition; next value must have be a low-surrogate: |
|
318 BaseIterator next(m_position); |
|
319 ::boost::uint16_t t = *++next; |
|
320 if((t & 0xFC00u) != 0xDC00u) |
|
321 invalid_code_point(t); |
|
322 m_value = (m_value - detail::high_surrogate_base) << 10; |
|
323 m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask); |
|
324 } |
|
325 // postcondition; result must not be a surrogate: |
|
326 if(detail::is_surrogate(m_value)) |
|
327 invalid_code_point(static_cast< ::boost::uint16_t>(m_value)); |
|
328 } |
|
329 BaseIterator m_position; |
|
330 mutable U32Type m_value; |
|
331 }; |
|
332 |
|
333 template <class BaseIterator, class U8Type = ::boost::uint8_t> |
|
334 class u32_to_u8_iterator |
|
335 : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> |
|
336 { |
|
337 typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type; |
|
338 |
|
339 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) |
|
340 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type; |
|
341 |
|
342 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); |
|
343 BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8); |
|
344 #endif |
|
345 |
|
346 public: |
|
347 typename base_type::reference |
|
348 dereference()const |
|
349 { |
|
350 if(m_current == 4) |
|
351 extract_current(); |
|
352 return m_values[m_current]; |
|
353 } |
|
354 bool equal(const u32_to_u8_iterator& that)const |
|
355 { |
|
356 if(m_position == that.m_position) |
|
357 { |
|
358 // either the m_current's must be equal, or one must be 0 and |
|
359 // the other 4: which means neither must have bits 1 or 2 set: |
|
360 return (m_current == that.m_current) |
|
361 || (((m_current | that.m_current) & 3) == 0); |
|
362 } |
|
363 return false; |
|
364 } |
|
365 void increment() |
|
366 { |
|
367 // if we have a pending read then read now, so that we know whether |
|
368 // to skip a position, or move to a low-surrogate: |
|
369 if(m_current == 4) |
|
370 { |
|
371 // pending read: |
|
372 extract_current(); |
|
373 } |
|
374 // move to the next surrogate position: |
|
375 ++m_current; |
|
376 // if we've reached the end skip a position: |
|
377 if(m_values[m_current] == 0) |
|
378 { |
|
379 m_current = 4; |
|
380 ++m_position; |
|
381 } |
|
382 } |
|
383 void decrement() |
|
384 { |
|
385 if((m_current & 3) == 0) |
|
386 { |
|
387 --m_position; |
|
388 extract_current(); |
|
389 m_current = 3; |
|
390 while(m_current && (m_values[m_current] == 0)) |
|
391 --m_current; |
|
392 } |
|
393 else |
|
394 --m_current; |
|
395 } |
|
396 BaseIterator base()const |
|
397 { |
|
398 return m_position; |
|
399 } |
|
400 // construct: |
|
401 u32_to_u8_iterator() : m_position(), m_current(0) |
|
402 { |
|
403 m_values[0] = 0; |
|
404 m_values[1] = 0; |
|
405 m_values[2] = 0; |
|
406 m_values[3] = 0; |
|
407 m_values[4] = 0; |
|
408 } |
|
409 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4) |
|
410 { |
|
411 m_values[0] = 0; |
|
412 m_values[1] = 0; |
|
413 m_values[2] = 0; |
|
414 m_values[3] = 0; |
|
415 m_values[4] = 0; |
|
416 } |
|
417 private: |
|
418 |
|
419 void extract_current()const |
|
420 { |
|
421 boost::uint32_t c = *m_position; |
|
422 if(c > 0x10FFFFu) |
|
423 detail::invalid_utf32_code_point(c); |
|
424 if(c < 0x80u) |
|
425 { |
|
426 m_values[0] = static_cast<unsigned char>(c); |
|
427 m_values[1] = static_cast<unsigned char>(0u); |
|
428 m_values[2] = static_cast<unsigned char>(0u); |
|
429 m_values[3] = static_cast<unsigned char>(0u); |
|
430 } |
|
431 else if(c < 0x800u) |
|
432 { |
|
433 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6)); |
|
434 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
435 m_values[2] = static_cast<unsigned char>(0u); |
|
436 m_values[3] = static_cast<unsigned char>(0u); |
|
437 } |
|
438 else if(c < 0x10000u) |
|
439 { |
|
440 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12)); |
|
441 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu)); |
|
442 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
443 m_values[3] = static_cast<unsigned char>(0u); |
|
444 } |
|
445 else |
|
446 { |
|
447 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18)); |
|
448 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu)); |
|
449 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu)); |
|
450 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
451 } |
|
452 m_current= 0; |
|
453 } |
|
454 BaseIterator m_position; |
|
455 mutable U8Type m_values[5]; |
|
456 mutable unsigned m_current; |
|
457 }; |
|
458 |
|
459 template <class BaseIterator, class U32Type = ::boost::uint32_t> |
|
460 class u8_to_u32_iterator |
|
461 : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> |
|
462 { |
|
463 typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; |
|
464 // special values for pending iterator reads: |
|
465 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu); |
|
466 |
|
467 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) |
|
468 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type; |
|
469 |
|
470 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8); |
|
471 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); |
|
472 #endif |
|
473 |
|
474 public: |
|
475 typename base_type::reference |
|
476 dereference()const |
|
477 { |
|
478 if(m_value == pending_read) |
|
479 extract_current(); |
|
480 return m_value; |
|
481 } |
|
482 bool equal(const u8_to_u32_iterator& that)const |
|
483 { |
|
484 return m_position == that.m_position; |
|
485 } |
|
486 void increment() |
|
487 { |
|
488 // skip high surrogate first if there is one: |
|
489 unsigned c = detail::utf8_byte_count(*m_position); |
|
490 std::advance(m_position, c); |
|
491 m_value = pending_read; |
|
492 } |
|
493 void decrement() |
|
494 { |
|
495 // Keep backtracking until we don't have a trailing character: |
|
496 unsigned count = 0; |
|
497 while((*--m_position & 0xC0u) == 0x80u) ++count; |
|
498 // now check that the sequence was valid: |
|
499 if(count != detail::utf8_trailing_byte_count(*m_position)) |
|
500 invalid_sequnce(); |
|
501 m_value = pending_read; |
|
502 } |
|
503 BaseIterator base()const |
|
504 { |
|
505 return m_position; |
|
506 } |
|
507 // construct: |
|
508 u8_to_u32_iterator() : m_position() |
|
509 { |
|
510 m_value = pending_read; |
|
511 } |
|
512 u8_to_u32_iterator(BaseIterator b) : m_position(b) |
|
513 { |
|
514 m_value = pending_read; |
|
515 } |
|
516 private: |
|
517 static void invalid_sequnce() |
|
518 { |
|
519 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character"); |
|
520 boost::throw_exception(e); |
|
521 } |
|
522 void extract_current()const |
|
523 { |
|
524 m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position)); |
|
525 // we must not have a continuation character: |
|
526 if((m_value & 0xC0u) == 0x80u) |
|
527 invalid_sequnce(); |
|
528 // see how many extra byts we have: |
|
529 unsigned extra = detail::utf8_trailing_byte_count(*m_position); |
|
530 // extract the extra bits, 6 from each extra byte: |
|
531 BaseIterator next(m_position); |
|
532 for(unsigned c = 0; c < extra; ++c) |
|
533 { |
|
534 ++next; |
|
535 m_value <<= 6; |
|
536 m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu; |
|
537 } |
|
538 // we now need to remove a few of the leftmost bits, but how many depends |
|
539 // upon how many extra bytes we've extracted: |
|
540 static const boost::uint32_t masks[4] = |
|
541 { |
|
542 0x7Fu, |
|
543 0x7FFu, |
|
544 0xFFFFu, |
|
545 0x1FFFFFu, |
|
546 }; |
|
547 m_value &= masks[extra]; |
|
548 // check the result: |
|
549 if(m_value > static_cast<U32Type>(0x10FFFFu)) |
|
550 invalid_sequnce(); |
|
551 } |
|
552 BaseIterator m_position; |
|
553 mutable U32Type m_value; |
|
554 }; |
|
555 |
|
556 template <class BaseIterator> |
|
557 class utf16_output_iterator |
|
558 { |
|
559 public: |
|
560 typedef void difference_type; |
|
561 typedef void value_type; |
|
562 typedef boost::uint32_t* pointer; |
|
563 typedef boost::uint32_t& reference; |
|
564 typedef std::output_iterator_tag iterator_category; |
|
565 |
|
566 utf16_output_iterator(const BaseIterator& b) |
|
567 : m_position(b){} |
|
568 utf16_output_iterator(const utf16_output_iterator& that) |
|
569 : m_position(that.m_position){} |
|
570 utf16_output_iterator& operator=(const utf16_output_iterator& that) |
|
571 { |
|
572 m_position = that.m_position; |
|
573 return *this; |
|
574 } |
|
575 const utf16_output_iterator& operator*()const |
|
576 { |
|
577 return *this; |
|
578 } |
|
579 void operator=(boost::uint32_t val)const |
|
580 { |
|
581 push(val); |
|
582 } |
|
583 utf16_output_iterator& operator++() |
|
584 { |
|
585 return *this; |
|
586 } |
|
587 utf16_output_iterator& operator++(int) |
|
588 { |
|
589 return *this; |
|
590 } |
|
591 BaseIterator base()const |
|
592 { |
|
593 return m_position; |
|
594 } |
|
595 private: |
|
596 void push(boost::uint32_t v)const |
|
597 { |
|
598 if(v >= 0x10000u) |
|
599 { |
|
600 // begin by checking for a code point out of range: |
|
601 if(v > 0x10FFFFu) |
|
602 detail::invalid_utf32_code_point(v); |
|
603 // split into two surrogates: |
|
604 *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base; |
|
605 *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base; |
|
606 } |
|
607 else |
|
608 { |
|
609 // 16-bit code point: |
|
610 // value must not be a surrogate: |
|
611 if(detail::is_surrogate(v)) |
|
612 detail::invalid_utf32_code_point(v); |
|
613 *m_position++ = static_cast<boost::uint16_t>(v); |
|
614 } |
|
615 } |
|
616 mutable BaseIterator m_position; |
|
617 }; |
|
618 |
|
619 template <class BaseIterator> |
|
620 class utf8_output_iterator |
|
621 { |
|
622 public: |
|
623 typedef void difference_type; |
|
624 typedef void value_type; |
|
625 typedef boost::uint32_t* pointer; |
|
626 typedef boost::uint32_t& reference; |
|
627 typedef std::output_iterator_tag iterator_category; |
|
628 |
|
629 utf8_output_iterator(const BaseIterator& b) |
|
630 : m_position(b){} |
|
631 utf8_output_iterator(const utf8_output_iterator& that) |
|
632 : m_position(that.m_position){} |
|
633 utf8_output_iterator& operator=(const utf8_output_iterator& that) |
|
634 { |
|
635 m_position = that.m_position; |
|
636 return *this; |
|
637 } |
|
638 const utf8_output_iterator& operator*()const |
|
639 { |
|
640 return *this; |
|
641 } |
|
642 void operator=(boost::uint32_t val)const |
|
643 { |
|
644 push(val); |
|
645 } |
|
646 utf8_output_iterator& operator++() |
|
647 { |
|
648 return *this; |
|
649 } |
|
650 utf8_output_iterator& operator++(int) |
|
651 { |
|
652 return *this; |
|
653 } |
|
654 BaseIterator base()const |
|
655 { |
|
656 return m_position; |
|
657 } |
|
658 private: |
|
659 void push(boost::uint32_t c)const |
|
660 { |
|
661 if(c > 0x10FFFFu) |
|
662 detail::invalid_utf32_code_point(c); |
|
663 if(c < 0x80u) |
|
664 { |
|
665 *m_position++ = static_cast<unsigned char>(c); |
|
666 } |
|
667 else if(c < 0x800u) |
|
668 { |
|
669 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6)); |
|
670 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
671 } |
|
672 else if(c < 0x10000u) |
|
673 { |
|
674 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12)); |
|
675 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu)); |
|
676 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
677 } |
|
678 else |
|
679 { |
|
680 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18)); |
|
681 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu)); |
|
682 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu)); |
|
683 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu)); |
|
684 } |
|
685 } |
|
686 mutable BaseIterator m_position; |
|
687 }; |
|
688 |
|
689 } // namespace boost |
|
690 |
|
691 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP |
|
692 |