|
1 // Boost token_functions.hpp ------------------------------------------------// |
|
2 |
|
3 // Copyright John R. Bandela 2001. |
|
4 |
|
5 // Distributed under the Boost Software License, Version 1.0. (See |
|
6 // accompanying file LICENSE_1_0.txt or copy at |
|
7 // http://www.boost.org/LICENSE_1_0.txt) |
|
8 |
|
9 // See http://www.boost.org/libs/tokenizer/ for documentation. |
|
10 |
|
11 // Revision History: |
|
12 // 01 Oct 2004 Joaquín M López Muñoz |
|
13 // Workaround for a problem with string::assign in msvc-stlport |
|
14 // 06 Apr 2004 John Bandela |
|
15 // Fixed a bug involving using char_delimiter with a true input iterator |
|
16 // 28 Nov 2003 Robert Zeh and John Bandela |
|
17 // Converted into "fast" functions that avoid using += when |
|
18 // the supplied iterator isn't an input_iterator; based on |
|
19 // some work done at Archelon and a version that was checked into |
|
20 // the boost CVS for a short period of time. |
|
21 // 20 Feb 2002 John Maddock |
|
22 // Removed using namespace std declarations and added |
|
23 // workaround for BOOST_NO_STDC_NAMESPACE (the library |
|
24 // can be safely mixed with regex). |
|
25 // 06 Feb 2002 Jeremy Siek |
|
26 // Added char_separator. |
|
27 // 02 Feb 2002 Jeremy Siek |
|
28 // Removed tabs and a little cleanup. |
|
29 |
|
30 |
|
31 #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
|
32 #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_ |
|
33 |
|
#include <vector>
#include <stdexcept>
#include <string>
#include <cctype>
#include <algorithm> // for find_if
#include <iterator>  // for the iterator category tags
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/detail/workaround.hpp>
#include <boost/mpl/if.hpp>
#include <boost/type_traits/is_pointer.hpp>
|
43 |
|
//
// ispunct and isspace are called below with a std:: prefix, which only
// works when they are real functions. Drop any macro versions (they
// shouldn't be macros anyway...).
//
#if defined(ispunct)
#  undef ispunct
#endif
#if defined(isspace)
#  undef isspace
#endif

//
// When BOOST_NO_STDC_NAMESPACE is defined, import the global-namespace
// functions into std so that the std::-qualified calls below resolve.
//
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std {
  using ::ispunct;
  using ::isspace;
}
#endif
|
63 |
|
64 namespace boost{ |
|
65 |
|
//===========================================================================
// The escaped_list_separator class, which is a model of TokenizerFunction.
// An escaped list is a super-set of what is commonly known as a comma
// separated value (csv) list. It is separated into fields by a comma or
// other character. If the delimiting character is inside quotes, then it is
// counted as a regular character. To allow for embedded quotes in a field,
// there can be escape sequences using the \ much like C.
// The role of the comma, the quotation mark, and the escape
// character (backslash \), can be assigned to other characters.
|
75 |
|
// Exception thrown by escaped_list_separator when an escape character is
// the last character of the input or is followed by a character that is
// not part of a recognised escape sequence.
struct escaped_list_error : public std::runtime_error {
  escaped_list_error(const std::string& what_arg)
    : std::runtime_error(what_arg) { }
};


// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
// MSVC does not like the following typename
#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
template <class Char,
  class Traits = typename std::basic_string<Char>::traits_type >
#else
template <class Char,
  class Traits = std::basic_string<Char>::traits_type >
#endif
class escaped_list_separator {

private:
  typedef std::basic_string<Char,Traits> string_type;

  // Unary predicate: true for characters that compare equal (using
  // Traits::eq) to the character captured at construction.
  struct char_eq {
    Char e_;
    char_eq(Char e):e_(e) { }
    bool operator()(Char c) {
      return Traits::eq(e_,c);
    }
  };

  string_type escape_;  // characters that introduce an escape sequence
  string_type c_;       // characters that separate fields
  string_type quote_;   // characters that open/close a quoted section
  bool last_;           // previous token was terminated by a separator,
                        // so one more (possibly empty) field is owed

  // True when ch is one of the characters in set.
  static bool contains(const string_type& set, Char ch) {
    return std::find_if(set.begin(), set.end(), char_eq(ch)) != set.end();
  }

  bool is_escape(Char e) { return contains(escape_, e); }
  bool is_c(Char e)      { return contains(c_, e); }
  bool is_quote(Char e)  { return contains(quote_, e); }

  // Handles the character following an escape character. On entry next
  // refers to the escape character; on exit it refers to the escaped one.
  // "n" appends a newline; a quote, separator or escape character is
  // appended literally. Anything else, or running out of input, throws
  // escaped_list_error.
  template <typename iterator, typename Token>
  void do_escape(iterator& next,iterator end,Token& tok) {
    ++next;
    if (next == end)
      throw escaped_list_error(std::string("cannot end with escape"));

    if (Traits::eq(*next,'n')) {
      tok += '\n';
      return;
    }
    if (is_quote(*next) || is_c(*next) || is_escape(*next)) {
      tok += *next;
      return;
    }
    throw escaped_list_error(std::string("unknown escape sequence"));
  }

public:

  // Single-character form: e is the escape, c the separator and q the
  // quote character.
  explicit escaped_list_separator(Char e = '\\',
                                  Char c = ',',Char q = '\"')
    : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }

  // Multi-character form: every character of e / c / q acts as an
  // escape / separator / quote character respectively.
  escaped_list_separator(string_type e, string_type c, string_type q)
    : escape_(e), c_(c), quote_(q), last_(false) { }

  // Forgets any pending trailing empty field.
  void reset() {last_=false;}

  // Extracts the next field from [next, end) into tok and advances next
  // past it. Returns false when no field is left. Throws
  // escaped_list_error on a malformed escape sequence.
  template <typename InputIterator, typename Token>
  bool operator()(InputIterator& next,InputIterator end,Token& tok) {
    tok = Token();

    if (next == end) {
      // Input exhausted: an empty field is still owed only when the
      // previous field was terminated by a separator.
      const bool pending = last_;
      last_ = false;
      return pending;
    }

    last_ = false;
    bool in_quote = false;
    while (next != end) {
      if (is_escape(*next)) {
        do_escape(next,end,tok);
      }
      else if (is_c(*next)) {
        if (!in_quote) {
          // An unquoted separator ends the field. Step over it and
          // remember that one more (possibly blank) field follows.
          ++next;
          last_ = true;
          return true;
        }
        tok += *next;
      }
      else if (is_quote(*next)) {
        in_quote = !in_quote;
      }
      else {
        tok += *next;
      }
      ++next;
    }
    return true;
  }
};
|
192 |
|
193 //=========================================================================== |
|
194 // The classes here are used by offset_separator and char_separator to implement |
|
195 // faster assigning of tokens using assign instead of += |
|
196 |
|
197 namespace tokenizer_detail { |
|
198 |
|
199 // The assign_or_plus_equal struct contains functions that implement |
|
200 // assign, +=, and clearing based on the iterator type. The |
|
201 // generic case does nothing for plus_equal and clearing, while |
|
202 // passing through the call for assign. |
|
203 // |
|
204 // When an input iterator is being used, the situation is reversed. |
|
205 // The assign method does nothing, plus_equal invokes operator +=, |
|
206 // and the clearing method sets the supplied token to the default |
|
207 // token constructor's result. |
|
208 // |
|
209 |
|
// Primary template, selected for every iterator category other than
// std::input_iterator_tag: the token is produced in a single step by
// assign(), so plus_equal() and clear() deliberately do nothing.
template<class IteratorTag>
struct assign_or_plus_equal {

  // Replaces the contents of t with the characters in [b, e).
  template<class Iterator, class Token>
  static void assign(Iterator b, Iterator e, Token &t) {
#if BOOST_WORKAROUND(BOOST_MSVC, < 1300) &&\
    BOOST_WORKAROUND(__SGI_STL_PORT, < 0x500) &&\
    defined(_STLP_DEBUG) &&\
    (defined(_STLP_USE_DYNAMIC_LIB) || defined(_DLL))
    // Problem with string::assign for msvc-stlport in debug mode: the
    // linker tries to import the templatized version of this memfun,
    // which is obviously not exported.
    // See http://www.stlport.com/dcforum/DCForumID6/1763.html for details.
    t = Token();
    for (; b != e; ++b)
      t += *b;
#else
    t.assign(b, e);
#endif
  }

  // No-op: characters are not accumulated one at a time here.
  template<class Token, class Value>
  static void plus_equal(Token &, const Value &) { }

  // No-op: assign() overwrites the token, so there is no need to clear it.
  template<class Token>
  static void clear(Token &) { }
};

// Specialization for input iterators, where the roles are reversed:
// assign() does nothing, plus_equal() appends with operator+=, and
// clear() resets the token to a default-constructed one.
template <>
struct assign_or_plus_equal<std::input_iterator_tag> {

  template<class Iterator, class Token>
  static void assign(Iterator, Iterator, Token &) { }

  template<class Token, class Value>
  static void plus_equal(Token &t, const Value &v) {
    t += v;
  }

  template<class Token>
  static void clear(Token &t) {
    t = Token();
  }
};
|
261 |
|
262 |
|
263 template<class Iterator> |
|
264 struct pointer_iterator_category{ |
|
265 typedef std::random_access_iterator_tag type; |
|
266 }; |
|
267 |
|
268 |
|
269 template<class Iterator> |
|
270 struct class_iterator_category{ |
|
271 typedef typename Iterator::iterator_category type; |
|
272 }; |
|
273 |
|
274 |
|
275 |
|
276 // This portably gets the iterator_tag without partial template specialization |
|
277 template<class Iterator> |
|
278 struct get_iterator_category{ |
|
279 typedef typename mpl::if_<is_pointer<Iterator>, |
|
280 pointer_iterator_category<Iterator>, |
|
281 class_iterator_category<Iterator> |
|
282 >::type cat; |
|
283 |
|
284 typedef typename cat::type iterator_category; |
|
285 }; |
|
286 |
|
287 |
|
288 } |
|
289 |
|
290 |
|
291 //=========================================================================== |
|
292 // The offset_separator class, which is a model of TokenizerFunction. |
|
293 // Offset breaks a string into tokens based on a range of offsets |
|
294 |
|
295 class offset_separator { |
|
296 private: |
|
297 |
|
298 std::vector<int> offsets_; |
|
299 unsigned int current_offset_; |
|
300 bool wrap_offsets_; |
|
301 bool return_partial_last_; |
|
302 |
|
303 public: |
|
304 template <typename Iter> |
|
305 offset_separator(Iter begin, Iter end, bool wrap_offsets = true, |
|
306 bool return_partial_last = true) |
|
307 : offsets_(begin,end), current_offset_(0), |
|
308 wrap_offsets_(wrap_offsets), |
|
309 return_partial_last_(return_partial_last) { } |
|
310 |
|
311 offset_separator() |
|
312 : offsets_(1,1), current_offset_(), |
|
313 wrap_offsets_(true), return_partial_last_(true) { } |
|
314 |
|
315 void reset() { |
|
316 current_offset_ = 0; |
|
317 } |
|
318 |
|
319 template <typename InputIterator, typename Token> |
|
320 bool operator()(InputIterator& next, InputIterator end, Token& tok) |
|
321 { |
|
322 typedef tokenizer_detail::assign_or_plus_equal< |
|
323 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 |
|
324 typename |
|
325 #endif |
|
326 tokenizer_detail::get_iterator_category< |
|
327 InputIterator>::iterator_category> assigner; |
|
328 |
|
329 |
|
330 BOOST_ASSERT(!offsets_.empty()); |
|
331 |
|
332 assigner::clear(tok); |
|
333 InputIterator start(next); |
|
334 |
|
335 if (next == end) |
|
336 return false; |
|
337 |
|
338 if (current_offset_ == offsets_.size()) |
|
339 if (wrap_offsets_) |
|
340 current_offset_=0; |
|
341 else |
|
342 return false; |
|
343 |
|
344 int c = offsets_[current_offset_]; |
|
345 int i = 0; |
|
346 for (; i < c; ++i) { |
|
347 if (next == end)break; |
|
348 assigner::plus_equal(tok,*next++); |
|
349 } |
|
350 assigner::assign(start,next,tok); |
|
351 |
|
352 if (!return_partial_last_) |
|
353 if (i < (c-1) ) |
|
354 return false; |
|
355 |
|
356 ++current_offset_; |
|
357 return true; |
|
358 } |
|
359 }; |
|
360 |
|
361 |
|
362 //=========================================================================== |
|
363 // The char_separator class breaks a sequence of characters into |
|
364 // tokens based on the character delimiters (very much like bad old |
|
365 // strtok). A delimiter character can either be kept or dropped. A |
|
366 // kept delimiter shows up as an output token, whereas a dropped |
|
367 // delimiter does not. |
|
368 |
|
369 // This class replaces the char_delimiters_separator class. The |
|
370 // constructor for the char_delimiters_separator class was too |
|
371 // confusing and needed to be deprecated. However, because of the |
|
372 // default arguments to the constructor, adding the new constructor |
|
373 // would cause ambiguity, so instead I deprecated the whole class. |
|
374 // The implementation of the class was also simplified considerably. |
|
375 |
|
// Tells char_separator what to do with empty tokens.
enum empty_token_policy {
  drop_empty_tokens,
  keep_empty_tokens
};
|
377 |
|
378 // The out of the box GCC 2.95 on cygwin does not have a char_traits class. |
|
379 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 |
|
380 template <typename Char, |
|
381 typename Traits = typename std::basic_string<Char>::traits_type > |
|
382 #else |
|
383 template <typename Char, |
|
384 typename Traits = std::basic_string<Char>::traits_type > |
|
385 #endif |
|
386 class char_separator |
|
387 { |
|
388 typedef std::basic_string<Char,Traits> string_type; |
|
389 public: |
|
390 explicit |
|
391 char_separator(const Char* dropped_delims, |
|
392 const Char* kept_delims = 0, |
|
393 empty_token_policy empty_tokens = drop_empty_tokens) |
|
394 : m_dropped_delims(dropped_delims), |
|
395 m_use_ispunct(false), |
|
396 m_use_isspace(false), |
|
397 m_empty_tokens(empty_tokens), |
|
398 m_output_done(false) |
|
399 { |
|
400 // Borland workaround |
|
401 if (kept_delims) |
|
402 m_kept_delims = kept_delims; |
|
403 } |
|
404 |
|
405 // use ispunct() for kept delimiters and isspace for dropped. |
|
406 explicit |
|
407 char_separator() |
|
408 : m_use_ispunct(true), |
|
409 m_use_isspace(true), |
|
410 m_empty_tokens(drop_empty_tokens) { } |
|
411 |
|
412 void reset() { } |
|
413 |
|
414 template <typename InputIterator, typename Token> |
|
415 bool operator()(InputIterator& next, InputIterator end, Token& tok) |
|
416 { |
|
417 typedef tokenizer_detail::assign_or_plus_equal< |
|
418 #if !defined(BOOST_MSVC) || BOOST_MSVC > 1300 |
|
419 typename |
|
420 #endif |
|
421 tokenizer_detail::get_iterator_category< |
|
422 InputIterator>::iterator_category> assigner; |
|
423 |
|
424 assigner::clear(tok); |
|
425 |
|
426 // skip past all dropped_delims |
|
427 if (m_empty_tokens == drop_empty_tokens) |
|
428 for (; next != end && is_dropped(*next); ++next) |
|
429 { } |
|
430 |
|
431 InputIterator start(next); |
|
432 |
|
433 if (m_empty_tokens == drop_empty_tokens) { |
|
434 |
|
435 if (next == end) |
|
436 return false; |
|
437 |
|
438 |
|
439 // if we are on a kept_delims move past it and stop |
|
440 if (is_kept(*next)) { |
|
441 assigner::plus_equal(tok,*next); |
|
442 ++next; |
|
443 } else |
|
444 // append all the non delim characters |
|
445 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) |
|
446 assigner::plus_equal(tok,*next); |
|
447 } |
|
448 else { // m_empty_tokens == keep_empty_tokens |
|
449 |
|
450 // Handle empty token at the end |
|
451 if (next == end) |
|
452 if (m_output_done == false) { |
|
453 m_output_done = true; |
|
454 assigner::assign(start,next,tok); |
|
455 return true; |
|
456 } else |
|
457 return false; |
|
458 |
|
459 if (is_kept(*next)) { |
|
460 if (m_output_done == false) |
|
461 m_output_done = true; |
|
462 else { |
|
463 assigner::plus_equal(tok,*next); |
|
464 ++next; |
|
465 m_output_done = false; |
|
466 } |
|
467 } |
|
468 else if (m_output_done == false && is_dropped(*next)) { |
|
469 m_output_done = true; |
|
470 } |
|
471 else { |
|
472 if (is_dropped(*next)) |
|
473 start=++next; |
|
474 for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next) |
|
475 assigner::plus_equal(tok,*next); |
|
476 m_output_done = true; |
|
477 } |
|
478 } |
|
479 assigner::assign(start,next,tok); |
|
480 return true; |
|
481 } |
|
482 |
|
483 private: |
|
484 string_type m_kept_delims; |
|
485 string_type m_dropped_delims; |
|
486 bool m_use_ispunct; |
|
487 bool m_use_isspace; |
|
488 empty_token_policy m_empty_tokens; |
|
489 bool m_output_done; |
|
490 |
|
491 bool is_kept(Char E) const |
|
492 { |
|
493 if (m_kept_delims.length()) |
|
494 return m_kept_delims.find(E) != string_type::npos; |
|
495 else if (m_use_ispunct) { |
|
496 return std::ispunct(E) != 0; |
|
497 } else |
|
498 return false; |
|
499 } |
|
500 bool is_dropped(Char E) const |
|
501 { |
|
502 if (m_dropped_delims.length()) |
|
503 return m_dropped_delims.find(E) != string_type::npos; |
|
504 else if (m_use_isspace) { |
|
505 return std::isspace(E) != 0; |
|
506 } else |
|
507 return false; |
|
508 } |
|
509 }; |
|
510 |
|
//===========================================================================
// The following class is DEPRECATED, use class char_separator instead.
//
// The char_delimiters_separator class, which is a model of
// TokenizerFunction. char_delimiters_separator breaks a string
// into tokens based on character delimiters. There are 2 types of
// delimiters. Returnable delimiters can be returned as
// tokens. These are often punctuation. Nonreturnable delimiters
// cannot be returned as tokens. These are often whitespace.
|
520 |
|
// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
template <class Char,
  class Traits = typename std::basic_string<Char>::traits_type >
#else
template <class Char,
  class Traits = std::basic_string<Char>::traits_type >
#endif
class char_delimiters_separator {
private:

  typedef std::basic_string<Char,Traits> string_type;
  string_type returnable_;     // delimiters that may be returned as tokens
  string_type nonreturnable_;  // delimiters that are always skipped
  bool return_delims_;         // whether returnable delimiters are returned
  bool no_ispunct_;            // an explicit returnable set was supplied
  bool no_isspace_;            // an explicit nonreturnable set was supplied

  // The <cctype> classifiers are undefined for arguments that are not
  // representable as unsigned char, so only such values are passed on.
  static bool fits_unsigned_char(Char c)
  {
    return sizeof(Char) == 1 || static_cast<unsigned long>(c) <= 255ul;
  }

  // True when E is a returnable delimiter: a member of returnable_ if
  // that set is non-empty, otherwise punctuation (unless an explicit,
  // empty set was supplied).
  bool is_ret(Char E)const
  {
    if (returnable_.length())
      return returnable_.find(E) != string_type::npos;
    if (no_ispunct_)
      return false;
    return fits_unsigned_char(E)
        && std::ispunct(static_cast<unsigned char>(E)) != 0;
  }

  // True when E is a nonreturnable delimiter: a member of nonreturnable_
  // if that set is non-empty, otherwise whitespace (unless an explicit,
  // empty set was supplied).
  bool is_nonret(Char E)const
  {
    if (nonreturnable_.length())
      return nonreturnable_.find(E) != string_type::npos;
    if (no_isspace_)
      return false;
    return fits_unsigned_char(E)
        && std::isspace(static_cast<unsigned char>(E)) != 0;
  }

public:
  // return_delims: when true, returnable delimiters are produced as
  //   one-character tokens; when false they are skipped.
  // returnable / nonreturnable: explicit delimiter sets; a null pointer
  //   selects ispunct() / isspace() respectively.
  explicit char_delimiters_separator(bool return_delims = false,
                                     const Char* returnable = 0,
                                     const Char* nonreturnable = 0)
    : returnable_(returnable ? returnable : string_type().c_str()),
      nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
      return_delims_(return_delims), no_ispunct_(returnable!=0),
      no_isspace_(nonreturnable!=0) { }

  void reset() { }

public:

  // Extracts the next token from [next, end) into tok and advances next
  // past it. Returns false when only delimiters (or nothing) remain.
  template <typename InputIterator, typename Token>
  bool operator()(InputIterator& next, InputIterator end,Token& tok) {
    tok = Token();

    // skip past all nonreturnable delims
    // skip past the returnable only if we are not returning delims
    for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
      && !return_delims_ ) );++next) { }

    if (next == end) {
      return false;
    }

    // if we are to return delims and we are on a returnable one
    // move past it and stop
    if (is_ret(*next) && return_delims_) {
      tok+=*next;
      ++next;
    }
    else {
      // append all the non delim characters
      for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
        tok+=*next;
    }

    return true;
  }
};
|
605 |
|
606 |
|
607 } //namespace boost |
|
608 |
|
609 |
|
610 #endif |
|
611 |
|
612 |
|
613 |
|
614 |
|
615 |