|
1 /* |
|
2 * |
|
3 * Copyright (c) 2002 |
|
4 * John Maddock |
|
5 * |
|
6 * Use, modification and distribution are subject to the |
|
7 * Boost Software License, Version 1.0. (See accompanying file |
|
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
|
9 * |
|
10 */ |
|
11 |
|
12 #ifndef BOOST_REGEX_MATCHER_HPP |
|
13 #define BOOST_REGEX_MATCHER_HPP |
|
14 |
|
15 #include <boost/regex/v4/iterator_category.hpp> |
|
16 |
|
17 #ifdef BOOST_MSVC |
|
18 #pragma warning(push) |
|
19 #pragma warning(disable: 4103) |
|
20 #endif |
|
21 #ifdef BOOST_HAS_ABI_HEADERS |
|
22 # include BOOST_ABI_PREFIX |
|
23 #endif |
|
24 #ifdef BOOST_MSVC |
|
25 #pragma warning(pop) |
|
26 #endif |
|
27 |
|
28 #ifdef BOOST_MSVC |
|
29 # pragma warning(push) |
|
30 # pragma warning(disable: 4800) |
|
31 #endif |
|
32 |
|
33 namespace boost{ |
|
34 namespace re_detail{ |
|
35 |
|
36 // |
|
37 // error checking API: |
|
38 // |
|
39 BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf); |
|
40 // |
|
41 // function can_start: |
|
42 // |
|
43 template <class charT> |
|
44 inline bool can_start(charT c, const unsigned char* map, unsigned char mask) |
|
45 { |
|
46 return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask)); |
|
47 } |
|
48 inline bool can_start(char c, const unsigned char* map, unsigned char mask) |
|
49 { |
|
50 return map[(unsigned char)c] & mask; |
|
51 } |
|
52 inline bool can_start(signed char c, const unsigned char* map, unsigned char mask) |
|
53 { |
|
54 return map[(unsigned char)c] & mask; |
|
55 } |
|
56 inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask) |
|
57 { |
|
58 return map[c] & mask; |
|
59 } |
|
60 inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask) |
|
61 { |
|
62 return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask); |
|
63 } |
|
64 #if !defined(__hpux) // WCHAR_MIN not usable in pp-directives. |
|
65 #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T) |
|
66 inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask) |
|
67 { |
|
68 return ((c >= static_cast<wchar_t>(1u << CHAR_BIT)) ? true : map[c] & mask); |
|
69 } |
|
70 #endif |
|
71 #endif |
|
72 #if !defined(BOOST_NO_INTRINSIC_WCHAR_T) |
|
73 inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask) |
|
74 { |
|
75 return (((c >= static_cast<unsigned int>(1u << CHAR_BIT)) ? true : map[c] & mask)); |
|
76 } |
|
77 #endif |
|
78 |
|
79 |
|
80 // |
|
81 // Unfortunately Rogue Waves standard library appears to have a bug |
|
82 // in std::basic_string::compare that results in eroneous answers |
|
83 // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version |
|
84 // 0x020101) the test case was: |
|
85 // {39135,0} < {0xff,0} |
|
86 // which succeeds when it should not. |
|
87 // |
|
88 #ifndef _RWSTD_VER |
|
89 #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310) |
|
90 template <class C, class T, class A> |
|
91 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
|
92 { |
|
93 if(0 == *p) |
|
94 { |
|
95 if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
|
96 return 0; |
|
97 } |
|
98 return s.compare(p); |
|
99 } |
|
100 #endif |
|
101 #else |
|
102 #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310) |
|
103 template <class C, class T, class A> |
|
104 inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
|
105 { |
|
106 if(0 == *p) |
|
107 { |
|
108 if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
|
109 return 0; |
|
110 } |
|
111 return s.compare(p); |
|
112 } |
|
113 #endif |
|
114 inline int string_compare(const std::string& s, const char* p) |
|
115 { return std::strcmp(s.c_str(), p); } |
|
116 # ifndef BOOST_NO_WREGEX |
|
117 inline int string_compare(const std::wstring& s, const wchar_t* p) |
|
118 { return std::wcscmp(s.c_str(), p); } |
|
119 #endif |
|
120 #endif |
|
121 template <class Seq, class C> |
|
122 inline int string_compare(const Seq& s, const C* p) |
|
123 { |
|
124 std::size_t i = 0; |
|
125 while((i < s.size()) && (p[i] == s[i])) |
|
126 { |
|
127 ++i; |
|
128 } |
|
129 return (i == s.size()) ? -p[i] : s[i] - p[i]; |
|
130 } |
|
131 # define STR_COMP(s,p) string_compare(s,p) |
|
132 |
|
133 template<class charT> |
|
134 inline const charT* re_skip_past_null(const charT* p) |
|
135 { |
|
136 while (*p != static_cast<charT>(0)) ++p; |
|
137 return ++p; |
|
138 } |
|
139 |
|
140 template <class iterator, class charT, class traits_type, class char_classT> |
|
141 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, |
|
142 iterator last, |
|
143 const re_set_long<char_classT>* set_, |
|
144 const regex_data<charT, traits_type>& e, bool icase) |
|
145 { |
|
146 const charT* p = reinterpret_cast<const charT*>(set_+1); |
|
147 iterator ptr; |
|
148 unsigned int i; |
|
149 //bool icase = e.m_flags & regex_constants::icase; |
|
150 |
|
151 if(next == last) return next; |
|
152 |
|
153 typedef typename traits_type::string_type traits_string_type; |
|
154 const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits); |
|
155 |
|
156 // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never |
|
157 // referenced |
|
158 (void)traits_inst; |
|
159 |
|
160 // try and match a single character, could be a multi-character |
|
161 // collating element... |
|
162 for(i = 0; i < set_->csingles; ++i) |
|
163 { |
|
164 ptr = next; |
|
165 if(*p == static_cast<charT>(0)) |
|
166 { |
|
167 // treat null string as special case: |
|
168 if(traits_inst.translate(*ptr, icase) != *p) |
|
169 { |
|
170 while(*p == static_cast<charT>(0))++p; |
|
171 continue; |
|
172 } |
|
173 return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
|
174 } |
|
175 else |
|
176 { |
|
177 while(*p && (ptr != last)) |
|
178 { |
|
179 if(traits_inst.translate(*ptr, icase) != *p) |
|
180 break; |
|
181 ++p; |
|
182 ++ptr; |
|
183 } |
|
184 |
|
185 if(*p == static_cast<charT>(0)) // if null we've matched |
|
186 return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
|
187 |
|
188 p = re_skip_past_null(p); // skip null |
|
189 } |
|
190 } |
|
191 |
|
192 charT col = traits_inst.translate(*next, icase); |
|
193 |
|
194 |
|
195 if(set_->cranges || set_->cequivalents) |
|
196 { |
|
197 traits_string_type s1; |
|
198 // |
|
199 // try and match a range, NB only a single character can match |
|
200 if(set_->cranges) |
|
201 { |
|
202 if((e.m_flags & regex_constants::collate) == 0) |
|
203 s1.assign(1, col); |
|
204 else |
|
205 { |
|
206 charT a[2] = { col, charT(0), }; |
|
207 s1 = traits_inst.transform(a, a + 1); |
|
208 } |
|
209 for(i = 0; i < set_->cranges; ++i) |
|
210 { |
|
211 if(STR_COMP(s1, p) >= 0) |
|
212 { |
|
213 do{ ++p; }while(*p); |
|
214 ++p; |
|
215 if(STR_COMP(s1, p) <= 0) |
|
216 return set_->isnot ? next : ++next; |
|
217 } |
|
218 else |
|
219 { |
|
220 // skip first string |
|
221 do{ ++p; }while(*p); |
|
222 ++p; |
|
223 } |
|
224 // skip second string |
|
225 do{ ++p; }while(*p); |
|
226 ++p; |
|
227 } |
|
228 } |
|
229 // |
|
230 // try and match an equivalence class, NB only a single character can match |
|
231 if(set_->cequivalents) |
|
232 { |
|
233 charT a[2] = { col, charT(0), }; |
|
234 s1 = traits_inst.transform_primary(a, a +1); |
|
235 for(i = 0; i < set_->cequivalents; ++i) |
|
236 { |
|
237 if(STR_COMP(s1, p) == 0) |
|
238 return set_->isnot ? next : ++next; |
|
239 // skip string |
|
240 do{ ++p; }while(*p); |
|
241 ++p; |
|
242 } |
|
243 } |
|
244 } |
|
245 if(traits_inst.isctype(col, set_->cclasses) == true) |
|
246 return set_->isnot ? next : ++next; |
|
247 if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false)) |
|
248 return set_->isnot ? next : ++next; |
|
249 return set_->isnot ? ++next : next; |
|
250 } |
|
251 |
|
252 template <class BidiIterator> |
|
253 class repeater_count |
|
254 { |
|
255 repeater_count** stack; |
|
256 repeater_count* next; |
|
257 int state_id; |
|
258 std::size_t count; // the number of iterations so far |
|
259 BidiIterator start_pos; // where the last repeat started |
|
260 public: |
|
261 repeater_count(repeater_count** s) |
|
262 { |
|
263 stack = s; |
|
264 next = 0; |
|
265 state_id = -1; |
|
266 count = 0; |
|
267 } |
|
268 repeater_count(int i, repeater_count** s, BidiIterator start) |
|
269 : start_pos(start) |
|
270 { |
|
271 state_id = i; |
|
272 stack = s; |
|
273 next = *stack; |
|
274 *stack = this; |
|
275 if(state_id > next->state_id) |
|
276 count = 0; |
|
277 else |
|
278 { |
|
279 repeater_count* p = next; |
|
280 while(p->state_id != state_id) |
|
281 p = p->next; |
|
282 count = p->count; |
|
283 start_pos = p->start_pos; |
|
284 } |
|
285 } |
|
286 ~repeater_count() |
|
287 { |
|
288 *stack = next; |
|
289 } |
|
290 std::size_t get_count() { return count; } |
|
291 int get_id() { return state_id; } |
|
292 std::size_t operator++() { return ++count; } |
|
293 bool check_null_repeat(const BidiIterator& pos, std::size_t max) |
|
294 { |
|
295 // this is called when we are about to start a new repeat, |
|
296 // if the last one was NULL move our count to max, |
|
297 // otherwise save the current position. |
|
298 bool result = (count == 0) ? false : (pos == start_pos); |
|
299 if(result) |
|
300 count = max; |
|
301 else |
|
302 start_pos = pos; |
|
303 return result; |
|
304 } |
|
305 }; |
|
306 |
|
307 struct saved_state; |
|
308 |
|
309 enum saved_state_type |
|
310 { |
|
311 saved_type_end = 0, |
|
312 saved_type_paren = 1, |
|
313 saved_type_recurse = 2, |
|
314 saved_type_assertion = 3, |
|
315 saved_state_alt = 4, |
|
316 saved_state_repeater_count = 5, |
|
317 saved_state_extra_block = 6, |
|
318 saved_state_greedy_single_repeat = 7, |
|
319 saved_state_rep_slow_dot = 8, |
|
320 saved_state_rep_fast_dot = 9, |
|
321 saved_state_rep_char = 10, |
|
322 saved_state_rep_short_set = 11, |
|
323 saved_state_rep_long_set = 12, |
|
324 saved_state_non_greedy_long_repeat = 13, |
|
325 saved_state_count = 14 |
|
326 }; |
|
327 |
|
328 #ifdef BOOST_MSVC |
|
329 #pragma warning(push) |
|
330 #pragma warning(disable : 4251 4231 4660) |
|
331 #endif |
|
332 |
|
333 template <class BidiIterator, class Allocator, class traits> |
|
334 class perl_matcher |
|
335 { |
|
336 public: |
|
337 typedef typename traits::char_type char_type; |
|
338 typedef perl_matcher<BidiIterator, Allocator, traits> self_type; |
|
339 typedef bool (self_type::*matcher_proc_type)(void); |
|
340 typedef std::size_t traits_size_type; |
|
341 typedef typename is_byte<char_type>::width_type width_type; |
|
342 typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type; |
|
343 |
|
344 perl_matcher(BidiIterator first, BidiIterator end, |
|
345 match_results<BidiIterator, Allocator>& what, |
|
346 const basic_regex<char_type, traits>& e, |
|
347 match_flag_type f, |
|
348 BidiIterator l_base) |
|
349 : m_result(what), base(first), last(end), |
|
350 position(first), backstop(l_base), re(e), traits_inst(e.get_traits()), |
|
351 m_independent(false), next_count(&rep_obj), rep_obj(&next_count) |
|
352 { |
|
353 construct_init(e, f); |
|
354 } |
|
355 |
|
356 bool match(); |
|
357 bool find(); |
|
358 |
|
359 void setf(match_flag_type f) |
|
360 { m_match_flags |= f; } |
|
361 void unsetf(match_flag_type f) |
|
362 { m_match_flags &= ~f; } |
|
363 |
|
364 private: |
|
365 void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f); |
|
366 |
|
367 bool find_imp(); |
|
368 bool match_imp(); |
|
369 #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD |
|
370 typedef bool (perl_matcher::*protected_proc_type)(); |
|
371 bool protected_call(protected_proc_type); |
|
372 #endif |
|
373 void estimate_max_state_count(std::random_access_iterator_tag*); |
|
374 void estimate_max_state_count(void*); |
|
375 bool match_prefix(); |
|
376 bool match_all_states(); |
|
377 |
|
378 // match procs, stored in s_match_vtable: |
|
379 bool match_startmark(); |
|
380 bool match_endmark(); |
|
381 bool match_literal(); |
|
382 bool match_start_line(); |
|
383 bool match_end_line(); |
|
384 bool match_wild(); |
|
385 bool match_match(); |
|
386 bool match_word_boundary(); |
|
387 bool match_within_word(); |
|
388 bool match_word_start(); |
|
389 bool match_word_end(); |
|
390 bool match_buffer_start(); |
|
391 bool match_buffer_end(); |
|
392 bool match_backref(); |
|
393 bool match_long_set(); |
|
394 bool match_set(); |
|
395 bool match_jump(); |
|
396 bool match_alt(); |
|
397 bool match_rep(); |
|
398 bool match_combining(); |
|
399 bool match_soft_buffer_end(); |
|
400 bool match_restart_continue(); |
|
401 bool match_long_set_repeat(); |
|
402 bool match_set_repeat(); |
|
403 bool match_char_repeat(); |
|
404 bool match_dot_repeat_fast(); |
|
405 bool match_dot_repeat_slow(); |
|
406 bool match_backstep(); |
|
407 bool match_assert_backref(); |
|
408 bool match_toggle_case(); |
|
409 #ifdef BOOST_REGEX_RECURSIVE |
|
410 bool backtrack_till_match(std::size_t count); |
|
411 #endif |
|
412 |
|
413 // find procs stored in s_find_vtable: |
|
414 bool find_restart_any(); |
|
415 bool find_restart_word(); |
|
416 bool find_restart_line(); |
|
417 bool find_restart_buf(); |
|
418 bool find_restart_lit(); |
|
419 |
|
420 private: |
|
421 // final result structure to be filled in: |
|
422 match_results<BidiIterator, Allocator>& m_result; |
|
423 // temporary result for POSIX matches: |
|
424 scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match; |
|
425 // pointer to actual result structure to fill in: |
|
426 match_results<BidiIterator, Allocator>* m_presult; |
|
427 // start of sequence being searched: |
|
428 BidiIterator base; |
|
429 // end of sequence being searched: |
|
430 BidiIterator last; |
|
431 // current character being examined: |
|
432 BidiIterator position; |
|
433 // where to restart next search after failed match attempt: |
|
434 BidiIterator restart; |
|
435 // where the current search started from, acts as base for $` during grep: |
|
436 BidiIterator search_base; |
|
437 // how far we can go back when matching lookbehind: |
|
438 BidiIterator backstop; |
|
439 // the expression being examined: |
|
440 const basic_regex<char_type, traits>& re; |
|
441 // the expression's traits class: |
|
442 const ::boost::regex_traits_wrapper<traits>& traits_inst; |
|
443 // the next state in the machine being matched: |
|
444 const re_syntax_base* pstate; |
|
445 // matching flags in use: |
|
446 match_flag_type m_match_flags; |
|
447 // how many states we have examined so far: |
|
448 boost::uintmax_t state_count; |
|
449 // max number of states to examine before giving up: |
|
450 boost::uintmax_t max_state_count; |
|
451 // whether we should ignore case or not: |
|
452 bool icase; |
|
453 // set to true when (position == last), indicates that we may have a partial match: |
|
454 bool m_has_partial_match; |
|
455 // set to true whenever we get a match: |
|
456 bool m_has_found_match; |
|
457 // set to true whenever we're inside an independent sub-expression: |
|
458 bool m_independent; |
|
459 // the current repeat being examined: |
|
460 repeater_count<BidiIterator>* next_count; |
|
461 // the first repeat being examined (top of linked list): |
|
462 repeater_count<BidiIterator> rep_obj; |
|
463 // the mask to pass when matching word boundaries: |
|
464 typename traits::char_class_type m_word_mask; |
|
465 // the bitmask to use when determining whether a match_any matches a newline or not: |
|
466 unsigned char match_any_mask; |
|
467 |
|
468 #ifdef BOOST_REGEX_NON_RECURSIVE |
|
469 // |
|
470 // additional members for non-recursive version: |
|
471 // |
|
472 typedef bool (self_type::*unwind_proc_type)(bool); |
|
473 |
|
474 void extend_stack(); |
|
475 bool unwind(bool); |
|
476 bool unwind_end(bool); |
|
477 bool unwind_paren(bool); |
|
478 bool unwind_recursion_stopper(bool); |
|
479 bool unwind_assertion(bool); |
|
480 bool unwind_alt(bool); |
|
481 bool unwind_repeater_counter(bool); |
|
482 bool unwind_extra_block(bool); |
|
483 bool unwind_greedy_single_repeat(bool); |
|
484 bool unwind_slow_dot_repeat(bool); |
|
485 bool unwind_fast_dot_repeat(bool); |
|
486 bool unwind_char_repeat(bool); |
|
487 bool unwind_short_set_repeat(bool); |
|
488 bool unwind_long_set_repeat(bool); |
|
489 bool unwind_non_greedy_repeat(bool); |
|
490 void destroy_single_repeat(); |
|
491 void push_matched_paren(int index, const sub_match<BidiIterator>& sub); |
|
492 void push_recursion_stopper(); |
|
493 void push_assertion(const re_syntax_base* ps, bool positive); |
|
494 void push_alt(const re_syntax_base* ps); |
|
495 void push_repeater_count(int i, repeater_count<BidiIterator>** s); |
|
496 void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id); |
|
497 void push_non_greedy_repeat(const re_syntax_base* ps); |
|
498 |
|
499 |
|
500 // pointer to base of stack: |
|
501 saved_state* m_stack_base; |
|
502 // pointer to current stack position: |
|
503 saved_state* m_backup_state; |
|
504 // determines what value to return when unwinding from recursion, |
|
505 // allows for mixed recursive/non-recursive algorithm: |
|
506 bool m_recursive_result; |
|
507 // how many memory blocks have we used up?: |
|
508 unsigned used_block_count; |
|
509 #endif |
|
510 |
|
511 // these operations aren't allowed, so are declared private, |
|
512 // bodies are provided to keep explicit-instantiation requests happy: |
|
513 perl_matcher& operator=(const perl_matcher&) |
|
514 { |
|
515 return *this; |
|
516 } |
|
517 perl_matcher(const perl_matcher& that) |
|
518 : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {} |
|
519 }; |
|
520 |
|
521 #ifdef BOOST_MSVC |
|
522 #pragma warning(pop) |
|
523 #endif |
|
524 |
|
525 } // namespace re_detail |
|
526 |
|
527 #ifdef BOOST_MSVC |
|
528 #pragma warning(push) |
|
529 #pragma warning(disable: 4103) |
|
530 #endif |
|
531 #ifdef BOOST_HAS_ABI_HEADERS |
|
532 # include BOOST_ABI_SUFFIX |
|
533 #endif |
|
534 #ifdef BOOST_MSVC |
|
535 #pragma warning(pop) |
|
536 #endif |
|
537 |
|
538 } // namespace boost |
|
539 |
|
540 #ifdef BOOST_MSVC |
|
541 # pragma warning(pop) |
|
542 #endif |
|
543 |
|
544 // |
|
545 // include the implementation of perl_matcher: |
|
546 // |
|
547 #ifdef BOOST_REGEX_RECURSIVE |
|
548 #include <boost/regex/v4/perl_matcher_recursive.hpp> |
|
549 #else |
|
550 #include <boost/regex/v4/perl_matcher_non_recursive.hpp> |
|
551 #endif |
|
552 // this one has to be last: |
|
553 #include <boost/regex/v4/perl_matcher_common.hpp> |
|
554 |
|
555 #endif |
|
556 |