|
1 /////////////////////////////////////////////////////////////////////////////// |
|
2 /// \file regex_compiler.hpp |
|
3 /// Contains the definition of regex_compiler, a factory for building regex objects |
|
4 /// from strings. |
|
5 // |
|
6 // Copyright 2004 Eric Niebler. Distributed under the Boost |
|
7 // Software License, Version 1.0. (See accompanying file |
|
8 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
|
9 |
|
10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 |
|
11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 |
|
12 |
|
13 // MS compatible compilers support #pragma once |
|
14 #if defined(_MSC_VER) && (_MSC_VER >= 1020) |
|
15 # pragma once |
|
16 #endif |
|
17 |
|
18 #include <boost/xpressive/basic_regex.hpp> |
|
19 #include <boost/xpressive/detail/dynamic/parser.hpp> |
|
20 #include <boost/xpressive/detail/dynamic/parse_charset.hpp> |
|
21 #include <boost/xpressive/detail/dynamic/parser_enum.hpp> |
|
22 #include <boost/xpressive/detail/dynamic/parser_traits.hpp> |
|
23 #include <boost/xpressive/detail/core/linker.hpp> |
|
24 #include <boost/xpressive/detail/core/optimize.hpp> |
|
25 |
|
26 namespace boost { namespace xpressive |
|
27 { |
|
28 |
|
29 /////////////////////////////////////////////////////////////////////////////// |
|
30 // regex_compiler |
|
31 // |
|
32 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string. |
|
33 /// |
|
34 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string |
|
35 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale, |
|
36 /// after which all basic_regex objects created with that regex_compiler object will use that locale. |
|
37 /// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the |
|
38 /// compile() method to construct a basic_regex object, passing it the string representing the regular |
|
39 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex |
|
40 /// objects compiled from the same string will have different regex_id's. |
|
41 template<typename BidiIter, typename RegexTraits, typename CompilerTraits> |
|
42 struct regex_compiler |
|
43 { |
|
44 typedef BidiIter iterator_type; |
|
45 typedef typename iterator_value<BidiIter>::type char_type; |
|
46 typedef std::basic_string<char_type> string_type; |
|
47 typedef regex_constants::syntax_option_type flag_type; |
|
48 typedef RegexTraits traits_type; |
|
49 typedef typename traits_type::char_class_type char_class_type; |
|
50 typedef typename traits_type::locale_type locale_type; |
|
51 |
|
52 explicit regex_compiler(RegexTraits const &traits = RegexTraits()) |
|
53 : mark_count_(0) |
|
54 , hidden_mark_count_(0) |
|
55 , traits_(traits) |
|
56 , upper_(0) |
|
57 { |
|
58 this->upper_ = lookup_classname(this->rxtraits(), "upper"); |
|
59 BOOST_ASSERT(0 != this->upper_); |
|
60 } |
|
61 |
|
62 /////////////////////////////////////////////////////////////////////////// |
|
63 // imbue |
|
64 /// Specify the locale to be used by a regex_compiler. |
|
65 /// |
|
66 /// \param loc The locale that this regex_compiler should use. |
|
67 /// \return The previous locale. |
|
68 locale_type imbue(locale_type loc) |
|
69 { |
|
70 locale_type oldloc = this->traits_.imbue(loc); |
|
71 this->upper_ = lookup_classname(this->rxtraits(), "upper"); |
|
72 BOOST_ASSERT(0 != this->upper_); |
|
73 return oldloc; |
|
74 } |
|
75 |
|
76 /////////////////////////////////////////////////////////////////////////// |
|
77 // getloc |
|
78 /// Get the locale used by a regex_compiler. |
|
79 /// |
|
80 /// \param loc The locale that this regex_compiler uses. |
|
81 locale_type getloc() const |
|
82 { |
|
83 return this->traits_.getloc(); |
|
84 } |
|
85 |
|
86 /////////////////////////////////////////////////////////////////////////// |
|
87 // compile |
|
88 /// Builds a basic_regex object from a std::string. |
|
89 /// |
|
90 /// \param pat A std::string containing the regular expression pattern. |
|
91 /// \param flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.) |
|
92 /// \return A basic_regex object corresponding to the regular expression represented by the string. |
|
93 /// \pre The std::string pat contains a valid string-based representation of a regular expression. |
|
94 /// \throw regex_error when the string has invalid regular expression syntax. |
|
95 basic_regex<BidiIter> compile(string_type pat, flag_type flags = regex_constants::ECMAScript) |
|
96 { |
|
97 this->reset(); |
|
98 this->traits_.flags(flags); |
|
99 |
|
100 string_iterator begin = pat.begin(), end = pat.end(); |
|
101 |
|
102 // at the top level, a regex is a sequence of alternates |
|
103 alternates_list alternates; |
|
104 this->parse_alternates(begin, end, alternates); |
|
105 detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis"); |
|
106 |
|
107 // convert the alternates list to the appropriate matcher and terminate the sequence |
|
108 detail::sequence<BidiIter> seq = detail::alternates_to_matchable(alternates, alternates_factory()); |
|
109 seq += detail::make_dynamic_xpression<BidiIter>(detail::end_matcher()); |
|
110 |
|
111 // fill in the back-pointers by visiting the regex parse tree |
|
112 detail::xpression_linker<char_type> linker(this->rxtraits()); |
|
113 seq.first->link(linker); |
|
114 |
|
115 // bundle the regex information into a regex_impl object |
|
116 detail::regex_impl<BidiIter> impl; |
|
117 impl.xpr_ = seq.first; |
|
118 impl.traits_.reset(new RegexTraits(this->rxtraits())); |
|
119 impl.mark_count_ = this->mark_count_; |
|
120 impl.hidden_mark_count_ = this->hidden_mark_count_; |
|
121 |
|
122 // optimization: get the peek chars OR the boyer-moore search string |
|
123 detail::optimize_regex(impl, this->rxtraits(), detail::is_random<BidiIter>()); |
|
124 |
|
125 return detail::core_access<BidiIter>::make_regex(impl); |
|
126 } |
|
127 |
|
128 private: |
|
129 |
|
130 typedef typename string_type::const_iterator string_iterator; |
|
131 typedef std::list<detail::sequence<BidiIter> > alternates_list; |
|
132 typedef detail::escape_value<char_type, char_class_type> escape_value; |
|
133 typedef detail::alternates_factory_impl<BidiIter, traits_type> alternates_factory; |
|
134 |
|
135 /////////////////////////////////////////////////////////////////////////// |
|
136 // reset |
|
137 /// INTERNAL ONLY |
|
138 void reset() |
|
139 { |
|
140 this->mark_count_ = 0; |
|
141 this->hidden_mark_count_ = 0; |
|
142 this->traits_.flags(regex_constants::ECMAScript); |
|
143 } |
|
144 |
|
145 /////////////////////////////////////////////////////////////////////////// |
|
146 // regex_traits |
|
147 /// INTERNAL ONLY |
|
148 traits_type &rxtraits() |
|
149 { |
|
150 return this->traits_.traits(); |
|
151 } |
|
152 |
|
153 /////////////////////////////////////////////////////////////////////////// |
|
154 // regex_traits |
|
155 /// INTERNAL ONLY |
|
156 traits_type const &rxtraits() const |
|
157 { |
|
158 return this->traits_.traits(); |
|
159 } |
|
160 |
|
161 /////////////////////////////////////////////////////////////////////////// |
|
162 // parse_alternates |
|
163 /// INTERNAL ONLY |
|
164 void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates) |
|
165 { |
|
166 using namespace regex_constants; |
|
167 string_iterator old_begin; |
|
168 |
|
169 do |
|
170 { |
|
171 alternates.push_back(this->parse_sequence(begin, end)); |
|
172 old_begin = begin; |
|
173 } |
|
174 while(begin != end && token_alternate == this->traits_.get_token(begin, end)); |
|
175 |
|
176 begin = old_begin; |
|
177 } |
|
178 |
|
179 /////////////////////////////////////////////////////////////////////////// |
|
180 // parse_group |
|
181 /// INTERNAL ONLY |
|
182 detail::sequence<BidiIter> parse_group(string_iterator &begin, string_iterator end) |
|
183 { |
|
184 using namespace regex_constants; |
|
185 int mark_nbr = 0; |
|
186 bool keeper = false; |
|
187 bool lookahead = false; |
|
188 bool lookbehind = false; |
|
189 bool negative = false; |
|
190 std::size_t old_mark_count = this->mark_count_; |
|
191 |
|
192 detail::sequence<BidiIter> seq, seq_end; |
|
193 string_iterator tmp = string_iterator(); |
|
194 |
|
195 syntax_option_type old_flags = this->traits_.flags(); |
|
196 |
|
197 switch(this->traits_.get_group_type(begin, end)) |
|
198 { |
|
199 case token_no_mark: |
|
200 // Don't process empty groups like (?:) or (?i) |
|
201 // BUGBUG this doesn't handle the degenerate (?:)+ correctly |
|
202 if(token_group_end == this->traits_.get_token(tmp = begin, end)) |
|
203 { |
|
204 return this->parse_atom(begin = tmp, end); |
|
205 } |
|
206 break; |
|
207 |
|
208 case token_negative_lookahead: |
|
209 negative = true; // fall-through |
|
210 case token_positive_lookahead: |
|
211 lookahead = true; |
|
212 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher()); |
|
213 break; |
|
214 |
|
215 case token_negative_lookbehind: |
|
216 negative = true; // fall-through |
|
217 case token_positive_lookbehind: |
|
218 lookbehind = true; |
|
219 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher()); |
|
220 break; |
|
221 |
|
222 case token_independent_sub_expression: |
|
223 keeper = true; |
|
224 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher()); |
|
225 break; |
|
226 |
|
227 case token_comment: |
|
228 while(detail::ensure(begin != end, error_paren, "mismatched parenthesis")) |
|
229 { |
|
230 switch(this->traits_.get_token(begin, end)) |
|
231 { |
|
232 case token_group_end: return this->parse_atom(begin, end); |
|
233 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); |
|
234 case token_literal: ++begin; |
|
235 default:; |
|
236 } |
|
237 } |
|
238 break; |
|
239 |
|
240 default: |
|
241 mark_nbr = static_cast<int>(++this->mark_count_); |
|
242 seq = detail::make_dynamic_xpression<BidiIter>(detail::mark_begin_matcher(mark_nbr)); |
|
243 seq_end = detail::make_dynamic_xpression<BidiIter>(detail::mark_end_matcher(mark_nbr)); |
|
244 break; |
|
245 } |
|
246 |
|
247 // alternates |
|
248 alternates_list alternates; |
|
249 this->parse_alternates(begin, end, alternates); |
|
250 detail::ensure |
|
251 ( |
|
252 begin != end && token_group_end == this->traits_.get_token(begin, end) |
|
253 , error_paren |
|
254 , "mismatched parenthesis" |
|
255 ); |
|
256 |
|
257 seq += detail::alternates_to_matchable(alternates, alternates_factory()); |
|
258 seq += seq_end; |
|
259 |
|
260 typedef shared_ptr<detail::matchable<BidiIter> const> xpr_type; |
|
261 bool do_save = (this->mark_count_ != old_mark_count); |
|
262 |
|
263 if(lookahead) |
|
264 { |
|
265 detail::lookahead_matcher<xpr_type> lookahead(seq.first, negative, do_save); |
|
266 seq = detail::make_dynamic_xpression<BidiIter>(lookahead); |
|
267 } |
|
268 else if(lookbehind) |
|
269 { |
|
270 detail::lookbehind_matcher<xpr_type> lookbehind(seq.first, negative, do_save); |
|
271 seq = detail::make_dynamic_xpression<BidiIter>(lookbehind); |
|
272 } |
|
273 else if(keeper) // independent sub-expression |
|
274 { |
|
275 detail::keeper_matcher<xpr_type> keeper(seq.first, do_save); |
|
276 seq = detail::make_dynamic_xpression<BidiIter>(keeper); |
|
277 } |
|
278 |
|
279 // restore the modifiers |
|
280 this->traits_.flags(old_flags); |
|
281 return seq; |
|
282 } |
|
283 |
|
284 /////////////////////////////////////////////////////////////////////////// |
|
285 // parse_charset |
|
286 /// INTERNAL ONLY |
|
287 detail::sequence<BidiIter> parse_charset(string_iterator &begin, string_iterator end) |
|
288 { |
|
289 detail::compound_charset<traits_type> chset; |
|
290 |
|
291 // call out to a helper to actually parse the character set |
|
292 detail::parse_charset(begin, end, chset, this->traits_); |
|
293 |
|
294 return detail::make_charset_xpression<BidiIter> |
|
295 ( |
|
296 chset |
|
297 , this->rxtraits() |
|
298 , this->traits_.flags() |
|
299 ); |
|
300 } |
|
301 |
|
302 /////////////////////////////////////////////////////////////////////////// |
|
303 // parse_atom |
|
304 /// INTERNAL ONLY |
|
305 detail::sequence<BidiIter> parse_atom(string_iterator &begin, string_iterator end) |
|
306 { |
|
307 using namespace regex_constants; |
|
308 escape_value esc = { 0, 0, 0, detail::escape_char }; |
|
309 string_iterator old_begin = begin; |
|
310 |
|
311 switch(this->traits_.get_token(begin, end)) |
|
312 { |
|
313 case token_literal: |
|
314 return detail::make_literal_xpression<BidiIter> |
|
315 ( |
|
316 this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits() |
|
317 ); |
|
318 |
|
319 case token_any: |
|
320 return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits()); |
|
321 |
|
322 case token_assert_begin_sequence: |
|
323 return detail::make_dynamic_xpression<BidiIter>(detail::assert_bos_matcher()); |
|
324 |
|
325 case token_assert_end_sequence: |
|
326 return detail::make_dynamic_xpression<BidiIter>(detail::assert_eos_matcher()); |
|
327 |
|
328 case token_assert_begin_line: |
|
329 return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits()); |
|
330 |
|
331 case token_assert_end_line: |
|
332 return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits()); |
|
333 |
|
334 case token_assert_word_boundary: |
|
335 return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits()); |
|
336 |
|
337 case token_assert_not_word_boundary: |
|
338 return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits()); |
|
339 |
|
340 case token_assert_word_begin: |
|
341 return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits()); |
|
342 |
|
343 case token_assert_word_end: |
|
344 return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits()); |
|
345 |
|
346 case token_escape: |
|
347 esc = this->parse_escape(begin, end); |
|
348 switch(esc.type_) |
|
349 { |
|
350 case detail::escape_mark: |
|
351 return detail::make_backref_xpression<BidiIter> |
|
352 ( |
|
353 esc.mark_nbr_, this->traits_.flags(), this->rxtraits() |
|
354 ); |
|
355 case detail::escape_char: |
|
356 return detail::make_char_xpression<BidiIter> |
|
357 ( |
|
358 esc.ch_, this->traits_.flags(), this->rxtraits() |
|
359 ); |
|
360 case detail::escape_class: |
|
361 return detail::make_posix_charset_xpression<BidiIter> |
|
362 ( |
|
363 esc.class_ |
|
364 , this->rxtraits().isctype(*begin++, this->upper_) |
|
365 , this->traits_.flags() |
|
366 , this->rxtraits() |
|
367 ); |
|
368 } |
|
369 |
|
370 case token_group_begin: |
|
371 return this->parse_group(begin, end); |
|
372 |
|
373 case token_charset_begin: |
|
374 return this->parse_charset(begin, end); |
|
375 |
|
376 case token_invalid_quantifier: |
|
377 throw regex_error(error_badrepeat, "quantifier not expected"); |
|
378 |
|
379 case token_quote_meta_begin: |
|
380 return detail::make_literal_xpression<BidiIter> |
|
381 ( |
|
382 this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits() |
|
383 ); |
|
384 |
|
385 case token_quote_meta_end: |
|
386 throw regex_error |
|
387 ( |
|
388 error_escape |
|
389 , "found quote-meta end without corresponding quote-meta begin" |
|
390 ); |
|
391 |
|
392 case token_end_of_pattern: |
|
393 break; |
|
394 |
|
395 default: |
|
396 begin = old_begin; |
|
397 break; |
|
398 } |
|
399 |
|
400 return detail::sequence<BidiIter>(); |
|
401 } |
|
402 |
|
403 /////////////////////////////////////////////////////////////////////////// |
|
404 // parse_quant |
|
405 /// INTERNAL ONLY |
|
406 detail::sequence<BidiIter> parse_quant(string_iterator &begin, string_iterator end) |
|
407 { |
|
408 BOOST_ASSERT(begin != end); |
|
409 detail::quant_spec spec = { 0, 0, false }; |
|
410 detail::sequence<BidiIter> seq = this->parse_atom(begin, end); |
|
411 |
|
412 // BUGBUG this doesn't handle the degenerate (?:)+ correctly |
|
413 if(!seq.is_empty() && begin != end && seq.first->is_quantifiable()) |
|
414 { |
|
415 if(this->traits_.get_quant_spec(begin, end, spec)) |
|
416 { |
|
417 BOOST_ASSERT(spec.min_ <= spec.max_); |
|
418 |
|
419 if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing. |
|
420 { |
|
421 seq = this->parse_quant(begin, end); |
|
422 } |
|
423 else |
|
424 { |
|
425 seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory()); |
|
426 } |
|
427 } |
|
428 } |
|
429 |
|
430 return seq; |
|
431 } |
|
432 |
|
433 /////////////////////////////////////////////////////////////////////////// |
|
434 // parse_sequence |
|
435 /// INTERNAL ONLY |
|
436 detail::sequence<BidiIter> parse_sequence(string_iterator &begin, string_iterator end) |
|
437 { |
|
438 detail::sequence<BidiIter> seq; |
|
439 |
|
440 while(begin != end) |
|
441 { |
|
442 detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end); |
|
443 |
|
444 // did we find a quantified atom? |
|
445 if(seq_quant.is_empty()) |
|
446 break; |
|
447 |
|
448 // chain it to the end of the xpression sequence |
|
449 seq += seq_quant; |
|
450 } |
|
451 |
|
452 return seq; |
|
453 } |
|
454 |
|
455 /////////////////////////////////////////////////////////////////////////// |
|
456 // parse_literal |
|
457 // scan ahead looking for char literals to be globbed together into a string literal |
|
458 /// INTERNAL ONLY |
|
459 string_type parse_literal(string_iterator &begin, string_iterator end) |
|
460 { |
|
461 using namespace regex_constants; |
|
462 BOOST_ASSERT(begin != end); |
|
463 BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); |
|
464 escape_value esc = { 0, 0, 0, detail::escape_char }; |
|
465 string_type literal(1, *begin); |
|
466 |
|
467 for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) |
|
468 { |
|
469 detail::quant_spec spec; |
|
470 if(this->traits_.get_quant_spec(tmp, end, spec)) |
|
471 { |
|
472 if(literal.size() != 1) |
|
473 { |
|
474 begin = prev; |
|
475 literal.erase(literal.size() - 1); |
|
476 } |
|
477 return literal; |
|
478 } |
|
479 else switch(this->traits_.get_token(tmp, end)) |
|
480 { |
|
481 case token_escape: |
|
482 esc = this->parse_escape(tmp, end); |
|
483 if(detail::escape_char != esc.type_) return literal; |
|
484 literal += esc.ch_; |
|
485 break; |
|
486 case token_literal: |
|
487 literal += *tmp++; |
|
488 break; |
|
489 default: |
|
490 return literal; |
|
491 } |
|
492 } |
|
493 |
|
494 return literal; |
|
495 } |
|
496 |
|
497 /////////////////////////////////////////////////////////////////////////// |
|
498 // parse_quote_meta |
|
499 // scan ahead looking for char literals to be globbed together into a string literal |
|
500 /// INTERNAL ONLY |
|
501 string_type parse_quote_meta(string_iterator &begin, string_iterator end) |
|
502 { |
|
503 using namespace regex_constants; |
|
504 string_iterator old_begin = begin, old_end; |
|
505 while(end != (old_end = begin)) |
|
506 { |
|
507 switch(this->traits_.get_token(begin, end)) |
|
508 { |
|
509 case token_quote_meta_end: return string_type(old_begin, old_end); |
|
510 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence"); |
|
511 case token_literal: ++begin; |
|
512 default:; |
|
513 } |
|
514 } |
|
515 return string_type(old_begin, begin); |
|
516 } |
|
517 |
|
518 /////////////////////////////////////////////////////////////////////////////// |
|
519 // parse_escape |
|
520 /// INTERNAL ONLY |
|
521 escape_value parse_escape(string_iterator &begin, string_iterator end) |
|
522 { |
|
523 detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence"); |
|
524 |
|
525 // first, check to see if this can be a backreference |
|
526 if(0 < this->rxtraits().value(*begin, 10)) |
|
527 { |
|
528 // Parse at most 3 decimal digits. |
|
529 string_iterator tmp = begin; |
|
530 int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999); |
|
531 |
|
532 // If the resulting number could conceivably be a backref, then it is. |
|
533 if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_)) |
|
534 { |
|
535 begin = tmp; |
|
536 escape_value esc = {0, mark_nbr, 0, detail::escape_mark}; |
|
537 return esc; |
|
538 } |
|
539 } |
|
540 |
|
541 // Not a backreference, defer to the parse_escape helper |
|
542 return detail::parse_escape(begin, end, this->traits_); |
|
543 } |
|
544 |
|
545 std::size_t mark_count_; |
|
546 std::size_t hidden_mark_count_; |
|
547 CompilerTraits traits_; |
|
548 typename RegexTraits::char_class_type upper_; |
|
549 }; |
|
550 |
|
551 }} // namespace boost::xpressive |
|
552 |
|
553 #endif |