ossrv_pub/boost_apis/boost/xpressive/regex_compiler.hpp
changeset 0 e4d67989cc36
equal deleted inserted replaced
-1:000000000000 0:e4d67989cc36
       
     1 ///////////////////////////////////////////////////////////////////////////////
       
     2 /// \file regex_compiler.hpp
       
     3 /// Contains the definition of regex_compiler, a factory for building regex objects
       
     4 /// from strings.
       
     5 //
       
     6 //  Copyright 2004 Eric Niebler. Distributed under the Boost
       
     7 //  Software License, Version 1.0. (See accompanying file
       
     8 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
       
     9 
       
    10 #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
       
    11 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005
       
    12 
       
    13 // MS compatible compilers support #pragma once
       
    14 #if defined(_MSC_VER) && (_MSC_VER >= 1020)
       
    15 # pragma once
       
    16 #endif
       
    17 
       
    18 #include <boost/xpressive/basic_regex.hpp>
       
    19 #include <boost/xpressive/detail/dynamic/parser.hpp>
       
    20 #include <boost/xpressive/detail/dynamic/parse_charset.hpp>
       
    21 #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
       
    22 #include <boost/xpressive/detail/dynamic/parser_traits.hpp>
       
    23 #include <boost/xpressive/detail/core/linker.hpp>
       
    24 #include <boost/xpressive/detail/core/optimize.hpp>
       
    25 
       
    26 namespace boost { namespace xpressive
       
    27 {
       
    28 
       
    29 ///////////////////////////////////////////////////////////////////////////////
       
    30 // regex_compiler
       
    31 //
       
    32 /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string.
       
    33 ///
       
    34 /// Class template regex_compiler is used to construct a basic_regex object from a string. The string
       
    35 /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale,
       
    36 /// after which all basic_regex objects created with that regex_compiler object will use that locale.
       
    37 /// After creating a regex_compiler object, and optionally imbuing it with a locale, you can call the
       
    38 /// compile() method to construct a basic_regex object, passing it the string representing the regular
       
    39 /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex
       
    40 /// objects compiled from the same string will have different regex_id's.
       
    41 template<typename BidiIter, typename RegexTraits, typename CompilerTraits>
       
    42 struct regex_compiler
       
    43 {
       
    44     typedef BidiIter iterator_type; // iterator type of the basic_regex objects compile() returns
       
    45     typedef typename iterator_value<BidiIter>::type char_type; // character type the iterator refers to
       
    46     typedef std::basic_string<char_type> string_type; // type of the pattern string accepted by compile()
       
    47     typedef regex_constants::syntax_option_type flag_type; // syntax flags accepted by compile()
       
    48     typedef RegexTraits traits_type; // the regex traits template argument
       
    49     typedef typename traits_type::char_class_type char_class_type; // character-class type supplied by the traits
       
    50     typedef typename traits_type::locale_type locale_type; // locale type used by imbue() and getloc()
       
    51 
       
    52     explicit regex_compiler(RegexTraits const &traits = RegexTraits())
       
    53       : mark_count_(0)
       
    54       , hidden_mark_count_(0)
       
    55       , traits_(traits)
       
    56       , upper_(0)
       
    57     {
       
    58         this->upper_ = lookup_classname(this->rxtraits(), "upper");
       
    59         BOOST_ASSERT(0 != this->upper_);
       
    60     }
       
    61 
       
    62     ///////////////////////////////////////////////////////////////////////////
       
    63     // imbue
       
    64     /// Specify the locale to be used by a regex_compiler.
       
    65     ///
       
    66     /// \param loc The locale that this regex_compiler should use.
       
    67     /// \return The previous locale.
       
    68     locale_type imbue(locale_type loc)
       
    69     {
       
    70         locale_type oldloc = this->traits_.imbue(loc);
       
    71         this->upper_ = lookup_classname(this->rxtraits(), "upper");
       
    72         BOOST_ASSERT(0 != this->upper_);
       
    73         return oldloc;
       
    74     }
       
    75 
       
    76     ///////////////////////////////////////////////////////////////////////////
       
    77     // getloc
       
    78     /// Get the locale used by a regex_compiler.
       
    79     ///
       
    80     /// \return The locale that this regex_compiler uses.
       
    81     locale_type getloc() const
       
    82     {
       
    83         return this->traits_.getloc();
       
    84     }
       
    85 
       
    86     ///////////////////////////////////////////////////////////////////////////
       
    87     // compile
       
    88     /// Builds a basic_regex object from a std::string.
       
    89     ///
       
    90     /// \param  pat A std::string containing the regular expression pattern.
       
    91     /// \param  flags Optional bitmask that determines how the pat string is interpreted. (See syntax_option_type.)
       
    92     /// \return A basic_regex object corresponding to the regular expression represented by the string.
       
    93     /// \pre    The std::string pat contains a valid string-based representation of a regular expression.
       
    94     /// \throw  regex_error when the string has invalid regular expression syntax.
       
    95     basic_regex<BidiIter> compile(string_type pat, flag_type flags = regex_constants::ECMAScript)
       
    96     {
       
    97         this->reset();
       
    98         this->traits_.flags(flags);
       
    99 
       
   100         string_iterator begin = pat.begin(), end = pat.end();
       
   101 
       
   102         // at the top level, a regex is a sequence of alternates
       
   103         alternates_list alternates;
       
   104         this->parse_alternates(begin, end, alternates);
       
   105         detail::ensure(begin == end, regex_constants::error_paren, "mismatched parenthesis");
       
   106 
       
   107         // convert the alternates list to the appropriate matcher and terminate the sequence
       
   108         detail::sequence<BidiIter> seq = detail::alternates_to_matchable(alternates, alternates_factory());
       
   109         seq += detail::make_dynamic_xpression<BidiIter>(detail::end_matcher());
       
   110 
       
   111         // fill in the back-pointers by visiting the regex parse tree
       
   112         detail::xpression_linker<char_type> linker(this->rxtraits());
       
   113         seq.first->link(linker);
       
   114 
       
   115         // bundle the regex information into a regex_impl object
       
   116         detail::regex_impl<BidiIter> impl;
       
   117         impl.xpr_ = seq.first;
       
   118         impl.traits_.reset(new RegexTraits(this->rxtraits()));
       
   119         impl.mark_count_ = this->mark_count_;
       
   120         impl.hidden_mark_count_ = this->hidden_mark_count_;
       
   121 
       
   122         // optimization: get the peek chars OR the boyer-moore search string
       
   123         detail::optimize_regex(impl, this->rxtraits(), detail::is_random<BidiIter>());
       
   124 
       
   125         return detail::core_access<BidiIter>::make_regex(impl);
       
   126     }
       
   127 
       
   128 private:
       
   129 
       
   130     typedef typename string_type::const_iterator string_iterator; // iterator used to walk the pattern string
       
   131     typedef std::list<detail::sequence<BidiIter> > alternates_list; // one parsed sequence per '|'-separated alternate
       
   132     typedef detail::escape_value<char_type, char_class_type> escape_value; // result of parsing an escape (see parse_escape)
       
   133     typedef detail::alternates_factory_impl<BidiIter, traits_type> alternates_factory; // handed to alternates_to_matchable() and quantify()
       
   134 
       
   135     ///////////////////////////////////////////////////////////////////////////
       
   136     // reset
       
   137     /// INTERNAL ONLY
       
   138     void reset()
       
   139     {
       
   140         this->mark_count_ = 0;
       
   141         this->hidden_mark_count_ = 0;
       
   142         this->traits_.flags(regex_constants::ECMAScript);
       
   143     }
       
   144 
       
   145     ///////////////////////////////////////////////////////////////////////////
       
   146     // rxtraits
       
   147     /// INTERNAL ONLY
       
   148     traits_type &rxtraits()
       
   149     {
       
   150         return this->traits_.traits();
       
   151     }
       
   152 
       
   153     ///////////////////////////////////////////////////////////////////////////
       
   154     // rxtraits (const overload)
       
   155     /// INTERNAL ONLY
       
   156     traits_type const &rxtraits() const
       
   157     {
       
   158         return this->traits_.traits();
       
   159     }
       
   160 
       
   161     ///////////////////////////////////////////////////////////////////////////
       
   162     // parse_alternates
       
   163     /// INTERNAL ONLY
       
   164     void parse_alternates(string_iterator &begin, string_iterator end, alternates_list &alternates)
       
   165     {
       
   166         using namespace regex_constants;
       
   167         string_iterator old_begin;
       
   168 
       
   169         do
       
   170         {
       
   171             alternates.push_back(this->parse_sequence(begin, end));
       
   172             old_begin = begin;
       
   173         }
       
   174         while(begin != end && token_alternate == this->traits_.get_token(begin, end));
       
   175 
       
   176         begin = old_begin;
       
   177     }
       
   178 
       
   179     ///////////////////////////////////////////////////////////////////////////
       
   180     // parse_group
       
   181     /// INTERNAL ONLY
       
   182     detail::sequence<BidiIter> parse_group(string_iterator &begin, string_iterator end)
       
   183     {
       
   184         using namespace regex_constants;
       
   185         int mark_nbr = 0;
       
   186         bool keeper = false;
       
   187         bool lookahead = false;
       
   188         bool lookbehind = false;
       
   189         bool negative = false;
       
   190         std::size_t old_mark_count = this->mark_count_;
       
   191 
       
   192         detail::sequence<BidiIter> seq, seq_end;
       
   193         string_iterator tmp = string_iterator();
       
   194 
       
   195         syntax_option_type old_flags = this->traits_.flags();
       
   196 
       
   197         switch(this->traits_.get_group_type(begin, end))
       
   198         {
       
   199         case token_no_mark:
       
   200             // Don't process empty groups like (?:) or (?i)
       
   201             // BUGBUG this doesn't handle the degenerate (?:)+ correctly
       
   202             if(token_group_end == this->traits_.get_token(tmp = begin, end))
       
   203             {
       
   204                 return this->parse_atom(begin = tmp, end);
       
   205             }
       
   206             break;
       
   207 
       
   208         case token_negative_lookahead:
       
   209             negative = true; // fall-through
       
   210         case token_positive_lookahead:
       
   211             lookahead = true;
       
   212             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
       
   213             break;
       
   214 
       
   215         case token_negative_lookbehind:
       
   216             negative = true; // fall-through
       
   217         case token_positive_lookbehind:
       
   218             lookbehind = true;
       
   219             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
       
   220             break;
       
   221 
       
   222         case token_independent_sub_expression:
       
   223             keeper = true;
       
   224             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::true_matcher());
       
   225             break;
       
   226 
       
   227         case token_comment:
       
   228             while(detail::ensure(begin != end, error_paren, "mismatched parenthesis"))
       
   229             {
       
   230                 switch(this->traits_.get_token(begin, end))
       
   231                 {
       
   232                 case token_group_end: return this->parse_atom(begin, end);
       
   233                 case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
       
   234                 case token_literal: ++begin;
       
   235                 default:;
       
   236                 }
       
   237             }
       
   238             break;
       
   239 
       
   240         default:
       
   241             mark_nbr = static_cast<int>(++this->mark_count_);
       
   242             seq = detail::make_dynamic_xpression<BidiIter>(detail::mark_begin_matcher(mark_nbr));
       
   243             seq_end = detail::make_dynamic_xpression<BidiIter>(detail::mark_end_matcher(mark_nbr));
       
   244             break;
       
   245         }
       
   246 
       
   247         // alternates
       
   248         alternates_list alternates;
       
   249         this->parse_alternates(begin, end, alternates);
       
   250         detail::ensure
       
   251         (
       
   252             begin != end && token_group_end == this->traits_.get_token(begin, end)
       
   253           , error_paren
       
   254           , "mismatched parenthesis"
       
   255         );
       
   256 
       
   257         seq += detail::alternates_to_matchable(alternates, alternates_factory());
       
   258         seq += seq_end;
       
   259 
       
   260         typedef shared_ptr<detail::matchable<BidiIter> const> xpr_type;
       
   261         bool do_save = (this->mark_count_ != old_mark_count);
       
   262 
       
   263         if(lookahead)
       
   264         {
       
   265             detail::lookahead_matcher<xpr_type> lookahead(seq.first, negative, do_save);
       
   266             seq = detail::make_dynamic_xpression<BidiIter>(lookahead);
       
   267         }
       
   268         else if(lookbehind)
       
   269         {
       
   270             detail::lookbehind_matcher<xpr_type> lookbehind(seq.first, negative, do_save);
       
   271             seq = detail::make_dynamic_xpression<BidiIter>(lookbehind);
       
   272         }
       
   273         else if(keeper) // independent sub-expression
       
   274         {
       
   275             detail::keeper_matcher<xpr_type> keeper(seq.first, do_save);
       
   276             seq = detail::make_dynamic_xpression<BidiIter>(keeper);
       
   277         }
       
   278 
       
   279         // restore the modifiers
       
   280         this->traits_.flags(old_flags);
       
   281         return seq;
       
   282     }
       
   283 
       
   284     ///////////////////////////////////////////////////////////////////////////
       
   285     // parse_charset
       
   286     /// INTERNAL ONLY
       
   287     detail::sequence<BidiIter> parse_charset(string_iterator &begin, string_iterator end)
       
   288     {
       
   289         detail::compound_charset<traits_type> chset;
       
   290 
       
   291         // call out to a helper to actually parse the character set
       
   292         detail::parse_charset(begin, end, chset, this->traits_);
       
   293 
       
   294         return detail::make_charset_xpression<BidiIter>
       
   295         (
       
   296             chset
       
   297           , this->rxtraits()
       
   298           , this->traits_.flags()
       
   299         );
       
   300     }
       
   301 
       
   302     ///////////////////////////////////////////////////////////////////////////
       
   303     // parse_atom
       
   304     /// INTERNAL ONLY
       
   305     detail::sequence<BidiIter> parse_atom(string_iterator &begin, string_iterator end)
       
   306     {
       
   307         using namespace regex_constants;
       
   308         escape_value esc = { 0, 0, 0, detail::escape_char };
       
   309         string_iterator old_begin = begin;
       
   310 
       
   311         switch(this->traits_.get_token(begin, end))
       
   312         {
       
   313         case token_literal:
       
   314             return detail::make_literal_xpression<BidiIter>
       
   315             (
       
   316                 this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits()
       
   317             );
       
   318 
       
   319         case token_any:
       
   320             return detail::make_any_xpression<BidiIter>(this->traits_.flags(), this->rxtraits());
       
   321 
       
   322         case token_assert_begin_sequence:
       
   323             return detail::make_dynamic_xpression<BidiIter>(detail::assert_bos_matcher());
       
   324 
       
   325         case token_assert_end_sequence:
       
   326             return detail::make_dynamic_xpression<BidiIter>(detail::assert_eos_matcher());
       
   327 
       
   328         case token_assert_begin_line:
       
   329             return detail::make_assert_begin_line<BidiIter>(this->traits_.flags(), this->rxtraits());
       
   330 
       
   331         case token_assert_end_line:
       
   332             return detail::make_assert_end_line<BidiIter>(this->traits_.flags(), this->rxtraits());
       
   333 
       
   334         case token_assert_word_boundary:
       
   335             return detail::make_assert_word<BidiIter>(detail::word_boundary<true>(), this->rxtraits());
       
   336 
       
   337         case token_assert_not_word_boundary:
       
   338             return detail::make_assert_word<BidiIter>(detail::word_boundary<false>(), this->rxtraits());
       
   339 
       
   340         case token_assert_word_begin:
       
   341             return detail::make_assert_word<BidiIter>(detail::word_begin(), this->rxtraits());
       
   342 
       
   343         case token_assert_word_end:
       
   344             return detail::make_assert_word<BidiIter>(detail::word_end(), this->rxtraits());
       
   345 
       
   346         case token_escape:
       
   347             esc = this->parse_escape(begin, end);
       
   348             switch(esc.type_)
       
   349             {
       
   350             case detail::escape_mark:
       
   351                 return detail::make_backref_xpression<BidiIter>
       
   352                 (
       
   353                     esc.mark_nbr_, this->traits_.flags(), this->rxtraits()
       
   354                 );
       
   355             case detail::escape_char:
       
   356                 return detail::make_char_xpression<BidiIter>
       
   357                 (
       
   358                     esc.ch_, this->traits_.flags(), this->rxtraits()
       
   359                 );
       
   360             case detail::escape_class:
       
   361                 return detail::make_posix_charset_xpression<BidiIter>
       
   362                 (
       
   363                     esc.class_
       
   364                   , this->rxtraits().isctype(*begin++, this->upper_)
       
   365                   , this->traits_.flags()
       
   366                   , this->rxtraits()
       
   367                 );
       
   368             }
       
   369 
       
   370         case token_group_begin:
       
   371             return this->parse_group(begin, end);
       
   372 
       
   373         case token_charset_begin:
       
   374             return this->parse_charset(begin, end);
       
   375 
       
   376         case token_invalid_quantifier:
       
   377             throw regex_error(error_badrepeat, "quantifier not expected");
       
   378 
       
   379         case token_quote_meta_begin:
       
   380             return detail::make_literal_xpression<BidiIter>
       
   381             (
       
   382                 this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits()
       
   383             );
       
   384 
       
   385         case token_quote_meta_end:
       
   386             throw regex_error
       
   387             (
       
   388                 error_escape
       
   389               , "found quote-meta end without corresponding quote-meta begin"
       
   390             );
       
   391 
       
   392         case token_end_of_pattern:
       
   393             break;
       
   394 
       
   395         default:
       
   396             begin = old_begin;
       
   397             break;
       
   398         }
       
   399 
       
   400         return detail::sequence<BidiIter>();
       
   401     }
       
   402 
       
   403     ///////////////////////////////////////////////////////////////////////////
       
   404     // parse_quant
       
   405     /// INTERNAL ONLY
       
   406     detail::sequence<BidiIter> parse_quant(string_iterator &begin, string_iterator end)
       
   407     {
       
   408         BOOST_ASSERT(begin != end);
       
   409         detail::quant_spec spec = { 0, 0, false };
       
   410         detail::sequence<BidiIter> seq = this->parse_atom(begin, end);
       
   411 
       
   412         // BUGBUG this doesn't handle the degenerate (?:)+ correctly
       
   413         if(!seq.is_empty() && begin != end && seq.first->is_quantifiable())
       
   414         {
       
   415             if(this->traits_.get_quant_spec(begin, end, spec))
       
   416             {
       
   417                 BOOST_ASSERT(spec.min_ <= spec.max_);
       
   418 
       
   419                 if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing.
       
   420                 {
       
   421                     seq = this->parse_quant(begin, end);
       
   422                 }
       
   423                 else
       
   424                 {
       
   425                     seq = seq.first->quantify(spec, this->hidden_mark_count_, seq, alternates_factory());
       
   426                 }
       
   427             }
       
   428         }
       
   429 
       
   430         return seq;
       
   431     }
       
   432 
       
   433     ///////////////////////////////////////////////////////////////////////////
       
   434     // parse_sequence
       
   435     /// INTERNAL ONLY
       
   436     detail::sequence<BidiIter> parse_sequence(string_iterator &begin, string_iterator end)
       
   437     {
       
   438         detail::sequence<BidiIter> seq;
       
   439 
       
   440         while(begin != end)
       
   441         {
       
   442             detail::sequence<BidiIter> seq_quant = this->parse_quant(begin, end);
       
   443 
       
   444             // did we find a quantified atom?
       
   445             if(seq_quant.is_empty())
       
   446                 break;
       
   447 
       
   448             // chain it to the end of the xpression sequence
       
   449             seq += seq_quant;
       
   450         }
       
   451 
       
   452         return seq;
       
   453     }
       
   454 
       
   455     ///////////////////////////////////////////////////////////////////////////
       
   456     // parse_literal
       
   457     //  scan ahead looking for char literals to be globbed together into a string literal
       
   458     /// INTERNAL ONLY
       
   459     string_type parse_literal(string_iterator &begin, string_iterator end)
       
   460     {
       
   461         using namespace regex_constants;
       
   462         BOOST_ASSERT(begin != end);
       
   463         BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end));
       
   464         escape_value esc = { 0, 0, 0, detail::escape_char };
       
   465         string_type literal(1, *begin);
       
   466 
       
   467         for(string_iterator prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp)
       
   468         {
       
   469             detail::quant_spec spec;
       
   470             if(this->traits_.get_quant_spec(tmp, end, spec))
       
   471             {
       
   472                 if(literal.size() != 1)
       
   473                 {
       
   474                     begin = prev;
       
   475                     literal.erase(literal.size() - 1);
       
   476                 }
       
   477                 return literal;
       
   478             }
       
   479             else switch(this->traits_.get_token(tmp, end))
       
   480             {
       
   481             case token_escape:
       
   482                 esc = this->parse_escape(tmp, end);
       
   483                 if(detail::escape_char != esc.type_) return literal;
       
   484                 literal += esc.ch_;
       
   485                 break;
       
   486             case token_literal:
       
   487                 literal += *tmp++;
       
   488                 break;
       
   489             default:
       
   490                 return literal;
       
   491             }
       
   492         }
       
   493 
       
   494         return literal;
       
   495     }
       
   496 
       
   497     ///////////////////////////////////////////////////////////////////////////
       
   498     // parse_quote_meta
       
   499     //  scan ahead to the quote-meta end token (or the end of the pattern) and return the quoted text as one string literal
       
   500     /// INTERNAL ONLY
       
   501     string_type parse_quote_meta(string_iterator &begin, string_iterator end)
       
   502     {
       
   503         using namespace regex_constants;
       
   504         string_iterator old_begin = begin, old_end;
       
   505         while(end != (old_end = begin))
       
   506         {
       
   507             switch(this->traits_.get_token(begin, end))
       
   508             {
       
   509             case token_quote_meta_end: return string_type(old_begin, old_end);
       
   510             case token_escape: detail::ensure(begin != end, error_escape, "incomplete escape sequence");
       
   511             case token_literal: ++begin;
       
   512             default:;
       
   513             }
       
   514         }
       
   515         return string_type(old_begin, begin);
       
   516     }
       
   517 
       
   518     ///////////////////////////////////////////////////////////////////////////////
       
   519     // parse_escape
       
   520     /// INTERNAL ONLY
       
   521     escape_value parse_escape(string_iterator &begin, string_iterator end)
       
   522     {
       
   523         detail::ensure(begin != end, regex_constants::error_escape, "incomplete escape sequence");
       
   524 
       
   525         // first, check to see if this can be a backreference
       
   526         if(0 < this->rxtraits().value(*begin, 10))
       
   527         {
       
   528             // Parse at most 3 decimal digits.
       
   529             string_iterator tmp = begin;
       
   530             int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999);
       
   531 
       
   532             // If the resulting number could conceivably be a backref, then it is.
       
   533             if(10 > mark_nbr || mark_nbr <= static_cast<int>(this->mark_count_))
       
   534             {
       
   535                 begin = tmp;
       
   536                 escape_value esc = {0, mark_nbr, 0, detail::escape_mark};
       
   537                 return esc;
       
   538             }
       
   539         }
       
   540 
       
   541         // Not a backreference, defer to the parse_escape helper
       
   542         return detail::parse_escape(begin, end, this->traits_);
       
   543     }
       
   544 
       
   545     std::size_t mark_count_; // incremented by parse_group for each marked group; zeroed by reset()
       
   546     std::size_t hidden_mark_count_; // passed to quantify() by parse_quant; zeroed by reset(), copied into the regex_impl by compile()
       
   547     CompilerTraits traits_; // supplies tokens, quantifier specs and syntax flags, and wraps the regex traits (see rxtraits())
       
   548     typename RegexTraits::char_class_type upper_; // character class looked up for "upper"; refreshed by imbue(), used by parse_atom for class escapes
       
   549 };
       
   550 
       
   551 }} // namespace boost::xpressive
       
   552 
       
   553 #endif