600
|
1 |
/*
|
|
2 |
*
|
|
3 |
* Copyright (c) 1998-2002
|
|
4 |
* John Maddock
|
|
5 |
*
|
|
6 |
* Use, modification and distribution are subject to the
|
|
7 |
* Boost Software License, Version 1.0. (See accompanying file
|
|
8 |
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
9 |
*
|
|
10 |
*/
|
|
11 |
|
|
12 |
/*
|
|
13 |
* LOCATION: see http://www.boost.org for most recent version.
|
|
14 |
* FILE states.hpp
|
|
15 |
* VERSION see <boost/version.hpp>
|
|
16 |
* DESCRIPTION: Declares internal state machine structures.
|
|
17 |
*/
|
|
18 |
|
|
19 |
#ifndef BOOST_REGEX_V4_STATES_HPP
|
|
20 |
#define BOOST_REGEX_V4_STATES_HPP
|
|
21 |
|
|
22 |
#ifdef BOOST_MSVC
|
|
23 |
#pragma warning(push)
|
|
24 |
#pragma warning(disable: 4103)
|
|
25 |
#endif
|
|
26 |
#ifdef BOOST_HAS_ABI_HEADERS
|
|
27 |
# include BOOST_ABI_PREFIX
|
|
28 |
#endif
|
|
29 |
#ifdef BOOST_MSVC
|
|
30 |
#pragma warning(pop)
|
|
31 |
#endif
|
|
32 |
|
|
33 |
namespace boost{
|
|
34 |
namespace re_detail{
|
|
35 |
|
|
36 |
/*** mask_type *******************************************************
Whenever we have a choice of two alternatives, we use an array of bytes
to indicate which of the two alternatives it is possible to take for any
given input character.  If mask_take is set, then we can take the next
state, and if mask_skip is set then we can take the alternative.
***********************************************************************/
enum mask_type
{
   mask_take = 1,                      // the next state can be taken
   mask_skip = 2,                      // the alternative can be taken
   mask_init = 4,                      // NOTE(review): meaning not visible in this header - confirm at the use sites
   mask_any  = mask_skip | mask_take,  // either of the two branches (value 3)
   mask_all  = mask_any                // same value as mask_any; mask_init is NOT included
};
|
|
50 |
|
|
51 |
/*** helpers **********************************************************
These helpers let us use function overload resolution to detect whether
we have narrow or wide character strings:
***********************************************************************/

// Empty tag types: pass an instance of is_byte<charT>::width_type to an
// overloaded function to select the narrow or the wide implementation.
struct _narrow_type{};
struct _wide_type{};

// Primary template: any character type not listed below is treated as wide.
template <class charT> struct is_byte                { typedef _wide_type width_type; };

// The three single-byte character types are tagged as narrow.
template<>             struct is_byte<char>          { typedef _narrow_type width_type; };
template<>             struct is_byte<unsigned char> { typedef _narrow_type width_type; };
template<>             struct is_byte<signed char>   { typedef _narrow_type width_type; };
|
|
62 |
|
|
63 |
/*** enum syntax_element_type ******************************************
Every record in the state machine falls into one of the following types.
The enumerators are numbered consecutively from 0; each one is defined as
its predecessor plus one.
***********************************************************************/
enum syntax_element_type
{
   // start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_startmark = 0,
   // end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark = syntax_element_startmark + 1,
   // any sequence of literal characters
   syntax_element_literal = syntax_element_endmark + 1,
   // start of line assertion: ^
   syntax_element_start_line = syntax_element_literal + 1,
   // end of line assertion $
   syntax_element_end_line = syntax_element_start_line + 1,
   // match any character: .
   syntax_element_wild = syntax_element_end_line + 1,
   // end of expression: we have a match when we get here
   syntax_element_match = syntax_element_wild + 1,
   // perl style word boundary: \b
   syntax_element_word_boundary = syntax_element_match + 1,
   // perl style within word boundary: \B
   syntax_element_within_word = syntax_element_word_boundary + 1,
   // start of word assertion: \<
   syntax_element_word_start = syntax_element_within_word + 1,
   // end of word assertion: \>
   syntax_element_word_end = syntax_element_word_start + 1,
   // start of buffer assertion: \`
   syntax_element_buffer_start = syntax_element_word_end + 1,
   // end of buffer assertion: \'
   syntax_element_buffer_end = syntax_element_buffer_start + 1,
   // backreference to previously matched sub-expression
   syntax_element_backref = syntax_element_buffer_end + 1,
   // either a wide character set [..] or one with multicharacter collating elements:
   syntax_element_long_set = syntax_element_backref + 1,
   // narrow character set: [...]
   syntax_element_set = syntax_element_long_set + 1,
   // jump to a new state in the machine:
   syntax_element_jump = syntax_element_set + 1,
   // choose between two production states:
   syntax_element_alt = syntax_element_jump + 1,
   // a repeat
   syntax_element_rep = syntax_element_alt + 1,
   // match a combining character sequence
   syntax_element_combining = syntax_element_rep + 1,
   // perl style soft buffer end: \z
   syntax_element_soft_buffer_end = syntax_element_combining + 1,
   // perl style continuation: \G
   syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
   // single character repeats:
   syntax_element_dot_rep = syntax_element_restart_continue + 1,
   syntax_element_char_rep = syntax_element_dot_rep + 1,
   syntax_element_short_set_rep = syntax_element_char_rep + 1,
   syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
   // a backstep for lookbehind repeats:
   syntax_element_backstep = syntax_element_long_set_rep + 1,
   // an assertion that a mark was matched:
   syntax_element_assert_backref = syntax_element_backstep + 1,
   // NOTE(review): no comment in the original; presumably marks a switch of
   // case sensitivity (compare struct re_case) - confirm
   syntax_element_toggle_case = syntax_element_assert_backref + 1
};
|
|
123 |
|
|
124 |
#ifdef BOOST_REGEX_DEBUG
// Only declared in debug builds (BOOST_REGEX_DEBUG defined); no definition
// is provided in this header.
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
std::ostream& operator<<(std::ostream&, syntax_element_type);
#endif
|
|
128 |
|
|
129 |
// Forward declaration: offset_type below holds a pointer to a state record
// before the record type itself is defined.
struct re_syntax_base;

/*** union offset_type ************************************************
Points to another state in the machine.  During machine construction
we use integral offsets, but these are converted to pointers before
execution of the machine.
***********************************************************************/
union offset_type
{
   re_syntax_base*   p;   // pointer form, used once the machine is ready to execute
   std::ptrdiff_t    i;   // integral-offset form, used while the machine is being constructed
};
|
|
141 |
|
|
142 |
/*** struct re_syntax_base ********************************************
|
|
143 |
Base class for all states in the machine.
|
|
144 |
***********************************************************************/
|
|
145 |
struct re_syntax_base
|
|
146 |
{
|
|
147 |
syntax_element_type type; // what kind of state this is
|
|
148 |
offset_type next; // next state in the machine
|
|
149 |
};
|
|
150 |
|
|
151 |
/*** struct re_brace **************************************************
|
|
152 |
A marked parenthesis.
|
|
153 |
***********************************************************************/
|
|
154 |
struct re_brace : public re_syntax_base
|
|
155 |
{
|
|
156 |
// The index to match, can be zero (don't mark the sub-expression)
|
|
157 |
// or negative (for perl style (?...) extentions):
|
|
158 |
int index;
|
|
159 |
};
|
|
160 |
|
|
161 |
/*** struct re_dot **************************************************
Match anything.
***********************************************************************/
// Unnamed enumeration of newline-handling constants, declared immediately
// before struct re_dot.
// NOTE(review): the enumerators form two groups (force_* / dont_care and
// test_*), and force_newline and test_not_newline deliberately share the
// value 2; how each group is consumed is not visible in this header -
// confirm against the matcher code.
enum
{
   dont_care = 1,
   force_not_newline = 0,
   force_newline = 2,

   test_not_newline = 2,
   test_newline = 3
};
|
|
173 |
struct re_dot : public re_syntax_base
|
|
174 |
{
|
|
175 |
unsigned char mask;
|
|
176 |
};
|
|
177 |
|
|
178 |
/*** struct re_literal ************************************************
|
|
179 |
A string of literals, following this structure will be an
|
|
180 |
array of characters: charT[length]
|
|
181 |
***********************************************************************/
|
|
182 |
struct re_literal : public re_syntax_base
|
|
183 |
{
|
|
184 |
unsigned int length;
|
|
185 |
};
|
|
186 |
|
|
187 |
/*** struct re_case ************************************************
|
|
188 |
Indicates whether we are moving to a case insensive block or not
|
|
189 |
***********************************************************************/
|
|
190 |
struct re_case : public re_syntax_base
|
|
191 |
{
|
|
192 |
bool icase;
|
|
193 |
};
|
|
194 |
|
|
195 |
/*** struct re_set_long ***********************************************
|
|
196 |
A wide character set of characters, following this structure will be
|
|
197 |
an array of type charT:
|
|
198 |
First csingles null-terminated strings
|
|
199 |
Then 2 * cranges NULL terminated strings
|
|
200 |
Then cequivalents NULL terminated strings
|
|
201 |
***********************************************************************/
|
|
202 |
template <class mask_type>
|
|
203 |
struct re_set_long : public re_syntax_base
|
|
204 |
{
|
|
205 |
unsigned int csingles, cranges, cequivalents;
|
|
206 |
mask_type cclasses;
|
|
207 |
mask_type cnclasses;
|
|
208 |
bool isnot;
|
|
209 |
bool singleton;
|
|
210 |
};
|
|
211 |
|
|
212 |
/*** struct re_set ****************************************************
|
|
213 |
A set of narrow-characters, matches any of _map which is none-zero
|
|
214 |
***********************************************************************/
|
|
215 |
struct re_set : public re_syntax_base
|
|
216 |
{
|
|
217 |
unsigned char _map[1 << CHAR_BIT];
|
|
218 |
};
|
|
219 |
|
|
220 |
/*** struct re_jump ***************************************************
|
|
221 |
Jump to a new location in the machine (not next).
|
|
222 |
***********************************************************************/
|
|
223 |
struct re_jump : public re_syntax_base
|
|
224 |
{
|
|
225 |
offset_type alt; // location to jump to
|
|
226 |
};
|
|
227 |
|
|
228 |
/*** struct re_alt ***************************************************
|
|
229 |
Jump to a new location in the machine (possibly next).
|
|
230 |
***********************************************************************/
|
|
231 |
struct re_alt : public re_jump
|
|
232 |
{
|
|
233 |
unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
|
|
234 |
unsigned int can_be_null; // true if we match a NULL string
|
|
235 |
};
|
|
236 |
|
|
237 |
/*** struct re_repeat *************************************************
|
|
238 |
Repeat a section of the machine
|
|
239 |
***********************************************************************/
|
|
240 |
struct re_repeat : public re_alt
|
|
241 |
{
|
|
242 |
std::size_t min, max; // min and max allowable repeats
|
|
243 |
int state_id; // Unique identifier for this repeat
|
|
244 |
bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
|
|
245 |
bool greedy; // True if this is a greedy repeat
|
|
246 |
};
|
|
247 |
|
|
248 |
/*** enum re_jump_size_type *******************************************
|
|
249 |
Provides compiled size of re_jump structure (allowing for trailing alignment).
|
|
250 |
We provide this so we know how manybytes to insert when constructing the machine
|
|
251 |
(The value of padding_mask is defined in regex_raw_buffer.hpp).
|
|
252 |
***********************************************************************/
|
|
253 |
enum re_jump_size_type
|
|
254 |
{
|
|
255 |
re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
|
|
256 |
re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
|
|
257 |
re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
|
|
258 |
};
|
|
259 |
|
|
260 |
/*** proc re_is_set_member *********************************************
|
|
261 |
Forward declaration: we'll need this one later...
|
|
262 |
***********************************************************************/
|
|
263 |
|
|
264 |
template<class charT, class traits>
|
|
265 |
struct regex_data;
|
|
266 |
|
|
267 |
template <class iterator, class charT, class traits_type, class char_classT>
|
|
268 |
iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
|
|
269 |
iterator last,
|
|
270 |
const re_set_long<char_classT>* set_,
|
|
271 |
const regex_data<charT, traits_type>& e, bool icase);
|
|
272 |
|
|
273 |
} // namespace re_detail
|
|
274 |
|
|
275 |
} // namespace boost
|
|
276 |
|
|
277 |
#ifdef BOOST_MSVC
|
|
278 |
#pragma warning(push)
|
|
279 |
#pragma warning(disable: 4103)
|
|
280 |
#endif
|
|
281 |
#ifdef BOOST_HAS_ABI_HEADERS
|
|
282 |
# include BOOST_ABI_SUFFIX
|
|
283 |
#endif
|
|
284 |
#ifdef BOOST_MSVC
|
|
285 |
#pragma warning(pop)
|
|
286 |
#endif
|
|
287 |
|
|
288 |
#endif
|
|
289 |
|
|
290 |
|