|
1 /* |
|
2 * |
|
3 * Copyright (c) 1998-2002 |
|
4 * John Maddock |
|
5 * |
|
6 * Use, modification and distribution are subject to the |
|
7 * Boost Software License, Version 1.0. (See accompanying file |
|
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
|
9 * |
|
10 */ |
|
11 |
|
12 /* |
|
13 * LOCATION: see http://www.boost.org for most recent version. |
|
14 * FILE states.hpp |
|
15 * VERSION see <boost/version.hpp> |
|
16 * DESCRIPTION: Declares internal state machine structures. |
|
17 */ |
|
18 |
|
19 #ifndef BOOST_REGEX_V4_STATES_HPP |
|
20 #define BOOST_REGEX_V4_STATES_HPP |
|
21 |
|
22 #ifdef BOOST_MSVC |
|
23 #pragma warning(push) |
|
24 #pragma warning(disable: 4103) |
|
25 #endif |
|
26 #ifdef BOOST_HAS_ABI_HEADERS |
|
27 # include BOOST_ABI_PREFIX |
|
28 #endif |
|
29 #ifdef BOOST_MSVC |
|
30 #pragma warning(pop) |
|
31 #endif |
|
32 |
|
33 namespace boost{ |
|
34 namespace re_detail{ |
|
35 |
|
36 /*** mask_type ******************************************************* |
|
37 Whenever we have a choice of two alternatives, we use an array of bytes |
|
38 to indicate which of the two alternatives it is possible to take for any |
|
39 given input character. If mask_take is set, then we can take the next |
|
40 state, and if mask_skip is set then we can take the alternative. |
|
41 ***********************************************************************/ |
|
enum mask_type
{
   mask_take = 0x01,                    // the next state can be taken
   mask_skip = 0x02,                    // the alternative can be taken
   mask_init = 0x04,                    // separate flag bit; not part of mask_any / mask_all
   mask_any  = mask_take | mask_skip,   // either of the two routes
   mask_all  = mask_any                 // same value as mask_any
};
|
50 |
|
51 /*** helpers ********************************************************** |
|
52 These helpers let us use function overload resolution to detect whether |
|
53 we have narrow or wide character strings: |
|
54 ***********************************************************************/ |
|
// Tag types exposed through is_byte<charT>::width_type.
struct _narrow_type {};
struct _wide_type {};

// Primary template: every character type that is not specialised
// below is classified as wide.
template <class charT>
struct is_byte
{
   typedef _wide_type width_type;
};

// The three single-byte character types are classified as narrow.
template <>
struct is_byte<char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<unsigned char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<signed char>
{
   typedef _narrow_type width_type;
};
|
62 |
|
63 /*** enum syntax_element_type ****************************************** |
|
64 Every record in the state machine falls into one of the following types: |
|
65 ***********************************************************************/ |
|
enum syntax_element_type
{
   // Enumerators are numbered consecutively from zero; each one takes
   // the value of its predecessor plus one.

   // opens a marked sub-expression, or a perl-style (?...) extension
   syntax_element_startmark = 0,
   // closes a marked sub-expression, or a perl-style (?...) extension
   syntax_element_endmark,
   // a run of literal characters
   syntax_element_literal,
   // assertion: start of line (^)
   syntax_element_start_line,
   // assertion: end of line ($)
   syntax_element_end_line,
   // wildcard, matches any character (.)
   syntax_element_wild,
   // end of the expression: reaching this state means we have a match
   syntax_element_match,
   // perl-style word boundary (\b)
   syntax_element_word_boundary,
   // perl-style "within a word" assertion (\B)
   syntax_element_within_word,
   // assertion: start of word (\<)
   syntax_element_word_start,
   // assertion: end of word (\>)
   syntax_element_word_end,
   // assertion: start of buffer (\`)
   syntax_element_buffer_start,
   // assertion: end of buffer (\')
   syntax_element_buffer_end,
   // back-reference to a previously matched sub-expression
   syntax_element_backref,
   // a wide character set [..], or a set with multicharacter collating elements
   syntax_element_long_set,
   // a narrow character set [...]
   syntax_element_set,
   // jump to another state in the machine
   syntax_element_jump,
   // choice between two production states
   syntax_element_alt,
   // a repeat
   syntax_element_rep,
   // matches a combining character sequence
   syntax_element_combining,
   // perl-style soft buffer end (\z)
   syntax_element_soft_buffer_end,
   // perl-style continuation (\G)
   syntax_element_restart_continue,
   // repeats of a single character:
   syntax_element_dot_rep,
   syntax_element_char_rep,
   syntax_element_short_set_rep,
   syntax_element_long_set_rep,
   // backstep used by lookbehind repeats
   syntax_element_backstep,
   // assertion that a mark was matched
   syntax_element_assert_backref,
   syntax_element_toggle_case
};
|
123 |
|
#ifdef BOOST_REGEX_DEBUG
// dwa 09/26/00 - declared to suppress warnings about an ambiguous
// conversion; only present in debug builds.
std::ostream& operator<<(std::ostream& os, syntax_element_type t);
#endif
|
128 |
|
struct re_syntax_base;

/*** union offset_type ************************************************
Refers to another state in the machine.  While the machine is being
constructed the link is held as an integral offset (i); the offsets
are converted to pointers (p) before the machine is executed.
***********************************************************************/
union offset_type
{
   re_syntax_base* p;   // pointer form, used once offsets have been converted
   std::ptrdiff_t  i;   // integral offset form, used during construction
};
|
141 |
|
142 /*** struct re_syntax_base ******************************************** |
|
143 Base class for all states in the machine. |
|
144 ***********************************************************************/ |
|
145 struct re_syntax_base |
|
146 { |
|
147 syntax_element_type type; // what kind of state this is |
|
148 offset_type next; // next state in the machine |
|
149 }; |
|
150 |
|
151 /*** struct re_brace ************************************************** |
|
152 A marked parenthesis. |
|
153 ***********************************************************************/ |
|
154 struct re_brace : public re_syntax_base |
|
155 { |
|
156 // The index to match, can be zero (don't mark the sub-expression) |
|
157 // or negative (for perl style (?...) extentions): |
|
158 int index; |
|
159 }; |
|
160 |
|
161 /*** struct re_dot ************************************************** |
|
162 Match anything. |
|
163 ***********************************************************************/ |
|
// NOTE(review): presumably the values held in / tested against
// re_dot::mask - confirm against the matcher implementation.
enum
{
   force_not_newline = 0,
   dont_care         = 1,
   force_newline     = 2,

   // test_not_newline deliberately shares the value 2 with force_newline
   test_not_newline  = 2,
   test_newline      = 3
};
|
173 struct re_dot : public re_syntax_base |
|
174 { |
|
175 unsigned char mask; |
|
176 }; |
|
177 |
|
178 /*** struct re_literal ************************************************ |
|
179 A string of literals, following this structure will be an |
|
180 array of characters: charT[length] |
|
181 ***********************************************************************/ |
|
182 struct re_literal : public re_syntax_base |
|
183 { |
|
184 unsigned int length; |
|
185 }; |
|
186 |
|
187 /*** struct re_case ************************************************ |
|
188 Indicates whether we are moving to a case insensitive block or not |
|
189 ***********************************************************************/ |
|
190 struct re_case : public re_syntax_base |
|
191 { |
|
192 bool icase; |
|
193 }; |
|
194 |
|
195 /*** struct re_set_long *********************************************** |
|
196 A wide character set of characters, following this structure will be |
|
197 an array of type charT: |
|
198 First csingles null-terminated strings |
|
199 Then 2 * cranges null-terminated strings |
|
200 Then cequivalents null-terminated strings |
|
201 ***********************************************************************/ |
|
202 template <class mask_type> |
|
203 struct re_set_long : public re_syntax_base |
|
204 { |
|
205 unsigned int csingles, cranges, cequivalents; |
|
206 mask_type cclasses; |
|
207 mask_type cnclasses; |
|
208 bool isnot; |
|
209 bool singleton; |
|
210 }; |
|
211 |
|
212 /*** struct re_set **************************************************** |
|
213 A set of narrow-characters, matches any character whose _map entry is non-zero |
|
214 ***********************************************************************/ |
|
215 struct re_set : public re_syntax_base |
|
216 { |
|
217 unsigned char _map[1 << CHAR_BIT]; |
|
218 }; |
|
219 |
|
220 /*** struct re_jump *************************************************** |
|
221 Jump to a new location in the machine (not next). |
|
222 ***********************************************************************/ |
|
223 struct re_jump : public re_syntax_base |
|
224 { |
|
225 offset_type alt; // location to jump to |
|
226 }; |
|
227 |
|
228 /*** struct re_alt *************************************************** |
|
229 Jump to a new location in the machine (possibly next). |
|
230 ***********************************************************************/ |
|
231 struct re_alt : public re_jump |
|
232 { |
|
233 unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump |
|
234 unsigned int can_be_null; // true if we match a NULL string |
|
235 }; |
|
236 |
|
237 /*** struct re_repeat ************************************************* |
|
238 Repeat a section of the machine |
|
239 ***********************************************************************/ |
|
240 struct re_repeat : public re_alt |
|
241 { |
|
242 std::size_t min, max; // min and max allowable repeats |
|
243 int state_id; // Unique identifier for this repeat |
|
244 bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) |
|
245 bool greedy; // True if this is a greedy repeat |
|
246 }; |
|
247 |
|
248 /*** enum re_jump_size_type ******************************************* |
|
249 Provides compiled size of re_jump structure (allowing for trailing alignment). |
|
250 We provide this so we know how many bytes to insert when constructing the machine |
|
251 (The value of padding_mask is defined in regex_raw_buffer.hpp). |
|
252 ***********************************************************************/ |
|
253 enum re_jump_size_type |
|
254 { |
|
255 re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), |
|
256 re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), |
|
257 re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) |
|
258 }; |
|
259 |
|
260 /*** proc re_is_set_member ********************************************* |
|
261 Forward declaration: we'll need this one later... |
|
262 ***********************************************************************/ |
|
263 |
|
264 template<class charT, class traits> |
|
265 struct regex_data; |
|
266 |
|
267 template <class iterator, class charT, class traits_type, class char_classT> |
|
268 iterator BOOST_REGEX_CALL re_is_set_member(iterator next, |
|
269 iterator last, |
|
270 const re_set_long<char_classT>* set_, |
|
271 const regex_data<charT, traits_type>& e, bool icase); |
|
272 |
|
273 } // namespace re_detail |
|
274 |
|
275 } // namespace boost |
|
276 |
|
277 #ifdef BOOST_MSVC |
|
278 #pragma warning(push) |
|
279 #pragma warning(disable: 4103) |
|
280 #endif |
|
281 #ifdef BOOST_HAS_ABI_HEADERS |
|
282 # include BOOST_ABI_SUFFIX |
|
283 #endif |
|
284 #ifdef BOOST_MSVC |
|
285 #pragma warning(pop) |
|
286 #endif |
|
287 |
|
288 #endif |
|
289 |
|
290 |