|
1 /* |
|
2 * Copyright (C) 2009, 2010 Apple Inc. All rights reserved. |
|
3 * |
|
4 * Redistribution and use in source and binary forms, with or without |
|
5 * modification, are permitted provided that the following conditions |
|
6 * are met: |
|
7 * 1. Redistributions of source code must retain the above copyright |
|
8 * notice, this list of conditions and the following disclaimer. |
|
9 * 2. Redistributions in binary form must reproduce the above copyright |
|
10 * notice, this list of conditions and the following disclaimer in the |
|
11 * documentation and/or other materials provided with the distribution. |
|
12 * |
|
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
|
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
|
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
|
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
|
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
|
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
|
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
24 */ |
|
25 |
|
26 #ifndef RegexInterpreter_h |
|
27 #define RegexInterpreter_h |
|
28 |
|
29 #if ENABLE(YARR) |
|
30 |
|
31 #include "RegexParser.h" |
|
32 #include "RegexPattern.h" |
|
33 #include <wtf/PassOwnPtr.h> |
|
34 #include <wtf/unicode/Unicode.h> |
|
35 |
|
36 namespace JSC { namespace Yarr { |
|
37 |
|
38 class ByteDisjunction; |
|
39 |
|
40 struct ByteTerm { |
|
41 enum Type { |
|
42 TypeBodyAlternativeBegin, |
|
43 TypeBodyAlternativeDisjunction, |
|
44 TypeBodyAlternativeEnd, |
|
45 TypeAlternativeBegin, |
|
46 TypeAlternativeDisjunction, |
|
47 TypeAlternativeEnd, |
|
48 TypeSubpatternBegin, |
|
49 TypeSubpatternEnd, |
|
50 TypeAssertionBOL, |
|
51 TypeAssertionEOL, |
|
52 TypeAssertionWordBoundary, |
|
53 TypePatternCharacterOnce, |
|
54 TypePatternCharacterFixed, |
|
55 TypePatternCharacterGreedy, |
|
56 TypePatternCharacterNonGreedy, |
|
57 TypePatternCasedCharacterOnce, |
|
58 TypePatternCasedCharacterFixed, |
|
59 TypePatternCasedCharacterGreedy, |
|
60 TypePatternCasedCharacterNonGreedy, |
|
61 TypeCharacterClass, |
|
62 TypeBackReference, |
|
63 TypeParenthesesSubpattern, |
|
64 TypeParenthesesSubpatternOnceBegin, |
|
65 TypeParenthesesSubpatternOnceEnd, |
|
66 TypeParentheticalAssertionBegin, |
|
67 TypeParentheticalAssertionEnd, |
|
68 TypeCheckInput, |
|
69 } type; |
|
70 bool invertOrCapture; |
|
71 union { |
|
72 struct { |
|
73 union { |
|
74 UChar patternCharacter; |
|
75 struct { |
|
76 UChar lo; |
|
77 UChar hi; |
|
78 } casedCharacter; |
|
79 CharacterClass* characterClass; |
|
80 unsigned subpatternId; |
|
81 }; |
|
82 union { |
|
83 ByteDisjunction* parenthesesDisjunction; |
|
84 unsigned parenthesesWidth; |
|
85 }; |
|
86 QuantifierType quantityType; |
|
87 unsigned quantityCount; |
|
88 } atom; |
|
89 struct { |
|
90 int next; |
|
91 int end; |
|
92 } alternative; |
|
93 unsigned checkInputCount; |
|
94 }; |
|
95 unsigned frameLocation; |
|
96 int inputPosition; |
|
97 |
|
98 ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
|
99 : frameLocation(frameLocation) |
|
100 { |
|
101 switch (quantityType) { |
|
102 case QuantifierFixedCount: |
|
103 type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
|
104 break; |
|
105 case QuantifierGreedy: |
|
106 type = ByteTerm::TypePatternCharacterGreedy; |
|
107 break; |
|
108 case QuantifierNonGreedy: |
|
109 type = ByteTerm::TypePatternCharacterNonGreedy; |
|
110 break; |
|
111 } |
|
112 |
|
113 atom.patternCharacter = ch; |
|
114 atom.quantityType = quantityType; |
|
115 atom.quantityCount = quantityCount; |
|
116 inputPosition = inputPos; |
|
117 } |
|
118 |
|
119 ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
|
120 : frameLocation(frameLocation) |
|
121 { |
|
122 switch (quantityType) { |
|
123 case QuantifierFixedCount: |
|
124 type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
|
125 break; |
|
126 case QuantifierGreedy: |
|
127 type = ByteTerm::TypePatternCasedCharacterGreedy; |
|
128 break; |
|
129 case QuantifierNonGreedy: |
|
130 type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
|
131 break; |
|
132 } |
|
133 |
|
134 atom.casedCharacter.lo = lo; |
|
135 atom.casedCharacter.hi = hi; |
|
136 atom.quantityType = quantityType; |
|
137 atom.quantityCount = quantityCount; |
|
138 inputPosition = inputPos; |
|
139 } |
|
140 |
|
141 ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) |
|
142 : type(ByteTerm::TypeCharacterClass) |
|
143 , invertOrCapture(invert) |
|
144 { |
|
145 atom.characterClass = characterClass; |
|
146 atom.quantityType = QuantifierFixedCount; |
|
147 atom.quantityCount = 1; |
|
148 inputPosition = inputPos; |
|
149 } |
|
150 |
|
151 ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) |
|
152 : type(type) |
|
153 , invertOrCapture(invertOrCapture) |
|
154 { |
|
155 atom.subpatternId = subpatternId; |
|
156 atom.parenthesesDisjunction = parenthesesInfo; |
|
157 atom.quantityType = QuantifierFixedCount; |
|
158 atom.quantityCount = 1; |
|
159 inputPosition = inputPos; |
|
160 } |
|
161 |
|
162 ByteTerm(Type type, bool invert = false) |
|
163 : type(type) |
|
164 , invertOrCapture(invert) |
|
165 { |
|
166 atom.quantityType = QuantifierFixedCount; |
|
167 atom.quantityCount = 1; |
|
168 } |
|
169 |
|
170 ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) |
|
171 : type(type) |
|
172 , invertOrCapture(invertOrCapture) |
|
173 { |
|
174 atom.subpatternId = subpatternId; |
|
175 atom.quantityType = QuantifierFixedCount; |
|
176 atom.quantityCount = 1; |
|
177 inputPosition = inputPos; |
|
178 } |
|
179 |
|
180 static ByteTerm BOL(int inputPos) |
|
181 { |
|
182 ByteTerm term(TypeAssertionBOL); |
|
183 term.inputPosition = inputPos; |
|
184 return term; |
|
185 } |
|
186 |
|
187 static ByteTerm CheckInput(unsigned count) |
|
188 { |
|
189 ByteTerm term(TypeCheckInput); |
|
190 term.checkInputCount = count; |
|
191 return term; |
|
192 } |
|
193 |
|
194 static ByteTerm EOL(int inputPos) |
|
195 { |
|
196 ByteTerm term(TypeAssertionEOL); |
|
197 term.inputPosition = inputPos; |
|
198 return term; |
|
199 } |
|
200 |
|
201 static ByteTerm WordBoundary(bool invert, int inputPos) |
|
202 { |
|
203 ByteTerm term(TypeAssertionWordBoundary, invert); |
|
204 term.inputPosition = inputPos; |
|
205 return term; |
|
206 } |
|
207 |
|
208 static ByteTerm BackReference(unsigned subpatternId, int inputPos) |
|
209 { |
|
210 return ByteTerm(TypeBackReference, subpatternId, false, inputPos); |
|
211 } |
|
212 |
|
213 static ByteTerm BodyAlternativeBegin() |
|
214 { |
|
215 ByteTerm term(TypeBodyAlternativeBegin); |
|
216 term.alternative.next = 0; |
|
217 term.alternative.end = 0; |
|
218 return term; |
|
219 } |
|
220 |
|
221 static ByteTerm BodyAlternativeDisjunction() |
|
222 { |
|
223 ByteTerm term(TypeBodyAlternativeDisjunction); |
|
224 term.alternative.next = 0; |
|
225 term.alternative.end = 0; |
|
226 return term; |
|
227 } |
|
228 |
|
229 static ByteTerm BodyAlternativeEnd() |
|
230 { |
|
231 ByteTerm term(TypeBodyAlternativeEnd); |
|
232 term.alternative.next = 0; |
|
233 term.alternative.end = 0; |
|
234 return term; |
|
235 } |
|
236 |
|
237 static ByteTerm AlternativeBegin() |
|
238 { |
|
239 ByteTerm term(TypeAlternativeBegin); |
|
240 term.alternative.next = 0; |
|
241 term.alternative.end = 0; |
|
242 return term; |
|
243 } |
|
244 |
|
245 static ByteTerm AlternativeDisjunction() |
|
246 { |
|
247 ByteTerm term(TypeAlternativeDisjunction); |
|
248 term.alternative.next = 0; |
|
249 term.alternative.end = 0; |
|
250 return term; |
|
251 } |
|
252 |
|
253 static ByteTerm AlternativeEnd() |
|
254 { |
|
255 ByteTerm term(TypeAlternativeEnd); |
|
256 term.alternative.next = 0; |
|
257 term.alternative.end = 0; |
|
258 return term; |
|
259 } |
|
260 |
|
261 static ByteTerm SubpatternBegin() |
|
262 { |
|
263 return ByteTerm(TypeSubpatternBegin); |
|
264 } |
|
265 |
|
266 static ByteTerm SubpatternEnd() |
|
267 { |
|
268 return ByteTerm(TypeSubpatternEnd); |
|
269 } |
|
270 |
|
271 bool invert() |
|
272 { |
|
273 return invertOrCapture; |
|
274 } |
|
275 |
|
276 bool capture() |
|
277 { |
|
278 return invertOrCapture; |
|
279 } |
|
280 }; |
|
281 |
|
282 class ByteDisjunction : public FastAllocBase { |
|
283 public: |
|
284 ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
|
285 : m_numSubpatterns(numSubpatterns) |
|
286 , m_frameSize(frameSize) |
|
287 { |
|
288 } |
|
289 |
|
290 Vector<ByteTerm> terms; |
|
291 unsigned m_numSubpatterns; |
|
292 unsigned m_frameSize; |
|
293 }; |
|
294 |
|
295 struct BytecodePattern : FastAllocBase { |
|
296 BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) |
|
297 : m_body(body) |
|
298 , m_ignoreCase(pattern.m_ignoreCase) |
|
299 , m_multiline(pattern.m_multiline) |
|
300 { |
|
301 newlineCharacterClass = pattern.newlineCharacterClass(); |
|
302 wordcharCharacterClass = pattern.wordcharCharacterClass(); |
|
303 |
|
304 m_allParenthesesInfo.append(allParenthesesInfo); |
|
305 m_userCharacterClasses.append(pattern.m_userCharacterClasses); |
|
306 // 'Steal' the RegexPattern's CharacterClasses! We clear its |
|
307 // array, so that it won't delete them on destruction. We'll |
|
308 // take responsibility for that. |
|
309 pattern.m_userCharacterClasses.clear(); |
|
310 } |
|
311 |
|
312 ~BytecodePattern() |
|
313 { |
|
314 deleteAllValues(m_allParenthesesInfo); |
|
315 deleteAllValues(m_userCharacterClasses); |
|
316 } |
|
317 |
|
318 OwnPtr<ByteDisjunction> m_body; |
|
319 bool m_ignoreCase; |
|
320 bool m_multiline; |
|
321 |
|
322 CharacterClass* newlineCharacterClass; |
|
323 CharacterClass* wordcharCharacterClass; |
|
324 private: |
|
325 Vector<ByteDisjunction*> m_allParenthesesInfo; |
|
326 Vector<CharacterClass*> m_userCharacterClasses; |
|
327 }; |
|
328 |
|
329 PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); |
|
330 int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); |
|
331 |
|
332 } } // namespace JSC::Yarr |
|
333 |
|
334 #endif |
|
335 |
|
336 #endif // RegexInterpreter_h |