|
1 |
|
2 /*- |
|
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
|
4 * Copyright (c) 1992, 1993, 1994 |
|
5 * The Regents of the University of California. All rights reserved. |
|
6 * |
|
7 * This code is derived from software contributed to Berkeley by |
|
8 * Henry Spencer. |
|
9 * |
|
10 * Redistribution and use in source and binary forms, with or without |
|
11 * modification, are permitted provided that the following conditions |
|
12 * are met: |
|
13 * 1. Redistributions of source code must retain the above copyright |
|
14 * notice, this list of conditions and the following disclaimer. |
|
15 * 2. Redistributions in binary form must reproduce the above copyright |
|
16 * notice, this list of conditions and the following disclaimer in the |
|
17 * documentation and/or other materials provided with the distribution. |
|
18 * 4. Neither the name of the University nor the names of its contributors |
|
19 * may be used to endorse or promote products derived from this software |
|
20 * without specific prior written permission. |
|
21 * |
|
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
|
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
|
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
32 * SUCH DAMAGE. |
|
33 * Portions Copyright (c) 2005-2007 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. |
|
34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
|
35 */ |
|
36 |
|
37 #if defined(LIBC_SCCS) && !defined(lint) |
|
38 static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; |
|
39 #endif /* LIBC_SCCS and not lint */ |
|
40 #include <sys/cdefs.h> |
|
41 |
|
42 #ifndef __SYMBIAN32__ |
|
43 __FBSDID("$FreeBSD: src/lib/libc/regex/regcomp.c,v 1.34 2004/10/03 15:42:59 stefanf Exp $"); |
|
44 #endif |
|
45 |
|
46 #ifdef __SYMBIAN32__ |
|
47 #ifdef __cplusplus |
|
48 #include <e32def.h> |
|
49 #endif |
|
50 #endif |
|
51 |
|
52 #include <sys/types.h> |
|
53 #include <stdio.h> |
|
54 #include <string.h> |
|
55 #include <ctype.h> |
|
56 #include <limits.h> |
|
57 #include <stdlib.h> |
|
58 #include <regex.h> |
|
59 #include <wchar.h> |
|
60 #include <wctype.h> |
|
61 |
|
62 #ifdef __SYMBIAN32__ |
|
63 #include <langinfo.h> |
|
64 #endif |
|
65 |
|
66 #ifndef __SYMBIAN32__ |
|
67 #include "libc_collate.h" |
|
68 #endif |
|
69 |
|
70 //Warning: #61-D: integer operation result is out of range |
|
71 #ifdef __ARMCC_3_1__ |
|
72 #pragma diag_suppress 61 |
|
73 #endif |
|
74 |
|
75 #include "utils.h" |
|
76 #include "regex2.h" |
|
77 |
|
78 #include "cname.h" |
|
79 |
|
80 #if (defined(__SYMBIAN32__) && (defined(__WINSCW__) || defined(__WINS__))) |
|
81 #include "libc_wsd_defs.h" |
|
82 #endif |
|
83 |
|
84 #ifdef EMULATOR |
|
85 #include "reent.h" |
|
86 #endif //EMULATOR |
|
87 |
|
88 /* |
|
89 * parse structure, passed up and down to avoid global variables and |
|
90 * other clumsinesses |
|
91 */ |
|
92 struct parse { |
|
93 char *next; /* next character in RE */ |
|
94 char *end; /* end of string (-> NUL normally) */ |
|
95 int error; /* has an error been seen? */ |
|
96 sop *strip; /* malloced strip */ |
|
97 sopno ssize; /* malloced strip size (allocated) */ |
|
98 sopno slen; /* malloced strip length (used) */ |
|
99 int ncsalloc; /* number of csets allocated */ |
|
100 struct re_guts *g; |
|
101 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
|
102 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
|
103 sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
|
104 }; |
|
105 |
|
106 /* ========= begin header generated by ./mkh ========= */ |
|
107 #ifdef __cplusplus |
|
108 extern "C" { |
|
109 #endif |
|
110 #ifdef __SYMBIAN32__ |
|
111 extern char LC_CTYPE_LocaleName[30]; |
|
112 extern char LC_COLLATE_LocaleName[30]; |
|
113 extern char LC_NUMERIC_LocaleName[30]; |
|
114 extern char LC_MONETARY_LocaleName[30]; |
|
115 extern char LC_TIME_LocaleName[30]; |
|
116 extern char LC_ALL_LocaleName[30]; |
|
117 |
|
118 #ifdef EMULATOR |
|
119 char * GET_WSD_VAR_NAME(LC_CTYPE_LocaleName, g)(); |
|
120 char *GET_WSD_VAR_NAME(LC_COLLATE_LocaleName, g)(); |
|
121 char * GET_WSD_VAR_NAME(LC_NUMERIC_LocaleName, g)(); |
|
122 char * GET_WSD_VAR_NAME(LC_MONETARY_LocaleName, g)(); |
|
123 char * GET_WSD_VAR_NAME(LC_TIME_LocaleName, g)(); |
|
124 char * GET_WSD_VAR_NAME(LC_ALL_LocaleName, g)(); |
|
125 GET_STATIC_ARRAY_FROM_TLS(cnames, struct cname) |
|
126 |
|
127 #define LC_CTYPE_LocaleName (GET_WSD_VAR_NAME(LC_CTYPE_LocaleName, g)()) |
|
128 #define LC_COLLATE_LocaleName (GET_WSD_VAR_NAME(LC_COLLATE_LocaleName, g)()) |
|
129 #define LC_NUMERIC_LocaleName (GET_WSD_VAR_NAME(LC_NUMERIC_LocaleName, g)()) |
|
130 #define LC_MONETARY_LocaleName (GET_WSD_VAR_NAME(LC_MONETARY_LocaleName, g)()) |
|
131 #define LC_TIME_LocaleName (GET_WSD_VAR_NAME(LC_TIME_LocaleName, g)()) |
|
132 #define LC_ALL_LocaleName (GET_WSD_VAR_NAME(LC_ALL_LocaleName, g)()) |
|
133 #define cnames (GET_WSD_VAR_NAME(cnames, s)()) |
|
134 #endif //EMULATOR |
|
135 |
|
136 extern int __collate_range_cmp(int , int ); |
|
137 |
|
138 #endif |
|
139 |
|
140 /* === regcomp.c === */ |
|
141 static void p_ere(struct parse *p, wint_t stop); |
|
142 static void p_ere_exp(struct parse *p); |
|
143 static void p_str(struct parse *p); |
|
144 static void p_bre(struct parse *p, wint_t end1, wint_t end2); |
|
145 static int p_simp_re(struct parse *p, int starordinary); |
|
146 static int p_count(struct parse *p); |
|
147 static void p_bracket(struct parse *p); |
|
148 static void p_b_term(struct parse *p, cset *cs); |
|
149 static void p_b_cclass(struct parse *p, cset *cs); |
|
150 static void p_b_eclass(struct parse *p, cset *cs); |
|
151 static wint_t p_b_symbol(struct parse *p); |
|
152 static wint_t p_b_coll_elem(struct parse *p, wint_t endc); |
|
153 static wint_t othercase(wint_t ch); |
|
154 static void bothcases(struct parse *p, wint_t ch); |
|
155 static void ordinary(struct parse *p, wint_t ch); |
|
156 static void nonnewline(struct parse *p); |
|
157 static void repeat(struct parse *p, sopno start, int from, int to); |
|
158 static int seterr(struct parse *p, int e); |
|
159 static cset *allocset(struct parse *p); |
|
160 static void freeset(struct parse *p, cset *cs); |
|
161 static void CHadd(struct parse *p, cset *cs, wint_t ch); |
|
162 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max); |
|
163 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct); |
|
164 static wint_t singleton(cset *cs); |
|
165 static sopno dupl(struct parse *p, sopno start, sopno finish); |
|
166 static void doemit(struct parse *p, sop op, size_t opnd); |
|
167 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); |
|
168 static void dofwd(struct parse *p, sopno pos, sop value); |
|
169 static void enlarge(struct parse *p, sopno size); |
|
170 static void stripsnug(struct parse *p, struct re_guts *g); |
|
171 static void findmust(struct parse *p, struct re_guts *g); |
|
172 static int altoffset(sop *scan, int offset); |
|
173 static void computejumps(struct parse *p, struct re_guts *g); |
|
174 static void computematchjumps(struct parse *p, struct re_guts *g); |
|
175 static sopno pluscount(struct parse *p, struct re_guts *g); |
|
176 static wint_t wgetnext(struct parse *p); |
|
177 |
|
178 #ifdef __cplusplus |
|
179 } |
|
180 #endif |
|
181 /* ========= end header generated by ./mkh ========= */ |
|
182 |
|
183 #ifndef EMULATOR |
|
184 |
|
185 static char nuls[10]; /* place to point scanner in event of error */ |
|
186 #else //EMULATOR |
|
187 |
|
188 GET_STATIC_ARRAY_FROM_TLS(nuls, char) |
|
189 #define nuls (GET_WSD_VAR_NAME(nuls, s)()) |
|
190 #endif //EMULATOR |
|
191 |
|
192 /* |
|
193 * macros for use with parse structure |
|
194 * BEWARE: these know that the parse structure is named `p' !!! |
|
195 */ |
|
196 #define PEEK() (*p->next) |
|
197 #define PEEK2() (*(p->next+1)) |
|
198 #define MORE() (p->next < p->end) |
|
199 #define MORE2() (p->next+1 < p->end) |
|
200 #define SEE(c) (MORE() && PEEK() == (c)) |
|
201 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) |
|
202 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) |
|
203 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) |
|
204 #define NEXT() (p->next++) |
|
205 #define NEXT2() (p->next += 2) |
|
206 #define NEXTn(n) (p->next += (n)) |
|
207 #define GETNEXT() (*p->next++) |
|
208 #define WGETNEXT() wgetnext(p) |
|
209 #define SETERROR(e) seterr(p, (e)) |
|
210 #define REQUIRE(co, e) ((co) || SETERROR(e)) |
|
211 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) |
|
212 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) |
|
213 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) |
|
214 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) |
|
215 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) |
|
216 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) |
|
217 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) |
|
218 #define HERE() (p->slen) |
|
219 #define THERE() (p->slen - 1) |
|
220 #define THERETHERE() (p->slen - 2) |
|
221 #define DROP(n) (p->slen -= (n)) |
|
222 |
|
223 #ifndef NDEBUG |
|
224 #ifndef EMULATOR |
|
225 |
|
226 static int never = 0; /* for use in asserts; shuts lint up */ |
|
227 #else //EMULATOR |
|
228 |
|
229 GET_STATIC_VAR_FROM_TLS(never, int) |
|
230 #define never (*GET_WSD_VAR_NAME(never, s)()) |
|
231 #endif //EMULATOR |
|
232 #else |
|
233 #define never 0 /* some <assert.h>s have bugs too */ |
|
234 #endif |
|
235 |
|
236 /* Macro used by computejump()/computematchjump() */ |
|
237 #define MIN(a,b) ((a)<(b)?(a):(b)) |
|
238 |
|
239 /* |
|
240 - regcomp - interface for parser and compilation |
|
241 = extern int regcomp(regex_t *, const char *, int); |
|
242 = #define REG_BASIC 0000 |
|
243 = #define REG_EXTENDED 0001 |
|
244 = #define REG_ICASE 0002 |
|
245 = #define REG_NOSUB 0004 |
|
246 = #define REG_NEWLINE 0010 |
|
247 = #define REG_NOSPEC 0020 |
|
248 = #define REG_PEND 0040 |
|
249 = #define REG_DUMP 0200 |
|
250 */ |
|
251 EXPORT_C int /* 0 success, otherwise REG_something */ |
|
252 regcomp(preg, pattern, cflags) |
|
253 regex_t * __restrict preg; |
|
254 const char * __restrict pattern; |
|
255 int cflags; |
|
256 { |
|
257 struct parse pa; |
|
258 struct re_guts *g; |
|
259 struct parse *p = &pa; |
|
260 int i; |
|
261 size_t len; |
|
262 #ifdef REDEBUG |
|
263 # define GOODFLAGS(f) (f) |
|
264 #else |
|
265 # define GOODFLAGS(f) ((f)&~REG_DUMP) |
|
266 #endif |
|
267 |
|
268 cflags = GOODFLAGS(cflags); |
|
269 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
|
270 return(REG_INVARG); |
|
271 |
|
272 if (cflags®_PEND) { |
|
273 if (preg->re_endp < pattern) |
|
274 return(REG_INVARG); |
|
275 len = preg->re_endp - pattern; |
|
276 } else |
|
277 len = strlen((char *)pattern); |
|
278 |
|
279 /* do the mallocs early so failure handling is easy */ |
|
280 g = (struct re_guts *)malloc(sizeof(struct re_guts)); |
|
281 if (g == NULL) |
|
282 return(REG_ESPACE); |
|
283 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
|
284 p->strip = (sop *)malloc(p->ssize * sizeof(sop)); |
|
285 p->slen = 0; |
|
286 if (p->strip == NULL) { |
|
287 free((char *)g); |
|
288 return(REG_ESPACE); |
|
289 } |
|
290 |
|
291 /* set things up */ |
|
292 p->g = g; |
|
293 p->next = (char *)pattern; /* convenience; we do not modify it */ |
|
294 p->end = p->next + len; |
|
295 p->error = 0; |
|
296 p->ncsalloc = 0; |
|
297 for (i = 0; i < NPAREN; i++) { |
|
298 p->pbegin[i] = 0; |
|
299 p->pend[i] = 0; |
|
300 } |
|
301 g->sets = NULL; |
|
302 g->ncsets = 0; |
|
303 g->cflags = cflags; |
|
304 g->iflags = 0; |
|
305 g->nbol = 0; |
|
306 g->neol = 0; |
|
307 g->must = NULL; |
|
308 g->moffset = -1; |
|
309 g->charjump = NULL; |
|
310 g->matchjump = NULL; |
|
311 g->mlen = 0; |
|
312 g->nsub = 0; |
|
313 g->backrefs = 0; |
|
314 |
|
315 /* do it */ |
|
316 EMIT(OEND, 0); |
|
317 g->firststate = THERE(); |
|
318 if (cflags®_EXTENDED) |
|
319 p_ere(p, OUT); |
|
320 else if (cflags®_NOSPEC) |
|
321 p_str(p); |
|
322 else |
|
323 p_bre(p, OUT, OUT); |
|
324 EMIT(OEND, 0); |
|
325 g->laststate = THERE(); |
|
326 |
|
327 /* tidy up loose ends and fill things in */ |
|
328 stripsnug(p, g); |
|
329 findmust(p, g); |
|
330 /* only use Boyer-Moore algorithm if the pattern is bigger |
|
331 * than three characters |
|
332 */ |
|
333 if(g->mlen > 3) { |
|
334 computejumps(p, g); |
|
335 computematchjumps(p, g); |
|
336 if(g->matchjump == NULL && g->charjump != NULL) { |
|
337 free(g->charjump); |
|
338 g->charjump = NULL; |
|
339 } |
|
340 } |
|
341 g->nplus = pluscount(p, g); |
|
342 g->magic = MAGIC2; |
|
343 preg->re_nsub = g->nsub; |
|
344 preg->re_g = g; |
|
345 preg->re_magic = MAGIC1; |
|
346 #ifndef REDEBUG |
|
347 /* not debugging, so can't rely on the assert() in regexec() */ |
|
348 if (g->iflags&BAD) |
|
349 SETERROR(REG_ASSERT); |
|
350 #endif |
|
351 |
|
352 /* win or lose, we're done */ |
|
353 if (p->error != 0) /* lose */ |
|
354 regfree(preg); |
|
355 return(p->error); |
|
356 } |
|
357 |
|
358 /* |
|
359 - p_ere - ERE parser top level, concatenation and alternation |
|
360 == static void p_ere(struct parse *p, int stop); |
|
361 */ |
|
362 static void |
|
363 p_ere(p, stop) |
|
364 struct parse *p; |
|
365 int stop; /* character this ERE should end at */ |
|
366 { |
|
367 char c; |
|
368 sopno prevback; |
|
369 sopno prevfwd; |
|
370 sopno conc; |
|
371 int first = 1; /* is this the first alternative? */ |
|
372 |
|
373 for (;;) { |
|
374 /* do a bunch of concatenated expressions */ |
|
375 conc = HERE(); |
|
376 while (MORE() && (c = PEEK()) != '|' && c != stop) |
|
377 p_ere_exp(p); |
|
378 (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
|
379 |
|
380 if (!EAT('|')) |
|
381 break; /* NOTE BREAK OUT */ |
|
382 |
|
383 if (first) { |
|
384 INSERT(OCH_, conc); /* offset is wrong */ |
|
385 prevfwd = conc; |
|
386 prevback = conc; |
|
387 first = 0; |
|
388 } |
|
389 ASTERN(OOR1, prevback); |
|
390 prevback = THERE(); |
|
391 AHEAD(prevfwd); /* fix previous offset */ |
|
392 prevfwd = HERE(); |
|
393 EMIT(OOR2, 0); /* offset is very wrong */ |
|
394 } |
|
395 |
|
396 if (!first) { /* tail-end fixups */ |
|
397 AHEAD(prevfwd); |
|
398 ASTERN(O_CH, prevback); |
|
399 } |
|
400 |
|
401 assert(!MORE() || SEE(stop)); |
|
402 } |
|
403 |
|
404 /* |
|
405 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
|
406 == static void p_ere_exp(struct parse *p); |
|
407 */ |
|
408 static void |
|
409 p_ere_exp(p) |
|
410 struct parse *p; |
|
411 { |
|
412 char c; |
|
413 wint_t wc; |
|
414 sopno pos; |
|
415 int count; |
|
416 int count2; |
|
417 sopno subno; |
|
418 int wascaret = 0; |
|
419 |
|
420 assert(MORE()); /* caller should have ensured this */ |
|
421 c = GETNEXT(); |
|
422 |
|
423 pos = HERE(); |
|
424 switch (c) { |
|
425 case '(': |
|
426 (void)REQUIRE(MORE(), REG_EPAREN); |
|
427 p->g->nsub++; |
|
428 subno = p->g->nsub; |
|
429 if (subno < NPAREN) |
|
430 p->pbegin[subno] = HERE(); |
|
431 EMIT(OLPAREN, subno); |
|
432 if (!SEE(')')) |
|
433 p_ere(p, ')'); |
|
434 if (subno < NPAREN) { |
|
435 p->pend[subno] = HERE(); |
|
436 assert(p->pend[subno] != 0); |
|
437 } |
|
438 EMIT(ORPAREN, subno); |
|
439 (void)MUSTEAT(')', REG_EPAREN); |
|
440 break; |
|
441 #ifndef POSIX_MISTAKE |
|
442 case ')': /* happens only if no current unmatched ( */ |
|
443 /* |
|
444 * You may ask, why the ifndef? Because I didn't notice |
|
445 * this until slightly too late for 1003.2, and none of the |
|
446 * other 1003.2 regular-expression reviewers noticed it at |
|
447 * all. So an unmatched ) is legal POSIX, at least until |
|
448 * we can get it fixed. |
|
449 */ |
|
450 SETERROR(REG_EPAREN); |
|
451 break; |
|
452 #endif |
|
453 case '^': |
|
454 EMIT(OBOL, 0); |
|
455 p->g->iflags |= USEBOL; |
|
456 p->g->nbol++; |
|
457 wascaret = 1; |
|
458 break; |
|
459 case '$': |
|
460 EMIT(OEOL, 0); |
|
461 p->g->iflags |= USEEOL; |
|
462 p->g->neol++; |
|
463 break; |
|
464 case '|': |
|
465 SETERROR(REG_EMPTY); |
|
466 break; |
|
467 case '*': |
|
468 case '+': |
|
469 case '?': |
|
470 SETERROR(REG_BADRPT); |
|
471 break; |
|
472 case '.': |
|
473 if (p->g->cflags®_NEWLINE) |
|
474 nonnewline(p); |
|
475 else |
|
476 EMIT(OANY, 0); |
|
477 break; |
|
478 case '[': |
|
479 p_bracket(p); |
|
480 break; |
|
481 case '\\': |
|
482 (void)REQUIRE(MORE(), REG_EESCAPE); |
|
483 wc = WGETNEXT(); |
|
484 ordinary(p, wc); |
|
485 break; |
|
486 case '{': /* okay as ordinary except if digit follows */ |
|
487 (void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
|
488 /* FALLTHROUGH */ |
|
489 default: |
|
490 p->next--; |
|
491 wc = WGETNEXT(); |
|
492 ordinary(p, wc); |
|
493 break; |
|
494 } |
|
495 |
|
496 if (!MORE()) |
|
497 return; |
|
498 c = PEEK(); |
|
499 /* we call { a repetition if followed by a digit */ |
|
500 if (!( c == '*' || c == '+' || c == '?' || |
|
501 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
|
502 return; /* no repetition, we're done */ |
|
503 NEXT(); |
|
504 |
|
505 (void)REQUIRE(!wascaret, REG_BADRPT); |
|
506 switch (c) { |
|
507 case '*': /* implemented as +? */ |
|
508 /* this case does not require the (y|) trick, noKLUDGE */ |
|
509 INSERT(OPLUS_, pos); |
|
510 ASTERN(O_PLUS, pos); |
|
511 INSERT(OQUEST_, pos); |
|
512 ASTERN(O_QUEST, pos); |
|
513 break; |
|
514 case '+': |
|
515 INSERT(OPLUS_, pos); |
|
516 ASTERN(O_PLUS, pos); |
|
517 break; |
|
518 case '?': |
|
519 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
|
520 INSERT(OCH_, pos); /* offset slightly wrong */ |
|
521 ASTERN(OOR1, pos); /* this one's right */ |
|
522 AHEAD(pos); /* fix the OCH_ */ |
|
523 EMIT(OOR2, 0); /* offset very wrong... */ |
|
524 AHEAD(THERE()); /* ...so fix it */ |
|
525 ASTERN(O_CH, THERETHERE()); |
|
526 break; |
|
527 case '{': |
|
528 count = p_count(p); |
|
529 if (EAT(',')) { |
|
530 if (isdigit((uch)PEEK())) { |
|
531 count2 = p_count(p); |
|
532 (void)REQUIRE(count <= count2, REG_BADBR); |
|
533 } else /* single number with comma */ |
|
534 count2 = INFINITY; |
|
535 } else /* just a single number */ |
|
536 count2 = count; |
|
537 repeat(p, pos, count, count2); |
|
538 if (!EAT('}')) { /* error heuristics */ |
|
539 while (MORE() && PEEK() != '}') |
|
540 NEXT(); |
|
541 (void)REQUIRE(MORE(), REG_EBRACE); |
|
542 SETERROR(REG_BADBR); |
|
543 } |
|
544 break; |
|
545 } |
|
546 |
|
547 if (!MORE()) |
|
548 return; |
|
549 c = PEEK(); |
|
550 if (!( c == '*' || c == '+' || c == '?' || |
|
551 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
|
552 return; |
|
553 SETERROR(REG_BADRPT); |
|
554 } |
|
555 |
|
556 /* |
|
557 - p_str - string (no metacharacters) "parser" |
|
558 == static void p_str(struct parse *p); |
|
559 */ |
|
560 static void |
|
561 p_str(p) |
|
562 struct parse *p; |
|
563 { |
|
564 (void)REQUIRE(MORE(), REG_EMPTY); |
|
565 while (MORE()) |
|
566 ordinary(p, WGETNEXT()); |
|
567 } |
|
568 |
|
569 /* |
|
570 - p_bre - BRE parser top level, anchoring and concatenation |
|
571 == static void p_bre(struct parse *p, int end1, \ |
|
572 == int end2); |
|
573 * Giving end1 as OUT essentially eliminates the end1/end2 check. |
|
574 * |
|
575 * This implementation is a bit of a kludge, in that a trailing $ is first |
|
576 * taken as an ordinary character and then revised to be an anchor. |
|
577 * The amount of lookahead needed to avoid this kludge is excessive. |
|
578 */ |
|
579 static void |
|
580 p_bre(p, end1, end2) |
|
581 struct parse *p; |
|
582 int end1; /* first terminating character */ |
|
583 int end2; /* second terminating character */ |
|
584 { |
|
585 sopno start = HERE(); |
|
586 int first = 1; /* first subexpression? */ |
|
587 int wasdollar = 0; |
|
588 |
|
589 if (EAT('^')) { |
|
590 EMIT(OBOL, 0); |
|
591 p->g->iflags |= USEBOL; |
|
592 p->g->nbol++; |
|
593 } |
|
594 while (MORE() && !SEETWO(end1, end2)) { |
|
595 wasdollar = p_simp_re(p, first); |
|
596 first = 0; |
|
597 } |
|
598 if (wasdollar) { /* oops, that was a trailing anchor */ |
|
599 DROP(1); |
|
600 EMIT(OEOL, 0); |
|
601 p->g->iflags |= USEEOL; |
|
602 p->g->neol++; |
|
603 } |
|
604 |
|
605 (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
|
606 } |
|
607 |
|
608 /* |
|
609 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
|
610 == static int p_simp_re(struct parse *p, int starordinary); |
|
611 */ |
|
612 static int /* was the simple RE an unbackslashed $? */ |
|
613 p_simp_re(p, starordinary) |
|
614 struct parse *p; |
|
615 int starordinary; /* is a leading * an ordinary character? */ |
|
616 { |
|
617 int c; |
|
618 int count; |
|
619 int count2; |
|
620 sopno pos; |
|
621 int i; |
|
622 wint_t wc; |
|
623 sopno subno; |
|
624 # define BACKSL (1<<CHAR_BIT) |
|
625 |
|
626 pos = HERE(); /* repetion op, if any, covers from here */ |
|
627 |
|
628 assert(MORE()); /* caller should have ensured this */ |
|
629 c = GETNEXT(); |
|
630 if (c == '\\') { |
|
631 (void)REQUIRE(MORE(), REG_EESCAPE); |
|
632 c = BACKSL | GETNEXT(); |
|
633 } |
|
634 switch (c) { |
|
635 case '.': |
|
636 if (p->g->cflags®_NEWLINE) |
|
637 nonnewline(p); |
|
638 else |
|
639 EMIT(OANY, 0); |
|
640 break; |
|
641 case '[': |
|
642 p_bracket(p); |
|
643 break; |
|
644 case BACKSL|'{': |
|
645 SETERROR(REG_BADRPT); |
|
646 break; |
|
647 case BACKSL|'(': |
|
648 p->g->nsub++; |
|
649 subno = p->g->nsub; |
|
650 if (subno < NPAREN) |
|
651 p->pbegin[subno] = HERE(); |
|
652 EMIT(OLPAREN, subno); |
|
653 /* the MORE here is an error heuristic */ |
|
654 if (MORE() && !SEETWO('\\', ')')) |
|
655 p_bre(p, '\\', ')'); |
|
656 if (subno < NPAREN) { |
|
657 p->pend[subno] = HERE(); |
|
658 assert(p->pend[subno] != 0); |
|
659 } |
|
660 EMIT(ORPAREN, subno); |
|
661 (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
|
662 break; |
|
663 case BACKSL|')': /* should not get here -- must be user */ |
|
664 case BACKSL|'}': |
|
665 SETERROR(REG_EPAREN); |
|
666 break; |
|
667 case BACKSL|'1': |
|
668 case BACKSL|'2': |
|
669 case BACKSL|'3': |
|
670 case BACKSL|'4': |
|
671 case BACKSL|'5': |
|
672 case BACKSL|'6': |
|
673 case BACKSL|'7': |
|
674 case BACKSL|'8': |
|
675 case BACKSL|'9': |
|
676 i = (c&~BACKSL) - '0'; |
|
677 assert(i < NPAREN); |
|
678 if (p->pend[i] != 0) { |
|
679 assert(i <= p->g->nsub); |
|
680 EMIT(OBACK_, i); |
|
681 assert(p->pbegin[i] != 0); |
|
682 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
|
683 assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
|
684 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
|
685 EMIT(O_BACK, i); |
|
686 } else |
|
687 SETERROR(REG_ESUBREG); |
|
688 p->g->backrefs = 1; |
|
689 break; |
|
690 case '*': |
|
691 (void)REQUIRE(starordinary, REG_BADRPT); |
|
692 /* FALLTHROUGH */ |
|
693 default: |
|
694 p->next--; |
|
695 wc = WGETNEXT(); |
|
696 ordinary(p, wc); |
|
697 break; |
|
698 } |
|
699 |
|
700 if (EAT('*')) { /* implemented as +? */ |
|
701 /* this case does not require the (y|) trick, noKLUDGE */ |
|
702 INSERT(OPLUS_, pos); |
|
703 ASTERN(O_PLUS, pos); |
|
704 INSERT(OQUEST_, pos); |
|
705 ASTERN(O_QUEST, pos); |
|
706 } else if (EATTWO('\\', '{')) { |
|
707 count = p_count(p); |
|
708 if (EAT(',')) { |
|
709 if (MORE() && isdigit((uch)PEEK())) { |
|
710 count2 = p_count(p); |
|
711 (void)REQUIRE(count <= count2, REG_BADBR); |
|
712 } else /* single number with comma */ |
|
713 count2 = INFINITY; |
|
714 } else /* just a single number */ |
|
715 count2 = count; |
|
716 repeat(p, pos, count, count2); |
|
717 if (!EATTWO('\\', '}')) { /* error heuristics */ |
|
718 while (MORE() && !SEETWO('\\', '}')) |
|
719 NEXT(); |
|
720 (void)REQUIRE(MORE(), REG_EBRACE); |
|
721 SETERROR(REG_BADBR); |
|
722 } |
|
723 } else if (c == '$') /* $ (but not \$) ends it */ |
|
724 return(1); |
|
725 |
|
726 return(0); |
|
727 } |
|
728 |
|
729 /* |
|
730 - p_count - parse a repetition count |
|
731 == static int p_count(struct parse *p); |
|
732 */ |
|
733 static int /* the value */ |
|
734 p_count(p) |
|
735 struct parse *p; |
|
736 { |
|
737 int count = 0; |
|
738 int ndigits = 0; |
|
739 |
|
740 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
|
741 count = count*10 + (GETNEXT() - '0'); |
|
742 ndigits++; |
|
743 } |
|
744 |
|
745 (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
|
746 return(count); |
|
747 } |
|
748 |
|
749 /* |
|
750 - p_bracket - parse a bracketed character list |
|
751 == static void p_bracket(struct parse *p); |
|
752 */ |
|
753 static void |
|
754 p_bracket(p) |
|
755 struct parse *p; |
|
756 { |
|
757 cset *cs; |
|
758 wint_t ch; |
|
759 |
|
760 /* Dept of Truly Sickening Special-Case Kludges */ |
|
761 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { |
|
762 EMIT(OBOW, 0); |
|
763 NEXTn(6); |
|
764 return; |
|
765 } |
|
766 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { |
|
767 EMIT(OEOW, 0); |
|
768 NEXTn(6); |
|
769 return; |
|
770 } |
|
771 |
|
772 if ((cs = allocset(p)) == NULL) |
|
773 return; |
|
774 |
|
775 if (p->g->cflags®_ICASE) |
|
776 cs->icase = 1; |
|
777 if (EAT('^')) |
|
778 cs->invert = 1; |
|
779 if (EAT(']')) |
|
780 CHadd(p, cs, ']'); |
|
781 else if (EAT('-')) |
|
782 CHadd(p, cs, '-'); |
|
783 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
|
784 p_b_term(p, cs); |
|
785 if (EAT('-')) |
|
786 CHadd(p, cs, '-'); |
|
787 (void)MUSTEAT(']', REG_EBRACK); |
|
788 |
|
789 if (p->error != 0) /* don't mess things up further */ |
|
790 return; |
|
791 |
|
792 if (cs->invert && p->g->cflags®_NEWLINE) |
|
793 cs->bmp['\n' >> 3] |= 1 << ('\n' & 7); |
|
794 |
|
795 if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */ |
|
796 ordinary(p, ch); |
|
797 freeset(p, cs); |
|
798 } else |
|
799 EMIT(OANYOF, (int)(cs - p->g->sets)); |
|
800 } |
|
801 |
|
802 /* |
|
803 - p_b_term - parse one term of a bracketed character list |
|
804 == static void p_b_term(struct parse *p, cset *cs); |
|
805 */ |
|
806 static void |
|
807 p_b_term(p, cs) |
|
808 struct parse *p; |
|
809 cset *cs; |
|
810 { |
|
811 char c; |
|
812 wint_t start, finish; |
|
813 wint_t i; |
|
814 |
|
815 /* classify what we've got */ |
|
816 switch ((MORE()) ? PEEK() : '\0') { |
|
817 case '[': |
|
818 c = (MORE2()) ? PEEK2() : '\0'; |
|
819 break; |
|
820 case '-': |
|
821 SETERROR(REG_ERANGE); |
|
822 return; /* NOTE RETURN */ |
|
823 break; |
|
824 default: |
|
825 c = '\0'; |
|
826 break; |
|
827 } |
|
828 |
|
829 switch (c) { |
|
830 case ':': /* character class */ |
|
831 NEXT2(); |
|
832 (void)REQUIRE(MORE(), REG_EBRACK); |
|
833 c = PEEK(); |
|
834 (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
|
835 p_b_cclass(p, cs); |
|
836 (void)REQUIRE(MORE(), REG_EBRACK); |
|
837 (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
|
838 break; |
|
839 case '=': /* equivalence class */ |
|
840 NEXT2(); |
|
841 (void)REQUIRE(MORE(), REG_EBRACK); |
|
842 c = PEEK(); |
|
843 (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
|
844 p_b_eclass(p, cs); |
|
845 (void)REQUIRE(MORE(), REG_EBRACK); |
|
846 (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
|
847 break; |
|
848 default: /* symbol, ordinary character, or range */ |
|
849 start = p_b_symbol(p); |
|
850 if (SEE('-') && MORE2() && PEEK2() != ']') { |
|
851 /* range */ |
|
852 NEXT(); |
|
853 if (EAT('-')) |
|
854 finish = '-'; |
|
855 else |
|
856 finish = p_b_symbol(p); |
|
857 } else |
|
858 finish = start; |
|
859 if (start == finish) |
|
860 CHadd(p, cs, start); |
|
861 else { |
|
862 #ifndef __SYMBIAN32__ |
|
863 if (__collate_load_error ) |
|
864 #else |
|
865 if(((strcmp("C",(const char*) LC_COLLATE_LocaleName)==0) ||(strcmp("POSIX", (const char*) LC_COLLATE_LocaleName)==0))) |
|
866 #endif |
|
867 { |
|
868 (void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE); |
|
869 CHaddrange(p, cs, start, finish); |
|
870 } else { |
|
871 #ifndef __SYMBIAN32__ |
|
872 |
|
873 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); |
|
874 for (i = 0; i <= UCHAR_MAX; i++) { |
|
875 if ( __collate_range_cmp(start, i) <= 0 |
|
876 && __collate_range_cmp(i, finish) <= 0 |
|
877 ) |
|
878 CHadd(p, cs, i); |
|
879 |
|
880 #else |
|
881 |
|
882 int i_tmp=0; |
|
883 (void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE); |
|
884 for (i = start; i <= finish ; i++) { |
|
885 i_tmp=i; |
|
886 if ( __collate_range_cmp(start, i_tmp) <= 0 |
|
887 && __collate_range_cmp(i_tmp, finish) <= 0 |
|
888 ) |
|
889 { |
|
890 CHadd(p, cs, i_tmp); |
|
891 if(__collate_range_cmp(i_tmp, finish) == 0) |
|
892 break; |
|
893 } |
|
894 #endif |
|
895 } |
|
896 } |
|
897 } |
|
898 break; |
|
899 } |
|
900 } |
|
901 |
|
902 /* |
|
903 - p_b_cclass - parse a character-class name and deal with it |
|
904 == static void p_b_cclass(struct parse *p, cset *cs); |
|
905 */ |
|
906 static void |
|
907 p_b_cclass(p, cs) |
|
908 struct parse *p; |
|
909 cset *cs; |
|
910 { |
|
911 char *sp = p->next; |
|
912 size_t len; |
|
913 wctype_t wct; |
|
914 char clname[16]; |
|
915 |
|
916 while (MORE() && isalpha((uch)PEEK())) |
|
917 NEXT(); |
|
918 len = p->next - sp; |
|
919 if (len >= sizeof(clname) - 1) { |
|
920 SETERROR(REG_ECTYPE); |
|
921 return; |
|
922 } |
|
923 memcpy(clname, sp, len); |
|
924 clname[len] = '\0'; |
|
925 if ((wct = wctype(clname)) == 0) { |
|
926 SETERROR(REG_ECTYPE); |
|
927 return; |
|
928 } |
|
929 CHaddtype(p, cs, wct); |
|
930 } |
|
931 |
|
932 /* |
|
933 - p_b_eclass - parse an equivalence-class name and deal with it |
|
934 == static void p_b_eclass(struct parse *p, cset *cs); |
|
935 * |
|
936 * This implementation is incomplete. xxx |
|
937 */ |
|
938 static void |
|
939 p_b_eclass(p, cs) |
|
940 struct parse *p; |
|
941 cset *cs; |
|
942 { |
|
943 wint_t c; |
|
944 |
|
945 c = p_b_coll_elem(p, '='); |
|
946 CHadd(p, cs, c); |
|
947 } |
|
948 |
|
949 /* |
|
950 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
|
951 == static char p_b_symbol(struct parse *p); |
|
952 */ |
|
953 static wint_t /* value of symbol */ |
|
954 p_b_symbol(p) |
|
955 struct parse *p; |
|
956 { |
|
957 wint_t value; |
|
958 |
|
959 (void)REQUIRE(MORE(), REG_EBRACK); |
|
960 if (!EATTWO('[', '.')) |
|
961 return(WGETNEXT()); |
|
962 |
|
963 /* collating symbol */ |
|
964 value = p_b_coll_elem(p, '.'); |
|
965 (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
|
966 return(value); |
|
967 } |
|
968 |
|
969 /* |
|
970 - p_b_coll_elem - parse a collating-element name and look it up |
|
971 == static char p_b_coll_elem(struct parse *p, int endc); |
|
972 */ |
|
973 static wint_t /* value of collating element */ |
|
974 p_b_coll_elem(p, endc) |
|
975 struct parse *p; |
|
976 wint_t endc; /* name ended by endc,']' */ |
|
977 { |
|
978 char *sp = p->next; |
|
979 struct cname *cp; |
|
980 int len; |
|
981 mbstate_t mbs; |
|
982 wchar_t wc; |
|
983 size_t clen; |
|
984 |
|
985 while (MORE() && !SEETWO(endc, ']')) |
|
986 NEXT(); |
|
987 if (!MORE()) { |
|
988 SETERROR(REG_EBRACK); |
|
989 return(0); |
|
990 } |
|
991 len = p->next - sp; |
|
992 for (cp = cnames; cp->name != NULL; cp++) |
|
993 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
|
994 return(cp->code); /* known name */ |
|
995 memset(&mbs, 0, sizeof(mbs)); |
|
996 if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len) |
|
997 return (wc); /* single character */ |
|
998 else if (clen == (size_t)-1 || clen == (size_t)-2) |
|
999 SETERROR(REG_ILLSEQ); |
|
1000 else |
|
1001 SETERROR(REG_ECOLLATE); /* neither */ |
|
1002 return(0); |
|
1003 } |
|
1004 |
|
1005 /* |
|
1006 - othercase - return the case counterpart of an alphabetic |
|
1007 == static char othercase(int ch); |
|
1008 */ |
|
1009 static wint_t /* if no counterpart, return ch */ |
|
1010 othercase(ch) |
|
1011 wint_t ch; |
|
1012 { |
|
1013 assert(iswalpha(ch)); |
|
1014 if (iswupper(ch)) |
|
1015 return(towlower(ch)); |
|
1016 else if (iswlower(ch)) |
|
1017 return(towupper(ch)); |
|
1018 else /* peculiar, but could happen */ |
|
1019 return(ch); |
|
1020 } |
|
1021 |
|
1022 /* |
|
1023 - bothcases - emit a dualcase version of a two-case character |
|
1024 == static void bothcases(struct parse *p, int ch); |
|
1025 * |
|
1026 * Boy, is this implementation ever a kludge... |
|
1027 */ |
|
1028 static void |
|
1029 bothcases(p, ch) |
|
1030 struct parse *p; |
|
1031 wint_t ch; |
|
1032 { |
|
1033 char *oldnext = p->next; |
|
1034 char *oldend = p->end; |
|
1035 char bracket[3 + MB_LEN_MAX]; |
|
1036 size_t n; |
|
1037 mbstate_t mbs; |
|
1038 |
|
1039 assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
|
1040 p->next = bracket; |
|
1041 memset(&mbs, 0, sizeof(mbs)); |
|
1042 n = wcrtomb(bracket, ch, &mbs); |
|
1043 assert(n != (size_t)-1); |
|
1044 bracket[n] = ']'; |
|
1045 bracket[n + 1] = '\0'; |
|
1046 p->end = bracket+n+1; |
|
1047 p_bracket(p); |
|
1048 assert(p->next == p->end); |
|
1049 p->next = oldnext; |
|
1050 p->end = oldend; |
|
1051 } |
|
1052 |
|
1053 /* |
|
1054 - ordinary - emit an ordinary character |
|
1055 == static void ordinary(struct parse *p, int ch); |
|
1056 */ |
|
1057 static void |
|
1058 ordinary(p, ch) |
|
1059 struct parse *p; |
|
1060 wint_t ch; |
|
1061 { |
|
1062 cset *cs; |
|
1063 |
|
1064 if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch) |
|
1065 bothcases(p, ch); |
|
1066 else if ((ch & OPDMASK) == ch) |
|
1067 EMIT(OCHAR, ch); |
|
1068 else { |
|
1069 /* |
|
1070 * Kludge: character is too big to fit into an OCHAR operand. |
|
1071 * Emit a singleton set. |
|
1072 */ |
|
1073 if ((cs = allocset(p)) == NULL) |
|
1074 return; |
|
1075 CHadd(p, cs, ch); |
|
1076 EMIT(OANYOF, (int)(cs - p->g->sets)); |
|
1077 } |
|
1078 } |
|
1079 |
|
1080 /* |
|
1081 - nonnewline - emit REG_NEWLINE version of OANY |
|
1082 == static void nonnewline(struct parse *p); |
|
1083 * |
|
1084 * Boy, is this implementation ever a kludge... |
|
1085 */ |
|
1086 static void |
|
1087 nonnewline(p) |
|
1088 struct parse *p; |
|
1089 { |
|
1090 char *oldnext = p->next; |
|
1091 char *oldend = p->end; |
|
1092 char bracket[4]; |
|
1093 |
|
1094 p->next = bracket; |
|
1095 p->end = bracket+3; |
|
1096 bracket[0] = '^'; |
|
1097 bracket[1] = '\n'; |
|
1098 bracket[2] = ']'; |
|
1099 bracket[3] = '\0'; |
|
1100 p_bracket(p); |
|
1101 assert(p->next == bracket+3); |
|
1102 p->next = oldnext; |
|
1103 p->end = oldend; |
|
1104 } |
|
1105 |
|
1106 /* |
|
1107 - repeat - generate code for a bounded repetition, recursively if needed |
|
1108 == static void repeat(struct parse *p, sopno start, int from, int to); |
|
1109 */ |
|
1110 static void |
|
1111 repeat(p, start, from, to) |
|
1112 struct parse *p; |
|
1113 sopno start; /* operand from here to end of strip */ |
|
1114 int from; /* repeated from this number */ |
|
1115 int to; /* to this number of times (maybe INFINITY) */ |
|
1116 { |
|
1117 sopno finish = HERE(); |
|
1118 # define N 2 |
|
1119 # define INF 3 |
|
1120 # define REP(f, t) ((f)*8 + (t)) |
|
1121 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
|
1122 sopno copy; |
|
1123 |
|
1124 if (p->error != 0) /* head off possible runaway recursion */ |
|
1125 return; |
|
1126 |
|
1127 assert(from <= to); |
|
1128 |
|
1129 switch (REP(MAP(from), MAP(to))) { |
|
1130 case REP(0, 0): /* must be user doing this */ |
|
1131 DROP(finish-start); /* drop the operand */ |
|
1132 break; |
|
1133 case REP(0, 1): /* as x{1,1}? */ |
|
1134 case REP(0, N): /* as x{1,n}? */ |
|
1135 case REP(0, INF): /* as x{1,}? */ |
|
1136 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
|
1137 INSERT(OCH_, start); /* offset is wrong... */ |
|
1138 repeat(p, start+1, 1, to); |
|
1139 ASTERN(OOR1, start); |
|
1140 AHEAD(start); /* ... fix it */ |
|
1141 EMIT(OOR2, 0); |
|
1142 AHEAD(THERE()); |
|
1143 ASTERN(O_CH, THERETHERE()); |
|
1144 break; |
|
1145 case REP(1, 1): /* trivial case */ |
|
1146 /* done */ |
|
1147 break; |
|
1148 case REP(1, N): /* as x?x{1,n-1} */ |
|
1149 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
|
1150 INSERT(OCH_, start); |
|
1151 ASTERN(OOR1, start); |
|
1152 AHEAD(start); |
|
1153 EMIT(OOR2, 0); /* offset very wrong... */ |
|
1154 AHEAD(THERE()); /* ...so fix it */ |
|
1155 ASTERN(O_CH, THERETHERE()); |
|
1156 copy = dupl(p, start+1, finish+1); |
|
1157 assert(copy == finish+4); |
|
1158 repeat(p, copy, 1, to-1); |
|
1159 break; |
|
1160 case REP(1, INF): /* as x+ */ |
|
1161 INSERT(OPLUS_, start); |
|
1162 ASTERN(O_PLUS, start); |
|
1163 break; |
|
1164 case REP(N, N): /* as xx{m-1,n-1} */ |
|
1165 copy = dupl(p, start, finish); |
|
1166 repeat(p, copy, from-1, to-1); |
|
1167 break; |
|
1168 case REP(N, INF): /* as xx{n-1,INF} */ |
|
1169 copy = dupl(p, start, finish); |
|
1170 repeat(p, copy, from-1, to); |
|
1171 break; |
|
1172 default: /* "can't happen" */ |
|
1173 SETERROR(REG_ASSERT); /* just in case */ |
|
1174 break; |
|
1175 } |
|
1176 } |
|
1177 |
|
1178 /* |
|
1179 - wgetnext - helper function for WGETNEXT() macro. Gets the next wide |
|
1180 - character from the parse struct, signals a REG_ILLSEQ error if the |
|
1181 - character can't be converted. Returns the number of bytes consumed. |
|
1182 */ |
|
1183 static wint_t |
|
1184 wgetnext(p) |
|
1185 struct parse *p; |
|
1186 { |
|
1187 mbstate_t mbs; |
|
1188 wchar_t wc; |
|
1189 size_t n; |
|
1190 |
|
1191 memset(&mbs, 0, sizeof(mbs)); |
|
1192 n = mbrtowc(&wc, p->next, p->end - p->next, &mbs); |
|
1193 if (n == (size_t)-1 || n == (size_t)-2) { |
|
1194 SETERROR(REG_ILLSEQ); |
|
1195 return (0); |
|
1196 } |
|
1197 if (n == 0) |
|
1198 n = 1; |
|
1199 p->next += n; |
|
1200 return (wc); |
|
1201 } |
|
1202 |
|
1203 /* |
|
1204 - seterr - set an error condition |
|
1205 == static int seterr(struct parse *p, int e); |
|
1206 */ |
|
1207 static int /* useless but makes type checking happy */ |
|
1208 seterr(p, e) |
|
1209 struct parse *p; |
|
1210 int e; |
|
1211 { |
|
1212 if (p->error == 0) /* keep earliest error condition */ |
|
1213 p->error = e; |
|
1214 p->next = nuls; /* try to bring things to a halt */ |
|
1215 p->end = nuls; |
|
1216 return(0); /* make the return value well-defined */ |
|
1217 } |
|
1218 |
|
1219 /* |
|
1220 - allocset - allocate a set of characters for [] |
|
1221 == static cset *allocset(struct parse *p); |
|
1222 */ |
|
1223 static cset * |
|
1224 allocset(p) |
|
1225 struct parse *p; |
|
1226 { |
|
1227 cset *cs, *ncs; |
|
1228 |
|
1229 ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs)); |
|
1230 if (ncs == NULL) { |
|
1231 SETERROR(REG_ESPACE); |
|
1232 return (NULL); |
|
1233 } |
|
1234 p->g->sets = ncs; |
|
1235 cs = &p->g->sets[p->g->ncsets++]; |
|
1236 memset(cs, 0, sizeof(*cs)); |
|
1237 |
|
1238 return(cs); |
|
1239 } |
|
1240 |
|
1241 /* |
|
1242 - freeset - free a now-unused set |
|
1243 == static void freeset(struct parse *p, cset *cs); |
|
1244 */ |
|
1245 static void |
|
1246 freeset(p, cs) |
|
1247 struct parse *p; |
|
1248 cset *cs; |
|
1249 { |
|
1250 cset *top = &p->g->sets[p->g->ncsets]; |
|
1251 |
|
1252 free(cs->wides); |
|
1253 free(cs->ranges); |
|
1254 free(cs->types); |
|
1255 memset(cs, 0, sizeof(*cs)); |
|
1256 if (cs == top-1) /* recover only the easy case */ |
|
1257 p->g->ncsets--; |
|
1258 } |
|
1259 |
|
1260 /* |
|
1261 - singleton - Determine whether a set contains only one character, |
|
1262 - returning it if so, otherwise returning OUT. |
|
1263 */ |
|
1264 static wint_t |
|
1265 singleton(cs) |
|
1266 cset *cs; |
|
1267 { |
|
1268 wint_t i, s, n; |
|
1269 |
|
1270 for (i = n = 0; i < NC; i++) |
|
1271 if (CHIN(cs, i)) { |
|
1272 n++; |
|
1273 s = i; |
|
1274 } |
|
1275 if (n == 1) |
|
1276 return (s); |
|
1277 if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 && |
|
1278 cs->icase == 0) |
|
1279 return (cs->wides[0]); |
|
1280 /* Don't bother handling the other cases. */ |
|
1281 return (OUT); |
|
1282 } |
|
1283 |
|
1284 /* |
|
1285 - CHadd - add character to character set. |
|
1286 */ |
|
1287 static void |
|
1288 CHadd(p, cs, ch) |
|
1289 struct parse *p; |
|
1290 cset *cs; |
|
1291 wint_t ch; |
|
1292 { |
|
1293 wint_t nch, *newwides; |
|
1294 assert(ch >= 0); |
|
1295 if (ch < NC) |
|
1296 cs->bmp[ch >> 3] |= 1 << (ch & 7); |
|
1297 else { |
|
1298 newwides = realloc(cs->wides, (cs->nwides + 1) * |
|
1299 sizeof(*cs->wides)); |
|
1300 if (newwides == NULL) { |
|
1301 SETERROR(REG_ESPACE); |
|
1302 return; |
|
1303 } |
|
1304 cs->wides = newwides; |
|
1305 cs->wides[cs->nwides++] = ch; |
|
1306 } |
|
1307 if (cs->icase) { |
|
1308 if ((nch = towlower(ch)) < NC) |
|
1309 cs->bmp[nch >> 3] |= 1 << (nch & 7); |
|
1310 if ((nch = towupper(ch)) < NC) |
|
1311 cs->bmp[nch >> 3] |= 1 << (nch & 7); |
|
1312 } |
|
1313 } |
|
1314 |
|
1315 /* |
|
1316 - CHaddrange - add all characters in the range [min,max] to a character set. |
|
1317 */ |
|
1318 static void |
|
1319 CHaddrange(p, cs, min, max) |
|
1320 struct parse *p; |
|
1321 cset *cs; |
|
1322 wint_t min, max; |
|
1323 { |
|
1324 crange *newranges; |
|
1325 |
|
1326 for (; min < NC && min <= max; min++) |
|
1327 CHadd(p, cs, min); |
|
1328 if (min >= max) |
|
1329 return; |
|
1330 newranges = realloc(cs->ranges, (cs->nranges + 1) * |
|
1331 sizeof(*cs->ranges)); |
|
1332 if (newranges == NULL) { |
|
1333 SETERROR(REG_ESPACE); |
|
1334 return; |
|
1335 } |
|
1336 cs->ranges = newranges; |
|
1337 cs->ranges[cs->nranges].min = min; |
|
1338 cs->ranges[cs->nranges].min = max; |
|
1339 cs->nranges++; |
|
1340 } |
|
1341 |
|
1342 /* |
|
1343 - CHaddtype - add all characters of a certain type to a character set. |
|
1344 */ |
|
1345 static void |
|
1346 CHaddtype(p, cs, wct) |
|
1347 struct parse *p; |
|
1348 cset *cs; |
|
1349 wctype_t wct; |
|
1350 { |
|
1351 wint_t i; |
|
1352 wctype_t *newtypes; |
|
1353 |
|
1354 for (i = 0; i < NC; i++) |
|
1355 if (iswctype(i, wct)) |
|
1356 CHadd(p, cs, i); |
|
1357 newtypes = realloc(cs->types, (cs->ntypes + 1) * |
|
1358 sizeof(*cs->types)); |
|
1359 if (newtypes == NULL) { |
|
1360 SETERROR(REG_ESPACE); |
|
1361 return; |
|
1362 } |
|
1363 cs->types = newtypes; |
|
1364 cs->types[cs->ntypes++] = wct; |
|
1365 } |
|
1366 |
|
1367 /* |
|
1368 - dupl - emit a duplicate of a bunch of sops |
|
1369 == static sopno dupl(struct parse *p, sopno start, sopno finish); |
|
1370 */ |
|
1371 static sopno /* start of duplicate */ |
|
1372 dupl(p, start, finish) |
|
1373 struct parse *p; |
|
1374 sopno start; /* from here */ |
|
1375 sopno finish; /* to this less one */ |
|
1376 { |
|
1377 sopno ret = HERE(); |
|
1378 sopno len = finish - start; |
|
1379 |
|
1380 assert(finish >= start); |
|
1381 if (len == 0) |
|
1382 return(ret); |
|
1383 enlarge(p, p->ssize + len); /* this many unexpected additions */ |
|
1384 assert(p->ssize >= p->slen + len); |
|
1385 (void) memcpy((char *)(p->strip + p->slen), |
|
1386 (char *)(p->strip + start), (size_t)len*sizeof(sop)); |
|
1387 p->slen += len; |
|
1388 return(ret); |
|
1389 } |
|
1390 |
|
1391 /* |
|
1392 - doemit - emit a strip operator |
|
1393 == static void doemit(struct parse *p, sop op, size_t opnd); |
|
1394 * |
|
1395 * It might seem better to implement this as a macro with a function as |
|
1396 * hard-case backup, but it's just too big and messy unless there are |
|
1397 * some changes to the data structures. Maybe later. |
|
1398 */ |
|
1399 static void |
|
1400 doemit(p, op, opnd) |
|
1401 struct parse *p; |
|
1402 sop op; |
|
1403 size_t opnd; |
|
1404 { |
|
1405 /* avoid making error situations worse */ |
|
1406 if (p->error != 0) |
|
1407 return; |
|
1408 |
|
1409 /* deal with oversize operands ("can't happen", more or less) */ |
|
1410 assert(opnd < 1<<OPSHIFT); |
|
1411 |
|
1412 /* deal with undersized strip */ |
|
1413 if (p->slen >= p->ssize) |
|
1414 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ |
|
1415 assert(p->slen < p->ssize); |
|
1416 |
|
1417 /* finally, it's all reduced to the easy case */ |
|
1418 #ifdef __SYMBIAN32__ |
|
1419 if(p->error != REG_ESPACE) |
|
1420 { |
|
1421 p->strip[p->slen++] = SOP(op, opnd); |
|
1422 } |
|
1423 #else |
|
1424 p->strip[p->slen++] = SOP(op, opnd); |
|
1425 #endif |
|
1426 } |
|
1427 |
|
1428 /* |
|
1429 - doinsert - insert a sop into the strip |
|
1430 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); |
|
1431 */ |
|
1432 static void |
|
1433 doinsert(p, op, opnd, pos) |
|
1434 struct parse *p; |
|
1435 sop op; |
|
1436 size_t opnd; |
|
1437 sopno pos; |
|
1438 { |
|
1439 sopno sn; |
|
1440 sop s; |
|
1441 int i; |
|
1442 |
|
1443 /* avoid making error situations worse */ |
|
1444 if (p->error != 0) |
|
1445 return; |
|
1446 |
|
1447 sn = HERE(); |
|
1448 EMIT(op, opnd); /* do checks, ensure space */ |
|
1449 assert(HERE() == sn+1); |
|
1450 s = p->strip[sn]; |
|
1451 |
|
1452 /* adjust paren pointers */ |
|
1453 assert(pos > 0); |
|
1454 for (i = 1; i < NPAREN; i++) { |
|
1455 if (p->pbegin[i] >= pos) { |
|
1456 p->pbegin[i]++; |
|
1457 } |
|
1458 if (p->pend[i] >= pos) { |
|
1459 p->pend[i]++; |
|
1460 } |
|
1461 } |
|
1462 |
|
1463 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
|
1464 (HERE()-pos-1)*sizeof(sop)); |
|
1465 p->strip[pos] = s; |
|
1466 } |
|
1467 |
|
1468 /* |
|
1469 - dofwd - complete a forward reference |
|
1470 == static void dofwd(struct parse *p, sopno pos, sop value); |
|
1471 */ |
|
1472 static void |
|
1473 dofwd(p, pos, value) |
|
1474 struct parse *p; |
|
1475 sopno pos; |
|
1476 sop value; |
|
1477 { |
|
1478 /* avoid making error situations worse */ |
|
1479 if (p->error != 0) |
|
1480 return; |
|
1481 |
|
1482 assert(value < 1<<OPSHIFT); |
|
1483 p->strip[pos] = OP(p->strip[pos]) | value; |
|
1484 } |
|
1485 |
|
1486 /* |
|
1487 - enlarge - enlarge the strip |
|
1488 == static void enlarge(struct parse *p, sopno size); |
|
1489 */ |
|
1490 static void |
|
1491 enlarge(p, size) |
|
1492 struct parse *p; |
|
1493 sopno size; |
|
1494 { |
|
1495 sop *sp; |
|
1496 |
|
1497 if (p->ssize >= size) |
|
1498 return; |
|
1499 |
|
1500 sp = (sop *)realloc(p->strip, size*sizeof(sop)); |
|
1501 if (sp == NULL) { |
|
1502 SETERROR(REG_ESPACE); |
|
1503 return; |
|
1504 } |
|
1505 p->strip = sp; |
|
1506 p->ssize = size; |
|
1507 } |
|
1508 |
|
1509 /* |
|
1510 - stripsnug - compact the strip |
|
1511 == static void stripsnug(struct parse *p, struct re_guts *g); |
|
1512 */ |
|
1513 static void |
|
1514 stripsnug(p, g) |
|
1515 struct parse *p; |
|
1516 struct re_guts *g; |
|
1517 { |
|
1518 g->nstates = p->slen; |
|
1519 |
|
1520 #ifdef __SYMBIAN32__ |
|
1521 if(p->error==REG_ESPACE) |
|
1522 { |
|
1523 g->strip=p->strip; |
|
1524 return; |
|
1525 } |
|
1526 #endif |
|
1527 |
|
1528 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); |
|
1529 if (g->strip == NULL) { |
|
1530 SETERROR(REG_ESPACE); |
|
1531 g->strip = p->strip; |
|
1532 } |
|
1533 } |
|
1534 |
|
1535 /* |
|
1536 - findmust - fill in must and mlen with longest mandatory literal string |
|
1537 == static void findmust(struct parse *p, struct re_guts *g); |
|
1538 * |
|
1539 * This algorithm could do fancy things like analyzing the operands of | |
|
1540 * for common subsequences. Someday. This code is simple and finds most |
|
1541 * of the interesting cases. |
|
1542 * |
|
1543 * Note that must and mlen got initialized during setup. |
|
1544 */ |
|
1545 static void |
|
1546 findmust(p, g) |
|
1547 struct parse *p; |
|
1548 struct re_guts *g; |
|
1549 { |
|
1550 sop *scan; |
|
1551 sop *start; |
|
1552 sop *newstart; |
|
1553 sopno newlen; |
|
1554 sop s; |
|
1555 char *cp; |
|
1556 int offset; |
|
1557 char buf[MB_LEN_MAX]; |
|
1558 size_t clen; |
|
1559 mbstate_t mbs; |
|
1560 |
|
1561 /* avoid making error situations worse */ |
|
1562 if (p->error != 0) |
|
1563 return; |
|
1564 |
|
1565 /* |
|
1566 * It's not generally safe to do a ``char'' substring search on |
|
1567 * multibyte character strings, but it's safe for at least |
|
1568 * UTF-8 (see RFC 3629). |
|
1569 */ |
|
1570 if (MB_CUR_MAX > 1 && |
|
1571 #ifdef __SYMBIAN32__ |
|
1572 strcmp(nl_langinfo(CODESET), "UTF-8") != 0) |
|
1573 #else |
|
1574 strcmp(_CurrentRuneLocale->__encoding, "UTF-8")!= 0) |
|
1575 #endif |
|
1576 return; |
|
1577 |
|
1578 /* find the longest OCHAR sequence in strip */ |
|
1579 newlen = 0; |
|
1580 offset = 0; |
|
1581 g->moffset = 0; |
|
1582 scan = g->strip + 1; |
|
1583 do { |
|
1584 s = *scan++; |
|
1585 switch (OP(s)) { |
|
1586 case OCHAR: /* sequence member */ |
|
1587 if (newlen == 0) { /* new sequence */ |
|
1588 memset(&mbs, 0, sizeof(mbs)); |
|
1589 newstart = scan - 1; |
|
1590 } |
|
1591 clen = wcrtomb(buf, OPND(s), &mbs); |
|
1592 if (clen == (size_t)-1) |
|
1593 goto toohard; |
|
1594 newlen += clen; |
|
1595 break; |
|
1596 case OPLUS_: /* things that don't break one */ |
|
1597 case OLPAREN: |
|
1598 case ORPAREN: |
|
1599 break; |
|
1600 case OQUEST_: /* things that must be skipped */ |
|
1601 case OCH_: |
|
1602 offset = altoffset(scan, offset); |
|
1603 scan--; |
|
1604 do { |
|
1605 scan += OPND(s); |
|
1606 s = *scan; |
|
1607 /* assert() interferes w debug printouts */ |
|
1608 if (OP(s) != O_QUEST && OP(s) != O_CH && |
|
1609 OP(s) != OOR2) { |
|
1610 g->iflags |= BAD; |
|
1611 return; |
|
1612 } |
|
1613 } while (OP(s) != O_QUEST && OP(s) != O_CH); |
|
1614 /* FALLTHROUGH */ |
|
1615 case OBOW: /* things that break a sequence */ |
|
1616 case OEOW: |
|
1617 case OBOL: |
|
1618 case OEOL: |
|
1619 case O_QUEST: |
|
1620 case O_CH: |
|
1621 case OEND: |
|
1622 if (newlen > g->mlen) { /* ends one */ |
|
1623 start = newstart; |
|
1624 g->mlen = newlen; |
|
1625 if (offset > -1) { |
|
1626 g->moffset += offset; |
|
1627 offset = newlen; |
|
1628 } else |
|
1629 g->moffset = offset; |
|
1630 } else { |
|
1631 if (offset > -1) |
|
1632 offset += newlen; |
|
1633 } |
|
1634 newlen = 0; |
|
1635 break; |
|
1636 case OANY: |
|
1637 if (newlen > g->mlen) { /* ends one */ |
|
1638 start = newstart; |
|
1639 g->mlen = newlen; |
|
1640 if (offset > -1) { |
|
1641 g->moffset += offset; |
|
1642 offset = newlen; |
|
1643 } else |
|
1644 g->moffset = offset; |
|
1645 } else { |
|
1646 if (offset > -1) |
|
1647 offset += newlen; |
|
1648 } |
|
1649 if (offset > -1) |
|
1650 offset++; |
|
1651 newlen = 0; |
|
1652 break; |
|
1653 case OANYOF: /* may or may not invalidate offset */ |
|
1654 /* First, everything as OANY */ |
|
1655 if (newlen > g->mlen) { /* ends one */ |
|
1656 start = newstart; |
|
1657 g->mlen = newlen; |
|
1658 if (offset > -1) { |
|
1659 g->moffset += offset; |
|
1660 offset = newlen; |
|
1661 } else |
|
1662 g->moffset = offset; |
|
1663 } else { |
|
1664 if (offset > -1) |
|
1665 offset += newlen; |
|
1666 } |
|
1667 if (offset > -1) |
|
1668 offset++; |
|
1669 newlen = 0; |
|
1670 break; |
|
1671 toohard: |
|
1672 default: |
|
1673 /* Anything here makes it impossible or too hard |
|
1674 * to calculate the offset -- so we give up; |
|
1675 * save the last known good offset, in case the |
|
1676 * must sequence doesn't occur later. |
|
1677 */ |
|
1678 if (newlen > g->mlen) { /* ends one */ |
|
1679 start = newstart; |
|
1680 g->mlen = newlen; |
|
1681 if (offset > -1) |
|
1682 g->moffset += offset; |
|
1683 else |
|
1684 g->moffset = offset; |
|
1685 } |
|
1686 offset = -1; |
|
1687 newlen = 0; |
|
1688 break; |
|
1689 } |
|
1690 } while (OP(s) != OEND); |
|
1691 |
|
1692 if (g->mlen == 0) { /* there isn't one */ |
|
1693 g->moffset = -1; |
|
1694 return; |
|
1695 } |
|
1696 |
|
1697 /* turn it into a character string */ |
|
1698 g->must = malloc((size_t)g->mlen + 1); |
|
1699 if (g->must == NULL) { /* argh; just forget it */ |
|
1700 g->mlen = 0; |
|
1701 g->moffset = -1; |
|
1702 return; |
|
1703 } |
|
1704 cp = g->must; |
|
1705 scan = start; |
|
1706 memset(&mbs, 0, sizeof(mbs)); |
|
1707 while (cp < g->must + g->mlen) { |
|
1708 while (OP(s = *scan++) != OCHAR) |
|
1709 continue; |
|
1710 clen = wcrtomb(cp, OPND(s), &mbs); |
|
1711 assert(clen != (size_t)-1); |
|
1712 cp += clen; |
|
1713 } |
|
1714 assert(cp == g->must + g->mlen); |
|
1715 *cp++ = '\0'; /* just on general principles */ |
|
1716 } |
|
1717 |
|
1718 /* |
|
1719 - altoffset - choose biggest offset among multiple choices |
|
1720 == static int altoffset(sop *scan, int offset); |
|
1721 * |
|
1722 * Compute, recursively if necessary, the largest offset among multiple |
|
1723 * re paths. |
|
1724 */ |
|
1725 static int |
|
1726 altoffset(scan, offset) |
|
1727 sop *scan; |
|
1728 int offset; |
|
1729 { |
|
1730 int largest; |
|
1731 int try; |
|
1732 sop s; |
|
1733 |
|
1734 /* If we gave up already on offsets, return */ |
|
1735 if (offset == -1) |
|
1736 return -1; |
|
1737 |
|
1738 largest = 0; |
|
1739 try = 0; |
|
1740 s = *scan++; |
|
1741 while (OP(s) != O_QUEST && OP(s) != O_CH) { |
|
1742 switch (OP(s)) { |
|
1743 case OOR1: |
|
1744 if (try > largest) |
|
1745 largest = try; |
|
1746 try = 0; |
|
1747 break; |
|
1748 case OQUEST_: |
|
1749 case OCH_: |
|
1750 try = altoffset(scan, try); |
|
1751 if (try == -1) |
|
1752 return -1; |
|
1753 scan--; |
|
1754 do { |
|
1755 scan += OPND(s); |
|
1756 s = *scan; |
|
1757 if (OP(s) != O_QUEST && OP(s) != O_CH && |
|
1758 OP(s) != OOR2) |
|
1759 return -1; |
|
1760 } while (OP(s) != O_QUEST && OP(s) != O_CH); |
|
1761 /* We must skip to the next position, or we'll |
|
1762 * leave altoffset() too early. |
|
1763 */ |
|
1764 scan++; |
|
1765 break; |
|
1766 case OANYOF: |
|
1767 case OCHAR: |
|
1768 case OANY: |
|
1769 try++; |
|
1770 case OBOW: |
|
1771 case OEOW: |
|
1772 case OLPAREN: |
|
1773 case ORPAREN: |
|
1774 case OOR2: |
|
1775 break; |
|
1776 default: |
|
1777 try = -1; |
|
1778 break; |
|
1779 } |
|
1780 if (try == -1) |
|
1781 return -1; |
|
1782 s = *scan++; |
|
1783 } |
|
1784 |
|
1785 if (try > largest) |
|
1786 largest = try; |
|
1787 |
|
1788 return largest+offset; |
|
1789 } |
|
1790 |
|
1791 /* |
|
1792 - computejumps - compute char jumps for BM scan |
|
1793 == static void computejumps(struct parse *p, struct re_guts *g); |
|
1794 * |
|
1795 * This algorithm assumes g->must exists and is has size greater than |
|
1796 * zero. It's based on the algorithm found on Computer Algorithms by |
|
1797 * Sara Baase. |
|
1798 * |
|
1799 * A char jump is the number of characters one needs to jump based on |
|
1800 * the value of the character from the text that was mismatched. |
|
1801 */ |
|
1802 static void |
|
1803 computejumps(p, g) |
|
1804 struct parse *p; |
|
1805 struct re_guts *g; |
|
1806 { |
|
1807 int ch; |
|
1808 int mindex; |
|
1809 |
|
1810 /* Avoid making errors worse */ |
|
1811 if (p->error != 0) |
|
1812 return; |
|
1813 |
|
1814 g->charjump = (int*) malloc((NC + 1) * sizeof(int)); |
|
1815 if (g->charjump == NULL) /* Not a fatal error */ |
|
1816 return; |
|
1817 /* Adjust for signed chars, if necessary */ |
|
1818 g->charjump = &g->charjump[-(CHAR_MIN)]; |
|
1819 |
|
1820 /* If the character does not exist in the pattern, the jump |
|
1821 * is equal to the number of characters in the pattern. |
|
1822 */ |
|
1823 for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++) |
|
1824 g->charjump[ch] = g->mlen; |
|
1825 |
|
1826 /* If the character does exist, compute the jump that would |
|
1827 * take us to the last character in the pattern equal to it |
|
1828 * (notice that we match right to left, so that last character |
|
1829 * is the first one that would be matched). |
|
1830 */ |
|
1831 for (mindex = 0; mindex < g->mlen; mindex++) |
|
1832 g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1; |
|
1833 } |
|
1834 |
|
1835 /* |
|
1836 - computematchjumps - compute match jumps for BM scan |
|
1837 == static void computematchjumps(struct parse *p, struct re_guts *g); |
|
1838 * |
|
1839 * This algorithm assumes g->must exists and is has size greater than |
|
1840 * zero. It's based on the algorithm found on Computer Algorithms by |
|
1841 * Sara Baase. |
|
1842 * |
|
1843 * A match jump is the number of characters one needs to advance based |
|
1844 * on the already-matched suffix. |
|
1845 * Notice that all values here are minus (g->mlen-1), because of the way |
|
1846 * the search algorithm works. |
|
1847 */ |
|
1848 static void |
|
1849 computematchjumps(p, g) |
|
1850 struct parse *p; |
|
1851 struct re_guts *g; |
|
1852 { |
|
1853 int mindex; /* General "must" iterator */ |
|
1854 int suffix; /* Keeps track of matching suffix */ |
|
1855 int ssuffix; /* Keeps track of suffixes' suffix */ |
|
1856 int* pmatches; /* pmatches[k] points to the next i |
|
1857 * such that i+1...mlen is a substring |
|
1858 * of k+1...k+mlen-i-1 |
|
1859 */ |
|
1860 |
|
1861 /* Avoid making errors worse */ |
|
1862 if (p->error != 0) |
|
1863 return; |
|
1864 |
|
1865 pmatches = (int*) malloc(g->mlen * sizeof(unsigned int)); |
|
1866 if (pmatches == NULL) { |
|
1867 g->matchjump = NULL; |
|
1868 return; |
|
1869 } |
|
1870 |
|
1871 g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int)); |
|
1872 if (g->matchjump == NULL) /* Not a fatal error */ |
|
1873 return; |
|
1874 |
|
1875 /* Set maximum possible jump for each character in the pattern */ |
|
1876 for (mindex = 0; mindex < g->mlen; mindex++) |
|
1877 g->matchjump[mindex] = 2*g->mlen - mindex - 1; |
|
1878 |
|
1879 /* Compute pmatches[] */ |
|
1880 for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; |
|
1881 mindex--, suffix--) { |
|
1882 pmatches[mindex] = suffix; |
|
1883 |
|
1884 /* If a mismatch is found, interrupting the substring, |
|
1885 * compute the matchjump for that position. If no |
|
1886 * mismatch is found, then a text substring mismatched |
|
1887 * against the suffix will also mismatch against the |
|
1888 * substring. |
|
1889 */ |
|
1890 while (suffix < g->mlen |
|
1891 && g->must[mindex] != g->must[suffix]) { |
|
1892 g->matchjump[suffix] = MIN(g->matchjump[suffix], |
|
1893 g->mlen - mindex - 1); |
|
1894 suffix = pmatches[suffix]; |
|
1895 } |
|
1896 } |
|
1897 |
|
1898 /* Compute the matchjump up to the last substring found to jump |
|
1899 * to the beginning of the largest must pattern prefix matching |
|
1900 * it's own suffix. |
|
1901 */ |
|
1902 for (mindex = 0; mindex <= suffix; mindex++) |
|
1903 g->matchjump[mindex] = MIN(g->matchjump[mindex], |
|
1904 g->mlen + suffix - mindex); |
|
1905 |
|
1906 ssuffix = pmatches[suffix]; |
|
1907 while (suffix < g->mlen) { |
|
1908 while (suffix <= ssuffix && suffix < g->mlen) { |
|
1909 g->matchjump[suffix] = MIN(g->matchjump[suffix], |
|
1910 g->mlen + ssuffix - suffix); |
|
1911 suffix++; |
|
1912 } |
|
1913 if (suffix < g->mlen) |
|
1914 ssuffix = pmatches[ssuffix]; |
|
1915 } |
|
1916 |
|
1917 free(pmatches); |
|
1918 } |
|
1919 |
|
1920 /* |
|
1921 - pluscount - count + nesting |
|
1922 == static sopno pluscount(struct parse *p, struct re_guts *g); |
|
1923 */ |
|
1924 static sopno /* nesting depth */ |
|
1925 pluscount(p, g) |
|
1926 struct parse *p; |
|
1927 struct re_guts *g; |
|
1928 { |
|
1929 sop *scan; |
|
1930 sop s; |
|
1931 sopno plusnest = 0; |
|
1932 sopno maxnest = 0; |
|
1933 |
|
1934 if (p->error != 0) |
|
1935 return(0); /* there may not be an OEND */ |
|
1936 |
|
1937 scan = g->strip + 1; |
|
1938 do { |
|
1939 s = *scan++; |
|
1940 switch (OP(s)) { |
|
1941 case OPLUS_: |
|
1942 plusnest++; |
|
1943 break; |
|
1944 case O_PLUS: |
|
1945 if (plusnest > maxnest) |
|
1946 maxnest = plusnest; |
|
1947 plusnest--; |
|
1948 break; |
|
1949 } |
|
1950 } while (OP(s) != OEND); |
|
1951 if (plusnest != 0) |
|
1952 g->iflags |= BAD; |
|
1953 return(maxnest); |
|
1954 } |