|
1 /************************************************* |
|
2 * PCRE DEMONSTRATION PROGRAM * |
|
3 *************************************************/ |
|
4 |
|
5 /* This is a demonstration program to illustrate the most straightforward ways |
|
6 of calling the PCRE regular expression library from a C program. See the |
|
7 pcresample documentation for a short discussion ("man pcresample" if you have |
|
8 the PCRE man pages installed). |
|
9 |
|
10 In Unix-like environments, compile this program thuswise: |
|
11 |
|
12 gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \ |
|
13 -R/usr/local/lib -lpcre |
|
14 |
|
15 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and |
|
16 library files for PCRE are installed on your system. You don't need -I and -L |
|
17 if PCRE is installed in the standard system libraries. Only some operating |
|
18 systems (e.g. Solaris) use the -R option. |
|
19 |
|
20 Building under Windows: |
|
21 |
|
22 If you want to statically link this program against a non-dll .a file, you must |
|
23 define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and |
|
24 pcre_free() exported functions will be declared __declspec(dllimport), with |
|
25 unwanted results. So in this environment, uncomment the following line. */ |
|
26 |
|
27 /* #define PCRE_STATIC */ |
|
28 |
|
29 #include <stdio.h> |
|
30 #include <string.h> |
|
31 #include <pcre.h> |
|
32 |
|
33 #define OVECCOUNT 30 /* should be a multiple of 3 */ |
|
34 |
|
35 |
|
36 int main(int argc, char **argv) |
|
37 { |
|
38 pcre *re; |
|
39 const char *error; |
|
40 char *pattern; |
|
41 char *subject; |
|
42 unsigned char *name_table; |
|
43 int erroffset; |
|
44 int find_all; |
|
45 int namecount; |
|
46 int name_entry_size; |
|
47 int ovector[OVECCOUNT]; |
|
48 int subject_length; |
|
49 int rc, i; |
|
50 |
|
51 |
|
52 /************************************************************************** |
|
53 * First, sort out the command line. There is only one possible option at * |
|
54 * the moment, "-g" to request repeated matching to find all occurrences, * |
|
55 * like Perl's /g option. We set the variable find_all to a non-zero value * |
|
56 * if the -g option is present. Apart from that, there must be exactly two * |
|
57 * arguments. * |
|
58 **************************************************************************/ |
|
59 |
|
60 find_all = 0; |
|
61 for (i = 1; i < argc; i++) |
|
62 { |
|
63 if (strcmp(argv[i], "-g") == 0) find_all = 1; |
|
64 else break; |
|
65 } |
|
66 |
|
67 /* After the options, we require exactly two arguments, which are the pattern, |
|
68 and the subject string. */ |
|
69 |
|
70 if (argc - i != 2) |
|
71 { |
|
72 printf("Two arguments required: a regex and a subject string\n"); |
|
73 return 1; |
|
74 } |
|
75 |
|
76 pattern = argv[i]; |
|
77 subject = argv[i+1]; |
|
78 subject_length = (int)strlen(subject); |
|
79 |
|
80 |
|
81 /************************************************************************* |
|
82 * Now we are going to compile the regular expression pattern, and handle * |
|
83 * and errors that are detected. * |
|
84 *************************************************************************/ |
|
85 |
|
86 re = pcre_compile( |
|
87 pattern, /* the pattern */ |
|
88 0, /* default options */ |
|
89 &error, /* for error message */ |
|
90 &erroffset, /* for error offset */ |
|
91 NULL); /* use default character tables */ |
|
92 |
|
93 /* Compilation failed: print the error message and exit */ |
|
94 |
|
95 if (re == NULL) |
|
96 { |
|
97 printf("PCRE compilation failed at offset %d: %s\n", erroffset, error); |
|
98 return 1; |
|
99 } |
|
100 |
|
101 |
|
102 /************************************************************************* |
|
103 * If the compilation succeeded, we call PCRE again, in order to do a * |
|
104 * pattern match against the subject string. This does just ONE match. If * |
|
105 * further matching is needed, it will be done below. * |
|
106 *************************************************************************/ |
|
107 |
|
108 rc = pcre_exec( |
|
109 re, /* the compiled pattern */ |
|
110 NULL, /* no extra data - we didn't study the pattern */ |
|
111 subject, /* the subject string */ |
|
112 subject_length, /* the length of the subject */ |
|
113 0, /* start at offset 0 in the subject */ |
|
114 0, /* default options */ |
|
115 ovector, /* output vector for substring information */ |
|
116 OVECCOUNT); /* number of elements in the output vector */ |
|
117 |
|
118 /* Matching failed: handle error cases */ |
|
119 |
|
120 if (rc < 0) |
|
121 { |
|
122 switch(rc) |
|
123 { |
|
124 case PCRE_ERROR_NOMATCH: printf("No match\n"); break; |
|
125 /* |
|
126 Handle other special cases if you like |
|
127 */ |
|
128 default: printf("Matching error %d\n", rc); break; |
|
129 } |
|
130 pcre_free(re); /* Release memory used for the compiled pattern */ |
|
131 return 1; |
|
132 } |
|
133 |
|
134 /* Match succeded */ |
|
135 |
|
136 printf("\nMatch succeeded at offset %d\n", ovector[0]); |
|
137 |
|
138 |
|
139 /************************************************************************* |
|
140 * We have found the first match within the subject string. If the output * |
|
141 * vector wasn't big enough, say so. Then output any substrings that were * |
|
142 * captured. * |
|
143 *************************************************************************/ |
|
144 |
|
145 /* The output vector wasn't big enough */ |
|
146 |
|
147 if (rc == 0) |
|
148 { |
|
149 rc = OVECCOUNT/3; |
|
150 printf("ovector only has room for %d captured substrings\n", rc - 1); |
|
151 } |
|
152 |
|
153 /* Show substrings stored in the output vector by number. Obviously, in a real |
|
154 application you might want to do things other than print them. */ |
|
155 |
|
156 for (i = 0; i < rc; i++) |
|
157 { |
|
158 char *substring_start = subject + ovector[2*i]; |
|
159 int substring_length = ovector[2*i+1] - ovector[2*i]; |
|
160 printf("%2d: %.*s\n", i, substring_length, substring_start); |
|
161 } |
|
162 |
|
163 |
|
164 /************************************************************************** |
|
165 * That concludes the basic part of this demonstration program. We have * |
|
166 * compiled a pattern, and performed a single match. The code that follows * |
|
167 * shows first how to access named substrings, and then how to code for * |
|
168 * repeated matches on the same subject. * |
|
169 **************************************************************************/ |
|
170 |
|
171 /* See if there are any named substrings, and if so, show them by name. First |
|
172 we have to extract the count of named parentheses from the pattern. */ |
|
173 |
|
174 (void)pcre_fullinfo( |
|
175 re, /* the compiled pattern */ |
|
176 NULL, /* no extra data - we didn't study the pattern */ |
|
177 PCRE_INFO_NAMECOUNT, /* number of named substrings */ |
|
178 &namecount); /* where to put the answer */ |
|
179 |
|
180 if (namecount <= 0) printf("No named substrings\n"); else |
|
181 { |
|
182 unsigned char *tabptr; |
|
183 printf("Named substrings\n"); |
|
184 |
|
185 /* Before we can access the substrings, we must extract the table for |
|
186 translating names to numbers, and the size of each entry in the table. */ |
|
187 |
|
188 (void)pcre_fullinfo( |
|
189 re, /* the compiled pattern */ |
|
190 NULL, /* no extra data - we didn't study the pattern */ |
|
191 PCRE_INFO_NAMETABLE, /* address of the table */ |
|
192 &name_table); /* where to put the answer */ |
|
193 |
|
194 (void)pcre_fullinfo( |
|
195 re, /* the compiled pattern */ |
|
196 NULL, /* no extra data - we didn't study the pattern */ |
|
197 PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ |
|
198 &name_entry_size); /* where to put the answer */ |
|
199 |
|
200 /* Now we can scan the table and, for each entry, print the number, the name, |
|
201 and the substring itself. */ |
|
202 |
|
203 tabptr = name_table; |
|
204 for (i = 0; i < namecount; i++) |
|
205 { |
|
206 int n = (tabptr[0] << 8) | tabptr[1]; |
|
207 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, |
|
208 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); |
|
209 tabptr += name_entry_size; |
|
210 } |
|
211 } |
|
212 |
|
213 |
|
214 /************************************************************************* |
|
215 * If the "-g" option was given on the command line, we want to continue * |
|
216 * to search for additional matches in the subject string, in a similar * |
|
217 * way to the /g option in Perl. This turns out to be trickier than you * |
|
218 * might think because of the possibility of matching an empty string. * |
|
219 * What happens is as follows: * |
|
220 * * |
|
221 * If the previous match was NOT for an empty string, we can just start * |
|
222 * the next match at the end of the previous one. * |
|
223 * * |
|
224 * If the previous match WAS for an empty string, we can't do that, as it * |
|
225 * would lead to an infinite loop. Instead, a special call of pcre_exec() * |
|
226 * is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first * |
|
227 * of these tells PCRE that an empty string is not a valid match; other * |
|
228 * possibilities must be tried. The second flag restricts PCRE to one * |
|
229 * match attempt at the initial string position. If this match succeeds, * |
|
230 * an alternative to the empty string match has been found, and we can * |
|
231 * proceed round the loop. * |
|
232 *************************************************************************/ |
|
233 |
|
234 if (!find_all) |
|
235 { |
|
236 pcre_free(re); /* Release the memory used for the compiled pattern */ |
|
237 return 0; /* Finish unless -g was given */ |
|
238 } |
|
239 |
|
240 /* Loop for second and subsequent matches */ |
|
241 |
|
242 for (;;) |
|
243 { |
|
244 int options = 0; /* Normally no options */ |
|
245 int start_offset = ovector[1]; /* Start at end of previous match */ |
|
246 |
|
247 /* If the previous match was for an empty string, we are finished if we are |
|
248 at the end of the subject. Otherwise, arrange to run another match at the |
|
249 same point to see if a non-empty match can be found. */ |
|
250 |
|
251 if (ovector[0] == ovector[1]) |
|
252 { |
|
253 if (ovector[0] == subject_length) break; |
|
254 options = PCRE_NOTEMPTY | PCRE_ANCHORED; |
|
255 } |
|
256 |
|
257 /* Run the next matching operation */ |
|
258 |
|
259 rc = pcre_exec( |
|
260 re, /* the compiled pattern */ |
|
261 NULL, /* no extra data - we didn't study the pattern */ |
|
262 subject, /* the subject string */ |
|
263 subject_length, /* the length of the subject */ |
|
264 start_offset, /* starting offset in the subject */ |
|
265 options, /* options */ |
|
266 ovector, /* output vector for substring information */ |
|
267 OVECCOUNT); /* number of elements in the output vector */ |
|
268 |
|
269 /* This time, a result of NOMATCH isn't an error. If the value in "options" |
|
270 is zero, it just means we have found all possible matches, so the loop ends. |
|
271 Otherwise, it means we have failed to find a non-empty-string match at a |
|
272 point where there was a previous empty-string match. In this case, we do what |
|
273 Perl does: advance the matching position by one, and continue. We do this by |
|
274 setting the "end of previous match" offset, because that is picked up at the |
|
275 top of the loop as the point at which to start again. */ |
|
276 |
|
277 if (rc == PCRE_ERROR_NOMATCH) |
|
278 { |
|
279 if (options == 0) break; |
|
280 ovector[1] = start_offset + 1; |
|
281 continue; /* Go round the loop again */ |
|
282 } |
|
283 |
|
284 /* Other matching errors are not recoverable. */ |
|
285 |
|
286 if (rc < 0) |
|
287 { |
|
288 printf("Matching error %d\n", rc); |
|
289 pcre_free(re); /* Release memory used for the compiled pattern */ |
|
290 return 1; |
|
291 } |
|
292 |
|
293 /* Match succeded */ |
|
294 |
|
295 printf("\nMatch succeeded again at offset %d\n", ovector[0]); |
|
296 |
|
297 /* The match succeeded, but the output vector wasn't big enough. */ |
|
298 |
|
299 if (rc == 0) |
|
300 { |
|
301 rc = OVECCOUNT/3; |
|
302 printf("ovector only has room for %d captured substrings\n", rc - 1); |
|
303 } |
|
304 |
|
305 /* As before, show substrings stored in the output vector by number, and then |
|
306 also any named substrings. */ |
|
307 |
|
308 for (i = 0; i < rc; i++) |
|
309 { |
|
310 char *substring_start = subject + ovector[2*i]; |
|
311 int substring_length = ovector[2*i+1] - ovector[2*i]; |
|
312 printf("%2d: %.*s\n", i, substring_length, substring_start); |
|
313 } |
|
314 |
|
315 if (namecount <= 0) printf("No named substrings\n"); else |
|
316 { |
|
317 unsigned char *tabptr = name_table; |
|
318 printf("Named substrings\n"); |
|
319 for (i = 0; i < namecount; i++) |
|
320 { |
|
321 int n = (tabptr[0] << 8) | tabptr[1]; |
|
322 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, |
|
323 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); |
|
324 tabptr += name_entry_size; |
|
325 } |
|
326 } |
|
327 } /* End of loop to find second and subsequent matches */ |
|
328 |
|
329 printf("\n"); |
|
330 pcre_free(re); /* Release memory used for the compiled pattern */ |
|
331 return 0; |
|
332 } |
|
333 |
|
334 /* End of pcredemo.c */ |