libraries/spcre/libpcre/pcre/pcre_dfa_exec.c
changeset 0 7f656887cf89
equal deleted inserted replaced
-1:000000000000 0:7f656887cf89
       
     1 /*************************************************
       
     2 *      Perl-Compatible Regular Expressions       *
       
     3 *************************************************/
       
     4 
       
     5 /* PCRE is a library of functions to support regular expressions whose syntax
       
     6 and semantics are as close as possible to those of the Perl 5 language.
       
     7 
       
     8                        Written by Philip Hazel
       
     9            Copyright (c) 1997-2008 University of Cambridge
       
    10 
       
    11 -----------------------------------------------------------------------------
       
    12 Redistribution and use in source and binary forms, with or without
       
    13 modification, are permitted provided that the following conditions are met:
       
    14 
       
    15     * Redistributions of source code must retain the above copyright notice,
       
    16       this list of conditions and the following disclaimer.
       
    17 
       
    18     * Redistributions in binary form must reproduce the above copyright
       
    19       notice, this list of conditions and the following disclaimer in the
       
    20       documentation and/or other materials provided with the distribution.
       
    21 
       
    22     * Neither the name of the University of Cambridge nor the names of its
       
    23       contributors may be used to endorse or promote products derived from
       
    24       this software without specific prior written permission.
       
    25 
       
    26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
       
    27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
       
    30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       
    31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       
    32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       
    33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       
    34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       
    35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    36 POSSIBILITY OF SUCH DAMAGE.
       
    37 -----------------------------------------------------------------------------
       
    38 */
       
    39 
       
    40 
       
    41 /* This module contains the external function pcre_dfa_exec(), which is an
       
    42 alternative matching function that uses a sort of DFA algorithm (not a true
       
     43 FSM). This is NOT Perl-compatible, but it has advantages in certain
       
    44 applications. */
       
    45 
       
    46 
       
    47 #ifdef HAVE_CONFIG_H
       
    48 #include "config.h"
       
    49 #endif
       
    50 
       
    51 #define NLBLOCK md             /* Block containing newline information */
       
    52 #define PSSTART start_subject  /* Field containing processed string start */
       
    53 #define PSEND   end_subject    /* Field containing processed string end */
       
    54 
       
    55 #include "pcre_internal.h"
       
    56 
       
    57 
       
    58 /* For use to indent debugging output */
       
    59 
       
    60 #define SP "                   "
       
    61 
       
    62 
       
    63 
       
    64 /*************************************************
       
    65 *      Code parameters and static tables         *
       
    66 *************************************************/
       
    67 
       
    68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
       
    69 into others, under special conditions. A gap of 20 between the blocks should be
       
    70 enough. The resulting opcodes don't have to be less than 256 because they are
       
    71 never stored, so we push them well clear of the normal opcodes. */
       
    72 
       
     73 #define OP_PROP_EXTRA       300  /* added when the repeated type is OP_PROP or OP_NOTPROP */
       
     74 #define OP_EXTUNI_EXTRA     320  /* added when the repeated type is OP_EXTUNI */
       
     75 #define OP_ANYNL_EXTRA      340  /* added when the repeated type is OP_ANYNL */
       
     76 #define OP_HSPACE_EXTRA     360  /* added when the repeated type is OP_HSPACE or OP_NOT_HSPACE */
       
     77 #define OP_VSPACE_EXTRA     380  /* added when the repeated type is OP_VSPACE or OP_NOT_VSPACE */
       
    78 
       
    79 
       
    80 /* This table identifies those opcodes that are followed immediately by a
       
     81 character that is to be tested in some way. This makes it possible to
       
    82 centralize the loading of these characters. In the case of Type * etc, the
       
    83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
       
    84 small value. ***NOTE*** If the start of this table is modified, the two tables
       
    85 that follow must also be modified. */
       
    86 
       
     87 static const uschar coptable[] = {  /* entry = offset from the opcode to its inline character; 0 = none */
       
     88   0,                             /* End                                    */
       
     89   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
       
     90   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
       
     91   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
       
     92   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
       
     93   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
       
     94   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
       
     95   1,                             /* Char                                   */
       
     96   1,                             /* Charnc                                 */
       
     97   1,                             /* not                                    */
       
     98   /* Positive single-char repeats                                          */
       
     99   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
       
    100   3, 3, 3,                       /* upto, minupto, exact                   */
       
    101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
       
    102   /* Negative single-char repeats - only for chars < 256                   */
       
    103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
       
    104   3, 3, 3,                       /* NOT upto, minupto, exact               */
       
    105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
       
    106   /* Positive type repeats                                                 */
       
    107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
       
    108   3, 3, 3,                       /* Type upto, minupto, exact              */
       
    109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
       
    110   /* Character class & ref repeats                                         */
       
    111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
       
    112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
       
    113   0,                             /* CLASS                                  */
       
    114   0,                             /* NCLASS                                 */
       
    115   0,                             /* XCLASS - variable length               */
       
    116   0,                             /* REF                                    */
       
    117   0,                             /* RECURSE                                */
       
    118   0,                             /* CALLOUT                                */
       
    119   0,                             /* Alt                                    */
       
    120   0,                             /* Ket                                    */
       
    121   0,                             /* KetRmax                                */
       
    122   0,                             /* KetRmin                                */
       
    123   0,                             /* Assert                                 */
       
    124   0,                             /* Assert not                             */
       
    125   0,                             /* Assert behind                          */
       
    126   0,                             /* Assert behind not                      */
       
    127   0,                             /* Reverse                                */
       
    128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
       
    129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
       
    130   0,                             /* CREF                                   */
       
    131   0,                             /* RREF                                   */
       
    132   0,                             /* DEF                                    */
       
    133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
       
    134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
       
    135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
       
    136 };
       
   137 
       
   138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
       
   139 and \w */
       
   140 
       
    141 static const uschar toptable1[] = {  /* NOTE(review): presumably the ctype bit to test per opcode - confirm at the use site */
       
    142   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
       
    143   ctype_digit, ctype_digit,       /* \D, \d */
       
    144   ctype_space, ctype_space,       /* \S, \s */
       
    145   ctype_word,  ctype_word,        /* \W, \w */
       
    146   0, 0                            /* OP_ANY, OP_ALLANY */
       
    147 };
       
   148 
       
    149 static const uschar toptable2[] = {  /* NOTE(review): presumably flips the test for the negated types - confirm at the use site */
       
    150   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
       
    151   ctype_digit, 0,                 /* \D, \d */
       
    152   ctype_space, 0,                 /* \S, \s */
       
    153   ctype_word,  0,                 /* \W, \w */
       
    154   1, 1                            /* OP_ANY, OP_ALLANY */
       
    155 };
       
   156 
       
   157 
       
   158 /* Structure for holding data about a particular state, which is in effect the
       
   159 current data for an active path through the match tree. It must consist
       
   160 entirely of ints because the working vector we are passed, and which we put
       
   161 these structures in, is a vector of ints. */
       
   162 
       
    163 typedef struct stateblock {
       
    164   int offset;                     /* Opcode offset from start_code; negative = state is held off */
       
    165   int count;                      /* Count for repeats */
       
    166   int ims;                        /* ims flag bits in force for this state */
       
    167   int data;                       /* Extra data for some states, e.g. characters still to skip */
       
    168 } stateblock;
       
   169 
       
    170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))  /* ints one state occupies; used to turn wscount into a state count */
       
   171 
       
   172 
       
   173 #ifdef DEBUG
       
   174 /*************************************************
       
   175 *             Print character string             *
       
   176 *************************************************/
       
   177 
       
   178 /* Character string printing function for debugging.
       
   179 
       
   180 Arguments:
       
   181   p            points to string
       
   182   length       number of bytes
       
   183   f            where to print
       
   184 
       
   185 Returns:       nothing
       
   186 */
       
   187 
       
   188 static void
       
   189 pchars(unsigned char *p, int length, FILE *f)
       
   190 {
       
   191 int c;
       
   192 while (length-- > 0)
       
   193   {
       
   194   if (isprint(c = *(p++)))
       
   195     fprintf(f, "%c", c);
       
   196   else
       
   197     fprintf(f, "\\x%02x", c);
       
   198   }
       
   199 }
       
   200 #endif
       
   201 
       
   202 
       
   203 
       
   204 /*************************************************
       
   205 *    Execute a Regular Expression - DFA engine   *
       
   206 *************************************************/
       
   207 
       
   208 /* This internal function applies a compiled pattern to a subject string,
       
   209 starting at a given point, using a DFA engine. This function is called from the
       
   210 external one, possibly multiple times if the pattern is not anchored. The
       
   211 function calls itself recursively for some kinds of subpattern.
       
   212 
       
   213 Arguments:
       
   214   md                the match_data block with fixed information
       
   215   this_start_code   the opening bracket of this subexpression's code
       
   216   current_subject   where we currently are in the subject string
       
   217   start_offset      start offset in the subject string
       
   218   offsets           vector to contain the matching string offsets
       
   219   offsetcount       size of same
       
   220   workspace         vector of workspace
       
   221   wscount           size of same
       
   222   ims               the current ims flags
       
   223   rlevel            function call recursion level
       
   224   recursing         regex recursive call level
       
   225 
       
   226 Returns:            > 0 => number of match offset pairs placed in offsets
       
   227                     = 0 => offsets overflowed; longest matches are present
       
   228                      -1 => failed to match
       
   229                    < -1 => some kind of unexpected problem
       
   230 
       
   231 The following macros are used for adding states to the two state vectors (one
       
   232 for the current character, one for the following character). */
       
   233 
       
    234 #define ADD_ACTIVE(x,y) /* append a state (offset x, count y) to the active list */ \
       
    235   if (active_count++ < wscount) /* counter is bumped even when the list is full */ \
       
    236     { \
       
    237     next_active_state->offset = (x); \
       
    238     next_active_state->count  = (y); \
       
    239     next_active_state->ims    = ims; /* ims comes from the expansion site */ \
       
    240     next_active_state++; /* the data field is not written by this variant */ \
       
    241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
       
    242     } \
       
    243   else return PCRE_ERROR_DFA_WSSIZE /* no room left: the enclosing function returns */
       
   244 
       
    245 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
       
    246   if (active_count++ < wscount) /* counter is bumped even when the list is full */ \
       
    247     { \
       
    248     next_active_state->offset = (x); \
       
    249     next_active_state->count  = (y); \
       
    250     next_active_state->ims    = ims; /* ims comes from the expansion site */ \
       
    251     next_active_state->data   = (z); \
       
    252     next_active_state++; \
       
    253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
       
    254     } \
       
    255   else return PCRE_ERROR_DFA_WSSIZE /* no room left: the enclosing function returns */
       
   256 
       
    257 #define ADD_NEW(x,y) /* append a state (offset x, count y) to the list for the following character */ \
       
    258   if (new_count++ < wscount) /* counter is bumped even when the list is full */ \
       
    259     { \
       
    260     next_new_state->offset = (x); \
       
    261     next_new_state->count  = (y); \
       
    262     next_new_state->ims    = ims; /* ims comes from the expansion site */ \
       
    263     next_new_state++; /* the data field is not written by this variant */ \
       
    264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
       
    265     } \
       
    266   else return PCRE_ERROR_DFA_WSSIZE /* no room left: the enclosing function returns */
       
   267 
       
    268 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
       
    269   if (new_count++ < wscount) /* counter is bumped even when the list is full */ \
       
    270     { \
       
    271     next_new_state->offset = (x); \
       
    272     next_new_state->count  = (y); \
       
    273     next_new_state->ims    = ims; /* ims comes from the expansion site */ \
       
    274     next_new_state->data   = (z); \
       
    275     next_new_state++; \
       
    276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
       
    277     } \
       
    278   else return PCRE_ERROR_DFA_WSSIZE /* no room left: the enclosing function returns */
       
   279 
       
   280 /* And now, here is the code */
       
   281 
       
   282 static int
       
   283 internal_dfa_exec(
       
   284   dfa_match_data *md,
       
   285   const uschar *this_start_code,
       
   286   const uschar *current_subject,
       
   287   int start_offset,
       
   288   int *offsets,
       
   289   int offsetcount,
       
   290   int *workspace,
       
   291   int wscount,
       
   292   int ims,
       
   293   int  rlevel,
       
   294   int  recursing)
       
   295 {
       
   296 stateblock *active_states, *new_states, *temp_states;
       
   297 stateblock *next_active_state, *next_new_state;
       
   298 
       
   299 const uschar *ctypes, *lcc, *fcc;
       
   300 const uschar *ptr;
       
   301 const uschar *end_code, *first_op;
       
   302 
       
   303 int active_count, new_count, match_count;
       
   304 
       
   305 /* Some fields in the md block are frequently referenced, so we load them into
       
   306 independent variables in the hope that this will perform better. */
       
   307 
       
   308 const uschar *start_subject = md->start_subject;
       
   309 const uschar *end_subject = md->end_subject;
       
   310 const uschar *start_code = md->start_code;
       
   311 
       
   312 #ifdef SUPPORT_UTF8
       
   313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
       
   314 #else
       
   315 BOOL utf8 = FALSE;
       
   316 #endif
       
   317 
       
   318 rlevel++;
       
   319 offsetcount &= (-2);
       
   320 
       
   321 wscount -= 2;
       
   322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
       
   323           (2 * INTS_PER_STATEBLOCK);
       
   324 
       
   325 DPRINTF(("\n%.*s---------------------\n"
       
   326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
       
   327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
       
   328 
       
   329 ctypes = md->tables + ctypes_offset;
       
   330 lcc = md->tables + lcc_offset;
       
   331 fcc = md->tables + fcc_offset;
       
   332 
       
   333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
       
   334 
       
   335 active_states = (stateblock *)(workspace + 2);
       
   336 next_new_state = new_states = active_states + wscount;
       
   337 new_count = 0;
       
   338 
       
   339 first_op = this_start_code + 1 + LINK_SIZE +
       
   340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
       
   341 
       
   342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
       
   343 the alternative states onto the list, and find out where the end is. This
       
    344 makes it possible to use this function recursively, when we want to stop at a
       
   345 matching internal ket rather than at the end.
       
   346 
       
   347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
       
   348 a backward assertion. In that case, we have to find out the maximum amount to
       
   349 move back, and set up each alternative appropriately. */
       
   350 
       
   351 if (*first_op == OP_REVERSE)
       
   352   {
       
   353   int max_back = 0;
       
   354   int gone_back;
       
   355 
       
   356   end_code = this_start_code;
       
   357   do
       
   358     {
       
   359     int back = GET(end_code, 2+LINK_SIZE);
       
   360     if (back > max_back) max_back = back;
       
   361     end_code += GET(end_code, 1);
       
   362     }
       
   363   while (*end_code == OP_ALT);
       
   364 
       
   365   /* If we can't go back the amount required for the longest lookbehind
       
   366   pattern, go back as far as we can; some alternatives may still be viable. */
       
   367 
       
   368 #ifdef SUPPORT_UTF8
       
   369   /* In character mode we have to step back character by character */
       
   370 
       
   371   if (utf8)
       
   372     {
       
   373     for (gone_back = 0; gone_back < max_back; gone_back++)
       
   374       {
       
   375       if (current_subject <= start_subject) break;
       
   376       current_subject--;
       
   377       while (current_subject > start_subject &&
       
   378              (*current_subject & 0xc0) == 0x80)
       
   379         current_subject--;
       
   380       }
       
   381     }
       
   382   else
       
   383 #endif
       
   384 
       
   385   /* In byte-mode we can do this quickly. */
       
   386 
       
   387     {
       
   388     gone_back = (current_subject - max_back < start_subject)?
       
   389       current_subject - start_subject : max_back;
       
   390     current_subject -= gone_back;
       
   391     }
       
   392 
       
   393   /* Now we can process the individual branches. */
       
   394 
       
   395   end_code = this_start_code;
       
   396   do
       
   397     {
       
   398     int back = GET(end_code, 2+LINK_SIZE);
       
   399     if (back <= gone_back)
       
   400       {
       
   401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
       
   402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
       
   403       }
       
   404     end_code += GET(end_code, 1);
       
   405     }
       
   406   while (*end_code == OP_ALT);
       
   407  }
       
   408 
       
   409 /* This is the code for a "normal" subpattern (not a backward assertion). The
       
   410 start of a whole pattern is always one of these. If we are at the top level,
       
   411 we may be asked to restart matching from the same point that we reached for a
       
   412 previous partial match. We still have to scan through the top-level branches to
       
   413 find the end state. */
       
   414 
       
   415 else
       
   416   {
       
   417   end_code = this_start_code;
       
   418 
       
   419   /* Restarting */
       
   420 
       
   421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
       
   422     {
       
   423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
       
   424     new_count = workspace[1];
       
   425     if (!workspace[0])
       
   426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
       
   427     }
       
   428 
       
   429   /* Not restarting */
       
   430 
       
   431   else
       
   432     {
       
   433     int length = 1 + LINK_SIZE +
       
   434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
       
   435     do
       
   436       {
       
   437       ADD_NEW(end_code - start_code + length, 0);
       
   438       end_code += GET(end_code, 1);
       
   439       length = 1 + LINK_SIZE;
       
   440       }
       
   441     while (*end_code == OP_ALT);
       
   442     }
       
   443   }
       
   444 
       
   445 workspace[0] = 0;    /* Bit indicating which vector is current */
       
   446 
       
   447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
       
   448 
       
   449 /* Loop for scanning the subject */
       
   450 
       
   451 ptr = current_subject;
       
   452 for (;;)
       
   453   {
       
   454   int i, j;
       
   455   int clen, dlen;
       
   456   unsigned int c, d;
       
   457 
       
   458   /* Make the new state list into the active state list and empty the
       
   459   new state list. */
       
   460 
       
   461   temp_states = active_states;
       
   462   active_states = new_states;
       
   463   new_states = temp_states;
       
   464   active_count = new_count;
       
   465   new_count = 0;
       
   466 
       
   467   workspace[0] ^= 1;              /* Remember for the restarting feature */
       
   468   workspace[1] = active_count;
       
   469 
       
   470 #ifdef DEBUG
       
   471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
       
   472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
       
   473   printf("\"\n");
       
   474 
       
   475   printf("%.*sActive states: ", rlevel*2-2, SP);
       
   476   for (i = 0; i < active_count; i++)
       
   477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
       
   478   printf("\n");
       
   479 #endif
       
   480 
       
   481   /* Set the pointers for adding new states */
       
   482 
       
   483   next_active_state = active_states + active_count;
       
   484   next_new_state = new_states;
       
   485 
       
   486   /* Load the current character from the subject outside the loop, as many
       
   487   different states may want to look at it, and we assume that at least one
       
   488   will. */
       
   489 
       
   490   if (ptr < end_subject)
       
   491     {
       
   492     clen = 1;        /* Number of bytes in the character */
       
   493 #ifdef SUPPORT_UTF8
       
   494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
       
   495 #endif  /* SUPPORT_UTF8 */
       
   496     c = *ptr;
       
   497     }
       
   498   else
       
   499     {
       
   500     clen = 0;        /* This indicates the end of the subject */
       
   501     c = NOTACHAR;    /* This value should never actually be used */
       
   502     }
       
   503 
       
   504   /* Scan up the active states and act on each one. The result of an action
       
   505   may be to add more states to the currently active list (e.g. on hitting a
       
   506   parenthesis) or it may be to put states on the new list, for considering
       
   507   when we move the character pointer on. */
       
   508 
       
   509   for (i = 0; i < active_count; i++)
       
   510     {
       
   511     stateblock *current_state = active_states + i;
       
   512     const uschar *code;
       
   513     int state_offset = current_state->offset;
       
   514     int count, codevalue;
       
   515 
       
   516 #ifdef DEBUG
       
   517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
       
   518     if (clen == 0) printf("EOL\n");
       
   519       else if (c > 32 && c < 127) printf("'%c'\n", c);
       
   520         else printf("0x%02x\n", c);
       
   521 #endif
       
   522 
       
    523     /* This variable is referred to implicitly in the ADD_xxx macros. */
       
   524 
       
   525     ims = current_state->ims;
       
   526 
       
   527     /* A negative offset is a special case meaning "hold off going to this
       
   528     (negated) state until the number of characters in the data field have
       
   529     been skipped". */
       
   530 
       
   531     if (state_offset < 0)
       
   532       {
       
   533       if (current_state->data > 0)
       
   534         {
       
   535         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
       
   536         ADD_NEW_DATA(state_offset, current_state->count,
       
   537           current_state->data - 1);
       
   538         continue;
       
   539         }
       
   540       else
       
   541         {
       
   542         current_state->offset = state_offset = -state_offset;
       
   543         }
       
   544       }
       
   545 
       
   546     /* Check for a duplicate state with the same count, and skip if found. */
       
   547 
       
   548     for (j = 0; j < i; j++)
       
   549       {
       
   550       if (active_states[j].offset == state_offset &&
       
   551           active_states[j].count == current_state->count)
       
   552         {
       
   553         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
       
   554         goto NEXT_ACTIVE_STATE;
       
   555         }
       
   556       }
       
   557 
       
   558     /* The state offset is the offset to the opcode */
       
   559 
       
   560     code = start_code + state_offset;
       
   561     codevalue = *code;
       
   562 
       
   563     /* If this opcode is followed by an inline character, load it. It is
       
   564     tempting to test for the presence of a subject character here, but that
       
   565     is wrong, because sometimes zero repetitions of the subject are
       
   566     permitted.
       
   567 
       
   568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
       
   569     argument that is not a data character - but is always one byte long. We
       
   570     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
       
   571     this case. To keep the other cases fast, convert these ones to new opcodes.
       
   572     */
       
   573 
       
   574     if (coptable[codevalue] > 0)
       
   575       {
       
   576       dlen = 1;
       
   577 #ifdef SUPPORT_UTF8
       
   578       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
       
   579 #endif  /* SUPPORT_UTF8 */
       
   580       d = code[coptable[codevalue]];
       
   581       if (codevalue >= OP_TYPESTAR)
       
   582         {
       
   583         switch(d)
       
   584           {
       
   585           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
       
   586           case OP_NOTPROP:
       
   587           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
       
   588           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
       
   589           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
       
   590           case OP_NOT_HSPACE:
       
   591           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
       
   592           case OP_NOT_VSPACE:
       
   593           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
       
   594           default: break;
       
   595           }
       
   596         }
       
   597       }
       
   598     else
       
   599       {
       
   600       dlen = 0;         /* Not strictly necessary, but compilers moan */
       
   601       d = NOTACHAR;     /* if these variables are not set. */
       
   602       }
       
   603 
       
   604 
       
   605     /* Now process the individual opcodes */
       
   606 
       
   607     switch (codevalue)
       
   608       {
       
   609 
       
   610 /* ========================================================================== */
       
   611       /* Reached a closing bracket. If not at the end of the pattern, carry
       
   612       on with the next opcode. Otherwise, unless we have an empty string and
       
   613       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
       
   614       matches so we always have the longest first. */
       
   615 
       
   616       case OP_KET:
       
   617       case OP_KETRMIN:
       
   618       case OP_KETRMAX:
       
   619       if (code != end_code)
       
   620         {
       
   621         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
       
   622         if (codevalue != OP_KET)
       
   623           {
       
   624           ADD_ACTIVE(state_offset - GET(code, 1), 0);
       
   625           }
       
   626         }
       
   627       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
       
   628         {
       
   629         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
       
   630           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
       
   631             match_count = 0;
       
   632         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
       
   633         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
       
   634         if (offsetcount >= 2)
       
   635           {
       
   636           offsets[0] = current_subject - start_subject;
       
   637           offsets[1] = ptr - start_subject;
       
   638           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
       
   639             offsets[1] - offsets[0], current_subject));
       
   640           }
       
   641         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
       
   642           {
       
   643           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
       
   644             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
       
   645             match_count, rlevel*2-2, SP));
       
   646           return match_count;
       
   647           }
       
   648         }
       
   649       break;
       
   650 
       
   651 /* ========================================================================== */
       
   652       /* These opcodes add to the current list of states without looking
       
   653       at the current character. */
       
   654 
       
   655       /*-----------------------------------------------------------------*/
       
   656       case OP_ALT:
       
   657       do { code += GET(code, 1); } while (*code == OP_ALT);
       
   658       ADD_ACTIVE(code - start_code, 0);
       
   659       break;
       
   660 
       
   661       /*-----------------------------------------------------------------*/
       
   662       case OP_BRA:
       
   663       case OP_SBRA:
       
   664       do
       
   665         {
       
   666         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
       
   667         code += GET(code, 1);
       
   668         }
       
   669       while (*code == OP_ALT);
       
   670       break;
       
   671 
       
   672       /*-----------------------------------------------------------------*/
       
   673       case OP_CBRA:
       
   674       case OP_SCBRA:
       
   675       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
       
   676       code += GET(code, 1);
       
   677       while (*code == OP_ALT)
       
   678         {
       
   679         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
       
   680         code += GET(code, 1);
       
   681         }
       
   682       break;
       
   683 
       
   684       /*-----------------------------------------------------------------*/
       
   685       case OP_BRAZERO:
       
   686       case OP_BRAMINZERO:
       
   687       ADD_ACTIVE(state_offset + 1, 0);
       
   688       code += 1 + GET(code, 2);
       
   689       while (*code == OP_ALT) code += GET(code, 1);
       
   690       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
       
   691       break;
       
   692 
       
   693       /*-----------------------------------------------------------------*/
       
   694       case OP_SKIPZERO:
       
   695       code += 1 + GET(code, 2);
       
   696       while (*code == OP_ALT) code += GET(code, 1);
       
   697       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
       
   698       break;
       
   699 
       
   700       /*-----------------------------------------------------------------*/
       
   701       case OP_CIRC:
       
   702       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
       
   703           ((ims & PCRE_MULTILINE) != 0 &&
       
   704             ptr != end_subject &&
       
   705             WAS_NEWLINE(ptr)))
       
   706         { ADD_ACTIVE(state_offset + 1, 0); }
       
   707       break;
       
   708 
       
   709       /*-----------------------------------------------------------------*/
       
   710       case OP_EOD:
       
   711       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
       
   712       break;
       
   713 
       
   714       /*-----------------------------------------------------------------*/
       
   715       case OP_OPT:
       
   716       ims = code[1];
       
   717       ADD_ACTIVE(state_offset + 2, 0);
       
   718       break;
       
   719 
       
   720       /*-----------------------------------------------------------------*/
       
   721       case OP_SOD:
       
   722       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
       
   723       break;
       
   724 
       
   725       /*-----------------------------------------------------------------*/
       
   726       case OP_SOM:
       
   727       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
       
   728       break;
       
   729 
       
   730 
       
   731 /* ========================================================================== */
       
   732       /* These opcodes inspect the next subject character, and sometimes
       
   733       the previous one as well, but do not have an argument. The variable
       
   734       clen contains the length of the current character and is zero if we are
       
   735       at the end of the subject. */
       
   736 
       
   737       /*-----------------------------------------------------------------*/
       
   738       case OP_ANY:
       
   739       if (clen > 0 && !IS_NEWLINE(ptr))
       
   740         { ADD_NEW(state_offset + 1, 0); }
       
   741       break;
       
   742 
       
   743       /*-----------------------------------------------------------------*/
       
   744       case OP_ALLANY:
       
   745       if (clen > 0)
       
   746         { ADD_NEW(state_offset + 1, 0); }
       
   747       break;
       
   748 
       
   749       /*-----------------------------------------------------------------*/
       
   750       case OP_EODN:
       
   751       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
       
   752         { ADD_ACTIVE(state_offset + 1, 0); }
       
   753       break;
       
   754 
       
   755       /*-----------------------------------------------------------------*/
       
   756       case OP_DOLL:
       
   757       if ((md->moptions & PCRE_NOTEOL) == 0)
       
   758         {
       
   759         if (clen == 0 ||
       
   760             (IS_NEWLINE(ptr) &&
       
   761                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
       
   762             ))
       
   763           { ADD_ACTIVE(state_offset + 1, 0); }
       
   764         }
       
   765       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
       
   766         { ADD_ACTIVE(state_offset + 1, 0); }
       
   767       break;
       
   768 
       
   769       /*-----------------------------------------------------------------*/
       
   770 
       
   771       case OP_DIGIT:
       
   772       case OP_WHITESPACE:
       
   773       case OP_WORDCHAR:
       
   774       if (clen > 0 && c < 256 &&
       
   775             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
       
   776         { ADD_NEW(state_offset + 1, 0); }
       
   777       break;
       
   778 
       
   779       /*-----------------------------------------------------------------*/
       
   780       case OP_NOT_DIGIT:
       
   781       case OP_NOT_WHITESPACE:
       
   782       case OP_NOT_WORDCHAR:
       
   783       if (clen > 0 && (c >= 256 ||
       
   784             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
       
   785         { ADD_NEW(state_offset + 1, 0); }
       
   786       break;
       
   787 
       
   788       /*-----------------------------------------------------------------*/
       
   789       case OP_WORD_BOUNDARY:
       
   790       case OP_NOT_WORD_BOUNDARY:
       
   791         {
       
   792         int left_word, right_word;
       
   793 
       
   794         if (ptr > start_subject)
       
   795           {
       
   796           const uschar *temp = ptr - 1;
       
   797 #ifdef SUPPORT_UTF8
       
   798           if (utf8) BACKCHAR(temp);
       
   799 #endif
       
   800           GETCHARTEST(d, temp);
       
   801           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
       
   802           }
       
   803         else left_word = 0;
       
   804 
       
   805         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
       
   806           else right_word = 0;
       
   807 
       
   808         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
       
   809           { ADD_ACTIVE(state_offset + 1, 0); }
       
   810         }
       
   811       break;
       
   812 
       
   813 
       
   814       /*-----------------------------------------------------------------*/
       
   815       /* Check the next character by Unicode property. We will get here only
       
   816       if the support is in the binary; otherwise a compile-time error occurs.
       
   817       */
       
   818 
       
   819 #ifdef SUPPORT_UCP
       
   820       case OP_PROP:
       
   821       case OP_NOTPROP:
       
   822       if (clen > 0)
       
   823         {
       
   824         BOOL OK;
       
   825         const ucd_record * prop = GET_UCD(c);
       
   826         switch(code[1])
       
   827           {
       
   828           case PT_ANY:
       
   829           OK = TRUE;
       
   830           break;
       
   831 
       
   832           case PT_LAMP:
       
   833           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
       
   834           break;
       
   835 
       
   836           case PT_GC:
       
   837           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
       
   838           break;
       
   839 
       
   840           case PT_PC:
       
   841           OK = prop->chartype == code[2];
       
   842           break;
       
   843 
       
   844           case PT_SC:
       
   845           OK = prop->script == code[2];
       
   846           break;
       
   847 
       
   848           /* Should never occur, but keep compilers from grumbling. */
       
   849 
       
   850           default:
       
   851           OK = codevalue != OP_PROP;
       
   852           break;
       
   853           }
       
   854 
       
   855         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
       
   856         }
       
   857       break;
       
   858 #endif
       
   859 
       
   860 
       
   861 
       
   862 /* ========================================================================== */
       
   863       /* These opcodes likewise inspect the subject character, but have an
       
   864       argument that is not a data character. It is one of these opcodes:
       
   865       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
       
   866       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
       
   867 
       
   868       case OP_TYPEPLUS:
       
   869       case OP_TYPEMINPLUS:
       
   870       case OP_TYPEPOSPLUS:
       
   871       count = current_state->count;  /* Already matched */
       
   872       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       
   873       if (clen > 0)
       
   874         {
       
   875         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
       
   876             (c < 256 &&
       
   877               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
       
   878               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
       
   879           {
       
   880           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
       
   881             {
       
   882             active_count--;            /* Remove non-match possibility */
       
   883             next_active_state--;
       
   884             }
       
   885           count++;
       
   886           ADD_NEW(state_offset, count);
       
   887           }
       
   888         }
       
   889       break;
       
   890 
       
   891       /*-----------------------------------------------------------------*/
       
   892       case OP_TYPEQUERY:
       
   893       case OP_TYPEMINQUERY:
       
   894       case OP_TYPEPOSQUERY:
       
   895       ADD_ACTIVE(state_offset + 2, 0);
       
   896       if (clen > 0)
       
   897         {
       
   898         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
       
   899             (c < 256 &&
       
   900               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
       
   901               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
       
   902           {
       
   903           if (codevalue == OP_TYPEPOSQUERY)
       
   904             {
       
   905             active_count--;            /* Remove non-match possibility */
       
   906             next_active_state--;
       
   907             }
       
   908           ADD_NEW(state_offset + 2, 0);
       
   909           }
       
   910         }
       
   911       break;
       
   912 
       
   913       /*-----------------------------------------------------------------*/
       
   914       case OP_TYPESTAR:
       
   915       case OP_TYPEMINSTAR:
       
   916       case OP_TYPEPOSSTAR:
       
   917       ADD_ACTIVE(state_offset + 2, 0);
       
   918       if (clen > 0)
       
   919         {
       
   920         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
       
   921             (c < 256 &&
       
   922               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
       
   923               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
       
   924           {
       
   925           if (codevalue == OP_TYPEPOSSTAR)
       
   926             {
       
   927             active_count--;            /* Remove non-match possibility */
       
   928             next_active_state--;
       
   929             }
       
   930           ADD_NEW(state_offset, 0);
       
   931           }
       
   932         }
       
   933       break;
       
   934 
       
   935       /*-----------------------------------------------------------------*/
       
   936       case OP_TYPEEXACT:
       
   937       count = current_state->count;  /* Number already matched */
       
   938       if (clen > 0)
       
   939         {
       
   940         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
       
   941             (c < 256 &&
       
   942               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
       
   943               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
       
   944           {
       
   945           if (++count >= GET2(code, 1))
       
   946             { ADD_NEW(state_offset + 4, 0); }
       
   947           else
       
   948             { ADD_NEW(state_offset, count); }
       
   949           }
       
   950         }
       
   951       break;
       
   952 
       
   953       /*-----------------------------------------------------------------*/
       
   954       case OP_TYPEUPTO:
       
   955       case OP_TYPEMINUPTO:
       
   956       case OP_TYPEPOSUPTO:
       
   957       ADD_ACTIVE(state_offset + 4, 0);
       
   958       count = current_state->count;  /* Number already matched */
       
   959       if (clen > 0)
       
   960         {
       
   961         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
       
   962             (c < 256 &&
       
   963               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
       
   964               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
       
   965           {
       
   966           if (codevalue == OP_TYPEPOSUPTO)
       
   967             {
       
   968             active_count--;           /* Remove non-match possibility */
       
   969             next_active_state--;
       
   970             }
       
   971           if (++count >= GET2(code, 1))
       
   972             { ADD_NEW(state_offset + 4, 0); }
       
   973           else
       
   974             { ADD_NEW(state_offset, count); }
       
   975           }
       
   976         }
       
   977       break;
       
   978 
       
   979 /* ========================================================================== */
       
   980       /* These are virtual opcodes that are used when something like
       
   981       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
       
   982       or vertical space type (e.g. OP_HSPACE, OP_VSPACE) as its argument. This
       
   983       keeps the code above fast for the other cases. The argument is in d. */
       
   984 
       
   985 #ifdef SUPPORT_UCP
       
   986       case OP_PROP_EXTRA + OP_TYPEPLUS:
       
   987       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
       
   988       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
       
   989       count = current_state->count;           /* Already matched */
       
   990       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
       
   991       if (clen > 0)
       
   992         {
       
   993         BOOL OK;
       
   994         const ucd_record * prop = GET_UCD(c);
       
   995         switch(code[2])
       
   996           {
       
   997           case PT_ANY:
       
   998           OK = TRUE;
       
   999           break;
       
  1000 
       
  1001           case PT_LAMP:
       
  1002           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
       
  1003           break;
       
  1004 
       
  1005           case PT_GC:
       
  1006           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
       
  1007           break;
       
  1008 
       
  1009           case PT_PC:
       
  1010           OK = prop->chartype == code[3];
       
  1011           break;
       
  1012 
       
  1013           case PT_SC:
       
  1014           OK = prop->script == code[3];
       
  1015           break;
       
  1016 
       
  1017           /* Should never occur, but keep compilers from grumbling. */
       
  1018 
       
  1019           default:
       
  1020           OK = codevalue != OP_PROP;
       
  1021           break;
       
  1022           }
       
  1023 
       
  1024         if (OK == (d == OP_PROP))
       
  1025           {
       
  1026           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
       
  1027             {
       
  1028             active_count--;           /* Remove non-match possibility */
       
  1029             next_active_state--;
       
  1030             }
       
  1031           count++;
       
  1032           ADD_NEW(state_offset, count);
       
  1033           }
       
  1034         }
       
  1035       break;
       
  1036 
       
  1037       /*-----------------------------------------------------------------*/
       
  1038       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
       
  1039       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
       
  1040       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
       
  1041       count = current_state->count;  /* Already matched */
       
  1042       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       
  1043       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
       
  1044         {
       
  1045         const uschar *nptr = ptr + clen;
       
  1046         int ncount = 0;
       
  1047         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
       
  1048           {
       
  1049           active_count--;           /* Remove non-match possibility */
       
  1050           next_active_state--;
       
  1051           }
       
  1052         while (nptr < end_subject)
       
  1053           {
       
  1054           int nd;
       
  1055           int ndlen = 1;
       
  1056           GETCHARLEN(nd, nptr, ndlen);
       
  1057           if (UCD_CATEGORY(nd) != ucp_M) break;
       
  1058           ncount++;
       
  1059           nptr += ndlen;
       
  1060           }
       
  1061         count++;
       
  1062         ADD_NEW_DATA(-state_offset, count, ncount);
       
  1063         }
       
  1064       break;
       
  1065 #endif
       
  1066 
       
  1067       /*-----------------------------------------------------------------*/
       
  1068       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
       
  1069       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
       
  1070       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
       
  1071       count = current_state->count;  /* Already matched */
       
  1072       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       
  1073       if (clen > 0)
       
  1074         {
       
  1075         int ncount = 0;
       
  1076         switch (c)
       
  1077           {
       
  1078           case 0x000b:
       
  1079           case 0x000c:
       
  1080           case 0x0085:
       
  1081           case 0x2028:
       
  1082           case 0x2029:
       
  1083           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
       
  1084           goto ANYNL01;
       
  1085 
       
  1086           case 0x000d:
       
  1087           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
       
  1088           /* Fall through */
       
  1089 
       
  1090           ANYNL01:
       
  1091           case 0x000a:
       
  1092           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
       
  1093             {
       
  1094             active_count--;           /* Remove non-match possibility */
       
  1095             next_active_state--;
       
  1096             }
       
  1097           count++;
       
  1098           ADD_NEW_DATA(-state_offset, count, ncount);
       
  1099           break;
       
  1100 
       
  1101           default:
       
  1102           break;
       
  1103           }
       
  1104         }
       
  1105       break;
       
  1106 
       
  1107       /*-----------------------------------------------------------------*/
       
  1108       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
       
  1109       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
       
  1110       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
       
  1111       count = current_state->count;  /* Already matched */
       
  1112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       
  1113       if (clen > 0)
       
  1114         {
       
  1115         BOOL OK;
       
  1116         switch (c)
       
  1117           {
       
  1118           case 0x000a:
       
  1119           case 0x000b:
       
  1120           case 0x000c:
       
  1121           case 0x000d:
       
  1122           case 0x0085:
       
  1123           case 0x2028:
       
  1124           case 0x2029:
       
  1125           OK = TRUE;
       
  1126           break;
       
  1127 
       
  1128           default:
       
  1129           OK = FALSE;
       
  1130           break;
       
  1131           }
       
  1132 
       
  1133         if (OK == (d == OP_VSPACE))
       
  1134           {
       
  1135           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
       
  1136             {
       
  1137             active_count--;           /* Remove non-match possibility */
       
  1138             next_active_state--;
       
  1139             }
       
  1140           count++;
       
  1141           ADD_NEW_DATA(-state_offset, count, 0);
       
  1142           }
       
  1143         }
       
  1144       break;
       
  1145 
       
  1146       /*-----------------------------------------------------------------*/
       
  1147       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
       
  1148       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
       
  1149       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
       
  1150       count = current_state->count;  /* Already matched */
       
  1151       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
       
  1152       if (clen > 0)
       
  1153         {
       
  1154         BOOL OK;
       
  1155         switch (c)
       
  1156           {
       
  1157           case 0x09:      /* HT */
       
  1158           case 0x20:      /* SPACE */
       
  1159           case 0xa0:      /* NBSP */
       
  1160           case 0x1680:    /* OGHAM SPACE MARK */
       
  1161           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
       
  1162           case 0x2000:    /* EN QUAD */
       
  1163           case 0x2001:    /* EM QUAD */
       
  1164           case 0x2002:    /* EN SPACE */
       
  1165           case 0x2003:    /* EM SPACE */
       
  1166           case 0x2004:    /* THREE-PER-EM SPACE */
       
  1167           case 0x2005:    /* FOUR-PER-EM SPACE */
       
  1168           case 0x2006:    /* SIX-PER-EM SPACE */
       
  1169           case 0x2007:    /* FIGURE SPACE */
       
  1170           case 0x2008:    /* PUNCTUATION SPACE */
       
  1171           case 0x2009:    /* THIN SPACE */
       
  1172           case 0x200A:    /* HAIR SPACE */
       
  1173           case 0x202f:    /* NARROW NO-BREAK SPACE */
       
  1174           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
       
  1175           case 0x3000:    /* IDEOGRAPHIC SPACE */
       
  1176           OK = TRUE;
       
  1177           break;
       
  1178 
       
  1179           default:
       
  1180           OK = FALSE;
       
  1181           break;
       
  1182           }
       
  1183 
       
  1184         if (OK == (d == OP_HSPACE))
       
  1185           {
       
  1186           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
       
  1187             {
       
  1188             active_count--;           /* Remove non-match possibility */
       
  1189             next_active_state--;
       
  1190             }
       
  1191           count++;
       
  1192           ADD_NEW_DATA(-state_offset, count, 0);
       
  1193           }
       
  1194         }
       
  1195       break;
       
  1196 
       
  1197       /*-----------------------------------------------------------------*/
       
  1198 #ifdef SUPPORT_UCP
       
  1199       case OP_PROP_EXTRA + OP_TYPEQUERY:
       
  1200       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
       
  1201       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
       
  1202       count = 4;
       
  1203       goto QS1;
       
  1204 
       
  1205       case OP_PROP_EXTRA + OP_TYPESTAR:
       
  1206       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
       
  1207       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
       
  1208       count = 0;
       
  1209 
       
  1210       QS1:
       
  1211 
       
  1212       ADD_ACTIVE(state_offset + 4, 0);
       
  1213       if (clen > 0)
       
  1214         {
       
  1215         BOOL OK;
       
  1216         const ucd_record * prop = GET_UCD(c);
       
  1217         switch(code[2])
       
  1218           {
       
  1219           case PT_ANY:
       
  1220           OK = TRUE;
       
  1221           break;
       
  1222 
       
  1223           case PT_LAMP:
       
  1224           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
       
  1225           break;
       
  1226 
       
  1227           case PT_GC:
       
  1228           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
       
  1229           break;
       
  1230 
       
  1231           case PT_PC:
       
  1232           OK = prop->chartype == code[3];
       
  1233           break;
       
  1234 
       
  1235           case PT_SC:
       
  1236           OK = prop->script == code[3];
       
  1237           break;
       
  1238 
       
  1239           /* Should never occur, but keep compilers from grumbling. */
       
  1240 
       
  1241           default:
       
  1242           OK = codevalue != OP_PROP;
       
  1243           break;
       
  1244           }
       
  1245 
       
  1246         if (OK == (d == OP_PROP))
       
  1247           {
       
  1248           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
       
  1249               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
       
  1250             {
       
  1251             active_count--;           /* Remove non-match possibility */
       
  1252             next_active_state--;
       
  1253             }
       
  1254           ADD_NEW(state_offset + count, 0);
       
  1255           }
       
  1256         }
       
  1257       break;
       
  1258 
       
  1259       /*-----------------------------------------------------------------*/
       
  1260       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
       
  1261       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
       
  1262       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
       
  1263       count = 2;
       
  1264       goto QS2;
       
  1265 
       
  1266       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
       
  1267       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
       
  1268       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
       
  1269       count = 0;
       
  1270 
       
  1271       QS2:
       
  1272 
       
  1273       ADD_ACTIVE(state_offset + 2, 0);
       
  1274       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
       
  1275         {
       
  1276         const uschar *nptr = ptr + clen;
       
  1277         int ncount = 0;
       
  1278         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
       
  1279             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
       
  1280           {
       
  1281           active_count--;           /* Remove non-match possibility */
       
  1282           next_active_state--;
       
  1283           }
       
  1284         while (nptr < end_subject)
       
  1285           {
       
  1286           int nd;
       
  1287           int ndlen = 1;
       
  1288           GETCHARLEN(nd, nptr, ndlen);
       
  1289           if (UCD_CATEGORY(nd) != ucp_M) break;
       
  1290           ncount++;
       
  1291           nptr += ndlen;
       
  1292           }
       
  1293         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
       
  1294         }
       
  1295       break;
       
  1296 #endif
       
  1297 
       
  1298       /*-----------------------------------------------------------------*/
       
  1299       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
       
  1300       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
       
  1301       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
       
  1302       count = 2;
       
  1303       goto QS3;
       
  1304 
       
  1305       case OP_ANYNL_EXTRA + OP_TYPESTAR:
       
  1306       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
       
  1307       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
       
  1308       count = 0;
       
  1309 
       
  1310       QS3:
       
  1311       ADD_ACTIVE(state_offset + 2, 0);
       
  1312       if (clen > 0)
       
  1313         {
       
  1314         int ncount = 0;
       
  1315         switch (c)
       
  1316           {
       
  1317           case 0x000b:
       
  1318           case 0x000c:
       
  1319           case 0x0085:
       
  1320           case 0x2028:
       
  1321           case 0x2029:
       
  1322           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
       
  1323           goto ANYNL02;
       
  1324 
       
  1325           case 0x000d:
       
  1326           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
       
  1327           /* Fall through */
       
  1328 
       
  1329           ANYNL02:
       
  1330           case 0x000a:
       
  1331           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
       
  1332               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
       
  1333             {
       
  1334             active_count--;           /* Remove non-match possibility */
       
  1335             next_active_state--;
       
  1336             }
       
  1337           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
       
  1338           break;
       
  1339 
       
  1340           default:
       
  1341           break;
       
  1342           }
       
  1343         }
       
  1344       break;
       
  1345 
       
  1346       /*-----------------------------------------------------------------*/
       
  1347       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
       
  1348       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
       
  1349       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
       
  1350       count = 2;
       
  1351       goto QS4;
       
  1352 
       
  1353       case OP_VSPACE_EXTRA + OP_TYPESTAR:
       
  1354       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
       
  1355       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
       
  1356       count = 0;
       
  1357 
       
  1358       QS4:
       
  1359       ADD_ACTIVE(state_offset + 2, 0);
       
  1360       if (clen > 0)
       
  1361         {
       
  1362         BOOL OK;
       
  1363         switch (c)
       
  1364           {
       
  1365           case 0x000a:
       
  1366           case 0x000b:
       
  1367           case 0x000c:
       
  1368           case 0x000d:
       
  1369           case 0x0085:
       
  1370           case 0x2028:
       
  1371           case 0x2029:
       
  1372           OK = TRUE;
       
  1373           break;
       
  1374 
       
  1375           default:
       
  1376           OK = FALSE;
       
  1377           break;
       
  1378           }
       
  1379         if (OK == (d == OP_VSPACE))
       
  1380           {
       
  1381           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
       
  1382               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
       
  1383             {
       
  1384             active_count--;           /* Remove non-match possibility */
       
  1385             next_active_state--;
       
  1386             }
       
  1387           ADD_NEW_DATA(-(state_offset + count), 0, 0);
       
  1388           }
       
  1389         }
       
  1390       break;
       
  1391 
       
  1392       /*-----------------------------------------------------------------*/
       
  1393       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
       
  1394       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
       
  1395       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
       
  1396       count = 2;
       
  1397       goto QS5;
       
  1398 
       
  1399       case OP_HSPACE_EXTRA + OP_TYPESTAR:
       
  1400       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
       
  1401       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
       
  1402       count = 0;
       
  1403 
       
  1404       QS5:
       
  1405       ADD_ACTIVE(state_offset + 2, 0);
       
  1406       if (clen > 0)
       
  1407         {
       
  1408         BOOL OK;
       
  1409         switch (c)
       
  1410           {
       
  1411           case 0x09:      /* HT */
       
  1412           case 0x20:      /* SPACE */
       
  1413           case 0xa0:      /* NBSP */
       
  1414           case 0x1680:    /* OGHAM SPACE MARK */
       
  1415           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
       
  1416           case 0x2000:    /* EN QUAD */
       
  1417           case 0x2001:    /* EM QUAD */
       
  1418           case 0x2002:    /* EN SPACE */
       
  1419           case 0x2003:    /* EM SPACE */
       
  1420           case 0x2004:    /* THREE-PER-EM SPACE */
       
  1421           case 0x2005:    /* FOUR-PER-EM SPACE */
       
  1422           case 0x2006:    /* SIX-PER-EM SPACE */
       
  1423           case 0x2007:    /* FIGURE SPACE */
       
  1424           case 0x2008:    /* PUNCTUATION SPACE */
       
  1425           case 0x2009:    /* THIN SPACE */
       
  1426           case 0x200A:    /* HAIR SPACE */
       
  1427           case 0x202f:    /* NARROW NO-BREAK SPACE */
       
  1428           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
       
  1429           case 0x3000:    /* IDEOGRAPHIC SPACE */
       
  1430           OK = TRUE;
       
  1431           break;
       
  1432 
       
  1433           default:
       
  1434           OK = FALSE;
       
  1435           break;
       
  1436           }
       
  1437 
       
  1438         if (OK == (d == OP_HSPACE))
       
  1439           {
       
  1440           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
       
  1441               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
       
  1442             {
       
  1443             active_count--;           /* Remove non-match possibility */
       
  1444             next_active_state--;
       
  1445             }
       
  1446           ADD_NEW_DATA(-(state_offset + count), 0, 0);
       
  1447           }
       
  1448         }
       
  1449       break;
       
  1450 
       
  1451       /*-----------------------------------------------------------------*/
       
  1452 #ifdef SUPPORT_UCP
       
  1453       case OP_PROP_EXTRA + OP_TYPEEXACT:
       
  1454       case OP_PROP_EXTRA + OP_TYPEUPTO:
       
  1455       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
       
  1456       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
       
  1457       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
       
  1458         { ADD_ACTIVE(state_offset + 6, 0); }
       
  1459       count = current_state->count;  /* Number already matched */
       
  1460       if (clen > 0)
       
  1461         {
       
  1462         BOOL OK;
       
  1463         const ucd_record * prop = GET_UCD(c);
       
  1464         switch(code[4])
       
  1465           {
       
  1466           case PT_ANY:
       
  1467           OK = TRUE;
       
  1468           break;
       
  1469 
       
  1470           case PT_LAMP:
       
  1471           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
       
  1472           break;
       
  1473 
       
  1474           case PT_GC:
       
  1475           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
       
  1476           break;
       
  1477 
       
  1478           case PT_PC:
       
  1479           OK = prop->chartype == code[5];
       
  1480           break;
       
  1481 
       
  1482           case PT_SC:
       
  1483           OK = prop->script == code[5];
       
  1484           break;
       
  1485 
       
  1486           /* Should never occur, but keep compilers from grumbling. */
       
  1487 
       
  1488           default:
       
  1489           OK = codevalue != OP_PROP;
       
  1490           break;
       
  1491           }
       
  1492 
       
  1493         if (OK == (d == OP_PROP))
       
  1494           {
       
  1495           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
       
  1496             {
       
  1497             active_count--;           /* Remove non-match possibility */
       
  1498             next_active_state--;
       
  1499             }
       
  1500           if (++count >= GET2(code, 1))
       
  1501             { ADD_NEW(state_offset + 6, 0); }
       
  1502           else
       
  1503             { ADD_NEW(state_offset, count); }
       
  1504           }
       
  1505         }
       
  1506       break;
       
  1507 
       
  1508       /*-----------------------------------------------------------------*/
       
  1509       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
       
  1510       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
       
  1511       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
       
  1512       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
       
  1513       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
       
  1514         { ADD_ACTIVE(state_offset + 4, 0); }
       
  1515       count = current_state->count;  /* Number already matched */
       
  1516       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
       
  1517         {
       
  1518         const uschar *nptr = ptr + clen;
       
  1519         int ncount = 0;
       
  1520         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
       
  1521           {
       
  1522           active_count--;           /* Remove non-match possibility */
       
  1523           next_active_state--;
       
  1524           }
       
  1525         while (nptr < end_subject)
       
  1526           {
       
  1527           int nd;
       
  1528           int ndlen = 1;
       
  1529           GETCHARLEN(nd, nptr, ndlen);
       
  1530           if (UCD_CATEGORY(nd) != ucp_M) break;
       
  1531           ncount++;
       
  1532           nptr += ndlen;
       
  1533           }
       
  1534         if (++count >= GET2(code, 1))
       
  1535           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
       
  1536         else
       
  1537           { ADD_NEW_DATA(-state_offset, count, ncount); }
       
  1538         }
       
  1539       break;
       
  1540 #endif
       
  1541 
       
  1542       /*-----------------------------------------------------------------*/
       
  1543       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
       
  1544       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
       
  1545       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
       
  1546       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
       
  1547       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
       
  1548         { ADD_ACTIVE(state_offset + 4, 0); }
       
  1549       count = current_state->count;  /* Number already matched */
       
  1550       if (clen > 0)
       
  1551         {
       
  1552         int ncount = 0;
       
  1553         switch (c)
       
  1554           {
       
  1555           case 0x000b:
       
  1556           case 0x000c:
       
  1557           case 0x0085:
       
  1558           case 0x2028:
       
  1559           case 0x2029:
       
  1560           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
       
  1561           goto ANYNL03;
       
  1562 
       
  1563           case 0x000d:
       
  1564           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
       
  1565           /* Fall through */
       
  1566 
       
  1567           ANYNL03:
       
  1568           case 0x000a:
       
  1569           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
       
  1570             {
       
  1571             active_count--;           /* Remove non-match possibility */
       
  1572             next_active_state--;
       
  1573             }
       
  1574           if (++count >= GET2(code, 1))
       
  1575             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
       
  1576           else
       
  1577             { ADD_NEW_DATA(-state_offset, count, ncount); }
       
  1578           break;
       
  1579 
       
  1580           default:
       
  1581           break;
       
  1582           }
       
  1583         }
       
  1584       break;
       
  1585 
       
  1586       /*-----------------------------------------------------------------*/
       
  1587       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
       
  1588       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
       
  1589       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
       
  1590       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
       
  1591       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
       
  1592         { ADD_ACTIVE(state_offset + 4, 0); }
       
  1593       count = current_state->count;  /* Number already matched */
       
  1594       if (clen > 0)
       
  1595         {
       
  1596         BOOL OK;
       
  1597         switch (c)
       
  1598           {
       
  1599           case 0x000a:
       
  1600           case 0x000b:
       
  1601           case 0x000c:
       
  1602           case 0x000d:
       
  1603           case 0x0085:
       
  1604           case 0x2028:
       
  1605           case 0x2029:
       
  1606           OK = TRUE;
       
  1607           break;
       
  1608 
       
  1609           default:
       
  1610           OK = FALSE;
       
  1611           }
       
  1612 
       
  1613         if (OK == (d == OP_VSPACE))
       
  1614           {
       
  1615           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
       
  1616             {
       
  1617             active_count--;           /* Remove non-match possibility */
       
  1618             next_active_state--;
       
  1619             }
       
  1620           if (++count >= GET2(code, 1))
       
  1621             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
       
  1622           else
       
  1623             { ADD_NEW_DATA(-state_offset, count, 0); }
       
  1624           }
       
  1625         }
       
  1626       break;
       
  1627 
       
  1628       /*-----------------------------------------------------------------*/
       
  1629       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
       
  1630       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
       
  1631       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
       
  1632       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
       
  1633       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
       
  1634         { ADD_ACTIVE(state_offset + 4, 0); }
       
  1635       count = current_state->count;  /* Number already matched */
       
  1636       if (clen > 0)
       
  1637         {
       
  1638         BOOL OK;
       
  1639         switch (c)
       
  1640           {
       
  1641           case 0x09:      /* HT */
       
  1642           case 0x20:      /* SPACE */
       
  1643           case 0xa0:      /* NBSP */
       
  1644           case 0x1680:    /* OGHAM SPACE MARK */
       
  1645           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
       
  1646           case 0x2000:    /* EN QUAD */
       
  1647           case 0x2001:    /* EM QUAD */
       
  1648           case 0x2002:    /* EN SPACE */
       
  1649           case 0x2003:    /* EM SPACE */
       
  1650           case 0x2004:    /* THREE-PER-EM SPACE */
       
  1651           case 0x2005:    /* FOUR-PER-EM SPACE */
       
  1652           case 0x2006:    /* SIX-PER-EM SPACE */
       
  1653           case 0x2007:    /* FIGURE SPACE */
       
  1654           case 0x2008:    /* PUNCTUATION SPACE */
       
  1655           case 0x2009:    /* THIN SPACE */
       
  1656           case 0x200A:    /* HAIR SPACE */
       
  1657           case 0x202f:    /* NARROW NO-BREAK SPACE */
       
  1658           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
       
  1659           case 0x3000:    /* IDEOGRAPHIC SPACE */
       
  1660           OK = TRUE;
       
  1661           break;
       
  1662 
       
  1663           default:
       
  1664           OK = FALSE;
       
  1665           break;
       
  1666           }
       
  1667 
       
  1668         if (OK == (d == OP_HSPACE))
       
  1669           {
       
  1670           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
       
  1671             {
       
  1672             active_count--;           /* Remove non-match possibility */
       
  1673             next_active_state--;
       
  1674             }
       
  1675           if (++count >= GET2(code, 1))
       
  1676             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
       
  1677           else
       
  1678             { ADD_NEW_DATA(-state_offset, count, 0); }
       
  1679           }
       
  1680         }
       
  1681       break;
       
  1682 
       
  1683 /* ========================================================================== */
       
  1684       /* These opcodes are followed by a character that is usually compared
       
  1685       to the current subject character; it is loaded into d. We still get
       
  1686       here even if there is no subject character, because in some cases zero
       
  1687       repetitions are permitted. */
       
  1688 
       
  1689       /*-----------------------------------------------------------------*/
       
  1690       case OP_CHAR:
       
  1691       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
       
  1692       break;
       
  1693 
       
  1694       /*-----------------------------------------------------------------*/
       
  1695       case OP_CHARNC:
       
  1696       if (clen == 0) break;
       
  1697 
       
  1698 #ifdef SUPPORT_UTF8
       
  1699       if (utf8)
       
  1700         {
       
  1701         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
       
  1702           {
       
  1703           unsigned int othercase;
       
  1704           if (c < 128) othercase = fcc[c]; else
       
  1705 
       
  1706           /* If we have Unicode property support, we can use it to test the
       
  1707           other case of the character. */
       
  1708 
       
  1709 #ifdef SUPPORT_UCP
       
  1710           othercase = UCD_OTHERCASE(c);
       
  1711 #else
       
  1712           othercase = NOTACHAR;
       
  1713 #endif
       
  1714 
       
  1715           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
       
  1716           }
       
  1717         }
       
  1718       else
       
  1719 #endif  /* SUPPORT_UTF8 */
       
  1720 
       
  1721       /* Non-UTF-8 mode */
       
  1722         {
       
  1723         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
       
  1724         }
       
  1725       break;
       
  1726 
       
  1727 
       
  1728 #ifdef SUPPORT_UCP
       
  1729       /*-----------------------------------------------------------------*/
       
  1730       /* This is a tricky one because it can match more than one character.
       
  1731       Find out how many characters to skip, and then set up a negative state
       
  1732       to wait for them to pass before continuing. */
       
  1733 
       
  1734       case OP_EXTUNI:
       
  1735       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
       
  1736         {
       
  1737         const uschar *nptr = ptr + clen;
       
  1738         int ncount = 0;
       
  1739         while (nptr < end_subject)
       
  1740           {
       
  1741           int nclen = 1;
       
  1742           GETCHARLEN(c, nptr, nclen);
       
  1743           if (UCD_CATEGORY(c) != ucp_M) break;
       
  1744           ncount++;
       
  1745           nptr += nclen;
       
  1746           }
       
  1747         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
       
  1748         }
       
  1749       break;
       
  1750 #endif
       
  1751 
       
  1752       /*-----------------------------------------------------------------*/
       
  1753       /* This is tricky like EXTUNI because it too can match more than one
       
  1754       character (when CR is followed by LF). In this case, set up a negative
       
  1755       state to wait for one character to pass before continuing. */
       
  1756 
       
  1757       case OP_ANYNL:
       
  1758       if (clen > 0) switch(c)
       
  1759         {
       
  1760         case 0x000b:
       
  1761         case 0x000c:
       
  1762         case 0x0085:
       
  1763         case 0x2028:
       
  1764         case 0x2029:
       
  1765         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
       
  1766 
       
  1767         case 0x000a:
       
  1768         ADD_NEW(state_offset + 1, 0);
       
  1769         break;
       
  1770 
       
  1771         case 0x000d:
       
  1772         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
       
  1773           {
       
  1774           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
       
  1775           }
       
  1776         else
       
  1777           {
       
  1778           ADD_NEW(state_offset + 1, 0);
       
  1779           }
       
  1780         break;
       
  1781         }
       
  1782       break;
       
  1783 
       
  1784       /*-----------------------------------------------------------------*/
       
  1785       case OP_NOT_VSPACE:
       
  1786       if (clen > 0) switch(c)
       
  1787         {
       
  1788         case 0x000a:
       
  1789         case 0x000b:
       
  1790         case 0x000c:
       
  1791         case 0x000d:
       
  1792         case 0x0085:
       
  1793         case 0x2028:
       
  1794         case 0x2029:
       
  1795         break;
       
  1796 
       
  1797         default:
       
  1798         ADD_NEW(state_offset + 1, 0);
       
  1799         break;
       
  1800         }
       
  1801       break;
       
  1802 
       
  1803       /*-----------------------------------------------------------------*/
       
  1804       case OP_VSPACE:
       
  1805       if (clen > 0) switch(c)
       
  1806         {
       
  1807         case 0x000a:
       
  1808         case 0x000b:
       
  1809         case 0x000c:
       
  1810         case 0x000d:
       
  1811         case 0x0085:
       
  1812         case 0x2028:
       
  1813         case 0x2029:
       
  1814         ADD_NEW(state_offset + 1, 0);
       
  1815         break;
       
  1816 
       
  1817         default: break;
       
  1818         }
       
  1819       break;
       
  1820 
       
  1821       /*-----------------------------------------------------------------*/
       
  1822       case OP_NOT_HSPACE:
       
  1823       if (clen > 0) switch(c)
       
  1824         {
       
  1825         case 0x09:      /* HT */
       
  1826         case 0x20:      /* SPACE */
       
  1827         case 0xa0:      /* NBSP */
       
  1828         case 0x1680:    /* OGHAM SPACE MARK */
       
  1829         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
       
  1830         case 0x2000:    /* EN QUAD */
       
  1831         case 0x2001:    /* EM QUAD */
       
  1832         case 0x2002:    /* EN SPACE */
       
  1833         case 0x2003:    /* EM SPACE */
       
  1834         case 0x2004:    /* THREE-PER-EM SPACE */
       
  1835         case 0x2005:    /* FOUR-PER-EM SPACE */
       
  1836         case 0x2006:    /* SIX-PER-EM SPACE */
       
  1837         case 0x2007:    /* FIGURE SPACE */
       
  1838         case 0x2008:    /* PUNCTUATION SPACE */
       
  1839         case 0x2009:    /* THIN SPACE */
       
  1840         case 0x200A:    /* HAIR SPACE */
       
  1841         case 0x202f:    /* NARROW NO-BREAK SPACE */
       
  1842         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
       
  1843         case 0x3000:    /* IDEOGRAPHIC SPACE */
       
  1844         break;
       
  1845 
       
  1846         default:
       
  1847         ADD_NEW(state_offset + 1, 0);
       
  1848         break;
       
  1849         }
       
  1850       break;
       
  1851 
       
  1852       /*-----------------------------------------------------------------*/
       
  1853       case OP_HSPACE:
       
  1854       if (clen > 0) switch(c)
       
  1855         {
       
  1856         case 0x09:      /* HT */
       
  1857         case 0x20:      /* SPACE */
       
  1858         case 0xa0:      /* NBSP */
       
  1859         case 0x1680:    /* OGHAM SPACE MARK */
       
  1860         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
       
  1861         case 0x2000:    /* EN QUAD */
       
  1862         case 0x2001:    /* EM QUAD */
       
  1863         case 0x2002:    /* EN SPACE */
       
  1864         case 0x2003:    /* EM SPACE */
       
  1865         case 0x2004:    /* THREE-PER-EM SPACE */
       
  1866         case 0x2005:    /* FOUR-PER-EM SPACE */
       
  1867         case 0x2006:    /* SIX-PER-EM SPACE */
       
  1868         case 0x2007:    /* FIGURE SPACE */
       
  1869         case 0x2008:    /* PUNCTUATION SPACE */
       
  1870         case 0x2009:    /* THIN SPACE */
       
  1871         case 0x200A:    /* HAIR SPACE */
       
  1872         case 0x202f:    /* NARROW NO-BREAK SPACE */
       
  1873         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
       
  1874         case 0x3000:    /* IDEOGRAPHIC SPACE */
       
  1875         ADD_NEW(state_offset + 1, 0);
       
  1876         break;
       
  1877         }
       
  1878       break;
       
  1879 
       
  1880       /*-----------------------------------------------------------------*/
       
  1881       /* Match a negated single character. This is only used for one-byte
       
  1882       characters, that is, we know that d < 256. The character we are
       
  1883       checking (c) can be multibyte. */
       
  1884 
       
  1885       case OP_NOT:
       
  1886       if (clen > 0)
       
  1887         {
       
  1888         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
       
  1889         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
       
  1890         }
       
  1891       break;
       
  1892 
       
  1893       /*-----------------------------------------------------------------*/
       
  1894       case OP_PLUS:
       
  1895       case OP_MINPLUS:
       
  1896       case OP_POSPLUS:
       
  1897       case OP_NOTPLUS:
       
  1898       case OP_NOTMINPLUS:
       
  1899       case OP_NOTPOSPLUS:
       
  1900       count = current_state->count;  /* Already matched */
       
  1901       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
       
  1902       if (clen > 0)
       
  1903         {
       
  1904         unsigned int otherd = NOTACHAR;
       
  1905         if ((ims & PCRE_CASELESS) != 0)
       
  1906           {
       
  1907 #ifdef SUPPORT_UTF8
       
  1908           if (utf8 && d >= 128)
       
  1909             {
       
  1910 #ifdef SUPPORT_UCP
       
  1911             otherd = UCD_OTHERCASE(d);
       
  1912 #endif  /* SUPPORT_UCP */
       
  1913             }
       
  1914           else
       
  1915 #endif  /* SUPPORT_UTF8 */
       
  1916           otherd = fcc[d];
       
  1917           }
       
  1918         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
       
  1919           {
       
  1920           if (count > 0 &&
       
  1921               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
       
  1922             {
       
  1923             active_count--;             /* Remove non-match possibility */
       
  1924             next_active_state--;
       
  1925             }
       
  1926           count++;
       
  1927           ADD_NEW(state_offset, count);
       
  1928           }
       
  1929         }
       
  1930       break;
       
  1931 
       
  1932       /*-----------------------------------------------------------------*/
       
  1933       case OP_QUERY:
       
  1934       case OP_MINQUERY:
       
  1935       case OP_POSQUERY:
       
  1936       case OP_NOTQUERY:
       
  1937       case OP_NOTMINQUERY:
       
  1938       case OP_NOTPOSQUERY:
       
  1939       ADD_ACTIVE(state_offset + dlen + 1, 0);
       
  1940       if (clen > 0)
       
  1941         {
       
  1942         unsigned int otherd = NOTACHAR;
       
  1943         if ((ims & PCRE_CASELESS) != 0)
       
  1944           {
       
  1945 #ifdef SUPPORT_UTF8
       
  1946           if (utf8 && d >= 128)
       
  1947             {
       
  1948 #ifdef SUPPORT_UCP
       
  1949             otherd = UCD_OTHERCASE(d);
       
  1950 #endif  /* SUPPORT_UCP */
       
  1951             }
       
  1952           else
       
  1953 #endif  /* SUPPORT_UTF8 */
       
  1954           otherd = fcc[d];
       
  1955           }
       
  1956         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
       
  1957           {
       
  1958           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
       
  1959             {
       
  1960             active_count--;            /* Remove non-match possibility */
       
  1961             next_active_state--;
       
  1962             }
       
  1963           ADD_NEW(state_offset + dlen + 1, 0);
       
  1964           }
       
  1965         }
       
  1966       break;
       
  1967 
       
  1968       /*-----------------------------------------------------------------*/
       
  1969       case OP_STAR:
       
  1970       case OP_MINSTAR:
       
  1971       case OP_POSSTAR:
       
  1972       case OP_NOTSTAR:
       
  1973       case OP_NOTMINSTAR:
       
  1974       case OP_NOTPOSSTAR:
       
  1975       ADD_ACTIVE(state_offset + dlen + 1, 0);
       
  1976       if (clen > 0)
       
  1977         {
       
  1978         unsigned int otherd = NOTACHAR;
       
  1979         if ((ims & PCRE_CASELESS) != 0)
       
  1980           {
       
  1981 #ifdef SUPPORT_UTF8
       
  1982           if (utf8 && d >= 128)
       
  1983             {
       
  1984 #ifdef SUPPORT_UCP
       
  1985             otherd = UCD_OTHERCASE(d);
       
  1986 #endif  /* SUPPORT_UCP */
       
  1987             }
       
  1988           else
       
  1989 #endif  /* SUPPORT_UTF8 */
       
  1990           otherd = fcc[d];
       
  1991           }
       
  1992         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
       
  1993           {
       
  1994           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
       
  1995             {
       
  1996             active_count--;            /* Remove non-match possibility */
       
  1997             next_active_state--;
       
  1998             }
       
  1999           ADD_NEW(state_offset, 0);
       
  2000           }
       
  2001         }
       
  2002       break;
       
  2003 
       
  2004       /*-----------------------------------------------------------------*/
       
  2005       case OP_EXACT:
       
  2006       case OP_NOTEXACT:
       
  2007       count = current_state->count;  /* Number already matched */
       
  2008       if (clen > 0)
       
  2009         {
       
  2010         unsigned int otherd = NOTACHAR;
       
  2011         if ((ims & PCRE_CASELESS) != 0)
       
  2012           {
       
  2013 #ifdef SUPPORT_UTF8
       
  2014           if (utf8 && d >= 128)
       
  2015             {
       
  2016 #ifdef SUPPORT_UCP
       
  2017             otherd = UCD_OTHERCASE(d);
       
  2018 #endif  /* SUPPORT_UCP */
       
  2019             }
       
  2020           else
       
  2021 #endif  /* SUPPORT_UTF8 */
       
  2022           otherd = fcc[d];
       
  2023           }
       
  2024         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
       
  2025           {
       
  2026           if (++count >= GET2(code, 1))
       
  2027             { ADD_NEW(state_offset + dlen + 3, 0); }
       
  2028           else
       
  2029             { ADD_NEW(state_offset, count); }
       
  2030           }
       
  2031         }
       
  2032       break;
       
  2033 
       
  2034       /*-----------------------------------------------------------------*/
       
  2035       case OP_UPTO:
       
  2036       case OP_MINUPTO:
       
  2037       case OP_POSUPTO:
       
  2038       case OP_NOTUPTO:
       
  2039       case OP_NOTMINUPTO:
       
  2040       case OP_NOTPOSUPTO:
       
  2041       ADD_ACTIVE(state_offset + dlen + 3, 0);
       
  2042       count = current_state->count;  /* Number already matched */
       
  2043       if (clen > 0)
       
  2044         {
       
  2045         unsigned int otherd = NOTACHAR;
       
  2046         if ((ims & PCRE_CASELESS) != 0)
       
  2047           {
       
  2048 #ifdef SUPPORT_UTF8
       
  2049           if (utf8 && d >= 128)
       
  2050             {
       
  2051 #ifdef SUPPORT_UCP
       
  2052             otherd = UCD_OTHERCASE(d);
       
  2053 #endif  /* SUPPORT_UCP */
       
  2054             }
       
  2055           else
       
  2056 #endif  /* SUPPORT_UTF8 */
       
  2057           otherd = fcc[d];
       
  2058           }
       
  2059         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
       
  2060           {
       
  2061           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
       
  2062             {
       
  2063             active_count--;             /* Remove non-match possibility */
       
  2064             next_active_state--;
       
  2065             }
       
  2066           if (++count >= GET2(code, 1))
       
  2067             { ADD_NEW(state_offset + dlen + 3, 0); }
       
  2068           else
       
  2069             { ADD_NEW(state_offset, count); }
       
  2070           }
       
  2071         }
       
  2072       break;
       
  2073 
       
  2074 
       
  2075 /* ========================================================================== */
       
  2076       /* These are the class-handling opcodes */
       
  2077 
       
  2078       case OP_CLASS:
       
  2079       case OP_NCLASS:
       
  2080       case OP_XCLASS:
       
  2081         {
       
  2082         BOOL isinclass = FALSE;
       
  2083         int next_state_offset;
       
  2084         const uschar *ecode;
       
  2085 
       
  2086         /* For a simple class, there is always just a 32-byte table, and we
       
  2087         can set isinclass from it. */
       
  2088 
       
  2089         if (codevalue != OP_XCLASS)
       
  2090           {
       
  2091           ecode = code + 33;
       
  2092           if (clen > 0)
       
  2093             {
       
  2094             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
       
  2095               ((code[1 + c/8] & (1 << (c&7))) != 0);
       
  2096             }
       
  2097           }
       
  2098 
       
  2099         /* An extended class may have a table or a list of single characters,
       
  2100         ranges, or both, and it may be positive or negative. There's a
       
  2101         function that sorts all this out. */
       
  2102 
       
  2103         else
       
  2104          {
       
  2105          ecode = code + GET(code, 1);
       
  2106          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
       
  2107          }
       
  2108 
       
  2109         /* At this point, isinclass is set for all kinds of class, and ecode
       
  2110         points to the byte after the end of the class. If there is a
       
  2111         quantifier, this is where it will be. */
       
  2112 
       
  2113         next_state_offset = ecode - start_code;
       
  2114 
       
  2115         switch (*ecode)
       
  2116           {
       
  2117           case OP_CRSTAR:
       
  2118           case OP_CRMINSTAR:
       
  2119           ADD_ACTIVE(next_state_offset + 1, 0);
       
  2120           if (isinclass) { ADD_NEW(state_offset, 0); }
       
  2121           break;
       
  2122 
       
  2123           case OP_CRPLUS:
       
  2124           case OP_CRMINPLUS:
       
  2125           count = current_state->count;  /* Already matched */
       
  2126           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
       
  2127           if (isinclass) { count++; ADD_NEW(state_offset, count); }
       
  2128           break;
       
  2129 
       
  2130           case OP_CRQUERY:
       
  2131           case OP_CRMINQUERY:
       
  2132           ADD_ACTIVE(next_state_offset + 1, 0);
       
  2133           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
       
  2134           break;
       
  2135 
       
  2136           case OP_CRRANGE:
       
  2137           case OP_CRMINRANGE:
       
  2138           count = current_state->count;  /* Already matched */
       
  2139           if (count >= GET2(ecode, 1))
       
  2140             { ADD_ACTIVE(next_state_offset + 5, 0); }
       
  2141           if (isinclass)
       
  2142             {
       
  2143             int max = GET2(ecode, 3);
       
  2144             if (++count >= max && max != 0)   /* Max 0 => no limit */
       
  2145               { ADD_NEW(next_state_offset + 5, 0); }
       
  2146             else
       
  2147               { ADD_NEW(state_offset, count); }
       
  2148             }
       
  2149           break;
       
  2150 
       
  2151           default:
       
  2152           if (isinclass) { ADD_NEW(next_state_offset, 0); }
       
  2153           break;
       
  2154           }
       
  2155         }
       
  2156       break;
       
  2157 
       
  2158 /* ========================================================================== */
       
  2159       /* These are the opcodes for fancy brackets of various kinds. We have
       
  2160       to use recursion in order to handle them. The "always failing" assertion
       
  2161       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
       
  2162       though the other "backtracking verbs" are not supported. */
       
  2163 
       
  2164       case OP_FAIL:
       
  2165       break;
       
  2166 
       
  2167       case OP_ASSERT:
       
  2168       case OP_ASSERT_NOT:
       
  2169       case OP_ASSERTBACK:
       
  2170       case OP_ASSERTBACK_NOT:
       
  2171         {
       
  2172         int rc;
       
  2173         int local_offsets[2];
       
  2174         int local_workspace[1000];
       
  2175         const uschar *endasscode = code + GET(code, 1);
       
  2176 
       
  2177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
       
  2178 
       
  2179         rc = internal_dfa_exec(
       
  2180           md,                                   /* static match data */
       
  2181           code,                                 /* this subexpression's code */
       
  2182           ptr,                                  /* where we currently are */
       
  2183           ptr - start_subject,                  /* start offset */
       
  2184           local_offsets,                        /* offset vector */
       
  2185           sizeof(local_offsets)/sizeof(int),    /* size of same */
       
  2186           local_workspace,                      /* workspace vector */
       
  2187           sizeof(local_workspace)/sizeof(int),  /* size of same */
       
  2188           ims,                                  /* the current ims flags */
       
  2189           rlevel,                               /* function recursion level */
       
  2190           recursing);                           /* pass on regex recursion */
       
  2191 
       
  2192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
       
  2193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
       
  2194         }
       
  2195       break;
       
  2196 
       
  2197       /*-----------------------------------------------------------------*/
       
  2198       case OP_COND:
       
  2199       case OP_SCOND:
       
  2200         {
       
  2201         int local_offsets[1000];
       
  2202         int local_workspace[1000];
       
  2203         int condcode = code[LINK_SIZE+1];
       
  2204 
       
  2205         /* Back reference conditions are not supported */
       
  2206 
       
  2207         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
       
  2208 
       
  2209         /* The DEFINE condition is always false */
       
  2210 
       
  2211         if (condcode == OP_DEF)
       
  2212           {
       
  2213           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
       
  2214           }
       
  2215 
       
  2216         /* The only supported version of OP_RREF is for the value RREF_ANY,
       
  2217         which means "test if in any recursion". We can't test for specifically
       
  2218         recursed groups. */
       
  2219 
       
  2220         else if (condcode == OP_RREF)
       
  2221           {
       
  2222           int value = GET2(code, LINK_SIZE+2);
       
  2223           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
       
  2224           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
       
  2225             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
       
  2226           }
       
  2227 
       
  2228         /* Otherwise, the condition is an assertion */
       
  2229 
       
  2230         else
       
  2231           {
       
  2232           int rc;
       
  2233           const uschar *asscode = code + LINK_SIZE + 1;
       
  2234           const uschar *endasscode = asscode + GET(asscode, 1);
       
  2235 
       
  2236           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
       
  2237 
       
  2238           rc = internal_dfa_exec(
       
  2239             md,                                   /* fixed match data */
       
  2240             asscode,                              /* this subexpression's code */
       
  2241             ptr,                                  /* where we currently are */
       
  2242             ptr - start_subject,                  /* start offset */
       
  2243             local_offsets,                        /* offset vector */
       
  2244             sizeof(local_offsets)/sizeof(int),    /* size of same */
       
  2245             local_workspace,                      /* workspace vector */
       
  2246             sizeof(local_workspace)/sizeof(int),  /* size of same */
       
  2247             ims,                                  /* the current ims flags */
       
  2248             rlevel,                               /* function recursion level */
       
  2249             recursing);                           /* pass on regex recursion */
       
  2250 
       
  2251           if ((rc >= 0) ==
       
  2252                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
       
  2253             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
       
  2254           else
       
  2255             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
       
  2256           }
       
  2257         }
       
  2258       break;
       
  2259 
       
  2260       /*-----------------------------------------------------------------*/
       
  2261       case OP_RECURSE:
       
  2262         {
       
  2263         int local_offsets[1000];
       
  2264         int local_workspace[1000];
       
  2265         int rc;
       
  2266 
       
  2267         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
       
  2268           recursing + 1));
       
  2269 
       
  2270         rc = internal_dfa_exec(
       
  2271           md,                                   /* fixed match data */
       
  2272           start_code + GET(code, 1),            /* this subexpression's code */
       
  2273           ptr,                                  /* where we currently are */
       
  2274           ptr - start_subject,                  /* start offset */
       
  2275           local_offsets,                        /* offset vector */
       
  2276           sizeof(local_offsets)/sizeof(int),    /* size of same */
       
  2277           local_workspace,                      /* workspace vector */
       
  2278           sizeof(local_workspace)/sizeof(int),  /* size of same */
       
  2279           ims,                                  /* the current ims flags */
       
  2280           rlevel,                               /* function recursion level */
       
  2281           recursing + 1);                       /* regex recurse level */
       
  2282 
       
  2283         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
       
  2284           recursing + 1, rc));
       
  2285 
       
  2286         /* Ran out of internal offsets */
       
  2287 
       
  2288         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
       
  2289 
       
  2290         /* For each successful matched substring, set up the next state with a
       
  2291         count of characters to skip before trying it. Note that the count is in
       
  2292         characters, not bytes. */
       
  2293 
       
  2294         if (rc > 0)
       
  2295           {
       
  2296           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
       
  2297             {
       
  2298             const uschar *p = start_subject + local_offsets[rc];
       
  2299             const uschar *pp = start_subject + local_offsets[rc+1];
       
  2300             int charcount = local_offsets[rc+1] - local_offsets[rc];
       
  2301             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
       
  2302             if (charcount > 0)
       
  2303               {
       
  2304               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
       
  2305               }
       
  2306             else
       
  2307               {
       
  2308               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
       
  2309               }
       
  2310             }
       
  2311           }
       
  2312         else if (rc != PCRE_ERROR_NOMATCH) return rc;
       
  2313         }
       
  2314       break;
       
  2315 
       
  2316       /*-----------------------------------------------------------------*/
       
  2317       case OP_ONCE:
       
  2318         {
       
  2319         int local_offsets[2];
       
  2320         int local_workspace[1000];
       
  2321 
       
  2322         int rc = internal_dfa_exec(
       
  2323           md,                                   /* fixed match data */
       
  2324           code,                                 /* this subexpression's code */
       
  2325           ptr,                                  /* where we currently are */
       
  2326           ptr - start_subject,                  /* start offset */
       
  2327           local_offsets,                        /* offset vector */
       
  2328           sizeof(local_offsets)/sizeof(int),    /* size of same */
       
  2329           local_workspace,                      /* workspace vector */
       
  2330           sizeof(local_workspace)/sizeof(int),  /* size of same */
       
  2331           ims,                                  /* the current ims flags */
       
  2332           rlevel,                               /* function recursion level */
       
  2333           recursing);                           /* pass on regex recursion */
       
  2334 
       
  2335         if (rc >= 0)
       
  2336           {
       
  2337           const uschar *end_subpattern = code;
       
  2338           int charcount = local_offsets[1] - local_offsets[0];
       
  2339           int next_state_offset, repeat_state_offset;
       
  2340 
       
  2341           do { end_subpattern += GET(end_subpattern, 1); }
       
  2342             while (*end_subpattern == OP_ALT);
       
  2343           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
       
  2344 
       
  2345           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
       
  2346           arrange for the repeat state also to be added to the relevant list.
       
  2347           Calculate the offset, or set -1 for no repeat. */
       
  2348 
       
  2349           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
       
  2350                                  *end_subpattern == OP_KETRMIN)?
       
  2351             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
       
  2352 
       
  2353           /* If we have matched an empty string, add the next state at the
       
  2354           current character pointer. This is important so that the duplicate
       
  2355           checking kicks in, which is what breaks infinite loops that match an
       
  2356           empty string. */
       
  2357 
       
  2358           if (charcount == 0)
       
  2359             {
       
  2360             ADD_ACTIVE(next_state_offset, 0);
       
  2361             }
       
  2362 
       
  2363           /* Optimization: if there are no more active states, and there
       
  2364           are no new states yet set up, then skip over the subject string
       
  2365           right here, to save looping. Otherwise, set up the new state to swing
       
  2366           into action when the end of the substring is reached. */
       
  2367 
       
  2368           else if (i + 1 >= active_count && new_count == 0)
       
  2369             {
       
  2370             ptr += charcount;
       
  2371             clen = 0;
       
  2372             ADD_NEW(next_state_offset, 0);
       
  2373 
       
  2374             /* If we are adding a repeat state at the new character position,
       
  2375             we must fudge things so that it is the only current state.
       
  2376             Otherwise, it might be a duplicate of one we processed before, and
       
  2377             that would cause it to be skipped. */
       
  2378 
       
  2379             if (repeat_state_offset >= 0)
       
  2380               {
       
  2381               next_active_state = active_states;
       
  2382               active_count = 0;
       
  2383               i = -1;
       
  2384               ADD_ACTIVE(repeat_state_offset, 0);
       
  2385               }
       
  2386             }
       
  2387           else
       
  2388             {
       
  2389             const uschar *p = start_subject + local_offsets[0];
       
  2390             const uschar *pp = start_subject + local_offsets[1];
       
  2391             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
       
  2392             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
       
  2393             if (repeat_state_offset >= 0)
       
  2394               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
       
  2395             }
       
  2396 
       
  2397           }
       
  2398         else if (rc != PCRE_ERROR_NOMATCH) return rc;
       
  2399         }
       
  2400       break;
       
  2401 
       
  2402 
       
  2403 /* ========================================================================== */
       
  2404       /* Handle callouts */
       
  2405 
       
  2406       case OP_CALLOUT:
       
  2407       if (pcre_callout != NULL)
       
  2408         {
       
  2409         int rrc;
       
  2410         pcre_callout_block cb;
       
  2411         cb.version          = 1;   /* Version 1 of the callout block */
       
  2412         cb.callout_number   = code[1];
       
  2413         cb.offset_vector    = offsets;
       
  2414         cb.subject          = (PCRE_SPTR)start_subject;
       
  2415         cb.subject_length   = end_subject - start_subject;
       
  2416         cb.start_match      = current_subject - start_subject;
       
  2417         cb.current_position = ptr - start_subject;
       
  2418         cb.pattern_position = GET(code, 2);
       
  2419         cb.next_item_length = GET(code, 2 + LINK_SIZE);
       
  2420         cb.capture_top      = 1;
       
  2421         cb.capture_last     = -1;
       
  2422         cb.callout_data     = md->callout_data;
       
  2423         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
       
  2424         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
       
  2425         }
       
  2426       break;
       
  2427 
       
  2428 
       
  2429 /* ========================================================================== */
       
  2430       default:        /* Unsupported opcode */
       
  2431       return PCRE_ERROR_DFA_UITEM;
       
  2432       }
       
  2433 
       
  2434     NEXT_ACTIVE_STATE: continue;
       
  2435 
       
  2436     }      /* End of loop scanning active states */
       
  2437 
       
  2438   /* We have finished the processing at the current subject character. If no
       
  2439   new states have been set for the next character, we have found all the
       
  2440   matches that we are going to find. If we are at the top level and partial
       
  2441   matching has been requested, check for appropriate conditions. */
       
  2442 
       
  2443   if (new_count <= 0)
       
  2444     {
       
  2445     if (match_count < 0 &&                     /* No matches found */
       
  2446         rlevel == 1 &&                         /* Top level match function */
       
  2447         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
       
  2448         ptr >= end_subject &&                  /* Reached end of subject */
       
  2449         ptr > current_subject)                 /* Matched non-empty string */
       
  2450       {
       
  2451       if (offsetcount >= 2)
       
  2452         {
       
  2453         offsets[0] = current_subject - start_subject;
       
  2454         offsets[1] = end_subject - start_subject;
       
  2455         }
       
  2456       match_count = PCRE_ERROR_PARTIAL;
       
  2457       }
       
  2458 
       
  2459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
       
  2460       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
       
  2461       rlevel*2-2, SP));
       
  2462     break;        /* In effect, "return", but see the comment below */
       
  2463     }
       
  2464 
       
  2465   /* One or more states are active for the next character. */
       
  2466 
       
  2467   ptr += clen;    /* Advance to next subject character */
       
  2468   }               /* Loop to move along the subject string */
       
  2469 
       
  2470 /* Control gets here from "break" a few lines above. We do it this way because
       
  2471 if we use "return" above, we have compiler trouble. Some compilers warn if
       
  2472 there's nothing here because they think the function doesn't return a value. On
       
  2473 the other hand, if we put a dummy statement here, some more clever compilers
       
  2474 complain that it can't be reached. Sigh. */
       
  2475 
       
  2476 return match_count;
       
  2477 }
       
  2478 
       
  2479 
       
  2480 
       
  2481 
       
  2482 /*************************************************
       
  2483 *    Execute a Regular Expression - DFA engine   *
       
  2484 *************************************************/
       
  2485 
       
  2486 /* This external function applies a compiled re to a subject string using a DFA
       
  2487 engine. This function calls the internal function multiple times if the pattern
       
  2488 is not anchored.
       
  2489 
       
  2490 Arguments:
       
  2491   argument_re     points to the compiled expression
       
  2492   extra_data      points to extra data or is NULL
       
  2493   subject         points to the subject string
       
  2494   length          length of subject string (may contain binary zeros)
       
  2495   start_offset    where to start in the subject string
       
  2496   options         option bits
       
  2497   offsets         vector of match offsets
       
  2498   offsetcount     size of same
       
  2499   workspace       workspace vector
       
  2500   wscount         size of same
       
  2501 
       
  2502 Returns:          > 0 => number of match offset pairs placed in offsets
       
  2503                   = 0 => offsets overflowed; longest matches are present
       
  2504                    -1 => failed to match
       
  2505                  < -1 => some kind of unexpected problem
       
  2506 */
       
  2507 
       
  2508 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
       
  2509 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
       
  2510   const char *subject, int length, int start_offset, int options, int *offsets,
       
  2511   int offsetcount, int *workspace, int wscount)
       
  2512 {
       
  2513 real_pcre *re = (real_pcre *)argument_re;
       
  2514 dfa_match_data match_block;
       
  2515 dfa_match_data *md = &match_block;
       
  2516 BOOL utf8, anchored, startline, firstline;
       
  2517 const uschar *current_subject, *end_subject, *lcc;
       
  2518 
       
  2519 pcre_study_data internal_study;
       
  2520 const pcre_study_data *study = NULL;
       
  2521 real_pcre internal_re;
       
  2522 
       
  2523 const uschar *req_byte_ptr;
       
  2524 const uschar *start_bits = NULL;
       
  2525 BOOL first_byte_caseless = FALSE;
       
  2526 BOOL req_byte_caseless = FALSE;
       
  2527 int first_byte = -1;
       
  2528 int req_byte = -1;
       
  2529 int req_byte2 = -1;
       
  2530 int newline;
       
  2531 
       
  2532 /* Plausibility checks */
       
  2533 
       
  2534 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
       
  2535 if (re == NULL || subject == NULL || workspace == NULL ||
       
  2536    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
       
  2537 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
       
  2538 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
       
  2539 
       
  2540 /* We need to find the pointer to any study data before we test for byte
       
  2541 flipping, so we scan the extra_data block first. This may set two fields in the
       
  2542 match block, so we must initialize them beforehand. However, the other fields
       
  2543 in the match block must not be set until after the byte flipping. */
       
  2544 
       
  2545 md->tables = re->tables;
       
  2546 md->callout_data = NULL;
       
  2547 
       
  2548 if (extra_data != NULL)
       
  2549   {
       
  2550   unsigned int flags = extra_data->flags;
       
  2551   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
       
  2552     study = (const pcre_study_data *)extra_data->study_data;
       
  2553   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
       
  2554   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
       
  2555     return PCRE_ERROR_DFA_UMLIMIT;
       
  2556   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
       
  2557     md->callout_data = extra_data->callout_data;
       
  2558   if ((flags & PCRE_EXTRA_TABLES) != 0)
       
  2559     md->tables = extra_data->tables;
       
  2560   }
       
  2561 
       
  2562 /* Check that the first field in the block is the magic number. If it is not,
       
  2563 test for a regex that was compiled on a host of opposite endianness. If this is
       
  2564 the case, flipped values are put in internal_re and internal_study if there was
       
  2565 study data too. */
       
  2566 
       
  2567 if (re->magic_number != MAGIC_NUMBER)
       
  2568   {
       
  2569   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
       
  2570   if (re == NULL) return PCRE_ERROR_BADMAGIC;
       
  2571   if (study != NULL) study = &internal_study;
       
  2572   }
       
  2573 
       
  2574 /* Set some local values */
       
  2575 
       
  2576 current_subject = (const unsigned char *)subject + start_offset;
       
  2577 end_subject = (const unsigned char *)subject + length;
       
  2578 req_byte_ptr = current_subject - 1;
       
  2579 
       
  2580 #ifdef SUPPORT_UTF8
       
  2581 utf8 = (re->options & PCRE_UTF8) != 0;
       
  2582 #else
       
  2583 utf8 = FALSE;
       
  2584 #endif
       
  2585 
       
  2586 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
       
  2587   (re->options & PCRE_ANCHORED) != 0;
       
  2588 
       
  2589 /* The remaining fixed data for passing around. */
       
  2590 
       
  2591 md->start_code = (const uschar *)argument_re +
       
  2592     re->name_table_offset + re->name_count * re->name_entry_size;
       
  2593 md->start_subject = (const unsigned char *)subject;
       
  2594 md->end_subject = end_subject;
       
  2595 md->moptions = options;
       
  2596 md->poptions = re->options;
       
  2597 
       
  2598 /* If the BSR option is not set at match time, copy what was set
       
  2599 at compile time. */
       
  2600 
       
  2601 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
       
  2602   {
       
  2603   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
       
  2604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
       
  2605 #ifdef BSR_ANYCRLF
       
  2606   else md->moptions |= PCRE_BSR_ANYCRLF;
       
  2607 #endif
       
  2608   }
       
  2609 
       
  2610 /* Handle different types of newline. The three bits give eight cases. If
       
  2611 nothing is set at run time, whatever was used at compile time applies. */
       
  2612 
       
  2613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
       
  2614          PCRE_NEWLINE_BITS)
       
  2615   {
       
  2616   case 0: newline = NEWLINE; break;   /* Compile-time default */
       
  2617   case PCRE_NEWLINE_CR: newline = '\r'; break;
       
  2618   case PCRE_NEWLINE_LF: newline = '\n'; break;
       
  2619   case PCRE_NEWLINE_CR+
       
  2620        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
       
  2621   case PCRE_NEWLINE_ANY: newline = -1; break;
       
  2622   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
       
  2623   default: return PCRE_ERROR_BADNEWLINE;
       
  2624   }
       
  2625 
       
  2626 if (newline == -2)
       
  2627   {
       
  2628   md->nltype = NLTYPE_ANYCRLF;
       
  2629   }
       
  2630 else if (newline < 0)
       
  2631   {
       
  2632   md->nltype = NLTYPE_ANY;
       
  2633   }
       
  2634 else
       
  2635   {
       
  2636   md->nltype = NLTYPE_FIXED;
       
  2637   if (newline > 255)
       
  2638     {
       
  2639     md->nllen = 2;
       
  2640     md->nl[0] = (newline >> 8) & 255;
       
  2641     md->nl[1] = newline & 255;
       
  2642     }
       
  2643   else
       
  2644     {
       
  2645     md->nllen = 1;
       
  2646     md->nl[0] = newline;
       
  2647     }
       
  2648   }
       
  2649 
       
  2650 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
       
  2651 back the character offset. */
       
  2652 
       
  2653 #ifdef SUPPORT_UTF8
       
  2654 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
       
  2655   {
       
  2656   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
       
  2657     return PCRE_ERROR_BADUTF8;
       
  2658   if (start_offset > 0 && start_offset < length)
       
  2659     {
       
  2660     int tb = ((uschar *)subject)[start_offset];
       
  2661     if (tb > 127)
       
  2662       {
       
  2663       tb &= 0xc0;
       
  2664       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
       
  2665       }
       
  2666     }
       
  2667   }
       
  2668 #endif
       
  2669 
       
  2670 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
       
  2671 is a feature that makes it possible to save compiled regexes and re-use them
       
  2672 in other programs later. */
       
  2673 
       
  2674 if (md->tables == NULL) md->tables = _pcre_default_tables;
       
  2675 
       
  2676 /* The lower casing table and the "must be at the start of a line" flag are
       
  2677 used in a loop when finding where to start. */
       
  2678 
       
  2679 lcc = md->tables + lcc_offset;
       
  2680 startline = (re->flags & PCRE_STARTLINE) != 0;
       
  2681 firstline = (re->options & PCRE_FIRSTLINE) != 0;
       
  2682 
       
  2683 /* Set up the first character to match, if available. The first_byte value is
       
  2684 never set for an anchored regular expression, but the anchoring may be forced
       
  2685 at run time, so we have to test for anchoring. The first char may be unset for
       
  2686 an unanchored pattern, of course. If there's no first char and the pattern was
       
  2687 studied, there may be a bitmap of possible first characters. */
       
  2688 
       
  2689 if (!anchored)
       
  2690   {
       
  2691   if ((re->flags & PCRE_FIRSTSET) != 0)
       
  2692     {
       
  2693     first_byte = re->first_byte & 255;
       
  2694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
       
  2695       first_byte = lcc[first_byte];
       
  2696     }
       
  2697   else
       
  2698     {
       
  2699     if (startline && study != NULL &&
       
  2700          (study->options & PCRE_STUDY_MAPPED) != 0)
       
  2701       start_bits = study->start_bits;
       
  2702     }
       
  2703   }
       
  2704 
       
  2705 /* For anchored or unanchored matches, there may be a "last known required
       
  2706 character" set. */
       
  2707 
       
  2708 if ((re->flags & PCRE_REQCHSET) != 0)
       
  2709   {
       
  2710   req_byte = re->req_byte & 255;
       
  2711   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
       
  2712   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
       
  2713   }
       
  2714 
       
  2715 /* Call the main matching function, looping for a non-anchored regex after a
       
  2716 failed match. Unless restarting, optimize by moving to the first match
       
  2717 character if possible, when not anchored. Then unless wanting a partial match,
       
  2718 check for a required later character. */
       
  2719 
       
  2720 for (;;)
       
  2721   {
       
  2722   int rc;
       
  2723 
       
  2724   if ((options & PCRE_DFA_RESTART) == 0)
       
  2725     {
       
  2726     const uschar *save_end_subject = end_subject;
       
  2727 
       
  2728     /* Advance to a unique first char if possible. If firstline is TRUE, the
       
  2729     start of the match is constrained to the first line of a multiline string.
       
  2730     Implement this by temporarily adjusting end_subject so that we stop
       
  2731     scanning at a newline. If the match fails at the newline, later code breaks
       
  2732     this loop. */
       
  2733 
       
  2734     if (firstline)
       
  2735       {
       
  2736       USPTR t = current_subject;
       
  2737 #ifdef SUPPORT_UTF8
       
  2738       if (utf8)
       
  2739         {
       
  2740         while (t < md->end_subject && !IS_NEWLINE(t))
       
  2741           {
       
  2742           t++;
       
  2743           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
       
  2744           }
       
  2745         }
       
  2746       else
       
  2747 #endif
       
  2748       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
       
  2749       end_subject = t;
       
  2750       }
       
  2751 
       
  2752     if (first_byte >= 0)
       
  2753       {
       
  2754       if (first_byte_caseless)
       
  2755         while (current_subject < end_subject &&
       
  2756                lcc[*current_subject] != first_byte)
       
  2757           current_subject++;
       
  2758       else
       
  2759         while (current_subject < end_subject && *current_subject != first_byte)
       
  2760           current_subject++;
       
  2761       }
       
  2762 
       
  2763     /* Or to just after a linebreak for a multiline match if possible */
       
  2764 
       
  2765     else if (startline)
       
  2766       {
       
  2767       if (current_subject > md->start_subject + start_offset)
       
  2768         {
       
  2769 #ifdef SUPPORT_UTF8
       
  2770         if (utf8)
       
  2771           {
       
  2772           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
       
  2773             {
       
  2774             current_subject++;
       
  2775             while(current_subject < end_subject &&
       
  2776                   (*current_subject & 0xc0) == 0x80)
       
  2777               current_subject++;
       
  2778             }
       
  2779           }
       
  2780         else
       
  2781 #endif
       
  2782         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
       
  2783           current_subject++;
       
  2784 
       
  2785         /* If we have just passed a CR and the newline option is ANY or
       
  2786         ANYCRLF, and we are now at a LF, advance the match position by one more
       
  2787         character. */
       
  2788 
       
  2789         if (current_subject[-1] == '\r' &&
       
  2790              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
       
  2791              current_subject < end_subject &&
       
  2792              *current_subject == '\n')
       
  2793           current_subject++;
       
  2794         }
       
  2795       }
       
  2796 
       
  2797     /* Or to a non-unique first char after study */
       
  2798 
       
  2799     else if (start_bits != NULL)
       
  2800       {
       
  2801       while (current_subject < end_subject)
       
  2802         {
       
  2803         register unsigned int c = *current_subject;
       
  2804         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
       
  2805           else break;
       
  2806         }
       
  2807       }
       
  2808 
       
  2809     /* Restore fudged end_subject */
       
  2810 
       
  2811     end_subject = save_end_subject;
       
  2812     }
       
  2813 
       
  2814   /* If req_byte is set, we know that that character must appear in the subject
       
  2815   for the match to succeed. If the first character is set, req_byte must be
       
  2816   later in the subject; otherwise the test starts at the match point. This
       
  2817   optimization can save a huge amount of work in patterns with nested unlimited
       
  2818   repeats that aren't going to match. Writing separate code for cased/caseless
       
  2819   versions makes it go faster, as does using an autoincrement and backing off
       
  2820   on a match.
       
  2821 
       
  2822   HOWEVER: when the subject string is very, very long, searching to its end can
       
  2823   take a long time, and give bad performance on quite ordinary patterns. This
       
  2824   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
       
  2825   don't do this when the string is sufficiently long.
       
  2826 
       
  2827   ALSO: this processing is disabled when partial matching is requested.
       
  2828   */
       
  2829 
       
  2830   if (req_byte >= 0 &&
       
  2831       end_subject - current_subject < REQ_BYTE_MAX &&
       
  2832       (options & PCRE_PARTIAL) == 0)
       
  2833     {
       
  2834     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
       
  2835 
       
  2836     /* We don't need to repeat the search if we haven't yet reached the
       
  2837     place we found it at last time. */
       
  2838 
       
  2839     if (p > req_byte_ptr)
       
  2840       {
       
  2841       if (req_byte_caseless)
       
  2842         {
       
  2843         while (p < end_subject)
       
  2844           {
       
  2845           register int pp = *p++;
       
  2846           if (pp == req_byte || pp == req_byte2) { p--; break; }
       
  2847           }
       
  2848         }
       
  2849       else
       
  2850         {
       
  2851         while (p < end_subject)
       
  2852           {
       
  2853           if (*p++ == req_byte) { p--; break; }
       
  2854           }
       
  2855         }
       
  2856 
       
  2857       /* If we can't find the required character, break the matching loop,
       
  2858       which will cause a return of PCRE_ERROR_NOMATCH. */
       
  2859 
       
  2860       if (p >= end_subject) break;
       
  2861 
       
  2862       /* If we have found the required character, save the point where we
       
  2863       found it, so that we don't search again next time round the loop if
       
  2864       the start hasn't passed this character yet. */
       
  2865 
       
  2866       req_byte_ptr = p;
       
  2867       }
       
  2868     }
       
  2869 
       
  2870   /* OK, now we can do the business */
       
  2871 
       
  2872   rc = internal_dfa_exec(
       
  2873     md,                                /* fixed match data */
       
  2874     md->start_code,                    /* this subexpression's code */
       
  2875     current_subject,                   /* where we currently are */
       
  2876     start_offset,                      /* start offset in subject */
       
  2877     offsets,                           /* offset vector */
       
  2878     offsetcount,                       /* size of same */
       
  2879     workspace,                         /* workspace vector */
       
  2880     wscount,                           /* size of same */
       
  2881     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
       
  2882     0,                                 /* function recurse level */
       
  2883     0);                                /* regex recurse level */
       
  2884 
       
  2885   /* Anything other than "no match" means we are done, always; otherwise, carry
       
  2886   on only if not anchored. */
       
  2887 
       
  2888   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
       
  2889 
       
  2890   /* Advance to the next subject character unless we are at the end of a line
       
  2891   and firstline is set. */
       
  2892 
       
  2893   if (firstline && IS_NEWLINE(current_subject)) break;
       
  2894   current_subject++;
       
  2895   if (utf8)
       
  2896     {
       
  2897     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
       
  2898       current_subject++;
       
  2899     }
       
  2900   if (current_subject > end_subject) break;
       
  2901 
       
  2902   /* If we have just passed a CR and we are now at a LF, and the pattern does
       
  2903   not contain any explicit matches for \r or \n, and the newline option is CRLF
       
  2904   or ANY or ANYCRLF, advance the match position by one more character. */
       
  2905 
       
  2906   if (current_subject[-1] == '\r' &&
       
  2907       current_subject < end_subject &&
       
  2908       *current_subject == '\n' &&
       
  2909       (re->flags & PCRE_HASCRORLF) == 0 &&
       
  2910         (md->nltype == NLTYPE_ANY ||
       
  2911          md->nltype == NLTYPE_ANYCRLF ||
       
  2912          md->nllen == 2))
       
  2913     current_subject++;
       
  2914 
       
  2915   }   /* "Bumpalong" loop */
       
  2916 
       
  2917 return PCRE_ERROR_NOMATCH;
       
  2918 }
       
  2919 
       
  2920 /* End of pcre_dfa_exec.c */