webengine/osswebengine/JavaScriptCore/pcre/pcre_study.c
changeset 0 dd21522fd290
equal deleted inserted replaced
-1:000000000000 0:dd21522fd290
       
     1 /*************************************************
       
     2 *      Perl-Compatible Regular Expressions       *
       
     3 *************************************************/
       
     4 
       
     5 /* PCRE is a library of functions to support regular expressions whose syntax
       
     6 and semantics are as close as possible to those of the Perl 5 language.
       
     7 
       
     8                        Written by Philip Hazel
       
     9            Copyright (c) 1997-2005 University of Cambridge
       
    10 
       
    11 -----------------------------------------------------------------------------
       
    12 Redistribution and use in source and binary forms, with or without
       
    13 modification, are permitted provided that the following conditions are met:
       
    14 
       
    15     * Redistributions of source code must retain the above copyright notice,
       
    16       this list of conditions and the following disclaimer.
       
    17 
       
    18     * Redistributions in binary form must reproduce the above copyright
       
    19       notice, this list of conditions and the following disclaimer in the
       
    20       documentation and/or other materials provided with the distribution.
       
    21 
       
    22     * Neither the name of the University of Cambridge nor the names of its
       
    23       contributors may be used to endorse or promote products derived from
       
    24       this software without specific prior written permission.
       
    25 
       
    26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
       
    27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
       
    30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       
    31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
       
    32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
       
    33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
       
    34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
       
    35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    36 POSSIBILITY OF SUCH DAMAGE.
       
    37 -----------------------------------------------------------------------------
       
    38 */
       
    39 
       
    40 
       
    41 /* This module contains the external function pcre_study(), along with local
       
    42 supporting functions. */
       
    43 
       
    44 
       
    45 #include "pcre_internal.h"
       
    46 
       
    47 
       
    48 /*************************************************
       
    49 *      Set a bit and maybe its alternate case    *
       
    50 *************************************************/
       
    51 
       
    52 /* Given a character, set its bit in the table, and also the bit for the other
       
    53 version of a letter if we are caseless.
       
    54 
       
    55 Arguments:
       
    56   start_bits    points to the bit map
       
    57   c             is the character
       
    58   caseless      the caseless flag
       
    59   cd            the block with char table pointers
       
    60 
       
    61 Returns:        nothing
       
    62 */
       
    63 
       
    64 static void
       
    65 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
       
    66 {
       
    67 start_bits[c/8] |= (1 << (c&7));
       
    68 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
       
    69   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
       
    70 }
       
    71 
       
    72 
       
    73 
       
    74 /*************************************************
       
    75 *          Create bitmap of starting chars       *
       
    76 *************************************************/
       
    77 
       
    78 /* This function scans a compiled unanchored expression and attempts to build a
       
    79 bitmap of the set of initial characters. If it can't, it returns FALSE. As time
       
    80 goes by, we may be able to get more clever at doing this.
       
    81 
       
    82 Arguments:
       
    83   code         points to an expression
       
    84   start_bits   points to a 32-byte table, initialized to 0
       
    85   caseless     the current state of the caseless flag
       
    86   utf8         TRUE if in UTF-8 mode
       
    87   cd           the block with char table pointers
       
    88 
       
    89 Returns:       TRUE if table built, FALSE otherwise
       
    90 */
       
    91 
       
    92 static BOOL
       
    93 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
       
    94   BOOL utf8, compile_data *cd)
       
    95 {
       
    96 register int c;
       
    97 
       
    98 /* This next statement and the later reference to dummy are here in order to
       
    99 trick the optimizer of the IBM C compiler for OS/2 into generating correct
       
   100 code. Apparently IBM isn't going to fix the problem, and we would rather not
       
   101 disable optimization (in this module it actually makes a big difference, and
       
   102 the pcre module can use all the optimization it can get). */
       
   103 
       
   104 volatile int dummy;
       
   105 
       
   106 do
       
   107   {
       
   108   const uschar *tcode = code + 1 + LINK_SIZE;
       
   109   BOOL try_next = TRUE;
       
   110 
       
   111   while (try_next)
       
   112     {
       
   113     /* If a branch starts with a bracket or a positive lookahead assertion,
       
   114     recurse to set bits from within them. That's all for this branch. */
       
   115 
       
   116     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
       
   117       {
       
   118       if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
       
   119         return FALSE;
       
   120       try_next = FALSE;
       
   121       }
       
   122 
       
   123     else switch(*tcode)
       
   124       {
       
   125       default:
       
   126       return FALSE;
       
   127 
       
   128       /* Skip over callout */
       
   129 
       
   130       case OP_CALLOUT:
       
   131       tcode += 2 + 2*LINK_SIZE;
       
   132       break;
       
   133 
       
   134       /* Skip over extended extraction bracket number */
       
   135 
       
   136       case OP_BRANUMBER:
       
   137       tcode += 3;
       
   138       break;
       
   139 
       
   140       /* Skip over lookbehind and negative lookahead assertions */
       
   141 
       
   142       case OP_ASSERT_NOT:
       
   143       case OP_ASSERTBACK:
       
   144       case OP_ASSERTBACK_NOT:
       
   145       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
       
   146       tcode += 1+LINK_SIZE;
       
   147       break;
       
   148 
       
   149       /* Skip over an option setting, changing the caseless flag */
       
   150 
       
   151       case OP_OPT:
       
   152       caseless = (tcode[1] & PCRE_CASELESS) != 0;
       
   153       tcode += 2;
       
   154       break;
       
   155 
       
   156       /* BRAZERO does the bracket, but carries on. */
       
   157 
       
   158       case OP_BRAZERO:
       
   159       case OP_BRAMINZERO:
       
   160       if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
       
   161         return FALSE;
       
   162       dummy = 1;
       
   163       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
       
   164       tcode += 1+LINK_SIZE;
       
   165       break;
       
   166 
       
   167       /* Single-char * or ? sets the bit and tries the next item */
       
   168 
       
   169       case OP_STAR:
       
   170       case OP_MINSTAR:
       
   171       case OP_QUERY:
       
   172       case OP_MINQUERY:
       
   173       set_bit(start_bits, tcode[1], caseless, cd);
       
   174       tcode += 2;
       
   175 #ifdef SUPPORT_UTF8
       
   176       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
       
   177 #endif
       
   178       break;
       
   179 
       
   180       /* Single-char upto sets the bit and tries the next */
       
   181 
       
   182       case OP_UPTO:
       
   183       case OP_MINUPTO:
       
   184       set_bit(start_bits, tcode[3], caseless, cd);
       
   185       tcode += 4;
       
   186 #ifdef SUPPORT_UTF8
       
   187       if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
       
   188 #endif
       
   189       break;
       
   190 
       
   191       /* At least one single char sets the bit and stops */
       
   192 
       
   193       case OP_EXACT:       /* Fall through */
       
   194       tcode += 2;
       
   195 
       
   196       case OP_CHAR:
       
   197       case OP_CHARNC:
       
   198       case OP_PLUS:
       
   199       case OP_MINPLUS:
       
   200       set_bit(start_bits, tcode[1], caseless, cd);
       
   201       try_next = FALSE;
       
   202       break;
       
   203 
       
   204       /* Single character type sets the bits and stops */
       
   205 
       
   206       case OP_NOT_DIGIT:
       
   207       for (c = 0; c < 32; c++)
       
   208         start_bits[c] |= ~cd->cbits[c+cbit_digit];
       
   209       try_next = FALSE;
       
   210       break;
       
   211 
       
   212       case OP_DIGIT:
       
   213       for (c = 0; c < 32; c++)
       
   214         start_bits[c] |= cd->cbits[c+cbit_digit];
       
   215       try_next = FALSE;
       
   216       break;
       
   217 
       
   218       case OP_NOT_WHITESPACE:
       
   219       for (c = 0; c < 32; c++)
       
   220         start_bits[c] |= ~cd->cbits[c+cbit_space];
       
   221       try_next = FALSE;
       
   222       break;
       
   223 
       
   224       case OP_WHITESPACE:
       
   225       for (c = 0; c < 32; c++)
       
   226         start_bits[c] |= cd->cbits[c+cbit_space];
       
   227       try_next = FALSE;
       
   228       break;
       
   229 
       
   230       case OP_NOT_WORDCHAR:
       
   231       for (c = 0; c < 32; c++)
       
   232         start_bits[c] |= ~cd->cbits[c+cbit_word];
       
   233       try_next = FALSE;
       
   234       break;
       
   235 
       
   236       case OP_WORDCHAR:
       
   237       for (c = 0; c < 32; c++)
       
   238         start_bits[c] |= cd->cbits[c+cbit_word];
       
   239       try_next = FALSE;
       
   240       break;
       
   241 
       
   242       /* One or more character type fudges the pointer and restarts, knowing
       
   243       it will hit a single character type and stop there. */
       
   244 
       
   245       case OP_TYPEPLUS:
       
   246       case OP_TYPEMINPLUS:
       
   247       tcode++;
       
   248       break;
       
   249 
       
   250       case OP_TYPEEXACT:
       
   251       tcode += 3;
       
   252       break;
       
   253 
       
   254       /* Zero or more repeats of character types set the bits and then
       
   255       try again. */
       
   256 
       
   257       case OP_TYPEUPTO:
       
   258       case OP_TYPEMINUPTO:
       
   259       tcode += 2;               /* Fall through */
       
   260 
       
   261       case OP_TYPESTAR:
       
   262       case OP_TYPEMINSTAR:
       
   263       case OP_TYPEQUERY:
       
   264       case OP_TYPEMINQUERY:
       
   265       switch(tcode[1])
       
   266         {
       
   267         case OP_ANY:
       
   268         return FALSE;
       
   269 
       
   270         case OP_NOT_DIGIT:
       
   271         for (c = 0; c < 32; c++)
       
   272           start_bits[c] |= ~cd->cbits[c+cbit_digit];
       
   273         break;
       
   274 
       
   275         case OP_DIGIT:
       
   276         for (c = 0; c < 32; c++)
       
   277           start_bits[c] |= cd->cbits[c+cbit_digit];
       
   278         break;
       
   279 
       
   280         case OP_NOT_WHITESPACE:
       
   281         for (c = 0; c < 32; c++)
       
   282           start_bits[c] |= ~cd->cbits[c+cbit_space];
       
   283         break;
       
   284 
       
   285         case OP_WHITESPACE:
       
   286         for (c = 0; c < 32; c++)
       
   287           start_bits[c] |= cd->cbits[c+cbit_space];
       
   288         break;
       
   289 
       
   290         case OP_NOT_WORDCHAR:
       
   291         for (c = 0; c < 32; c++)
       
   292           start_bits[c] |= ~cd->cbits[c+cbit_word];
       
   293         break;
       
   294 
       
   295         case OP_WORDCHAR:
       
   296         for (c = 0; c < 32; c++)
       
   297           start_bits[c] |= cd->cbits[c+cbit_word];
       
   298         break;
       
   299         }
       
   300 
       
   301       tcode += 2;
       
   302       break;
       
   303 
       
   304       /* Character class where all the information is in a bit map: set the
       
   305       bits and either carry on or not, according to the repeat count. If it was
       
   306       a negative class, and we are operating with UTF-8 characters, any byte
       
   307       with a value >= 0xc4 is a potentially valid starter because it starts a
       
   308       character with a value > 255. */
       
   309 
       
   310       case OP_NCLASS:
       
   311       if (utf8)
       
   312         {
       
   313         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
       
   314         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
       
   315         }
       
   316       /* Fall through */
       
   317 
       
   318       case OP_CLASS:
       
   319         {
       
   320         tcode++;
       
   321 
       
   322         /* In UTF-8 mode, the bits in a bit map correspond to character
       
   323         values, not to byte values. However, the bit map we are constructing is
       
   324         for byte values. So we have to do a conversion for characters whose
       
   325         value is > 127. In fact, there are only two possible starting bytes for
       
   326         characters in the range 128 - 255. */
       
   327 
       
   328         if (utf8)
       
   329           {
       
   330           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
       
   331           for (c = 128; c < 256; c++)
       
   332             {
       
   333             if ((tcode[c/8] && (1 << (c&7))) != 0)
       
   334               {
       
   335               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
       
   336               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
       
   337               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
       
   338               }
       
   339             }
       
   340           }
       
   341 
       
   342         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
       
   343 
       
   344         else
       
   345           {
       
   346           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
       
   347           }
       
   348 
       
   349         /* Advance past the bit map, and act on what follows */
       
   350 
       
   351         tcode += 32;
       
   352         switch (*tcode)
       
   353           {
       
   354           case OP_CRSTAR:
       
   355           case OP_CRMINSTAR:
       
   356           case OP_CRQUERY:
       
   357           case OP_CRMINQUERY:
       
   358           tcode++;
       
   359           break;
       
   360 
       
   361           case OP_CRRANGE:
       
   362           case OP_CRMINRANGE:
       
   363           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
       
   364             else try_next = FALSE;
       
   365           break;
       
   366 
       
   367           default:
       
   368           try_next = FALSE;
       
   369           break;
       
   370           }
       
   371         }
       
   372       break; /* End of bitmap class handling */
       
   373 
       
   374       }      /* End of switch */
       
   375     }        /* End of try_next loop */
       
   376 
       
   377   code += GET(code, 1);   /* Advance to next branch */
       
   378   }
       
   379 while (*code == OP_ALT);
       
   380 return TRUE;
       
   381 }
       
   382 
       
   383 
       
   384 
       
   385 /*************************************************
       
   386 *          Study a compiled expression           *
       
   387 *************************************************/
       
   388 
       
   389 /* This function is handed a compiled expression that it must study to produce
       
   390 information that will speed up the matching. It returns a pcre_extra block
       
   391 which then gets handed back to pcre_exec().
       
   392 
       
   393 Arguments:
       
   394   re        points to the compiled expression
       
   395   options   contains option bits
       
   396   errorptr  points to where to place error messages;
       
   397             set NULL unless error
       
   398 
       
   399 Returns:    pointer to a pcre_extra block, with study_data filled in and the
       
   400               appropriate flag set;
       
   401             NULL on error or if no optimization possible
       
   402 */
       
   403 
       
   404 PCRE_EXPORT pcre_extra *
       
   405 pcre_study(const pcre *external_re, int options, const char **errorptr)
       
   406 {
       
   407 uschar start_bits[32];
       
   408 pcre_extra *extra;
       
   409 pcre_study_data *study;
       
   410 const uschar *tables;
       
   411 const real_pcre *re = (const real_pcre *)external_re;
       
   412 uschar *code;
       
   413 compile_data compile_block;
       
   414 
       
   415 *errorptr = NULL;
       
   416 
       
   417 if (re == NULL || re->magic_number != MAGIC_NUMBER)
       
   418   {
       
   419   *errorptr = "argument is not a compiled regular expression";
       
   420   return NULL;
       
   421   }
       
   422 
       
   423 code = (uschar *)re + re->name_table_offset +
       
   424   (re->name_count * re->name_entry_size);
       
   425 
       
   426 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
       
   427   {
       
   428   *errorptr = "unknown or incorrect option bit(s) set";
       
   429   return NULL;
       
   430   }
       
   431 
       
   432 /* For an anchored pattern, or an unanchored pattern that has a first char, or
       
   433 a multiline pattern that matches only at "line starts", no further processing
       
   434 at present. */
       
   435 
       
   436 if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
       
   437   return NULL;
       
   438 
       
   439 /* Set the character tables in the block that is passed around */
       
   440 
       
   441 tables = re->tables;
       
   442 if (tables == NULL)
       
   443   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
       
   444   (void *)(&tables));
       
   445 
       
   446 compile_block.lcc = tables + lcc_offset;
       
   447 compile_block.fcc = tables + fcc_offset;
       
   448 compile_block.cbits = tables + cbits_offset;
       
   449 compile_block.ctypes = tables + ctypes_offset;
       
   450 
       
   451 /* See if we can find a fixed set of initial characters for the pattern. */
       
   452 
       
   453 memset(start_bits, 0, 32 * sizeof(uschar));
       
   454 if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
       
   455   (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
       
   456 
       
   457 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
       
   458 the latter, which is pointed to by the former, which may also get additional
       
   459 data set later by the calling program. At the moment, the size of
       
   460 pcre_study_data is fixed. We nevertheless save it in a field for returning via
       
   461 the pcre_fullinfo() function so that if it becomes variable in the future, we
       
   462 don't have to change that code. */
       
   463 
       
   464 extra = (pcre_extra *)(pcre_malloc)
       
   465   (sizeof(pcre_extra) + sizeof(pcre_study_data));
       
   466 
       
   467 if (extra == NULL)
       
   468   {
       
   469   *errorptr = "failed to get memory";
       
   470   return NULL;
       
   471   }
       
   472 
       
   473 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
       
   474 extra->flags = PCRE_EXTRA_STUDY_DATA;
       
   475 extra->study_data = study;
       
   476 
       
   477 study->size = sizeof(pcre_study_data);
       
   478 study->options = PCRE_STUDY_MAPPED;
       
   479 memcpy(study->start_bits, start_bits, sizeof(start_bits));
       
   480 
       
   481 return extra;
       
   482 }
       
   483 
       
   484 /* End of pcre_study.c */