JavaScriptCore/yarr/RegexInterpreter.h
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
       
     3  *
       
     4  * Redistribution and use in source and binary forms, with or without
       
     5  * modification, are permitted provided that the following conditions
       
     6  * are met:
       
     7  * 1. Redistributions of source code must retain the above copyright
       
     8  *    notice, this list of conditions and the following disclaimer.
       
     9  * 2. Redistributions in binary form must reproduce the above copyright
       
    10  *    notice, this list of conditions and the following disclaimer in the
       
    11  *    documentation and/or other materials provided with the distribution.
       
    12  *
       
    13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
       
    14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       
    16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
       
    17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       
    18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       
    19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       
    20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
       
    21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
    22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
    23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
       
    24  */
       
    25 
       
    26 #ifndef RegexInterpreter_h
       
    27 #define RegexInterpreter_h
       
    28 
       
    29 #if ENABLE(YARR)
       
    30 
       
    31 #include "RegexParser.h"
       
    32 #include "RegexPattern.h"
       
    33 #include <wtf/PassOwnPtr.h>
       
    34 #include <wtf/unicode/Unicode.h>
       
    35 
       
    36 namespace JSC { namespace Yarr {
       
    37 
       
    38 class ByteDisjunction;
       
    39 
       
    40 struct ByteTerm {
       
    41     enum Type {
       
    42         TypeBodyAlternativeBegin,
       
    43         TypeBodyAlternativeDisjunction,
       
    44         TypeBodyAlternativeEnd,
       
    45         TypeAlternativeBegin,
       
    46         TypeAlternativeDisjunction,
       
    47         TypeAlternativeEnd,
       
    48         TypeSubpatternBegin,
       
    49         TypeSubpatternEnd,
       
    50         TypeAssertionBOL,
       
    51         TypeAssertionEOL,
       
    52         TypeAssertionWordBoundary,
       
    53         TypePatternCharacterOnce,
       
    54         TypePatternCharacterFixed,
       
    55         TypePatternCharacterGreedy,
       
    56         TypePatternCharacterNonGreedy,
       
    57         TypePatternCasedCharacterOnce,
       
    58         TypePatternCasedCharacterFixed,
       
    59         TypePatternCasedCharacterGreedy,
       
    60         TypePatternCasedCharacterNonGreedy,
       
    61         TypeCharacterClass,
       
    62         TypeBackReference,
       
    63         TypeParenthesesSubpattern,
       
    64         TypeParenthesesSubpatternOnceBegin,
       
    65         TypeParenthesesSubpatternOnceEnd,
       
    66         TypeParentheticalAssertionBegin,
       
    67         TypeParentheticalAssertionEnd,
       
    68         TypeCheckInput,
       
    69     } type;
       
    70     bool invertOrCapture;
       
    71     union {
       
    72         struct {
       
    73             union {
       
    74                 UChar patternCharacter;
       
    75                 struct {
       
    76                     UChar lo;
       
    77                     UChar hi;
       
    78                 } casedCharacter;
       
    79                 CharacterClass* characterClass;
       
    80                 unsigned subpatternId;
       
    81             };
       
    82             union {
       
    83                 ByteDisjunction* parenthesesDisjunction;
       
    84                 unsigned parenthesesWidth;
       
    85             };
       
    86             QuantifierType quantityType;
       
    87             unsigned quantityCount;
       
    88         } atom;
       
    89         struct {
       
    90             int next;
       
    91             int end;
       
    92         } alternative;
       
    93         unsigned checkInputCount;
       
    94     };
       
    95     unsigned frameLocation;
       
    96     int inputPosition;
       
    97 
       
    98     ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
       
    99         : frameLocation(frameLocation)
       
   100     {
       
   101         switch (quantityType) {
       
   102         case QuantifierFixedCount:
       
   103             type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
       
   104             break;
       
   105         case QuantifierGreedy:
       
   106             type = ByteTerm::TypePatternCharacterGreedy;
       
   107             break;
       
   108         case QuantifierNonGreedy:
       
   109             type = ByteTerm::TypePatternCharacterNonGreedy;
       
   110             break;
       
   111         }
       
   112 
       
   113         atom.patternCharacter = ch;
       
   114         atom.quantityType = quantityType;
       
   115         atom.quantityCount = quantityCount;
       
   116         inputPosition = inputPos;
       
   117     }
       
   118 
       
   119     ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
       
   120         : frameLocation(frameLocation)
       
   121     {
       
   122         switch (quantityType) {
       
   123         case QuantifierFixedCount:
       
   124             type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
       
   125             break;
       
   126         case QuantifierGreedy:
       
   127             type = ByteTerm::TypePatternCasedCharacterGreedy;
       
   128             break;
       
   129         case QuantifierNonGreedy:
       
   130             type = ByteTerm::TypePatternCasedCharacterNonGreedy;
       
   131             break;
       
   132         }
       
   133 
       
   134         atom.casedCharacter.lo = lo;
       
   135         atom.casedCharacter.hi = hi;
       
   136         atom.quantityType = quantityType;
       
   137         atom.quantityCount = quantityCount;
       
   138         inputPosition = inputPos;
       
   139     }
       
   140 
       
   141     ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
       
   142         : type(ByteTerm::TypeCharacterClass)
       
   143         , invertOrCapture(invert)
       
   144     {
       
   145         atom.characterClass = characterClass;
       
   146         atom.quantityType = QuantifierFixedCount;
       
   147         atom.quantityCount = 1;
       
   148         inputPosition = inputPos;
       
   149     }
       
   150 
       
   151     ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos)
       
   152         : type(type)
       
   153         , invertOrCapture(invertOrCapture)
       
   154     {
       
   155         atom.subpatternId = subpatternId;
       
   156         atom.parenthesesDisjunction = parenthesesInfo;
       
   157         atom.quantityType = QuantifierFixedCount;
       
   158         atom.quantityCount = 1;
       
   159         inputPosition = inputPos;
       
   160     }
       
   161     
       
   162     ByteTerm(Type type, bool invert = false)
       
   163         : type(type)
       
   164         , invertOrCapture(invert)
       
   165     {
       
   166         atom.quantityType = QuantifierFixedCount;
       
   167         atom.quantityCount = 1;
       
   168     }
       
   169 
       
   170     ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos)
       
   171         : type(type)
       
   172         , invertOrCapture(invertOrCapture)
       
   173     {
       
   174         atom.subpatternId = subpatternId;
       
   175         atom.quantityType = QuantifierFixedCount;
       
   176         atom.quantityCount = 1;
       
   177         inputPosition = inputPos;
       
   178     }
       
   179 
       
   180     static ByteTerm BOL(int inputPos)
       
   181     {
       
   182         ByteTerm term(TypeAssertionBOL);
       
   183         term.inputPosition = inputPos;
       
   184         return term;
       
   185     }
       
   186 
       
   187     static ByteTerm CheckInput(unsigned count)
       
   188     {
       
   189         ByteTerm term(TypeCheckInput);
       
   190         term.checkInputCount = count;
       
   191         return term;
       
   192     }
       
   193 
       
   194     static ByteTerm EOL(int inputPos)
       
   195     {
       
   196         ByteTerm term(TypeAssertionEOL);
       
   197         term.inputPosition = inputPos;
       
   198         return term;
       
   199     }
       
   200 
       
   201     static ByteTerm WordBoundary(bool invert, int inputPos)
       
   202     {
       
   203         ByteTerm term(TypeAssertionWordBoundary, invert);
       
   204         term.inputPosition = inputPos;
       
   205         return term;
       
   206     }
       
   207     
       
   208     static ByteTerm BackReference(unsigned subpatternId, int inputPos)
       
   209     {
       
   210         return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
       
   211     }
       
   212 
       
   213     static ByteTerm BodyAlternativeBegin()
       
   214     {
       
   215         ByteTerm term(TypeBodyAlternativeBegin);
       
   216         term.alternative.next = 0;
       
   217         term.alternative.end = 0;
       
   218         return term;
       
   219     }
       
   220 
       
   221     static ByteTerm BodyAlternativeDisjunction()
       
   222     {
       
   223         ByteTerm term(TypeBodyAlternativeDisjunction);
       
   224         term.alternative.next = 0;
       
   225         term.alternative.end = 0;
       
   226         return term;
       
   227     }
       
   228 
       
   229     static ByteTerm BodyAlternativeEnd()
       
   230     {
       
   231         ByteTerm term(TypeBodyAlternativeEnd);
       
   232         term.alternative.next = 0;
       
   233         term.alternative.end = 0;
       
   234         return term;
       
   235     }
       
   236 
       
   237     static ByteTerm AlternativeBegin()
       
   238     {
       
   239         ByteTerm term(TypeAlternativeBegin);
       
   240         term.alternative.next = 0;
       
   241         term.alternative.end = 0;
       
   242         return term;
       
   243     }
       
   244 
       
   245     static ByteTerm AlternativeDisjunction()
       
   246     {
       
   247         ByteTerm term(TypeAlternativeDisjunction);
       
   248         term.alternative.next = 0;
       
   249         term.alternative.end = 0;
       
   250         return term;
       
   251     }
       
   252 
       
   253     static ByteTerm AlternativeEnd()
       
   254     {
       
   255         ByteTerm term(TypeAlternativeEnd);
       
   256         term.alternative.next = 0;
       
   257         term.alternative.end = 0;
       
   258         return term;
       
   259     }
       
   260 
       
   261     static ByteTerm SubpatternBegin()
       
   262     {
       
   263         return ByteTerm(TypeSubpatternBegin);
       
   264     }
       
   265 
       
   266     static ByteTerm SubpatternEnd()
       
   267     {
       
   268         return ByteTerm(TypeSubpatternEnd);
       
   269     }
       
   270 
       
   271     bool invert()
       
   272     {
       
   273         return invertOrCapture;
       
   274     }
       
   275 
       
   276     bool capture()
       
   277     {
       
   278         return invertOrCapture;
       
   279     }
       
   280 };
       
   281 
       
   282 class ByteDisjunction : public FastAllocBase {
       
   283 public:
       
   284     ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
       
   285         : m_numSubpatterns(numSubpatterns)
       
   286         , m_frameSize(frameSize)
       
   287     {
       
   288     }
       
   289 
       
   290     Vector<ByteTerm> terms;
       
   291     unsigned m_numSubpatterns;
       
   292     unsigned m_frameSize;
       
   293 };
       
   294 
       
   295 struct BytecodePattern : FastAllocBase {
       
   296     BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern)
       
   297         : m_body(body)
       
   298         , m_ignoreCase(pattern.m_ignoreCase)
       
   299         , m_multiline(pattern.m_multiline)
       
   300     {
       
   301         newlineCharacterClass = pattern.newlineCharacterClass();
       
   302         wordcharCharacterClass = pattern.wordcharCharacterClass();
       
   303 
       
   304         m_allParenthesesInfo.append(allParenthesesInfo);
       
   305         m_userCharacterClasses.append(pattern.m_userCharacterClasses);
       
   306         // 'Steal' the RegexPattern's CharacterClasses!  We clear its
       
   307         // array, so that it won't delete them on destruction.  We'll
       
   308         // take responsibility for that.
       
   309         pattern.m_userCharacterClasses.clear();
       
   310     }
       
   311 
       
   312     ~BytecodePattern()
       
   313     {
       
   314         deleteAllValues(m_allParenthesesInfo);
       
   315         deleteAllValues(m_userCharacterClasses);
       
   316     }
       
   317 
       
   318     OwnPtr<ByteDisjunction> m_body;
       
   319     bool m_ignoreCase;
       
   320     bool m_multiline;
       
   321     
       
   322     CharacterClass* newlineCharacterClass;
       
   323     CharacterClass* wordcharCharacterClass;
       
   324 private:
       
   325     Vector<ByteDisjunction*> m_allParenthesesInfo;
       
   326     Vector<CharacterClass*> m_userCharacterClasses;
       
   327 };
       
   328 
       
   329 PassOwnPtr<BytecodePattern> byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
       
   330 int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
       
   331 
       
   332 } } // namespace JSC::Yarr
       
   333 
       
   334 #endif
       
   335 
       
   336 #endif // RegexInterpreter_h