JavaScriptCore/yarr/RegexPattern.h
changeset 0 4f2f89ce4247
equal deleted inserted replaced
-1:000000000000 0:4f2f89ce4247
       
     1 /*
       
     2  * Copyright (C) 2009 Apple Inc. All rights reserved.
       
     3  *
       
     4  * Redistribution and use in source and binary forms, with or without
       
     5  * modification, are permitted provided that the following conditions
       
     6  * are met:
       
     7  * 1. Redistributions of source code must retain the above copyright
       
     8  *    notice, this list of conditions and the following disclaimer.
       
     9  * 2. Redistributions in binary form must reproduce the above copyright
       
    10  *    notice, this list of conditions and the following disclaimer in the
       
    11  *    documentation and/or other materials provided with the distribution.
       
    12  *
       
    13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
       
    14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
       
    16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
       
    17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
       
    18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
       
    19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
       
    20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
       
    21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
       
    22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
       
    23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
       
    24  */
       
    25 
       
    26 #ifndef RegexPattern_h
       
    27 #define RegexPattern_h
       
    28 
       
    29 
       
    30 #if ENABLE(YARR)
       
    31 
       
    32 #include <wtf/Vector.h>
       
    33 #include <wtf/unicode/Unicode.h>
       
    34 
       
    35 
       
    36 namespace JSC { namespace Yarr {
       
    37 
       
    38 #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers.
       
    39 #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers.
       
    40 #define RegexStackSpaceForBackTrackInfoBackReference 2
       
    41 #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative.
       
    42 #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1
       
    43 #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers.
       
    44 #define RegexStackSpaceForBackTrackInfoParentheses 4
       
    45 
       
    46 struct PatternDisjunction;
       
    47 
       
    48 struct CharacterRange {
       
    49     UChar begin;
       
    50     UChar end;
       
    51 
       
    52     CharacterRange(UChar begin, UChar end)
       
    53         : begin(begin)
       
    54         , end(end)
       
    55     {
       
    56     }
       
    57 };
       
    58 
       
    59 struct CharacterClassTable : RefCounted<CharacterClassTable> {
       
    60     const char* m_table;
       
    61     bool m_inverted;
       
    62     static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
       
    63     {
       
    64         return adoptRef(new CharacterClassTable(table, inverted));
       
    65     }
       
    66 
       
    67 private:
       
    68     CharacterClassTable(const char* table, bool inverted)
       
    69         : m_table(table)
       
    70         , m_inverted(inverted)
       
    71     {
       
    72     }
       
    73 };
       
    74 
       
    75 struct CharacterClass : FastAllocBase {
       
    76     // All CharacterClass instances have to have the full set of matches and ranges,
       
    77     // they may have an optional table for faster lookups (which must match the
       
    78     // specified matches and ranges)
       
    79     CharacterClass(PassRefPtr<CharacterClassTable> table)
       
    80         : m_table(table)
       
    81     {
       
    82     }
       
    83     Vector<UChar> m_matches;
       
    84     Vector<CharacterRange> m_ranges;
       
    85     Vector<UChar> m_matchesUnicode;
       
    86     Vector<CharacterRange> m_rangesUnicode;
       
    87     RefPtr<CharacterClassTable> m_table;
       
    88 };
       
    89 
       
    90 enum QuantifierType {
       
    91     QuantifierFixedCount,
       
    92     QuantifierGreedy,
       
    93     QuantifierNonGreedy,
       
    94 };
       
    95 
       
    96 struct PatternTerm {
       
    97     enum Type {
       
    98         TypeAssertionBOL,
       
    99         TypeAssertionEOL,
       
   100         TypeAssertionWordBoundary,
       
   101         TypePatternCharacter,
       
   102         TypeCharacterClass,
       
   103         TypeBackReference,
       
   104         TypeForwardReference,
       
   105         TypeParenthesesSubpattern,
       
   106         TypeParentheticalAssertion,
       
   107     } type;
       
   108     bool invertOrCapture;
       
   109     union {
       
   110         UChar patternCharacter;
       
   111         CharacterClass* characterClass;
       
   112         unsigned subpatternId;
       
   113         struct {
       
   114             PatternDisjunction* disjunction;
       
   115             unsigned subpatternId;
       
   116             unsigned lastSubpatternId;
       
   117             bool isCopy;
       
   118         } parentheses;
       
   119     };
       
   120     QuantifierType quantityType;
       
   121     unsigned quantityCount;
       
   122     int inputPosition;
       
   123     unsigned frameLocation;
       
   124 
       
   125     PatternTerm(UChar ch)
       
   126         : type(PatternTerm::TypePatternCharacter)
       
   127     {
       
   128         patternCharacter = ch;
       
   129         quantityType = QuantifierFixedCount;
       
   130         quantityCount = 1;
       
   131     }
       
   132 
       
   133     PatternTerm(CharacterClass* charClass, bool invert)
       
   134         : type(PatternTerm::TypeCharacterClass)
       
   135         , invertOrCapture(invert)
       
   136     {
       
   137         characterClass = charClass;
       
   138         quantityType = QuantifierFixedCount;
       
   139         quantityCount = 1;
       
   140     }
       
   141 
       
   142     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture)
       
   143         : type(type)
       
   144         , invertOrCapture(invertOrCapture)
       
   145     {
       
   146         parentheses.disjunction = disjunction;
       
   147         parentheses.subpatternId = subpatternId;
       
   148         parentheses.isCopy = false;
       
   149         quantityType = QuantifierFixedCount;
       
   150         quantityCount = 1;
       
   151     }
       
   152     
       
   153     PatternTerm(Type type, bool invert = false)
       
   154         : type(type)
       
   155         , invertOrCapture(invert)
       
   156     {
       
   157         quantityType = QuantifierFixedCount;
       
   158         quantityCount = 1;
       
   159     }
       
   160 
       
   161     PatternTerm(unsigned spatternId)
       
   162         : type(TypeBackReference)
       
   163         , invertOrCapture(false)
       
   164     {
       
   165         subpatternId = spatternId;
       
   166         quantityType = QuantifierFixedCount;
       
   167         quantityCount = 1;
       
   168     }
       
   169 
       
   170     static PatternTerm ForwardReference()
       
   171     {
       
   172         return PatternTerm(TypeForwardReference);
       
   173     }
       
   174 
       
   175     static PatternTerm BOL()
       
   176     {
       
   177         return PatternTerm(TypeAssertionBOL);
       
   178     }
       
   179 
       
   180     static PatternTerm EOL()
       
   181     {
       
   182         return PatternTerm(TypeAssertionEOL);
       
   183     }
       
   184 
       
   185     static PatternTerm WordBoundary(bool invert)
       
   186     {
       
   187         return PatternTerm(TypeAssertionWordBoundary, invert);
       
   188     }
       
   189     
       
   190     bool invert()
       
   191     {
       
   192         return invertOrCapture;
       
   193     }
       
   194 
       
   195     bool capture()
       
   196     {
       
   197         return invertOrCapture;
       
   198     }
       
   199     
       
   200     void quantify(unsigned count, QuantifierType type)
       
   201     {
       
   202         quantityCount = count;
       
   203         quantityType = type;
       
   204     }
       
   205 };
       
   206 
       
   207 struct PatternAlternative : FastAllocBase {
       
   208     PatternAlternative(PatternDisjunction* disjunction)
       
   209         : m_parent(disjunction)
       
   210     {
       
   211     }
       
   212 
       
   213     PatternTerm& lastTerm()
       
   214     {
       
   215         ASSERT(m_terms.size());
       
   216         return m_terms[m_terms.size() - 1];
       
   217     }
       
   218     
       
   219     void removeLastTerm()
       
   220     {
       
   221         ASSERT(m_terms.size());
       
   222         m_terms.shrink(m_terms.size() - 1);
       
   223     }
       
   224 
       
   225     Vector<PatternTerm> m_terms;
       
   226     PatternDisjunction* m_parent;
       
   227     unsigned m_minimumSize;
       
   228     bool m_hasFixedSize;
       
   229 };
       
   230 
       
   231 struct PatternDisjunction : FastAllocBase {
       
   232     PatternDisjunction(PatternAlternative* parent = 0)
       
   233         : m_parent(parent)
       
   234     {
       
   235     }
       
   236     
       
   237     ~PatternDisjunction()
       
   238     {
       
   239         deleteAllValues(m_alternatives);
       
   240     }
       
   241 
       
   242     PatternAlternative* addNewAlternative()
       
   243     {
       
   244         PatternAlternative* alternative = new PatternAlternative(this);
       
   245         m_alternatives.append(alternative);
       
   246         return alternative;
       
   247     }
       
   248 
       
   249     Vector<PatternAlternative*> m_alternatives;
       
   250     PatternAlternative* m_parent;
       
   251     unsigned m_minimumSize;
       
   252     unsigned m_callFrameSize;
       
   253     bool m_hasFixedSize;
       
   254 };
       
   255 
       
   256 // You probably don't want to be calling these functions directly
       
   257 // (please to be calling newlineCharacterClass() et al on your
       
   258 // friendly neighborhood RegexPattern instance to get nicely
       
   259 // cached copies).
       
   260 CharacterClass* newlineCreate();
       
   261 CharacterClass* digitsCreate();
       
   262 CharacterClass* spacesCreate();
       
   263 CharacterClass* wordcharCreate();
       
   264 CharacterClass* nondigitsCreate();
       
   265 CharacterClass* nonspacesCreate();
       
   266 CharacterClass* nonwordcharCreate();
       
   267 
       
   268 struct RegexPattern {
       
   269     RegexPattern(bool ignoreCase, bool multiline)
       
   270         : m_ignoreCase(ignoreCase)
       
   271         , m_multiline(multiline)
       
   272         , m_numSubpatterns(0)
       
   273         , m_maxBackReference(0)
       
   274         , m_containsBackreferences(false)
       
   275         , newlineCached(0)
       
   276         , digitsCached(0)
       
   277         , spacesCached(0)
       
   278         , wordcharCached(0)
       
   279         , nondigitsCached(0)
       
   280         , nonspacesCached(0)
       
   281         , nonwordcharCached(0)
       
   282     {
       
   283     }
       
   284 
       
   285     ~RegexPattern()
       
   286     {
       
   287         deleteAllValues(m_disjunctions);
       
   288         deleteAllValues(m_userCharacterClasses);
       
   289     }
       
   290 
       
   291     void reset()
       
   292     {
       
   293         m_numSubpatterns = 0;
       
   294         m_maxBackReference = 0;
       
   295 
       
   296         m_containsBackreferences = false;
       
   297 
       
   298         newlineCached = 0;
       
   299         digitsCached = 0;
       
   300         spacesCached = 0;
       
   301         wordcharCached = 0;
       
   302         nondigitsCached = 0;
       
   303         nonspacesCached = 0;
       
   304         nonwordcharCached = 0;
       
   305 
       
   306         deleteAllValues(m_disjunctions);
       
   307         m_disjunctions.clear();
       
   308         deleteAllValues(m_userCharacterClasses);
       
   309         m_userCharacterClasses.clear();
       
   310     }
       
   311 
       
   312     bool containsIllegalBackReference()
       
   313     {
       
   314         return m_maxBackReference > m_numSubpatterns;
       
   315     }
       
   316 
       
   317     CharacterClass* newlineCharacterClass()
       
   318     {
       
   319         if (!newlineCached)
       
   320             m_userCharacterClasses.append(newlineCached = newlineCreate());
       
   321         return newlineCached;
       
   322     }
       
   323     CharacterClass* digitsCharacterClass()
       
   324     {
       
   325         if (!digitsCached)
       
   326             m_userCharacterClasses.append(digitsCached = digitsCreate());
       
   327         return digitsCached;
       
   328     }
       
   329     CharacterClass* spacesCharacterClass()
       
   330     {
       
   331         if (!spacesCached)
       
   332             m_userCharacterClasses.append(spacesCached = spacesCreate());
       
   333         return spacesCached;
       
   334     }
       
   335     CharacterClass* wordcharCharacterClass()
       
   336     {
       
   337         if (!wordcharCached)
       
   338             m_userCharacterClasses.append(wordcharCached = wordcharCreate());
       
   339         return wordcharCached;
       
   340     }
       
   341     CharacterClass* nondigitsCharacterClass()
       
   342     {
       
   343         if (!nondigitsCached)
       
   344             m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
       
   345         return nondigitsCached;
       
   346     }
       
   347     CharacterClass* nonspacesCharacterClass()
       
   348     {
       
   349         if (!nonspacesCached)
       
   350             m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
       
   351         return nonspacesCached;
       
   352     }
       
   353     CharacterClass* nonwordcharCharacterClass()
       
   354     {
       
   355         if (!nonwordcharCached)
       
   356             m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
       
   357         return nonwordcharCached;
       
   358     }
       
   359 
       
   360     bool m_ignoreCase;
       
   361     bool m_multiline;
       
   362     unsigned m_numSubpatterns;
       
   363     unsigned m_maxBackReference;
       
   364     bool m_containsBackreferences;
       
   365     PatternDisjunction* m_body;
       
   366     Vector<PatternDisjunction*, 4> m_disjunctions;
       
   367     Vector<CharacterClass*> m_userCharacterClasses;
       
   368 
       
   369 private:
       
   370     CharacterClass* newlineCached;
       
   371     CharacterClass* digitsCached;
       
   372     CharacterClass* spacesCached;
       
   373     CharacterClass* wordcharCached;
       
   374     CharacterClass* nondigitsCached;
       
   375     CharacterClass* nonspacesCached;
       
   376     CharacterClass* nonwordcharCached;
       
   377 };
       
   378 
       
   379 } } // namespace JSC::Yarr
       
   380 
       
   381 #endif
       
   382 
       
   383 #endif // RegexPattern_h