diff -r 000000000000 -r 7f656887cf89 libraries/spcre/libpcre/pcre/doc/html/pcresyntax.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libraries/spcre/libpcre/pcre/doc/html/pcresyntax.html Wed Jun 23 15:52:26 2010 +0100 @@ -0,0 +1,457 @@ + + +pcresyntax specification + + +

pcresyntax man page

+Return to the PCRE index page. +

+This page is part of the PCRE HTML documentation. It was generated automatically +from the original man page. If there is any nonsense in it, please consult the +man page, in case the conversion went wrong. +
+

PCRE REGULAR EXPRESSION SYNTAX SUMMARY +
QUOTING +
CHARACTERS +
CHARACTER TYPES +
GENERAL CATEGORY PROPERTY CODES FOR \p and \P +
SCRIPT NAMES FOR \p AND \P +
CHARACTER CLASSES +
QUANTIFIERS +
ANCHORS AND SIMPLE ASSERTIONS +
MATCH POINT RESET +
ALTERNATION +
CAPTURING +
ATOMIC GROUPS +
COMMENT +
OPTION SETTING +
LOOKAHEAD AND LOOKBEHIND ASSERTIONS +
BACKREFERENCES +
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE) +
CONDITIONAL PATTERNS +
BACKTRACKING CONTROL +
NEWLINE CONVENTIONS +
WHAT \R MATCHES +
CALLOUTS +
SEE ALSO +
AUTHOR +
REVISION +

+
PCRE REGULAR EXPRESSION SYNTAX SUMMARY
+

+The full syntax and semantics of the regular expressions that are supported by +PCRE are described in the +pcrepattern +documentation. This document contains just a quick-reference summary of the +syntax. +

+
QUOTING
+

+  \x         where x is non-alphanumeric is a literal x
+  \Q...\E    treat enclosed characters as literal
+

+
CHARACTERS
+

+  \a         alarm, that is, the BEL character (hex 07)
+  \cx        "control-x", where x is any character
+  \e         escape (hex 1B)
+  \f         formfeed (hex 0C)
+  \n         newline (hex 0A)
+  \r         carriage return (hex 0D)
+  \t         tab (hex 09)
+  \ddd       character with octal code ddd, or backreference
+  \xhh       character with hex code hh
+  \x{hhh..}  character with hex code hhh..
+

+
CHARACTER TYPES
+

+  .          any character except newline;
+               in dotall mode, any character whatsoever
+  \C         one byte, even in UTF-8 mode (best avoided)
+  \d         a decimal digit
+  \D         a character that is not a decimal digit
+  \h         a horizontal whitespace character
+  \H         a character that is not a horizontal whitespace character
+  \p{xx}     a character with the xx property
+  \P{xx}     a character without the xx property
+  \R         a newline sequence
+  \s         a whitespace character
+  \S         a character that is not a whitespace character
+  \v         a vertical whitespace character
+  \V         a character that is not a vertical whitespace character
+  \w         a "word" character
+  \W         a "non-word" character
+  \X         an extended Unicode sequence
+

+In PCRE, \d, \D, \s, \S, \w, and \W recognize only ASCII characters. +

+
GENERAL CATEGORY PROPERTY CODES FOR \p and \P
+

+  C          Other
+  Cc         Control
+  Cf         Format
+  Cn         Unassigned
+  Co         Private use
+  Cs         Surrogate
+
+  L          Letter
+  Ll         Lower case letter
+  Lm         Modifier letter
+  Lo         Other letter
+  Lt         Title case letter
+  Lu         Upper case letter
+  L&         Ll, Lu, or Lt
+
+  M          Mark
+  Mc         Spacing mark
+  Me         Enclosing mark
+  Mn         Non-spacing mark
+
+  N          Number
+  Nd         Decimal number
+  Nl         Letter number
+  No         Other number
+
+  P          Punctuation
+  Pc         Connector punctuation
+  Pd         Dash punctuation
+  Pe         Close punctuation
+  Pf         Final punctuation
+  Pi         Initial punctuation
+  Po         Other punctuation
+  Ps         Open punctuation
+
+  S          Symbol
+  Sc         Currency symbol
+  Sk         Modifier symbol
+  Sm         Mathematical symbol
+  So         Other symbol
+
+  Z          Separator
+  Zl         Line separator
+  Zp         Paragraph separator
+  Zs         Space separator
+

+
SCRIPT NAMES FOR \p AND \P
+

+Arabic, +Armenian, +Balinese, +Bengali, +Bopomofo, +Braille, +Buginese, +Buhid, +Canadian_Aboriginal, +Cherokee, +Common, +Coptic, +Cuneiform, +Cypriot, +Cyrillic, +Deseret, +Devanagari, +Ethiopic, +Georgian, +Glagolitic, +Gothic, +Greek, +Gujarati, +Gurmukhi, +Han, +Hangul, +Hanunoo, +Hebrew, +Hiragana, +Inherited, +Kannada, +Katakana, +Kharoshthi, +Khmer, +Lao, +Latin, +Limbu, +Linear_B, +Malayalam, +Mongolian, +Myanmar, +New_Tai_Lue, +Nko, +Ogham, +Old_Italic, +Old_Persian, +Oriya, +Osmanya, +Phags_Pa, +Phoenician, +Runic, +Shavian, +Sinhala, +Syloti_Nagri, +Syriac, +Tagalog, +Tagbanwa, +Tai_Le, +Tamil, +Telugu, +Thaana, +Thai, +Tibetan, +Tifinagh, +Ugaritic, +Yi. +

+
CHARACTER CLASSES
+

+  [...]       positive character class
+  [^...]      negative character class
+  [x-y]       range (can be used for hex characters)
+  [[:xxx:]]   positive POSIX named set
+  [[:^xxx:]]  negative POSIX named set
+
+  alnum       alphanumeric
+  alpha       alphabetic
+  ascii       0-127
+  blank       space or tab
+  cntrl       control character
+  digit       decimal digit
+  graph       printing, excluding space
+  lower       lower case letter
+  print       printing, including space
+  punct       printing, excluding alphanumeric
+  space       whitespace
+  upper       upper case letter
+  word        same as \w
+  xdigit      hexadecimal digit
+

+In PCRE, POSIX character set names recognize only ASCII characters. You can use +\Q...\E inside a character class. +

+
QUANTIFIERS
+

+  ?           0 or 1, greedy
+  ?+          0 or 1, possessive
+  ??          0 or 1, lazy
+  *           0 or more, greedy
+  *+          0 or more, possessive
+  *?          0 or more, lazy
+  +           1 or more, greedy
+  ++          1 or more, possessive
+  +?          1 or more, lazy
+  {n}         exactly n
+  {n,m}       at least n, no more than m, greedy
+  {n,m}+      at least n, no more than m, possessive
+  {n,m}?      at least n, no more than m, lazy
+  {n,}        n or more, greedy
+  {n,}+       n or more, possessive
+  {n,}?       n or more, lazy
+

+
ANCHORS AND SIMPLE ASSERTIONS
+

+  \b          word boundary
+  \B          not a word boundary
+  ^           start of subject
+               also after internal newline in multiline mode
+  \A          start of subject
+  $           end of subject
+               also before newline at end of subject
+               also before internal newline in multiline mode
+  \Z          end of subject
+               also before newline at end of subject
+  \z          end of subject
+  \G          first matching position in subject
+

+
MATCH POINT RESET
+

+  \K          reset start of match
+

+
ALTERNATION
+

+  expr|expr|expr...
+

+
CAPTURING
+

+  (...)          capturing group
+  (?<name>...)   named capturing group (Perl)
+  (?'name'...)   named capturing group (Perl)
+  (?P<name>...)  named capturing group (Python)
+  (?:...)        non-capturing group
+  (?|...)        non-capturing group; reset group numbers for
+                  capturing groups in each alternative
+

+
ATOMIC GROUPS
+

+  (?>...)        atomic, non-capturing group
+

+
COMMENT
+

+  (?#....)       comment (not nestable)
+

+
OPTION SETTING
+

+  (?i)           caseless
+  (?J)           allow duplicate names
+  (?m)           multiline
+  (?s)           single line (dotall)
+  (?U)           default ungreedy (lazy)
+  (?x)           extended (ignore white space)
+  (?-...)        unset option(s)
+

+
LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+

+  (?=...)        positive look ahead
+  (?!...)        negative look ahead
+  (?<=...)       positive look behind
+  (?<!...)       negative look behind
+

+Each top-level branch of a look behind must be of a fixed length. +

+
BACKREFERENCES
+

+  \n             reference by number (can be ambiguous)
+  \gn            reference by number
+  \g{n}          reference by number
+  \g{-n}         relative reference by number
+  \k<name>       reference by name (Perl)
+  \k'name'       reference by name (Perl)
+  \g{name}       reference by name (Perl)
+  \k{name}       reference by name (.NET)
+  (?P=name)      reference by name (Python)
+

+
SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+

+  (?R)           recurse whole pattern
+  (?n)           call subpattern by absolute number
+  (?+n)          call subpattern by relative number
+  (?-n)          call subpattern by relative number
+  (?&name)       call subpattern by name (Perl)
+  (?P>name)      call subpattern by name (Python)
+  \g<name>       call subpattern by name (Oniguruma)
+  \g'name'       call subpattern by name (Oniguruma)
+  \g<n>          call subpattern by absolute number (Oniguruma)
+  \g'n'          call subpattern by absolute number (Oniguruma)
+  \g<+n>         call subpattern by relative number (PCRE extension)
+  \g'+n'         call subpattern by relative number (PCRE extension)
+  \g<-n>         call subpattern by relative number (PCRE extension)
+  \g'-n'         call subpattern by relative number (PCRE extension)
+

+
CONDITIONAL PATTERNS
+

+  (?(condition)yes-pattern)
+  (?(condition)yes-pattern|no-pattern)
+
+  (?(n)...       absolute reference condition
+  (?(+n)...      relative reference condition
+  (?(-n)...      relative reference condition
+  (?(<name>)...  named reference condition (Perl)
+  (?('name')...  named reference condition (Perl)
+  (?(name)...    named reference condition (PCRE)
+  (?(R)...       overall recursion condition
+  (?(Rn)...      specific group recursion condition
+  (?(R&name)...  specific recursion condition
+  (?(DEFINE)...  define subpattern for reference
+  (?(assert)...  assertion condition
+

+
BACKTRACKING CONTROL
+

+The following act immediately they are reached: +

+  (*ACCEPT)      force successful match
+  (*FAIL)        force backtrack; synonym (*F)
+

+The following act only when a subsequent match failure causes a backtrack to +reach them. They all force a match failure, but they differ in what happens +afterwards. Those that advance the start-of-match point do so only if the +pattern is not anchored. +

+  (*COMMIT)      overall failure, no advance of starting point
+  (*PRUNE)       advance to next starting character
+  (*SKIP)        advance start to current matching position
+  (*THEN)        local failure, backtrack to next alternation
+

+
NEWLINE CONVENTIONS
+

+These are recognized only at the very start of the pattern or after a +(*BSR_...) option. +

+  (*CR)
+  (*LF)
+  (*CRLF)
+  (*ANYCRLF)
+  (*ANY)
+

+
WHAT \R MATCHES
+

+These are recognized only at the very start of the pattern or after a +(*...) option that sets the newline convention. +

+  (*BSR_ANYCRLF)
+  (*BSR_UNICODE)
+

+
CALLOUTS
+

+  (?C)      callout
+  (?Cn)     callout with data n
+

+
SEE ALSO
+

+pcrepattern(3), pcreapi(3), pcrecallout(3), +pcrematching(3), pcre(3). +

+
AUTHOR
+

+Philip Hazel +
+University Computing Service +
+Cambridge CB2 3QH, England. +
+

+
REVISION
+

+Return to the PCRE index page. +