...

Package syntax

import "github.com/dlclark/regexp2/syntax"
Overview
Index

Overview ▾

Index ▾

Constants
Variables
func CharDescription(ch rune) string
func Escape(input string) string
func IsECMAWordChar(r rune) bool
func IsWordChar(r rune) bool
func Unescape(input string) (string, error)
type AnchorLoc
    func (anchors AnchorLoc) String() string
type BmPrefix
    func (b *BmPrefix) Dump(indent string) string
    func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool
    func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int
    func (b *BmPrefix) String() string
type CharSet
    func (c CharSet) CharIn(ch rune) bool
    func (c CharSet) Copy() CharSet
    func (c CharSet) HasSubtraction() bool
    func (c CharSet) IsEmpty() bool
    func (c CharSet) IsMergeable() bool
    func (c CharSet) IsNegated() bool
    func (c CharSet) IsSingleton() bool
    func (c CharSet) IsSingletonInverse() bool
    func (c CharSet) SingletonChar() rune
    func (c CharSet) String() string
type Code
    func Write(tree *RegexTree) (*Code, error)
    func (c *Code) Dump() string
    func (c *Code) OpcodeDescription(offset int) string
type Error
    func (e *Error) Error() string
type ErrorCode
    func (e ErrorCode) String() string
type InstOp
type Prefix
type RegexOptions
type RegexTree
    func Parse(re string, op RegexOptions) (*RegexTree, error)
    func (t *RegexTree) Dump() string
type ReplacerData
    func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

Package files

charclass.go code.go escape.go parser.go prefix.go replacerdata.go tree.go writer.go

Constants

const (
    LowercaseSet = 0 // Set to arg.
    LowercaseAdd = 1 // Add arg.
    LowercaseBor = 2 // Bitwise or with 1.
    LowercaseBad = 3 // Bitwise and with 1 and add original.
)
const (
    Onerep    InstOp = 0 // lef,back char,min,max    a {n}
    Notonerep        = 1 // lef,back char,min,max    .{n}
    Setrep           = 2 // lef,back set,min,max     [\d]{n}

    Oneloop    = 3 // lef,back char,min,max    a {,n}
    Notoneloop = 4 // lef,back char,min,max    .{,n}
    Setloop    = 5 // lef,back set,min,max     [\d]{,n}

    Onelazy    = 6 // lef,back char,min,max    a {,n}?
    Notonelazy = 7 // lef,back char,min,max    .{,n}?
    Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?

    One    = 9  // lef      char            a
    Notone = 10 // lef      char            [^a]
    Set    = 11 // lef      set             [a-z\s]  \w \s \d

    Multi = 12 // lef      string          abcd
    Ref   = 13 // lef      group           \#

    Bol         = 14 //                          ^
    Eol         = 15 //                          $
    Boundary    = 16 //                          \b
    Nonboundary = 17 //                          \B
    Beginning   = 18 //                          \A
    Start       = 19 //                          \G
    EndZ        = 20 //                          \Z
    End         = 21 //                          \z

    Nothing = 22 //                          Reject!

    Lazybranch      = 23 // back     jump            straight first
    Branchmark      = 24 // back     jump            branch first for loop
    Lazybranchmark  = 25 // back     jump            straight first for loop
    Nullcount       = 26 // back     val             set counter, null mark
    Setcount        = 27 // back     val             set counter, make mark
    Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
    Lazybranchcount = 29 // back     jump,limit      same, but straight first
    Nullmark        = 30 // back                     save position
    Setmark         = 31 // back                     save position
    Capturemark     = 32 // back     group           define group
    Getmark         = 33 // back                     recall position
    Setjump         = 34 // back                     save backtrack state
    Backjump        = 35 //                          zap back to saved state
    Forejump        = 36 //                          zap backtracking state
    Testref         = 37 //                          backtrack if ref undefined
    Goto            = 38 //          jump            just go

    Prune = 39 //                          prune it baby
    Stop  = 40 //                          done!

    ECMABoundary    = 41 //                          \b
    NonECMABoundary = 42 //                          \B

    Mask  = 63  // Mask to get unmodified ordinary operator
    Rtl   = 64  // bit to indicate that we're reverse scanning.
    Back  = 128 // bit to indicate that we're backtracking.
    Back2 = 256 // bit to indicate that we're backtracking on a second branch.
    Ci    = 512 // bit to indicate that we're case-insensitive.
)
const (
    IgnoreCase              RegexOptions = 0x0001 // "i"
    Multiline                            = 0x0002 // "m"
    ExplicitCapture                      = 0x0004 // "n"
    Compiled                             = 0x0008 // "c"
    Singleline                           = 0x0010 // "s"
    IgnorePatternWhitespace              = 0x0020 // "x"
    RightToLeft                          = 0x0040 // "r"
    Debug                                = 0x0080 // "d"
    ECMAScript                           = 0x0100 // "e"
    RE2                                  = 0x0200 // RE2 compat mode
    Unicode                              = 0x0400 // "u"
)
const (
    // internal issue
    ErrInternalError ErrorCode = "regexp/syntax: internal error"
    // Parser errors
    ErrUnterminatedComment        = "unterminated comment"
    ErrInvalidCharRange           = "invalid character class range"
    ErrInvalidRepeatSize          = "invalid repeat count"
    ErrInvalidUTF8                = "invalid UTF-8"
    ErrCaptureGroupOutOfRange     = "capture group number out of range"
    ErrUnexpectedParen            = "unexpected )"
    ErrMissingParen               = "missing closing )"
    ErrMissingBrace               = "missing closing }"
    ErrInvalidRepeatOp            = "invalid nested repetition operator"
    ErrMissingRepeatArgument      = "missing argument to repetition operator"
    ErrConditionalExpression      = "illegal conditional (?(...)) expression"
    ErrTooManyAlternates          = "too many | in (?()|)"
    ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
    ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
    ErrCapNumNotZero              = "capture number cannot be zero"
    ErrUndefinedBackRef           = "reference to undefined group number %v"
    ErrUndefinedNameRef           = "reference to undefined group name %v"
    ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
    ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
    ErrMalformedReference         = "(?(%v) ) malformed"
    ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
    ErrIllegalEndEscape           = "illegal \\ at end of pattern"
    ErrMalformedSlashP            = "malformed \\p{X} character escape"
    ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
    ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
    ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
    ErrMissingControl             = "missing control character"
    ErrUnrecognizedControl        = "unrecognized control character"
    ErrTooFewHex                  = "insufficient hexadecimal digits"
    ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
    ErrMalformedNameRef           = "malformed \\k<...> named back reference"
    ErrBadClassInCharRange        = "cannot include class \\%v in character range"
    ErrUnterminatedBracket        = "unterminated [] set"
    ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
    ErrReversedCharRange          = "[%c-%c] range in reverse order"
)
const (
    Q byte = 5 // quantifier
    S      = 4 // ordinary stopper
    Z      = 3 // ScanBlank stopper
    X      = 2 // whitespace
    E      = 1 // should be escaped
)

The AnchorLoc constants describe where the regex can be pegged (anchored).

const (
    AnchorBeginning    AnchorLoc = 0x0001
    AnchorBol                    = 0x0002
    AnchorStart                  = 0x0004
    AnchorEol                    = 0x0008
    AnchorEndZ                   = 0x0010
    AnchorEnd                    = 0x0020
    AnchorBoundary               = 0x0040
    AnchorECMABoundary           = 0x0080
)
const (

    // MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix
    MaxPrefixSize = 50
)

Variables

var (
    AnyClass          = getCharSetFromOldString([]rune{0}, false)
    ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
    NoneClass         = getCharSetFromOldString(nil, false)
    ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
    NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
    ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
    NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
    ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
    NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

    WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
    NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
    SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
    NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
    DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
    NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")

    RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
    NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
)

ErrReplacementError is a general error returned while parsing the replacement text.

var ErrReplacementError = errors.New("Replacement pattern error.")

func CharDescription

func CharDescription(ch rune) string

CharDescription produces a human-readable description for a single character.

func Escape

func Escape(input string) string

func IsECMAWordChar

func IsECMAWordChar(r rune) bool

func IsWordChar

func IsWordChar(r rune) bool

According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries: the class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

func Unescape

func Unescape(input string) (string, error)

type AnchorLoc

type AnchorLoc int16

func (AnchorLoc) String

func (anchors AnchorLoc) String() string

String returns a human-readable description of the anchors.

type BmPrefix

BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.

type BmPrefix struct {
    // contains filtered or unexported fields
}

func (*BmPrefix) Dump

func (b *BmPrefix) Dump(indent string) string

Dump returns the contents of the filter as a human readable string

func (*BmPrefix) IsMatch

func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool

IsMatch is a quick test that can be used instead of a Scan when a regex is anchored.

func (*BmPrefix) Scan

func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int

Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.

The direction and case-sensitivity of the match are determined by the arguments to the BmPrefix (RegexBoyerMoore) constructor.

func (*BmPrefix) String

func (b *BmPrefix) String() string

type CharSet

CharSet combines start-end rune ranges and unicode categories representing a set of characters

type CharSet struct {
    // contains filtered or unexported fields
}

func (CharSet) CharIn

func (c CharSet) CharIn(ch rune) bool

CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.

func (CharSet) Copy

func (c CharSet) Copy() CharSet

Copy makes a deep copy to prevent accidental mutation of a set

func (CharSet) HasSubtraction

func (c CharSet) HasSubtraction() bool

func (CharSet) IsEmpty

func (c CharSet) IsEmpty() bool

func (CharSet) IsMergeable

func (c CharSet) IsMergeable() bool

func (CharSet) IsNegated

func (c CharSet) IsNegated() bool

func (CharSet) IsSingleton

func (c CharSet) IsSingleton() bool

func (CharSet) IsSingletonInverse

func (c CharSet) IsSingletonInverse() bool

func (CharSet) SingletonChar

func (c CharSet) SingletonChar() rune

SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse, and will panic given bad input.

func (CharSet) String

func (c CharSet) String() string

String returns a human-readable description of the set.

type Code

type Code struct {
    Codes       []int       // the code
    Strings     [][]rune    // string table
    Sets        []*CharSet  // character set table
    TrackCount  int         // how many instructions use backtracking
    Caps        map[int]int // mapping of user group numbers -> impl group slots
    Capsize     int         // number of impl group slots
    FcPrefix    *Prefix     // the set of candidate first characters (may be nil)
    BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be nil)
    Anchors     AnchorLoc   // the set of zero-length start anchors (RegexFCD.Bol, etc)
    RightToLeft bool        // true if right to left
}

func Write

func Write(tree *RegexTree) (*Code, error)

func (*Code) Dump

func (c *Code) Dump() string

func (*Code) OpcodeDescription

func (c *Code) OpcodeDescription(offset int) string

OpcodeDescription returns a human-readable description of the opcode at the specified offset.

type Error

An Error describes a failure to parse a regular expression and gives the offending expression.

type Error struct {
    Code ErrorCode
    Expr string
    Args []interface{}
}

func (*Error) Error

func (e *Error) Error() string

type ErrorCode

An ErrorCode describes a failure to parse a regular expression.

type ErrorCode string

func (ErrorCode) String

func (e ErrorCode) String() string

type InstOp

type InstOp int

type Prefix

type Prefix struct {
    PrefixStr       []rune
    PrefixSet       CharSet
    CaseInsensitive bool
}

type RegexOptions

type RegexOptions int32

type RegexTree

type RegexTree struct {
    Capnames map[string]int
    Caplist  []string
    // contains filtered or unexported fields
}

func Parse

func Parse(re string, op RegexOptions) (*RegexTree, error)

Parse converts a regex string into a parse tree

func (*RegexTree) Dump

func (t *RegexTree) Dump() string

type ReplacerData

type ReplacerData struct {
    Rep     string
    Strings []string
    Rules   []int
}

func NewReplacerData

func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp