Package syntax

// Lowercase* are untyped integer operation codes; the trailing comment on
// each line states the arithmetic that code stands for.
// NOTE(review): presumably consumed by a lowercase-mapping table defined
// elsewhere in the package — confirm against the caller.
const (
    LowercaseSet = 0 // Set to arg.
    LowercaseAdd = 1 // Add arg.
    LowercaseBor = 2 // Bitwise or with 1.
    LowercaseBad = 3 // Bitwise and with 1 and add original.
)

// Instruction opcodes and modifier bits.
//
// Only Onerep carries the explicit InstOp type. Every other name in this
// group has its own "= value" expression without a type, so it is an untyped
// integer constant.
//
// The opcodes occupy 0-42 and therefore fit under Mask (63); Rtl, Back, Back2
// and Ci are separate single-bit values above the mask.
const (
    Onerep    InstOp = 0 // lef,back char,min,max    a {n}
    Notonerep        = 1 // lef,back char,min,max    .{n}
    Setrep           = 2 // lef,back set,min,max     [\d]{n}

    Oneloop    = 3 // lef,back char,min,max    a {,n}
    Notoneloop = 4 // lef,back char,min,max    .{,n}
    Setloop    = 5 // lef,back set,min,max     [\d]{,n}

    Onelazy    = 6 // lef,back char,min,max    a {,n}?
    Notonelazy = 7 // lef,back char,min,max    .{,n}?
    Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?

    One    = 9  // lef      char            a
    Notone = 10 // lef      char            [^a]
    Set    = 11 // lef      set             [a-z\s]  \w \s \d

    Multi = 12 // lef      string          abcd
    Ref   = 13 // lef      group           \#

    Bol         = 14 //                          ^
    Eol         = 15 //                          $
    Boundary    = 16 //                          \b
    Nonboundary = 17 //                          \B
    Beginning   = 18 //                          \A
    Start       = 19 //                          \G
    EndZ        = 20 //                          \Z
    End         = 21 //                          \z

    Nothing = 22 //                          Reject!

    Lazybranch      = 23 // back     jump            straight first
    Branchmark      = 24 // back     jump            branch first for loop
    Lazybranchmark  = 25 // back     jump            straight first for loop
    Nullcount       = 26 // back     val             set counter, null mark
    Setcount        = 27 // back     val             set counter, make mark
    Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
    Lazybranchcount = 29 // back     jump,limit      same, but straight first
    Nullmark        = 30 // back                     save position
    Setmark         = 31 // back                     save position
    Capturemark     = 32 // back     group           define group
    Getmark         = 33 // back                     recall position
    Setjump         = 34 // back                     save backtrack state
    Backjump        = 35 //                          zap back to saved state
    Forejump        = 36 //                          zap backtracking state
    Testref         = 37 //                          backtrack if ref undefined
    Goto            = 38 //          jump            just go

    Prune = 39 //                          prune it baby
    Stop  = 40 //                          done!

    ECMABoundary    = 41 //                          \b
    NonECMABoundary = 42 //                          \B

    Mask  = 63  // Mask to get unmodified ordinary operator
    Rtl   = 64  // bit to indicate that we're reverse scanning.
    Back  = 128 // bit to indicate that we're backtracking.
    Back2 = 256 // bit to indicate that we're backtracking on a second branch.
    Ci    = 512 // bit to indicate that we're case-insensitive.
)

// Option flags. Each value is a distinct single bit, so several options can
// be combined with bitwise OR.
//
// Only IgnoreCase carries the explicit RegexOptions type; the remaining names
// have their own untyped "= value" expressions and are untyped integer
// constants.
//
// NOTE(review): the quoted letter in each trailing comment is presumably the
// inline option letter for that flag — confirm against the parser.
const (
    IgnoreCase              RegexOptions = 0x0001 // "i"
    Multiline                            = 0x0002 // "m"
    ExplicitCapture                      = 0x0004 // "n"
    Compiled                             = 0x0008 // "c"
    Singleline                           = 0x0010 // "s"
    IgnorePatternWhitespace              = 0x0020 // "x"
    RightToLeft                          = 0x0040 // "r"
    Debug                                = 0x0080 // "d"
    ECMAScript                           = 0x0100 // "e"
    RE2                                  = 0x0200 // RE2 compat mode
    Unicode                              = 0x0400 // "u"
)

// Error message constants.
//
// Only ErrInternalError carries the explicit ErrorCode type; every other name
// has its own untyped "= value" expression and is an untyped string constant.
//
// Several messages contain formatting verbs (%v, %c).
// NOTE(review): presumably these are filled in from Error.Args when the error
// is rendered — confirm against (*Error).Error.
const (
    // internal issue
    ErrInternalError ErrorCode = "regexp/syntax: internal error"
    // Parser errors
    ErrUnterminatedComment        = "unterminated comment"
    ErrInvalidCharRange           = "invalid character class range"
    ErrInvalidRepeatSize          = "invalid repeat count"
    ErrInvalidUTF8                = "invalid UTF-8"
    ErrCaptureGroupOutOfRange     = "capture group number out of range"
    ErrUnexpectedParen            = "unexpected )"
    ErrMissingParen               = "missing closing )"
    ErrMissingBrace               = "missing closing }"
    ErrInvalidRepeatOp            = "invalid nested repetition operator"
    ErrMissingRepeatArgument      = "missing argument to repetition operator"
    ErrConditionalExpression      = "illegal conditional (?(...)) expression"
    ErrTooManyAlternates          = "too many | in (?()|)"
    ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
    ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
    ErrCapNumNotZero              = "capture number cannot be zero"
    ErrUndefinedBackRef           = "reference to undefined group number %v"
    ErrUndefinedNameRef           = "reference to undefined group name %v"
    ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
    ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
    ErrMalformedReference         = "(?(%v) ) malformed"
    ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
    ErrIllegalEndEscape           = "illegal \\ at end of pattern"
    ErrMalformedSlashP            = "malformed \\p{X} character escape"
    ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
    ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
    ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
    ErrMissingControl             = "missing control character"
    ErrUnrecognizedControl        = "unrecognized control character"
    ErrTooFewHex                  = "insufficient hexadecimal digits"
    ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
    ErrMalformedNameRef           = "malformed \\k<...> named back reference"
    ErrBadClassInCharRange        = "cannot include class \\%v in character range"
    ErrUnterminatedBracket        = "unterminated [] set"
    ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
    ErrReversedCharRange          = "[%c-%c] range in reverse order"
)

// Character category codes, ordered from 5 down to 1.
//
// Only Q carries the explicit byte type; S, Z, X and E have their own untyped
// "= value" expressions and are untyped integer constants.
// NOTE(review): presumably used by the pattern scanner to classify special
// characters — the lookup table is not visible here; confirm.
const (
    Q byte = 5 // quantifier
    S      = 4 // ordinary stopper
    Z      = 3 // ScanBlank stopper
    X      = 2 // whitespace
    E      = 1 // should be escaped
)

Anchor flags describing where the regex can be pegged (anchored).

// Each value is a distinct single bit, so several anchors can be combined in
// one AnchorLoc with bitwise OR.
//
// Only AnchorBeginning carries the explicit AnchorLoc type; the remaining
// names have their own untyped "= value" expressions and are untyped integer
// constants.
const (
    AnchorBeginning    AnchorLoc = 0x0001
    AnchorBol                    = 0x0002
    AnchorStart                  = 0x0004
    AnchorEol                    = 0x0008
    AnchorEndZ                   = 0x0010
    AnchorEnd                    = 0x0020
    AnchorBoundary               = 0x0040
    AnchorECMABoundary           = 0x0080
)

const (

    // MaxPrefixSize is the largest number of runes we'll use for a Boyer-Moore prefix.
    MaxPrefixSize = 50
)

Variables

// Predefined character classes, built once at package initialisation.
//
// Each Not* variable is built from the same source data as its positive
// counterpart, with one boolean argument flipped to true.
//
// NOTE(review): NotWordClass and NotSpaceClass set the first boolean of
// getCharSetFromCategoryString, while NotDigitClass sets the second. The
// meaning of the two booleans is not visible here — confirm the difference is
// intentional.
var (
    AnyClass          = getCharSetFromOldString([]rune{0}, false)
    ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
    NoneClass         = getCharSetFromOldString(nil, false)
    ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
    NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
    ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
    NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
    ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
    NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)

    WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
    NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
    SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
    NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
    DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
    NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")

    RE2SpaceClass    = getCharSetFromOldString(re2Space, false)
    NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
)

ErrReplacementError is a general error returned when parsing the replacement text fails.

var ErrReplacementError = errors.New("Replacement pattern error.")

func CharDescription ¶

func CharDescription(ch rune) string

CharDescription produces a human-readable description for a single character.

func Escape ¶

func Escape(input string) string

func IsECMAWordChar ¶

func IsECMAWordChar(r rune) bool

func IsWordChar ¶

func IsWordChar(r rune) bool

According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/), RL 1.4 Simple Word Boundaries: the class of <word_character> includes all Alphabetic values from the Unicode character database, from UnicodeData.txt [UData], plus U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.

func Unescape ¶

func Unescape(input string) (string, error)

type AnchorLoc ¶

type AnchorLoc int16

func (AnchorLoc) String ¶

func (anchors AnchorLoc) String() string

String returns a human-readable description of the anchors.

type BmPrefix ¶

BmPrefix precomputes the Boyer-Moore tables for fast string scanning. These tables allow you to scan for the first occurrence of a string within a large body of text without examining every character. The performance of the heuristic depends on the actual string and the text being searched, but usually, the longer the string that is being searched for, the fewer characters need to be examined.

type BmPrefix struct {
    // contains filtered or unexported fields
}

func (*BmPrefix) Dump ¶

func (b *BmPrefix) Dump(indent string) string

Dump returns the contents of the filter as a human readable string

func (*BmPrefix) IsMatch ¶

func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool

When a regex is anchored, we can do a quick IsMatch test instead of a Scan

func (*BmPrefix) Scan ¶

func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int

Scan uses the Boyer-Moore algorithm to find the first occurrence of the specified string within text, beginning at index, and constrained within beglimit and endlimit.

The direction and case-sensitivity of the match are determined when the BmPrefix is constructed.

func (*BmPrefix) String ¶

func (b *BmPrefix) String() string

type CharSet ¶

CharSet combines start-end rune ranges and unicode categories representing a set of characters

type CharSet struct {
    // contains filtered or unexported fields
}

func (CharSet) CharIn ¶

func (c CharSet) CharIn(ch rune) bool

CharIn returns true if the rune is in our character set (either ranges or categories). It handles negations and subtracted sub-charsets.

func (CharSet) Copy ¶

func (c CharSet) Copy() CharSet

Copy makes a deep copy to prevent accidental mutation of a set

func (CharSet) HasSubtraction ¶

func (c CharSet) HasSubtraction() bool

func (CharSet) IsEmpty ¶

func (c CharSet) IsEmpty() bool

func (CharSet) IsMergeable ¶

func (c CharSet) IsMergeable() bool

func (CharSet) IsNegated ¶

func (c CharSet) IsNegated() bool

func (CharSet) IsSingleton ¶

func (c CharSet) IsSingleton() bool

func (CharSet) IsSingletonInverse ¶

func (c CharSet) IsSingletonInverse() bool

func (CharSet) SingletonChar ¶

func (c CharSet) SingletonChar() rune

SingletonChar will return the char from the first range without validation. It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input

func (CharSet) String ¶

func (c CharSet) String() string

String returns a human-readable description of the set.

type Code ¶

type Code struct {
    Codes       []int       // the code
    Strings     [][]rune    // string table
    Sets        []*CharSet  // character set table
    TrackCount  int         // how many instructions use backtracking
    Caps        map[int]int // mapping of user group numbers -> impl group slots
    Capsize     int         // number of impl group slots
    FcPrefix    *Prefix     // the set of candidate first characters (may be nil)
    BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be nil)
    Anchors     AnchorLoc   // the set of zero-length start anchors (AnchorBol, etc.)
    RightToLeft bool        // true if right to left
}

func Write ¶

func Write(tree *RegexTree) (*Code, error)

func (*Code) Dump ¶

func (c *Code) Dump() string

func (*Code) OpcodeDescription ¶

func (c *Code) OpcodeDescription(offset int) string

OpcodeDescription returns a human-readable description of the opcode at the given offset.

type Error ¶

An Error describes a failure to parse a regular expression and gives the offending expression.

type Error struct {
    Code ErrorCode     // which failure occurred
    Expr string        // the offending expression
    Args []interface{} // NOTE(review): presumably values for the formatting verbs in Code — confirm against Error()
}

func (*Error) Error ¶

func (e *Error) Error() string

type ErrorCode ¶

An ErrorCode describes a failure to parse a regular expression.

type ErrorCode string

func (ErrorCode) String ¶

func (e ErrorCode) String() string

type InstOp ¶

type InstOp int

type Prefix ¶

type Prefix struct {
    PrefixStr       []rune
    PrefixSet       CharSet
    CaseInsensitive bool
}

type RegexOptions ¶

type RegexOptions int32

type RegexTree ¶

type RegexTree struct {
    Capnames map[string]int
    Caplist  []string
    // contains filtered or unexported fields
}

func Parse ¶

func Parse(re string, op RegexOptions) (*RegexTree, error)

Parse converts a regex string into a parse tree

func (*RegexTree) Dump ¶

func (t *RegexTree) Dump() string

type ReplacerData ¶

type ReplacerData struct {
    Rep     string
    Strings []string
    Rules   []int
}

func NewReplacerData ¶

func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error)

NewReplacerData will populate a reusable replacer data struct based on the given replacement string and the capture group data from a regexp