...

Source file src/github.com/dlclark/regexp2/regexp.go

Documentation: github.com/dlclark/regexp2

/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp package but uses a
more full-featured regex engine behind the scenes.

It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
need to write very complex patterns or require compatibility with .NET.
*/
     9  package regexp2
    10  
    11  import (
    12  	"errors"
    13  	"math"
    14  	"strconv"
    15  	"sync"
    16  	"time"
    17  
    18  	"github.com/dlclark/regexp2/syntax"
    19  )
    20  
    21  // Default timeout used when running regexp matches -- "forever"
    22  var DefaultMatchTimeout = time.Duration(math.MaxInt64)
    23  
    24  // Regexp is the representation of a compiled regular expression.
    25  // A Regexp is safe for concurrent use by multiple goroutines.
    26  type Regexp struct {
    27  	// A match will time out if it takes (approximately) more than
    28  	// MatchTimeout. This is a safety check in case the match
    29  	// encounters catastrophic backtracking.  The default value
    30  	// (DefaultMatchTimeout) causes all time out checking to be
    31  	// suppressed.
    32  	MatchTimeout time.Duration
    33  
    34  	// read-only after Compile
    35  	pattern string       // as passed to Compile
    36  	options RegexOptions // options
    37  
    38  	caps     map[int]int    // capnum->index
    39  	capnames map[string]int //capture group name -> index
    40  	capslist []string       //sorted list of capture group names
    41  	capsize  int            // size of the capture array
    42  
    43  	code *syntax.Code // compiled program
    44  
    45  	// cache of machines for running regexp
    46  	muRun  sync.Mutex
    47  	runner []*runner
    48  }
    49  
    50  // Compile parses a regular expression and returns, if successful,
    51  // a Regexp object that can be used to match against text.
    52  func Compile(expr string, opt RegexOptions) (*Regexp, error) {
    53  	// parse it
    54  	tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
    55  	if err != nil {
    56  		return nil, err
    57  	}
    58  
    59  	// translate it to code
    60  	code, err := syntax.Write(tree)
    61  	if err != nil {
    62  		return nil, err
    63  	}
    64  
    65  	// return it
    66  	return &Regexp{
    67  		pattern:      expr,
    68  		options:      opt,
    69  		caps:         code.Caps,
    70  		capnames:     tree.Capnames,
    71  		capslist:     tree.Caplist,
    72  		capsize:      code.Capsize,
    73  		code:         code,
    74  		MatchTimeout: DefaultMatchTimeout,
    75  	}, nil
    76  }
    77  
    78  // MustCompile is like Compile but panics if the expression cannot be parsed.
    79  // It simplifies safe initialization of global variables holding compiled regular
    80  // expressions.
    81  func MustCompile(str string, opt RegexOptions) *Regexp {
    82  	regexp, error := Compile(str, opt)
    83  	if error != nil {
    84  		panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
    85  	}
    86  	return regexp
    87  }
    88  
    89  // Escape adds backslashes to any special characters in the input string
    90  func Escape(input string) string {
    91  	return syntax.Escape(input)
    92  }
    93  
    94  // Unescape removes any backslashes from previously-escaped special characters in the input string
    95  func Unescape(input string) (string, error) {
    96  	return syntax.Unescape(input)
    97  }
    98  
    99  // SetTimeoutPeriod is a debug function that sets the frequency of the timeout goroutine's sleep cycle.
   100  // Defaults to 100ms. The only benefit of setting this lower is that the 1 background goroutine that manages
   101  // timeouts may exit slightly sooner after all the timeouts have expired. See Github issue #63
   102  func SetTimeoutCheckPeriod(d time.Duration) {
   103  	clockPeriod = d
   104  }
   105  
   106  // StopTimeoutClock should only be used in unit tests to prevent the timeout clock goroutine
   107  // from appearing like a leaking goroutine
   108  func StopTimeoutClock() {
   109  	stopClock()
   110  }
   111  
   112  // String returns the source text used to compile the regular expression.
   113  func (re *Regexp) String() string {
   114  	return re.pattern
   115  }
   116  
   117  func quote(s string) string {
   118  	if strconv.CanBackquote(s) {
   119  		return "`" + s + "`"
   120  	}
   121  	return strconv.Quote(s)
   122  }
   123  
   124  // RegexOptions impact the runtime and parsing behavior
   125  // for each specific regex.  They are setable in code as well
   126  // as in the regex pattern itself.
   127  type RegexOptions int32
   128  
   129  const (
   130  	None                    RegexOptions = 0x0
   131  	IgnoreCase                           = 0x0001 // "i"
   132  	Multiline                            = 0x0002 // "m"
   133  	ExplicitCapture                      = 0x0004 // "n"
   134  	Compiled                             = 0x0008 // "c"
   135  	Singleline                           = 0x0010 // "s"
   136  	IgnorePatternWhitespace              = 0x0020 // "x"
   137  	RightToLeft                          = 0x0040 // "r"
   138  	Debug                                = 0x0080 // "d"
   139  	ECMAScript                           = 0x0100 // "e"
   140  	RE2                                  = 0x0200 // RE2 (regexp package) compatibility mode
   141  	Unicode                              = 0x0400 // "u"
   142  )
   143  
   144  func (re *Regexp) RightToLeft() bool {
   145  	return re.options&RightToLeft != 0
   146  }
   147  
   148  func (re *Regexp) Debug() bool {
   149  	return re.options&Debug != 0
   150  }
   151  
   152  // Replace searches the input string and replaces each match found with the replacement text.
   153  // Count will limit the number of matches attempted and startAt will allow
   154  // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
   155  // Set startAt and count to -1 to go through the whole string
   156  func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
   157  	data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
   158  	if err != nil {
   159  		return "", err
   160  	}
   161  	//TODO: cache ReplacerData
   162  
   163  	return replace(re, data, nil, input, startAt, count)
   164  }
   165  
   166  // ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
   167  // Count will limit the number of matches attempted and startAt will allow
   168  // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
   169  // Set startAt and count to -1 to go through the whole string.
   170  func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
   171  	return replace(re, nil, evaluator, input, startAt, count)
   172  }
   173  
   174  // FindStringMatch searches the input string for a Regexp match
   175  func (re *Regexp) FindStringMatch(s string) (*Match, error) {
   176  	// convert string to runes
   177  	return re.run(false, -1, getRunes(s))
   178  }
   179  
   180  // FindRunesMatch searches the input rune slice for a Regexp match
   181  func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
   182  	return re.run(false, -1, r)
   183  }
   184  
   185  // FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
   186  func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
   187  	if startAt > len(s) {
   188  		return nil, errors.New("startAt must be less than the length of the input string")
   189  	}
   190  	r, startAt := re.getRunesAndStart(s, startAt)
   191  	if startAt == -1 {
   192  		// we didn't find our start index in the string -- that's a problem
   193  		return nil, errors.New("startAt must align to the start of a valid rune in the input string")
   194  	}
   195  
   196  	return re.run(false, startAt, r)
   197  }
   198  
   199  // FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
   200  func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
   201  	return re.run(false, startAt, r)
   202  }
   203  
   204  // FindNextMatch returns the next match in the same input string as the match parameter.
   205  // Will return nil if there is no next match or if given a nil match.
   206  func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
   207  	if m == nil {
   208  		return nil, nil
   209  	}
   210  
   211  	// If previous match was empty, advance by one before matching to prevent
   212  	// infinite loop
   213  	startAt := m.textpos
   214  	if m.Length == 0 {
   215  		if m.textpos == len(m.text) {
   216  			return nil, nil
   217  		}
   218  
   219  		if re.RightToLeft() {
   220  			startAt--
   221  		} else {
   222  			startAt++
   223  		}
   224  	}
   225  	return re.run(false, startAt, m.text)
   226  }
   227  
   228  // MatchString return true if the string matches the regex
   229  // error will be set if a timeout occurs
   230  func (re *Regexp) MatchString(s string) (bool, error) {
   231  	m, err := re.run(true, -1, getRunes(s))
   232  	if err != nil {
   233  		return false, err
   234  	}
   235  	return m != nil, nil
   236  }
   237  
   238  func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
   239  	if startAt < 0 {
   240  		if re.RightToLeft() {
   241  			r := getRunes(s)
   242  			return r, len(r)
   243  		}
   244  		return getRunes(s), 0
   245  	}
   246  	ret := make([]rune, len(s))
   247  	i := 0
   248  	runeIdx := -1
   249  	for strIdx, r := range s {
   250  		if strIdx == startAt {
   251  			runeIdx = i
   252  		}
   253  		ret[i] = r
   254  		i++
   255  	}
   256  	if startAt == len(s) {
   257  		runeIdx = i
   258  	}
   259  	return ret[:i], runeIdx
   260  }
   261  
   262  func getRunes(s string) []rune {
   263  	return []rune(s)
   264  }
   265  
   266  // MatchRunes return true if the runes matches the regex
   267  // error will be set if a timeout occurs
   268  func (re *Regexp) MatchRunes(r []rune) (bool, error) {
   269  	m, err := re.run(true, -1, r)
   270  	if err != nil {
   271  		return false, err
   272  	}
   273  	return m != nil, nil
   274  }
   275  
   276  // GetGroupNames Returns the set of strings used to name capturing groups in the expression.
   277  func (re *Regexp) GetGroupNames() []string {
   278  	var result []string
   279  
   280  	if re.capslist == nil {
   281  		result = make([]string, re.capsize)
   282  
   283  		for i := 0; i < len(result); i++ {
   284  			result[i] = strconv.Itoa(i)
   285  		}
   286  	} else {
   287  		result = make([]string, len(re.capslist))
   288  		copy(result, re.capslist)
   289  	}
   290  
   291  	return result
   292  }
   293  
   294  // GetGroupNumbers returns the integer group numbers corresponding to a group name.
   295  func (re *Regexp) GetGroupNumbers() []int {
   296  	var result []int
   297  
   298  	if re.caps == nil {
   299  		result = make([]int, re.capsize)
   300  
   301  		for i := 0; i < len(result); i++ {
   302  			result[i] = i
   303  		}
   304  	} else {
   305  		result = make([]int, len(re.caps))
   306  
   307  		for k, v := range re.caps {
   308  			result[v] = k
   309  		}
   310  	}
   311  
   312  	return result
   313  }
   314  
   315  // GroupNameFromNumber retrieves a group name that corresponds to a group number.
   316  // It will return "" for and unknown group number.  Unnamed groups automatically
   317  // receive a name that is the decimal string equivalent of its number.
   318  func (re *Regexp) GroupNameFromNumber(i int) string {
   319  	if re.capslist == nil {
   320  		if i >= 0 && i < re.capsize {
   321  			return strconv.Itoa(i)
   322  		}
   323  
   324  		return ""
   325  	}
   326  
   327  	if re.caps != nil {
   328  		var ok bool
   329  		if i, ok = re.caps[i]; !ok {
   330  			return ""
   331  		}
   332  	}
   333  
   334  	if i >= 0 && i < len(re.capslist) {
   335  		return re.capslist[i]
   336  	}
   337  
   338  	return ""
   339  }
   340  
   341  // GroupNumberFromName returns a group number that corresponds to a group name.
   342  // Returns -1 if the name is not a recognized group name.  Numbered groups
   343  // automatically get a group name that is the decimal string equivalent of its number.
   344  func (re *Regexp) GroupNumberFromName(name string) int {
   345  	// look up name if we have a hashtable of names
   346  	if re.capnames != nil {
   347  		if k, ok := re.capnames[name]; ok {
   348  			return k
   349  		}
   350  
   351  		return -1
   352  	}
   353  
   354  	// convert to an int if it looks like a number
   355  	result := 0
   356  	for i := 0; i < len(name); i++ {
   357  		ch := name[i]
   358  
   359  		if ch > '9' || ch < '0' {
   360  			return -1
   361  		}
   362  
   363  		result *= 10
   364  		result += int(ch - '0')
   365  	}
   366  
   367  	// return int if it's in range
   368  	if result >= 0 && result < re.capsize {
   369  		return result
   370  	}
   371  
   372  	return -1
   373  }
   374  

View as plain text