...

Source file src/github.com/dop251/goja/regexp.go

Documentation: github.com/dop251/goja

     1  package goja
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/dlclark/regexp2"
     6  	"github.com/dop251/goja/unistring"
     7  	"io"
     8  	"regexp"
     9  	"sort"
    10  	"strings"
    11  	"unicode/utf16"
    12  )
    13  
// regexp2MatchCache holds the decoded form of the most recently matched
// string so that a follow-up match against the same string can skip decoding.
type regexp2MatchCache struct {
	// target is the string the cached data was computed for.
	target String
	// runes is the decoded input that was handed to regexp2.
	runes []rune
	// posMap is nil for entries created by the UTF-16 path and non-nil for
	// entries created by the full-Unicode path, where it maps a rune index
	// to the corresponding offset in target.
	posMap []int
}
    19  
// regexp2Wrapper couples a compiled regexp2 pattern with a single-entry cache
// of the last matched input.
//
// Not goroutine-safe. Use regexp2Wrapper.clone()
type regexp2Wrapper struct {
	// rx is the compiled regexp2 pattern.
	rx *regexp2.Regexp
	// cache is nil when nothing is cached; it is reset to nil whenever a
	// cached lookup does not produce a match or caching was not requested.
	cache *regexp2MatchCache
}
    25  
// regexpWrapper is Go's standard regexp.Regexp under a local name so that
// matching helpers can be attached to it; methods convert the receiver back
// to *regexp.Regexp before use.
type regexpWrapper regexp.Regexp
    27  
    28  type positionMapItem struct {
    29  	src, dst int
    30  }
    31  type positionMap []positionMapItem
    32  
    33  func (m positionMap) get(src int) int {
    34  	if src <= 0 {
    35  		return src
    36  	}
    37  	res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
    38  	if res >= len(m) || m[res].src != src {
    39  		panic("index not found")
    40  	}
    41  	return m[res].dst
    42  }
    43  
    44  type arrayRuneReader struct {
    45  	runes []rune
    46  	pos   int
    47  }
    48  
    49  func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
    50  	if rd.pos < len(rd.runes) {
    51  		r = rd.runes[rd.pos]
    52  		size = 1
    53  		rd.pos++
    54  	} else {
    55  		err = io.EOF
    56  	}
    57  	return
    58  }
    59  
// regexpPattern is a compiled pattern together with its flags. It can be
// backed by Go's standard regexp engine, by regexp2, or by both.
//
// Not goroutine-safe. Use regexpPattern.clone()
type regexpPattern struct {
	// src is the pattern text passed to compileRegexp2 when the regexp2
	// engine has to be created lazily.
	src string

	global, ignoreCase, multiline, sticky, unicode bool

	// regexpWrapper, when non-nil, is used for matches that start at
	// position 0; when nil, all matching goes through regexp2Wrapper.
	regexpWrapper *regexpWrapper
	// regexp2Wrapper may be nil while regexpWrapper is set; createRegexp2
	// fills it in on demand.
	regexp2Wrapper *regexp2Wrapper
}
    69  
    70  func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
    71  	var opts regexp2.RegexOptions = regexp2.ECMAScript
    72  	if multiline {
    73  		opts |= regexp2.Multiline
    74  	}
    75  	if ignoreCase {
    76  		opts |= regexp2.IgnoreCase
    77  	}
    78  	regexp2Pattern, err1 := regexp2.Compile(src, opts)
    79  	if err1 != nil {
    80  		return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
    81  	}
    82  
    83  	return &regexp2Wrapper{rx: regexp2Pattern}, nil
    84  }
    85  
    86  func (p *regexpPattern) createRegexp2() {
    87  	if p.regexp2Wrapper != nil {
    88  		return
    89  	}
    90  	rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
    91  	if err != nil {
    92  		// At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
    93  		panic(err)
    94  	}
    95  	p.regexp2Wrapper = rx
    96  }
    97  
    98  func buildUTF8PosMap(s unicodeString) (positionMap, string) {
    99  	pm := make(positionMap, 0, s.Length())
   100  	rd := s.Reader()
   101  	sPos, utf8Pos := 0, 0
   102  	var sb strings.Builder
   103  	for {
   104  		r, size, err := rd.ReadRune()
   105  		if err == io.EOF {
   106  			break
   107  		}
   108  		if err != nil {
   109  			// the string contains invalid UTF-16, bailing out
   110  			return nil, ""
   111  		}
   112  		utf8Size, _ := sb.WriteRune(r)
   113  		sPos += size
   114  		utf8Pos += utf8Size
   115  		pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
   116  	}
   117  	return pm, sb.String()
   118  }
   119  
   120  func (p *regexpPattern) findSubmatchIndex(s String, start int) []int {
   121  	if p.regexpWrapper == nil {
   122  		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
   123  	}
   124  	if start != 0 {
   125  		// Unfortunately Go's regexp library does not allow starting from an arbitrary position.
   126  		// If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
   127  		// work correctly.
   128  		p.createRegexp2()
   129  		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
   130  	}
   131  	return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
   132  }
   133  
   134  func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int {
   135  	if p.regexpWrapper == nil {
   136  		return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
   137  	}
   138  	if start == 0 {
   139  		a, u := devirtualizeString(s)
   140  		if u == nil {
   141  			return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky)
   142  		}
   143  		if limit == 1 {
   144  			result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode)
   145  			if result == nil {
   146  				return nil
   147  			}
   148  			return [][]int{result}
   149  		}
   150  		// Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
   151  		// input.
   152  		if p.unicode {
   153  			// Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
   154  			pm, str := buildUTF8PosMap(u)
   155  			if pm != nil {
   156  				res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
   157  				for _, result := range res {
   158  					for i, idx := range result {
   159  						result[i] = pm.get(idx)
   160  					}
   161  				}
   162  				return res
   163  			}
   164  		}
   165  	}
   166  
   167  	p.createRegexp2()
   168  	return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
   169  }
   170  
   171  // clone creates a copy of the regexpPattern which can be used concurrently.
   172  func (p *regexpPattern) clone() *regexpPattern {
   173  	ret := &regexpPattern{
   174  		src:        p.src,
   175  		global:     p.global,
   176  		ignoreCase: p.ignoreCase,
   177  		multiline:  p.multiline,
   178  		sticky:     p.sticky,
   179  		unicode:    p.unicode,
   180  	}
   181  	if p.regexpWrapper != nil {
   182  		ret.regexpWrapper = p.regexpWrapper.clone()
   183  	}
   184  	if p.regexp2Wrapper != nil {
   185  		ret.regexp2Wrapper = p.regexp2Wrapper.clone()
   186  	}
   187  	return ret
   188  }
   189  
// regexpObject is the object implementation backing a regular expression
// value.
type regexpObject struct {
	baseObject
	// pattern is the compiled pattern used by exec/test.
	pattern *regexpPattern
	// source is the pattern source as a String value.
	source String

	// standard is set to true by init and cleared when the prototype is
	// replaced, a string-keyed property is defined or deleted, "exec" is
	// assigned, or one of the match/matchAll/search/split/replace symbols is
	// set or defined on the object.
	// NOTE(review): presumably consulted elsewhere to decide whether built-in
	// fast paths are safe — confirm against callers.
	standard bool
}
   197  
   198  func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) {
   199  	if fullUnicode {
   200  		return r.findSubmatchIndexUnicode(s, start, doCache)
   201  	}
   202  	return r.findSubmatchIndexUTF16(s, start, doCache)
   203  }
   204  
   205  func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
   206  	wrapped := r.rx
   207  	cache := r.cache
   208  	if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
   209  		runes = cache.runes
   210  	} else {
   211  		runes = s.utf16Runes()
   212  		cache = nil
   213  	}
   214  	match, err = wrapped.FindRunesMatchStartingAt(runes, start)
   215  	if doCache && match != nil && err == nil {
   216  		if cache == nil {
   217  			if r.cache == nil {
   218  				r.cache = new(regexp2MatchCache)
   219  			}
   220  			*r.cache = regexp2MatchCache{
   221  				target: s,
   222  				runes:  runes,
   223  			}
   224  		}
   225  	} else {
   226  		r.cache = nil
   227  	}
   228  	return
   229  }
   230  
   231  func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) {
   232  	match, _, err := r.findUTF16Cached(s, start, doCache)
   233  	if err != nil {
   234  		return
   235  	}
   236  
   237  	if match == nil {
   238  		return
   239  	}
   240  	groups := match.Groups()
   241  
   242  	result = make([]int, 0, len(groups)<<1)
   243  	for _, group := range groups {
   244  		if len(group.Captures) > 0 {
   245  			result = append(result, group.Index, group.Index+group.Length)
   246  		} else {
   247  			result = append(result, -1, 0)
   248  		}
   249  	}
   250  	return
   251  }
   252  
   253  func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
   254  	var (
   255  		runes       []rune
   256  		mappedStart int
   257  		splitPair   bool
   258  		savedRune   rune
   259  	)
   260  	wrapped := r.rx
   261  	cache := r.cache
   262  	if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
   263  		runes, posMap = cache.runes, cache.posMap
   264  		mappedStart, splitPair = posMapReverseLookup(posMap, start)
   265  	} else {
   266  		posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start)
   267  		cache = nil
   268  	}
   269  	if splitPair {
   270  		// temporarily set the rune at mappedStart to the second code point of the pair
   271  		_, second := utf16.EncodeRune(runes[mappedStart])
   272  		savedRune, runes[mappedStart] = runes[mappedStart], second
   273  	}
   274  	match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
   275  	if doCache && match != nil && err == nil {
   276  		if splitPair {
   277  			runes[mappedStart] = savedRune
   278  		}
   279  		if cache == nil {
   280  			if r.cache == nil {
   281  				r.cache = new(regexp2MatchCache)
   282  			}
   283  			*r.cache = regexp2MatchCache{
   284  				target: s,
   285  				runes:  runes,
   286  				posMap: posMap,
   287  			}
   288  		}
   289  	} else {
   290  		r.cache = nil
   291  	}
   292  
   293  	return
   294  }
   295  
   296  func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) {
   297  	match, posMap, err := r.findUnicodeCached(s, start, doCache)
   298  	if match == nil || err != nil {
   299  		return
   300  	}
   301  
   302  	groups := match.Groups()
   303  
   304  	result = make([]int, 0, len(groups)<<1)
   305  	for _, group := range groups {
   306  		if len(group.Captures) > 0 {
   307  			result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
   308  		} else {
   309  			result = append(result, -1, 0)
   310  		}
   311  	}
   312  	return
   313  }
   314  
   315  func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int {
   316  	wrapped := r.rx
   317  	match, runes, err := r.findUTF16Cached(s, start, false)
   318  	if match == nil || err != nil {
   319  		return nil
   320  	}
   321  	if limit < 0 {
   322  		limit = len(runes) + 1
   323  	}
   324  	results := make([][]int, 0, limit)
   325  	for match != nil {
   326  		groups := match.Groups()
   327  
   328  		result := make([]int, 0, len(groups)<<1)
   329  
   330  		for _, group := range groups {
   331  			if len(group.Captures) > 0 {
   332  				startPos := group.Index
   333  				endPos := group.Index + group.Length
   334  				result = append(result, startPos, endPos)
   335  			} else {
   336  				result = append(result, -1, 0)
   337  			}
   338  		}
   339  
   340  		if sticky && len(result) > 1 {
   341  			if result[0] != start {
   342  				break
   343  			}
   344  			start = result[1]
   345  		}
   346  
   347  		results = append(results, result)
   348  		limit--
   349  		if limit <= 0 {
   350  			break
   351  		}
   352  		match, err = wrapped.FindNextMatch(match)
   353  		if err != nil {
   354  			return nil
   355  		}
   356  	}
   357  	return results
   358  }
   359  
   360  func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
   361  	posMap = make([]int, 0, l+1)
   362  	curPos := 0
   363  	runes = make([]rune, 0, l)
   364  	startFound := false
   365  	for {
   366  		if !startFound {
   367  			if curPos == start {
   368  				mappedStart = len(runes)
   369  				startFound = true
   370  			}
   371  			if curPos > start {
   372  				// start position splits a surrogate pair
   373  				mappedStart = len(runes) - 1
   374  				splitPair = true
   375  				startFound = true
   376  			}
   377  		}
   378  		rn, size, err := rd.ReadRune()
   379  		if err != nil {
   380  			break
   381  		}
   382  		runes = append(runes, rn)
   383  		posMap = append(posMap, curPos)
   384  		curPos += size
   385  	}
   386  	posMap = append(posMap, curPos)
   387  	return
   388  }
   389  
   390  func posMapReverseLookup(posMap []int, pos int) (int, bool) {
   391  	mapped := sort.SearchInts(posMap, pos)
   392  	if mapped < len(posMap) && posMap[mapped] != pos {
   393  		return mapped - 1, true
   394  	}
   395  	return mapped, false
   396  }
   397  
   398  func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
   399  	wrapped := r.rx
   400  	if limit < 0 {
   401  		limit = len(s) + 1
   402  	}
   403  	results := make([][]int, 0, limit)
   404  	match, posMap, err := r.findUnicodeCached(s, start, false)
   405  	if err != nil {
   406  		return nil
   407  	}
   408  	for match != nil {
   409  		groups := match.Groups()
   410  
   411  		result := make([]int, 0, len(groups)<<1)
   412  
   413  		for _, group := range groups {
   414  			if len(group.Captures) > 0 {
   415  				start := posMap[group.Index]
   416  				end := posMap[group.Index+group.Length]
   417  				result = append(result, start, end)
   418  			} else {
   419  				result = append(result, -1, 0)
   420  			}
   421  		}
   422  
   423  		if sticky && len(result) > 1 {
   424  			if result[0] != start {
   425  				break
   426  			}
   427  			start = result[1]
   428  		}
   429  
   430  		results = append(results, result)
   431  		match, err = wrapped.FindNextMatch(match)
   432  		if err != nil {
   433  			return nil
   434  		}
   435  	}
   436  	return results
   437  }
   438  
   439  func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int {
   440  	a, u := devirtualizeString(s)
   441  	if u != nil {
   442  		if fullUnicode {
   443  			return r.findAllSubmatchIndexUnicode(u, start, limit, sticky)
   444  		}
   445  		return r.findAllSubmatchIndexUTF16(u, start, limit, sticky)
   446  	}
   447  	return r.findAllSubmatchIndexUTF16(a, start, limit, sticky)
   448  }
   449  
   450  func (r *regexp2Wrapper) clone() *regexp2Wrapper {
   451  	return &regexp2Wrapper{
   452  		rx: r.rx,
   453  	}
   454  }
   455  
   456  func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
   457  	wrapped := (*regexp.Regexp)(r)
   458  	results = wrapped.FindAllStringSubmatchIndex(s, limit)
   459  	pos := 0
   460  	if sticky {
   461  		for i, result := range results {
   462  			if len(result) > 1 {
   463  				if result[0] != pos {
   464  					return results[:i]
   465  				}
   466  				pos = result[1]
   467  			}
   468  		}
   469  	}
   470  	return
   471  }
   472  
   473  func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int {
   474  	a, u := devirtualizeString(s)
   475  	if u != nil {
   476  		return r.findSubmatchIndexUnicode(u, fullUnicode)
   477  	}
   478  	return r.findSubmatchIndexASCII(string(a))
   479  }
   480  
   481  func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
   482  	wrapped := (*regexp.Regexp)(r)
   483  	return wrapped.FindStringSubmatchIndex(s)
   484  }
   485  
   486  func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
   487  	wrapped := (*regexp.Regexp)(r)
   488  	if fullUnicode {
   489  		posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0)
   490  		res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
   491  		for i, item := range res {
   492  			if item >= 0 {
   493  				res[i] = posMap[item]
   494  			}
   495  		}
   496  		return res
   497  	}
   498  	return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader())
   499  }
   500  
// clone returns the receiver itself; no copy is made. The wrapper holds no
// state beyond the underlying regexp.Regexp, which the Go regexp package
// documents as safe for concurrent use by multiple goroutines.
func (r *regexpWrapper) clone() *regexpWrapper {
	return r
}
   504  
   505  func (r *regexpObject) execResultToArray(target String, result []int) Value {
   506  	captureCount := len(result) >> 1
   507  	valueArray := make([]Value, captureCount)
   508  	matchIndex := result[0]
   509  	valueArray[0] = target.Substring(result[0], result[1])
   510  	lowerBound := 0
   511  	for index := 1; index < captureCount; index++ {
   512  		offset := index << 1
   513  		if result[offset] >= 0 && result[offset+1] >= lowerBound {
   514  			valueArray[index] = target.Substring(result[offset], result[offset+1])
   515  			lowerBound = result[offset]
   516  		} else {
   517  			valueArray[index] = _undefined
   518  		}
   519  	}
   520  	match := r.val.runtime.newArrayValues(valueArray)
   521  	match.self.setOwnStr("input", target, false)
   522  	match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
   523  	return match
   524  }
   525  
   526  func (r *regexpObject) getLastIndex() int64 {
   527  	lastIndex := toLength(r.getStr("lastIndex", nil))
   528  	if !r.pattern.global && !r.pattern.sticky {
   529  		return 0
   530  	}
   531  	return lastIndex
   532  }
   533  
   534  func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
   535  	if r.pattern.sticky {
   536  		if firstResult == nil || int64(firstResult[0]) != index {
   537  			r.setOwnStr("lastIndex", intToValue(0), true)
   538  			return false
   539  		}
   540  	} else {
   541  		if firstResult == nil {
   542  			if r.pattern.global {
   543  				r.setOwnStr("lastIndex", intToValue(0), true)
   544  			}
   545  			return false
   546  		}
   547  	}
   548  
   549  	if r.pattern.global || r.pattern.sticky {
   550  		r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
   551  	}
   552  	return true
   553  }
   554  
   555  func (r *regexpObject) execRegexp(target String) (match bool, result []int) {
   556  	index := r.getLastIndex()
   557  	if index >= 0 && index <= int64(target.Length()) {
   558  		result = r.pattern.findSubmatchIndex(target, int(index))
   559  	}
   560  	match = r.updateLastIndex(index, result, result)
   561  	return
   562  }
   563  
   564  func (r *regexpObject) exec(target String) Value {
   565  	match, result := r.execRegexp(target)
   566  	if match {
   567  		return r.execResultToArray(target, result)
   568  	}
   569  	return _null
   570  }
   571  
   572  func (r *regexpObject) test(target String) bool {
   573  	match, _ := r.execRegexp(target)
   574  	return match
   575  }
   576  
   577  func (r *regexpObject) clone() *regexpObject {
   578  	r1 := r.val.runtime.newRegexpObject(r.prototype)
   579  	r1.source = r.source
   580  	r1.pattern = r.pattern
   581  
   582  	return r1
   583  }
   584  
// init initialises the embedded base object, marks the regexp as standard
// (unmodified), and installs the "lastIndex" property with an initial
// value of 0.
func (r *regexpObject) init() {
	r.baseObject.init()
	r.standard = true
	// NOTE(review): the three flags are presumably writable=true,
	// enumerable=false, configurable=false — confirm against _putProp.
	r._putProp("lastIndex", intToValue(0), true, false, false)
}
   590  
   591  func (r *regexpObject) setProto(proto *Object, throw bool) bool {
   592  	res := r.baseObject.setProto(proto, throw)
   593  	if res {
   594  		r.standard = false
   595  	}
   596  	return res
   597  }
   598  
   599  func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
   600  	res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
   601  	if res {
   602  		r.standard = false
   603  	}
   604  	return res
   605  }
   606  
   607  func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
   608  	res := r.baseObject.defineOwnPropertySym(name, desc, throw)
   609  	if res && r.standard {
   610  		switch name {
   611  		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
   612  			r.standard = false
   613  		}
   614  	}
   615  	return res
   616  }
   617  
   618  func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
   619  	res := r.baseObject.deleteStr(name, throw)
   620  	if res {
   621  		r.standard = false
   622  	}
   623  	return res
   624  }
   625  
   626  func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
   627  	res := r.baseObject.setOwnStr(name, value, throw)
   628  	if res && r.standard && name == "exec" {
   629  		r.standard = false
   630  	}
   631  	return res
   632  }
   633  
   634  func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
   635  	res := r.baseObject.setOwnSym(name, value, throw)
   636  	if res && r.standard {
   637  		switch name {
   638  		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
   639  			r.standard = false
   640  		}
   641  	}
   642  	return res
   643  }
   644  

View as plain text