...

Source file src/github.com/dlclark/regexp2/syntax/prefix.go

Documentation: github.com/dlclark/regexp2/syntax

     1  package syntax
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"strconv"
     7  	"unicode"
     8  	"unicode/utf8"
     9  )
    10  
    11  type Prefix struct {
    12  	PrefixStr       []rune
    13  	PrefixSet       CharSet
    14  	CaseInsensitive bool
    15  }
    16  
    17  // It takes a RegexTree and computes the set of chars that can start it.
    18  func getFirstCharsPrefix(tree *RegexTree) *Prefix {
    19  	s := regexFcd{
    20  		fcStack:  make([]regexFc, 32),
    21  		intStack: make([]int, 32),
    22  	}
    23  	fc := s.regexFCFromRegexTree(tree)
    24  
    25  	if fc == nil || fc.nullable || fc.cc.IsEmpty() {
    26  		return nil
    27  	}
    28  	fcSet := fc.getFirstChars()
    29  	return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
    30  }
    31  
    32  type regexFcd struct {
    33  	intStack        []int
    34  	intDepth        int
    35  	fcStack         []regexFc
    36  	fcDepth         int
    37  	skipAllChildren bool // don't process any more children at the current level
    38  	skipchild       bool // don't process the current child.
    39  	failed          bool
    40  }
    41  
    42  /*
    43   * The main FC computation. It does a shortcutted depth-first walk
    44   * through the tree and calls CalculateFC to emits code before
    45   * and after each child of an interior node, and at each leaf.
    46   */
    47  func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
    48  	curNode := tree.root
    49  	curChild := 0
    50  
    51  	for {
    52  		if len(curNode.children) == 0 {
    53  			// This is a leaf node
    54  			s.calculateFC(curNode.t, curNode, 0)
    55  		} else if curChild < len(curNode.children) && !s.skipAllChildren {
    56  			// This is an interior node, and we have more children to analyze
    57  			s.calculateFC(curNode.t|beforeChild, curNode, curChild)
    58  
    59  			if !s.skipchild {
    60  				curNode = curNode.children[curChild]
    61  				// this stack is how we get a depth first walk of the tree.
    62  				s.pushInt(curChild)
    63  				curChild = 0
    64  			} else {
    65  				curChild++
    66  				s.skipchild = false
    67  			}
    68  			continue
    69  		}
    70  
    71  		// This is an interior node where we've finished analyzing all the children, or
    72  		// the end of a leaf node.
    73  		s.skipAllChildren = false
    74  
    75  		if s.intIsEmpty() {
    76  			break
    77  		}
    78  
    79  		curChild = s.popInt()
    80  		curNode = curNode.next
    81  
    82  		s.calculateFC(curNode.t|afterChild, curNode, curChild)
    83  		if s.failed {
    84  			return nil
    85  		}
    86  
    87  		curChild++
    88  	}
    89  
    90  	if s.fcIsEmpty() {
    91  		return nil
    92  	}
    93  
    94  	return s.popFC()
    95  }
    96  
    97  // To avoid recursion, we use a simple integer stack.
    98  // This is the push.
    99  func (s *regexFcd) pushInt(I int) {
   100  	if s.intDepth >= len(s.intStack) {
   101  		expanded := make([]int, s.intDepth*2)
   102  		copy(expanded, s.intStack)
   103  		s.intStack = expanded
   104  	}
   105  
   106  	s.intStack[s.intDepth] = I
   107  	s.intDepth++
   108  }
   109  
   110  // True if the stack is empty.
   111  func (s *regexFcd) intIsEmpty() bool {
   112  	return s.intDepth == 0
   113  }
   114  
   115  // This is the pop.
   116  func (s *regexFcd) popInt() int {
   117  	s.intDepth--
   118  	return s.intStack[s.intDepth]
   119  }
   120  
   121  // We also use a stack of RegexFC objects.
   122  // This is the push.
   123  func (s *regexFcd) pushFC(fc regexFc) {
   124  	if s.fcDepth >= len(s.fcStack) {
   125  		expanded := make([]regexFc, s.fcDepth*2)
   126  		copy(expanded, s.fcStack)
   127  		s.fcStack = expanded
   128  	}
   129  
   130  	s.fcStack[s.fcDepth] = fc
   131  	s.fcDepth++
   132  }
   133  
   134  // True if the stack is empty.
   135  func (s *regexFcd) fcIsEmpty() bool {
   136  	return s.fcDepth == 0
   137  }
   138  
   139  // This is the pop.
   140  func (s *regexFcd) popFC() *regexFc {
   141  	s.fcDepth--
   142  	return &s.fcStack[s.fcDepth]
   143  }
   144  
   145  // This is the top.
   146  func (s *regexFcd) topFC() *regexFc {
   147  	return &s.fcStack[s.fcDepth-1]
   148  }
   149  
   150  // Called in Beforechild to prevent further processing of the current child
   151  func (s *regexFcd) skipChild() {
   152  	s.skipchild = true
   153  }
   154  
   155  // FC computation and shortcut cases for each node type
   156  func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
   157  	//fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
   158  	ci := false
   159  	rtl := false
   160  
   161  	if nt <= ntRef {
   162  		if (node.options & IgnoreCase) != 0 {
   163  			ci = true
   164  		}
   165  		if (node.options & RightToLeft) != 0 {
   166  			rtl = true
   167  		}
   168  	}
   169  
   170  	switch nt {
   171  	case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
   172  		break
   173  
   174  	case ntTestgroup | beforeChild:
   175  		if CurIndex == 0 {
   176  			s.skipChild()
   177  		}
   178  		break
   179  
   180  	case ntEmpty:
   181  		s.pushFC(regexFc{nullable: true})
   182  		break
   183  
   184  	case ntConcatenate | afterChild:
   185  		if CurIndex != 0 {
   186  			child := s.popFC()
   187  			cumul := s.topFC()
   188  
   189  			s.failed = !cumul.addFC(*child, true)
   190  		}
   191  
   192  		fc := s.topFC()
   193  		if !fc.nullable {
   194  			s.skipAllChildren = true
   195  		}
   196  		break
   197  
   198  	case ntTestgroup | afterChild:
   199  		if CurIndex > 1 {
   200  			child := s.popFC()
   201  			cumul := s.topFC()
   202  
   203  			s.failed = !cumul.addFC(*child, false)
   204  		}
   205  		break
   206  
   207  	case ntAlternate | afterChild, ntTestref | afterChild:
   208  		if CurIndex != 0 {
   209  			child := s.popFC()
   210  			cumul := s.topFC()
   211  
   212  			s.failed = !cumul.addFC(*child, false)
   213  		}
   214  		break
   215  
   216  	case ntLoop | afterChild, ntLazyloop | afterChild:
   217  		if node.m == 0 {
   218  			fc := s.topFC()
   219  			fc.nullable = true
   220  		}
   221  		break
   222  
   223  	case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
   224  		break
   225  
   226  	case ntRequire | beforeChild, ntPrevent | beforeChild:
   227  		s.skipChild()
   228  		s.pushFC(regexFc{nullable: true})
   229  		break
   230  
   231  	case ntRequire | afterChild, ntPrevent | afterChild:
   232  		break
   233  
   234  	case ntOne, ntNotone:
   235  		s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
   236  		break
   237  
   238  	case ntOneloop, ntOnelazy:
   239  		s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
   240  		break
   241  
   242  	case ntNotoneloop, ntNotonelazy:
   243  		s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
   244  		break
   245  
   246  	case ntMulti:
   247  		if len(node.str) == 0 {
   248  			s.pushFC(regexFc{nullable: true})
   249  		} else if !rtl {
   250  			s.pushFC(newRegexFc(node.str[0], false, false, ci))
   251  		} else {
   252  			s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
   253  		}
   254  		break
   255  
   256  	case ntSet:
   257  		s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
   258  		break
   259  
   260  	case ntSetloop, ntSetlazy:
   261  		s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
   262  		break
   263  
   264  	case ntRef:
   265  		s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
   266  		break
   267  
   268  	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
   269  		s.pushFC(regexFc{nullable: true})
   270  		break
   271  
   272  	default:
   273  		panic(fmt.Sprintf("unexpected op code: %v", nt))
   274  	}
   275  }
   276  
   277  type regexFc struct {
   278  	cc              CharSet
   279  	nullable        bool
   280  	caseInsensitive bool
   281  }
   282  
   283  func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
   284  	r := regexFc{
   285  		caseInsensitive: caseInsensitive,
   286  		nullable:        nullable,
   287  	}
   288  	if not {
   289  		if ch > 0 {
   290  			r.cc.addRange('\x00', ch-1)
   291  		}
   292  		if ch < 0xFFFF {
   293  			r.cc.addRange(ch+1, utf8.MaxRune)
   294  		}
   295  	} else {
   296  		r.cc.addRange(ch, ch)
   297  	}
   298  	return r
   299  }
   300  
   301  func (r *regexFc) getFirstChars() CharSet {
   302  	if r.caseInsensitive {
   303  		r.cc.addLowercase()
   304  	}
   305  
   306  	return r.cc
   307  }
   308  
   309  func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
   310  	if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
   311  		return false
   312  	}
   313  
   314  	if concatenate {
   315  		if !r.nullable {
   316  			return true
   317  		}
   318  
   319  		if !fc.nullable {
   320  			r.nullable = false
   321  		}
   322  	} else {
   323  		if fc.nullable {
   324  			r.nullable = true
   325  		}
   326  	}
   327  
   328  	r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
   329  	r.cc.addSet(fc.cc)
   330  
   331  	return true
   332  }
   333  
   334  // This is a related computation: it takes a RegexTree and computes the
   335  // leading substring if it sees one. It's quite trivial and gives up easily.
   336  func getPrefix(tree *RegexTree) *Prefix {
   337  	var concatNode *regexNode
   338  	nextChild := 0
   339  
   340  	curNode := tree.root
   341  
   342  	for {
   343  		switch curNode.t {
   344  		case ntConcatenate:
   345  			if len(curNode.children) > 0 {
   346  				concatNode = curNode
   347  				nextChild = 0
   348  			}
   349  
   350  		case ntGreedy, ntCapture:
   351  			curNode = curNode.children[0]
   352  			concatNode = nil
   353  			continue
   354  
   355  		case ntOneloop, ntOnelazy:
   356  			if curNode.m > 0 {
   357  				return &Prefix{
   358  					PrefixStr:       repeat(curNode.ch, curNode.m),
   359  					CaseInsensitive: (curNode.options & IgnoreCase) != 0,
   360  				}
   361  			}
   362  			return nil
   363  
   364  		case ntOne:
   365  			return &Prefix{
   366  				PrefixStr:       []rune{curNode.ch},
   367  				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
   368  			}
   369  
   370  		case ntMulti:
   371  			return &Prefix{
   372  				PrefixStr:       curNode.str,
   373  				CaseInsensitive: (curNode.options & IgnoreCase) != 0,
   374  			}
   375  
   376  		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
   377  			ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
   378  
   379  		default:
   380  			return nil
   381  		}
   382  
   383  		if concatNode == nil || nextChild >= len(concatNode.children) {
   384  			return nil
   385  		}
   386  
   387  		curNode = concatNode.children[nextChild]
   388  		nextChild++
   389  	}
   390  }
   391  
   392  // repeat the rune r, c times... up to the max of MaxPrefixSize
   393  func repeat(r rune, c int) []rune {
   394  	if c > MaxPrefixSize {
   395  		c = MaxPrefixSize
   396  	}
   397  
   398  	ret := make([]rune, c)
   399  
   400  	// binary growth using copy for speed
   401  	ret[0] = r
   402  	bp := 1
   403  	for bp < len(ret) {
   404  		copy(ret[bp:], ret[:bp])
   405  		bp *= 2
   406  	}
   407  
   408  	return ret
   409  }
   410  
   411  // BmPrefix precomputes the Boyer-Moore
   412  // tables for fast string scanning. These tables allow
   413  // you to scan for the first occurrence of a string within
   414  // a large body of text without examining every character.
   415  // The performance of the heuristic depends on the actual
   416  // string and the text being searched, but usually, the longer
   417  // the string that is being searched for, the fewer characters
   418  // need to be examined.
   419  type BmPrefix struct {
   420  	positive        []int
   421  	negativeASCII   []int
   422  	negativeUnicode [][]int
   423  	pattern         []rune
   424  	lowASCII        rune
   425  	highASCII       rune
   426  	rightToLeft     bool
   427  	caseInsensitive bool
   428  }
   429  
   430  func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
   431  
   432  	b := &BmPrefix{
   433  		rightToLeft:     rightToLeft,
   434  		caseInsensitive: caseInsensitive,
   435  		pattern:         pattern,
   436  	}
   437  
   438  	if caseInsensitive {
   439  		for i := 0; i < len(b.pattern); i++ {
   440  			// We do the ToLower character by character for consistency.  With surrogate chars, doing
   441  			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
   442  			// linguistically, but since Regex doesn't support surrogates, it's more important to be
   443  			// consistent.
   444  
   445  			b.pattern[i] = unicode.ToLower(b.pattern[i])
   446  		}
   447  	}
   448  
   449  	var beforefirst, last, bump int
   450  	var scan, match int
   451  
   452  	if !rightToLeft {
   453  		beforefirst = -1
   454  		last = len(b.pattern) - 1
   455  		bump = 1
   456  	} else {
   457  		beforefirst = len(b.pattern)
   458  		last = 0
   459  		bump = -1
   460  	}
   461  
   462  	// PART I - the good-suffix shift table
   463  	//
   464  	// compute the positive requirement:
   465  	// if char "i" is the first one from the right that doesn't match,
   466  	// then we know the matcher can advance by _positive[i].
   467  	//
   468  	// This algorithm is a simplified variant of the standard
   469  	// Boyer-Moore good suffix calculation.
   470  
   471  	b.positive = make([]int, len(b.pattern))
   472  
   473  	examine := last
   474  	ch := b.pattern[examine]
   475  	b.positive[examine] = bump
   476  	examine -= bump
   477  
   478  Outerloop:
   479  	for {
   480  		// find an internal char (examine) that matches the tail
   481  
   482  		for {
   483  			if examine == beforefirst {
   484  				break Outerloop
   485  			}
   486  			if b.pattern[examine] == ch {
   487  				break
   488  			}
   489  			examine -= bump
   490  		}
   491  
   492  		match = last
   493  		scan = examine
   494  
   495  		// find the length of the match
   496  		for {
   497  			if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
   498  				// at the end of the match, note the difference in _positive
   499  				// this is not the length of the match, but the distance from the internal match
   500  				// to the tail suffix.
   501  				if b.positive[match] == 0 {
   502  					b.positive[match] = match - scan
   503  				}
   504  
   505  				// System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
   506  
   507  				break
   508  			}
   509  
   510  			scan -= bump
   511  			match -= bump
   512  		}
   513  
   514  		examine -= bump
   515  	}
   516  
   517  	match = last - bump
   518  
   519  	// scan for the chars for which there are no shifts that yield a different candidate
   520  
   521  	// The inside of the if statement used to say
   522  	// "_positive[match] = last - beforefirst;"
   523  	// This is slightly less aggressive in how much we skip, but at worst it
   524  	// should mean a little more work rather than skipping a potential match.
   525  	for match != beforefirst {
   526  		if b.positive[match] == 0 {
   527  			b.positive[match] = bump
   528  		}
   529  
   530  		match -= bump
   531  	}
   532  
   533  	// PART II - the bad-character shift table
   534  	//
   535  	// compute the negative requirement:
   536  	// if char "ch" is the reject character when testing position "i",
   537  	// we can slide up by _negative[ch];
   538  	// (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
   539  	//
   540  	// the lookup table is divided into ASCII and Unicode portions;
   541  	// only those parts of the Unicode 16-bit code set that actually
   542  	// appear in the string are in the table. (Maximum size with
   543  	// Unicode is 65K; ASCII only case is 512 bytes.)
   544  
   545  	b.negativeASCII = make([]int, 128)
   546  
   547  	for i := 0; i < len(b.negativeASCII); i++ {
   548  		b.negativeASCII[i] = last - beforefirst
   549  	}
   550  
   551  	b.lowASCII = 127
   552  	b.highASCII = 0
   553  
   554  	for examine = last; examine != beforefirst; examine -= bump {
   555  		ch = b.pattern[examine]
   556  
   557  		switch {
   558  		case ch < 128:
   559  			if b.lowASCII > ch {
   560  				b.lowASCII = ch
   561  			}
   562  
   563  			if b.highASCII < ch {
   564  				b.highASCII = ch
   565  			}
   566  
   567  			if b.negativeASCII[ch] == last-beforefirst {
   568  				b.negativeASCII[ch] = last - examine
   569  			}
   570  		case ch <= 0xffff:
   571  			i, j := ch>>8, ch&0xFF
   572  
   573  			if b.negativeUnicode == nil {
   574  				b.negativeUnicode = make([][]int, 256)
   575  			}
   576  
   577  			if b.negativeUnicode[i] == nil {
   578  				newarray := make([]int, 256)
   579  
   580  				for k := 0; k < len(newarray); k++ {
   581  					newarray[k] = last - beforefirst
   582  				}
   583  
   584  				if i == 0 {
   585  					copy(newarray, b.negativeASCII)
   586  					//TODO: this line needed?
   587  					b.negativeASCII = newarray
   588  				}
   589  
   590  				b.negativeUnicode[i] = newarray
   591  			}
   592  
   593  			if b.negativeUnicode[i][j] == last-beforefirst {
   594  				b.negativeUnicode[i][j] = last - examine
   595  			}
   596  		default:
   597  			// we can't do the filter because this algo doesn't support
   598  			// unicode chars >0xffff
   599  			return nil
   600  		}
   601  	}
   602  
   603  	return b
   604  }
   605  
   606  func (b *BmPrefix) String() string {
   607  	return string(b.pattern)
   608  }
   609  
   610  // Dump returns the contents of the filter as a human readable string
   611  func (b *BmPrefix) Dump(indent string) string {
   612  	buf := &bytes.Buffer{}
   613  
   614  	fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
   615  	for i := 0; i < len(b.positive); i++ {
   616  		buf.WriteString(strconv.Itoa(b.positive[i]))
   617  		buf.WriteRune(' ')
   618  	}
   619  	buf.WriteRune('\n')
   620  
   621  	if b.negativeASCII != nil {
   622  		buf.WriteString(indent)
   623  		buf.WriteString("Negative table\n")
   624  		for i := 0; i < len(b.negativeASCII); i++ {
   625  			if b.negativeASCII[i] != len(b.pattern) {
   626  				fmt.Fprintf(buf, "%s  %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
   627  			}
   628  		}
   629  	}
   630  
   631  	return buf.String()
   632  }
   633  
   634  // Scan uses the Boyer-Moore algorithm to find the first occurrence
   635  // of the specified string within text, beginning at index, and
   636  // constrained within beglimit and endlimit.
   637  //
   638  // The direction and case-sensitivity of the match is determined
   639  // by the arguments to the RegexBoyerMoore constructor.
   640  func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
   641  	var (
   642  		defadv, test, test2         int
   643  		match, startmatch, endmatch int
   644  		bump, advance               int
   645  		chTest                      rune
   646  		unicodeLookup               []int
   647  	)
   648  
   649  	if !b.rightToLeft {
   650  		defadv = len(b.pattern)
   651  		startmatch = len(b.pattern) - 1
   652  		endmatch = 0
   653  		test = index + defadv - 1
   654  		bump = 1
   655  	} else {
   656  		defadv = -len(b.pattern)
   657  		startmatch = 0
   658  		endmatch = -defadv - 1
   659  		test = index + defadv
   660  		bump = -1
   661  	}
   662  
   663  	chMatch := b.pattern[startmatch]
   664  
   665  	for {
   666  		if test >= endlimit || test < beglimit {
   667  			return -1
   668  		}
   669  
   670  		chTest = text[test]
   671  
   672  		if b.caseInsensitive {
   673  			chTest = unicode.ToLower(chTest)
   674  		}
   675  
   676  		if chTest != chMatch {
   677  			if chTest < 128 {
   678  				advance = b.negativeASCII[chTest]
   679  			} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
   680  				unicodeLookup = b.negativeUnicode[chTest>>8]
   681  				if len(unicodeLookup) > 0 {
   682  					advance = unicodeLookup[chTest&0xFF]
   683  				} else {
   684  					advance = defadv
   685  				}
   686  			} else {
   687  				advance = defadv
   688  			}
   689  
   690  			test += advance
   691  		} else { // if (chTest == chMatch)
   692  			test2 = test
   693  			match = startmatch
   694  
   695  			for {
   696  				if match == endmatch {
   697  					if b.rightToLeft {
   698  						return test2 + 1
   699  					} else {
   700  						return test2
   701  					}
   702  				}
   703  
   704  				match -= bump
   705  				test2 -= bump
   706  
   707  				chTest = text[test2]
   708  
   709  				if b.caseInsensitive {
   710  					chTest = unicode.ToLower(chTest)
   711  				}
   712  
   713  				if chTest != b.pattern[match] {
   714  					advance = b.positive[match]
   715  					if chTest < 128 {
   716  						test2 = (match - startmatch) + b.negativeASCII[chTest]
   717  					} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
   718  						unicodeLookup = b.negativeUnicode[chTest>>8]
   719  						if len(unicodeLookup) > 0 {
   720  							test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
   721  						} else {
   722  							test += advance
   723  							break
   724  						}
   725  					} else {
   726  						test += advance
   727  						break
   728  					}
   729  
   730  					if b.rightToLeft {
   731  						if test2 < advance {
   732  							advance = test2
   733  						}
   734  					} else if test2 > advance {
   735  						advance = test2
   736  					}
   737  
   738  					test += advance
   739  					break
   740  				}
   741  			}
   742  		}
   743  	}
   744  }
   745  
   746  // When a regex is anchored, we can do a quick IsMatch test instead of a Scan
   747  func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
   748  	if !b.rightToLeft {
   749  		if index < beglimit || endlimit-index < len(b.pattern) {
   750  			return false
   751  		}
   752  
   753  		return b.matchPattern(text, index)
   754  	} else {
   755  		if index > endlimit || index-beglimit < len(b.pattern) {
   756  			return false
   757  		}
   758  
   759  		return b.matchPattern(text, index-len(b.pattern))
   760  	}
   761  }
   762  
   763  func (b *BmPrefix) matchPattern(text []rune, index int) bool {
   764  	if len(text)-index < len(b.pattern) {
   765  		return false
   766  	}
   767  
   768  	if b.caseInsensitive {
   769  		for i := 0; i < len(b.pattern); i++ {
   770  			//Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
   771  			if unicode.ToLower(text[index+i]) != b.pattern[i] {
   772  				return false
   773  			}
   774  		}
   775  		return true
   776  	} else {
   777  		for i := 0; i < len(b.pattern); i++ {
   778  			if text[index+i] != b.pattern[i] {
   779  				return false
   780  			}
   781  		}
   782  		return true
   783  	}
   784  }
   785  
   786  type AnchorLoc int16
   787  
   788  // where the regex can be pegged
   789  const (
   790  	AnchorBeginning    AnchorLoc = 0x0001
   791  	AnchorBol                    = 0x0002
   792  	AnchorStart                  = 0x0004
   793  	AnchorEol                    = 0x0008
   794  	AnchorEndZ                   = 0x0010
   795  	AnchorEnd                    = 0x0020
   796  	AnchorBoundary               = 0x0040
   797  	AnchorECMABoundary           = 0x0080
   798  )
   799  
   800  func getAnchors(tree *RegexTree) AnchorLoc {
   801  
   802  	var concatNode *regexNode
   803  	nextChild, result := 0, AnchorLoc(0)
   804  
   805  	curNode := tree.root
   806  
   807  	for {
   808  		switch curNode.t {
   809  		case ntConcatenate:
   810  			if len(curNode.children) > 0 {
   811  				concatNode = curNode
   812  				nextChild = 0
   813  			}
   814  
   815  		case ntGreedy, ntCapture:
   816  			curNode = curNode.children[0]
   817  			concatNode = nil
   818  			continue
   819  
   820  		case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
   821  			ntStart, ntEndZ, ntEnd:
   822  			return result | anchorFromType(curNode.t)
   823  
   824  		case ntEmpty, ntRequire, ntPrevent:
   825  
   826  		default:
   827  			return result
   828  		}
   829  
   830  		if concatNode == nil || nextChild >= len(concatNode.children) {
   831  			return result
   832  		}
   833  
   834  		curNode = concatNode.children[nextChild]
   835  		nextChild++
   836  	}
   837  }
   838  
   839  func anchorFromType(t nodeType) AnchorLoc {
   840  	switch t {
   841  	case ntBol:
   842  		return AnchorBol
   843  	case ntEol:
   844  		return AnchorEol
   845  	case ntBoundary:
   846  		return AnchorBoundary
   847  	case ntECMABoundary:
   848  		return AnchorECMABoundary
   849  	case ntBeginning:
   850  		return AnchorBeginning
   851  	case ntStart:
   852  		return AnchorStart
   853  	case ntEndZ:
   854  		return AnchorEndZ
   855  	case ntEnd:
   856  		return AnchorEnd
   857  	default:
   858  		return 0
   859  	}
   860  }
   861  
   862  // anchorDescription returns a human-readable description of the anchors
   863  func (anchors AnchorLoc) String() string {
   864  	buf := &bytes.Buffer{}
   865  
   866  	if 0 != (anchors & AnchorBeginning) {
   867  		buf.WriteString(", Beginning")
   868  	}
   869  	if 0 != (anchors & AnchorStart) {
   870  		buf.WriteString(", Start")
   871  	}
   872  	if 0 != (anchors & AnchorBol) {
   873  		buf.WriteString(", Bol")
   874  	}
   875  	if 0 != (anchors & AnchorBoundary) {
   876  		buf.WriteString(", Boundary")
   877  	}
   878  	if 0 != (anchors & AnchorECMABoundary) {
   879  		buf.WriteString(", ECMABoundary")
   880  	}
   881  	if 0 != (anchors & AnchorEol) {
   882  		buf.WriteString(", Eol")
   883  	}
   884  	if 0 != (anchors & AnchorEnd) {
   885  		buf.WriteString(", End")
   886  	}
   887  	if 0 != (anchors & AnchorEndZ) {
   888  		buf.WriteString(", EndZ")
   889  	}
   890  
   891  	// trim off comma
   892  	if buf.Len() >= 2 {
   893  		return buf.String()[2:]
   894  	}
   895  	return "None"
   896  }
   897  

View as plain text