...

Source file src/github.com/dlclark/regexp2/syntax/tree.go

Documentation: github.com/dlclark/regexp2/syntax

     1  package syntax
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"math"
     7  	"strconv"
     8  )
     9  
    10  type RegexTree struct {
    11  	root       *regexNode
    12  	caps       map[int]int
    13  	capnumlist []int
    14  	captop     int
    15  	Capnames   map[string]int
    16  	Caplist    []string
    17  	options    RegexOptions
    18  }
    19  
    20  // It is built into a parsed tree for a regular expression.
    21  
    22  // Implementation notes:
    23  //
    24  // Since the node tree is a temporary data structure only used
    25  // during compilation of the regexp to integer codes, it's
    26  // designed for clarity and convenience rather than
    27  // space efficiency.
    28  //
    29  // RegexNodes are built into a tree, linked by the n.children list.
    30  // Each node also has a n.parent and n.ichild member indicating
    31  // its parent and which child # it is in its parent's list.
    32  //
    33  // RegexNodes come in as many types as there are constructs in
    34  // a regular expression, for example, "concatenate", "alternate",
    35  // "one", "rept", "group". There are also node types for basic
    36  // peephole optimizations, e.g., "onerep", "notsetrep", etc.
    37  //
    38  // Because perl 5 allows "lookback" groups that scan backwards,
    39  // each node also gets a "direction". Normally the value of
    40  // boolean n.backward = false.
    41  //
    42  // During parsing, top-level nodes are also stacked onto a parse
    43  // stack (a stack of trees). For this purpose we have a n.next
    44  // pointer. [Note that to save a few bytes, we could overload the
    45  // n.parent pointer instead.]
    46  //
    47  // On the parse stack, each tree has a "role" - basically, the
    48  // nonterminal in the grammar that the parser has currently
    49  // assigned to the tree. That code is stored in n.role.
    50  //
    51  // Finally, some of the different kinds of nodes have data.
    52  // Two integers (for the looping constructs) are stored in
    53  // n.operands, an an object (either a string or a set)
    54  // is stored in n.data
    55  type regexNode struct {
    56  	t        nodeType
    57  	children []*regexNode
    58  	str      []rune
    59  	set      *CharSet
    60  	ch       rune
    61  	m        int
    62  	n        int
    63  	options  RegexOptions
    64  	next     *regexNode
    65  }
    66  
    67  type nodeType int32
    68  
    69  const (
    70  	// The following are leaves, and correspond to primitive operations
    71  
    72  	ntOnerep      nodeType = 0  // lef,back char,min,max    a {n}
    73  	ntNotonerep            = 1  // lef,back char,min,max    .{n}
    74  	ntSetrep               = 2  // lef,back set,min,max     [\d]{n}
    75  	ntOneloop              = 3  // lef,back char,min,max    a {,n}
    76  	ntNotoneloop           = 4  // lef,back char,min,max    .{,n}
    77  	ntSetloop              = 5  // lef,back set,min,max     [\d]{,n}
    78  	ntOnelazy              = 6  // lef,back char,min,max    a {,n}?
    79  	ntNotonelazy           = 7  // lef,back char,min,max    .{,n}?
    80  	ntSetlazy              = 8  // lef,back set,min,max     [\d]{,n}?
    81  	ntOne                  = 9  // lef      char            a
    82  	ntNotone               = 10 // lef      char            [^a]
    83  	ntSet                  = 11 // lef      set             [a-z\s]  \w \s \d
    84  	ntMulti                = 12 // lef      string          abcd
    85  	ntRef                  = 13 // lef      group           \#
    86  	ntBol                  = 14 //                          ^
    87  	ntEol                  = 15 //                          $
    88  	ntBoundary             = 16 //                          \b
    89  	ntNonboundary          = 17 //                          \B
    90  	ntBeginning            = 18 //                          \A
    91  	ntStart                = 19 //                          \G
    92  	ntEndZ                 = 20 //                          \Z
    93  	ntEnd                  = 21 //                          \Z
    94  
    95  	// Interior nodes do not correspond to primitive operations, but
    96  	// control structures compositing other operations
    97  
    98  	// Concat and alternate take n children, and can run forward or backwards
    99  
   100  	ntNothing     = 22 //          []
   101  	ntEmpty       = 23 //          ()
   102  	ntAlternate   = 24 //          a|b
   103  	ntConcatenate = 25 //          ab
   104  	ntLoop        = 26 // m,x      * + ? {,}
   105  	ntLazyloop    = 27 // m,x      *? +? ?? {,}?
   106  	ntCapture     = 28 // n        ()
   107  	ntGroup       = 29 //          (?:)
   108  	ntRequire     = 30 //          (?=) (?<=)
   109  	ntPrevent     = 31 //          (?!) (?<!)
   110  	ntGreedy      = 32 //          (?>) (?<)
   111  	ntTestref     = 33 //          (?(n) | )
   112  	ntTestgroup   = 34 //          (?(...) | )
   113  
   114  	ntECMABoundary    = 41 //                          \b
   115  	ntNonECMABoundary = 42 //                          \B
   116  )
   117  
   118  func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
   119  	return &regexNode{
   120  		t:       t,
   121  		options: opt,
   122  	}
   123  }
   124  
   125  func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
   126  	return &regexNode{
   127  		t:       t,
   128  		options: opt,
   129  		ch:      ch,
   130  	}
   131  }
   132  
   133  func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
   134  	return &regexNode{
   135  		t:       t,
   136  		options: opt,
   137  		str:     str,
   138  	}
   139  }
   140  
   141  func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
   142  	return &regexNode{
   143  		t:       t,
   144  		options: opt,
   145  		set:     set,
   146  	}
   147  }
   148  
   149  func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
   150  	return &regexNode{
   151  		t:       t,
   152  		options: opt,
   153  		m:       m,
   154  	}
   155  }
   156  func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
   157  	return &regexNode{
   158  		t:       t,
   159  		options: opt,
   160  		m:       m,
   161  		n:       n,
   162  	}
   163  }
   164  
   165  func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
   166  	for i := 0; i < len(n.str); i++ {
   167  		buf.WriteRune(n.str[i])
   168  	}
   169  }
   170  
   171  func (n *regexNode) addChild(child *regexNode) {
   172  	reduced := child.reduce()
   173  	n.children = append(n.children, reduced)
   174  	reduced.next = n
   175  }
   176  
   177  func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
   178  	newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
   179  	n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
   180  }
   181  
   182  // removes children including the start but not the end index
   183  func (n *regexNode) removeChildren(startIndex, endIndex int) {
   184  	n.children = append(n.children[:startIndex], n.children[endIndex:]...)
   185  }
   186  
   187  // Pass type as OneLazy or OneLoop
   188  func (n *regexNode) makeRep(t nodeType, min, max int) {
   189  	n.t += (t - ntOne)
   190  	n.m = min
   191  	n.n = max
   192  }
   193  
   194  func (n *regexNode) reduce() *regexNode {
   195  	switch n.t {
   196  	case ntAlternate:
   197  		return n.reduceAlternation()
   198  
   199  	case ntConcatenate:
   200  		return n.reduceConcatenation()
   201  
   202  	case ntLoop, ntLazyloop:
   203  		return n.reduceRep()
   204  
   205  	case ntGroup:
   206  		return n.reduceGroup()
   207  
   208  	case ntSet, ntSetloop:
   209  		return n.reduceSet()
   210  
   211  	default:
   212  		return n
   213  	}
   214  }
   215  
   216  // Basic optimization. Single-letter alternations can be replaced
   217  // by faster set specifications, and nested alternations with no
   218  // intervening operators can be flattened:
   219  //
   220  // a|b|c|def|g|h -> [a-c]|def|[gh]
   221  // apple|(?:orange|pear)|grape -> apple|orange|pear|grape
   222  func (n *regexNode) reduceAlternation() *regexNode {
   223  	if len(n.children) == 0 {
   224  		return newRegexNode(ntNothing, n.options)
   225  	}
   226  
   227  	wasLastSet := false
   228  	lastNodeCannotMerge := false
   229  	var optionsLast RegexOptions
   230  	var i, j int
   231  
   232  	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
   233  		at := n.children[i]
   234  
   235  		if j < i {
   236  			n.children[j] = at
   237  		}
   238  
   239  		for {
   240  			if at.t == ntAlternate {
   241  				for k := 0; k < len(at.children); k++ {
   242  					at.children[k].next = n
   243  				}
   244  				n.insertChildren(i+1, at.children)
   245  
   246  				j--
   247  			} else if at.t == ntSet || at.t == ntOne {
   248  				// Cannot merge sets if L or I options differ, or if either are negated.
   249  				optionsAt := at.options & (RightToLeft | IgnoreCase)
   250  
   251  				if at.t == ntSet {
   252  					if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
   253  						wasLastSet = true
   254  						lastNodeCannotMerge = !at.set.IsMergeable()
   255  						optionsLast = optionsAt
   256  						break
   257  					}
   258  				} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
   259  					wasLastSet = true
   260  					lastNodeCannotMerge = false
   261  					optionsLast = optionsAt
   262  					break
   263  				}
   264  
   265  				// The last node was a Set or a One, we're a Set or One and our options are the same.
   266  				// Merge the two nodes.
   267  				j--
   268  				prev := n.children[j]
   269  
   270  				var prevCharClass *CharSet
   271  				if prev.t == ntOne {
   272  					prevCharClass = &CharSet{}
   273  					prevCharClass.addChar(prev.ch)
   274  				} else {
   275  					prevCharClass = prev.set
   276  				}
   277  
   278  				if at.t == ntOne {
   279  					prevCharClass.addChar(at.ch)
   280  				} else {
   281  					prevCharClass.addSet(*at.set)
   282  				}
   283  
   284  				prev.t = ntSet
   285  				prev.set = prevCharClass
   286  			} else if at.t == ntNothing {
   287  				j--
   288  			} else {
   289  				wasLastSet = false
   290  				lastNodeCannotMerge = false
   291  			}
   292  			break
   293  		}
   294  	}
   295  
   296  	if j < i {
   297  		n.removeChildren(j, i)
   298  	}
   299  
   300  	return n.stripEnation(ntNothing)
   301  }
   302  
   303  // Basic optimization. Adjacent strings can be concatenated.
   304  //
   305  // (?:abc)(?:def) -> abcdef
   306  func (n *regexNode) reduceConcatenation() *regexNode {
   307  	// Eliminate empties and concat adjacent strings/chars
   308  
   309  	var optionsLast RegexOptions
   310  	var optionsAt RegexOptions
   311  	var i, j int
   312  
   313  	if len(n.children) == 0 {
   314  		return newRegexNode(ntEmpty, n.options)
   315  	}
   316  
   317  	wasLastString := false
   318  
   319  	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
   320  		var at, prev *regexNode
   321  
   322  		at = n.children[i]
   323  
   324  		if j < i {
   325  			n.children[j] = at
   326  		}
   327  
   328  		if at.t == ntConcatenate &&
   329  			((at.options & RightToLeft) == (n.options & RightToLeft)) {
   330  			for k := 0; k < len(at.children); k++ {
   331  				at.children[k].next = n
   332  			}
   333  
   334  			//insert at.children at i+1 index in n.children
   335  			n.insertChildren(i+1, at.children)
   336  
   337  			j--
   338  		} else if at.t == ntMulti || at.t == ntOne {
   339  			// Cannot merge strings if L or I options differ
   340  			optionsAt = at.options & (RightToLeft | IgnoreCase)
   341  
   342  			if !wasLastString || optionsLast != optionsAt {
   343  				wasLastString = true
   344  				optionsLast = optionsAt
   345  				continue
   346  			}
   347  
   348  			j--
   349  			prev = n.children[j]
   350  
   351  			if prev.t == ntOne {
   352  				prev.t = ntMulti
   353  				prev.str = []rune{prev.ch}
   354  			}
   355  
   356  			if (optionsAt & RightToLeft) == 0 {
   357  				if at.t == ntOne {
   358  					prev.str = append(prev.str, at.ch)
   359  				} else {
   360  					prev.str = append(prev.str, at.str...)
   361  				}
   362  			} else {
   363  				if at.t == ntOne {
   364  					// insert at the front by expanding our slice, copying the data over, and then setting the value
   365  					prev.str = append(prev.str, 0)
   366  					copy(prev.str[1:], prev.str)
   367  					prev.str[0] = at.ch
   368  				} else {
   369  					//insert at the front...this one we'll make a new slice and copy both into it
   370  					merge := make([]rune, len(prev.str)+len(at.str))
   371  					copy(merge, at.str)
   372  					copy(merge[len(at.str):], prev.str)
   373  					prev.str = merge
   374  				}
   375  			}
   376  		} else if at.t == ntEmpty {
   377  			j--
   378  		} else {
   379  			wasLastString = false
   380  		}
   381  	}
   382  
   383  	if j < i {
   384  		// remove indices j through i from the children
   385  		n.removeChildren(j, i)
   386  	}
   387  
   388  	return n.stripEnation(ntEmpty)
   389  }
   390  
   391  // Nested repeaters just get multiplied with each other if they're not
   392  // too lumpy
   393  func (n *regexNode) reduceRep() *regexNode {
   394  
   395  	u := n
   396  	t := n.t
   397  	min := n.m
   398  	max := n.n
   399  
   400  	for {
   401  		if len(u.children) == 0 {
   402  			break
   403  		}
   404  
   405  		child := u.children[0]
   406  
   407  		// multiply reps of the same type only
   408  		if child.t != t {
   409  			childType := child.t
   410  
   411  			if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
   412  				childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
   413  				break
   414  			}
   415  		}
   416  
   417  		// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
   418  		// [but things like (a {2,})+ are not too lumpy...]
   419  		if u.m == 0 && child.m > 1 || child.n < child.m*2 {
   420  			break
   421  		}
   422  
   423  		u = child
   424  		if u.m > 0 {
   425  			if (math.MaxInt32-1)/u.m < min {
   426  				u.m = math.MaxInt32
   427  			} else {
   428  				u.m = u.m * min
   429  			}
   430  		}
   431  		if u.n > 0 {
   432  			if (math.MaxInt32-1)/u.n < max {
   433  				u.n = math.MaxInt32
   434  			} else {
   435  				u.n = u.n * max
   436  			}
   437  		}
   438  	}
   439  
   440  	if math.MaxInt32 == min {
   441  		return newRegexNode(ntNothing, n.options)
   442  	}
   443  	return u
   444  
   445  }
   446  
   447  // Simple optimization. If a concatenation or alternation has only
   448  // one child strip out the intermediate node. If it has zero children,
   449  // turn it into an empty.
   450  func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
   451  	switch len(n.children) {
   452  	case 0:
   453  		return newRegexNode(emptyType, n.options)
   454  	case 1:
   455  		return n.children[0]
   456  	default:
   457  		return n
   458  	}
   459  }
   460  
   461  func (n *regexNode) reduceGroup() *regexNode {
   462  	u := n
   463  
   464  	for u.t == ntGroup {
   465  		u = u.children[0]
   466  	}
   467  
   468  	return u
   469  }
   470  
   471  // Simple optimization. If a set is a singleton, an inverse singleton,
   472  // or empty, it's transformed accordingly.
   473  func (n *regexNode) reduceSet() *regexNode {
   474  	// Extract empty-set, one and not-one case as special
   475  
   476  	if n.set == nil {
   477  		n.t = ntNothing
   478  	} else if n.set.IsSingleton() {
   479  		n.ch = n.set.SingletonChar()
   480  		n.set = nil
   481  		n.t += (ntOne - ntSet)
   482  	} else if n.set.IsSingletonInverse() {
   483  		n.ch = n.set.SingletonChar()
   484  		n.set = nil
   485  		n.t += (ntNotone - ntSet)
   486  	}
   487  
   488  	return n
   489  }
   490  
   491  func (n *regexNode) reverseLeft() *regexNode {
   492  	if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
   493  		//reverse children order
   494  		for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
   495  			n.children[left], n.children[right] = n.children[right], n.children[left]
   496  		}
   497  	}
   498  
   499  	return n
   500  }
   501  
   502  func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
   503  	if min == 0 && max == 0 {
   504  		return newRegexNode(ntEmpty, n.options)
   505  	}
   506  
   507  	if min == 1 && max == 1 {
   508  		return n
   509  	}
   510  
   511  	switch n.t {
   512  	case ntOne, ntNotone, ntSet:
   513  		if lazy {
   514  			n.makeRep(Onelazy, min, max)
   515  		} else {
   516  			n.makeRep(Oneloop, min, max)
   517  		}
   518  		return n
   519  
   520  	default:
   521  		var t nodeType
   522  		if lazy {
   523  			t = ntLazyloop
   524  		} else {
   525  			t = ntLoop
   526  		}
   527  		result := newRegexNodeMN(t, n.options, min, max)
   528  		result.addChild(n)
   529  		return result
   530  	}
   531  }
   532  
   533  // debug functions
   534  
   535  var typeStr = []string{
   536  	"Onerep", "Notonerep", "Setrep",
   537  	"Oneloop", "Notoneloop", "Setloop",
   538  	"Onelazy", "Notonelazy", "Setlazy",
   539  	"One", "Notone", "Set",
   540  	"Multi", "Ref",
   541  	"Bol", "Eol", "Boundary", "Nonboundary",
   542  	"Beginning", "Start", "EndZ", "End",
   543  	"Nothing", "Empty",
   544  	"Alternate", "Concatenate",
   545  	"Loop", "Lazyloop",
   546  	"Capture", "Group", "Require", "Prevent", "Greedy",
   547  	"Testref", "Testgroup",
   548  	"Unknown", "Unknown", "Unknown",
   549  	"Unknown", "Unknown", "Unknown",
   550  	"ECMABoundary", "NonECMABoundary",
   551  }
   552  
   553  func (n *regexNode) description() string {
   554  	buf := &bytes.Buffer{}
   555  
   556  	buf.WriteString(typeStr[n.t])
   557  
   558  	if (n.options & ExplicitCapture) != 0 {
   559  		buf.WriteString("-C")
   560  	}
   561  	if (n.options & IgnoreCase) != 0 {
   562  		buf.WriteString("-I")
   563  	}
   564  	if (n.options & RightToLeft) != 0 {
   565  		buf.WriteString("-L")
   566  	}
   567  	if (n.options & Multiline) != 0 {
   568  		buf.WriteString("-M")
   569  	}
   570  	if (n.options & Singleline) != 0 {
   571  		buf.WriteString("-S")
   572  	}
   573  	if (n.options & IgnorePatternWhitespace) != 0 {
   574  		buf.WriteString("-X")
   575  	}
   576  	if (n.options & ECMAScript) != 0 {
   577  		buf.WriteString("-E")
   578  	}
   579  
   580  	switch n.t {
   581  	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
   582  		buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
   583  		break
   584  	case ntCapture:
   585  		buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
   586  		break
   587  	case ntRef, ntTestref:
   588  		buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
   589  		break
   590  	case ntMulti:
   591  		fmt.Fprintf(buf, "(String = %s)", string(n.str))
   592  		break
   593  	case ntSet, ntSetloop, ntSetlazy:
   594  		buf.WriteString("(Set = " + n.set.String() + ")")
   595  		break
   596  	}
   597  
   598  	switch n.t {
   599  	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
   600  		buf.WriteString("(Min = ")
   601  		buf.WriteString(strconv.Itoa(n.m))
   602  		buf.WriteString(", Max = ")
   603  		if n.n == math.MaxInt32 {
   604  			buf.WriteString("inf")
   605  		} else {
   606  			buf.WriteString(strconv.Itoa(n.n))
   607  		}
   608  		buf.WriteString(")")
   609  
   610  		break
   611  	}
   612  
   613  	return buf.String()
   614  }
   615  
   616  var padSpace = []byte("                                ")
   617  
   618  func (t *RegexTree) Dump() string {
   619  	return t.root.dump()
   620  }
   621  
   622  func (n *regexNode) dump() string {
   623  	var stack []int
   624  	CurNode := n
   625  	CurChild := 0
   626  
   627  	buf := bytes.NewBufferString(CurNode.description())
   628  	buf.WriteRune('\n')
   629  
   630  	for {
   631  		if CurNode.children != nil && CurChild < len(CurNode.children) {
   632  			stack = append(stack, CurChild+1)
   633  			CurNode = CurNode.children[CurChild]
   634  			CurChild = 0
   635  
   636  			Depth := len(stack)
   637  			if Depth > 32 {
   638  				Depth = 32
   639  			}
   640  			buf.Write(padSpace[:Depth])
   641  			buf.WriteString(CurNode.description())
   642  			buf.WriteRune('\n')
   643  		} else {
   644  			if len(stack) == 0 {
   645  				break
   646  			}
   647  
   648  			CurChild = stack[len(stack)-1]
   649  			stack = stack[:len(stack)-1]
   650  			CurNode = CurNode.next
   651  		}
   652  	}
   653  	return buf.String()
   654  }
   655  

View as plain text