writer.go

Documentation: github.com/dlclark/regexp2/syntax

     1  package syntax
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"math"
     7  	"os"
     8  )
     9  
    10  func Write(tree *RegexTree) (*Code, error) {
    11  	w := writer{
    12  		intStack:   make([]int, 0, 32),
    13  		emitted:    make([]int, 2),
    14  		stringhash: make(map[string]int),
    15  		sethash:    make(map[string]int),
    16  	}
    17  
    18  	code, err := w.codeFromTree(tree)
    19  
    20  	if tree.options&Debug > 0 && code != nil {
    21  		os.Stdout.WriteString(code.Dump())
    22  		os.Stdout.WriteString("\n")
    23  	}
    24  
    25  	return code, err
    26  }
    27  
    28  type writer struct {
    29  	emitted []int
    30  
    31  	intStack    []int
    32  	curpos      int
    33  	stringhash  map[string]int
    34  	stringtable [][]rune
    35  	sethash     map[string]int
    36  	settable    []*CharSet
    37  	counting    bool
    38  	count       int
    39  	trackcount  int
    40  	caps        map[int]int
    41  }
    42  
    43  const (
    44  	beforeChild nodeType = 64
    45  	afterChild           = 128
    46  	//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
    47  	MaxPrefixSize = 50
    48  )
    49  
    50  // The top level RegexCode generator. It does a depth-first walk
    51  // through the tree and calls EmitFragment to emits code before
    52  // and after each child of an interior node, and at each leaf.
    53  //
    54  // It runs two passes, first to count the size of the generated
    55  // code, and second to generate the code.
    56  //
    57  // We should time it against the alternative, which is
    58  // to just generate the code and grow the array as we go.
    59  func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
    60  	var (
    61  		curNode  *regexNode
    62  		curChild int
    63  		capsize  int
    64  	)
    65  	// construct sparse capnum mapping if some numbers are unused
    66  
    67  	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
    68  		capsize = tree.captop
    69  		w.caps = nil
    70  	} else {
    71  		capsize = len(tree.capnumlist)
    72  		w.caps = tree.caps
    73  		for i := 0; i < len(tree.capnumlist); i++ {
    74  			w.caps[tree.capnumlist[i]] = i
    75  		}
    76  	}
    77  
    78  	w.counting = true
    79  
    80  	for {
    81  		if !w.counting {
    82  			w.emitted = make([]int, w.count)
    83  		}
    84  
    85  		curNode = tree.root
    86  		curChild = 0
    87  
    88  		w.emit1(Lazybranch, 0)
    89  
    90  		for {
    91  			if len(curNode.children) == 0 {
    92  				w.emitFragment(curNode.t, curNode, 0)
    93  			} else if curChild < len(curNode.children) {
    94  				w.emitFragment(curNode.t|beforeChild, curNode, curChild)
    95  
    96  				curNode = curNode.children[curChild]
    97  
    98  				w.pushInt(curChild)
    99  				curChild = 0
   100  				continue
   101  			}
   102  
   103  			if w.emptyStack() {
   104  				break
   105  			}
   106  
   107  			curChild = w.popInt()
   108  			curNode = curNode.next
   109  
   110  			w.emitFragment(curNode.t|afterChild, curNode, curChild)
   111  			curChild++
   112  		}
   113  
   114  		w.patchJump(0, w.curPos())
   115  		w.emit(Stop)
   116  
   117  		if !w.counting {
   118  			break
   119  		}
   120  
   121  		w.counting = false
   122  	}
   123  
   124  	fcPrefix := getFirstCharsPrefix(tree)
   125  	prefix := getPrefix(tree)
   126  	rtl := (tree.options & RightToLeft) != 0
   127  
   128  	var bmPrefix *BmPrefix
   129  	//TODO: benchmark string prefixes
   130  	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
   131  		if len(prefix.PrefixStr) > MaxPrefixSize {
   132  			// limit prefix changes to 10k
   133  			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
   134  		}
   135  		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
   136  	} else {
   137  		bmPrefix = nil
   138  	}
   139  
   140  	return &Code{
   141  		Codes:       w.emitted,
   142  		Strings:     w.stringtable,
   143  		Sets:        w.settable,
   144  		TrackCount:  w.trackcount,
   145  		Caps:        w.caps,
   146  		Capsize:     capsize,
   147  		FcPrefix:    fcPrefix,
   148  		BmPrefix:    bmPrefix,
   149  		Anchors:     getAnchors(tree),
   150  		RightToLeft: rtl,
   151  	}, nil
   152  }
   153  
   154  // The main RegexCode generator. It does a depth-first walk
   155  // through the tree and calls EmitFragment to emits code before
   156  // and after each child of an interior node, and at each leaf.
   157  func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
   158  	bits := InstOp(0)
   159  
   160  	if nodetype <= ntRef {
   161  		if (node.options & RightToLeft) != 0 {
   162  			bits |= Rtl
   163  		}
   164  		if (node.options & IgnoreCase) != 0 {
   165  			bits |= Ci
   166  		}
   167  	}
   168  	ntBits := nodeType(bits)
   169  
   170  	switch nodetype {
   171  	case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
   172  		break
   173  
   174  	case ntAlternate | beforeChild:
   175  		if curIndex < len(node.children)-1 {
   176  			w.pushInt(w.curPos())
   177  			w.emit1(Lazybranch, 0)
   178  		}
   179  
   180  	case ntAlternate | afterChild:
   181  		if curIndex < len(node.children)-1 {
   182  			lbPos := w.popInt()
   183  			w.pushInt(w.curPos())
   184  			w.emit1(Goto, 0)
   185  			w.patchJump(lbPos, w.curPos())
   186  		} else {
   187  			for i := 0; i < curIndex; i++ {
   188  				w.patchJump(w.popInt(), w.curPos())
   189  			}
   190  		}
   191  		break
   192  
   193  	case ntTestref | beforeChild:
   194  		if curIndex == 0 {
   195  			w.emit(Setjump)
   196  			w.pushInt(w.curPos())
   197  			w.emit1(Lazybranch, 0)
   198  			w.emit1(Testref, w.mapCapnum(node.m))
   199  			w.emit(Forejump)
   200  		}
   201  
   202  	case ntTestref | afterChild:
   203  		if curIndex == 0 {
   204  			branchpos := w.popInt()
   205  			w.pushInt(w.curPos())
   206  			w.emit1(Goto, 0)
   207  			w.patchJump(branchpos, w.curPos())
   208  			w.emit(Forejump)
   209  			if len(node.children) <= 1 {
   210  				w.patchJump(w.popInt(), w.curPos())
   211  			}
   212  		} else if curIndex == 1 {
   213  			w.patchJump(w.popInt(), w.curPos())
   214  		}
   215  
   216  	case ntTestgroup | beforeChild:
   217  		if curIndex == 0 {
   218  			w.emit(Setjump)
   219  			w.emit(Setmark)
   220  			w.pushInt(w.curPos())
   221  			w.emit1(Lazybranch, 0)
   222  		}
   223  
   224  	case ntTestgroup | afterChild:
   225  		if curIndex == 0 {
   226  			w.emit(Getmark)
   227  			w.emit(Forejump)
   228  		} else if curIndex == 1 {
   229  			Branchpos := w.popInt()
   230  			w.pushInt(w.curPos())
   231  			w.emit1(Goto, 0)
   232  			w.patchJump(Branchpos, w.curPos())
   233  			w.emit(Getmark)
   234  			w.emit(Forejump)
   235  			if len(node.children) <= 2 {
   236  				w.patchJump(w.popInt(), w.curPos())
   237  			}
   238  		} else if curIndex == 2 {
   239  			w.patchJump(w.popInt(), w.curPos())
   240  		}
   241  
   242  	case ntLoop | beforeChild, ntLazyloop | beforeChild:
   243  
   244  		if node.n < math.MaxInt32 || node.m > 1 {
   245  			if node.m == 0 {
   246  				w.emit1(Nullcount, 0)
   247  			} else {
   248  				w.emit1(Setcount, 1-node.m)
   249  			}
   250  		} else if node.m == 0 {
   251  			w.emit(Nullmark)
   252  		} else {
   253  			w.emit(Setmark)
   254  		}
   255  
   256  		if node.m == 0 {
   257  			w.pushInt(w.curPos())
   258  			w.emit1(Goto, 0)
   259  		}
   260  		w.pushInt(w.curPos())
   261  
   262  	case ntLoop | afterChild, ntLazyloop | afterChild:
   263  
   264  		startJumpPos := w.curPos()
   265  		lazy := (nodetype - (ntLoop | afterChild))
   266  
   267  		if node.n < math.MaxInt32 || node.m > 1 {
   268  			if node.n == math.MaxInt32 {
   269  				w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
   270  			} else {
   271  				w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
   272  			}
   273  		} else {
   274  			w.emit1(InstOp(Branchmark+lazy), w.popInt())
   275  		}
   276  
   277  		if node.m == 0 {
   278  			w.patchJump(w.popInt(), startJumpPos)
   279  		}
   280  
   281  	case ntGroup | beforeChild, ntGroup | afterChild:
   282  
   283  	case ntCapture | beforeChild:
   284  		w.emit(Setmark)
   285  
   286  	case ntCapture | afterChild:
   287  		w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
   288  
   289  	case ntRequire | beforeChild:
   290  		// NOTE: the following line causes lookahead/lookbehind to be
   291  		// NON-BACKTRACKING. It can be commented out with (*)
   292  		w.emit(Setjump)
   293  
   294  		w.emit(Setmark)
   295  
   296  	case ntRequire | afterChild:
   297  		w.emit(Getmark)
   298  
   299  		// NOTE: the following line causes lookahead/lookbehind to be
   300  		// NON-BACKTRACKING. It can be commented out with (*)
   301  		w.emit(Forejump)
   302  
   303  	case ntPrevent | beforeChild:
   304  		w.emit(Setjump)
   305  		w.pushInt(w.curPos())
   306  		w.emit1(Lazybranch, 0)
   307  
   308  	case ntPrevent | afterChild:
   309  		w.emit(Backjump)
   310  		w.patchJump(w.popInt(), w.curPos())
   311  		w.emit(Forejump)
   312  
   313  	case ntGreedy | beforeChild:
   314  		w.emit(Setjump)
   315  
   316  	case ntGreedy | afterChild:
   317  		w.emit(Forejump)
   318  
   319  	case ntOne, ntNotone:
   320  		w.emit1(InstOp(node.t|ntBits), int(node.ch))
   321  
   322  	case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
   323  		if node.m > 0 {
   324  			if node.t == ntOneloop || node.t == ntOnelazy {
   325  				w.emit2(Onerep|bits, int(node.ch), node.m)
   326  			} else {
   327  				w.emit2(Notonerep|bits, int(node.ch), node.m)
   328  			}
   329  		}
   330  		if node.n > node.m {
   331  			if node.n == math.MaxInt32 {
   332  				w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
   333  			} else {
   334  				w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
   335  			}
   336  		}
   337  
   338  	case ntSetloop, ntSetlazy:
   339  		if node.m > 0 {
   340  			w.emit2(Setrep|bits, w.setCode(node.set), node.m)
   341  		}
   342  		if node.n > node.m {
   343  			if node.n == math.MaxInt32 {
   344  				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
   345  			} else {
   346  				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
   347  			}
   348  		}
   349  
   350  	case ntMulti:
   351  		w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
   352  
   353  	case ntSet:
   354  		w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
   355  
   356  	case ntRef:
   357  		w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
   358  
   359  	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
   360  		w.emit(InstOp(node.t))
   361  
   362  	default:
   363  		return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
   364  	}
   365  
   366  	return nil
   367  }
   368  
   369  // To avoid recursion, we use a simple integer stack.
   370  // This is the push.
   371  func (w *writer) pushInt(i int) {
   372  	w.intStack = append(w.intStack, i)
   373  }
   374  
   375  // Returns true if the stack is empty.
   376  func (w *writer) emptyStack() bool {
   377  	return len(w.intStack) == 0
   378  }
   379  
   380  // This is the pop.
   381  func (w *writer) popInt() int {
   382  	//get our item
   383  	idx := len(w.intStack) - 1
   384  	i := w.intStack[idx]
   385  	//trim our slice
   386  	w.intStack = w.intStack[:idx]
   387  	return i
   388  }
   389  
   390  // Returns the current position in the emitted code.
   391  func (w *writer) curPos() int {
   392  	return w.curpos
   393  }
   394  
   395  // Fixes up a jump instruction at the specified offset
   396  // so that it jumps to the specified jumpDest.
   397  func (w *writer) patchJump(offset, jumpDest int) {
   398  	w.emitted[offset+1] = jumpDest
   399  }
   400  
   401  // Returns an index in the set table for a charset
   402  // uses a map to eliminate duplicates.
   403  func (w *writer) setCode(set *CharSet) int {
   404  	if w.counting {
   405  		return 0
   406  	}
   407  
   408  	buf := &bytes.Buffer{}
   409  
   410  	set.mapHashFill(buf)
   411  	hash := buf.String()
   412  	i, ok := w.sethash[hash]
   413  	if !ok {
   414  		i = len(w.sethash)
   415  		w.sethash[hash] = i
   416  		w.settable = append(w.settable, set)
   417  	}
   418  	return i
   419  }
   420  
   421  // Returns an index in the string table for a string.
   422  // uses a map to eliminate duplicates.
   423  func (w *writer) stringCode(str []rune) int {
   424  	if w.counting {
   425  		return 0
   426  	}
   427  
   428  	hash := string(str)
   429  	i, ok := w.stringhash[hash]
   430  	if !ok {
   431  		i = len(w.stringhash)
   432  		w.stringhash[hash] = i
   433  		w.stringtable = append(w.stringtable, str)
   434  	}
   435  
   436  	return i
   437  }
   438  
   439  // When generating code on a regex that uses a sparse set
   440  // of capture slots, we hash them to a dense set of indices
   441  // for an array of capture slots. Instead of doing the hash
   442  // at match time, it's done at compile time, here.
   443  func (w *writer) mapCapnum(capnum int) int {
   444  	if capnum == -1 {
   445  		return -1
   446  	}
   447  
   448  	if w.caps != nil {
   449  		return w.caps[capnum]
   450  	}
   451  
   452  	return capnum
   453  }
   454  
   455  // Emits a zero-argument operation. Note that the emit
   456  // functions all run in two modes: they can emit code, or
   457  // they can just count the size of the code.
   458  func (w *writer) emit(op InstOp) {
   459  	if w.counting {
   460  		w.count++
   461  		if opcodeBacktracks(op) {
   462  			w.trackcount++
   463  		}
   464  		return
   465  	}
   466  	w.emitted[w.curpos] = int(op)
   467  	w.curpos++
   468  }
   469  
   470  // Emits a one-argument operation.
   471  func (w *writer) emit1(op InstOp, opd1 int) {
   472  	if w.counting {
   473  		w.count += 2
   474  		if opcodeBacktracks(op) {
   475  			w.trackcount++
   476  		}
   477  		return
   478  	}
   479  	w.emitted[w.curpos] = int(op)
   480  	w.curpos++
   481  	w.emitted[w.curpos] = opd1
   482  	w.curpos++
   483  }
   484  
   485  // Emits a two-argument operation.
   486  func (w *writer) emit2(op InstOp, opd1, opd2 int) {
   487  	if w.counting {
   488  		w.count += 3
   489  		if opcodeBacktracks(op) {
   490  			w.trackcount++
   491  		}
   492  		return
   493  	}
   494  	w.emitted[w.curpos] = int(op)
   495  	w.curpos++
   496  	w.emitted[w.curpos] = opd1
   497  	w.curpos++
   498  	w.emitted[w.curpos] = opd2
   499  	w.curpos++
   500  }
   501
View as plain text