...

Source file src/github.com/dlclark/regexp2/syntax/code.go

Documentation: github.com/dlclark/regexp2/syntax

     1  package syntax
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"math"
     7  )
     8  
     9  // similar to prog.go in the go regex package...also with comment 'may not belong in this package'
    10  
    11  // File provides operator constants for use by the Builder and the Machine.
    12  
    13  // Implementation notes:
    14  //
    15  // Regexps are built into RegexCodes, which contain an operation array,
    16  // a string table, and some constants.
    17  //
    18  // Each operation is one of the codes below, followed by the integer
    19  // operands specified for each op.
    20  //
    21  // Strings and sets are indices into a string table.
    22  
    23  type InstOp int
    24  
    25  const (
    26  	// 					    lef/back operands        description
    27  
    28  	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
    29  	Notonerep        = 1 // lef,back char,min,max    .{n}
    30  	Setrep           = 2 // lef,back set,min,max     [\d]{n}
    31  
    32  	Oneloop    = 3 // lef,back char,min,max    a {,n}
    33  	Notoneloop = 4 // lef,back char,min,max    .{,n}
    34  	Setloop    = 5 // lef,back set,min,max     [\d]{,n}
    35  
    36  	Onelazy    = 6 // lef,back char,min,max    a {,n}?
    37  	Notonelazy = 7 // lef,back char,min,max    .{,n}?
    38  	Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?
    39  
    40  	One    = 9  // lef      char            a
    41  	Notone = 10 // lef      char            [^a]
    42  	Set    = 11 // lef      set             [a-z\s]  \w \s \d
    43  
    44  	Multi = 12 // lef      string          abcd
    45  	Ref   = 13 // lef      group           \#
    46  
    47  	Bol         = 14 //                          ^
    48  	Eol         = 15 //                          $
    49  	Boundary    = 16 //                          \b
    50  	Nonboundary = 17 //                          \B
    51  	Beginning   = 18 //                          \A
    52  	Start       = 19 //                          \G
    53  	EndZ        = 20 //                          \Z
    54  	End         = 21 //                          \Z
    55  
    56  	Nothing = 22 //                          Reject!
    57  
    58  	// Primitive control structures
    59  
    60  	Lazybranch      = 23 // back     jump            straight first
    61  	Branchmark      = 24 // back     jump            branch first for loop
    62  	Lazybranchmark  = 25 // back     jump            straight first for loop
    63  	Nullcount       = 26 // back     val             set counter, null mark
    64  	Setcount        = 27 // back     val             set counter, make mark
    65  	Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
    66  	Lazybranchcount = 29 // back     jump,limit      same, but straight first
    67  	Nullmark        = 30 // back                     save position
    68  	Setmark         = 31 // back                     save position
    69  	Capturemark     = 32 // back     group           define group
    70  	Getmark         = 33 // back                     recall position
    71  	Setjump         = 34 // back                     save backtrack state
    72  	Backjump        = 35 //                          zap back to saved state
    73  	Forejump        = 36 //                          zap backtracking state
    74  	Testref         = 37 //                          backtrack if ref undefined
    75  	Goto            = 38 //          jump            just go
    76  
    77  	Prune = 39 //                          prune it baby
    78  	Stop  = 40 //                          done!
    79  
    80  	ECMABoundary    = 41 //                          \b
    81  	NonECMABoundary = 42 //                          \B
    82  
    83  	// Modifiers for alternate modes
    84  
    85  	Mask  = 63  // Mask to get unmodified ordinary operator
    86  	Rtl   = 64  // bit to indicate that we're reverse scanning.
    87  	Back  = 128 // bit to indicate that we're backtracking.
    88  	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
    89  	Ci    = 512 // bit to indicate that we're case-insensitive.
    90  )
    91  
    92  type Code struct {
    93  	Codes       []int       // the code
    94  	Strings     [][]rune    // string table
    95  	Sets        []*CharSet  //character set table
    96  	TrackCount  int         // how many instructions use backtracking
    97  	Caps        map[int]int // mapping of user group numbers -> impl group slots
    98  	Capsize     int         // number of impl group slots
    99  	FcPrefix    *Prefix     // the set of candidate first characters (may be null)
   100  	BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be null)
   101  	Anchors     AnchorLoc   // the set of zero-length start anchors (RegexFCD.Bol, etc)
   102  	RightToLeft bool        // true if right to left
   103  }
   104  
   105  func opcodeBacktracks(op InstOp) bool {
   106  	op &= Mask
   107  
   108  	switch op {
   109  	case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
   110  		Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
   111  		Forejump, Goto:
   112  		return true
   113  
   114  	default:
   115  		return false
   116  	}
   117  }
   118  
   119  func opcodeSize(op InstOp) int {
   120  	op &= Mask
   121  
   122  	switch op {
   123  	case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
   124  		End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
   125  		return 1
   126  
   127  	case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
   128  		Prune, Set:
   129  		return 2
   130  
   131  	case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
   132  		Setlazy, Setrep, Setloop:
   133  		return 3
   134  
   135  	default:
   136  		panic(fmt.Errorf("Unexpected op code: %v", op))
   137  	}
   138  }
   139  
   140  var codeStr = []string{
   141  	"Onerep", "Notonerep", "Setrep",
   142  	"Oneloop", "Notoneloop", "Setloop",
   143  	"Onelazy", "Notonelazy", "Setlazy",
   144  	"One", "Notone", "Set",
   145  	"Multi", "Ref",
   146  	"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
   147  	"Nothing",
   148  	"Lazybranch", "Branchmark", "Lazybranchmark",
   149  	"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
   150  	"Nullmark", "Setmark", "Capturemark", "Getmark",
   151  	"Setjump", "Backjump", "Forejump", "Testref", "Goto",
   152  	"Prune", "Stop",
   153  	"ECMABoundary", "NonECMABoundary",
   154  }
   155  
   156  func operatorDescription(op InstOp) string {
   157  	desc := codeStr[op&Mask]
   158  	if (op & Ci) != 0 {
   159  		desc += "-Ci"
   160  	}
   161  	if (op & Rtl) != 0 {
   162  		desc += "-Rtl"
   163  	}
   164  	if (op & Back) != 0 {
   165  		desc += "-Back"
   166  	}
   167  	if (op & Back2) != 0 {
   168  		desc += "-Back2"
   169  	}
   170  
   171  	return desc
   172  }
   173  
   174  // OpcodeDescription is a humman readable string of the specific offset
   175  func (c *Code) OpcodeDescription(offset int) string {
   176  	buf := &bytes.Buffer{}
   177  
   178  	op := InstOp(c.Codes[offset])
   179  	fmt.Fprintf(buf, "%06d ", offset)
   180  
   181  	if opcodeBacktracks(op & Mask) {
   182  		buf.WriteString("*")
   183  	} else {
   184  		buf.WriteString(" ")
   185  	}
   186  	buf.WriteString(operatorDescription(op))
   187  	buf.WriteString("(")
   188  	op &= Mask
   189  
   190  	switch op {
   191  	case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
   192  		buf.WriteString("Ch = ")
   193  		buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
   194  
   195  	case Set, Setrep, Setloop, Setlazy:
   196  		buf.WriteString("Set = ")
   197  		buf.WriteString(c.Sets[c.Codes[offset+1]].String())
   198  
   199  	case Multi:
   200  		fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
   201  
   202  	case Ref, Testref:
   203  		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
   204  
   205  	case Capturemark:
   206  		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
   207  		if c.Codes[offset+2] != -1 {
   208  			fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
   209  		}
   210  
   211  	case Nullcount, Setcount:
   212  		fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
   213  
   214  	case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
   215  		fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
   216  	}
   217  
   218  	switch op {
   219  	case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
   220  		buf.WriteString(", Rep = ")
   221  		if c.Codes[offset+2] == math.MaxInt32 {
   222  			buf.WriteString("inf")
   223  		} else {
   224  			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
   225  		}
   226  
   227  	case Branchcount, Lazybranchcount:
   228  		buf.WriteString(", Limit = ")
   229  		if c.Codes[offset+2] == math.MaxInt32 {
   230  			buf.WriteString("inf")
   231  		} else {
   232  			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
   233  		}
   234  
   235  	}
   236  
   237  	buf.WriteString(")")
   238  
   239  	return buf.String()
   240  }
   241  
   242  func (c *Code) Dump() string {
   243  	buf := &bytes.Buffer{}
   244  
   245  	if c.RightToLeft {
   246  		fmt.Fprintln(buf, "Direction:  right-to-left")
   247  	} else {
   248  		fmt.Fprintln(buf, "Direction:  left-to-right")
   249  	}
   250  	if c.FcPrefix == nil {
   251  		fmt.Fprintln(buf, "Firstchars: n/a")
   252  	} else {
   253  		fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
   254  	}
   255  
   256  	if c.BmPrefix == nil {
   257  		fmt.Fprintln(buf, "Prefix:     n/a")
   258  	} else {
   259  		fmt.Fprintf(buf, "Prefix:     %v\n", Escape(c.BmPrefix.String()))
   260  	}
   261  
   262  	fmt.Fprintf(buf, "Anchors:    %v\n", c.Anchors)
   263  	fmt.Fprintln(buf)
   264  
   265  	if c.BmPrefix != nil {
   266  		fmt.Fprintln(buf, "BoyerMoore:")
   267  		fmt.Fprintln(buf, c.BmPrefix.Dump("    "))
   268  	}
   269  	for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
   270  		fmt.Fprintln(buf, c.OpcodeDescription(i))
   271  	}
   272  
   273  	return buf.String()
   274  }
   275  

View as plain text