regexp_pcre_test.go

Documentation: github.com/dlclark/regexp2

     1  package regexp2
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"fmt"
     7  	"log"
     8  	"os"
     9  	"regexp"
    10  	"strconv"
    11  	"strings"
    12  	"testing"
    13  	"time"
    14  )
    15  
// Process the file "testoutput1" from PCRE2 v10.21 (public domain)
//
// totalCount is incremented once for each pattern TestPcre_Basics reads from
// the file; failCount is incremented once for each failure recorded through
// problem. Both are package-level, so they are not reset between test runs
// within the same process.
var totalCount, failCount = 0, 0
    18  
    19  func TestPcre_Basics(t *testing.T) {
    20  	defer func() {
    21  		if failCount > 0 {
    22  			t.Logf("%v of %v patterns failed", failCount, totalCount)
    23  		}
    24  	}()
    25  	// open our test patterns file and run through it
    26  	// validating results as we go
    27  	file, err := os.Open("testoutput1")
    28  	if err != nil {
    29  		log.Fatal(err)
    30  	}
    31  	defer file.Close()
    32  
    33  	// the high level structure of the file:
    34  	//		#comments - ignore only outside of the pattern
    35  	//		pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't
    36  	//		    test case
    37  	//		 0: success case
    38  	//		\= Expect no match (ignored)
    39  	//		    another test case
    40  	//		No Match
    41  	//
    42  	//		another pattern ...etc
    43  
    44  	scanner := bufio.NewScanner(file)
    45  	// main pattern loop
    46  	for scanner.Scan() {
    47  		// reading the file a line at a time
    48  		line := scanner.Text()
    49  
    50  		if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") {
    51  			// skip blanks and comments
    52  			continue
    53  		}
    54  
    55  		patternStart := line[0]
    56  		if patternStart != '/' && patternStart != '"' {
    57  			// an error!  expected a pattern but we didn't understand what was in the file
    58  			t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line)
    59  		}
    60  
    61  		// start building our pattern, handling multi-line patterns
    62  		pattern := line
    63  		totalCount++
    64  
    65  		// keep appending the lines to our pattern string until we
    66  		// find our closing tag, don't allow the first char to match on the
    67  		// line start, but subsequent lines could end on the first char
    68  		allowFirst := false
    69  		for !containsEnder(line, patternStart, allowFirst) {
    70  			if !scanner.Scan() {
    71  				// an error!  expected more pattern, but got eof
    72  				t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern)
    73  			}
    74  			line = scanner.Text()
    75  			pattern += fmt.Sprintf("\n%s", line)
    76  			allowFirst = true
    77  		}
    78  
    79  		// we have our raw pattern! -- we need to convert this to a compiled regex
    80  		re := compileRawPattern(t, pattern)
    81  
    82  		var (
    83  			capsIdx map[int]int
    84  			m       *Match
    85  			toMatch string
    86  		)
    87  		// now we need to parse the test cases if there are any
    88  		// they start with 4 spaces -- if we don't get a 4-space start then
    89  		// we're back out to our next pattern
    90  		for scanner.Scan() {
    91  			line = scanner.Text()
    92  
    93  			// blank line is our separator for a new pattern
    94  			if strings.TrimSpace(line) == "" {
    95  				break
    96  			}
    97  
    98  			// could be either "    " or "\= Expect"
    99  			if strings.HasPrefix(line, "\\= Expect") {
   100  				continue
   101  			} else if strings.HasPrefix(line, "    ") {
   102  				// trim off leading spaces for our text to match
   103  				toMatch = line[4:]
   104  				// trim off trailing spaces too
   105  				toMatch = strings.TrimRight(toMatch, " ")
   106  
   107  				m = matchString(t, re, toMatch)
   108  
   109  				capsIdx = make(map[int]int)
   110  				continue
   111  				//t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line)
   112  			} else if strings.HasPrefix(line, "No match") {
   113  				validateNoMatch(t, re, m)
   114  				// no match means we're done
   115  				continue
   116  			} else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 {
   117  				gIdx, _ := strconv.Atoi(subs[1])
   118  				if _, ok := capsIdx[gIdx]; !ok {
   119  					capsIdx[gIdx] = 0
   120  				}
   121  				validateMatch(t, re, m, toMatch, subs[2], gIdx, capsIdx[gIdx])
   122  				capsIdx[gIdx]++
   123  				continue
   124  			} else {
   125  				// no match -- problem
   126  				t.Fatalf("Unknown file format, expected match or match group but got '%v'", line)
   127  			}
   128  		}
   129  
   130  	}
   131  
   132  	if err := scanner.Err(); err != nil {
   133  		log.Fatal(err)
   134  	}
   135  }
   136  
// matchGroup recognizes an expected-result line such as " 0: abc": optional
// leading whitespace, a decimal group number (submatch 1), the separator ": ",
// and the expected text running to the end of the line (submatch 2).
var matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`)
   138  
   139  func problem(t *testing.T, input string, args ...interface{}) {
   140  	failCount++
   141  	t.Errorf(input, args...)
   142  }
   143  
   144  func validateNoMatch(t *testing.T, re *Regexp, m *Match) {
   145  	if re == nil || m == nil {
   146  		return
   147  	}
   148  
   149  	problem(t, "Expected no match for pattern '%v', but got '%v'", re.pattern, m.String())
   150  }
   151  
   152  func validateMatch(t *testing.T, re *Regexp, m *Match, toMatch, value string, idx, capIdx int) {
   153  	if re == nil {
   154  		// already error'd earlier up stream
   155  		return
   156  	}
   157  
   158  	if m == nil {
   159  		// we didn't match, but should have
   160  		problem(t, "Expected match for pattern '%v' with input '%v', but got no match", re.pattern, toMatch)
   161  		return
   162  	}
   163  
   164  	g := m.Groups()
   165  	if len(g) <= idx {
   166  		problem(t, "Expected group %v does not exist in pattern '%v' with input '%v'", idx, re.pattern, toMatch)
   167  		return
   168  	}
   169  
   170  	if value == "<unset>" {
   171  		// this means we shouldn't have a cap for this group
   172  		if len(g[idx].Captures) > 0 {
   173  			problem(t, "Expected no cap %v in group %v in pattern '%v' with input '%v'", g[idx].Captures[capIdx].String(), idx, re.pattern, toMatch)
   174  		}
   175  
   176  		return
   177  	}
   178  
   179  	if len(g[idx].Captures) <= capIdx {
   180  		problem(t, "Expected cap %v does not exist in group %v in pattern '%v' with input '%v'", capIdx, idx, re.pattern, toMatch)
   181  		return
   182  	}
   183  
   184  	escp := unEscapeGroup(g[idx].String())
   185  	//escp := unEscapeGroup(g[idx].Captures[capIdx].String())
   186  	if escp != value {
   187  		problem(t, "Expected '%v' but got '%v' for cap %v, group %v for pattern '%v' with input '%v'", value, escp, capIdx, idx, re.pattern, toMatch)
   188  		return
   189  	}
   190  }
   191  
   192  func compileRawPattern(t *testing.T, pattern string) *Regexp {
   193  	// check our end for RegexOptions -trim them off
   194  	index := strings.LastIndexAny(pattern, "/\"")
   195  	//
   196  	// Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite
   197  	//
   198  	var opts RegexOptions
   199  
   200  	if index+1 < len(pattern) {
   201  		textOptions := pattern[index+1:]
   202  		pattern = pattern[:index+1]
   203  		// there are lots of complex options here
   204  		for _, textOpt := range strings.Split(textOptions, ",") {
   205  			switch textOpt {
   206  			case "dupnames":
   207  				// we don't know how to handle this...
   208  			default:
   209  				if strings.Contains(textOpt, "i") {
   210  					opts |= IgnoreCase
   211  				}
   212  				if strings.Contains(textOpt, "s") {
   213  					opts |= Singleline
   214  				}
   215  				if strings.Contains(textOpt, "m") {
   216  					opts |= Multiline
   217  				}
   218  				if strings.Contains(textOpt, "x") {
   219  					opts |= IgnorePatternWhitespace
   220  				}
   221  			}
   222  		}
   223  
   224  	}
   225  
   226  	// trim off first and last char
   227  	pattern = pattern[1 : len(pattern)-1]
   228  
   229  	defer func() {
   230  		if rec := recover(); rec != nil {
   231  			problem(t, "PANIC in compiling \"%v\": %v", pattern, rec)
   232  		}
   233  	}()
   234  	re, err := Compile(pattern, opts)
   235  	if err != nil {
   236  		problem(t, "Error parsing \"%v\": %v", pattern, err)
   237  	}
   238  	return re
   239  }
   240  
   241  func matchString(t *testing.T, re *Regexp, toMatch string) *Match {
   242  	if re == nil {
   243  		return nil
   244  	}
   245  
   246  	re.MatchTimeout = time.Second * 1
   247  
   248  	escp := ""
   249  	var err error
   250  	if toMatch != "\\" {
   251  		escp = unEscapeToMatch(toMatch)
   252  	}
   253  	m, err := re.FindStringMatch(escp)
   254  	if err != nil {
   255  		problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, re.pattern, err)
   256  	}
   257  	return m
   258  }
   259  
   260  func containsEnder(line string, ender byte, allowFirst bool) bool {
   261  	index := strings.LastIndexByte(line, ender)
   262  	if index > 0 {
   263  		return true
   264  	} else if index == 0 && allowFirst {
   265  		return true
   266  	}
   267  	return false
   268  }
   269  
   270  func unEscapeToMatch(line string) string {
   271  	idx := strings.IndexRune(line, '\\')
   272  	// no slashes means no unescape needed
   273  	if idx == -1 {
   274  		return line
   275  	}
   276  
   277  	buf := bytes.NewBufferString(line[:idx])
   278  	// get the runes for the rest of the string -- we're going full parser scan on this
   279  
   280  	inEscape := false
   281  	// take any \'s and convert them
   282  	for i := idx; i < len(line); i++ {
   283  		ch := line[i]
   284  		if ch == '\\' {
   285  			if inEscape {
   286  				buf.WriteByte(ch)
   287  			}
   288  			inEscape = !inEscape
   289  			continue
   290  		}
   291  		if inEscape {
   292  			switch ch {
   293  			case 'x':
   294  				buf.WriteByte(scanHex(line, &i))
   295  			case 'a':
   296  				buf.WriteByte(0x07)
   297  			case 'b':
   298  				buf.WriteByte('\b')
   299  			case 'e':
   300  				buf.WriteByte(0x1b)
   301  			case 'f':
   302  				buf.WriteByte('\f')
   303  			case 'n':
   304  				buf.WriteByte('\n')
   305  			case 'r':
   306  				buf.WriteByte('\r')
   307  			case 't':
   308  				buf.WriteByte('\t')
   309  			case 'v':
   310  				buf.WriteByte(0x0b)
   311  			default:
   312  				if ch >= '0' && ch <= '7' {
   313  					buf.WriteByte(scanOctal(line, &i))
   314  				} else {
   315  					buf.WriteByte(ch)
   316  					//panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line))
   317  				}
   318  			}
   319  			inEscape = false
   320  		} else {
   321  			buf.WriteByte(ch)
   322  		}
   323  	}
   324  
   325  	return buf.String()
   326  }
   327  
   328  func unEscapeGroup(val string) string {
   329  	// use hex for chars 0x00-0x1f, 0x7f-0xff
   330  	buf := &bytes.Buffer{}
   331  
   332  	for i := 0; i < len(val); i++ {
   333  		ch := val[i]
   334  		if ch <= 0x1f || ch >= 0x7f {
   335  			//write it as a \x00
   336  			fmt.Fprintf(buf, "\\x%.2x", ch)
   337  		} else {
   338  			// write as-is
   339  			buf.WriteByte(ch)
   340  		}
   341  	}
   342  
   343  	return buf.String()
   344  }
   345  
   346  func scanHex(line string, idx *int) byte {
   347  	if *idx >= len(line)-2 {
   348  		panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx))
   349  	}
   350  	(*idx)++
   351  	d1 := hexDigit(line[*idx])
   352  	(*idx)++
   353  	d2 := hexDigit(line[*idx])
   354  	if d1 < 0 || d2 < 0 {
   355  		panic("bad hex chars")
   356  	}
   357  
   358  	return byte(d1*0x10 + d2)
   359  }
   360  
   361  // Returns n <= 0xF for a hex digit.
   362  func hexDigit(ch byte) int {
   363  
   364  	if d := uint(ch - '0'); d <= 9 {
   365  		return int(d)
   366  	}
   367  
   368  	if d := uint(ch - 'a'); d <= 5 {
   369  		return int(d + 0xa)
   370  	}
   371  
   372  	if d := uint(ch - 'A'); d <= 5 {
   373  		return int(d + 0xa)
   374  	}
   375  
   376  	return -1
   377  }
   378  
   379  // Scans up to three octal digits (stops before exceeding 0377).
   380  func scanOctal(line string, idx *int) byte {
   381  	// Consume octal chars only up to 3 digits and value 0377
   382  
   383  	// octals can be 3,2, or 1 digit
   384  	c := 3
   385  
   386  	if diff := len(line) - *idx; c > diff {
   387  		c = diff
   388  	}
   389  
   390  	i := 0
   391  	d := int(line[*idx] - '0')
   392  	for c > 0 && d <= 7 {
   393  		i *= 8
   394  		i += d
   395  
   396  		c--
   397  		(*idx)++
   398  		if *idx < len(line) {
   399  			d = int(line[*idx] - '0')
   400  		}
   401  	}
   402  	(*idx)--
   403  
   404  	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
   405  	// is simply to truncate the high bits.
   406  	i &= 0xFF
   407  
   408  	return byte(i)
   409  }
   410
View as plain text