Source file src/github.com/alecthomas/chroma/v2/regexp.go

package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words. The words
// are sorted longest-first so that longer words take precedence over their prefixes.
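//
// An illustrative example (not part of the original source):
//
//	Words(`\b`, `\b`, "print", "println") // yields `\b(println|print)\b`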
func Words(prefix, suffix string, words ...string) string {
	// Sort longest-first so that longer words are preferred over their prefixes.
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

// Tokenise text using lexer, returning the tokens as a slice.
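//
// Illustrative usage (assuming "lexer" is any Lexer value):
//
//	tokens, err := Tokenise(lexer, nil, `print("hello")`)
//	if err != nil {
//		// handle the error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}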
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
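//
// Entire states from "rules" replace same-named states in the clone. An
// illustrative example (not part of the original source):
//
//	base := Rules{"root": {{`\d+`, Number, nil}}}
//	merged := base.Merge(Rules{"string": {{`"[^"]*"`, String, nil}}})
//	// merged now contains both a "root" and a "string" state.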
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

// MustNewLexer creates a new Lexer with deferred rules generation, or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state; each value is
// an ordered sequence of rules that match input, optionally mutate the lexer
// state, and emit tokens.
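//
// An illustrative sketch of such a map (not part of the original source; the
// token types and ByGroups are defined elsewhere in this package):
//
//	lexer := MustNewLexer(&Config{Name: "INI", Filenames: []string{"*.ini"}}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Whitespace, nil},
//				{`;.*`, Comment, nil},
//				{`\[.*?\]`, Keyword, nil},
//				{`(.*?)(\s*)(=)(\s*)(.*)`, ByGroups(Name, Whitespace, Operator, Whitespace, String), nil},
//			},
//		}
//	})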
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
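//
// An illustrative sketch of a mutator using this context (not part of the
// original source; MutatorFunc is defined elsewhere in this package):
//
//	countLines := MutatorFunc(func(s *LexerState) error {
//		n, _ := s.Get("lines").(int)
//		s.Set("lines", n+1)
//		return nil
//	})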
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules returns the rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText scores the likelihood that text is in this lexer's language.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used, to avoid significant init() time costs.
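//
// Each rule pattern is wrapped and anchored before compilation. Illustratively
// (not part of the original source), a rule with pattern `\d+` and flags "m"
// is compiled as:
//
//	\G(?m)(?:\d+)
//
// where \G anchors each match at the lexer's current position.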
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

// fetchRules invokes the deferred rules function and prepares the rules for compilation.
func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

// needRules fetches and compiles the rules on first use.
func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise implements Lexer, returning an Iterator over the tokens in text.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

// matchRules returns the index, rule, capture groups and named capture groups of the
// first rule that matches text at pos. If no rule matches, the returned groups are nil.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces "\r" and "\r\n" with "\n".
//
// It is equivalent to strings.ReplaceAll but more efficient.
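//
// An illustrative example (not part of the original source):
//
//	ensureLF("a\r\nb\rc") // returns "a\nb\nc"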
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}
