...

Source file src/github.com/alecthomas/chroma/regexp.go

Documentation: github.com/alecthomas/chroma

package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns tokens.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, state *LexerState) Iterator
}

// EmitterFunc is a function that is an Emitter.
type EmitterFunc func(groups []string, state *LexerState) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator {
	return e(groups, state)
}

// ByGroups emits a token for each matching group in the rule's regex.
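//
// An illustrative sketch (the pattern and token types are chosen for this
// example, not taken from any particular lexer): a rule that splits an
// HTML-like tag into punctuation and a name.
//
//	{`(<)(\w+)(>)`, ByGroups(Punctuation, Name, Punctuation), nil}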
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, state))
			// panic(errors.Errorf("number of groups %q does not match number of emitters %v", groups, emitters))
		} else {
			for i, group := range groups[1:] {
				if emitters[i] != nil {
					iterators = append(iterators, emitters[i].Emit([]string{group}, state))
				}
			}
		}
		return Concaterator(iterators...)
	})
}

// ByGroupNames emits a token for each named matching group in the rule's regex.
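//
// A minimal sketch, assuming a rule whose pattern uses named groups (the
// "key" and "value" names are illustrative only):
//
//	{`(?P<key>\w+)=(?P<value>\w+)`, ByGroupNames(map[string]Emitter{
//		`key`:   Name,
//		`value`: String,
//	}), nil}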
func ByGroupNames(emitters map[string]Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
		if len(state.NamedGroups)-1 == 0 {
			if emitter, ok := emitters[`0`]; ok {
				iterators = append(iterators, emitter.Emit(groups, state))
			} else {
				iterators = append(iterators, Error.Emit(groups, state))
			}
		} else {
			ruleRegex := state.Rules[state.State][state.Rule].Regexp
			for i := 1; i < len(state.NamedGroups); i++ {
				groupName := ruleRegex.GroupNameFromNumber(i)
				group := state.NamedGroups[groupName]
				if emitter, ok := emitters[groupName]; ok {
					if emitter != nil {
						iterators = append(iterators, emitter.Emit([]string{group}, state))
					}
				} else {
					iterators = append(iterators, Error.Emit([]string{group}, state))
				}
			}
		}
		return Concaterator(iterators...)
	})
}

// UsingByGroup emits tokens for the matched groups in the regex using a
// "sublexer". Used when lexing code blocks where the name of a sublexer is
// contained within the block, for example on a Markdown text block or SQL
// language block.
//
// The sublexer will be retrieved using sublexerGetFunc (typically
// internal.Get), using the captured value from the matched sublexerNameGroup.
//
// If sublexerGetFunc returns a non-nil lexer for the captured sublexerNameGroup,
// then tokens for the matched codeGroup will be emitted using the retrieved
// lexer. Otherwise, if the sublexer is nil, then tokens will be emitted from
// the passed emitter.
//
// Example:
//
// 	var Markdown = internal.Register(MustNewLexer(
// 		&Config{
// 			Name:      "markdown",
// 			Aliases:   []string{"md", "mkd"},
// 			Filenames: []string{"*.md", "*.mkd", "*.markdown"},
// 			MimeTypes: []string{"text/x-markdown"},
// 		},
// 		Rules{
// 			"root": {
// 				{"^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
// 					UsingByGroup(
// 						internal.Get,
// 						2, 4,
// 						String, String, String, Text, String,
// 					),
// 					nil,
// 				},
// 			},
// 		},
// 	))
//
// See lexers/m/markdown.go for the complete example.
//
// Note: panics if the number of emitters does not equal the number of matched
// groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		// bounds check
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}

		// grab sublexer
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])

		// build iterators
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else if emitters[i] != nil {
				iterators[i] = emitters[i].Emit([]string{group}, state)
			}
		}

		return Concaterator(iterators...)
	})
}

// Using returns an Emitter that uses a given Lexer for parsing and emitting.
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ *LexerState) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// UsingSelf is like Using, but uses the current Lexer.
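//
// A minimal sketch (the state names and patterns are illustrative, not taken
// from a real lexer): an "interp" state whose body is re-lexed by this same
// lexer starting from its "root" state.
//
//	"interp": {
//		{`[^}]+`, UsingSelf("root"), nil},
//		{`}`, Punctuation, Pop(1)},
//	},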
func UsingSelf(stateName string) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		it, err := state.Lexer.Tokenise(&TokeniseOptions{State: stateName, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// Words creates a regex that matches any of the given literal words.
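//
// Longer words are sorted first so that they win the alternation; for
// example, Words(`\b`, `\b`, "if", "else", "elseif") produces the pattern
// `\b(elseif|else|if)\b`.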
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}

// Tokenise text using lexer, returning tokens as a slice.
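//
// A brief usage sketch, assuming a lexer obtained from one of the
// constructors in this file (error handling elided):
//
//	tokens, _ := Tokenise(lexer, nil, "1 + 2\n")
//	for _, tok := range tokens {
//		fmt.Printf("%s %q\n", tok.Type, tok.Value)
//	}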
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
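//
// Merging replaces whole states rather than appending rules. A sketch with
// illustrative state names:
//
//	base := Rules{
//		"root":    {{`\s+`, Whitespace, nil}},
//		"strings": {{`"[^"]*"`, String, nil}},
//	}
//	// The merged "root" contains only the rule supplied here, while
//	// "strings" is carried over unchanged from base.
//	merged := base.Merge(Rules{"root": {{`\w+`, Name, nil}}})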
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}

// MustNewLazyLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLazyLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLazyLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLazyLexer creates a new regex-based Lexer with deferred rules generation.
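//
// A condensed sketch of a lazily-defined lexer (the config values and rules
// below are illustrative only):
//
//	var Example = MustNewLazyLexer(&Config{
//		Name:      "example",
//		Filenames: []string{"*.example"},
//	}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Whitespace, nil},
//				{Words(`\b`, `\b`, "if", "else"), Keyword, nil},
//				{`\w+`, Name, nil},
//			},
//		}
//	})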
func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	return &RegexLexer{
		config:       config,
		compilerFunc: rulesFunc,
	}, nil
}

// MustNewLexer creates a new Lexer or panics.
//
// Deprecated: Use MustNewLazyLexer instead.
func MustNewLexer(config *Config, rules Rules) *RegexLexer { // nolint: forbidigo
	lexer, err := NewLexer(config, rules) // nolint: forbidigo
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
//
// Deprecated: Use NewLazyLexer instead.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) { // nolint: forbidigo
	return NewLazyLexer(config, func() Rules { return rules })
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
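//
// A sketch of a custom mutator that uses this context via the MutatorFunc
// adapter (the "depth" key is illustrative):
//
//	var trackDepth MutatorFunc = func(state *LexerState) error {
//		depth, _ := state.Get("depth").(int)
//		state.Set("depth", depth+1)
//		return nil
//	}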
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer.
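//
// A typical consumer drains the iterator returned by RegexLexer.Tokenise
// until EOF; a rough sketch (error handling elided):
//
//	it, _ := lexer.Tokenise(nil, "source text\n")
//	for tok := it(); tok != EOF; tok = it() {
//		fmt.Printf("%s %q\n", tok.Type, tok.Value)
//	}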
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help producing
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the IteratorStack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu           sync.Mutex
	compiled     bool
	rules        map[string][]*CompiledRule
	compilerFunc func() Rules
	compileOnce  sync.Once
}

// SetAnalyser sets the analyser function used to perform content inspection.
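//
// For illustration, an analyser might score content with a simple heuristic;
// the shebang check below is only an example, not a recommended rule:
//
//	lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/sh") {
//			return 1.0
//		}
//		return 0.0
//	})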
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) compileRules() error {
	rules := r.compilerFunc()
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	var err error
	if r.compilerFunc != nil {
		r.compileOnce.Do(func() {
			err = r.compileRules()
		})
	}
	if err != nil {
		return nil, err
	}
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r and \r\n with \n.
// It has the same effect as chained strings.ReplaceAll calls, but runs in a single pass.
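//
// For illustration, ensureLF("a\r\nb\rc") returns "a\nb\nc".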
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}