1 package chroma
2
3 import (
4 "fmt"
5 "os"
6 "path/filepath"
7 "regexp"
8 "sort"
9 "strings"
10 "sync"
11 "time"
12 "unicode/utf8"
13
14 "github.com/dlclark/regexp2"
15 )
16
17
// Rule is a single lexer rule: a regex pattern to match at the current
// position, an Emitter that converts the match into tokens, and an optional
// Mutator applied to the lexer state after a match (e.g. push/pop a state).
type Rule struct {
	Pattern string  // regex source; compiled lazily (see CompiledRule)
	Type    Emitter // emits tokens for the match; may be nil to emit nothing
	Mutator Mutator // optional state mutation after the match; may be nil
}
23
24
// Words creates a regex alternation that matches any of the given literal
// words, in the form prefix + `(w1|w2|...)` + suffix.
//
// Words are ordered longest-first so that, in a left-to-right alternation,
// a longer word wins over one of its own prefixes (e.g. "integer" before
// "int"), and each word is escaped with regexp.QuoteMeta.
//
// Unlike a naive implementation, the caller's slice is NOT modified: the
// sort and quoting operate on an internal copy.
func Words(prefix, suffix string, words ...string) string {
	// Work on a copy so the variadic slice passed by the caller is not
	// reordered or rewritten in place.
	sorted := make([]string, len(words))
	copy(sorted, words)
	// Sort by descending length of the original words; quoting only inserts
	// escapes and does not change which word is the longer match.
	sort.Slice(sorted, func(i, j int) bool {
		return len(sorted[j]) < len(sorted[i])
	})
	for i, word := range sorted {
		sorted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(sorted, `|`) + `)` + suffix
}
34
35
36 func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
37 var out []Token
38 it, err := lexer.Tokenise(options, text)
39 if err != nil {
40 return nil, err
41 }
42 for t := it(); t != EOF; t = it() {
43 out = append(out, t)
44 }
45 return out, nil
46 }
47
48
// Rules maps from a state name (e.g. "root") to the ordered list of rules
// tried, first match wins, while the lexer is in that state.
type Rules map[string][]Rule
50
51
52 func (r Rules) Rename(oldRule, newRule string) Rules {
53 r = r.Clone()
54 r[newRule] = r[oldRule]
55 delete(r, oldRule)
56 return r
57 }
58
59
60 func (r Rules) Clone() Rules {
61 out := map[string][]Rule{}
62 for key, rules := range r {
63 out[key] = make([]Rule, len(rules))
64 copy(out[key], rules)
65 }
66 return out
67 }
68
69
70 func (r Rules) Merge(rules Rules) Rules {
71 out := r.Clone()
72 for k, v := range rules.Clone() {
73 out[k] = v
74 }
75 return out
76 }
77
78
79 func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
80 lexer, err := NewLexer(config, rulesFunc)
81 if err != nil {
82 panic(err)
83 }
84 return lexer
85 }
86
87
88
89
90
91 func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
92 if config == nil {
93 config = &Config{}
94 }
95 for _, glob := range append(config.Filenames, config.AliasFilenames...) {
96 _, err := filepath.Match(glob, "")
97 if err != nil {
98 return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
99 }
100 }
101 r := &RegexLexer{
102 config: config,
103 fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
104 }
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134 return r, nil
135 }
136
137
// Trace enables (or disables) debug tracing: when on, each tokenisation
// step is logged to stderr (see Iterator). Returns the receiver to allow
// chaining.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}
142
143
144
145
// A CompiledRule is a Rule together with its compiled regex.
//
// Regexp may be nil until compilation actually happens (compilation is
// deferred — see maybeCompile).
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string // regex flag letters ("m", "i", "s") derived from the Config
}
151
152
// CompiledRules maps from state name to the compiled rules for that state.
type CompiledRules map[string][]*CompiledRule
154
155
// LexerState holds the mutable state for one tokenisation pass over a piece
// of text; its Iterator method produces the token stream. Mutators operate
// on this state between matches.
type LexerState struct {
	Lexer    *RegexLexer    // lexer this state was created by
	Registry *LexerRegistry // registry for resolving lexer references; may be nil
	Text     []rune         // full input being tokenised
	Pos      int            // current rune offset into Text
	Rules    CompiledRules
	Stack    []string // state stack; the top entry is the active state
	State    string   // name of the currently active state
	Rule     int      // index of the most recently matched rule
	// Groups is the positional capture groups from the most recent match;
	// Groups[0] is the whole match.
	Groups []string
	// NamedGroups is the capture groups, keyed by name, from the most
	// recent match.
	NamedGroups map[string]string
	// MutatorContext is arbitrary scratch storage shared between mutators
	// via Set/Get.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator // nested iterators pushed by emitters
	options        *TokeniseOptions
	newlineAdded   bool // true if a trailing \n was appended to the input
}
175
176
// Set stores a value in the mutator context under the given key.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}
180
181
// Get retrieves a value from the mutator context, or nil if the key is
// absent.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
185
186
// Iterator returns the next token in the stream, or EOF once the input is
// exhausted. It is the core matching loop of the regex lexer: it drains any
// nested iterators pushed by emitters, then tries the active state's rules
// at the current position, emitting an Error token when nothing matches.
func (l *LexerState) Iterator() Token {
	// If a trailing newline was synthesised by Tokenise, stop matching just
	// before it so it is never emitted as part of a token.
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust nested iterators (pushed by emitters such as sub-lexers)
		// before matching more of the input ourselves.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		// The active state is whatever is on top of the state stack.
		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No rule matched at this position (groups == nil signals no match).
		if groups == nil {
			// Pygments-compatible error recovery: an unmatched newline in a
			// non-initial state empties the stack and restarts from the
			// initial state, so e.g. an error inside a parenthesised
			// construct does not poison the rest of the input. Otherwise,
			// emit the single offending rune as an Error token and advance.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		// Advance by the rune length of the whole match (group 0); Pos
		// indexes runes, while groups hold strings.
		l.Pos += utf8.RuneCountInString(groups[0])
		// The mutator runs before emission so it can adjust state that the
		// emitter observes.
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}

	// Drain any iterators pushed during the final matches.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If the state stack emptied before the input was consumed, return the
	// remainder as a single Error token.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}
261
262
// RegexLexer is the default regex-based lexer implementation. Rules are
// fetched lazily (fetchRulesFunc) and compiled lazily (maybeCompile).
type RegexLexer struct {
	registry *LexerRegistry // registry this lexer was added to; may be nil
	config   *Config
	analyser func(text string) float32 // optional content scorer; see AnalyseText
	trace    bool                      // when true, Iterator logs each step to stderr

	mu             sync.Mutex // guards compilation state below
	compiled       bool       // true once maybeCompile has completed successfully
	rawRules       Rules      // rules as returned by fetchRulesFunc
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once // ensures fetchRules runs at most once
}
276
// String returns the lexer's configured name.
func (r *RegexLexer) String() string {
	return r.config.Name
}
280
281
// Rules returns the uncompiled rules that make up this lexer, fetching and
// caching them on first use.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}
288
289
// SetRegistry associates the lexer with a registry, used to resolve
// references to other lexers. Returns the receiver for chaining.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}
294
295
// SetAnalyser installs the scoring function used by AnalyseText. Returns
// the receiver for chaining.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}
300
301 func (r *RegexLexer) AnalyseText(text string) float32 {
302 if r.analyser != nil {
303 return r.analyser(text)
304 }
305 return 0.0
306 }
307
308
// SetConfig replaces the lexer's configuration. Returns the receiver for
// chaining.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}
313
// Config returns the lexer's configuration.
func (r *RegexLexer) Config() *Config {
	return r.config
}
317
318
// maybeCompile compiles every rule's regex and applies all LexerMutators,
// exactly once; subsequent calls are no-ops. Safe for concurrent use.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	// Compile each pattern. \G anchors matching at the search start
	// position, and the Config-derived flags (m/i/s) are prepended as an
	// inline flag group.
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				// Bound pathological backtracking on hostile input.
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	// Apply rule-rewriting mutators (e.g. includes). A mutation invalidates
	// the in-progress iteration, so restart the scan after each one; "seen"
	// guards against a mutator that reappears and would loop forever.
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// The rule set changed; rescan from the beginning.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}
364
365 func (r *RegexLexer) fetchRules() error {
366 rules, err := r.fetchRulesFunc()
367 if err != nil {
368 return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
369 }
370 if _, ok := rules["root"]; !ok {
371 return fmt.Errorf("no \"root\" state")
372 }
373 compiledRules := map[string][]*CompiledRule{}
374 for state, rules := range rules {
375 compiledRules[state] = nil
376 for _, rule := range rules {
377 flags := ""
378 if !r.config.NotMultiline {
379 flags += "m"
380 }
381 if r.config.CaseInsensitive {
382 flags += "i"
383 }
384 if r.config.DotAll {
385 flags += "s"
386 }
387 compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
388 }
389 }
390
391 r.rawRules = rules
392 r.rules = compiledRules
393 return nil
394 }
395
// needRules lazily fetches (at most once, via compileOnce) and compiles the
// lexer's rules, returning the first error from either step.
func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		// The closure captures err so a fetch failure is reported below.
		// Note: on subsequent calls the Do is a no-op and err stays nil.
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	// This err is deliberately shadowed: a compile failure takes precedence.
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}
408
409 func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
410 err := r.needRules()
411 if err != nil {
412 return nil, err
413 }
414 if options == nil {
415 options = defaultOptions
416 }
417 if options.EnsureLF {
418 text = ensureLF(text)
419 }
420 newlineAdded := false
421 if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
422 text += "\n"
423 newlineAdded = true
424 }
425 state := &LexerState{
426 Registry: r.registry,
427 newlineAdded: newlineAdded,
428 options: options,
429 Lexer: r,
430 Text: []rune(text),
431 Stack: []string{options.State},
432 Rules: r.rules,
433 MutatorContext: map[interface{}]interface{}{},
434 }
435 return state.Iterator, nil
436 }
437
438
439 func (r *RegexLexer) MustRules() Rules {
440 rules, err := r.Rules()
441 if err != nil {
442 panic(err)
443 }
444 return rules
445 }
446
447 func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
448 for i, rule := range rules {
449 match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
450 if match != nil && err == nil && match.Index == pos {
451 groups := []string{}
452 namedGroups := make(map[string]string)
453 for _, g := range match.Groups() {
454 namedGroups[g.Name] = g.String()
455 groups = append(groups, g.String())
456 }
457 return i, rule, groups, namedGroups
458 }
459 }
460 return 0, &CompiledRule{}, nil, nil
461 }
462
463
464
// ensureLF normalises line endings: every "\r\n" pair and every lone "\r"
// becomes a single "\n".
func ensureLF(text string) string {
	out := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			// Drop the \r of a \r\n pair; turn a lone \r into \n.
			if i+1 < len(text) && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		out = append(out, c)
	}
	return string(out)
}
481