package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the regex lexer state machine:
// a pattern to match, an Emitter that produces tokens from the match, and an
// optional Mutator that modifies lexer state.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// An Emitter takes group matches and returns an Iterator over the tokens
// they produce.
type Emitter interface {
	// Emit tokens for the given regex groups.
	Emit(groups []string, state *LexerState) Iterator
}

// EmitterFunc is a function that is also an Emitter.
type EmitterFunc func(groups []string, state *LexerState) Iterator

// Emit tokens for groups.
func (e EmitterFunc) Emit(groups []string, state *LexerState) Iterator {
	return e(groups, state)
}

// ByGroups emits a token for each matching group in the rule's regex. If the
// number of emitters does not match the number of groups, a single Error
// token is emitted for the whole match instead.
func ByGroups(emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		iterators := make([]Iterator, 0, len(groups)-1)
		if len(emitters) != len(groups)-1 {
			iterators = append(iterators, Error.Emit(groups, state))
		} else {
			for i, group := range groups[1:] {
				if emitters[i] != nil {
					iterators = append(iterators, emitters[i].Emit([]string{group}, state))
				}
			}
		}
		return Concaterator(iterators...)
	})
}
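
// Illustrative sketch (not part of the original source): a rule that uses
// ByGroups to emit a keyword, whitespace, and a name from three capture
// groups. The token types (KeywordDeclaration, Text, NameVariable) are
// assumed to be defined elsewhere in the package.
//
//	var _ = Rule{
//		Pattern: `(var)(\s+)(\w+)`,
//		Type:    ByGroups(KeywordDeclaration, Text, NameVariable),
//	}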

// ByGroupNames emits a token for each named matching group in the rule's
// regex, keyed by group name. Named groups with no corresponding emitter
// produce Error tokens.
func ByGroupNames(emitters map[string]Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		iterators := make([]Iterator, 0, len(state.NamedGroups)-1)
		if len(state.NamedGroups)-1 == 0 {
			if emitter, ok := emitters[`0`]; ok {
				iterators = append(iterators, emitter.Emit(groups, state))
			} else {
				iterators = append(iterators, Error.Emit(groups, state))
			}
		} else {
			ruleRegex := state.Rules[state.State][state.Rule].Regexp
			for i := 1; i < len(state.NamedGroups); i++ {
				groupName := ruleRegex.GroupNameFromNumber(i)
				group := state.NamedGroups[groupName]
				if emitter, ok := emitters[groupName]; ok {
					if emitter != nil {
						iterators = append(iterators, emitter.Emit([]string{group}, state))
					}
				} else {
					iterators = append(iterators, Error.Emit([]string{group}, state))
				}
			}
		}
		return Concaterator(iterators...)
	})
}
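
// Illustrative sketch (not part of the original source): emitting tokens by
// named capture groups. The token types (NameAttribute, Operator, String)
// are assumed to be defined elsewhere in the package.
//
//	var _ = Rule{
//		Pattern: `(?<key>\w+)(?<sep>=)(?<value>\w+)`,
//		Type: ByGroupNames(map[string]Emitter{
//			`key`:   NameAttribute,
//			`sep`:   Operator,
//			`value`: String,
//		}),
//	}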

// UsingByGroup emits tokens for the matched groups in the regex using a
// sublexer. This is useful for lexing code blocks where the name of the
// sublexer is contained within the block itself, for example a fenced code
// block in Markdown or a language block in SQL.
//
// The sublexer is retrieved by calling sublexerGetFunc with the value
// captured by group number sublexerNameGroup. If that returns a non-nil
// Lexer, tokens for the group numbered codeGroup are emitted using that
// lexer; otherwise they are emitted by the corresponding entry in emitters.
//
// Note: this panics if the number of emitters does not equal the number of
// matched groups in the regex.
func UsingByGroup(sublexerGetFunc func(string) Lexer, sublexerNameGroup, codeGroup int, emitters ...Emitter) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		// Bail early if the caller supplied the wrong number of emitters.
		if len(emitters) != len(groups)-1 {
			panic("UsingByGroup expects number of emitters to be the same as len(groups)-1")
		}

		// Attempt to resolve the sublexer named by the captured group.
		sublexer := sublexerGetFunc(groups[sublexerNameGroup])

		// Build an iterator per group, delegating the code group to the
		// sublexer if one was found.
		iterators := make([]Iterator, len(groups)-1)
		for i, group := range groups[1:] {
			if i == codeGroup-1 && sublexer != nil {
				var err error
				iterators[i], err = sublexer.Tokenise(nil, groups[codeGroup])
				if err != nil {
					panic(err)
				}
			} else if emitters[i] != nil {
				iterators[i] = emitters[i].Emit([]string{group}, state)
			}
		}

		return Concaterator(iterators...)
	})
}
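
// Illustrative sketch (not part of the original source): a Markdown-style
// fenced code block rule. Group 2 captures the sublexer name and group 4 the
// code body; sublexerLookup is an assumed helper with the signature
// func(string) Lexer that resolves a lexer by name.
//
//	var _ = Rule{
//		Pattern: "^(```)(\\w+)(\\n)([\\w\\W]*?)(^```$)",
//		Type: UsingByGroup(
//			sublexerLookup, // assumed lexer lookup by name
//			2, 4,           // group 2 names the sublexer, group 4 is the code
//			String, String, String, Text, String,
//		),
//	}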

// Using returns an Emitter that uses the given Lexer to parse and emit the
// entire match, starting from its "root" state.
func Using(lexer Lexer) Emitter {
	return EmitterFunc(func(groups []string, _ *LexerState) Iterator {
		it, err := lexer.Tokenise(&TokeniseOptions{State: "root", Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// UsingSelf is like Using, but uses the current Lexer, starting in the given
// state.
func UsingSelf(stateName string) Emitter {
	return EmitterFunc(func(groups []string, state *LexerState) Iterator {
		it, err := state.Lexer.Tokenise(&TokeniseOptions{State: stateName, Nested: true}, groups[0])
		if err != nil {
			panic(err)
		}
		return it
	})
}

// Words creates a regex alternation that matches any of the given literal
// words, quoting each word and sorting longer words first so they take
// precedence over their prefixes.
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
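
// Illustrative sketch (not part of the original source): building a keyword
// alternation with word boundaries. Longer words sort first so they win over
// their prefixes in the alternation.
//
//	Words(`\b`, `\b`, "if", "else", "for") // => `\b(else|for|if)\b`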

// Tokenise text using lexer, returning the tokens as a slice rather than an
// iterator.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}

// Rules maps from state name to a sequence of Rules.
type Rules map[string][]Rule

// Rename returns a clone of the Rules with the state key oldRule renamed to
// newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a deep clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r", then merges "rules" into the clone; states
// in "rules" override states of the same name in "r".
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
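
// Illustrative sketch (not part of the original source): deriving a rule set
// by merging an override state into a base set. CommentSingle is assumed to
// be a token type defined elsewhere in the package.
//
//	derived := baseRules.Merge(Rules{
//		"root": {{`//.*`, CommentSingle, nil}},
//	})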

// MustNewLazyLexer creates a new Lexer with deferred rules generation, or
// panics on error.
func MustNewLazyLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLazyLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLazyLexer creates a new regex-based Lexer whose rules are generated by
// rulesFunc on first use, after validating the Config's filename globs.
func NewLazyLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	return &RegexLexer{
		config:       config,
		compilerFunc: rulesFunc,
	}, nil
}

// MustNewLexer creates a new Lexer from an eagerly-provided rule map, or
// panics on error.
//
// Deprecated: Use MustNewLazyLexer instead.
func MustNewLexer(config *Config, rules Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are
// sets of rules that match input, optionally modify lexer state, and output
// tokens.
//
// Deprecated: Use NewLazyLexer instead.
func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
	return NewLazyLexer(config, func() Rules { return rules })
}
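
// Illustrative sketch (not part of the original source): defining a minimal
// lexer with a single "root" state using the lazy constructor recommended
// above. The token types (Keyword, Text) are assumed to be defined elsewhere
// in the package.
//
//	var _ = MustNewLazyLexer(
//		&Config{Name: "example", Filenames: []string{"*.ex"}},
//		func() Rules {
//			return Rules{
//				"root": {
//					{`\b(if|else)\b`, Keyword, nil},
//					{`\s+`, Text, nil},
//					{`.`, Text, nil},
//				},
//			}
//		},
//	)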

// Trace enables debug tracing of lexer state to stderr.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regex patterns are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules for
// that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single tokenisation pass.
type LexerState struct {
	Lexer *RegexLexer
	Text  []rune
	Pos   int
	Rules CompiledRules
	Stack []string
	State string
	Rule  int
	// Positional group matches for the current rule.
	Groups []string
	// Named group matches for the current rule.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}

// Iterator returns the next Token from the lexer, or EOF once the input is
// exhausted.
func (l *LexerState) Iterator() Token {
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match: error recovery. If we hit a newline while not in the
		// initial state, empty the stack and resume scanning from the
		// initial state; this produces error-tolerant highlighting for
		// erroneous input. Otherwise, emit the offending rune as an Error
		// token and advance one position.
		if groups == nil {
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}

	// Exhaust the iterator stack once the main loop is done.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get here and still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default regex-based lexer implementation.
type RegexLexer struct {
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu           sync.Mutex
	compiled     bool
	rules        map[string][]*CompiledRule
	compilerFunc func() Rules
	compileOnce  sync.Once
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
	r.analyser = analyser
	return r
}

// AnalyseText scores how likely this lexer is to match the given text, using
// the analyser function if one has been set and returning 0.0 otherwise.
func (r *RegexLexer) AnalyseText(text string) float32 {
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// Config returns the Config for this lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}

// maybeCompile lazily compiles the rule regexes and applies any
// LexerMutators the first time the lexer is used.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added or
				// removed rules.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

// compileRules materialises the lazily-provided Rules into CompiledRules,
// attaching the regex flags derived from the lexer's Config.
func (r *RegexLexer) compileRules() error {
	rules := r.compilerFunc()
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rules = compiledRules
	return nil
}

// Tokenise text using the lexer, returning an Iterator over its tokens.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
	var err error
	if r.compilerFunc != nil {
		r.compileOnce.Do(func() {
			err = r.compileRules()
		})
	}
	if err != nil {
		return nil, err
	}
	if err := r.maybeCompile(); err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
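
// Illustrative sketch (not part of the original source): tokenising a string
// and draining the returned Iterator until EOF, as the package-level
// Tokenise helper does.
//
//	it, err := lexer.Tokenise(nil, "var x = 1")
//	if err != nil {
//		// handle error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Printf("%s %q\n", t.Type, t.Value)
//	}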

// matchRules returns the first rule in rules that matches text at pos, along
// with its index, positional groups, and named groups. A nil groups slice
// indicates no match.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF normalises line endings by replacing "\r" and "\r\n" with "\n".
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}