package filtering

import (
	"errors"
	"fmt"
	"io"
	"unicode"
	"unicode/utf8"
)
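
// Lexer tokenizes a filter expression into a stream of tokens.
//
// A minimal usage sketch (Token, TokenType, and Position are defined
// elsewhere in this package; Lex returns io.EOF at end of input):
//
//	var lexer Lexer
//	lexer.Init(`name = "value" AND age >= 18`)
//	for {
//		token, err := lexer.Lex()
//		if errors.Is(err, io.EOF) {
//			break
//		}
//		if err != nil {
//			return err
//		}
//		fmt.Println(token.Type, token.Value)
//	}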
type Lexer struct {
	filter      string
	tokenStart  Position
	tokenEnd    Position
	lineOffsets []int32
}

// Init (re)initializes the lexer to tokenize the provided filter.
// The lineOffsets slice is reused across calls to avoid reallocation.
func (l *Lexer) Init(filter string) {
	*l = Lexer{
		filter:      filter,
		tokenStart:  Position{Offset: 0, Line: 1, Column: 1},
		tokenEnd:    Position{Offset: 0, Line: 1, Column: 1},
		lineOffsets: l.lineOffsets[:0],
	}
}

// Lex returns the next token in the filter.
// It returns io.EOF when the filter has been fully tokenized.
func (l *Lexer) Lex() (Token, error) {
	r, err := l.nextRune()
	if err != nil {
		return Token{}, err
	}
	switch r {
	case '(', ')', '-', '.', '=', ':', ',':
		// Single-character operators map directly to their token type.
		return l.emit(TokenType(l.tokenValue()))
	case '<', '>', '!':
		// Consume a trailing '=' to form the <=, >= and != operators.
		if l.sniffRune('=') {
			_, _ = l.nextRune()
		}
		return l.emit(TokenType(l.tokenValue()))
	case '\'', '"':
		// Scan until a quote of the same kind terminates the string.
		// Escape sequences are not supported.
		for {
			r2, err := l.nextRune()
			if err != nil {
				if errors.Is(err, io.EOF) {
					return Token{}, l.errorf("unterminated string")
				}
				return Token{}, err
			}
			if r2 == r {
				return l.emit(TokenTypeString)
			}
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		// Hex numbers have the form 0x... and require a leading zero.
		if r == '0' && l.sniffRune('x') {
			_, _ = l.nextRune()
			for l.sniff(isHexDigit) {
				_, _ = l.nextRune()
			}
			return l.emit(TokenTypeHexNumber)
		}
		for l.sniff(unicode.IsDigit) {
			_, _ = l.nextRune()
		}
		return l.emit(TokenTypeNumber)
	}

	if unicode.IsSpace(r) {
		for l.sniff(unicode.IsSpace) {
			_, _ = l.nextRune()
		}
		return l.emit(TokenTypeWhitespace)
	}

	// Anything else is text; keywords are recognized after scanning.
	for l.sniff(isText) {
		_, _ = l.nextRune()
	}

	if tokenType := TokenType(l.tokenValue()); tokenType.IsKeyword() {
		return l.emit(tokenType)
	}

	return l.emit(TokenTypeText)
}
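
// For example, lexing `a >= 0x2A` yields the tokens: text "a", whitespace,
// the operator ">=", whitespace, and hex number "0x2A".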

// Position returns the current position of the lexer: the start of the
// next token to be lexed.
func (l *Lexer) Position() Position {
	return l.tokenStart
}

// LineOffsets returns the byte offsets of the newlines encountered so far.
func (l *Lexer) LineOffsets() []int32 {
	return l.lineOffsets
}
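
// positionForOffset is a hypothetical helper (an illustration, not part of
// the original file) sketching how the newline offsets from LineOffsets can
// map a byte offset back to a line/column pair. Note that the column
// computed here is byte-based, whereas nextRune tracks columns per rune.
func positionForOffset(lineOffsets []int32, offset int32) (line, column int32) {
	line, lineStart := int32(1), int32(0)
	for _, newlineOffset := range lineOffsets {
		if newlineOffset >= offset {
			break
		}
		// Each newline before the offset advances the line and moves the
		// line start to the byte after the newline.
		line++
		lineStart = newlineOffset + 1
	}
	return line, offset - lineStart + 1
}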

// emit produces a token of the given type from the current token span and
// advances tokenStart past it.
func (l *Lexer) emit(t TokenType) (Token, error) {
	token := Token{
		Position: l.tokenStart,
		Type:     t,
		Value:    l.tokenValue(),
	}
	l.tokenStart = l.tokenEnd
	return token, nil
}

// tokenValue returns the substring of the filter spanned by the current token.
func (l *Lexer) tokenValue() string {
	return l.filter[l.tokenStart.Offset:l.tokenEnd.Offset]
}

// remainingFilter returns the unlexed tail of the filter.
func (l *Lexer) remainingFilter() string {
	return l.filter[l.tokenEnd.Offset:]
}

// nextRune consumes and returns the next rune, updating the offset, line,
// column, and newline offsets. It returns io.EOF at the end of the filter.
func (l *Lexer) nextRune() (rune, error) {
	r, n := utf8.DecodeRuneInString(l.remainingFilter())
	switch {
	case n == 0:
		return r, io.EOF
	case r == utf8.RuneError:
		return r, l.errorf("invalid UTF-8")
	}
	if r == '\n' {
		l.lineOffsets = append(l.lineOffsets, l.tokenEnd.Offset)
		l.tokenEnd.Line++
		l.tokenEnd.Column = 1
	} else {
		l.tokenEnd.Column++
	}
	l.tokenEnd.Offset += int32(n)
	return r, nil
}

// sniff reports whether the next runes match the given predicates, one rune
// per predicate, without consuming any input.
func (l *Lexer) sniff(wantFns ...func(rune) bool) bool {
	remaining := l.remainingFilter()
	for _, wantFn := range wantFns {
		r, n := utf8.DecodeRuneInString(remaining)
		if !wantFn(r) {
			return false
		}
		remaining = remaining[n:]
	}
	return true
}

// sniffRune reports whether the next rune equals want, without consuming it.
func (l *Lexer) sniffRune(want rune) bool {
	r, _ := utf8.DecodeRuneInString(l.remainingFilter())
	return r == want
}

// errorf returns a lexError annotated with the filter and the token start position.
func (l *Lexer) errorf(format string, args ...interface{}) error {
	return &lexError{
		filter:   l.filter,
		position: l.tokenStart,
		message:  fmt.Sprintf(format, args...),
	}
}

// isText reports whether r can appear in a text token: any rune that is not
// an operator character, whitespace, or an invalid rune.
func isText(r rune) bool {
	switch r {
	case utf8.RuneError, '(', ')', '-', '.', '=', ':', '<', '>', '!', ',':
		return false
	}
	return !unicode.IsSpace(r)
}

// isHexDigit reports whether r is an ASCII hexadecimal digit.
func isHexDigit(r rune) bool {
	return unicode.Is(unicode.ASCII_Hex_Digit, r)
}