1 package parser
2
3 import (
4 "fmt"
5 "strconv"
6 "strings"
7 "unicode/utf8"
8 )
9
const (
	// WhitespaceChars holds, as literal characters, the set this package
	// writes in place of the JavaScript \s escape (wrapped in a character
	// class, or negated for \S, by the escape scanner).
	WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff"

	// Re2Dot is the character class written in place of an unescaped '.':
	// anything except CR, LF, U+2028 and U+2029.
	Re2Dot = "[^\r\n\u2028\u2029]"
)
14
type (
	// regexpParseError is the payload shared by the exported error types:
	// the parser offset at which the problem was recorded and its message.
	regexpParseError struct {
		offset int
		err    string
	}

	// RegexpErrorIncompatible is the error type the parser records for
	// non-fatal problems, such as the "re2: Invalid ..." lookahead,
	// lookbehind and backreference reports.
	RegexpErrorIncompatible struct {
		regexpParseError
	}

	// RegexpSyntaxError is the error type the parser records for fatal
	// problems, such as an unterminated group or character class.
	RegexpSyntaxError struct {
		regexpParseError
	}
)

// Error implements the error interface and returns the stored message. The
// exported error types get this method through embedding.
func (e regexpParseError) Error() string {
	message := e.err
	return message
}
30
// _RegExp_parser holds the state of a single pattern transformation.
type _RegExp_parser struct {
	str    string // the pattern being transformed
	length int    // len(str), in bytes

	chr       rune // current character; -1 once the input is exhausted or an error was recorded
	chrOffset int  // byte offset of chr within str
	offset    int  // byte offset just past chr (chr may be several bytes wide)

	err error // first error recorded; later ones are ignored

	// goRegexp receives the rewritten pattern once the output starts to
	// differ from the input.
	goRegexp strings.Builder
	// passOffset is the length of the input prefix accepted unchanged so
	// far, or -1 once that prefix has been copied into goRegexp.
	passOffset int
}
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58 func TransformRegExp(pattern string) (transformed string, err error) {
59
60 if pattern == "" {
61 return "", nil
62 }
63
64 parser := _RegExp_parser{
65 str: pattern,
66 length: len(pattern),
67 }
68 err = parser.parse()
69 if err != nil {
70 return "", err
71 }
72
73 return parser.ResultString(), nil
74 }
75
76 func (self *_RegExp_parser) ResultString() string {
77 if self.passOffset != -1 {
78 return self.str[:self.passOffset]
79 }
80 return self.goRegexp.String()
81 }
82
83 func (self *_RegExp_parser) parse() (err error) {
84 self.read()
85 self.scan()
86 return self.err
87 }
88
89 func (self *_RegExp_parser) read() {
90 if self.offset < self.length {
91 self.chrOffset = self.offset
92 chr, width := rune(self.str[self.offset]), 1
93 if chr >= utf8.RuneSelf {
94 chr, width = utf8.DecodeRuneInString(self.str[self.offset:])
95 if chr == utf8.RuneError && width == 1 {
96 self.error(true, "Invalid UTF-8 character")
97 return
98 }
99 }
100 self.offset += width
101 self.chr = chr
102 } else {
103 self.chrOffset = self.length
104 self.chr = -1
105 }
106 }
107
108 func (self *_RegExp_parser) stopPassing() {
109 self.goRegexp.Grow(3 * len(self.str) / 2)
110 self.goRegexp.WriteString(self.str[:self.passOffset])
111 self.passOffset = -1
112 }
113
114 func (self *_RegExp_parser) write(p []byte) {
115 if self.passOffset != -1 {
116 self.stopPassing()
117 }
118 self.goRegexp.Write(p)
119 }
120
121 func (self *_RegExp_parser) writeByte(b byte) {
122 if self.passOffset != -1 {
123 self.stopPassing()
124 }
125 self.goRegexp.WriteByte(b)
126 }
127
128 func (self *_RegExp_parser) writeString(s string) {
129 if self.passOffset != -1 {
130 self.stopPassing()
131 }
132 self.goRegexp.WriteString(s)
133 }
134
135 func (self *_RegExp_parser) scan() {
136 for self.chr != -1 {
137 switch self.chr {
138 case '\\':
139 self.read()
140 self.scanEscape(false)
141 case '(':
142 self.pass()
143 self.scanGroup()
144 case '[':
145 self.scanBracket()
146 case ')':
147 self.error(true, "Unmatched ')'")
148 return
149 case '.':
150 self.writeString(Re2Dot)
151 self.read()
152 default:
153 self.pass()
154 }
155 }
156 }
157
158
159 func (self *_RegExp_parser) scanGroup() {
160 str := self.str[self.chrOffset:]
161 if len(str) > 1 {
162 if str[0] == '?' {
163 ch := str[1]
164 switch {
165 case ch == '=' || ch == '!':
166 self.error(false, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2])
167 return
168 case ch == '<':
169 self.error(false, "re2: Invalid (%s) <lookbehind>", self.str[self.chrOffset:self.chrOffset+2])
170 return
171 case ch != ':':
172 self.error(true, "Invalid group")
173 return
174 }
175 }
176 }
177 for self.chr != -1 && self.chr != ')' {
178 switch self.chr {
179 case '\\':
180 self.read()
181 self.scanEscape(false)
182 case '(':
183 self.pass()
184 self.scanGroup()
185 case '[':
186 self.scanBracket()
187 case '.':
188 self.writeString(Re2Dot)
189 self.read()
190 default:
191 self.pass()
192 continue
193 }
194 }
195 if self.chr != ')' {
196 self.error(true, "Unterminated group")
197 return
198 }
199 self.pass()
200 }
201
202
203 func (self *_RegExp_parser) scanBracket() {
204 str := self.str[self.chrOffset:]
205 if strings.HasPrefix(str, "[]") {
206
207 self.writeString("[^\u0000-\U0001FFFF]")
208 self.offset += 1
209 self.read()
210 return
211 }
212
213 if strings.HasPrefix(str, "[^]") {
214 self.writeString("[\u0000-\U0001FFFF]")
215 self.offset += 2
216 self.read()
217 return
218 }
219
220 self.pass()
221 for self.chr != -1 {
222 if self.chr == ']' {
223 break
224 } else if self.chr == '\\' {
225 self.read()
226 self.scanEscape(true)
227 continue
228 }
229 self.pass()
230 }
231 if self.chr != ']' {
232 self.error(true, "Unterminated character class")
233 return
234 }
235 self.pass()
236 }
237
238
239 func (self *_RegExp_parser) scanEscape(inClass bool) {
240 offset := self.chrOffset
241
242 var length, base uint32
243 switch self.chr {
244
245 case '0', '1', '2', '3', '4', '5', '6', '7':
246 var value int64
247 size := 0
248 for {
249 digit := int64(digitValue(self.chr))
250 if digit >= 8 {
251
252 break
253 }
254 value = value*8 + digit
255 self.read()
256 size += 1
257 }
258 if size == 1 {
259 if value != 0 {
260
261 self.error(false, "re2: Invalid \\%d <backreference>", value)
262 return
263 }
264 self.passString(offset-1, self.chrOffset)
265 return
266 }
267 tmp := []byte{'\\', 'x', '0', 0}
268 if value >= 16 {
269 tmp = tmp[0:2]
270 } else {
271 tmp = tmp[0:3]
272 }
273 tmp = strconv.AppendInt(tmp, value, 16)
274 self.write(tmp)
275 return
276
277 case '8', '9':
278 self.read()
279 self.error(false, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset])
280 return
281
282 case 'x':
283 self.read()
284 length, base = 2, 16
285
286 case 'u':
287 self.read()
288 if self.chr == '{' {
289 self.read()
290 length, base = 0, 16
291 } else {
292 length, base = 4, 16
293 }
294
295 case 'b':
296 if inClass {
297 self.write([]byte{'\\', 'x', '0', '8'})
298 self.read()
299 return
300 }
301 fallthrough
302
303 case 'B':
304 fallthrough
305
306 case 'd', 'D', 'w', 'W':
307
308
309 fallthrough
310
311 case '\\':
312 fallthrough
313
314 case 'f', 'n', 'r', 't', 'v':
315 self.passString(offset-1, self.offset)
316 self.read()
317 return
318
319 case 'c':
320 self.read()
321 var value int64
322 if 'a' <= self.chr && self.chr <= 'z' {
323 value = int64(self.chr - 'a' + 1)
324 } else if 'A' <= self.chr && self.chr <= 'Z' {
325 value = int64(self.chr - 'A' + 1)
326 } else {
327 self.writeByte('c')
328 return
329 }
330 tmp := []byte{'\\', 'x', '0', 0}
331 if value >= 16 {
332 tmp = tmp[0:2]
333 } else {
334 tmp = tmp[0:3]
335 }
336 tmp = strconv.AppendInt(tmp, value, 16)
337 self.write(tmp)
338 self.read()
339 return
340 case 's':
341 if inClass {
342 self.writeString(WhitespaceChars)
343 } else {
344 self.writeString("[" + WhitespaceChars + "]")
345 }
346 self.read()
347 return
348 case 'S':
349 if inClass {
350 self.error(false, "S in class")
351 return
352 } else {
353 self.writeString("[^" + WhitespaceChars + "]")
354 }
355 self.read()
356 return
357 default:
358
359
360 if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) {
361
362 self.passString(offset-1, self.offset)
363 self.read()
364 return
365 }
366
367 self.pass()
368 return
369 }
370
371
372 valueOffset := self.chrOffset
373
374 if length > 0 {
375 for length := length; length > 0; length-- {
376 digit := uint32(digitValue(self.chr))
377 if digit >= base {
378
379 goto skip
380 }
381 self.read()
382 }
383 } else {
384 for self.chr != '}' && self.chr != -1 {
385 digit := uint32(digitValue(self.chr))
386 if digit >= base {
387
388 goto skip
389 }
390 self.read()
391 }
392 }
393
394 if length == 4 || length == 0 {
395 self.write([]byte{
396 '\\',
397 'x',
398 '{',
399 })
400 self.passString(valueOffset, self.chrOffset)
401 if length != 0 {
402 self.writeByte('}')
403 }
404 } else if length == 2 {
405 self.passString(offset-1, valueOffset+2)
406 } else {
407
408 self.error(true, "re2: Illegal branch in scanEscape")
409 return
410 }
411
412 return
413
414 skip:
415 self.passString(offset, self.chrOffset)
416 }
417
418 func (self *_RegExp_parser) pass() {
419 if self.passOffset == self.chrOffset {
420 self.passOffset = self.offset
421 } else {
422 if self.passOffset != -1 {
423 self.stopPassing()
424 }
425 if self.chr != -1 {
426 self.goRegexp.WriteRune(self.chr)
427 }
428 }
429 self.read()
430 }
431
432 func (self *_RegExp_parser) passString(start, end int) {
433 if self.passOffset == start {
434 self.passOffset = end
435 return
436 }
437 if self.passOffset != -1 {
438 self.stopPassing()
439 }
440 self.goRegexp.WriteString(self.str[start:end])
441 }
442
443 func (self *_RegExp_parser) error(fatal bool, msg string, msgValues ...interface{}) {
444 if self.err != nil {
445 return
446 }
447 e := regexpParseError{
448 offset: self.offset,
449 err: fmt.Sprintf(msg, msgValues...),
450 }
451 if fatal {
452 self.err = RegexpSyntaxError{e}
453 } else {
454 self.err = RegexpErrorIncompatible{e}
455 }
456 self.offset = self.length
457 self.chr = -1
458 }
459
View as plain text