1 package syntax
2
3 import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10 )
11
// RegexOptions is a bit set of flags that control how a pattern is parsed.
type RegexOptions int32

// Option flags. Every value occupies its own bit, so flags can be combined
// with the | operator and tested with &.
//
// NOTE: only IgnoreCase is declared with the RegexOptions type. Every other
// name has its own initializer and no type, which makes it an untyped integer
// constant; it converts implicitly wherever a RegexOptions is expected.
const (
	IgnoreCase              RegexOptions = 0x0001 // inline option letter 'i'
	Multiline                            = 0x0002 // inline option letter 'm'
	ExplicitCapture                      = 0x0004 // inline option letter 'n'
	Compiled                             = 0x0008 // has no inline option letter in optionFromCode
	Singleline                           = 0x0010 // inline option letter 's'
	IgnorePatternWhitespace              = 0x0020 // inline option letter 'x'
	RightToLeft                          = 0x0040 // inline option letter 'r'
	Debug                                = 0x0080 // inline option letter 'd'; makes Parse dump the tree to stdout
	ECMAScript                           = 0x0100 // inline option letter 'e'
	RE2                                  = 0x0200 // has no inline option letter in optionFromCode
	Unicode                              = 0x0400 // inline option letter 'u'
)
27
28 func optionFromCode(ch rune) RegexOptions {
29
30 switch ch {
31 case 'i', 'I':
32 return IgnoreCase
33 case 'r', 'R':
34 return RightToLeft
35 case 'm', 'M':
36 return Multiline
37 case 'n', 'N':
38 return ExplicitCapture
39 case 's', 'S':
40 return Singleline
41 case 'x', 'X':
42 return IgnorePatternWhitespace
43 case 'd', 'D':
44 return Debug
45 case 'e', 'E':
46 return ECMAScript
47 case 'u', 'U':
48 return Unicode
49 default:
50 return 0
51 }
52 }
53
54
55
// Error describes a failure to parse a regular expression.
type Error struct {
	Code ErrorCode     // message text; used as a format string when Args is non-empty
	Expr string        // the pattern that failed to parse
	Args []interface{} // optional values substituted into Code's formatting verbs
}
61
62 func (e *Error) Error() string {
63 if len(e.Args) == 0 {
64 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
65 }
66 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
67 }
68
69
// ErrorCode is the message text of a parse error. Some codes contain
// formatting verbs (%v, %c); (*Error).Error fills them in from Error.Args.
type ErrorCode string

// Parse error messages.
//
// NOTE: only ErrInternalError is declared with the ErrorCode type; the rest
// are untyped string constants that convert implicitly when passed where an
// ErrorCode is expected (for example to parser.getErr).
const (
	// ErrInternalError is returned when the parser reaches a state it does not expect.
	ErrInternalError ErrorCode = "regexp/syntax: internal error"

	// Parse errors
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[%c-%c] range in reverse order"
)
112
113 func (e ErrorCode) String() string {
114 return string(e)
115 }
116
// parser holds the state shared by the two passes Parse makes over a pattern:
// countCaptures (capture bookkeeping) and scanRegex (tree construction).
type parser struct {
	// Tree-building state. These are manipulated mainly by helpers that are
	// not part of this section (startGroup, pushGroup, addUnit*, ...).
	stack         *regexNode // cleared by reset
	group         *regexNode // group currently being built (its type is inspected in scanGroupOpen)
	alternation   *regexNode
	concatenation *regexNode // scanReplacement starts a fresh one and returns it
	unit          *regexNode // most recently parsed unit; nil when there is nothing to quantify

	patternRaw string // pattern exactly as supplied; reported in errors
	pattern    []rune // pattern decoded into runes; all scanning indexes into this

	currentPos  int                  // reset to 0 before the second pass; presumably the scan position used by textpos/moveRight — confirm
	specialCase *unicode.SpecialCase // not referenced in this section

	autocap  int // next number to hand out to an unnumbered capture group
	capcount int // number of distinct capture slots recorded in caps
	captop   int // one past the highest capture number seen (clamped at math.MaxInt32)
	capsize  int // not referenced in this section

	caps     map[int]int    // capture number -> position value given to noteCaptureSlot
	capnames map[string]int // group name -> position while counting; rewritten to name -> capture number by assignNameSlots

	capnumlist  []int    // sorted capture numbers; only built when the numbering has gaps
	capnamelist []string // group names; rebuilt in capture-number order by assignNameSlots

	options         RegexOptions   // options currently in effect
	optionsStack    []RegexOptions // truncated by reset; pushed/popped by helpers outside this section
	ignoreNextParen bool           // when set, the next plain '(' must not be counted or created as a capture
}
145
// Limits used to detect overflow before appending a decimal digit to a
// number being accumulated (value*10 + digit must not exceed math.MaxInt32).
const (
	maxValueDiv10 int = math.MaxInt32 / 10 // largest value that can still be multiplied by 10
	maxValueMod10     = math.MaxInt32 % 10 // largest digit that may follow maxValueDiv10 (untyped constant)
)
150
151
152 func Parse(re string, op RegexOptions) (*RegexTree, error) {
153 p := parser{
154 options: op,
155 caps: make(map[int]int),
156 }
157 p.setPattern(re)
158
159 if err := p.countCaptures(); err != nil {
160 return nil, err
161 }
162
163 p.reset(op)
164 root, err := p.scanRegex()
165
166 if err != nil {
167 return nil, err
168 }
169 tree := &RegexTree{
170 root: root,
171 caps: p.caps,
172 capnumlist: p.capnumlist,
173 captop: p.captop,
174 Capnames: p.capnames,
175 Caplist: p.capnamelist,
176 options: op,
177 }
178
179 if tree.options&Debug > 0 {
180 os.Stdout.WriteString(tree.Dump())
181 }
182
183 return tree, nil
184 }
185
186 func (p *parser) setPattern(pattern string) {
187 p.patternRaw = pattern
188 p.pattern = make([]rune, 0, len(pattern))
189
190
191 for _, r := range pattern {
192 p.pattern = append(p.pattern, r)
193 }
194 }
195 func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
196 return &Error{Code: code, Expr: p.patternRaw, Args: args}
197 }
198
199 func (p *parser) noteCaptureSlot(i, pos int) {
200 if _, ok := p.caps[i]; !ok {
201
202 p.caps[i] = pos
203 p.capcount++
204
205 if p.captop <= i {
206 if i == math.MaxInt32 {
207 p.captop = i
208 } else {
209 p.captop = i + 1
210 }
211 }
212 }
213 }
214
215 func (p *parser) noteCaptureName(name string, pos int) {
216 if p.capnames == nil {
217 p.capnames = make(map[string]int)
218 }
219
220 if _, ok := p.capnames[name]; !ok {
221 p.capnames[name] = pos
222 p.capnamelist = append(p.capnamelist, name)
223 }
224 }
225
226 func (p *parser) assignNameSlots() {
227 if p.capnames != nil {
228 for _, name := range p.capnamelist {
229 for p.isCaptureSlot(p.autocap) {
230 p.autocap++
231 }
232 pos := p.capnames[name]
233 p.capnames[name] = p.autocap
234 p.noteCaptureSlot(p.autocap, pos)
235
236 p.autocap++
237 }
238 }
239
240
241 if p.capcount < p.captop {
242 p.capnumlist = make([]int, p.capcount)
243 i := 0
244
245 for k := range p.caps {
246 p.capnumlist[i] = k
247 i++
248 }
249
250 sort.Ints(p.capnumlist)
251 }
252
253
254 if p.capnames != nil || p.capnumlist != nil {
255 var oldcapnamelist []string
256 var next int
257 var k int
258
259 if p.capnames == nil {
260 oldcapnamelist = nil
261 p.capnames = make(map[string]int)
262 p.capnamelist = []string{}
263 next = -1
264 } else {
265 oldcapnamelist = p.capnamelist
266 p.capnamelist = []string{}
267 next = p.capnames[oldcapnamelist[0]]
268 }
269
270 for i := 0; i < p.capcount; i++ {
271 j := i
272 if p.capnumlist != nil {
273 j = p.capnumlist[i]
274 }
275
276 if next == j {
277 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
278 k++
279
280 if k == len(oldcapnamelist) {
281 next = -1
282 } else {
283 next = p.capnames[oldcapnamelist[k]]
284 }
285
286 } else {
287
288 str := strconv.Itoa(j)
289 p.capnamelist = append(p.capnamelist, str)
290 p.capnames[str] = j
291 }
292 }
293 }
294 }
295
296 func (p *parser) consumeAutocap() int {
297 r := p.autocap
298 p.autocap++
299 return r
300 }
301
302
303
// countCaptures is the first pass over the pattern. It does not build any
// nodes; it only walks the text far enough to find every capturing group and
// records numbered groups (noteCaptureSlot) and named groups
// (noteCaptureName), then calls assignNameSlots to number the named ones.
//
// The only error it returns itself comes from scanning an explicit group
// number. Errors from the scanBackslash, scanBlank and scanCharSet calls made
// here are discarded.
func (p *parser) countCaptures() error {
	var ch rune

	// Capture 0 is always recorded.
	p.noteCaptureSlot(0, 0)

	p.autocap = 1

	for p.charsRight() > 0 {
		// pos is the position of the character about to be consumed; for a
		// '(' it is what gets stored with the capture.
		pos := p.textpos()
		ch = p.moveRightGetChar()
		switch ch {
		case '\\':
			// Skip the escape so an escaped '(' or '[' is not miscounted.
			if p.charsRight() > 0 {
				p.scanBackslash(true)
			}

		case '#':
			// With the x option, back up onto the '#' and let scanBlank
			// consume the comment.
			if p.useOptionX() {
				p.moveLeft()
				p.scanBlank()
			}

		case '[':
			// Skip the whole character class in scan-only mode.
			p.scanCharSet(false, true)

		case ')':
			if !p.emptyOptionsStack() {
				p.popOptions()
			}

		case '(':
			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
				// "(?#" starts an inline comment: back up onto the '(' and
				// let scanBlank consume it.
				p.moveLeft()
				p.scanBlank()
			} else {
				p.pushOptions()
				if p.charsRight() > 0 && p.rightChar(0) == '?' {
					// Some "(?..." construct; step over the '?'.
					p.moveRight(1)

					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
						// "(?<" or "(?'": may be a named or explicitly
						// numbered capture.
						p.moveRight(1)
						ch = p.rightChar(0)

						// A leading '0' or a non-word character is not
						// recorded as a capture here.
						if ch != '0' && IsWordChar(ch) {
							if ch >= '1' && ch <= '9' {
								dec, err := p.scanDecimal()
								if err != nil {
									return err
								}
								p.noteCaptureSlot(dec, pos)
							} else {
								p.noteCaptureName(p.scanCapname(), pos)
							}
						}
					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
						// "(?P<name" named capture, recognized only with the
						// RE2 option.
						p.moveRight(2)
						ch = p.rightChar(0)
						if IsWordChar(ch) {
							p.noteCaptureName(p.scanCapname(), pos)
						}

					} else {
						// Anything else after "(?": read inline options so
						// that option state stays accurate for the checks
						// made in this pass.
						p.scanOptions()

						if p.charsRight() > 0 {
							if p.rightChar(0) == ')' {
								// Options-only group: consume the ')' and
								// keep the options just scanned.
								p.moveRight(1)
								p.popKeepOptions()
							} else if p.rightChar(0) == '(' {
								// "(?(" — the parenthesis that follows must
								// not be counted as a capture. continue
								// skips the flag reset at the bottom of this
								// case so the flag survives to the next '('.
								p.ignoreNextParen = true

								continue
							}
						}
					}
				} else {
					// Plain '(' : an automatic capture unless the n option
					// is on or this paren was flagged to be ignored.
					if !p.useOptionN() && !p.ignoreNextParen {
						p.noteCaptureSlot(p.consumeAutocap(), pos)
					}
				}
			}

			p.ignoreNextParen = false

		}
	}

	p.assignNameSlots()
	return nil
}
405
406 func (p *parser) reset(topopts RegexOptions) {
407 p.currentPos = 0
408 p.autocap = 1
409 p.ignoreNextParen = false
410
411 if len(p.optionsStack) > 0 {
412 p.optionsStack = p.optionsStack[:0]
413 }
414
415 p.options = topopts
416 p.stack = nil
417 }
418
419 func (p *parser) scanRegex() (*regexNode, error) {
420 ch := '@'
421 isQuant := false
422
423 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
424
425 for p.charsRight() > 0 {
426 wasPrevQuantifier := isQuant
427 isQuant = false
428
429 if err := p.scanBlank(); err != nil {
430 return nil, err
431 }
432
433 startpos := p.textpos()
434
435
436
437 if p.useOptionX() {
438 for p.charsRight() > 0 {
439 ch = p.rightChar(0)
440
441 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
442 break
443 }
444 p.moveRight(1)
445 }
446 } else {
447 for p.charsRight() > 0 {
448 ch = p.rightChar(0)
449 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
450 break
451 }
452 p.moveRight(1)
453 }
454 }
455
456 endpos := p.textpos()
457
458 p.scanBlank()
459
460 if p.charsRight() == 0 {
461 ch = '!'
462 } else if ch = p.rightChar(0); isSpecial(ch) {
463 isQuant = isQuantifier(ch)
464 p.moveRight(1)
465 } else {
466 ch = ' '
467 }
468
469 if startpos < endpos {
470 cchUnquantified := endpos - startpos
471 if isQuant {
472 cchUnquantified--
473 }
474 wasPrevQuantifier = false
475
476 if cchUnquantified > 0 {
477 p.addToConcatenate(startpos, cchUnquantified, false)
478 }
479
480 if isQuant {
481 p.addUnitOne(p.charAt(endpos - 1))
482 }
483 }
484
485 switch ch {
486 case '!':
487 goto BreakOuterScan
488
489 case ' ':
490 goto ContinueOuterScan
491
492 case '[':
493 cc, err := p.scanCharSet(p.useOptionI(), false)
494 if err != nil {
495 return nil, err
496 }
497 p.addUnitSet(cc)
498
499 case '(':
500 p.pushOptions()
501
502 if grouper, err := p.scanGroupOpen(); err != nil {
503 return nil, err
504 } else if grouper == nil {
505 p.popKeepOptions()
506 } else {
507 p.pushGroup()
508 p.startGroup(grouper)
509 }
510
511 continue
512
513 case '|':
514 p.addAlternate()
515 goto ContinueOuterScan
516
517 case ')':
518 if p.emptyStack() {
519 return nil, p.getErr(ErrUnexpectedParen)
520 }
521
522 if err := p.addGroup(); err != nil {
523 return nil, err
524 }
525 if err := p.popGroup(); err != nil {
526 return nil, err
527 }
528 p.popOptions()
529
530 if p.unit == nil {
531 goto ContinueOuterScan
532 }
533
534 case '\\':
535 n, err := p.scanBackslash(false)
536 if err != nil {
537 return nil, err
538 }
539 p.addUnitNode(n)
540
541 case '^':
542 if p.useOptionM() {
543 p.addUnitType(ntBol)
544 } else {
545 p.addUnitType(ntBeginning)
546 }
547
548 case '$':
549 if p.useOptionM() {
550 p.addUnitType(ntEol)
551 } else {
552 p.addUnitType(ntEndZ)
553 }
554
555 case '.':
556 if p.useOptionE() {
557 p.addUnitSet(ECMAAnyClass())
558 } else if p.useOptionS() {
559 p.addUnitSet(AnyClass())
560 } else {
561 p.addUnitNotone('\n')
562 }
563
564 case '{', '*', '+', '?':
565 if p.unit == nil {
566 if wasPrevQuantifier {
567 return nil, p.getErr(ErrInvalidRepeatOp)
568 } else {
569 return nil, p.getErr(ErrMissingRepeatArgument)
570 }
571 }
572 p.moveLeft()
573
574 default:
575 return nil, p.getErr(ErrInternalError)
576 }
577
578 if err := p.scanBlank(); err != nil {
579 return nil, err
580 }
581
582 if p.charsRight() > 0 {
583 isQuant = p.isTrueQuantifier()
584 }
585 if p.charsRight() == 0 || !isQuant {
586
587 p.addConcatenate()
588 goto ContinueOuterScan
589 }
590
591 ch = p.moveRightGetChar()
592
593
594 for p.unit != nil {
595 var min, max int
596 var lazy bool
597
598 switch ch {
599 case '*':
600 min = 0
601 max = math.MaxInt32
602
603 case '?':
604 min = 0
605 max = 1
606
607 case '+':
608 min = 1
609 max = math.MaxInt32
610
611 case '{':
612 {
613 var err error
614 startpos = p.textpos()
615 if min, err = p.scanDecimal(); err != nil {
616 return nil, err
617 }
618 max = min
619 if startpos < p.textpos() {
620 if p.charsRight() > 0 && p.rightChar(0) == ',' {
621 p.moveRight(1)
622 if p.charsRight() == 0 || p.rightChar(0) == '}' {
623 max = math.MaxInt32
624 } else {
625 if max, err = p.scanDecimal(); err != nil {
626 return nil, err
627 }
628 }
629 }
630 }
631
632 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
633 p.addConcatenate()
634 p.textto(startpos - 1)
635 goto ContinueOuterScan
636 }
637 }
638
639 default:
640 return nil, p.getErr(ErrInternalError)
641 }
642
643 if err := p.scanBlank(); err != nil {
644 return nil, err
645 }
646
647 if p.charsRight() == 0 || p.rightChar(0) != '?' {
648 lazy = false
649 } else {
650 p.moveRight(1)
651 lazy = true
652 }
653
654 if min > max {
655 return nil, p.getErr(ErrInvalidRepeatSize)
656 }
657
658 p.addConcatenate3(lazy, min, max)
659 }
660
661 ContinueOuterScan:
662 }
663
664 BreakOuterScan:
665 ;
666
667 if !p.emptyStack() {
668 return nil, p.getErr(ErrMissingParen)
669 }
670
671 if err := p.addGroup(); err != nil {
672 return nil, err
673 }
674
675 return p.unit, nil
676
677 }
678
679
682 func (p *parser) scanReplacement() (*regexNode, error) {
683 var c, startpos int
684
685 p.concatenation = newRegexNode(ntConcatenate, p.options)
686
687 for {
688 c = p.charsRight()
689 if c == 0 {
690 break
691 }
692
693 startpos = p.textpos()
694
695 for c > 0 && p.rightChar(0) != '$' {
696 p.moveRight(1)
697 c--
698 }
699
700 p.addToConcatenate(startpos, p.textpos()-startpos, true)
701
702 if c > 0 {
703 if p.moveRightGetChar() == '$' {
704 n, err := p.scanDollar()
705 if err != nil {
706 return nil, err
707 }
708 p.addUnitNode(n)
709 }
710 p.addConcatenate()
711 }
712 }
713
714 return p.concatenation, nil
715 }
716
717
// scanDollar parses what follows a '$' in a replacement pattern; the '$'
// itself has already been consumed by the caller.
//
// Forms handled, as visible in the code below:
//   - a decimal group number, optionally wrapped in braces: $1, ${1}
//   - a group name wrapped in braces: ${name}
//   - "$$" for a literal dollar sign
//   - $& (group 0) and the special references $`, $', $+ and $_
//
// A reference is only produced for a capture that exists. In every other
// case the scan position is restored to just after the '$' and a node for a
// literal '$' is returned. The only errors come from number scanning.
func (p *parser) scanDollar() (*regexNode, error) {
	// A trailing '$' is a literal.
	if p.charsRight() == 0 {
		return newRegexNodeCh(ntOne, p.options, '$'), nil
	}

	ch := p.rightChar(0)
	angled := false // true once an opening '{' has been consumed
	backpos := p.textpos()
	lastEndPos := backpos

	// Note an opening brace, provided something follows it.
	if ch == '{' && p.charsRight() > 1 {
		angled = true
		p.moveRight(1)
		ch = p.rightChar(0)
	}

	// Try to parse a group reference.
	if ch >= '0' && ch <= '9' {
		if !angled && p.useOptionE() {
			// ECMAScript option, no braces: read digits one at a time and
			// remember the longest prefix that names an existing capture.
			capnum := -1
			newcapnum := int(ch - '0')
			p.moveRight(1)
			if p.isCaptureSlot(newcapnum) {
				capnum = newcapnum
				lastEndPos = p.textpos()
			}

			for p.charsRight() > 0 {
				ch = p.rightChar(0)
				if ch < '0' || ch > '9' {
					break
				}
				digit := int(ch - '0')
				// Refuse to overflow math.MaxInt32 when appending the digit.
				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
					return nil, p.getErr(ErrCaptureGroupOutOfRange)
				}

				newcapnum = newcapnum*10 + digit

				p.moveRight(1)
				if p.isCaptureSlot(newcapnum) {
					capnum = newcapnum
					lastEndPos = p.textpos()
				}
			}
			// Give back any digits beyond the longest valid group number.
			p.textto(lastEndPos)
			if capnum >= 0 {
				return newRegexNodeM(ntRef, p.options, capnum), nil
			}
		} else {
			capnum, err := p.scanDecimal()
			if err != nil {
				return nil, err
			}
			// With braces, the closing '}' is required (and consumed here).
			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
				if p.isCaptureSlot(capnum) {
					return newRegexNodeM(ntRef, p.options, capnum), nil
				}
			}
		}
	} else if angled && IsWordChar(ch) {
		// ${name}
		capname := p.scanCapname()

		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
			if p.isCaptureName(capname) {
				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
			}
		}
	} else if !angled {
		// capnum == 1 is used as the "nothing recognized" marker; every
		// recognized case below assigns a different value.
		capnum := 1

		switch ch {
		case '$':
			p.moveRight(1)
			return newRegexNodeCh(ntOne, p.options, '$'), nil
		case '&':
			capnum = 0
		case '`':
			capnum = replaceLeftPortion
		case '\'':
			capnum = replaceRightPortion
		case '+':
			capnum = replaceLastGroup
		case '_':
			capnum = replaceWholeString
		}

		if capnum != 1 {
			p.moveRight(1)
			return newRegexNodeM(ntRef, p.options, capnum), nil
		}
	}

	// Unrecognized: rewind to just after the '$' and treat it as a literal.
	p.textto(backpos)
	return newRegexNodeCh(ntOne, p.options, '$'), nil
}
821
822
823
824
// scanGroupOpen parses the opening of a group; the '(' has already been
// consumed by the caller. It returns the node that should head the new group.
//
// A nil node with a nil error means the construct was an options-only group
// that has been fully consumed, closing ')' included, so no group is opened.
//
// Constructs that cannot be recognized produce ErrUnrecognizedGrouping,
// carrying the text scanned since the opening parenthesis.
func (p *parser) scanGroupOpen() (*regexNode, error) {
	var ch rune
	var nt nodeType
	var err error
	close := '>' // terminator expected after a group name; becomes '\'' for the (?'name' form
	start := p.textpos()

	// Ordinary group: no '?' follows, or the text is exactly "?)". It is a
	// capture unless the n option is on or this paren was flagged to be
	// ignored.
	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
		if p.useOptionN() || p.ignoreNextParen {
			p.ignoreNextParen = false
			return newRegexNode(ntGroup, p.options), nil
		}
		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
	}

	// Step over the '?'.
	p.moveRight(1)

	// This loop never iterates twice: every path through the body breaks,
	// returns, or jumps to BreakRecognize.
	for {
		if p.charsRight() == 0 {
			break
		}

		switch ch = p.moveRightGetChar(); ch {
		case ':':
			// (?: non-capturing group
			nt = ntGroup

		case '=':
			// (?= — RightToLeft is cleared for the group's contents
			p.options &= ^RightToLeft
			nt = ntRequire

		case '!':
			// (?! — RightToLeft is cleared for the group's contents
			p.options &= ^RightToLeft
			nt = ntPrevent

		case '>':
			// (?>
			nt = ntGreedy

		case '\'':
			// (?'name' uses a quote as the name terminator, then shares the
			// '<' handling below.
			close = '\''
			fallthrough

		case '<':
			if p.charsRight() == 0 {
				goto BreakRecognize
			}

			switch ch = p.moveRightGetChar(); ch {
			case '=':
				// (?<= — not valid in the quote form
				if close == '\'' {
					goto BreakRecognize
				}

				p.options |= RightToLeft
				nt = ntRequire

			case '!':
				// (?<! — not valid in the quote form
				if close == '\'' {
					goto BreakRecognize
				}

				p.options |= RightToLeft
				nt = ntPrevent

			default:
				// A named/numbered capture, optionally followed by
				// "-other" naming a second group (stored as uncapnum).
				p.moveLeft()
				capnum := -1
				uncapnum := -1
				proceed := false

				// Grab the part before the '-'.
				if ch >= '0' && ch <= '9' {
					if capnum, err = p.scanDecimal(); err != nil {
						return nil, err
					}

					if !p.isCaptureSlot(capnum) {
						capnum = -1
					}

					// The number must be followed by the terminator or '-'.
					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
						return nil, p.getErr(ErrInvalidGroupName)
					}
					if capnum == 0 {
						return nil, p.getErr(ErrCapNumNotZero)
					}
				} else if IsWordChar(ch) {
					capname := p.scanCapname()

					if p.isCaptureName(capname) {
						capnum = p.captureSlotFromName(capname)
					}

					// The name must be followed by the terminator or '-'.
					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
						return nil, p.getErr(ErrInvalidGroupName)
					}
				} else if ch == '-' {
					// "(?<-other>" — no first name at all.
					proceed = true
				} else {
					// Group names must start with a word character.
					return nil, p.getErr(ErrInvalidGroupName)
				}

				// Grab the part after the '-', if there is one.
				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
					p.moveRight(1)

					// Nothing after the '-' is an error.
					if p.charsRight() == 0 {
						return nil, p.getErr(ErrInvalidGroupName)
					}

					ch = p.rightChar(0)
					if ch >= '0' && ch <= '9' {
						if uncapnum, err = p.scanDecimal(); err != nil {
							return nil, err
						}

						if !p.isCaptureSlot(uncapnum) {
							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
						}

						// The number must be followed by the terminator.
						if p.charsRight() > 0 && p.rightChar(0) != close {
							return nil, p.getErr(ErrInvalidGroupName)
						}
					} else if IsWordChar(ch) {
						uncapname := p.scanCapname()

						if !p.isCaptureName(uncapname) {
							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
						}
						uncapnum = p.captureSlotFromName(uncapname)

						// The name must be followed by the terminator.
						if p.charsRight() > 0 && p.rightChar(0) != close {
							return nil, p.getErr(ErrInvalidGroupName)
						}
					} else {
						// Group names must start with a word character.
						return nil, p.getErr(ErrInvalidGroupName)
					}
				}

				// A capture node is produced only when at least one of the
				// two parts resolved and the terminator is present.
				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
				}
				goto BreakRecognize
			}

		case '(':
			// "(?(" — a conditional. First see whether the condition is a
			// plain reference to a capture, by number or by name.
			parenPos := p.textpos()
			if p.charsRight() > 0 {
				ch = p.rightChar(0)

				// Check for a numbered reference.
				if ch >= '0' && ch <= '9' {
					var capnum int
					if capnum, err = p.scanDecimal(); err != nil {
						return nil, err
					}
					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
						if p.isCaptureSlot(capnum) {
							return newRegexNodeM(ntTestref, p.options, capnum), nil
						}
						return nil, p.getErr(ErrUndefinedReference, capnum)
					}

					return nil, p.getErr(ErrMalformedReference, capnum)

				} else if IsWordChar(ch) {
					capname := p.scanCapname()

					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
					}
				}
			}

			// Not a capture reference: the condition is an expression.
			// Rewind onto the inner '(' so the caller parses it as a group,
			// and flag it so that group is not created as a capture.
			nt = ntTestgroup
			p.textto(parenPos - 1)
			p.ignoreNextParen = true

			// The condition may not be a comment or a capturing group.
			charsRight := p.charsRight()
			if charsRight >= 3 && p.rightChar(1) == '?' {
				rightchar2 := p.rightChar(2)

				// (?(?#...) — comment
				if rightchar2 == '#' {
					return nil, p.getErr(ErrAlternationCantHaveComment)
				}

				// (?(?'name' — quoted capture
				if rightchar2 == '\'' {
					return nil, p.getErr(ErrAlternationCantCapture)
				}

				// (?(?<name — '<' is fine only when followed by '!' or '='
				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
					return nil, p.getErr(ErrAlternationCantCapture)
				}
			}

		case 'P':
			// (?P<name> — only recognized with the RE2 option; without it
			// this case falls through to the default handling.
			if p.useRE2() {
				// Need at least "<x>" after the 'P'.
				if p.charsRight() < 3 {
					goto BreakRecognize
				}

				ch = p.moveRightGetChar()
				if ch != '<' {
					goto BreakRecognize
				}

				// Peek at the first character of the name.
				ch = p.moveRightGetChar()
				p.moveLeft()

				if IsWordChar(ch) {
					capnum := -1
					capname := p.scanCapname()

					if p.isCaptureName(capname) {
						capnum = p.captureSlotFromName(capname)
					}

					// The name must be followed by '>'.
					if p.charsRight() > 0 && p.rightChar(0) != '>' {
						return nil, p.getErr(ErrInvalidGroupName)
					}

					// A capture node is produced only for a known name with
					// its terminator present.
					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
					}
					goto BreakRecognize

				} else {
					// Group names must start with a word character.
					return nil, p.getErr(ErrInvalidGroupName)
				}
			}

			// Not in RE2 mode: treat 'P' like any other character.
			fallthrough

		default:
			// Anything else is expected to be inline options, e.g. (?i) or
			// (?i:...). Un-read the character and scan the options.
			p.moveLeft()

			nt = ntGroup

			// Options are not scanned when the enclosing group is a
			// conditional test group.
			if p.group.t != ntTestgroup {
				p.scanOptions()
			}
			if p.charsRight() == 0 {
				goto BreakRecognize
			}

			// ")" right after the options: options-only group, no node.
			if ch = p.moveRightGetChar(); ch == ')' {
				return nil, nil
			}

			// Otherwise only ':' may follow the options.
			if ch != ':' {
				goto BreakRecognize
			}

		}

		return newRegexNode(nt, p.options), nil
	}

BreakRecognize:

	// Reached for every unrecognized construct, including running out of
	// pattern right after "(?".
	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
}
1112
1113
1114 func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116 if p.charsRight() == 0 {
1117 return nil, p.getErr(ErrIllegalEndEscape)
1118 }
1119
1120 switch ch := p.rightChar(0); ch {
1121 case 'b', 'B', 'A', 'G', 'Z', 'z':
1122 p.moveRight(1)
1123 return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125 case 'w':
1126 p.moveRight(1)
1127 if p.useOptionE() || p.useRE2() {
1128 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129 }
1130 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132 case 'W':
1133 p.moveRight(1)
1134 if p.useOptionE() || p.useRE2() {
1135 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136 }
1137 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139 case 's':
1140 p.moveRight(1)
1141 if p.useOptionE() {
1142 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143 } else if p.useRE2() {
1144 return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145 }
1146 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148 case 'S':
1149 p.moveRight(1)
1150 if p.useOptionE() {
1151 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152 } else if p.useRE2() {
1153 return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157 case 'd':
1158 p.moveRight(1)
1159 if p.useOptionE() || p.useRE2() {
1160 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164 case 'D':
1165 p.moveRight(1)
1166 if p.useOptionE() || p.useRE2() {
1167 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168 }
1169 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171 case 'p', 'P':
1172 p.moveRight(1)
1173 prop, err := p.parseProperty()
1174 if err != nil {
1175 return nil, err
1176 }
1177 cc := &CharSet{}
1178 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179 if p.useOptionI() {
1180 cc.addLowercase()
1181 }
1182
1183 return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185 default:
1186 return p.scanBasicBackslash(scanOnly)
1187 }
1188 }
1189
1190
// scanBasicBackslash parses a backslash sequence that is either a
// backreference or a single-character escape; the backslash itself has
// already been consumed.
//
// Backreference forms handled: \k<name>, \k'name', \<name>, \'name' (the
// last three are not available with the ECMAScript option), and \N with a
// decimal number. If no backreference is recognized the position is restored
// and the text is scanned as a character escape via scanCharEscape.
//
// When scanOnly is true the text is consumed but (nil, nil) is returned on
// the paths that check the flag; no node is built there.
func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
	if p.charsRight() == 0 {
		return nil, p.getErr(ErrIllegalEndEscape)
	}
	angled := false  // a name/number delimiter ('<' or '\'') has been consumed
	k := false       // the sequence started with \k
	close := '\x00'  // delimiter expected after the name/number

	backpos := p.textpos()
	ch := p.rightChar(0)

	// \k always introduces a named reference, except that with the
	// ECMAScript option it does so only when the pattern has named groups.
	if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
		if p.charsRight() >= 2 {
			p.moveRight(1)
			ch = p.moveRightGetChar()

			// The quote delimiter is not accepted with the ECMAScript option.
			if ch == '<' || (!p.useOptionE() && ch == '\'') {
				angled = true
				if ch == '\'' {
					close = '\''
				} else {
					close = '>'
				}
			}
		}

		if !angled || p.charsRight() <= 0 {
			return nil, p.getErr(ErrMalformedNameRef)
		}

		ch = p.rightChar(0)
		k = true

	} else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 {
		// \<name> or \'name' without the 'k'.
		angled = true
		if ch == '\'' {
			close = '\''
		} else {
			close = '>'
		}

		p.moveRight(1)
		ch = p.rightChar(0)
	}

	// Try to parse a backreference.
	if angled && ch >= '0' && ch <= '9' {
		// Delimited number, e.g. \k<1>.
		capnum, err := p.scanDecimal()
		if err != nil {
			return nil, err
		}

		if p.charsRight() > 0 && p.moveRightGetChar() == close {
			if p.isCaptureSlot(capnum) {
				return newRegexNodeM(ntRef, p.options, capnum), nil
			}
			return nil, p.getErr(ErrUndefinedBackRef, capnum)
		}
	} else if !angled && ch >= '1' && ch <= '9' {
		// Bare number, e.g. \1.
		capnum, err := p.scanDecimal()
		if err != nil {
			return nil, err
		}

		if scanOnly {
			return nil, nil
		}

		if p.isCaptureSlot(capnum) {
			return newRegexNodeM(ntRef, p.options, capnum), nil
		}
		// A single-digit reference to a missing group is an error unless
		// the ECMAScript option is on; other cases fall through and are
		// rescanned as a character escape.
		if capnum <= 9 && !p.useOptionE() {
			return nil, p.getErr(ErrUndefinedBackRef, capnum)
		}

	} else if angled {
		// Delimited name.
		capname := p.scanCapname()

		if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {

			if scanOnly {
				return nil, nil
			}

			if p.isCaptureName(capname) {
				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
			}
			return nil, p.getErr(ErrUndefinedNameRef, capname)
		} else {
			// Only the \k form treats a malformed name as an error.
			if k {
				return nil, p.getErr(ErrMalformedNameRef)
			}
		}
	}

	// Not a backreference: rewind and scan it as a character escape.
	p.textto(backpos)
	ch, err := p.scanCharEscape()
	if err != nil {
		return nil, err
	}

	if scanOnly {
		return nil, nil
	}

	if p.useOptionI() {
		ch = unicode.ToLower(ch)
	}

	return newRegexNodeCh(ntOne, p.options, ch), nil
}
1311
1312
1313 func (p *parser) parseProperty() (string, error) {
1314
1315
1316 if p.charsRight() >= 1 && p.rightChar(0) != '{' {
1317 ch := string(p.moveRightGetChar())
1318
1319 if !isValidUnicodeCat(ch) {
1320 return "", p.getErr(ErrUnknownSlashP, ch)
1321 }
1322 return ch, nil
1323 }
1324
1325 if p.charsRight() < 3 {
1326 return "", p.getErr(ErrIncompleteSlashP)
1327 }
1328 ch := p.moveRightGetChar()
1329 if ch != '{' {
1330 return "", p.getErr(ErrMalformedSlashP)
1331 }
1332
1333 startpos := p.textpos()
1334 for p.charsRight() > 0 {
1335 ch = p.moveRightGetChar()
1336 if !(IsWordChar(ch) || ch == '-') {
1337 p.moveLeft()
1338 break
1339 }
1340 }
1341 capname := string(p.pattern[startpos:p.textpos()])
1342
1343 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1344 return "", p.getErr(ErrIncompleteSlashP)
1345 }
1346
1347 if !isValidUnicodeCat(capname) {
1348 return "", p.getErr(ErrUnknownSlashP, capname)
1349 }
1350
1351 return capname, nil
1352 }
1353
1354
1355 func (p *parser) typeFromCode(ch rune) nodeType {
1356 switch ch {
1357 case 'b':
1358 if p.useOptionE() {
1359 return ntECMABoundary
1360 }
1361 return ntBoundary
1362 case 'B':
1363 if p.useOptionE() {
1364 return ntNonECMABoundary
1365 }
1366 return ntNonboundary
1367 case 'A':
1368 return ntBeginning
1369 case 'G':
1370 return ntStart
1371 case 'Z':
1372 return ntEndZ
1373 case 'z':
1374 return ntEnd
1375 default:
1376 return ntNothing
1377 }
1378 }
1379
1380
1381 func (p *parser) scanBlank() error {
1382 if p.useOptionX() {
1383 for {
1384 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1385 p.moveRight(1)
1386 }
1387
1388 if p.charsRight() == 0 {
1389 break
1390 }
1391
1392 if p.rightChar(0) == '#' {
1393 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1394 p.moveRight(1)
1395 }
1396 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1397 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1398 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1399 p.moveRight(1)
1400 }
1401 if p.charsRight() == 0 {
1402 return p.getErr(ErrUnterminatedComment)
1403 }
1404 p.moveRight(1)
1405 } else {
1406 break
1407 }
1408 }
1409 } else {
1410 for {
1411 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1412 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1413 return nil
1414 }
1415
1416 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1417 p.moveRight(1)
1418 }
1419 if p.charsRight() == 0 {
1420 return p.getErr(ErrUnterminatedComment)
1421 }
1422 p.moveRight(1)
1423 }
1424 }
1425 return nil
1426 }
1427
1428 func (p *parser) scanCapname() string {
1429 startpos := p.textpos()
1430
1431 for p.charsRight() > 0 {
1432 if !IsWordChar(p.moveRightGetChar()) {
1433 p.moveLeft()
1434 break
1435 }
1436 }
1437
1438 return string(p.pattern[startpos:p.textpos()])
1439 }
1440
1441
// scanCharSet scans the body of a bracketed character class. It starts just
// after the opening '[' (the recursive calls below consume the '[' before
// calling) and stops just after the matching ']'.
//
// caseInsensitive is forwarded to addCategory and, when set, makes the
// finished set go through addLowercase. When scanOnly is true the pattern is
// only walked: no CharSet is built, the returned set is nil, and the checks
// that live inside the "!scanOnly" branches are not performed.
//
// It returns ErrUnterminatedBracket if the pattern ends before the class is
// closed, and otherwise whichever error a nested scan or validity check
// produced.
func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
	ch := '\x00'
	chPrev := '\x00'
	inRange := false
	firstChar := true
	closed := false

	// cc stays nil in scanOnly mode; every use below is guarded by !scanOnly.
	var cc *CharSet
	if !scanOnly {
		cc = &CharSet{}
	}

	// A leading '^' negates the class. It does not clear firstChar.
	if p.charsRight() > 0 && p.rightChar(0) == '^' {
		p.moveRight(1)
		if !scanOnly {
			cc.negate = true
		}
	}

	// firstChar is cleared by the loop's post statement, so it is also cleared
	// by every "continue" below.
	for ; p.charsRight() > 0; firstChar = false {
		fTranslatedChar := false
		ch = p.moveRightGetChar()
		if ch == ']' {
			if !firstChar {
				closed = true
				break
			} else if p.useOptionE() {
				// ECMAScript: ']' in first position closes the class at once.
				// NOTE(review): NoneClass presumably matches nothing - confirm.
				if !scanOnly {
					cc.addRanges(NoneClass().ranges)
				}
				closed = true
				break
			}
			// Otherwise a ']' in first position falls through and is handled
			// as an ordinary character.

		} else if ch == '\\' && p.charsRight() > 0 {
			// Escape sequence. The class shorthands add themselves and
			// continue; none of them may be the end point of a range.
			switch ch = p.moveRightGetChar(); ch {
			case 'D', 'd':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
				}
				continue

			case 'S', 's':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
				}
				continue

			case 'W', 'w':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}

					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
				}
				continue

			case 'p', 'P':
				if !scanOnly {
					if inRange {
						return nil, p.getErr(ErrBadClassInCharRange, ch)
					}
					prop, err := p.parseProperty()
					if err != nil {
						return nil, err
					}
					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
				} else {
					// scanOnly: just step over the property; its result and
					// error are deliberately ignored here.
					p.parseProperty()
				}

				continue

			case '-':
				// An escaped hyphen is always a literal '-'.
				if !scanOnly {
					cc.addRange(ch, ch)
				}
				continue

			default:
				// Any other escape: back up onto the character after the
				// backslash and let scanCharEscape translate it.
				p.moveLeft()
				var err error
				ch, err = p.scanCharEscape()
				if err != nil {
					return nil, err
				}
				fTranslatedChar = true
				// This break only leaves the switch, not the for loop.
				break
			}
		} else if ch == '[' {
			// Possible POSIX-style bracket expression such as [:name:] or
			// [:^name:] inside the class.
			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
				savePos := p.textpos()

				p.moveRight(1)
				negate := false
				if p.charsRight() > 1 && p.rightChar(0) == '^' {
					negate = true
					p.moveRight(1)
				}

				nm := p.scanCapname()
				if !scanOnly && p.useRE2() {
					// In RE2 mode the name is looked up and added right away;
					// an unknown name is an error.
					if ok := cc.addNamedASCII(nm, negate); !ok {
						return nil, p.getErr(ErrInvalidCharRange)
					}
				}
				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
					// No closing ":]": rewind and treat the '[' as a literal.
					p.textto(savePos)
				} else if p.useRE2() {
					// Fully consumed in RE2 mode; nothing else to add.
					continue
				}
				// Outside RE2 mode a well-formed [:name:] is skipped over and
				// ch (still '[') is handled as a literal below.
			}
		}

		if inRange {
			// ch is the right-hand side of a pending "chPrev-" range.
			inRange = false
			if !scanOnly {
				if ch == '[' && !fTranslatedChar && !firstChar {
					// "x-[" is not a range but the start of a subtraction:
					// keep chPrev as a literal and scan the nested class,
					// whose '[' has already been consumed.
					cc.addChar(chPrev)
					sub, err := p.scanCharSet(caseInsensitive, false)
					if err != nil {
						return nil, err
					}
					cc.addSubtraction(sub)

					// A subtraction must be directly followed by the ']'
					// that closes this class.
					if p.charsRight() > 0 && p.rightChar(0) != ']' {
						return nil, p.getErr(ErrSubtractionMustBeLast)
					}
				} else {
					// An ordinary range; its bounds must be in order.
					if chPrev > ch {
						return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
					}
					cc.addRange(chPrev, ch)
				}
			}
		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
			// ch is followed by '-' and something other than ']': remember it
			// as the possible start of a range and consume the '-'.
			chPrev = ch
			inRange = true
			p.moveRight(1)
		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
			// A literal '-' directly followed by '[' while not in a range
			// starts a subtraction.
			if !scanOnly {
				p.moveRight(1)
				sub, err := p.scanCharSet(caseInsensitive, false)
				if err != nil {
					return nil, err
				}
				cc.addSubtraction(sub)

				if p.charsRight() > 0 && p.rightChar(0) != ']' {
					return nil, p.getErr(ErrSubtractionMustBeLast)
				}
			} else {
				// scanOnly: step over the nested class; its result and error
				// are deliberately ignored here.
				p.moveRight(1)
				p.scanCharSet(caseInsensitive, true)
			}
		} else {
			// Plain single character.
			if !scanOnly {
				cc.addRange(ch, ch)
			}
		}
	}

	if !closed {
		return nil, p.getErr(ErrUnterminatedBracket)
	}

	if !scanOnly && caseInsensitive {
		cc.addLowercase()
	}

	return cc, nil
}
1633
1634
1635 func (p *parser) scanDecimal() (int, error) {
1636 i := 0
1637 var d int
1638
1639 for p.charsRight() > 0 {
1640 d = int(p.rightChar(0) - '0')
1641 if d < 0 || d > 9 {
1642 break
1643 }
1644 p.moveRight(1)
1645
1646 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1647 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1648 }
1649
1650 i *= 10
1651 i += d
1652 }
1653
1654 return int(i), nil
1655 }
1656
1657
1658 func isOnlyTopOption(option RegexOptions) bool {
1659 return option == RightToLeft || option == ECMAScript || option == RE2
1660 }
1661
1662
1663 func (p *parser) scanOptions() {
1664
1665 for off := false; p.charsRight() > 0; p.moveRight(1) {
1666 ch := p.rightChar(0)
1667
1668 if ch == '-' {
1669 off = true
1670 } else if ch == '+' {
1671 off = false
1672 } else {
1673 option := optionFromCode(ch)
1674 if option == 0 || isOnlyTopOption(option) {
1675 return
1676 }
1677
1678 if off {
1679 p.options &= ^option
1680 } else {
1681 p.options |= option
1682 }
1683 }
1684 }
1685 }
1686
1687
1688 func (p *parser) scanCharEscape() (r rune, err error) {
1689
1690 ch := p.moveRightGetChar()
1691
1692 if ch >= '0' && ch <= '7' {
1693 p.moveLeft()
1694 return p.scanOctal(), nil
1695 }
1696
1697 pos := p.textpos()
1698
1699 switch ch {
1700 case 'x':
1701
1702 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1703 if p.useOptionE() {
1704 return ch, nil
1705 }
1706 p.moveRight(1)
1707 return p.scanHexUntilBrace()
1708 } else {
1709 r, err = p.scanHex(2)
1710 }
1711 case 'u':
1712
1713 if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1714 p.moveRight(1)
1715 return p.scanHexUntilBrace()
1716 } else {
1717 r, err = p.scanHex(4)
1718 }
1719 case 'a':
1720 return '\u0007', nil
1721 case 'b':
1722 return '\b', nil
1723 case 'e':
1724 return '\u001B', nil
1725 case 'f':
1726 return '\f', nil
1727 case 'n':
1728 return '\n', nil
1729 case 'r':
1730 return '\r', nil
1731 case 't':
1732 return '\t', nil
1733 case 'v':
1734 return '\u000B', nil
1735 case 'c':
1736 r, err = p.scanControl()
1737 default:
1738 if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1739 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1740 }
1741 return ch, nil
1742 }
1743 if err != nil && p.useOptionE() {
1744 p.textto(pos)
1745 return ch, nil
1746 }
1747 return
1748 }
1749
1750
1751 func (p *parser) scanControl() (rune, error) {
1752 if p.charsRight() <= 0 {
1753 return 0, p.getErr(ErrMissingControl)
1754 }
1755
1756 ch := p.moveRightGetChar()
1757
1758
1759
1760 if ch >= 'a' && ch <= 'z' {
1761 ch = (ch - ('a' - 'A'))
1762 }
1763 ch = (ch - '@')
1764 if ch >= 0 && ch < ' ' {
1765 return ch, nil
1766 }
1767
1768 return 0, p.getErr(ErrUnrecognizedControl)
1769
1770 }
1771
1772
1773
1774 func (p *parser) scanHexUntilBrace() (rune, error) {
1775
1776
1777 i := 0
1778 hasContent := false
1779
1780 for p.charsRight() > 0 {
1781 ch := p.moveRightGetChar()
1782 if ch == '}' {
1783
1784
1785 if !hasContent {
1786 return 0, p.getErr(ErrTooFewHex)
1787 }
1788 return rune(i), nil
1789 }
1790 hasContent = true
1791
1792 d := hexDigit(ch)
1793 if d < 0 {
1794 return 0, p.getErr(ErrMissingBrace)
1795 }
1796
1797 i *= 0x10
1798 i += d
1799
1800 if i > unicode.MaxRune {
1801 return 0, p.getErr(ErrInvalidHex)
1802 }
1803 }
1804
1805
1806 return 0, p.getErr(ErrMissingBrace)
1807 }
1808
1809
1810 func (p *parser) scanHex(c int) (rune, error) {
1811
1812 i := 0
1813
1814 if p.charsRight() >= c {
1815 for c > 0 {
1816 d := hexDigit(p.moveRightGetChar())
1817 if d < 0 {
1818 break
1819 }
1820 i *= 0x10
1821 i += d
1822 c--
1823 }
1824 }
1825
1826 if c > 0 {
1827 return 0, p.getErr(ErrTooFewHex)
1828 }
1829
1830 return rune(i), nil
1831 }
1832
1833
// hexDigit returns the numeric value (0-15) of the ASCII hexadecimal digit
// ch, accepting both upper- and lower-case letters, or -1 when ch is not a
// hexadecimal digit.
func hexDigit(ch rune) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch-'a') + 0xa
	case 'A' <= ch && ch <= 'F':
		return int(ch-'A') + 0xa
	default:
		return -1
	}
}
1850
1851
1852 func (p *parser) scanOctal() rune {
1853
1854
1855 c := 3
1856
1857 if c > p.charsRight() {
1858 c = p.charsRight()
1859 }
1860
1861
1862 i := 0
1863 d := int(p.rightChar(0) - '0')
1864 for c > 0 && d <= 7 && d >= 0 {
1865 if i >= 0x20 && p.useOptionE() {
1866 break
1867 }
1868 i *= 8
1869 i += d
1870 c--
1871
1872 p.moveRight(1)
1873 if !p.rightMost() {
1874 d = int(p.rightChar(0) - '0')
1875 }
1876 }
1877
1878
1879
1880 i &= 0xFF
1881
1882 return rune(i)
1883 }
1884
1885
// textpos returns the current scan position, an index into p.pattern.
func (p *parser) textpos() int {
	return p.currentPos
}
1889
1890
// textto sets the current scan position to pos. No bounds check is done.
func (p *parser) textto(pos int) {
	p.currentPos = pos
}
1894
1895
// moveRightGetChar returns the character at the current position and then
// advances the position by one. It does no bounds check of its own, so it
// panics with an index error when the position is at the end of the pattern.
func (p *parser) moveRightGetChar() rune {
	ch := p.pattern[p.currentPos]
	p.currentPos++
	return ch
}
1901
1902
// moveRight advances the current position by i characters.
func (p *parser) moveRight(i int) {
	// No bounds check: the position may end up past the end of the pattern.
	p.currentPos += i
}
1907
1908
// moveLeft moves the current position back by one character.
func (p *parser) moveLeft() {
	p.currentPos--
}
1912
1913
// charAt returns the pattern character at absolute index i. It panics with an
// index error when i is out of range.
func (p *parser) charAt(i int) rune {
	return p.pattern[i]
}
1917
1918
// rightChar returns the character i positions to the right of the current
// position without moving; rightChar(0) is the character about to be read.
func (p *parser) rightChar(i int) rune {
	// No bounds check: callers are expected to consult charsRight first.
	return p.pattern[p.currentPos+i]
}
1923
1924
// charsRight returns how many characters remain from the current position to
// the end of the pattern.
func (p *parser) charsRight() int {
	return len(p.pattern) - p.currentPos
}
1928
// rightMost reports whether the current position is exactly at the end of
// the pattern.
func (p *parser) rightMost() bool {
	return p.currentPos == len(p.pattern)
}
1932
1933
// captureSlotFromName returns the value stored in p.capnames for capname.
// A name that is not present yields the zero value, so use isCaptureName to
// tell an unknown name apart from one mapped to 0.
func (p *parser) captureSlotFromName(capname string) int {
	return p.capnames[capname]
}
1937
1938
1939 func (p *parser) isCaptureSlot(i int) bool {
1940 if p.caps != nil {
1941 _, ok := p.caps[i]
1942 return ok
1943 }
1944
1945 return (i >= 0 && i < p.capsize)
1946 }
1947
1948
1949 func (p *parser) isCaptureName(capname string) bool {
1950 if p.capnames == nil {
1951 return false
1952 }
1953
1954 _, ok := p.capnames[capname]
1955 return ok
1956 }
1957
1958
1959
1960
// useOptionN reports whether the ExplicitCapture flag is set in the options
// currently in effect.
func (p *parser) useOptionN() bool {
	return (p.options & ExplicitCapture) != 0
}
1964
1965
// useOptionI reports whether the IgnoreCase flag is set in the options
// currently in effect.
func (p *parser) useOptionI() bool {
	return (p.options & IgnoreCase) != 0
}
1969
1970
// useOptionM reports whether the Multiline flag is set in the options
// currently in effect.
func (p *parser) useOptionM() bool {
	return (p.options & Multiline) != 0
}
1974
1975
// useOptionS reports whether the Singleline flag is set in the options
// currently in effect.
func (p *parser) useOptionS() bool {
	return (p.options & Singleline) != 0
}
1979
1980
// useOptionX reports whether the IgnorePatternWhitespace flag is set in the
// options currently in effect.
func (p *parser) useOptionX() bool {
	return (p.options & IgnorePatternWhitespace) != 0
}
1984
1985
// useOptionE reports whether the ECMAScript flag is set in the options
// currently in effect.
func (p *parser) useOptionE() bool {
	return (p.options & ECMAScript) != 0
}
1989
1990
// useRE2 reports whether the RE2 flag is set in the options currently in
// effect.
func (p *parser) useRE2() bool {
	return (p.options & RE2) != 0
}
1994
1995
// useOptionU reports whether the Unicode flag is set in the options
// currently in effect.
func (p *parser) useOptionU() bool {
	return (p.options & Unicode) != 0
}
1999
2000
// emptyOptionsStack reports whether the saved-options stack holds no
// entries. popOptions and popKeepOptions must not be called when it is empty.
func (p *parser) emptyOptionsStack() bool {
	return len(p.optionsStack) == 0
}
2004
2005
// addConcatenate appends the current unit to the concatenation being built
// and then clears the unit.
func (p *parser) addConcatenate() {

	p.concatenation.addChild(p.unit)
	p.unit = nil
}
2011
2012
2013 func (p *parser) addConcatenate3(lazy bool, min, max int) {
2014 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2015 p.unit = nil
2016 }
2017
2018
2019 func (p *parser) addUnitOne(ch rune) {
2020 if p.useOptionI() {
2021 ch = unicode.ToLower(ch)
2022 }
2023
2024 p.unit = newRegexNodeCh(ntOne, p.options, ch)
2025 }
2026
2027
2028 func (p *parser) addUnitNotone(ch rune) {
2029 if p.useOptionI() {
2030 ch = unicode.ToLower(ch)
2031 }
2032
2033 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2034 }
2035
2036
// addUnitSet makes the current unit an ntSet node holding set, created with
// the options currently in effect.
func (p *parser) addUnitSet(set *CharSet) {
	p.unit = newRegexNodeSet(ntSet, p.options, set)
}
2040
2041
// addUnitNode makes node the current unit, replacing whatever was there.
func (p *parser) addUnitNode(node *regexNode) {
	p.unit = node
}
2045
2046
// addUnitType makes the current unit a new node of type t, created with the
// options currently in effect.
func (p *parser) addUnitType(t nodeType) {
	p.unit = newRegexNode(t, p.options)
}
2050
2051
2052 func (p *parser) addGroup() error {
2053 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2054 p.group.addChild(p.concatenation.reverseLeft())
2055 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2056 return p.getErr(ErrTooManyAlternates)
2057 }
2058 } else {
2059 p.alternation.addChild(p.concatenation.reverseLeft())
2060 p.group.addChild(p.alternation)
2061 }
2062
2063 p.unit = p.group
2064 return nil
2065 }
2066
2067
2068 func (p *parser) popKeepOptions() {
2069 lastIdx := len(p.optionsStack) - 1
2070 p.optionsStack = p.optionsStack[:lastIdx]
2071 }
2072
2073
2074 func (p *parser) popOptions() {
2075 lastIdx := len(p.optionsStack) - 1
2076
2077 p.options = p.optionsStack[lastIdx]
2078 p.optionsStack = p.optionsStack[:lastIdx]
2079 }
2080
2081
// pushOptions saves the options currently in effect on top of the
// saved-options stack.
func (p *parser) pushOptions() {
	p.optionsStack = append(p.optionsStack, p.options)
}
2085
2086
2087 func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2088 var node *regexNode
2089
2090 if cch == 0 {
2091 return
2092 }
2093
2094 if cch > 1 {
2095 str := make([]rune, cch)
2096 copy(str, p.pattern[pos:pos+cch])
2097
2098 if p.useOptionI() && !isReplacement {
2099
2100
2101
2102
2103 for i := 0; i < len(str); i++ {
2104 str[i] = unicode.ToLower(str[i])
2105 }
2106 }
2107
2108 node = newRegexNodeStr(ntMulti, p.options, str)
2109 } else {
2110 ch := p.charAt(pos)
2111
2112 if p.useOptionI() && !isReplacement {
2113 ch = unicode.ToLower(ch)
2114 }
2115
2116 node = newRegexNodeCh(ntOne, p.options, ch)
2117 }
2118
2119 p.concatenation.addChild(node)
2120 }
2121
2122
// pushGroup saves the group being built on the parser stack. The three
// current nodes are chained through their next fields as
// concatenation -> alternation -> group -> previous stack top, and the
// concatenation becomes the new stack top. popGroup undoes this.
func (p *parser) pushGroup() {
	p.group.next = p.stack
	p.alternation.next = p.group
	p.concatenation.next = p.alternation
	p.stack = p.concatenation
}
2129
2130
2131 func (p *parser) popGroup() error {
2132 p.concatenation = p.stack
2133 p.alternation = p.concatenation.next
2134 p.group = p.alternation.next
2135 p.stack = p.group.next
2136
2137
2138 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2139 if p.unit == nil {
2140 return p.getErr(ErrConditionalExpression)
2141 }
2142
2143 p.group.addChild(p.unit)
2144 p.unit = nil
2145 }
2146 return nil
2147 }
2148
2149
// emptyStack reports whether the parser stack of saved groups is empty.
func (p *parser) emptyStack() bool {
	return p.stack == nil
}
2153
2154
// startGroup begins building a new group: openGroup becomes the current
// group, and fresh, empty alternation and concatenation nodes are created
// with the options currently in effect.
func (p *parser) startGroup(openGroup *regexNode) {
	p.group = openGroup
	p.alternation = newRegexNode(ntAlternate, p.options)
	p.concatenation = newRegexNode(ntConcatenate, p.options)
}
2160
2161
2162 func (p *parser) addAlternate() {
2163
2164
2165 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2166 p.group.addChild(p.concatenation.reverseLeft())
2167 } else {
2168 p.alternation.addChild(p.concatenation.reverseLeft())
2169 }
2170
2171 p.concatenation = newRegexNode(ntConcatenate, p.options)
2172 }
2173
2174
2175
// Character categories stored in _category. The predicates below compare
// with >=, so the numeric order matters: a higher value is included in every
// test for a lower one.
//
// Only Q is declared with the explicit type byte; S, Z, X and E each have
// their own expression and are therefore untyped constants.
const (
	Q byte = 5 // quantifier characters: '*', '+', '?', '{'
	S      = 4 // other special characters: '$', '(', ')', '.', '[', '\\', '^', '|'
	Z      = 3 // assigned only to '#'; counted by isStopperX but not by isSpecial
	X      = 2 // whitespace: 0x09-0x0D and ' '
	E      = 1 // not assigned to any entry of _category in the table below
)
2183
// _category maps each ASCII code point (0-127) to one of the category
// constants Q, S, Z, X, or to 0 for characters with no special role. It has
// exactly 128 entries, 32 per row, so callers must range-check a rune before
// indexing with it.
var _category = []byte{
	// 0x00-0x1F: control characters; 0x09-0x0D (tab, LF, VT, FF, CR) are X.
	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// 0x20-0x3F: ' ' is X, '#' is Z, '$' '(' ')' '.' are S, '*' '+' '?' are Q.
	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
	// 0x40-0x5F: '[' '\' '^' are S.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
	// 0x60-0x7F: '{' is Q, '|' is S.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
}
2194
2195 func isSpace(ch rune) bool {
2196 return (ch <= ' ' && _category[ch] == X)
2197 }
2198
2199
2200 func isSpecial(ch rune) bool {
2201 return (ch <= '|' && _category[ch] >= S)
2202 }
2203
2204
2205 func isStopperX(ch rune) bool {
2206 return (ch <= '|' && _category[ch] >= X)
2207 }
2208
2209
2210 func isQuantifier(ch rune) bool {
2211 return (ch <= '{' && _category[ch] >= Q)
2212 }
2213
// isTrueQuantifier reports whether a quantifier really starts at the current
// position. Any category-Q character other than '{' qualifies on its own. A
// '{' qualifies only when it begins one of the forms {n}, {n,} or {n,m},
// where n and m are runs of ASCII decimal digits and n is not empty.
//
// The parser position is not changed; the look-ahead uses a local index.
func (p *parser) isTrueQuantifier() bool {
	nChars := p.charsRight()
	if nChars == 0 {
		return false
	}

	startpos := p.textpos()
	ch := p.charAt(startpos)
	if ch != '{' {
		return ch <= '{' && _category[ch] >= Q
	}

	// Skip the digits of n. nChars counts the characters not yet examined and
	// reaching zero means the pattern ended; otherwise ch is left holding the
	// first non-digit after the '{'.
	pos := startpos
	for {
		nChars--
		if nChars <= 0 {
			break
		}
		pos++
		ch = p.charAt(pos)
		if ch < '0' || ch > '9' {
			break
		}
	}

	// Reject a pattern that ended inside the braces, and reject an empty n
	// (the character right after '{' was not a digit).
	if nChars == 0 || pos-startpos == 1 {
		return false
	}
	if ch == '}' {
		return true
	}
	if ch != ',' {
		return false
	}
	// Skip the (possibly empty) digits of m after the comma.
	for {
		nChars--
		if nChars <= 0 {
			break
		}
		pos++
		ch = p.charAt(pos)
		if ch < '0' || ch > '9' {
			break
		}
	}

	// Must stop on a closing brace rather than on the end of the pattern.
	return nChars > 0 && ch == '}'
}
2263
View as plain text