1 package syntax
2
3 import (
4 "bytes"
5 "fmt"
6 "strconv"
7 "unicode"
8 "unicode/utf8"
9 )
10
11 type Prefix struct {
12 PrefixStr []rune
13 PrefixSet CharSet
14 CaseInsensitive bool
15 }
16
17
18 func getFirstCharsPrefix(tree *RegexTree) *Prefix {
19 s := regexFcd{
20 fcStack: make([]regexFc, 32),
21 intStack: make([]int, 32),
22 }
23 fc := s.regexFCFromRegexTree(tree)
24
25 if fc == nil || fc.nullable || fc.cc.IsEmpty() {
26 return nil
27 }
28 fcSet := fc.getFirstChars()
29 return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
30 }
31
32 type regexFcd struct {
33 intStack []int
34 intDepth int
35 fcStack []regexFc
36 fcDepth int
37 skipAllChildren bool
38 skipchild bool
39 failed bool
40 }
41
42
47 func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
48 curNode := tree.root
49 curChild := 0
50
51 for {
52 if len(curNode.children) == 0 {
53
54 s.calculateFC(curNode.t, curNode, 0)
55 } else if curChild < len(curNode.children) && !s.skipAllChildren {
56
57 s.calculateFC(curNode.t|beforeChild, curNode, curChild)
58
59 if !s.skipchild {
60 curNode = curNode.children[curChild]
61
62 s.pushInt(curChild)
63 curChild = 0
64 } else {
65 curChild++
66 s.skipchild = false
67 }
68 continue
69 }
70
71
72
73 s.skipAllChildren = false
74
75 if s.intIsEmpty() {
76 break
77 }
78
79 curChild = s.popInt()
80 curNode = curNode.next
81
82 s.calculateFC(curNode.t|afterChild, curNode, curChild)
83 if s.failed {
84 return nil
85 }
86
87 curChild++
88 }
89
90 if s.fcIsEmpty() {
91 return nil
92 }
93
94 return s.popFC()
95 }
96
97
98
99 func (s *regexFcd) pushInt(I int) {
100 if s.intDepth >= len(s.intStack) {
101 expanded := make([]int, s.intDepth*2)
102 copy(expanded, s.intStack)
103 s.intStack = expanded
104 }
105
106 s.intStack[s.intDepth] = I
107 s.intDepth++
108 }
109
110
111 func (s *regexFcd) intIsEmpty() bool {
112 return s.intDepth == 0
113 }
114
115
116 func (s *regexFcd) popInt() int {
117 s.intDepth--
118 return s.intStack[s.intDepth]
119 }
120
121
122
123 func (s *regexFcd) pushFC(fc regexFc) {
124 if s.fcDepth >= len(s.fcStack) {
125 expanded := make([]regexFc, s.fcDepth*2)
126 copy(expanded, s.fcStack)
127 s.fcStack = expanded
128 }
129
130 s.fcStack[s.fcDepth] = fc
131 s.fcDepth++
132 }
133
134
135 func (s *regexFcd) fcIsEmpty() bool {
136 return s.fcDepth == 0
137 }
138
139
140 func (s *regexFcd) popFC() *regexFc {
141 s.fcDepth--
142 return &s.fcStack[s.fcDepth]
143 }
144
145
146 func (s *regexFcd) topFC() *regexFc {
147 return &s.fcStack[s.fcDepth-1]
148 }
149
150
151 func (s *regexFcd) skipChild() {
152 s.skipchild = true
153 }
154
155
156 func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
157
158 ci := false
159 rtl := false
160
161 if nt <= ntRef {
162 if (node.options & IgnoreCase) != 0 {
163 ci = true
164 }
165 if (node.options & RightToLeft) != 0 {
166 rtl = true
167 }
168 }
169
170 switch nt {
171 case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
172 break
173
174 case ntTestgroup | beforeChild:
175 if CurIndex == 0 {
176 s.skipChild()
177 }
178 break
179
180 case ntEmpty:
181 s.pushFC(regexFc{nullable: true})
182 break
183
184 case ntConcatenate | afterChild:
185 if CurIndex != 0 {
186 child := s.popFC()
187 cumul := s.topFC()
188
189 s.failed = !cumul.addFC(*child, true)
190 }
191
192 fc := s.topFC()
193 if !fc.nullable {
194 s.skipAllChildren = true
195 }
196 break
197
198 case ntTestgroup | afterChild:
199 if CurIndex > 1 {
200 child := s.popFC()
201 cumul := s.topFC()
202
203 s.failed = !cumul.addFC(*child, false)
204 }
205 break
206
207 case ntAlternate | afterChild, ntTestref | afterChild:
208 if CurIndex != 0 {
209 child := s.popFC()
210 cumul := s.topFC()
211
212 s.failed = !cumul.addFC(*child, false)
213 }
214 break
215
216 case ntLoop | afterChild, ntLazyloop | afterChild:
217 if node.m == 0 {
218 fc := s.topFC()
219 fc.nullable = true
220 }
221 break
222
223 case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
224 break
225
226 case ntRequire | beforeChild, ntPrevent | beforeChild:
227 s.skipChild()
228 s.pushFC(regexFc{nullable: true})
229 break
230
231 case ntRequire | afterChild, ntPrevent | afterChild:
232 break
233
234 case ntOne, ntNotone:
235 s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
236 break
237
238 case ntOneloop, ntOnelazy:
239 s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
240 break
241
242 case ntNotoneloop, ntNotonelazy:
243 s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
244 break
245
246 case ntMulti:
247 if len(node.str) == 0 {
248 s.pushFC(regexFc{nullable: true})
249 } else if !rtl {
250 s.pushFC(newRegexFc(node.str[0], false, false, ci))
251 } else {
252 s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
253 }
254 break
255
256 case ntSet:
257 s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
258 break
259
260 case ntSetloop, ntSetlazy:
261 s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
262 break
263
264 case ntRef:
265 s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
266 break
267
268 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
269 s.pushFC(regexFc{nullable: true})
270 break
271
272 default:
273 panic(fmt.Sprintf("unexpected op code: %v", nt))
274 }
275 }
276
277 type regexFc struct {
278 cc CharSet
279 nullable bool
280 caseInsensitive bool
281 }
282
283 func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
284 r := regexFc{
285 caseInsensitive: caseInsensitive,
286 nullable: nullable,
287 }
288 if not {
289 if ch > 0 {
290 r.cc.addRange('\x00', ch-1)
291 }
292 if ch < 0xFFFF {
293 r.cc.addRange(ch+1, utf8.MaxRune)
294 }
295 } else {
296 r.cc.addRange(ch, ch)
297 }
298 return r
299 }
300
301 func (r *regexFc) getFirstChars() CharSet {
302 if r.caseInsensitive {
303 r.cc.addLowercase()
304 }
305
306 return r.cc
307 }
308
309 func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
310 if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
311 return false
312 }
313
314 if concatenate {
315 if !r.nullable {
316 return true
317 }
318
319 if !fc.nullable {
320 r.nullable = false
321 }
322 } else {
323 if fc.nullable {
324 r.nullable = true
325 }
326 }
327
328 r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
329 r.cc.addSet(fc.cc)
330
331 return true
332 }
333
334
335
336 func getPrefix(tree *RegexTree) *Prefix {
337 var concatNode *regexNode
338 nextChild := 0
339
340 curNode := tree.root
341
342 for {
343 switch curNode.t {
344 case ntConcatenate:
345 if len(curNode.children) > 0 {
346 concatNode = curNode
347 nextChild = 0
348 }
349
350 case ntGreedy, ntCapture:
351 curNode = curNode.children[0]
352 concatNode = nil
353 continue
354
355 case ntOneloop, ntOnelazy:
356 if curNode.m > 0 {
357 return &Prefix{
358 PrefixStr: repeat(curNode.ch, curNode.m),
359 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
360 }
361 }
362 return nil
363
364 case ntOne:
365 return &Prefix{
366 PrefixStr: []rune{curNode.ch},
367 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
368 }
369
370 case ntMulti:
371 return &Prefix{
372 PrefixStr: curNode.str,
373 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
374 }
375
376 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
377 ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
378
379 default:
380 return nil
381 }
382
383 if concatNode == nil || nextChild >= len(concatNode.children) {
384 return nil
385 }
386
387 curNode = concatNode.children[nextChild]
388 nextChild++
389 }
390 }
391
392
393 func repeat(r rune, c int) []rune {
394 if c > MaxPrefixSize {
395 c = MaxPrefixSize
396 }
397
398 ret := make([]rune, c)
399
400
401 ret[0] = r
402 bp := 1
403 for bp < len(ret) {
404 copy(ret[bp:], ret[:bp])
405 bp *= 2
406 }
407
408 return ret
409 }
410
411
412
413
414
415
416
417
418
419 type BmPrefix struct {
420 positive []int
421 negativeASCII []int
422 negativeUnicode [][]int
423 pattern []rune
424 lowASCII rune
425 highASCII rune
426 rightToLeft bool
427 caseInsensitive bool
428 }
429
430 func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
431
432 b := &BmPrefix{
433 rightToLeft: rightToLeft,
434 caseInsensitive: caseInsensitive,
435 pattern: pattern,
436 }
437
438 if caseInsensitive {
439 for i := 0; i < len(b.pattern); i++ {
440
441
442
443
444
445 b.pattern[i] = unicode.ToLower(b.pattern[i])
446 }
447 }
448
449 var beforefirst, last, bump int
450 var scan, match int
451
452 if !rightToLeft {
453 beforefirst = -1
454 last = len(b.pattern) - 1
455 bump = 1
456 } else {
457 beforefirst = len(b.pattern)
458 last = 0
459 bump = -1
460 }
461
462
463
464
465
466
467
468
469
470
471 b.positive = make([]int, len(b.pattern))
472
473 examine := last
474 ch := b.pattern[examine]
475 b.positive[examine] = bump
476 examine -= bump
477
478 Outerloop:
479 for {
480
481
482 for {
483 if examine == beforefirst {
484 break Outerloop
485 }
486 if b.pattern[examine] == ch {
487 break
488 }
489 examine -= bump
490 }
491
492 match = last
493 scan = examine
494
495
496 for {
497 if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
498
499
500
501 if b.positive[match] == 0 {
502 b.positive[match] = match - scan
503 }
504
505
506
507 break
508 }
509
510 scan -= bump
511 match -= bump
512 }
513
514 examine -= bump
515 }
516
517 match = last - bump
518
519
520
521
522
523
524
525 for match != beforefirst {
526 if b.positive[match] == 0 {
527 b.positive[match] = bump
528 }
529
530 match -= bump
531 }
532
533
534
535
536
537
538
539
540
541
542
543
544
545 b.negativeASCII = make([]int, 128)
546
547 for i := 0; i < len(b.negativeASCII); i++ {
548 b.negativeASCII[i] = last - beforefirst
549 }
550
551 b.lowASCII = 127
552 b.highASCII = 0
553
554 for examine = last; examine != beforefirst; examine -= bump {
555 ch = b.pattern[examine]
556
557 switch {
558 case ch < 128:
559 if b.lowASCII > ch {
560 b.lowASCII = ch
561 }
562
563 if b.highASCII < ch {
564 b.highASCII = ch
565 }
566
567 if b.negativeASCII[ch] == last-beforefirst {
568 b.negativeASCII[ch] = last - examine
569 }
570 case ch <= 0xffff:
571 i, j := ch>>8, ch&0xFF
572
573 if b.negativeUnicode == nil {
574 b.negativeUnicode = make([][]int, 256)
575 }
576
577 if b.negativeUnicode[i] == nil {
578 newarray := make([]int, 256)
579
580 for k := 0; k < len(newarray); k++ {
581 newarray[k] = last - beforefirst
582 }
583
584 if i == 0 {
585 copy(newarray, b.negativeASCII)
586
587 b.negativeASCII = newarray
588 }
589
590 b.negativeUnicode[i] = newarray
591 }
592
593 if b.negativeUnicode[i][j] == last-beforefirst {
594 b.negativeUnicode[i][j] = last - examine
595 }
596 default:
597
598
599 return nil
600 }
601 }
602
603 return b
604 }
605
606 func (b *BmPrefix) String() string {
607 return string(b.pattern)
608 }
609
610
611 func (b *BmPrefix) Dump(indent string) string {
612 buf := &bytes.Buffer{}
613
614 fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
615 for i := 0; i < len(b.positive); i++ {
616 buf.WriteString(strconv.Itoa(b.positive[i]))
617 buf.WriteRune(' ')
618 }
619 buf.WriteRune('\n')
620
621 if b.negativeASCII != nil {
622 buf.WriteString(indent)
623 buf.WriteString("Negative table\n")
624 for i := 0; i < len(b.negativeASCII); i++ {
625 if b.negativeASCII[i] != len(b.pattern) {
626 fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
627 }
628 }
629 }
630
631 return buf.String()
632 }
633
634
635
636
637
638
639
640 func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
641 var (
642 defadv, test, test2 int
643 match, startmatch, endmatch int
644 bump, advance int
645 chTest rune
646 unicodeLookup []int
647 )
648
649 if !b.rightToLeft {
650 defadv = len(b.pattern)
651 startmatch = len(b.pattern) - 1
652 endmatch = 0
653 test = index + defadv - 1
654 bump = 1
655 } else {
656 defadv = -len(b.pattern)
657 startmatch = 0
658 endmatch = -defadv - 1
659 test = index + defadv
660 bump = -1
661 }
662
663 chMatch := b.pattern[startmatch]
664
665 for {
666 if test >= endlimit || test < beglimit {
667 return -1
668 }
669
670 chTest = text[test]
671
672 if b.caseInsensitive {
673 chTest = unicode.ToLower(chTest)
674 }
675
676 if chTest != chMatch {
677 if chTest < 128 {
678 advance = b.negativeASCII[chTest]
679 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
680 unicodeLookup = b.negativeUnicode[chTest>>8]
681 if len(unicodeLookup) > 0 {
682 advance = unicodeLookup[chTest&0xFF]
683 } else {
684 advance = defadv
685 }
686 } else {
687 advance = defadv
688 }
689
690 test += advance
691 } else {
692 test2 = test
693 match = startmatch
694
695 for {
696 if match == endmatch {
697 if b.rightToLeft {
698 return test2 + 1
699 } else {
700 return test2
701 }
702 }
703
704 match -= bump
705 test2 -= bump
706
707 chTest = text[test2]
708
709 if b.caseInsensitive {
710 chTest = unicode.ToLower(chTest)
711 }
712
713 if chTest != b.pattern[match] {
714 advance = b.positive[match]
715 if chTest < 128 {
716 test2 = (match - startmatch) + b.negativeASCII[chTest]
717 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
718 unicodeLookup = b.negativeUnicode[chTest>>8]
719 if len(unicodeLookup) > 0 {
720 test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
721 } else {
722 test += advance
723 break
724 }
725 } else {
726 test += advance
727 break
728 }
729
730 if b.rightToLeft {
731 if test2 < advance {
732 advance = test2
733 }
734 } else if test2 > advance {
735 advance = test2
736 }
737
738 test += advance
739 break
740 }
741 }
742 }
743 }
744 }
745
746
747 func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
748 if !b.rightToLeft {
749 if index < beglimit || endlimit-index < len(b.pattern) {
750 return false
751 }
752
753 return b.matchPattern(text, index)
754 } else {
755 if index > endlimit || index-beglimit < len(b.pattern) {
756 return false
757 }
758
759 return b.matchPattern(text, index-len(b.pattern))
760 }
761 }
762
763 func (b *BmPrefix) matchPattern(text []rune, index int) bool {
764 if len(text)-index < len(b.pattern) {
765 return false
766 }
767
768 if b.caseInsensitive {
769 for i := 0; i < len(b.pattern); i++ {
770
771 if unicode.ToLower(text[index+i]) != b.pattern[i] {
772 return false
773 }
774 }
775 return true
776 } else {
777 for i := 0; i < len(b.pattern); i++ {
778 if text[index+i] != b.pattern[i] {
779 return false
780 }
781 }
782 return true
783 }
784 }
785
786 type AnchorLoc int16
787
788
789 const (
790 AnchorBeginning AnchorLoc = 0x0001
791 AnchorBol = 0x0002
792 AnchorStart = 0x0004
793 AnchorEol = 0x0008
794 AnchorEndZ = 0x0010
795 AnchorEnd = 0x0020
796 AnchorBoundary = 0x0040
797 AnchorECMABoundary = 0x0080
798 )
799
800 func getAnchors(tree *RegexTree) AnchorLoc {
801
802 var concatNode *regexNode
803 nextChild, result := 0, AnchorLoc(0)
804
805 curNode := tree.root
806
807 for {
808 switch curNode.t {
809 case ntConcatenate:
810 if len(curNode.children) > 0 {
811 concatNode = curNode
812 nextChild = 0
813 }
814
815 case ntGreedy, ntCapture:
816 curNode = curNode.children[0]
817 concatNode = nil
818 continue
819
820 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
821 ntStart, ntEndZ, ntEnd:
822 return result | anchorFromType(curNode.t)
823
824 case ntEmpty, ntRequire, ntPrevent:
825
826 default:
827 return result
828 }
829
830 if concatNode == nil || nextChild >= len(concatNode.children) {
831 return result
832 }
833
834 curNode = concatNode.children[nextChild]
835 nextChild++
836 }
837 }
838
839 func anchorFromType(t nodeType) AnchorLoc {
840 switch t {
841 case ntBol:
842 return AnchorBol
843 case ntEol:
844 return AnchorEol
845 case ntBoundary:
846 return AnchorBoundary
847 case ntECMABoundary:
848 return AnchorECMABoundary
849 case ntBeginning:
850 return AnchorBeginning
851 case ntStart:
852 return AnchorStart
853 case ntEndZ:
854 return AnchorEndZ
855 case ntEnd:
856 return AnchorEnd
857 default:
858 return 0
859 }
860 }
861
862
863 func (anchors AnchorLoc) String() string {
864 buf := &bytes.Buffer{}
865
866 if 0 != (anchors & AnchorBeginning) {
867 buf.WriteString(", Beginning")
868 }
869 if 0 != (anchors & AnchorStart) {
870 buf.WriteString(", Start")
871 }
872 if 0 != (anchors & AnchorBol) {
873 buf.WriteString(", Bol")
874 }
875 if 0 != (anchors & AnchorBoundary) {
876 buf.WriteString(", Boundary")
877 }
878 if 0 != (anchors & AnchorECMABoundary) {
879 buf.WriteString(", ECMABoundary")
880 }
881 if 0 != (anchors & AnchorEol) {
882 buf.WriteString(", Eol")
883 }
884 if 0 != (anchors & AnchorEnd) {
885 buf.WriteString(", End")
886 }
887 if 0 != (anchors & AnchorEndZ) {
888 buf.WriteString(", EndZ")
889 }
890
891
892 if buf.Len() >= 2 {
893 return buf.String()[2:]
894 }
895 return "None"
896 }
897
View as plain text