1 package syntax
2
3 import (
4 "bytes"
5 "fmt"
6 "math"
7 "strconv"
8 )
9
10 type RegexTree struct {
11 root *regexNode
12 caps map[int]int
13 capnumlist []int
14 captop int
15 Capnames map[string]int
16 Caplist []string
17 options RegexOptions
18 }
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 type regexNode struct {
56 t nodeType
57 children []*regexNode
58 str []rune
59 set *CharSet
60 ch rune
61 m int
62 n int
63 options RegexOptions
64 next *regexNode
65 }
66
67 type nodeType int32
68
69 const (
70
71
72 ntOnerep nodeType = 0
73 ntNotonerep = 1
74 ntSetrep = 2
75 ntOneloop = 3
76 ntNotoneloop = 4
77 ntSetloop = 5
78 ntOnelazy = 6
79 ntNotonelazy = 7
80 ntSetlazy = 8
81 ntOne = 9
82 ntNotone = 10
83 ntSet = 11
84 ntMulti = 12
85 ntRef = 13
86 ntBol = 14
87 ntEol = 15
88 ntBoundary = 16
89 ntNonboundary = 17
90 ntBeginning = 18
91 ntStart = 19
92 ntEndZ = 20
93 ntEnd = 21
94
95
96
97
98
99
100 ntNothing = 22
101 ntEmpty = 23
102 ntAlternate = 24
103 ntConcatenate = 25
104 ntLoop = 26
105 ntLazyloop = 27
106 ntCapture = 28
107 ntGroup = 29
108 ntRequire = 30
109 ntPrevent = 31
110 ntGreedy = 32
111 ntTestref = 33
112 ntTestgroup = 34
113
114 ntECMABoundary = 41
115 ntNonECMABoundary = 42
116 )
117
118 func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
119 return ®exNode{
120 t: t,
121 options: opt,
122 }
123 }
124
125 func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
126 return ®exNode{
127 t: t,
128 options: opt,
129 ch: ch,
130 }
131 }
132
133 func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
134 return ®exNode{
135 t: t,
136 options: opt,
137 str: str,
138 }
139 }
140
141 func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
142 return ®exNode{
143 t: t,
144 options: opt,
145 set: set,
146 }
147 }
148
149 func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
150 return ®exNode{
151 t: t,
152 options: opt,
153 m: m,
154 }
155 }
156 func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
157 return ®exNode{
158 t: t,
159 options: opt,
160 m: m,
161 n: n,
162 }
163 }
164
165 func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
166 for i := 0; i < len(n.str); i++ {
167 buf.WriteRune(n.str[i])
168 }
169 }
170
171 func (n *regexNode) addChild(child *regexNode) {
172 reduced := child.reduce()
173 n.children = append(n.children, reduced)
174 reduced.next = n
175 }
176
177 func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
178 newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
179 n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
180 }
181
182
183 func (n *regexNode) removeChildren(startIndex, endIndex int) {
184 n.children = append(n.children[:startIndex], n.children[endIndex:]...)
185 }
186
187
188 func (n *regexNode) makeRep(t nodeType, min, max int) {
189 n.t += (t - ntOne)
190 n.m = min
191 n.n = max
192 }
193
194 func (n *regexNode) reduce() *regexNode {
195 switch n.t {
196 case ntAlternate:
197 return n.reduceAlternation()
198
199 case ntConcatenate:
200 return n.reduceConcatenation()
201
202 case ntLoop, ntLazyloop:
203 return n.reduceRep()
204
205 case ntGroup:
206 return n.reduceGroup()
207
208 case ntSet, ntSetloop:
209 return n.reduceSet()
210
211 default:
212 return n
213 }
214 }
215
216
217
218
219
220
221
222 func (n *regexNode) reduceAlternation() *regexNode {
223 if len(n.children) == 0 {
224 return newRegexNode(ntNothing, n.options)
225 }
226
227 wasLastSet := false
228 lastNodeCannotMerge := false
229 var optionsLast RegexOptions
230 var i, j int
231
232 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
233 at := n.children[i]
234
235 if j < i {
236 n.children[j] = at
237 }
238
239 for {
240 if at.t == ntAlternate {
241 for k := 0; k < len(at.children); k++ {
242 at.children[k].next = n
243 }
244 n.insertChildren(i+1, at.children)
245
246 j--
247 } else if at.t == ntSet || at.t == ntOne {
248
249 optionsAt := at.options & (RightToLeft | IgnoreCase)
250
251 if at.t == ntSet {
252 if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
253 wasLastSet = true
254 lastNodeCannotMerge = !at.set.IsMergeable()
255 optionsLast = optionsAt
256 break
257 }
258 } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
259 wasLastSet = true
260 lastNodeCannotMerge = false
261 optionsLast = optionsAt
262 break
263 }
264
265
266
267 j--
268 prev := n.children[j]
269
270 var prevCharClass *CharSet
271 if prev.t == ntOne {
272 prevCharClass = &CharSet{}
273 prevCharClass.addChar(prev.ch)
274 } else {
275 prevCharClass = prev.set
276 }
277
278 if at.t == ntOne {
279 prevCharClass.addChar(at.ch)
280 } else {
281 prevCharClass.addSet(*at.set)
282 }
283
284 prev.t = ntSet
285 prev.set = prevCharClass
286 } else if at.t == ntNothing {
287 j--
288 } else {
289 wasLastSet = false
290 lastNodeCannotMerge = false
291 }
292 break
293 }
294 }
295
296 if j < i {
297 n.removeChildren(j, i)
298 }
299
300 return n.stripEnation(ntNothing)
301 }
302
303
304
305
306 func (n *regexNode) reduceConcatenation() *regexNode {
307
308
309 var optionsLast RegexOptions
310 var optionsAt RegexOptions
311 var i, j int
312
313 if len(n.children) == 0 {
314 return newRegexNode(ntEmpty, n.options)
315 }
316
317 wasLastString := false
318
319 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
320 var at, prev *regexNode
321
322 at = n.children[i]
323
324 if j < i {
325 n.children[j] = at
326 }
327
328 if at.t == ntConcatenate &&
329 ((at.options & RightToLeft) == (n.options & RightToLeft)) {
330 for k := 0; k < len(at.children); k++ {
331 at.children[k].next = n
332 }
333
334
335 n.insertChildren(i+1, at.children)
336
337 j--
338 } else if at.t == ntMulti || at.t == ntOne {
339
340 optionsAt = at.options & (RightToLeft | IgnoreCase)
341
342 if !wasLastString || optionsLast != optionsAt {
343 wasLastString = true
344 optionsLast = optionsAt
345 continue
346 }
347
348 j--
349 prev = n.children[j]
350
351 if prev.t == ntOne {
352 prev.t = ntMulti
353 prev.str = []rune{prev.ch}
354 }
355
356 if (optionsAt & RightToLeft) == 0 {
357 if at.t == ntOne {
358 prev.str = append(prev.str, at.ch)
359 } else {
360 prev.str = append(prev.str, at.str...)
361 }
362 } else {
363 if at.t == ntOne {
364
365 prev.str = append(prev.str, 0)
366 copy(prev.str[1:], prev.str)
367 prev.str[0] = at.ch
368 } else {
369
370 merge := make([]rune, len(prev.str)+len(at.str))
371 copy(merge, at.str)
372 copy(merge[len(at.str):], prev.str)
373 prev.str = merge
374 }
375 }
376 } else if at.t == ntEmpty {
377 j--
378 } else {
379 wasLastString = false
380 }
381 }
382
383 if j < i {
384
385 n.removeChildren(j, i)
386 }
387
388 return n.stripEnation(ntEmpty)
389 }
390
391
392
393 func (n *regexNode) reduceRep() *regexNode {
394
395 u := n
396 t := n.t
397 min := n.m
398 max := n.n
399
400 for {
401 if len(u.children) == 0 {
402 break
403 }
404
405 child := u.children[0]
406
407
408 if child.t != t {
409 childType := child.t
410
411 if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
412 childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
413 break
414 }
415 }
416
417
418
419 if u.m == 0 && child.m > 1 || child.n < child.m*2 {
420 break
421 }
422
423 u = child
424 if u.m > 0 {
425 if (math.MaxInt32-1)/u.m < min {
426 u.m = math.MaxInt32
427 } else {
428 u.m = u.m * min
429 }
430 }
431 if u.n > 0 {
432 if (math.MaxInt32-1)/u.n < max {
433 u.n = math.MaxInt32
434 } else {
435 u.n = u.n * max
436 }
437 }
438 }
439
440 if math.MaxInt32 == min {
441 return newRegexNode(ntNothing, n.options)
442 }
443 return u
444
445 }
446
447
448
449
450 func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
451 switch len(n.children) {
452 case 0:
453 return newRegexNode(emptyType, n.options)
454 case 1:
455 return n.children[0]
456 default:
457 return n
458 }
459 }
460
461 func (n *regexNode) reduceGroup() *regexNode {
462 u := n
463
464 for u.t == ntGroup {
465 u = u.children[0]
466 }
467
468 return u
469 }
470
471
472
473 func (n *regexNode) reduceSet() *regexNode {
474
475
476 if n.set == nil {
477 n.t = ntNothing
478 } else if n.set.IsSingleton() {
479 n.ch = n.set.SingletonChar()
480 n.set = nil
481 n.t += (ntOne - ntSet)
482 } else if n.set.IsSingletonInverse() {
483 n.ch = n.set.SingletonChar()
484 n.set = nil
485 n.t += (ntNotone - ntSet)
486 }
487
488 return n
489 }
490
491 func (n *regexNode) reverseLeft() *regexNode {
492 if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
493
494 for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
495 n.children[left], n.children[right] = n.children[right], n.children[left]
496 }
497 }
498
499 return n
500 }
501
502 func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
503 if min == 0 && max == 0 {
504 return newRegexNode(ntEmpty, n.options)
505 }
506
507 if min == 1 && max == 1 {
508 return n
509 }
510
511 switch n.t {
512 case ntOne, ntNotone, ntSet:
513 if lazy {
514 n.makeRep(Onelazy, min, max)
515 } else {
516 n.makeRep(Oneloop, min, max)
517 }
518 return n
519
520 default:
521 var t nodeType
522 if lazy {
523 t = ntLazyloop
524 } else {
525 t = ntLoop
526 }
527 result := newRegexNodeMN(t, n.options, min, max)
528 result.addChild(n)
529 return result
530 }
531 }
532
533
534
535 var typeStr = []string{
536 "Onerep", "Notonerep", "Setrep",
537 "Oneloop", "Notoneloop", "Setloop",
538 "Onelazy", "Notonelazy", "Setlazy",
539 "One", "Notone", "Set",
540 "Multi", "Ref",
541 "Bol", "Eol", "Boundary", "Nonboundary",
542 "Beginning", "Start", "EndZ", "End",
543 "Nothing", "Empty",
544 "Alternate", "Concatenate",
545 "Loop", "Lazyloop",
546 "Capture", "Group", "Require", "Prevent", "Greedy",
547 "Testref", "Testgroup",
548 "Unknown", "Unknown", "Unknown",
549 "Unknown", "Unknown", "Unknown",
550 "ECMABoundary", "NonECMABoundary",
551 }
552
553 func (n *regexNode) description() string {
554 buf := &bytes.Buffer{}
555
556 buf.WriteString(typeStr[n.t])
557
558 if (n.options & ExplicitCapture) != 0 {
559 buf.WriteString("-C")
560 }
561 if (n.options & IgnoreCase) != 0 {
562 buf.WriteString("-I")
563 }
564 if (n.options & RightToLeft) != 0 {
565 buf.WriteString("-L")
566 }
567 if (n.options & Multiline) != 0 {
568 buf.WriteString("-M")
569 }
570 if (n.options & Singleline) != 0 {
571 buf.WriteString("-S")
572 }
573 if (n.options & IgnorePatternWhitespace) != 0 {
574 buf.WriteString("-X")
575 }
576 if (n.options & ECMAScript) != 0 {
577 buf.WriteString("-E")
578 }
579
580 switch n.t {
581 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
582 buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
583 break
584 case ntCapture:
585 buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
586 break
587 case ntRef, ntTestref:
588 buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
589 break
590 case ntMulti:
591 fmt.Fprintf(buf, "(String = %s)", string(n.str))
592 break
593 case ntSet, ntSetloop, ntSetlazy:
594 buf.WriteString("(Set = " + n.set.String() + ")")
595 break
596 }
597
598 switch n.t {
599 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
600 buf.WriteString("(Min = ")
601 buf.WriteString(strconv.Itoa(n.m))
602 buf.WriteString(", Max = ")
603 if n.n == math.MaxInt32 {
604 buf.WriteString("inf")
605 } else {
606 buf.WriteString(strconv.Itoa(n.n))
607 }
608 buf.WriteString(")")
609
610 break
611 }
612
613 return buf.String()
614 }
615
616 var padSpace = []byte(" ")
617
618 func (t *RegexTree) Dump() string {
619 return t.root.dump()
620 }
621
622 func (n *regexNode) dump() string {
623 var stack []int
624 CurNode := n
625 CurChild := 0
626
627 buf := bytes.NewBufferString(CurNode.description())
628 buf.WriteRune('\n')
629
630 for {
631 if CurNode.children != nil && CurChild < len(CurNode.children) {
632 stack = append(stack, CurChild+1)
633 CurNode = CurNode.children[CurChild]
634 CurChild = 0
635
636 Depth := len(stack)
637 if Depth > 32 {
638 Depth = 32
639 }
640 buf.Write(padSpace[:Depth])
641 buf.WriteString(CurNode.description())
642 buf.WriteRune('\n')
643 } else {
644 if len(stack) == 0 {
645 break
646 }
647
648 CurChild = stack[len(stack)-1]
649 stack = stack[:len(stack)-1]
650 CurNode = CurNode.next
651 }
652 }
653 return buf.String()
654 }
655
View as plain text