1 package syntax
2
3 import (
4 "bytes"
5 "fmt"
6 "math"
7 "os"
8 )
9
10 func Write(tree *RegexTree) (*Code, error) {
11 w := writer{
12 intStack: make([]int, 0, 32),
13 emitted: make([]int, 2),
14 stringhash: make(map[string]int),
15 sethash: make(map[string]int),
16 }
17
18 code, err := w.codeFromTree(tree)
19
20 if tree.options&Debug > 0 && code != nil {
21 os.Stdout.WriteString(code.Dump())
22 os.Stdout.WriteString("\n")
23 }
24
25 return code, err
26 }
27
28 type writer struct {
29 emitted []int
30
31 intStack []int
32 curpos int
33 stringhash map[string]int
34 stringtable [][]rune
35 sethash map[string]int
36 settable []*CharSet
37 counting bool
38 count int
39 trackcount int
40 caps map[int]int
41 }
42
43 const (
44 beforeChild nodeType = 64
45 afterChild = 128
46
47 MaxPrefixSize = 50
48 )
49
50
51
52
53
54
55
56
57
58
59 func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
60 var (
61 curNode *regexNode
62 curChild int
63 capsize int
64 )
65
66
67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
68 capsize = tree.captop
69 w.caps = nil
70 } else {
71 capsize = len(tree.capnumlist)
72 w.caps = tree.caps
73 for i := 0; i < len(tree.capnumlist); i++ {
74 w.caps[tree.capnumlist[i]] = i
75 }
76 }
77
78 w.counting = true
79
80 for {
81 if !w.counting {
82 w.emitted = make([]int, w.count)
83 }
84
85 curNode = tree.root
86 curChild = 0
87
88 w.emit1(Lazybranch, 0)
89
90 for {
91 if len(curNode.children) == 0 {
92 w.emitFragment(curNode.t, curNode, 0)
93 } else if curChild < len(curNode.children) {
94 w.emitFragment(curNode.t|beforeChild, curNode, curChild)
95
96 curNode = curNode.children[curChild]
97
98 w.pushInt(curChild)
99 curChild = 0
100 continue
101 }
102
103 if w.emptyStack() {
104 break
105 }
106
107 curChild = w.popInt()
108 curNode = curNode.next
109
110 w.emitFragment(curNode.t|afterChild, curNode, curChild)
111 curChild++
112 }
113
114 w.patchJump(0, w.curPos())
115 w.emit(Stop)
116
117 if !w.counting {
118 break
119 }
120
121 w.counting = false
122 }
123
124 fcPrefix := getFirstCharsPrefix(tree)
125 prefix := getPrefix(tree)
126 rtl := (tree.options & RightToLeft) != 0
127
128 var bmPrefix *BmPrefix
129
130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
131 if len(prefix.PrefixStr) > MaxPrefixSize {
132
133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
134 }
135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
136 } else {
137 bmPrefix = nil
138 }
139
140 return &Code{
141 Codes: w.emitted,
142 Strings: w.stringtable,
143 Sets: w.settable,
144 TrackCount: w.trackcount,
145 Caps: w.caps,
146 Capsize: capsize,
147 FcPrefix: fcPrefix,
148 BmPrefix: bmPrefix,
149 Anchors: getAnchors(tree),
150 RightToLeft: rtl,
151 }, nil
152 }
153
154
155
156
157 func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
158 bits := InstOp(0)
159
160 if nodetype <= ntRef {
161 if (node.options & RightToLeft) != 0 {
162 bits |= Rtl
163 }
164 if (node.options & IgnoreCase) != 0 {
165 bits |= Ci
166 }
167 }
168 ntBits := nodeType(bits)
169
170 switch nodetype {
171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
172 break
173
174 case ntAlternate | beforeChild:
175 if curIndex < len(node.children)-1 {
176 w.pushInt(w.curPos())
177 w.emit1(Lazybranch, 0)
178 }
179
180 case ntAlternate | afterChild:
181 if curIndex < len(node.children)-1 {
182 lbPos := w.popInt()
183 w.pushInt(w.curPos())
184 w.emit1(Goto, 0)
185 w.patchJump(lbPos, w.curPos())
186 } else {
187 for i := 0; i < curIndex; i++ {
188 w.patchJump(w.popInt(), w.curPos())
189 }
190 }
191 break
192
193 case ntTestref | beforeChild:
194 if curIndex == 0 {
195 w.emit(Setjump)
196 w.pushInt(w.curPos())
197 w.emit1(Lazybranch, 0)
198 w.emit1(Testref, w.mapCapnum(node.m))
199 w.emit(Forejump)
200 }
201
202 case ntTestref | afterChild:
203 if curIndex == 0 {
204 branchpos := w.popInt()
205 w.pushInt(w.curPos())
206 w.emit1(Goto, 0)
207 w.patchJump(branchpos, w.curPos())
208 w.emit(Forejump)
209 if len(node.children) <= 1 {
210 w.patchJump(w.popInt(), w.curPos())
211 }
212 } else if curIndex == 1 {
213 w.patchJump(w.popInt(), w.curPos())
214 }
215
216 case ntTestgroup | beforeChild:
217 if curIndex == 0 {
218 w.emit(Setjump)
219 w.emit(Setmark)
220 w.pushInt(w.curPos())
221 w.emit1(Lazybranch, 0)
222 }
223
224 case ntTestgroup | afterChild:
225 if curIndex == 0 {
226 w.emit(Getmark)
227 w.emit(Forejump)
228 } else if curIndex == 1 {
229 Branchpos := w.popInt()
230 w.pushInt(w.curPos())
231 w.emit1(Goto, 0)
232 w.patchJump(Branchpos, w.curPos())
233 w.emit(Getmark)
234 w.emit(Forejump)
235 if len(node.children) <= 2 {
236 w.patchJump(w.popInt(), w.curPos())
237 }
238 } else if curIndex == 2 {
239 w.patchJump(w.popInt(), w.curPos())
240 }
241
242 case ntLoop | beforeChild, ntLazyloop | beforeChild:
243
244 if node.n < math.MaxInt32 || node.m > 1 {
245 if node.m == 0 {
246 w.emit1(Nullcount, 0)
247 } else {
248 w.emit1(Setcount, 1-node.m)
249 }
250 } else if node.m == 0 {
251 w.emit(Nullmark)
252 } else {
253 w.emit(Setmark)
254 }
255
256 if node.m == 0 {
257 w.pushInt(w.curPos())
258 w.emit1(Goto, 0)
259 }
260 w.pushInt(w.curPos())
261
262 case ntLoop | afterChild, ntLazyloop | afterChild:
263
264 startJumpPos := w.curPos()
265 lazy := (nodetype - (ntLoop | afterChild))
266
267 if node.n < math.MaxInt32 || node.m > 1 {
268 if node.n == math.MaxInt32 {
269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
270 } else {
271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
272 }
273 } else {
274 w.emit1(InstOp(Branchmark+lazy), w.popInt())
275 }
276
277 if node.m == 0 {
278 w.patchJump(w.popInt(), startJumpPos)
279 }
280
281 case ntGroup | beforeChild, ntGroup | afterChild:
282
283 case ntCapture | beforeChild:
284 w.emit(Setmark)
285
286 case ntCapture | afterChild:
287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
288
289 case ntRequire | beforeChild:
290
291
292 w.emit(Setjump)
293
294 w.emit(Setmark)
295
296 case ntRequire | afterChild:
297 w.emit(Getmark)
298
299
300
301 w.emit(Forejump)
302
303 case ntPrevent | beforeChild:
304 w.emit(Setjump)
305 w.pushInt(w.curPos())
306 w.emit1(Lazybranch, 0)
307
308 case ntPrevent | afterChild:
309 w.emit(Backjump)
310 w.patchJump(w.popInt(), w.curPos())
311 w.emit(Forejump)
312
313 case ntGreedy | beforeChild:
314 w.emit(Setjump)
315
316 case ntGreedy | afterChild:
317 w.emit(Forejump)
318
319 case ntOne, ntNotone:
320 w.emit1(InstOp(node.t|ntBits), int(node.ch))
321
322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
323 if node.m > 0 {
324 if node.t == ntOneloop || node.t == ntOnelazy {
325 w.emit2(Onerep|bits, int(node.ch), node.m)
326 } else {
327 w.emit2(Notonerep|bits, int(node.ch), node.m)
328 }
329 }
330 if node.n > node.m {
331 if node.n == math.MaxInt32 {
332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
333 } else {
334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
335 }
336 }
337
338 case ntSetloop, ntSetlazy:
339 if node.m > 0 {
340 w.emit2(Setrep|bits, w.setCode(node.set), node.m)
341 }
342 if node.n > node.m {
343 if node.n == math.MaxInt32 {
344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
345 } else {
346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
347 }
348 }
349
350 case ntMulti:
351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
352
353 case ntSet:
354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
355
356 case ntRef:
357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
358
359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
360 w.emit(InstOp(node.t))
361
362 default:
363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
364 }
365
366 return nil
367 }
368
369
370
371 func (w *writer) pushInt(i int) {
372 w.intStack = append(w.intStack, i)
373 }
374
375
376 func (w *writer) emptyStack() bool {
377 return len(w.intStack) == 0
378 }
379
380
381 func (w *writer) popInt() int {
382
383 idx := len(w.intStack) - 1
384 i := w.intStack[idx]
385
386 w.intStack = w.intStack[:idx]
387 return i
388 }
389
390
391 func (w *writer) curPos() int {
392 return w.curpos
393 }
394
395
396
397 func (w *writer) patchJump(offset, jumpDest int) {
398 w.emitted[offset+1] = jumpDest
399 }
400
401
402
403 func (w *writer) setCode(set *CharSet) int {
404 if w.counting {
405 return 0
406 }
407
408 buf := &bytes.Buffer{}
409
410 set.mapHashFill(buf)
411 hash := buf.String()
412 i, ok := w.sethash[hash]
413 if !ok {
414 i = len(w.sethash)
415 w.sethash[hash] = i
416 w.settable = append(w.settable, set)
417 }
418 return i
419 }
420
421
422
423 func (w *writer) stringCode(str []rune) int {
424 if w.counting {
425 return 0
426 }
427
428 hash := string(str)
429 i, ok := w.stringhash[hash]
430 if !ok {
431 i = len(w.stringhash)
432 w.stringhash[hash] = i
433 w.stringtable = append(w.stringtable, str)
434 }
435
436 return i
437 }
438
439
440
441
442
443 func (w *writer) mapCapnum(capnum int) int {
444 if capnum == -1 {
445 return -1
446 }
447
448 if w.caps != nil {
449 return w.caps[capnum]
450 }
451
452 return capnum
453 }
454
455
456
457
458 func (w *writer) emit(op InstOp) {
459 if w.counting {
460 w.count++
461 if opcodeBacktracks(op) {
462 w.trackcount++
463 }
464 return
465 }
466 w.emitted[w.curpos] = int(op)
467 w.curpos++
468 }
469
470
471 func (w *writer) emit1(op InstOp, opd1 int) {
472 if w.counting {
473 w.count += 2
474 if opcodeBacktracks(op) {
475 w.trackcount++
476 }
477 return
478 }
479 w.emitted[w.curpos] = int(op)
480 w.curpos++
481 w.emitted[w.curpos] = opd1
482 w.curpos++
483 }
484
485
486 func (w *writer) emit2(op InstOp, opd1, opd2 int) {
487 if w.counting {
488 w.count += 3
489 if opcodeBacktracks(op) {
490 w.trackcount++
491 }
492 return
493 }
494 w.emitted[w.curpos] = int(op)
495 w.curpos++
496 w.emitted[w.curpos] = opd1
497 w.curpos++
498 w.emitted[w.curpos] = opd2
499 w.curpos++
500 }
501
View as plain text