package patchbpf

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"unsafe"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/utils"
)

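// NOTE: minimal sketch of a cgo preamble, assuming the C_* constants used
// below are direct aliases of the corresponding kernel/libseccomp
// definitions (SCMP_ACT_ERRNO(ENOSYS), AUDIT_ARCH_*, SECCOMP_SET_MODE_FILTER
// and SECCOMP_FILTER_FLAG_*).

/*
#cgo pkg-config: libseccomp

#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/audit.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);

const uintptr_t C_SET_MODE_FILTER          = SECCOMP_SET_MODE_FILTER;
const uintptr_t C_FILTER_FLAG_LOG          = SECCOMP_FILTER_FLAG_LOG;
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

const uint32_t C_AUDIT_ARCH_I386        = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64      = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM         = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64     = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS        = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64      = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32   = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL      = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64    = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC         = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64       = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE     = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390        = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X       = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64     = AUDIT_ARCH_RISCV64;
*/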
import "C"

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

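// On s390(x), syscall number 0 is used to multiplex "large" syscall numbers;
// libseccomp is not aware of this mechanism, so the stub below handles it
// explicitly.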
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

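// isAllowAction reports whether a seccomp action lets the syscall through.
// Trace counts as "allow" here because a tracer is expected to handle unknown
// syscalls itself, so returning -ENOSYS for them would be disruptive.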
func isAllowAction(action configs.Action) bool {
	switch action {
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

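// parseProgram decodes a raw cBPF program (as produced by libseccomp's
// ExportBPF) into bpf.RawInstructions.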
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
loop:
	for {
		// Read the next instruction. We have to use the host's native
		// endianness because seccomp_export_bpf outputs the program in host
		// byte order.
		var insn unix.SockFilter
		if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
			if errors.Is(err, io.EOF) {
				// Parsing complete.
				break loop
			}
			if errors.Is(err, io.ErrUnexpectedEOF) {
				// Parsing stopped mid-instruction.
				return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
			}
			// All other errors.
			return nil, fmt.Errorf("error parsing instructions: %w", err)
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}

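// disassembleFilter exports the cBPF program from a libseccomp filter
// through a pipe and disassembles it into bpf.Instructions so that it can be
// inspected and patched.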
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("error creating scratch pipe: %w", err)
	}
	defer wtr.Close()
	defer rdr.Close()

	readerBuffer := new(bytes.Buffer)
	errChan := make(chan error, 1)
	go func() {
		_, err := io.Copy(readerBuffer, rdr)
		errChan <- err
		close(errChan)
	}()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, fmt.Errorf("error exporting BPF: %w", err)
	}
	// Close the write end so the copying goroutine sees EOF.
	_ = wtr.Close()

	if copyErr := <-errChan; copyErr != nil {
		return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
	}

	// Parse the exported instructions.
	rawProgram, err := parseProgram(readerBuffer)
	if err != nil {
		return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.New("could not disassemble entire BPF filter")
	}
	return program, nil
}

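// nativeArch is the kernel's representation of an architecture (the
// AUDIT_ARCH_* value seen in seccomp_data.arch), as opposed to libseccomp's
// ScmpArch values.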
type nativeArch uint32

const invalidArch nativeArch = 0

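// archToNative maps a libseccomp architecture to the corresponding
// AUDIT_ARCH_* constant used by the kernel.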
func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to the actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
		}
		return archToNative(arch)
	case libseccomp.ArchX86:
		return nativeArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 syscalls are reported under AUDIT_ARCH_X86_64; the two
		// ABIs are distinguished by bit 30 of the syscall number rather
		// than by a separate audit arch.
		return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return nativeArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return nativeArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return nativeArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return nativeArch(C.C_AUDIT_ARCH_S390X), nil
	case libseccomp.ArchRISCV64:
		return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
	default:
		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
	}
}

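// lastSyscallMap maps a native (AUDIT_ARCH_*) architecture to, for each
// libseccomp architecture sharing that native value, the largest syscall
// number referenced by the filter rules.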
type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

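// findLastSyscalls figures out, for each architecture in the seccomp config,
// the largest syscall number referenced by the filter. The result is keyed by
// the native architecture (what the generated cBPF sees), while per-ABI
// entries such as amd64 and x32 are kept separate because their syscall
// tables differ.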
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	lastSyscalls := make(lastSyscallMap)

	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}

		// Map "native" to a concrete architecture.
		if arch == libseccomp.ArchNative {
			nativeArch, err := libseccomp.GetNativeArch()
			if err != nil {
				return nil, fmt.Errorf("unable to get native architecture: %w", err)
			}
			arch = nativeArch
		}

		// Map the architecture to its AUDIT_ARCH_ value.
		nativeArch, err := archToNative(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[nativeArch]; !ok {
			lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[nativeArch][arch]; ok {
			// Because of the ArchNative mapping we may hit the same
			// (nativeArch, arch) pair more than once -- only handle it once.
			continue
		}

		// Find the largest syscall number referenced for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore syscalls this architecture does not know about.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			lastSyscalls[nativeArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %s", ociArch)
			delete(lastSyscalls[nativeArch], arch)
		}
	}
	return lastSyscalls, nil
}

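// generateEnosysStub generates a cBPF prelude that is prepended to the
// libseccomp-generated filter. For every syscall whose number is larger than
// any syscall referenced by the filter for its architecture, the stub returns
// -ENOSYS instead of falling through to the filter's default action. This
// gives syscalls that are probably newer than the filter a friendlier error
// than the default, without affecting syscalls the filter knows about.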
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// Jump table for the architecture sections. Offsets are recorded relative
	// to the end of the program so they stay valid as more sections are
	// prepended.
	archJumpTable := map[nativeArch]uint32{}

	// The stub is built back-to-front: everything either jumps past the stub
	// (into the original filter) or returns -ENOSYS.
	programTail := []bpf.Instruction{
		// Fall through to the original filter.
		bpf.Jump{Skip: 1},
		// Return -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

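	// Generate an -ENOSYS section for each native architecture. Sections are
	// prepended to programTail in turn, so all jump offsets are computed
	// relative to the (current) end of the program.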
	for nativeArch, maxSyscalls := range lastSyscalls {
		// Number of instructions, counted from the end of this section, that
		// must be jumped to reach the -ENOSYS return; one more lands past the
		// stub, on the original filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Every section starts by loading the syscall number
		// (seccomp_data.nr, offset 0).
		section := []bpf.Instruction{
			// load [0]
			bpf.LoadAbsolute{Off: 0, Size: 4},
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it.
			continue
		case 1:
			// Get the only (arch, syscall) pair in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// On s390(x), syscall number 0 multiplexes "large" syscall
			// numbers, so it cannot be range-checked like an ordinary
			// syscall number -- return -ENOSYS for it directly.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [multiplex syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret -ENOSYS
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case is a single jgt: syscalls above the largest
			// known syscall return -ENOSYS, everything else goes to the
			// filter. If the -ENOSYS return is more than 255 instructions
			// away, a long jump is needed instead of a conditional offset.
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [sysno],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [sysno],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// On x86_64 the amd64 and x32 ABIs share the same audit arch and
			// are told apart by bit 30 of the syscall number, so prefix the
			// section with a mode check.
			if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// If the syscall comes from the "other" ABI, skip straight to
				// the jump into the original filter (the last tail
				// instruction).
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jnset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and amd64 are the only architectures that can overlap on
			// the same native arch value.
			if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// Check the x32 bit (bit 30) first, then compare the syscall
			// number against the right ABI's largest known syscall. As in the
			// single-arch case, a long-jump variant is needed when the
			// -ENOSYS return is more than 255 instructions away.
			if baseJumpEnosys+2 <= 255 {
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// Long-jump variant: the conditional jumps only select which
				// unconditional jump to take.
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update the jump table, measured from the end of the program.
		archJumpTable[nativeArch] = uint32(len(programTail))
	}

	// Add a catch-all "jump to filter" for any architecture not handled
	// above, so unknown architectures fall through to the original filter
	// rather than into an unrelated section.
	programTail = append([]bpf.Instruction{
		// ja [end of stub]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

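	// Generate the conditional jumps into each architecture section. This is
	// also done back-to-front (prepending to programTail), since every rule
	// we add shifts the offsets of everything that comes after it.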
	for nativeArch := range lastSyscalls {
		// Number of instructions between the current head of the program and
		// this architecture's section.
		jump := uint32(len(programTail)) - archJumpTable[nativeArch]

		// Use a single jeq when the jump fits in a conditional offset,
		// otherwise a jne followed by a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(nativeArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(nativeArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

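	// Finally, the stub starts by loading the architecture
	// (seccomp_data.arch, offset 4) that the jump rules above test against.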
	programTail = append([]bpf.Instruction{
		// load [4]
		bpf.LoadAbsolute{Off: 4, Size: 4},
	}, programTail...)

	return programTail, nil
}

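// assemble compiles a bpf.Instruction program into the unix.SockFilter form
// expected by prctl(2) and seccomp(2).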
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to the kernel-facing unix.SockFilter representation.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

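// generatePatch returns the -ENOSYS stub program for the given seccomp
// config, or nil if no stub is needed.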
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// No patching is needed if the filter's default errno is already ENOSYS:
	// unknown syscalls will get the right error without a stub.
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// The stub is also pointless if the default action allows the syscall.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

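// enosysPatchFilter disassembles the libseccomp-generated filter, prepends
// the -ENOSYS stub to it, and reassembles the combined program into a form
// that can be loaded into the kernel.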
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf(" [%4.1d] %s", idx, insn)
	}
	logrus.Debugf(" [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}

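// filterFlags computes the seccomp(2) flags needed to load the filter, and
// reports whether the filter wants the no_new_privs bit set.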
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error here: older libseccomp versions are simply treated as
	// API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}

	// A notifying filter needs SECCOMP_FILTER_FLAG_NEW_LISTENER so that
	// loading it returns a notification fd.
	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}

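// sysSeccompSetFilter loads the given cBPF filter into the kernel, using
// prctl(2) when no flags are required and seccomp(2) otherwise. If
// SECCOMP_FILTER_FLAG_NEW_LISTENER is among the flags, the returned fd is the
// notification fd; otherwise it is -1.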
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1

	if flags == 0 {
		// No seccomp flags were requested, so the older prctl(2) interface
		// is sufficient.
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}

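// PatchAndLoad patches the given libseccomp filter (which must already have
// been configured from the seccomp config) with the -ENOSYS stub and loads
// the result into the kernel for the current process. It returns the seccomp
// notification fd if one was requested, and -1 otherwise.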
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if the filter requested it. runc normally handles
	// no_new_privs separately, so warn if we end up doing it here.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
	}

	return fd, nil
}