...

Source file src/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go

Documentation: github.com/opencontainers/runc/libcontainer/seccomp/patchbpf

//go:build cgo && seccomp
// +build cgo,seccomp

package patchbpf

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"unsafe"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);

// Copied from <linux/seccomp.h>.

#ifndef SECCOMP_SET_MODE_FILTER
#	define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;

#ifndef SECCOMP_FILTER_FLAG_LOG
#	define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#	define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV		243
#endif
#define AUDIT_ARCH_RISCV64	(EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif

// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.

const uint32_t C_AUDIT_ARCH_I386         = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64       = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM          = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64      = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS         = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64       = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32    = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL       = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64     = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32  = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC          = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64        = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE      = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390         = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X        = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64      = AUDIT_ARCH_RISCV64;
*/
import "C"
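
// Function-like macros such as SCMP_ACT_ERRNO(ENOSYS) cannot be referenced
// directly from Go, which is why the preamble above materialises each value
// as a typed C constant. A minimal sketch of the same pattern (the C_FOO
// name is purely illustrative):
//
//	/*
//	#include <seccomp.h>
//	const uint32_t C_FOO = SCMP_ACT_ERRNO(ENOSYS);
//	*/
//	import "C"
//
//	var foo = uint32(C.C_FOO) // now an ordinary Go value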

var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

func isAllowAction(action configs.Action) bool {
	switch action {
	// Trace is considered an "allow" action because a good tracer should
	// support future syscalls (by handling -ENOSYS on its own), and giving
	// -ENOSYS will be disruptive for emulation.
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
loop:
	for {
		// Read the next instruction. We have to use NativeEndian because
		// seccomp_export_bpf outputs the program in *host* endian-ness.
		var insn unix.SockFilter
		if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
			if errors.Is(err, io.EOF) {
				// Parsing complete.
				break loop
			}
			if errors.Is(err, io.ErrUnexpectedEOF) {
				// Parsing stopped mid-instruction.
				return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
			}
			// All other errors.
			return nil, fmt.Errorf("error parsing instructions: %w", err)
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}
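
// As an illustrative sketch, assuming raw holds struct sock_filter entries
// written in host endianness (as seccomp_export_bpf produces), the decoder
// above is used like so:
//
//	insns, err := parseProgram(bytes.NewReader(raw))
//	if err != nil {
//		// handle the error
//	}
//	_ = insns // each entry is one raw cBPF instruction (Op/Jt/Jf/K)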

func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("error creating scratch pipe: %w", err)
	}
	defer wtr.Close()
	defer rdr.Close()

	readerBuffer := new(bytes.Buffer)
	errChan := make(chan error, 1)
	go func() {
		_, err := io.Copy(readerBuffer, rdr)
		errChan <- err
		close(errChan)
	}()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, fmt.Errorf("error exporting BPF: %w", err)
	}
	// Close so that the reader actually gets EOF.
	_ = wtr.Close()

	if copyErr := <-errChan; copyErr != nil {
		return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
	}

	// Parse the instructions.
	rawProgram, err := parseProgram(readerBuffer)
	if err != nil {
		return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.New("could not disassemble entire BPF filter")
	}
	return program, nil
}
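
// For illustration, a filter with a single allowed syscall typically
// disassembles into something of this shape (the exact program depends on
// the libseccomp version and the architectures enabled):
//
//	ld [4]                    ; load the audit arch token
//	jeq AUDIT_ARCH_X86_64,1   ; known arch? fall through if so
//	ret KILL                  ; bad-arch action
//	ld [0]                    ; load the syscall number
//	jeq [sysno],0,1           ; rule match?
//	ret ALLOW
//	ret [default action]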

type nativeArch uint32

const invalidArch nativeArch = 0

func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
		}
		return archToNative(arch)
	case libseccomp.ArchX86:
		return nativeArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
		//       30th bit of the syscall number set to indicate that it's not a
		//       normal x86_64 syscall.
		return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return nativeArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return nativeArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return nativeArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return nativeArch(C.C_AUDIT_ARCH_S390X), nil
	case libseccomp.ArchRISCV64:
		return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
	default:
		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
	}
}
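
// For example, on an amd64 host both of these calls resolve to the same
// kernel value, because the kernel reports x32 syscalls under
// AUDIT_ARCH_X86_64 (with bit 30 of the syscall number set):
//
//	native, _ := archToNative(libseccomp.ArchNative) // nativeArch(AUDIT_ARCH_X86_64)
//	x32, _ := archToNative(libseccomp.ArchX32)       // nativeArch(AUDIT_ARCH_X86_64)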

type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out the largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	lastSyscalls := make(lastSyscallMap)
	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}

		// Map native architecture to a real architecture value to avoid
		// doubling-up the lastSyscall mapping.
		if arch == libseccomp.ArchNative {
			nativeArch, err := libseccomp.GetNativeArch()
			if err != nil {
				return nil, fmt.Errorf("unable to get native architecture: %w", err)
			}
			arch = nativeArch
		}

		// Figure out native architecture representation of the architecture.
		nativeArch, err := archToNative(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[nativeArch]; !ok {
			lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[nativeArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (nativeArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			lastSyscalls[nativeArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %s", ociArch)
			delete(lastSyscalls[nativeArch], arch)
		}
	}
	return lastSyscalls, nil
}
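
// As a sketch, a config whose architectures are SCMP_ARCH_X86_64 and
// SCMP_ARCH_X32 produces a single native-arch key with one entry per ABI:
//
//	lastSyscallMap{
//		nativeArch(AUDIT_ARCH_X86_64): {
//			libseccomp.ArchAMD64: /* largest amd64 syscall in the filter */,
//			libseccomp.ArchX32:   /* largest x32 syscall in the filter */,
//		},
//	}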

// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on the kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each nativeArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[nativeArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}
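
	// To illustrate with the two-instruction tail above: a section prepended
	// directly in front of it computes
	//
	//	baseJumpEnosys = len(programTail) - 1 // distance to "ret ENOSYS" from the tail's start
	//	baseJumpFilter = baseJumpEnosys + 1   // distance past the whole tail, into the filter
	//
	// Because jumps are measured from the *end* of the program, these offsets
	// stay correct no matter how many further sections are prepended later.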

	// Generate the syscall -ENOSYS rules.
	for nativeArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0] (syscall number)
			bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall and scmpArch in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
			// multiplexing "large syscall number" syscalls, but if the syscall
			// number is not known to the kernel then the syscall number is
			// left unchanged (and because it is sysno=0, you'll end up with
			// EPERM for syscalls the kernel doesn't know about).
			//
			// The actual setup(2) syscall is never used by userspace anymore
			// (and hasn't existed for decades) outside of this multiplexing
			// scheme so returning -ENOSYS is fine.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [setup=0],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys is larger than 255 (and
			// thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86 we need to add a check for x32 and if we're in
			// the wrong mode we jump over the section.
			if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case; we can't handle any others.
			if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ja [baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ja [baseJumpEnosys+1]
					bpf.Jump{Skip: baseJumpEnosys + 1},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[nativeArch] = uint32(len(programTail))
	}

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	//       architectures based on how large the jumps are going to be, or
	//       re-sort the candidate architectures each time to make sure that we
	//       pick the largest jump which is going to be smaller than 255.
	for nativeArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[nativeArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(nativeArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(nativeArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}
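
// Putting it together, for a single-architecture filter (say arm64, with
// largest referenced syscall N) the generated stub would look roughly like:
//
//	ld [4]                     ; load the audit arch token
//	jeq AUDIT_ARCH_AARCH64,1   ; our arch? skip the dummy jump
//	ja 5                       ; unknown arch: jump straight into the filter
//	ld [0]                     ; load the syscall number
//	jgt N,2                    ; newer than anything in the filter? ret ENOSYS
//	ja 2                       ; otherwise jump into the filter
//	ja 1                       ; fall-through trampoline into the filter
//	ret ERRNO(ENOSYS)
//	... original filter ...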

func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to []unix.SockFilter so the program can be loaded through
	// unix.SockFprog.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}
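
// An illustrative use: the result feeds straight into the loader below.
//
//	filter, err := assemble(fullProgram)
//	if err != nil {
//		// handle the error
//	}
//	fd, err := sysSeccompSetFilter(seccompFlags, filter)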

func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// There is no need to patch the generated cBPF if the configured default
	// errno return is already ENOSYS.
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf("  [%4.1d] %s", idx, insn)
	}
	logrus.Debugf("  [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}

func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}

	// TODO: Support seccomp flags not yet added to libseccomp-golang...

	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}
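
// For instance, assuming API level >= 3, a filter with the log bit set and
// at least one Notify rule yields
//
//	flags == uint(C.C_FILTER_FLAG_LOG|C.C_FILTER_FLAG_NEW_LISTENER) // (1<<1)|(1<<3) == 0xa
//
// while a plain filter with neither yields flags == 0, which selects the
// prctl(2) path in sysSeccompSetFilter below.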

func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}
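
// For reference, the flags != 0 branch above is roughly equivalent to the
// raw C call
//
//	fd = seccomp(SECCOMP_SET_MODE_FILTER, flags, &fprog);
//
// which, per seccomp(2), returns a new notification fd on success when
// SECCOMP_FILTER_FLAG_NEW_LISTENER is set (and 0 otherwise).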

// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default action behaviour, and loads the patched filter into the
// kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
	}

	return fd, nil
}
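
// A minimal usage sketch (error handling elided; in runc the filter is built
// and populated with the config's rules by the libcontainer/seccomp package
// before this call):
//
//	filter, _ := libseccomp.NewFilter(libseccomp.ActErrno.SetReturnCode(int16(unix.ENOSYS)))
//	// ... add one rule per config.Syscalls entry ...
//	listenerFd, err := PatchAndLoad(config, filter)
//	if err != nil {
//		// handle the error
//	}
//	// listenerFd is -1 unless a Notify rule requested a new listener fd.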
