...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups/ebpf

     1  package ebpf
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"os"
     7  	"runtime"
     8  	"sync"
     9  	"unsafe"
    10  
    11  	"github.com/cilium/ebpf"
    12  	"github.com/cilium/ebpf/asm"
    13  	"github.com/cilium/ebpf/link"
    14  	"github.com/sirupsen/logrus"
    15  	"golang.org/x/sys/unix"
    16  )
    17  
    18  func nilCloser() error {
    19  	return nil
    20  }
    21  
    22  func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
    23  	type bpfAttrQuery struct {
    24  		TargetFd    uint32
    25  		AttachType  uint32
    26  		QueryType   uint32
    27  		AttachFlags uint32
    28  		ProgIds     uint64 // __aligned_u64
    29  		ProgCnt     uint32
    30  	}
    31  
    32  	// Currently you can only have 64 eBPF programs attached to a cgroup.
    33  	size := 64
    34  	retries := 0
    35  	for retries < 10 {
    36  		progIds := make([]uint32, size)
    37  		query := bpfAttrQuery{
    38  			TargetFd:   uint32(dirFd),
    39  			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
    40  			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
    41  			ProgCnt:    uint32(len(progIds)),
    42  		}
    43  
    44  		// Fetch the list of program ids.
    45  		_, _, errno := unix.Syscall(unix.SYS_BPF,
    46  			uintptr(unix.BPF_PROG_QUERY),
    47  			uintptr(unsafe.Pointer(&query)),
    48  			unsafe.Sizeof(query))
    49  		size = int(query.ProgCnt)
    50  		runtime.KeepAlive(query)
    51  		if errno != 0 {
    52  			// On ENOSPC we get the correct number of programs.
    53  			if errno == unix.ENOSPC {
    54  				retries++
    55  				continue
    56  			}
    57  			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
    58  		}
    59  
    60  		// Convert the ids to program handles.
    61  		progIds = progIds[:size]
    62  		programs := make([]*ebpf.Program, 0, len(progIds))
    63  		for _, progId := range progIds {
    64  			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
    65  			if err != nil {
    66  				// We skip over programs that give us -EACCES or -EPERM. This
    67  				// is necessary because there may be BPF programs that have
    68  				// been attached (such as with --systemd-cgroup) which have an
    69  				// LSM label that blocks us from interacting with the program.
    70  				//
    71  				// Because additional BPF_CGROUP_DEVICE programs only can add
    72  				// restrictions, there's no real issue with just ignoring these
    73  				// programs (and stops runc from breaking on distributions with
    74  				// very strict SELinux policies).
    75  				if errors.Is(err, os.ErrPermission) {
    76  					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
    77  					continue
    78  				}
    79  				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
    80  			}
    81  			programs = append(programs, program)
    82  		}
    83  		runtime.KeepAlive(progIds)
    84  		return programs, nil
    85  	}
    86  
    87  	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
    88  }
    89  
    90  var (
    91  	haveBpfProgReplaceBool bool
    92  	haveBpfProgReplaceOnce sync.Once
    93  )
    94  
    95  // Loosely based on the BPF_F_REPLACE support check in
    96  // https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
    97  //
    98  // TODO: move this logic to cilium/ebpf
    99  func haveBpfProgReplace() bool {
   100  	haveBpfProgReplaceOnce.Do(func() {
   101  		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
   102  			Type:    ebpf.CGroupDevice,
   103  			License: "MIT",
   104  			Instructions: asm.Instructions{
   105  				asm.Mov.Imm(asm.R0, 0),
   106  				asm.Return(),
   107  			},
   108  		})
   109  		if err != nil {
   110  			logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
   111  			return
   112  		}
   113  		defer prog.Close()
   114  
   115  		devnull, err := os.Open("/dev/null")
   116  		if err != nil {
   117  			logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
   118  			return
   119  		}
   120  		defer devnull.Close()
   121  
   122  		// We know that we have BPF_PROG_ATTACH since we can load
   123  		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
   124  		// we know that the feature isn't present.
   125  		err = link.RawAttachProgram(link.RawAttachProgramOptions{
   126  			// We rely on this fd being checked after attachFlags.
   127  			Target: int(devnull.Fd()),
   128  			// Attempt to "replace" bad fds with this program.
   129  			Program: prog,
   130  			Attach:  ebpf.AttachCGroupDevice,
   131  			Flags:   unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
   132  		})
   133  		if errors.Is(err, unix.EINVAL) {
   134  			// not supported
   135  			return
   136  		}
   137  		// attach_flags test succeeded.
   138  		if !errors.Is(err, unix.EBADF) {
   139  			logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
   140  		}
   141  		haveBpfProgReplaceBool = true
   142  	})
   143  	return haveBpfProgReplaceBool
   144  }
   145  
   146  // LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
   147  //
   148  // Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
   149  //
   150  // https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
   151  func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
   152  	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
   153  	// This limit is not inherited into the container.
   154  	memlockLimit := &unix.Rlimit{
   155  		Cur: unix.RLIM_INFINITY,
   156  		Max: unix.RLIM_INFINITY,
   157  	}
   158  	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
   159  
   160  	// Get the list of existing programs.
   161  	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
   162  	if err != nil {
   163  		return nilCloser, err
   164  	}
   165  	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
   166  
   167  	// Generate new program.
   168  	spec := &ebpf.ProgramSpec{
   169  		Type:         ebpf.CGroupDevice,
   170  		Instructions: insts,
   171  		License:      license,
   172  	}
   173  	prog, err := ebpf.NewProgram(spec)
   174  	if err != nil {
   175  		return nilCloser, err
   176  	}
   177  
   178  	// If there is only one old program, we can just replace it directly.
   179  	var (
   180  		replaceProg *ebpf.Program
   181  		attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
   182  	)
   183  	if useReplaceProg {
   184  		replaceProg = oldProgs[0]
   185  		attachFlags |= unix.BPF_F_REPLACE
   186  	}
   187  	err = link.RawAttachProgram(link.RawAttachProgramOptions{
   188  		Target:  dirFd,
   189  		Program: prog,
   190  		Replace: replaceProg,
   191  		Attach:  ebpf.AttachCGroupDevice,
   192  		Flags:   attachFlags,
   193  	})
   194  	if err != nil {
   195  		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
   196  	}
   197  	closer := func() error {
   198  		err = link.RawDetachProgram(link.RawDetachProgramOptions{
   199  			Target:  dirFd,
   200  			Program: prog,
   201  			Attach:  ebpf.AttachCGroupDevice,
   202  		})
   203  		if err != nil {
   204  			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
   205  		}
   206  		// TODO: Should we attach the old filters back in this case? Otherwise
   207  		//       we fail-open on a security feature, which is a bit scary.
   208  		return nil
   209  	}
   210  	if !useReplaceProg {
   211  		logLevel := logrus.DebugLevel
   212  		// If there was more than one old program, give a warning (since this
   213  		// really shouldn't happen with runc-managed cgroups) and then detach
   214  		// all the old programs.
   215  		if len(oldProgs) > 1 {
   216  			// NOTE: Ideally this should be a warning but it turns out that
   217  			//       systemd-managed cgroups trigger this warning (apparently
   218  			//       systemd doesn't delete old non-systemd programs when
   219  			//       setting properties).
   220  			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
   221  			logLevel = logrus.InfoLevel
   222  		}
   223  		for idx, oldProg := range oldProgs {
   224  			// Output some extra debug info.
   225  			if info, err := oldProg.Info(); err == nil {
   226  				fields := logrus.Fields{
   227  					"type": info.Type.String(),
   228  					"tag":  info.Tag,
   229  					"name": info.Name,
   230  				}
   231  				if id, ok := info.ID(); ok {
   232  					fields["id"] = id
   233  				}
   234  				if runCount, ok := info.RunCount(); ok {
   235  					fields["run_count"] = runCount
   236  				}
   237  				if runtime, ok := info.Runtime(); ok {
   238  					fields["runtime"] = runtime.String()
   239  				}
   240  				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
   241  			}
   242  			err = link.RawDetachProgram(link.RawDetachProgramOptions{
   243  				Target:  dirFd,
   244  				Program: oldProg,
   245  				Attach:  ebpf.AttachCGroupDevice,
   246  			})
   247  			if err != nil {
   248  				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
   249  			}
   250  		}
   251  	}
   252  	return closer, nil
   253  }
   254  

View as plain text