...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups/systemd

     1  package systemd
     2  
     3  import (
     4  	"bufio"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  	"os"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  
    14  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    15  	securejoin "github.com/cyphar/filepath-securejoin"
    16  	"github.com/sirupsen/logrus"
    17  
    18  	"github.com/opencontainers/runc/libcontainer/cgroups"
    19  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    20  	"github.com/opencontainers/runc/libcontainer/configs"
    21  )
    22  
    23  type unifiedManager struct {
    24  	mu      sync.Mutex
    25  	cgroups *configs.Cgroup
    26  	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
    27  	path  string
    28  	dbus  *dbusConnManager
    29  	fsMgr cgroups.Manager
    30  }
    31  
    32  func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
    33  	m := &unifiedManager{
    34  		cgroups: config,
    35  		path:    path,
    36  		dbus:    newDbusConnManager(config.Rootless),
    37  	}
    38  	if err := m.initPath(); err != nil {
    39  		return nil, err
    40  	}
    41  
    42  	fsMgr, err := fs2.NewManager(config, m.path)
    43  	if err != nil {
    44  		return nil, err
    45  	}
    46  	m.fsMgr = fsMgr
    47  
    48  	return m, nil
    49  }
    50  
    51  // unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
    52  // key/value map (where key is cgroupfs file name) to systemd unit properties.
    53  // This is on a best-effort basis, so the properties that are not known
    54  // (to this function and/or systemd) are ignored (but logged with "debug"
    55  // log level).
    56  //
    57  // For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
    58  //
    59  // For the list of systemd unit properties, see systemd.resource-control(5).
    60  func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
    61  	var err error
    62  
    63  	for k, v := range res {
    64  		if strings.Contains(k, "/") {
    65  			return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
    66  		}
    67  		sk := strings.SplitN(k, ".", 2)
    68  		if len(sk) != 2 {
    69  			return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
    70  		}
    71  		// Kernel is quite forgiving to extra whitespace
    72  		// around the value, and so should we.
    73  		v = strings.TrimSpace(v)
    74  		// Please keep cases in alphabetical order.
    75  		switch k {
    76  		case "cpu.max":
    77  			// value: quota [period]
    78  			quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
    79  			period := defCPUQuotaPeriod
    80  			sv := strings.Fields(v)
    81  			if len(sv) < 1 || len(sv) > 2 {
    82  				return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
    83  			}
    84  			// quota
    85  			if sv[0] != "max" {
    86  				quota, err = strconv.ParseInt(sv[0], 10, 64)
    87  				if err != nil {
    88  					return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
    89  				}
    90  			}
    91  			// period
    92  			if len(sv) == 2 {
    93  				period, err = strconv.ParseUint(sv[1], 10, 64)
    94  				if err != nil {
    95  					return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
    96  				}
    97  			}
    98  			addCpuQuota(cm, &props, quota, period)
    99  
   100  		case "cpu.weight":
   101  			num, err := strconv.ParseUint(v, 10, 64)
   102  			if err != nil {
   103  				return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   104  			}
   105  			props = append(props,
   106  				newProp("CPUWeight", num))
   107  
   108  		case "cpuset.cpus", "cpuset.mems":
   109  			bits, err := RangeToBits(v)
   110  			if err != nil {
   111  				return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
   112  			}
   113  			m := map[string]string{
   114  				"cpuset.cpus": "AllowedCPUs",
   115  				"cpuset.mems": "AllowedMemoryNodes",
   116  			}
   117  			// systemd only supports these properties since v244
   118  			sdVer := systemdVersion(cm)
   119  			if sdVer >= 244 {
   120  				props = append(props,
   121  					newProp(m[k], bits))
   122  			} else {
   123  				logrus.Debugf("systemd v%d is too old to support %s"+
   124  					" (setting will still be applied to cgroupfs)",
   125  					sdVer, m[k])
   126  			}
   127  
   128  		case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
   129  			num := uint64(math.MaxUint64)
   130  			if v != "max" {
   131  				num, err = strconv.ParseUint(v, 10, 64)
   132  				if err != nil {
   133  					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   134  				}
   135  			}
   136  			m := map[string]string{
   137  				"memory.high":     "MemoryHigh",
   138  				"memory.low":      "MemoryLow",
   139  				"memory.min":      "MemoryMin",
   140  				"memory.max":      "MemoryMax",
   141  				"memory.swap.max": "MemorySwapMax",
   142  			}
   143  			props = append(props,
   144  				newProp(m[k], num))
   145  
   146  		case "pids.max":
   147  			num := uint64(math.MaxUint64)
   148  			if v != "max" {
   149  				var err error
   150  				num, err = strconv.ParseUint(v, 10, 64)
   151  				if err != nil {
   152  					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
   153  				}
   154  			}
   155  			props = append(props,
   156  				newProp("TasksMax", num))
   157  
   158  		case "memory.oom.group":
   159  			// Setting this to 1 is roughly equivalent to OOMPolicy=kill
   160  			// (as per systemd.service(5) and
   161  			// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
   162  			// but it's not clear what to do if it is unset or set
   163  			// to 0 in runc update, as there are two other possible
   164  			// values for OOMPolicy (continue/stop).
   165  			fallthrough
   166  
   167  		default:
   168  			// Ignore the unknown resource here -- will still be
   169  			// applied in Set which calls fs2.Set.
   170  			logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
   171  		}
   172  	}
   173  
   174  	return props, nil
   175  }
   176  
   177  func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
   178  	var properties []systemdDbus.Property
   179  
   180  	// NOTE: This is of questionable correctness because we insert our own
   181  	//       devices eBPF program later. Two programs with identical rules
   182  	//       aren't the end of the world, but it is a bit concerning. However
   183  	//       it's unclear if systemd removes all eBPF programs attached when
   184  	//       doing SetUnitProperties...
   185  	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
   186  	if err != nil {
   187  		return nil, err
   188  	}
   189  	properties = append(properties, deviceProperties...)
   190  
   191  	if r.Memory != 0 {
   192  		properties = append(properties,
   193  			newProp("MemoryMax", uint64(r.Memory)))
   194  	}
   195  	if r.MemoryReservation != 0 {
   196  		properties = append(properties,
   197  			newProp("MemoryLow", uint64(r.MemoryReservation)))
   198  	}
   199  
   200  	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  	if swap != 0 {
   205  		properties = append(properties,
   206  			newProp("MemorySwapMax", uint64(swap)))
   207  	}
   208  
   209  	if r.CpuWeight != 0 {
   210  		properties = append(properties,
   211  			newProp("CPUWeight", r.CpuWeight))
   212  	}
   213  
   214  	addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
   215  
   216  	if r.PidsLimit > 0 || r.PidsLimit == -1 {
   217  		properties = append(properties,
   218  			newProp("TasksMax", uint64(r.PidsLimit)))
   219  	}
   220  
   221  	err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
   222  	if err != nil {
   223  		return nil, err
   224  	}
   225  
   226  	// ignore r.KernelMemory
   227  
   228  	// convert Resources.Unified map to systemd properties
   229  	if r.Unified != nil {
   230  		unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
   231  		if err != nil {
   232  			return nil, err
   233  		}
   234  		properties = append(properties, unifiedProps...)
   235  	}
   236  
   237  	return properties, nil
   238  }
   239  
   240  func (m *unifiedManager) Apply(pid int) error {
   241  	var (
   242  		c          = m.cgroups
   243  		unitName   = getUnitName(c)
   244  		properties []systemdDbus.Property
   245  	)
   246  
   247  	slice := "system.slice"
   248  	if m.cgroups.Rootless {
   249  		slice = "user.slice"
   250  	}
   251  	if c.Parent != "" {
   252  		slice = c.Parent
   253  	}
   254  
   255  	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
   256  
   257  	if strings.HasSuffix(unitName, ".slice") {
   258  		// If we create a slice, the parent is defined via a Wants=.
   259  		properties = append(properties, systemdDbus.PropWants(slice))
   260  	} else {
   261  		// Otherwise it's a scope, which we put into a Slice=.
   262  		properties = append(properties, systemdDbus.PropSlice(slice))
   263  		// Assume scopes always support delegation (supported since systemd v218).
   264  		properties = append(properties, newProp("Delegate", true))
   265  	}
   266  
   267  	// only add pid if its valid, -1 is used w/ general slice creation.
   268  	if pid != -1 {
   269  		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
   270  	}
   271  
   272  	// Always enable accounting, this gets us the same behaviour as the fs implementation,
   273  	// plus the kernel has some problems with joining the memory cgroup at a later time.
   274  	properties = append(properties,
   275  		newProp("MemoryAccounting", true),
   276  		newProp("CPUAccounting", true),
   277  		newProp("IOAccounting", true),
   278  		newProp("TasksAccounting", true),
   279  	)
   280  
   281  	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
   282  	properties = append(properties,
   283  		newProp("DefaultDependencies", false))
   284  
   285  	properties = append(properties, c.SystemdProps...)
   286  
   287  	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
   288  		return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
   289  	}
   290  
   291  	if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
   292  		return err
   293  	}
   294  
   295  	if c.OwnerUID != nil {
   296  		// The directory itself must be chowned.
   297  		err := os.Chown(m.path, *c.OwnerUID, -1)
   298  		if err != nil {
   299  			return err
   300  		}
   301  
   302  		filesToChown, err := cgroupFilesToChown()
   303  		if err != nil {
   304  			return err
   305  		}
   306  
   307  		for _, v := range filesToChown {
   308  			err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
   309  			// Some files might not be present.
   310  			if err != nil && !errors.Is(err, os.ErrNotExist) {
   311  				return err
   312  			}
   313  		}
   314  	}
   315  
   316  	return nil
   317  }
   318  
   319  // The kernel exposes a list of files that should be chowned to the delegate
   320  // uid in /sys/kernel/cgroup/delegate.  If the file is not present
   321  // (Linux < 4.15), use the initial values mentioned in cgroups(7).
   322  func cgroupFilesToChown() ([]string, error) {
   323  	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
   324  
   325  	f, err := os.Open(cgroupDelegateFile)
   326  	if err != nil {
   327  		return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
   328  	}
   329  	defer f.Close()
   330  
   331  	filesToChown := []string{}
   332  	scanner := bufio.NewScanner(f)
   333  	for scanner.Scan() {
   334  		filesToChown = append(filesToChown, scanner.Text())
   335  	}
   336  	if err := scanner.Err(); err != nil {
   337  		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
   338  	}
   339  
   340  	return filesToChown, nil
   341  }
   342  
   343  func (m *unifiedManager) Destroy() error {
   344  	m.mu.Lock()
   345  	defer m.mu.Unlock()
   346  
   347  	unitName := getUnitName(m.cgroups)
   348  	if err := stopUnit(m.dbus, unitName); err != nil {
   349  		return err
   350  	}
   351  
   352  	// systemd 239 do not remove sub-cgroups.
   353  	err := m.fsMgr.Destroy()
   354  	// fsMgr.Destroy has handled ErrNotExist
   355  	if err != nil {
   356  		return err
   357  	}
   358  
   359  	return nil
   360  }
   361  
   362  func (m *unifiedManager) Path(_ string) string {
   363  	return m.path
   364  }
   365  
   366  // getSliceFull value is used in initPath.
   367  // The value is incompatible with systemdDbus.PropSlice.
   368  func (m *unifiedManager) getSliceFull() (string, error) {
   369  	c := m.cgroups
   370  	slice := "system.slice"
   371  	if c.Rootless {
   372  		slice = "user.slice"
   373  	}
   374  	if c.Parent != "" {
   375  		var err error
   376  		slice, err = ExpandSlice(c.Parent)
   377  		if err != nil {
   378  			return "", err
   379  		}
   380  	}
   381  
   382  	if c.Rootless {
   383  		// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
   384  		managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
   385  		if err != nil {
   386  			return "", err
   387  		}
   388  		slice = filepath.Join(managerCG, slice)
   389  	}
   390  
   391  	// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
   392  	// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
   393  	return slice, nil
   394  }
   395  
   396  func (m *unifiedManager) initPath() error {
   397  	if m.path != "" {
   398  		return nil
   399  	}
   400  
   401  	sliceFull, err := m.getSliceFull()
   402  	if err != nil {
   403  		return err
   404  	}
   405  
   406  	c := m.cgroups
   407  	path := filepath.Join(sliceFull, getUnitName(c))
   408  	path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
   409  	if err != nil {
   410  		return err
   411  	}
   412  
   413  	// an example of the final path in rootless:
   414  	// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
   415  	m.path = path
   416  
   417  	return nil
   418  }
   419  
   420  func (m *unifiedManager) Freeze(state configs.FreezerState) error {
   421  	return m.fsMgr.Freeze(state)
   422  }
   423  
   424  func (m *unifiedManager) GetPids() ([]int, error) {
   425  	return cgroups.GetPids(m.path)
   426  }
   427  
   428  func (m *unifiedManager) GetAllPids() ([]int, error) {
   429  	return cgroups.GetAllPids(m.path)
   430  }
   431  
   432  func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
   433  	return m.fsMgr.GetStats()
   434  }
   435  
   436  func (m *unifiedManager) Set(r *configs.Resources) error {
   437  	if r == nil {
   438  		return nil
   439  	}
   440  	properties, err := genV2ResourcesProperties(r, m.dbus)
   441  	if err != nil {
   442  		return err
   443  	}
   444  
   445  	if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
   446  		return fmt.Errorf("unable to set unit properties: %w", err)
   447  	}
   448  
   449  	return m.fsMgr.Set(r)
   450  }
   451  
   452  func (m *unifiedManager) GetPaths() map[string]string {
   453  	paths := make(map[string]string, 1)
   454  	paths[""] = m.path
   455  	return paths
   456  }
   457  
   458  func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
   459  	return m.cgroups, nil
   460  }
   461  
   462  func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
   463  	return m.fsMgr.GetFreezerState()
   464  }
   465  
   466  func (m *unifiedManager) Exists() bool {
   467  	return cgroups.PathExists(m.path)
   468  }
   469  
   470  func (m *unifiedManager) OOMKillCount() (uint64, error) {
   471  	return m.fsMgr.OOMKillCount()
   472  }
   473  

View as plain text