...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups/systemd

     1  package systemd
     2  
     3  import (
     4  	"bufio"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"math"
     9  	"os"
    10  	"regexp"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    17  	dbus "github.com/godbus/dbus/v5"
    18  	"github.com/sirupsen/logrus"
    19  
    20  	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
    21  	"github.com/opencontainers/runc/libcontainer/configs"
    22  	"github.com/opencontainers/runc/libcontainer/devices"
    23  )
    24  
const (
	// defCPUQuotaPeriod is the default kernel value for the cpu quota period:
	// 100000 us (100 ms), same for v1 and v2.
	// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
	// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	defCPUQuotaPeriod = uint64(100000)
)
    31  
var (
	// versionOnce guards the one-time initialization of version, the cached
	// systemd version number returned by systemdVersion.
	versionOnce sync.Once
	version     int

	// isRunningSystemdOnce guards the one-time initialization of
	// isRunningSystemd, the cached result returned by IsRunningSystemd.
	isRunningSystemdOnce sync.Once
	isRunningSystemd     bool
)
    39  
    40  // NOTE: This function comes from package github.com/coreos/go-systemd/util
    41  // It was borrowed here to avoid a dependency on cgo.
    42  //
    43  // IsRunningSystemd checks whether the host was booted with systemd as its init
    44  // system. This functions similarly to systemd's `sd_booted(3)`: internally, it
    45  // checks whether /run/systemd/system/ exists and is a directory.
    46  // http://www.freedesktop.org/software/systemd/man/sd_booted.html
    47  func IsRunningSystemd() bool {
    48  	isRunningSystemdOnce.Do(func() {
    49  		fi, err := os.Lstat("/run/systemd/system")
    50  		isRunningSystemd = err == nil && fi.IsDir()
    51  	})
    52  	return isRunningSystemd
    53  }
    54  
    55  // systemd represents slice hierarchy using `-`, so we need to follow suit when
    56  // generating the path of slice. Essentially, test-a-b.slice becomes
    57  // /test.slice/test-a.slice/test-a-b.slice.
    58  func ExpandSlice(slice string) (string, error) {
    59  	suffix := ".slice"
    60  	// Name has to end with ".slice", but can't be just ".slice".
    61  	if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
    62  		return "", fmt.Errorf("invalid slice name: %s", slice)
    63  	}
    64  
    65  	// Path-separators are not allowed.
    66  	if strings.Contains(slice, "/") {
    67  		return "", fmt.Errorf("invalid slice name: %s", slice)
    68  	}
    69  
    70  	var path, prefix string
    71  	sliceName := strings.TrimSuffix(slice, suffix)
    72  	// if input was -.slice, we should just return root now
    73  	if sliceName == "-" {
    74  		return "/", nil
    75  	}
    76  	for _, component := range strings.Split(sliceName, "-") {
    77  		// test--a.slice isn't permitted, nor is -test.slice.
    78  		if component == "" {
    79  			return "", fmt.Errorf("invalid slice name: %s", slice)
    80  		}
    81  
    82  		// Append the component to the path and to the prefix.
    83  		path += "/" + prefix + component + suffix
    84  		prefix += component + "-"
    85  	}
    86  	return path, nil
    87  }
    88  
    89  func groupPrefix(ruleType devices.Type) (string, error) {
    90  	switch ruleType {
    91  	case devices.BlockDevice:
    92  		return "block-", nil
    93  	case devices.CharDevice:
    94  		return "char-", nil
    95  	default:
    96  		return "", fmt.Errorf("device type %v has no group prefix", ruleType)
    97  	}
    98  }
    99  
   100  // findDeviceGroup tries to find the device group name (as listed in
   101  // /proc/devices) with the type prefixed as required for DeviceAllow, for a
   102  // given (type, major) combination. If more than one device group exists, an
   103  // arbitrary one is chosen.
   104  func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
   105  	fh, err := os.Open("/proc/devices")
   106  	if err != nil {
   107  		return "", err
   108  	}
   109  	defer fh.Close()
   110  
   111  	prefix, err := groupPrefix(ruleType)
   112  	if err != nil {
   113  		return "", err
   114  	}
   115  
   116  	scanner := bufio.NewScanner(fh)
   117  	var currentType devices.Type
   118  	for scanner.Scan() {
   119  		// We need to strip spaces because the first number is column-aligned.
   120  		line := strings.TrimSpace(scanner.Text())
   121  
   122  		// Handle the "header" lines.
   123  		switch line {
   124  		case "Block devices:":
   125  			currentType = devices.BlockDevice
   126  			continue
   127  		case "Character devices:":
   128  			currentType = devices.CharDevice
   129  			continue
   130  		case "":
   131  			continue
   132  		}
   133  
   134  		// Skip lines unrelated to our type.
   135  		if currentType != ruleType {
   136  			continue
   137  		}
   138  
   139  		// Parse out the (major, name).
   140  		var (
   141  			currMajor int64
   142  			currName  string
   143  		)
   144  		if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
   145  			if err == nil {
   146  				err = errors.New("wrong number of fields")
   147  			}
   148  			return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
   149  		}
   150  
   151  		if currMajor == ruleMajor {
   152  			return prefix + currName, nil
   153  		}
   154  	}
   155  	if err := scanner.Err(); err != nil {
   156  		return "", fmt.Errorf("reading /proc/devices: %w", err)
   157  	}
   158  	// Couldn't find the device group.
   159  	return "", nil
   160  }
   161  
// deviceAllowEntry is a single element of the systemd DeviceAllow property.
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
	// Path identifies the device(s): a /dev/{block,char}/MAJOR:MINOR path,
	// a device group name, or a "block-*"/"char-*" wildcard.
	Path string
	// Perms is the permission string of the rule.
	Perms string
}
   168  
   169  func allowAllDevices() []systemdDbus.Property {
   170  	// Setting mode to auto and removing all DeviceAllow rules
   171  	// results in allowing access to all devices.
   172  	return []systemdDbus.Property{
   173  		newProp("DevicePolicy", "auto"),
   174  		newProp("DeviceAllow", []deviceAllowEntry{}),
   175  	}
   176  }
   177  
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
//
// r holds the device rules (r.Devices) and the r.SkipDevices flag; when the
// flag is set, no properties are generated at all (nil, nil). sdVer is the
// systemd version, used to decide whether device paths must exist before a
// rule for them can be added.
//
// The returned properties always start with a "strict" DevicePolicy and an
// empty DeviceAllow, except for an allow-all ruleset, which yields the result
// of allowAllDevices. Rules that systemd cannot express are skipped with a
// warning rather than treated as errors.
func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
	if r.SkipDevices {
		return nil, nil
	}

	properties := []systemdDbus.Property{
		// Always run in the strictest white-list mode.
		newProp("DevicePolicy", "strict"),
		// Empty the DeviceAllow array before filling it.
		newProp("DeviceAllow", []deviceAllowEntry{}),
	}

	// Figure out the set of rules.
	configEmu := &cgroupdevices.Emulator{}
	for _, rule := range r.Devices {
		if err := configEmu.Apply(*rule); err != nil {
			return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
		}
	}
	// systemd doesn't support blacklists. So we log a warning, and tell
	// systemd to act as a deny-all whitelist. This ruleset will be replaced
	// with our normal fallback code. This may result in spurious errors, but
	// the only other option is to error out here.
	if configEmu.IsBlacklist() {
		// However, if we're dealing with an allow-all rule then we can do it.
		if configEmu.IsAllowAll() {
			return allowAllDevices(), nil
		}
		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
		return properties, nil
	}

	// Now generate the set of rules we actually need to apply. Unlike the
	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
	// whitelist which is the default for devices.Emulator.
	finalRules, err := configEmu.Rules()
	if err != nil {
		return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
	}
	var deviceAllowList []deviceAllowEntry
	for _, rule := range finalRules {
		if !rule.Allow {
			// Should never happen.
			return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
		}
		// Only block and character devices can be expressed in DeviceAllow.
		switch rule.Type {
		case devices.BlockDevice, devices.CharDevice:
		default:
			// Should never happen.
			return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
		}

		entry := deviceAllowEntry{
			Perms: string(rule.Permissions),
		}

		// systemd has a fairly odd (though understandable) syntax here, and
		// because of the OCI configuration format we have to do quite a bit of
		// trickery to convert things:
		//
		//  * Concrete rules with non-wildcard major/minor numbers have to use
		//    /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
		//    stat(2) on such paths to look up device properties, meaning we
		//    cannot add whitelist rules for devices that don't exist. Since v240,
		//    device properties are parsed from the path string.
		//
		//    However, path globbing is not supported for path-based rules so we
		//    need to handle wildcards in some other manner.
		//
		//  * Wildcard-minor rules have to specify a "device group name" (the
		//    second column in /proc/devices).
		//
		//  * Wildcard (major and minor) rules can just specify a glob with the
		//    type ("char-*" or "block-*").
		//
		// The only type of rule we can't handle is wildcard-major rules, and
		// so we'll give a warning in that case (note that the fallback code
		// will insert any rules systemd couldn't handle). What amazing fun.

		if rule.Major == devices.Wildcard {
			// "_ *:n _" rules aren't supported by systemd.
			if rule.Minor != devices.Wildcard {
				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
				continue
			}

			// "_ *:* _" rules just wildcard everything.
			prefix, err := groupPrefix(rule.Type)
			if err != nil {
				return nil, err
			}
			entry.Path = prefix + "*"
		} else if rule.Minor == devices.Wildcard {
			// "_ n:* _" rules require a device group from /proc/devices.
			group, err := findDeviceGroup(rule.Type, rule.Major)
			if err != nil {
				return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
			}
			if group == "" {
				// Couldn't find a group.
				logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
				continue
			}
			entry.Path = group
		} else {
			// "_ n:m _" rules are just a path in /dev/{block,char}/.
			switch rule.Type {
			case devices.BlockDevice:
				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
			case devices.CharDevice:
				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
			}
			if sdVer < 240 {
				// Old systemd versions use stat(2) on path to find out device major:minor
				// numbers and type. If the path doesn't exist, it will not add the rule,
				// emitting a warning instead.
				// Since all of this logic is best-effort anyway (we manually set these
				// rules separately to systemd) we can safely skip entries that don't
				// have a corresponding path.
				if _, err := os.Stat(entry.Path); err != nil {
					continue
				}
			}
		}
		deviceAllowList = append(deviceAllowList, entry)
	}

	// The collected rules go into a second DeviceAllow property, after the
	// empty one that is already in the list.
	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
	return properties, nil
}
   310  
   311  func newProp(name string, units interface{}) systemdDbus.Property {
   312  	return systemdDbus.Property{
   313  		Name:  name,
   314  		Value: dbus.MakeVariant(units),
   315  	}
   316  }
   317  
   318  func getUnitName(c *configs.Cgroup) string {
   319  	// by default, we create a scope unless the user explicitly asks for a slice.
   320  	if !strings.HasSuffix(c.Name, ".slice") {
   321  		return c.ScopePrefix + "-" + c.Name + ".scope"
   322  	}
   323  	return c.Name
   324  }
   325  
   326  // This code should be in sync with getUnitName.
   327  func getUnitType(unitName string) string {
   328  	if strings.HasSuffix(unitName, ".slice") {
   329  		return "Slice"
   330  	}
   331  	return "Scope"
   332  }
   333  
   334  // isDbusError returns true if the error is a specific dbus error.
   335  func isDbusError(err error, name string) bool {
   336  	if err != nil {
   337  		var derr dbus.Error
   338  		if errors.As(err, &derr) {
   339  			return strings.Contains(derr.Name, name)
   340  		}
   341  	}
   342  	return false
   343  }
   344  
   345  // isUnitExists returns true if the error is that a systemd unit already exists.
   346  func isUnitExists(err error) bool {
   347  	return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
   348  }
   349  
// startUnit asks systemd to start a transient unit called unitName with the
// given properties, and waits (for up to 30 seconds) for the result of that
// job.
//
// If the request fails because the unit already exists, the outcome depends
// on ignoreExist: when it is true, nil is returned; otherwise the unit is
// reset (it may be a leftover failed unit) and the request is retried once.
// Any other request error is returned as is.
//
// A job result other than "done", as well as a timeout, results in an error;
// in both cases the unit is reset first, ignoring any error from the reset.
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
	// The job result is received on this channel.
	statusChan := make(chan string, 1)
	// retry is true until the single "unit exists" retry has been used up.
	retry := true

retry:
	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
		_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
		return err
	})
	if err != nil {
		if !isUnitExists(err) {
			return err
		}
		if ignoreExist {
			// TODO: remove this hack.
			// This is kubelet making sure a slice exists (see
			// https://github.com/opencontainers/runc/pull/1124).
			return nil
		}
		if retry {
			// In case a unit with the same name exists, this may
			// be a leftover failed unit. Reset it, so systemd can
			// remove it, and retry once.
			err = resetFailedUnit(cm, unitName)
			if err != nil {
				logrus.Warnf("unable to reset failed unit: %v", err)
			}
			retry = false
			goto retry
		}
		// The retry did not help: report the "unit exists" error.
		return err
	}

	timeout := time.NewTimer(30 * time.Second)
	defer timeout.Stop()

	select {
	case s := <-statusChan:
		close(statusChan)
		// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
		if s != "done" {
			// Best effort: the reset error is deliberately ignored.
			_ = resetFailedUnit(cm, unitName)
			return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
		}
	case <-timeout.C:
		// Best effort: the reset error is deliberately ignored.
		_ = resetFailedUnit(cm, unitName)
		return errors.New("Timeout waiting for systemd to create " + unitName)
	}

	return nil
}
   401  
// stopUnit asks systemd to stop unitName and, if the request was accepted,
// waits (for up to 30 seconds) for the result of that job.
//
// Only a timeout while waiting is reported as an error. An error from the
// stop request itself is not returned (the wait is merely skipped), and a job
// result other than "done" is only logged as a warning. In all non-timeout
// cases the unit is reset afterwards, ignoring any error from the reset, and
// nil is returned.
func stopUnit(cm *dbusConnManager, unitName string) error {
	// The job result is received on this channel.
	statusChan := make(chan string, 1)
	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
		_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
		return err
	})
	if err == nil {
		timeout := time.NewTimer(30 * time.Second)
		defer timeout.Stop()

		select {
		case s := <-statusChan:
			close(statusChan)
			// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
			if s != "done" {
				logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
			}
		case <-timeout.C:
			return errors.New("Timed out while waiting for systemd to remove " + unitName)
		}
	}

	// In case of a failed unit, let systemd remove it.
	_ = resetFailedUnit(cm, unitName)

	return nil
}
   429  
   430  func resetFailedUnit(cm *dbusConnManager, name string) error {
   431  	return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
   432  		return c.ResetFailedUnitContext(context.TODO(), name)
   433  	})
   434  }
   435  
   436  func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
   437  	var prop *systemdDbus.Property
   438  	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) {
   439  		prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName)
   440  		return Err
   441  	})
   442  	return prop, err
   443  }
   444  
   445  func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
   446  	return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
   447  		return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
   448  	})
   449  }
   450  
   451  func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
   452  	str := ""
   453  	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
   454  		var err error
   455  		str, err = c.GetManagerProperty(name)
   456  		return err
   457  	})
   458  	if err != nil {
   459  		return "", err
   460  	}
   461  	return strconv.Unquote(str)
   462  }
   463  
   464  func systemdVersion(cm *dbusConnManager) int {
   465  	versionOnce.Do(func() {
   466  		version = -1
   467  		verStr, err := getManagerProperty(cm, "Version")
   468  		if err == nil {
   469  			version, err = systemdVersionAtoi(verStr)
   470  		}
   471  
   472  		if err != nil {
   473  			logrus.WithError(err).Error("unable to get systemd version")
   474  		}
   475  	})
   476  
   477  	return version
   478  }
   479  
   480  func systemdVersionAtoi(verStr string) (int, error) {
   481  	// verStr should be of the form:
   482  	// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
   483  	// The result for all of the above should be 245.
   484  	// Thus, we unconditionally remove the "v" prefix
   485  	// and then match on the first integer we can grab.
   486  	re := regexp.MustCompile(`v?([0-9]+)`)
   487  	matches := re.FindStringSubmatch(verStr)
   488  	if len(matches) < 2 {
   489  		return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
   490  	}
   491  	ver, err := strconv.Atoi(matches[1])
   492  	if err != nil {
   493  		return -1, fmt.Errorf("can't parse version: %w", err)
   494  	}
   495  	return ver, nil
   496  }
   497  
   498  func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
   499  	if period != 0 {
   500  		// systemd only supports CPUQuotaPeriodUSec since v242
   501  		sdVer := systemdVersion(cm)
   502  		if sdVer >= 242 {
   503  			*properties = append(*properties,
   504  				newProp("CPUQuotaPeriodUSec", period))
   505  		} else {
   506  			logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
   507  				" (setting will still be applied to cgroupfs)", sdVer)
   508  		}
   509  	}
   510  	if quota != 0 || period != 0 {
   511  		// corresponds to USEC_INFINITY in systemd
   512  		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
   513  		if quota > 0 {
   514  			if period == 0 {
   515  				// assume the default
   516  				period = defCPUQuotaPeriod
   517  			}
   518  			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
   519  			// (integer percentage of CPU) internally.  This means that if a fractional percent of
   520  			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
   521  			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
   522  			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
   523  			if cpuQuotaPerSecUSec%10000 != 0 {
   524  				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
   525  			}
   526  		}
   527  		*properties = append(*properties,
   528  			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
   529  	}
   530  }
   531  
   532  func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
   533  	if cpus == "" && mems == "" {
   534  		return nil
   535  	}
   536  
   537  	// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
   538  	sdVer := systemdVersion(cm)
   539  	if sdVer < 244 {
   540  		logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
   541  			" (settings will still be applied to cgroupfs)", sdVer)
   542  		return nil
   543  	}
   544  
   545  	if cpus != "" {
   546  		bits, err := RangeToBits(cpus)
   547  		if err != nil {
   548  			return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
   549  				cpus, err)
   550  		}
   551  		*props = append(*props,
   552  			newProp("AllowedCPUs", bits))
   553  	}
   554  	if mems != "" {
   555  		bits, err := RangeToBits(mems)
   556  		if err != nil {
   557  			return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
   558  				mems, err)
   559  		}
   560  		*props = append(*props,
   561  			newProp("AllowedMemoryNodes", bits))
   562  	}
   563  	return nil
   564  }
   565  

View as plain text