...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups/systemd

     1  package systemd
     2  
     3  import (
     4  	"errors"
     5  	"os"
     6  	"path/filepath"
     7  	"reflect"
     8  	"strings"
     9  	"sync"
    10  
    11  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    12  	"github.com/godbus/dbus/v5"
    13  	"github.com/sirupsen/logrus"
    14  
    15  	"github.com/opencontainers/runc/libcontainer/cgroups"
    16  	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
    17  	"github.com/opencontainers/runc/libcontainer/configs"
    18  )
    19  
// legacyManager carries the state shared by the cgroup v1 manager methods
// defined in this file. Instances are created by NewLegacyManager.
type legacyManager struct {
	// mu is held by Apply, Destroy, Path, GetPaths and GetStats.
	mu      sync.Mutex
	// cgroups is the cgroup configuration handed to NewLegacyManager.
	cgroups *configs.Cgroup
	// paths maps a subsystem name (as returned by subsystem.Name) to the
	// cgroup path used for that subsystem.
	paths   map[string]string
	// dbus is the connection manager passed to the systemd unit helpers
	// (startUnit, stopUnit, setUnitProperties, getUnitTypeProperty).
	dbus    *dbusConnManager
}
    26  
    27  func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
    28  	if cg.Rootless {
    29  		return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
    30  	}
    31  	if cg.Resources != nil && cg.Resources.Unified != nil {
    32  		return nil, cgroups.ErrV1NoUnified
    33  	}
    34  	if paths == nil {
    35  		var err error
    36  		paths, err = initPaths(cg)
    37  		if err != nil {
    38  			return nil, err
    39  		}
    40  	}
    41  	return &legacyManager{
    42  		cgroups: cg,
    43  		paths:   paths,
    44  		dbus:    newDbusConnManager(false),
    45  	}, nil
    46  }
    47  
// subsystem is the set of operations this file needs from each cgroup v1
// controller listed in legacySubsystems.
type subsystem interface {
	// Name returns the name of the subsystem. It is used as the key into
	// the manager's paths map.
	Name() string
	// GetStats reports the statistics of the cgroup under 'path' into 'stats'.
	GetStats(path string, stats *cgroups.Stats) error
	// Set sets cgroup resource limits from r on the cgroup under 'path'.
	Set(path string, r *configs.Resources) error
}
    56  
// errSubsystemDoesNotExist is returned by doFreeze, GetPids and GetAllPids
// when the manager's paths map has no entry for the subsystem they need.
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
    58  
// legacySubsystems lists the cgroup v1 controllers handled by legacyManager.
// initPaths, joinCgroups, GetStats and Set iterate over it in this order.
var legacySubsystems = []subsystem{
	&fs.CpusetGroup{},
	&fs.DevicesGroup{},
	&fs.MemoryGroup{},
	&fs.CpuGroup{},
	&fs.CpuacctGroup{},
	&fs.PidsGroup{},
	&fs.BlkioGroup{},
	&fs.HugetlbGroup{},
	&fs.PerfEventGroup{},
	&fs.FreezerGroup{},
	&fs.NetPrioGroup{},
	&fs.NetClsGroup{},
	// joinCgroups skips "name=systemd" and leaves it to systemd.
	&fs.NameGroup{GroupName: "name=systemd"},
	&fs.RdmaGroup{},
	&fs.NameGroup{GroupName: "misc"},
}
    76  
    77  func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
    78  	var properties []systemdDbus.Property
    79  
    80  	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  	properties = append(properties, deviceProperties...)
    85  
    86  	if r.Memory != 0 {
    87  		properties = append(properties,
    88  			newProp("MemoryLimit", uint64(r.Memory)))
    89  	}
    90  
    91  	if r.CpuShares != 0 {
    92  		properties = append(properties,
    93  			newProp("CPUShares", r.CpuShares))
    94  	}
    95  
    96  	addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
    97  
    98  	if r.BlkioWeight != 0 {
    99  		properties = append(properties,
   100  			newProp("BlockIOWeight", uint64(r.BlkioWeight)))
   101  	}
   102  
   103  	if r.PidsLimit > 0 || r.PidsLimit == -1 {
   104  		properties = append(properties,
   105  			newProp("TasksMax", uint64(r.PidsLimit)))
   106  	}
   107  
   108  	err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
   109  	if err != nil {
   110  		return nil, err
   111  	}
   112  
   113  	return properties, nil
   114  }
   115  
   116  // initPaths figures out and returns paths to cgroups.
   117  func initPaths(c *configs.Cgroup) (map[string]string, error) {
   118  	slice := "system.slice"
   119  	if c.Parent != "" {
   120  		var err error
   121  		slice, err = ExpandSlice(c.Parent)
   122  		if err != nil {
   123  			return nil, err
   124  		}
   125  	}
   126  
   127  	unit := getUnitName(c)
   128  
   129  	paths := make(map[string]string)
   130  	for _, s := range legacySubsystems {
   131  		subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
   132  		if err != nil {
   133  			// Even if it's `not found` error, we'll return err
   134  			// because devices cgroup is hard requirement for
   135  			// container security.
   136  			if s.Name() == "devices" {
   137  				return nil, err
   138  			}
   139  			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
   140  			if cgroups.IsNotFound(err) {
   141  				continue
   142  			}
   143  			return nil, err
   144  		}
   145  		paths[s.Name()] = subsystemPath
   146  	}
   147  
   148  	// If systemd is using cgroups-hybrid mode then add the slice path of
   149  	// this container to the paths so the following process executed with
   150  	// "runc exec" joins that cgroup as well.
   151  	if cgroups.IsCgroup2HybridMode() {
   152  		// "" means cgroup-hybrid path
   153  		cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
   154  		if err != nil && cgroups.IsNotFound(err) {
   155  			return nil, err
   156  		}
   157  		paths[""] = cgroupsHybridPath
   158  	}
   159  
   160  	return paths, nil
   161  }
   162  
   163  func (m *legacyManager) Apply(pid int) error {
   164  	var (
   165  		c          = m.cgroups
   166  		unitName   = getUnitName(c)
   167  		slice      = "system.slice"
   168  		properties []systemdDbus.Property
   169  	)
   170  
   171  	m.mu.Lock()
   172  	defer m.mu.Unlock()
   173  
   174  	if c.Parent != "" {
   175  		slice = c.Parent
   176  	}
   177  
   178  	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
   179  
   180  	if strings.HasSuffix(unitName, ".slice") {
   181  		// If we create a slice, the parent is defined via a Wants=.
   182  		properties = append(properties, systemdDbus.PropWants(slice))
   183  	} else {
   184  		// Otherwise it's a scope, which we put into a Slice=.
   185  		properties = append(properties, systemdDbus.PropSlice(slice))
   186  		// Assume scopes always support delegation (supported since systemd v218).
   187  		properties = append(properties, newProp("Delegate", true))
   188  	}
   189  
   190  	// only add pid if its valid, -1 is used w/ general slice creation.
   191  	if pid != -1 {
   192  		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
   193  	}
   194  
   195  	// Always enable accounting, this gets us the same behaviour as the fs implementation,
   196  	// plus the kernel has some problems with joining the memory cgroup at a later time.
   197  	properties = append(properties,
   198  		newProp("MemoryAccounting", true),
   199  		newProp("CPUAccounting", true),
   200  		newProp("BlockIOAccounting", true),
   201  		newProp("TasksAccounting", true),
   202  	)
   203  
   204  	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
   205  	properties = append(properties,
   206  		newProp("DefaultDependencies", false))
   207  
   208  	properties = append(properties, c.SystemdProps...)
   209  
   210  	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
   211  		return err
   212  	}
   213  
   214  	if err := m.joinCgroups(pid); err != nil {
   215  		return err
   216  	}
   217  
   218  	return nil
   219  }
   220  
   221  func (m *legacyManager) Destroy() error {
   222  	m.mu.Lock()
   223  	defer m.mu.Unlock()
   224  
   225  	stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
   226  
   227  	// Both on success and on error, cleanup all the cgroups
   228  	// we are aware of, as some of them were created directly
   229  	// by Apply() and are not managed by systemd.
   230  	if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil {
   231  		return err
   232  	}
   233  
   234  	return stopErr
   235  }
   236  
   237  func (m *legacyManager) Path(subsys string) string {
   238  	m.mu.Lock()
   239  	defer m.mu.Unlock()
   240  	return m.paths[subsys]
   241  }
   242  
   243  func (m *legacyManager) joinCgroups(pid int) error {
   244  	for _, sys := range legacySubsystems {
   245  		name := sys.Name()
   246  		switch name {
   247  		case "name=systemd":
   248  			// let systemd handle this
   249  		case "cpuset":
   250  			if path, ok := m.paths[name]; ok {
   251  				s := &fs.CpusetGroup{}
   252  				if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
   253  					return err
   254  				}
   255  			}
   256  		default:
   257  			if path, ok := m.paths[name]; ok {
   258  				if err := os.MkdirAll(path, 0o755); err != nil {
   259  					return err
   260  				}
   261  				if err := cgroups.WriteCgroupProc(path, pid); err != nil {
   262  					return err
   263  				}
   264  			}
   265  		}
   266  	}
   267  
   268  	return nil
   269  }
   270  
   271  func getSubsystemPath(slice, unit, subsystem string) (string, error) {
   272  	mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
   273  	if err != nil {
   274  		return "", err
   275  	}
   276  
   277  	return filepath.Join(mountpoint, slice, unit), nil
   278  }
   279  
   280  func (m *legacyManager) Freeze(state configs.FreezerState) error {
   281  	err := m.doFreeze(state)
   282  	if err == nil {
   283  		m.cgroups.Resources.Freezer = state
   284  	}
   285  	return err
   286  }
   287  
   288  // doFreeze is the same as Freeze but without
   289  // changing the m.cgroups.Resources.Frozen field.
   290  func (m *legacyManager) doFreeze(state configs.FreezerState) error {
   291  	path, ok := m.paths["freezer"]
   292  	if !ok {
   293  		return errSubsystemDoesNotExist
   294  	}
   295  	freezer := &fs.FreezerGroup{}
   296  	resources := &configs.Resources{Freezer: state}
   297  	return freezer.Set(path, resources)
   298  }
   299  
   300  func (m *legacyManager) GetPids() ([]int, error) {
   301  	path, ok := m.paths["devices"]
   302  	if !ok {
   303  		return nil, errSubsystemDoesNotExist
   304  	}
   305  	return cgroups.GetPids(path)
   306  }
   307  
   308  func (m *legacyManager) GetAllPids() ([]int, error) {
   309  	path, ok := m.paths["devices"]
   310  	if !ok {
   311  		return nil, errSubsystemDoesNotExist
   312  	}
   313  	return cgroups.GetAllPids(path)
   314  }
   315  
   316  func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
   317  	m.mu.Lock()
   318  	defer m.mu.Unlock()
   319  	stats := cgroups.NewStats()
   320  	for _, sys := range legacySubsystems {
   321  		path := m.paths[sys.Name()]
   322  		if path == "" {
   323  			continue
   324  		}
   325  		if err := sys.GetStats(path, stats); err != nil {
   326  			return nil, err
   327  		}
   328  	}
   329  
   330  	return stats, nil
   331  }
   332  
   333  // freezeBeforeSet answers whether there is a need to freeze the cgroup before
   334  // applying its systemd unit properties, and thaw after, while avoiding
   335  // unnecessary freezer state changes.
   336  //
   337  // The reason why we have to freeze is that systemd's application of device
   338  // rules is done disruptively, resulting in spurious errors to common devices
   339  // (unlike our fs driver, they will happily write deny-all rules to running
   340  // containers). So we have to freeze the container to avoid the container get
   341  // an occasional "permission denied" error.
   342  func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
   343  	// Special case for SkipDevices, as used by Kubernetes to create pod
   344  	// cgroups with allow-all device policy).
   345  	if r.SkipDevices {
   346  		if r.SkipFreezeOnSet {
   347  			// Both needsFreeze and needsThaw are false.
   348  			return
   349  		}
   350  
   351  		// No need to freeze if SkipDevices is set, and either
   352  		// (1) systemd unit does not (yet) exist, or
   353  		// (2) it has DevicePolicy=auto and empty DeviceAllow list.
   354  		//
   355  		// Interestingly, (1) and (2) are the same here because
   356  		// a non-existent unit returns default properties,
   357  		// and settings in (2) are the defaults.
   358  		//
   359  		// Do not return errors from getUnitTypeProperty, as they alone
   360  		// should not prevent Set from working.
   361  
   362  		unitType := getUnitType(unitName)
   363  
   364  		devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
   365  		if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
   366  			devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
   367  			if e == nil {
   368  				if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
   369  					needsFreeze = false
   370  					needsThaw = false
   371  					return
   372  				}
   373  			}
   374  		}
   375  	}
   376  
   377  	needsFreeze = true
   378  	needsThaw = true
   379  
   380  	// Check the current freezer state.
   381  	freezerState, err := m.GetFreezerState()
   382  	if err != nil {
   383  		return
   384  	}
   385  	if freezerState == configs.Frozen {
   386  		// Already frozen, and should stay frozen.
   387  		needsFreeze = false
   388  		needsThaw = false
   389  	}
   390  
   391  	if r.Freezer == configs.Frozen {
   392  		// Will be frozen anyway -- no need to thaw.
   393  		needsThaw = false
   394  	}
   395  	return
   396  }
   397  
   398  func (m *legacyManager) Set(r *configs.Resources) error {
   399  	if r == nil {
   400  		return nil
   401  	}
   402  	if r.Unified != nil {
   403  		return cgroups.ErrV1NoUnified
   404  	}
   405  	properties, err := genV1ResourcesProperties(r, m.dbus)
   406  	if err != nil {
   407  		return err
   408  	}
   409  
   410  	unitName := getUnitName(m.cgroups)
   411  	needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r)
   412  	if err != nil {
   413  		return err
   414  	}
   415  
   416  	if needsFreeze {
   417  		if err := m.doFreeze(configs.Frozen); err != nil {
   418  			// If freezer cgroup isn't supported, we just warn about it.
   419  			logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
   420  			// skip update the cgroup while frozen failed. #3803
   421  			if !errors.Is(err, errSubsystemDoesNotExist) {
   422  				if needsThaw {
   423  					if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
   424  						logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
   425  					}
   426  				}
   427  				return err
   428  			}
   429  		}
   430  	}
   431  	setErr := setUnitProperties(m.dbus, unitName, properties...)
   432  	if needsThaw {
   433  		if err := m.doFreeze(configs.Thawed); err != nil {
   434  			logrus.Infof("thaw container after SetUnitProperties failed: %v", err)
   435  		}
   436  	}
   437  	if setErr != nil {
   438  		return setErr
   439  	}
   440  
   441  	for _, sys := range legacySubsystems {
   442  		// Get the subsystem path, but don't error out for not found cgroups.
   443  		path, ok := m.paths[sys.Name()]
   444  		if !ok {
   445  			continue
   446  		}
   447  		if err := sys.Set(path, r); err != nil {
   448  			return err
   449  		}
   450  	}
   451  
   452  	return nil
   453  }
   454  
   455  func (m *legacyManager) GetPaths() map[string]string {
   456  	m.mu.Lock()
   457  	defer m.mu.Unlock()
   458  	return m.paths
   459  }
   460  
// GetCgroups returns the cgroup configuration the manager was created with.
// The returned error is always nil.
func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
	return m.cgroups, nil
}
   464  
   465  func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
   466  	path, ok := m.paths["freezer"]
   467  	if !ok {
   468  		return configs.Undefined, nil
   469  	}
   470  	freezer := &fs.FreezerGroup{}
   471  	return freezer.GetState(path)
   472  }
   473  
   474  func (m *legacyManager) Exists() bool {
   475  	return cgroups.PathExists(m.Path("devices"))
   476  }
   477  
   478  func (m *legacyManager) OOMKillCount() (uint64, error) {
   479  	return fs.OOMKillCount(m.Path("memory"))
   480  }
   481  

View as plain text