...

Source file src/github.com/containerd/cgroups/v2/manager.go

Documentation: github.com/containerd/cgroups/v2

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package v2
    18  
    19  import (
    20  	"bufio"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"io"
    25  	"math"
    26  	"os"
    27  	"path/filepath"
    28  	"strconv"
    29  	"strings"
    30  	"syscall"
    31  	"time"
    32  
    33  	"github.com/containerd/cgroups/v2/stats"
    34  
    35  	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
    36  	"github.com/godbus/dbus/v5"
    37  	"github.com/opencontainers/runtime-spec/specs-go"
    38  	"github.com/sirupsen/logrus"
    39  	"golang.org/x/sys/unix"
    40  )
    41  
// File names and defaults used by this file:
//   - subtreeControl is the file written by writeSubtreeControl to enable or
//     disable controllers for a cgroup's children.
//   - controllersFile is the file read by RootControllers and Controllers.
//   - defaultCgroup2Path is the root used by getSystemdFullPath when building
//     the path of a systemd-managed cgroup.
//   - defaultSlice is the slice used by NewSystemd and LoadSystemd when the
//     caller passes an empty slice name.
const (
	subtreeControl     = "cgroup.subtree_control"
	controllersFile    = "cgroup.controllers"
	defaultCgroup2Path = "/sys/fs/cgroup"
	defaultSlice       = "system.slice"
)

// canDelegate controls whether NewSystemd adds the "Delegate" property to the
// transient unit. It is not assigned in this part of the file; presumably it
// is set elsewhere in the package (TODO confirm).
var (
	canDelegate bool
)
    52  
// Event carries the counters parsed from a cgroup's "memory.events" file
// (keys "low", "high", "max", "oom" and "oom_kill"); see parseMemoryEvents.
// Values of this type are delivered on the channel returned by EventChan.
type Event struct {
	Low     uint64
	High    uint64
	Max     uint64
	OOM     uint64
	OOMKill uint64
}
    60  
// Resources for a cgroups v2 unified hierarchy.
//
// Each non-nil controller field contributes its settings to Values and its
// controller name(s) to EnabledControllers. Devices is not part of Values; it
// is applied separately by setResources through setDevices.
type Resources struct {
	CPU     *CPU
	Memory  *Memory
	Pids    *Pids
	IO      *IO
	RDMA    *RDMA
	HugeTlb *HugeTlb
	// When len(Devices) is zero, devices are not controlled
	Devices []specs.LinuxDeviceCgroup
}
    72  
    73  // Values returns the raw filenames and values that
    74  // can be written to the unified hierarchy
    75  func (r *Resources) Values() (o []Value) {
    76  	if r.CPU != nil {
    77  		o = append(o, r.CPU.Values()...)
    78  	}
    79  	if r.Memory != nil {
    80  		o = append(o, r.Memory.Values()...)
    81  	}
    82  	if r.Pids != nil {
    83  		o = append(o, r.Pids.Values()...)
    84  	}
    85  	if r.IO != nil {
    86  		o = append(o, r.IO.Values()...)
    87  	}
    88  	if r.RDMA != nil {
    89  		o = append(o, r.RDMA.Values()...)
    90  	}
    91  	if r.HugeTlb != nil {
    92  		o = append(o, r.HugeTlb.Values()...)
    93  	}
    94  	return o
    95  }
    96  
    97  // EnabledControllers returns the list of all not nil resource controllers
    98  func (r *Resources) EnabledControllers() (c []string) {
    99  	if r.CPU != nil {
   100  		c = append(c, "cpu")
   101  		c = append(c, "cpuset")
   102  	}
   103  	if r.Memory != nil {
   104  		c = append(c, "memory")
   105  	}
   106  	if r.Pids != nil {
   107  		c = append(c, "pids")
   108  	}
   109  	if r.IO != nil {
   110  		c = append(c, "io")
   111  	}
   112  	if r.RDMA != nil {
   113  		c = append(c, "rdma")
   114  	}
   115  	if r.HugeTlb != nil {
   116  		c = append(c, "hugetlb")
   117  	}
   118  	return
   119  }
   120  
// Value of a cgroup setting: the name of a file inside a cgroup directory and
// the value to write into it. The write method accepts uint64, uint16, int64,
// []byte, string and CPUMax values; any other type is rejected.
type Value struct {
	filename string
	value    interface{}
}
   126  
   127  // write the value to the full, absolute path, of a unified hierarchy
   128  func (c *Value) write(path string, perm os.FileMode) error {
   129  	var data []byte
   130  	switch t := c.value.(type) {
   131  	case uint64:
   132  		data = []byte(strconv.FormatUint(t, 10))
   133  	case uint16:
   134  		data = []byte(strconv.FormatUint(uint64(t), 10))
   135  	case int64:
   136  		data = []byte(strconv.FormatInt(t, 10))
   137  	case []byte:
   138  		data = t
   139  	case string:
   140  		data = []byte(t)
   141  	case CPUMax:
   142  		data = []byte(t)
   143  	default:
   144  		return ErrInvalidFormat
   145  	}
   146  
   147  	// Retry writes on EINTR; see:
   148  	//    https://github.com/golang/go/issues/38033
   149  	for {
   150  		err := os.WriteFile(
   151  			filepath.Join(path, c.filename),
   152  			data,
   153  			perm,
   154  		)
   155  		if err == nil {
   156  			return nil
   157  		} else if !errors.Is(err, syscall.EINTR) {
   158  			return err
   159  		}
   160  	}
   161  }
   162  
   163  func writeValues(path string, values []Value) error {
   164  	for _, o := range values {
   165  		if err := o.write(path, defaultFilePerm); err != nil {
   166  			return err
   167  		}
   168  	}
   169  	return nil
   170  }
   171  
   172  func NewManager(mountpoint string, group string, resources *Resources) (*Manager, error) {
   173  	if resources == nil {
   174  		return nil, errors.New("resources reference is nil")
   175  	}
   176  	if err := VerifyGroupPath(group); err != nil {
   177  		return nil, err
   178  	}
   179  	path := filepath.Join(mountpoint, group)
   180  	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
   181  		return nil, err
   182  	}
   183  	m := Manager{
   184  		unifiedMountpoint: mountpoint,
   185  		path:              path,
   186  	}
   187  	if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
   188  		// clean up cgroup dir on failure
   189  		os.Remove(path)
   190  		return nil, err
   191  	}
   192  	if err := setResources(path, resources); err != nil {
   193  		os.Remove(path)
   194  		return nil, err
   195  	}
   196  	return &m, nil
   197  }
   198  
   199  func LoadManager(mountpoint string, group string) (*Manager, error) {
   200  	if err := VerifyGroupPath(group); err != nil {
   201  		return nil, err
   202  	}
   203  	path := filepath.Join(mountpoint, group)
   204  	return &Manager{
   205  		unifiedMountpoint: mountpoint,
   206  		path:              path,
   207  	}, nil
   208  }
   209  
// Manager operates on a single cgroup directory.
//
// unifiedMountpoint is the root of the unified hierarchy the cgroup was
// created under; it is set by NewManager, LoadManager and NewChild and left
// empty by NewSystemd and LoadSystemd. path is the full filesystem path of the
// cgroup directory.
type Manager struct {
	unifiedMountpoint string
	path              string
}
   214  
   215  func setResources(path string, resources *Resources) error {
   216  	if resources != nil {
   217  		if err := writeValues(path, resources.Values()); err != nil {
   218  			return err
   219  		}
   220  		if err := setDevices(path, resources.Devices); err != nil {
   221  			return err
   222  		}
   223  	}
   224  	return nil
   225  }
   226  
   227  func (c *Manager) RootControllers() ([]string, error) {
   228  	b, err := os.ReadFile(filepath.Join(c.unifiedMountpoint, controllersFile))
   229  	if err != nil {
   230  		return nil, err
   231  	}
   232  	return strings.Fields(string(b)), nil
   233  }
   234  
   235  func (c *Manager) Controllers() ([]string, error) {
   236  	b, err := os.ReadFile(filepath.Join(c.path, controllersFile))
   237  	if err != nil {
   238  		return nil, err
   239  	}
   240  	return strings.Fields(string(b)), nil
   241  }
   242  
// Update applies resources to this cgroup by delegating to setResources, which
// writes the controller values and then the device rules. A nil resources is a
// no-op. Unlike NewManager, it does not enable controllers on parent groups.
func (c *Manager) Update(resources *Resources) error {
	return setResources(c.path, resources)
}
   246  
// ControllerToggle selects whether ToggleControllers enables or disables the
// given controllers.
type ControllerToggle int

// Enable writes controllers with a "+" prefix and Disable with a "-" prefix
// (see writeSubtreeControl). Values start at 1, so the zero value of
// ControllerToggle is neither.
const (
	Enable ControllerToggle = iota + 1
	Disable
)
   253  
   254  func toggleFunc(controllers []string, prefix string) []string {
   255  	out := make([]string, len(controllers))
   256  	for i, c := range controllers {
   257  		out[i] = prefix + c
   258  	}
   259  	return out
   260  }
   261  
   262  func (c *Manager) ToggleControllers(controllers []string, t ControllerToggle) error {
   263  	// when c.path is like /foo/bar/baz, the following files need to be written:
   264  	// * /sys/fs/cgroup/cgroup.subtree_control
   265  	// * /sys/fs/cgroup/foo/cgroup.subtree_control
   266  	// * /sys/fs/cgroup/foo/bar/cgroup.subtree_control
   267  	// Note that /sys/fs/cgroup/foo/bar/baz/cgroup.subtree_control does not need to be written.
   268  	split := strings.Split(c.path, "/")
   269  	var lastErr error
   270  	for i := range split {
   271  		f := strings.Join(split[:i], "/")
   272  		if !strings.HasPrefix(f, c.unifiedMountpoint) || f == c.path {
   273  			continue
   274  		}
   275  		filePath := filepath.Join(f, subtreeControl)
   276  		if err := c.writeSubtreeControl(filePath, controllers, t); err != nil {
   277  			// When running as rootless, the user may face EPERM on parent groups, but it is neglible when the
   278  			// controller is already written.
   279  			// So we only return the last error.
   280  			lastErr = fmt.Errorf("failed to write subtree controllers %+v to %q: %w", controllers, filePath, err)
   281  		} else {
   282  			lastErr = nil
   283  		}
   284  	}
   285  	return lastErr
   286  }
   287  
   288  func (c *Manager) writeSubtreeControl(filePath string, controllers []string, t ControllerToggle) error {
   289  	f, err := os.OpenFile(filePath, os.O_WRONLY, 0)
   290  	if err != nil {
   291  		return err
   292  	}
   293  	defer f.Close()
   294  	switch t {
   295  	case Enable:
   296  		controllers = toggleFunc(controllers, "+")
   297  	case Disable:
   298  		controllers = toggleFunc(controllers, "-")
   299  	}
   300  	_, err = f.WriteString(strings.Join(controllers, " "))
   301  	return err
   302  }
   303  
   304  func (c *Manager) NewChild(name string, resources *Resources) (*Manager, error) {
   305  	if strings.HasPrefix(name, "/") {
   306  		return nil, errors.New("name must be relative")
   307  	}
   308  	path := filepath.Join(c.path, name)
   309  	if err := os.MkdirAll(path, defaultDirPerm); err != nil {
   310  		return nil, err
   311  	}
   312  	m := Manager{
   313  		unifiedMountpoint: c.unifiedMountpoint,
   314  		path:              path,
   315  	}
   316  	if resources != nil {
   317  		if err := m.ToggleControllers(resources.EnabledControllers(), Enable); err != nil {
   318  			// clean up cgroup dir on failure
   319  			os.Remove(path)
   320  			return nil, err
   321  		}
   322  	}
   323  	if err := setResources(path, resources); err != nil {
   324  		// clean up cgroup dir on failure
   325  		os.Remove(path)
   326  		return nil, err
   327  	}
   328  	return &m, nil
   329  }
   330  
   331  func (c *Manager) AddProc(pid uint64) error {
   332  	v := Value{
   333  		filename: cgroupProcs,
   334  		value:    pid,
   335  	}
   336  	return writeValues(c.path, []Value{v})
   337  }
   338  
   339  func (c *Manager) AddThread(tid uint64) error {
   340  	v := Value{
   341  		filename: cgroupThreads,
   342  		value:    tid,
   343  	}
   344  	return writeValues(c.path, []Value{v})
   345  }
   346  
   347  func (c *Manager) Delete() error {
   348  	// kernel prevents cgroups with running process from being removed, check the tree is empty
   349  	processes, err := c.Procs(true)
   350  	if err != nil {
   351  		return err
   352  	}
   353  	if len(processes) > 0 {
   354  		return fmt.Errorf("cgroups: unable to remove path %q: still contains running processes", c.path)
   355  	}
   356  	return remove(c.path)
   357  }
   358  
   359  func (c *Manager) Procs(recursive bool) ([]uint64, error) {
   360  	var processes []uint64
   361  	err := filepath.Walk(c.path, func(p string, info os.FileInfo, err error) error {
   362  		if err != nil {
   363  			return err
   364  		}
   365  		if !recursive && info.IsDir() {
   366  			if p == c.path {
   367  				return nil
   368  			}
   369  			return filepath.SkipDir
   370  		}
   371  		_, name := filepath.Split(p)
   372  		if name != cgroupProcs {
   373  			return nil
   374  		}
   375  		procs, err := parseCgroupProcsFile(p)
   376  		if err != nil {
   377  			return err
   378  		}
   379  		processes = append(processes, procs...)
   380  		return nil
   381  	})
   382  	return processes, err
   383  }
   384  
   385  func (c *Manager) MoveTo(destination *Manager) error {
   386  	processes, err := c.Procs(true)
   387  	if err != nil {
   388  		return err
   389  	}
   390  	for _, p := range processes {
   391  		if err := destination.AddProc(p); err != nil {
   392  			if strings.Contains(err.Error(), "no such process") {
   393  				continue
   394  			}
   395  			return err
   396  		}
   397  	}
   398  	return nil
   399  }
   400  
// singleValueFiles lists the cgroup files that Stat reads with readSingleFile,
// i.e. files whose whole content is treated as one value.
var singleValueFiles = []string{
	"pids.current",
	"pids.max",
}
   405  
// Stat collects the current metrics of this cgroup.
//
// For each of the "cpu" and "memory" controllers listed in cgroup.controllers
// it reads the "<controller>.stat" key/value file. It also reads the files in
// singleValueFiles and "memory.events". Files that do not exist are skipped;
// any other read or parse error aborts the call. Keys that are missing from
// the files read are reported as zero.
//
// The Pids, CPU, Memory, Io and Rdma sections of the result are always set;
// MemoryEvents is set only when "memory.events" yielded at least one entry.
func (c *Manager) Stat() (*stats.Metrics, error) {
	controllers, err := c.Controllers()
	if err != nil {
		return nil, err
	}
	// out accumulates the entries of cpu.stat and memory.stat together with
	// the single-value files, keyed by stat name or file name respectively.
	out := make(map[string]interface{})
	for _, controller := range controllers {
		switch controller {
		case "cpu", "memory":
			if err := readKVStatsFile(c.path, controller+".stat", out); err != nil {
				if os.IsNotExist(err) {
					continue
				}
				return nil, err
			}
		}
	}
	for _, name := range singleValueFiles {
		if err := readSingleFile(c.path, name, out); err != nil {
			if os.IsNotExist(err) {
				continue
			}
			return nil, err
		}
	}
	// memory.events is kept in its own map, separate from the stat entries.
	memoryEvents := make(map[string]interface{})
	if err := readKVStatsFile(c.path, "memory.events", memoryEvents); err != nil {
		if !os.IsNotExist(err) {
			return nil, err
		}
	}
	var metrics stats.Metrics

	metrics.Pids = &stats.PidsStat{
		Current: getPidValue("pids.current", out),
		Limit:   getPidValue("pids.max", out),
	}
	metrics.CPU = &stats.CPUStat{
		UsageUsec:     getUint64Value("usage_usec", out),
		UserUsec:      getUint64Value("user_usec", out),
		SystemUsec:    getUint64Value("system_usec", out),
		NrPeriods:     getUint64Value("nr_periods", out),
		NrThrottled:   getUint64Value("nr_throttled", out),
		ThrottledUsec: getUint64Value("throttled_usec", out),
	}
	metrics.Memory = &stats.MemoryStat{
		Anon:                  getUint64Value("anon", out),
		File:                  getUint64Value("file", out),
		KernelStack:           getUint64Value("kernel_stack", out),
		Slab:                  getUint64Value("slab", out),
		Sock:                  getUint64Value("sock", out),
		Shmem:                 getUint64Value("shmem", out),
		FileMapped:            getUint64Value("file_mapped", out),
		FileDirty:             getUint64Value("file_dirty", out),
		FileWriteback:         getUint64Value("file_writeback", out),
		AnonThp:               getUint64Value("anon_thp", out),
		InactiveAnon:          getUint64Value("inactive_anon", out),
		ActiveAnon:            getUint64Value("active_anon", out),
		InactiveFile:          getUint64Value("inactive_file", out),
		ActiveFile:            getUint64Value("active_file", out),
		Unevictable:           getUint64Value("unevictable", out),
		SlabReclaimable:       getUint64Value("slab_reclaimable", out),
		SlabUnreclaimable:     getUint64Value("slab_unreclaimable", out),
		Pgfault:               getUint64Value("pgfault", out),
		Pgmajfault:            getUint64Value("pgmajfault", out),
		WorkingsetRefault:     getUint64Value("workingset_refault", out),
		WorkingsetActivate:    getUint64Value("workingset_activate", out),
		WorkingsetNodereclaim: getUint64Value("workingset_nodereclaim", out),
		Pgrefill:              getUint64Value("pgrefill", out),
		Pgscan:                getUint64Value("pgscan", out),
		Pgsteal:               getUint64Value("pgsteal", out),
		Pgactivate:            getUint64Value("pgactivate", out),
		Pgdeactivate:          getUint64Value("pgdeactivate", out),
		Pglazyfree:            getUint64Value("pglazyfree", out),
		Pglazyfreed:           getUint64Value("pglazyfreed", out),
		ThpFaultAlloc:         getUint64Value("thp_fault_alloc", out),
		ThpCollapseAlloc:      getUint64Value("thp_collapse_alloc", out),
		Usage:                 getStatFileContentUint64(filepath.Join(c.path, "memory.current")),
		UsageLimit:            getStatFileContentUint64(filepath.Join(c.path, "memory.max")),
		SwapUsage:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.current")),
		SwapLimit:             getStatFileContentUint64(filepath.Join(c.path, "memory.swap.max")),
	}
	if len(memoryEvents) > 0 {
		metrics.MemoryEvents = &stats.MemoryEvents{
			Low:     getUint64Value("low", memoryEvents),
			High:    getUint64Value("high", memoryEvents),
			Max:     getUint64Value("max", memoryEvents),
			Oom:     getUint64Value("oom", memoryEvents),
			OomKill: getUint64Value("oom_kill", memoryEvents),
		}
	}
	metrics.Io = &stats.IOStat{Usage: readIoStats(c.path)}
	metrics.Rdma = &stats.RdmaStat{
		Current: rdmaStats(filepath.Join(c.path, "rdma.current")),
		Limit:   rdmaStats(filepath.Join(c.path, "rdma.max")),
	}
	metrics.Hugetlb = readHugeTlbStats(c.path)

	return &metrics, nil
}
   506  
   507  func getUint64Value(key string, out map[string]interface{}) uint64 {
   508  	v, ok := out[key]
   509  	if !ok {
   510  		return 0
   511  	}
   512  	switch t := v.(type) {
   513  	case uint64:
   514  		return t
   515  	}
   516  	return 0
   517  }
   518  
   519  func getPidValue(key string, out map[string]interface{}) uint64 {
   520  	v, ok := out[key]
   521  	if !ok {
   522  		return 0
   523  	}
   524  	switch t := v.(type) {
   525  	case uint64:
   526  		return t
   527  	case string:
   528  		if t == "max" {
   529  			return math.MaxUint64
   530  		}
   531  	}
   532  	return 0
   533  }
   534  
   535  func readSingleFile(path string, file string, out map[string]interface{}) error {
   536  	f, err := os.Open(filepath.Join(path, file))
   537  	if err != nil {
   538  		return err
   539  	}
   540  	defer f.Close()
   541  	data, err := io.ReadAll(f)
   542  	if err != nil {
   543  		return err
   544  	}
   545  	s := strings.TrimSpace(string(data))
   546  	v, err := parseUint(s, 10, 64)
   547  	if err != nil {
   548  		// if we cannot parse as a uint, parse as a string
   549  		out[file] = s
   550  		return nil
   551  	}
   552  	out[file] = v
   553  	return nil
   554  }
   555  
   556  func readKVStatsFile(path string, file string, out map[string]interface{}) error {
   557  	f, err := os.Open(filepath.Join(path, file))
   558  	if err != nil {
   559  		return err
   560  	}
   561  	defer f.Close()
   562  
   563  	s := bufio.NewScanner(f)
   564  	for s.Scan() {
   565  		name, value, err := parseKV(s.Text())
   566  		if err != nil {
   567  			return fmt.Errorf("error while parsing %s (line=%q): %w", filepath.Join(path, file), s.Text(), err)
   568  		}
   569  		out[name] = value
   570  	}
   571  	return s.Err()
   572  }
   573  
// Freeze requests the Frozen state for this cgroup and returns once the
// cgroup reports that state, or when writing or reading the state fails.
func (c *Manager) Freeze() error {
	return c.freeze(c.path, Frozen)
}
   577  
// Thaw requests the Thawed state for this cgroup and returns once the cgroup
// reports that state, or when writing or reading the state fails.
func (c *Manager) Thaw() error {
	return c.freeze(c.path, Thawed)
}
   581  
   582  func (c *Manager) freeze(path string, state State) error {
   583  	values := state.Values()
   584  	for {
   585  		if err := writeValues(path, values); err != nil {
   586  			return err
   587  		}
   588  		current, err := fetchState(path)
   589  		if err != nil {
   590  			return err
   591  		}
   592  		if current == state {
   593  			return nil
   594  		}
   595  		time.Sleep(1 * time.Millisecond)
   596  	}
   597  }
   598  
   599  func (c *Manager) isCgroupEmpty() bool {
   600  	// In case of any error we return true so that we exit and don't leak resources
   601  	out := make(map[string]interface{})
   602  	if err := readKVStatsFile(c.path, "cgroup.events", out); err != nil {
   603  		return true
   604  	}
   605  	if v, ok := out["populated"]; ok {
   606  		populated, ok := v.(uint64)
   607  		if !ok {
   608  			return true
   609  		}
   610  		return populated == 0
   611  	}
   612  	return true
   613  }
   614  
   615  // MemoryEventFD returns inotify file descriptor and 'memory.events' inotify watch descriptor
   616  func (c *Manager) MemoryEventFD() (int, uint32, error) {
   617  	fpath := filepath.Join(c.path, "memory.events")
   618  	fd, err := syscall.InotifyInit()
   619  	if err != nil {
   620  		return 0, 0, errors.New("failed to create inotify fd")
   621  	}
   622  	wd, err := syscall.InotifyAddWatch(fd, fpath, unix.IN_MODIFY)
   623  	if err != nil {
   624  		syscall.Close(fd)
   625  		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", fpath, err)
   626  	}
   627  	// monitor to detect process exit/cgroup deletion
   628  	evpath := filepath.Join(c.path, "cgroup.events")
   629  	if _, err = syscall.InotifyAddWatch(fd, evpath, unix.IN_MODIFY); err != nil {
   630  		syscall.Close(fd)
   631  		return 0, 0, fmt.Errorf("failed to add inotify watch for %q: %w", evpath, err)
   632  	}
   633  
   634  	return fd, uint32(wd), nil
   635  }
   636  
// EventChan starts a goroutine running waitForEvents and returns the channels
// it reports on: an unbuffered channel of memory events and an error channel
// with a buffer of one.
//
// The error channel is closed when the goroutine exits; the event channel is
// never closed, so receivers should watch the error channel to detect the end
// of the stream. Because the event channel is unbuffered, the goroutine blocks
// on each event until it is received.
func (c *Manager) EventChan() (<-chan Event, <-chan error) {
	ec := make(chan Event)
	errCh := make(chan error, 1)
	go c.waitForEvents(ec, errCh)

	return ec, errCh
}
   644  
   645  func parseMemoryEvents(out map[string]interface{}) (Event, error) {
   646  	e := Event{}
   647  	if v, ok := out["high"]; ok {
   648  		e.High, ok = v.(uint64)
   649  		if !ok {
   650  			return Event{}, fmt.Errorf("cannot convert high to uint64: %+v", v)
   651  		}
   652  	}
   653  	if v, ok := out["low"]; ok {
   654  		e.Low, ok = v.(uint64)
   655  		if !ok {
   656  			return Event{}, fmt.Errorf("cannot convert low to uint64: %+v", v)
   657  		}
   658  	}
   659  	if v, ok := out["max"]; ok {
   660  		e.Max, ok = v.(uint64)
   661  		if !ok {
   662  			return Event{}, fmt.Errorf("cannot convert max to uint64: %+v", v)
   663  		}
   664  	}
   665  	if v, ok := out["oom"]; ok {
   666  		e.OOM, ok = v.(uint64)
   667  		if !ok {
   668  			return Event{}, fmt.Errorf("cannot convert oom to uint64: %+v", v)
   669  		}
   670  	}
   671  	if v, ok := out["oom_kill"]; ok {
   672  		e.OOMKill, ok = v.(uint64)
   673  		if !ok {
   674  			return Event{}, fmt.Errorf("cannot convert oom_kill to uint64: %+v", v)
   675  		}
   676  	}
   677  	return e, nil
   678  }
   679  
// waitForEvents is the body of the goroutine started by EventChan.
//
// It obtains an inotify descriptor from MemoryEventFD and then loops: each
// time a read from the descriptor returns at least one inotify event, it
// re-reads "memory.events", parses it and sends the resulting Event on ec.
// The loop ends when a read or parse fails, when "memory.events" can no
// longer be read, or when isCgroupEmpty reports true after an event was sent.
//
// errCh is closed on return; ec is not. The inotify descriptor is closed on
// return as well.
func (c *Manager) waitForEvents(ec chan<- Event, errCh chan<- error) {
	defer close(errCh)

	fd, _, err := c.MemoryEventFD()
	if err != nil {
		errCh <- err
		return
	}
	defer syscall.Close(fd)

	for {
		// Room for ten inotify event headers per read.
		buffer := make([]byte, syscall.SizeofInotifyEvent*10)
		bytesRead, err := syscall.Read(fd, buffer)
		if err != nil {
			errCh <- err
			return
		}
		// The event payload itself is ignored; any complete event triggers a
		// fresh read of memory.events.
		if bytesRead >= syscall.SizeofInotifyEvent {
			out := make(map[string]interface{})
			if err := readKVStatsFile(c.path, "memory.events", out); err != nil {
				// When cgroup is deleted read may return -ENODEV instead of -ENOENT from open.
				// The error is reported only if the file still appears to exist.
				if _, statErr := os.Lstat(filepath.Join(c.path, "memory.events")); !os.IsNotExist(statErr) {
					errCh <- err
				}
				return
			}
			e, err := parseMemoryEvents(out)
			if err != nil {
				errCh <- err
				return
			}
			// Blocks until the consumer receives the event.
			ec <- e
			if c.isCgroupEmpty() {
				return
			}
		}
	}
}
   718  
   719  func setDevices(path string, devices []specs.LinuxDeviceCgroup) error {
   720  	if len(devices) == 0 {
   721  		return nil
   722  	}
   723  	insts, license, err := DeviceFilter(devices)
   724  	if err != nil {
   725  		return err
   726  	}
   727  	dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY|unix.O_CLOEXEC, 0600)
   728  	if err != nil {
   729  		return fmt.Errorf("cannot get dir FD for %s", path)
   730  	}
   731  	defer unix.Close(dirFD)
   732  	if _, err := LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
   733  		if !canSkipEBPFError(devices) {
   734  			return err
   735  		}
   736  	}
   737  	return nil
   738  }
   739  
   740  // getSystemdFullPath returns the full systemd path when creating a systemd slice group.
   741  // the reason this is necessary is because the "-" character has a special meaning in
   742  // systemd slice. For example, when creating a slice called "my-group-112233.slice",
   743  // systemd will create a hierarchy like this:
   744  //      /sys/fs/cgroup/my.slice/my-group.slice/my-group-112233.slice
   745  func getSystemdFullPath(slice, group string) string {
   746  	return filepath.Join(defaultCgroup2Path, dashesToPath(slice), dashesToPath(group))
   747  }
   748  
   749  // dashesToPath converts a slice name with dashes to it's corresponding systemd filesystem path.
   750  func dashesToPath(in string) string {
   751  	path := ""
   752  	if strings.HasSuffix(in, ".slice") && strings.Contains(in, "-") {
   753  		parts := strings.Split(in, "-")
   754  		for i := range parts {
   755  			s := strings.Join(parts[0:i+1], "-")
   756  			if !strings.HasSuffix(s, ".slice") {
   757  				s += ".slice"
   758  			}
   759  			path = filepath.Join(path, s)
   760  		}
   761  	} else {
   762  		path = filepath.Join(path, in)
   763  	}
   764  	return path
   765  }
   766  
   767  func NewSystemd(slice, group string, pid int, resources *Resources) (*Manager, error) {
   768  	if slice == "" {
   769  		slice = defaultSlice
   770  	}
   771  	ctx := context.TODO()
   772  	path := getSystemdFullPath(slice, group)
   773  	conn, err := systemdDbus.NewWithContext(ctx)
   774  	if err != nil {
   775  		return &Manager{}, err
   776  	}
   777  	defer conn.Close()
   778  
   779  	properties := []systemdDbus.Property{
   780  		systemdDbus.PropDescription("cgroup " + group),
   781  		newSystemdProperty("DefaultDependencies", false),
   782  		newSystemdProperty("MemoryAccounting", true),
   783  		newSystemdProperty("CPUAccounting", true),
   784  		newSystemdProperty("IOAccounting", true),
   785  	}
   786  
   787  	// if we create a slice, the parent is defined via a Wants=
   788  	if strings.HasSuffix(group, ".slice") {
   789  		properties = append(properties, systemdDbus.PropWants(defaultSlice))
   790  	} else {
   791  		// otherwise, we use Slice=
   792  		properties = append(properties, systemdDbus.PropSlice(defaultSlice))
   793  	}
   794  
   795  	// only add pid if its valid, -1 is used w/ general slice creation.
   796  	if pid != -1 {
   797  		properties = append(properties, newSystemdProperty("PIDs", []uint32{uint32(pid)}))
   798  	}
   799  
   800  	if resources.Memory != nil && resources.Memory.Min != nil && *resources.Memory.Min != 0 {
   801  		properties = append(properties,
   802  			newSystemdProperty("MemoryMin", uint64(*resources.Memory.Min)))
   803  	}
   804  
   805  	if resources.Memory != nil && resources.Memory.Max != nil && *resources.Memory.Max != 0 {
   806  		properties = append(properties,
   807  			newSystemdProperty("MemoryMax", uint64(*resources.Memory.Max)))
   808  	}
   809  
   810  	if resources.CPU != nil && resources.CPU.Weight != nil && *resources.CPU.Weight != 0 {
   811  		properties = append(properties,
   812  			newSystemdProperty("CPUWeight", *resources.CPU.Weight))
   813  	}
   814  
   815  	if resources.CPU != nil && resources.CPU.Max != "" {
   816  		quota, period := resources.CPU.Max.extractQuotaAndPeriod()
   817  		// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
   818  		// corresponds to USEC_INFINITY in systemd
   819  		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
   820  		// always setting a property value ensures we can apply a quota and remove it later
   821  		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
   822  		if quota > 0 {
   823  			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
   824  			// (integer percentage of CPU) internally.  This means that if a fractional percent of
   825  			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
   826  			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
   827  			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
   828  			if cpuQuotaPerSecUSec%10000 != 0 {
   829  				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
   830  			}
   831  		}
   832  		properties = append(properties,
   833  			newSystemdProperty("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
   834  	}
   835  
   836  	// If we can delegate, we add the property back in
   837  	if canDelegate {
   838  		properties = append(properties, newSystemdProperty("Delegate", true))
   839  	}
   840  
   841  	if resources.Pids != nil && resources.Pids.Max > 0 {
   842  		properties = append(properties,
   843  			newSystemdProperty("TasksAccounting", true),
   844  			newSystemdProperty("TasksMax", uint64(resources.Pids.Max)))
   845  	}
   846  
   847  	statusChan := make(chan string, 1)
   848  	if _, err := conn.StartTransientUnitContext(ctx, group, "replace", properties, statusChan); err == nil {
   849  		select {
   850  		case <-statusChan:
   851  		case <-time.After(time.Second):
   852  			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", group)
   853  		}
   854  	} else if !isUnitExists(err) {
   855  		return &Manager{}, err
   856  	}
   857  
   858  	return &Manager{
   859  		path: path,
   860  	}, nil
   861  }
   862  
   863  func LoadSystemd(slice, group string) (*Manager, error) {
   864  	if slice == "" {
   865  		slice = defaultSlice
   866  	}
   867  	path := getSystemdFullPath(slice, group)
   868  	return &Manager{
   869  		path: path,
   870  	}, nil
   871  }
   872  
   873  func (c *Manager) DeleteSystemd() error {
   874  	ctx := context.TODO()
   875  	conn, err := systemdDbus.NewWithContext(ctx)
   876  	if err != nil {
   877  		return err
   878  	}
   879  	defer conn.Close()
   880  	group := systemdUnitFromPath(c.path)
   881  	ch := make(chan string)
   882  	_, err = conn.StopUnitContext(ctx, group, "replace", ch)
   883  	if err != nil {
   884  		return err
   885  	}
   886  	<-ch
   887  	return nil
   888  }
   889  
// newSystemdProperty builds a systemd D-Bus property called name whose value
// is units wrapped in a D-Bus variant.
func newSystemdProperty(name string, units interface{}) systemdDbus.Property {
	return systemdDbus.Property{
		Name:  name,
		Value: dbus.MakeVariant(units),
	}
}
   896  

View as plain text