...

Source file src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go

Documentation: github.com/opencontainers/runc/libcontainer/cgroups

     1  package cgroups
     2  
     3  import (
     4  	"bufio"
     5  	"errors"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/opencontainers/runc/libcontainer/userns"
    16  	"github.com/sirupsen/logrus"
    17  	"golang.org/x/sys/unix"
    18  )
    19  
const (
	// CgroupProcesses is the name of the per-cgroup file that lists (and
	// accepts) process IDs; it is read by readProcsFile and written by
	// WriteCgroupProc.
	CgroupProcesses = "cgroup.procs"
	// unifiedMountpoint is the path IsCgroup2UnifiedMode statfs's to decide
	// whether the cgroup v2 unified hierarchy is in use.
	unifiedMountpoint = "/sys/fs/cgroup"
	// hybridMountpoint is the path IsCgroup2HybridMode statfs's to decide
	// whether a cgroup v2 hierarchy is mounted alongside v1 (hybrid mode).
	hybridMountpoint = "/sys/fs/cgroup/unified"
)
    25  
var (
	// isUnifiedOnce guards the one-time detection done by IsCgroup2UnifiedMode.
	isUnifiedOnce sync.Once
	// isUnified caches the result of IsCgroup2UnifiedMode.
	isUnified bool
	// isHybridOnce guards the one-time detection done by IsCgroup2HybridMode.
	isHybridOnce sync.Once
	// isHybrid caches the result of IsCgroup2HybridMode.
	isHybrid bool
)
    32  
    33  // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
    34  func IsCgroup2UnifiedMode() bool {
    35  	isUnifiedOnce.Do(func() {
    36  		var st unix.Statfs_t
    37  		err := unix.Statfs(unifiedMountpoint, &st)
    38  		if err != nil {
    39  			if os.IsNotExist(err) && userns.RunningInUserNS() {
    40  				// ignore the "not found" error if running in userns
    41  				logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
    42  				isUnified = false
    43  				return
    44  			}
    45  			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
    46  		}
    47  		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
    48  	})
    49  	return isUnified
    50  }
    51  
    52  // IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
    53  func IsCgroup2HybridMode() bool {
    54  	isHybridOnce.Do(func() {
    55  		var st unix.Statfs_t
    56  		err := unix.Statfs(hybridMountpoint, &st)
    57  		if err != nil {
    58  			isHybrid = false
    59  			if !os.IsNotExist(err) {
    60  				// Report unexpected errors.
    61  				logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
    62  			}
    63  			return
    64  		}
    65  		isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
    66  	})
    67  	return isHybrid
    68  }
    69  
// Mount describes a single cgroup mount as returned by GetCgroupMounts.
type Mount struct {
	// Mountpoint is the path where the cgroup filesystem is mounted.
	Mountpoint string
	// Root is set to the same value as Mountpoint for the cgroup v2 case in
	// GetCgroupMounts.
	// NOTE(review): for v1 this is presumably the mount's root within the
	// hierarchy — confirm against getCgroupMountsV1.
	Root string
	// Subsystems lists the controllers available through this mount.
	Subsystems []string
}
    75  
    76  // GetCgroupMounts returns the mounts for the cgroup subsystems.
    77  // all indicates whether to return just the first instance or all the mounts.
    78  // This function should not be used from cgroupv2 code, as in this case
    79  // all the controllers are available under the constant unifiedMountpoint.
    80  func GetCgroupMounts(all bool) ([]Mount, error) {
    81  	if IsCgroup2UnifiedMode() {
    82  		// TODO: remove cgroupv2 case once all external users are converted
    83  		availableControllers, err := GetAllSubsystems()
    84  		if err != nil {
    85  			return nil, err
    86  		}
    87  		m := Mount{
    88  			Mountpoint: unifiedMountpoint,
    89  			Root:       unifiedMountpoint,
    90  			Subsystems: availableControllers,
    91  		}
    92  		return []Mount{m}, nil
    93  	}
    94  
    95  	return getCgroupMountsV1(all)
    96  }
    97  
    98  // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
    99  func GetAllSubsystems() ([]string, error) {
   100  	// /proc/cgroups is meaningless for v2
   101  	// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
   102  	if IsCgroup2UnifiedMode() {
   103  		// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
   104  		// - devices: implemented in kernel 4.15
   105  		// - freezer: implemented in kernel 5.2
   106  		// We assume these are always available, as it is hard to detect availability.
   107  		pseudo := []string{"devices", "freezer"}
   108  		data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  		subsystems := append(pseudo, strings.Fields(data)...)
   113  		return subsystems, nil
   114  	}
   115  	f, err := os.Open("/proc/cgroups")
   116  	if err != nil {
   117  		return nil, err
   118  	}
   119  	defer f.Close()
   120  
   121  	subsystems := []string{}
   122  
   123  	s := bufio.NewScanner(f)
   124  	for s.Scan() {
   125  		text := s.Text()
   126  		if text[0] != '#' {
   127  			parts := strings.Fields(text)
   128  			if len(parts) >= 4 && parts[3] != "0" {
   129  				subsystems = append(subsystems, parts[0])
   130  			}
   131  		}
   132  	}
   133  	if err := s.Err(); err != nil {
   134  		return nil, err
   135  	}
   136  	return subsystems, nil
   137  }
   138  
   139  func readProcsFile(dir string) ([]int, error) {
   140  	f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
   141  	if err != nil {
   142  		return nil, err
   143  	}
   144  	defer f.Close()
   145  
   146  	var (
   147  		s   = bufio.NewScanner(f)
   148  		out = []int{}
   149  	)
   150  
   151  	for s.Scan() {
   152  		if t := s.Text(); t != "" {
   153  			pid, err := strconv.Atoi(t)
   154  			if err != nil {
   155  				return nil, err
   156  			}
   157  			out = append(out, pid)
   158  		}
   159  	}
   160  	return out, s.Err()
   161  }
   162  
   163  // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
   164  // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
   165  //
   166  //	"cpu": "/user.slice/user-1000.slice"
   167  //	"pids": "/user.slice/user-1000.slice"
   168  //
   169  // etc.
   170  //
   171  // Note that for cgroup v2 unified hierarchy, there are no per-controller
   172  // cgroup paths, so the resulting map will have a single element where the key
   173  // is empty string ("") and the value is the cgroup path the <pid> is in.
   174  func ParseCgroupFile(path string) (map[string]string, error) {
   175  	f, err := os.Open(path)
   176  	if err != nil {
   177  		return nil, err
   178  	}
   179  	defer f.Close()
   180  
   181  	return parseCgroupFromReader(f)
   182  }
   183  
   184  // helper function for ParseCgroupFile to make testing easier
   185  func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
   186  	s := bufio.NewScanner(r)
   187  	cgroups := make(map[string]string)
   188  
   189  	for s.Scan() {
   190  		text := s.Text()
   191  		// from cgroups(7):
   192  		// /proc/[pid]/cgroup
   193  		// ...
   194  		// For each cgroup hierarchy ... there is one entry
   195  		// containing three colon-separated fields of the form:
   196  		//     hierarchy-ID:subsystem-list:cgroup-path
   197  		parts := strings.SplitN(text, ":", 3)
   198  		if len(parts) < 3 {
   199  			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
   200  		}
   201  
   202  		for _, subs := range strings.Split(parts[1], ",") {
   203  			cgroups[subs] = parts[2]
   204  		}
   205  	}
   206  	if err := s.Err(); err != nil {
   207  		return nil, err
   208  	}
   209  
   210  	return cgroups, nil
   211  }
   212  
   213  func PathExists(path string) bool {
   214  	if _, err := os.Stat(path); err != nil {
   215  		return false
   216  	}
   217  	return true
   218  }
   219  
   220  func EnterPid(cgroupPaths map[string]string, pid int) error {
   221  	for _, path := range cgroupPaths {
   222  		if PathExists(path) {
   223  			if err := WriteCgroupProc(path, pid); err != nil {
   224  				return err
   225  			}
   226  		}
   227  	}
   228  	return nil
   229  }
   230  
   231  func rmdir(path string) error {
   232  	err := unix.Rmdir(path)
   233  	if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
   234  		return nil
   235  	}
   236  	return &os.PathError{Op: "rmdir", Path: path, Err: err}
   237  }
   238  
   239  // RemovePath aims to remove cgroup path. It does so recursively,
   240  // by removing any subdirectories (sub-cgroups) first.
   241  func RemovePath(path string) error {
   242  	// try the fast path first
   243  	if err := rmdir(path); err == nil {
   244  		return nil
   245  	}
   246  
   247  	infos, err := os.ReadDir(path)
   248  	if err != nil {
   249  		if os.IsNotExist(err) {
   250  			err = nil
   251  		}
   252  		return err
   253  	}
   254  	for _, info := range infos {
   255  		if info.IsDir() {
   256  			// We should remove subcgroups dir first
   257  			if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
   258  				break
   259  			}
   260  		}
   261  	}
   262  	if err == nil {
   263  		err = rmdir(path)
   264  	}
   265  	return err
   266  }
   267  
   268  // RemovePaths iterates over the provided paths removing them.
   269  // We trying to remove all paths five times with increasing delay between tries.
   270  // If after all there are not removed cgroups - appropriate error will be
   271  // returned.
   272  func RemovePaths(paths map[string]string) (err error) {
   273  	const retries = 5
   274  	delay := 10 * time.Millisecond
   275  	for i := 0; i < retries; i++ {
   276  		if i != 0 {
   277  			time.Sleep(delay)
   278  			delay *= 2
   279  		}
   280  		for s, p := range paths {
   281  			if err := RemovePath(p); err != nil {
   282  				// do not log intermediate iterations
   283  				switch i {
   284  				case 0:
   285  					logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
   286  				case retries - 1:
   287  					logrus.WithError(err).Error("Failed to remove cgroup")
   288  				}
   289  			}
   290  			_, err := os.Stat(p)
   291  			// We need this strange way of checking cgroups existence because
   292  			// RemoveAll almost always returns error, even on already removed
   293  			// cgroups
   294  			if os.IsNotExist(err) {
   295  				delete(paths, s)
   296  			}
   297  		}
   298  		if len(paths) == 0 {
   299  			//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
   300  			paths = make(map[string]string)
   301  			return nil
   302  		}
   303  	}
   304  	return fmt.Errorf("Failed to remove paths: %v", paths)
   305  }
   306  
var (
	// hugePageSizes caches the list computed by HugePageSizes.
	hugePageSizes []string
	// initHPSOnce guards the one-time initialization of hugePageSizes.
	initHPSOnce sync.Once
)
   311  
   312  func HugePageSizes() []string {
   313  	initHPSOnce.Do(func() {
   314  		dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
   315  		if err != nil {
   316  			return
   317  		}
   318  		files, err := dir.Readdirnames(0)
   319  		dir.Close()
   320  		if err != nil {
   321  			return
   322  		}
   323  
   324  		hugePageSizes, err = getHugePageSizeFromFilenames(files)
   325  		if err != nil {
   326  			logrus.Warn("HugePageSizes: ", err)
   327  		}
   328  	})
   329  
   330  	return hugePageSizes
   331  }
   332  
   333  func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
   334  	pageSizes := make([]string, 0, len(fileNames))
   335  	var warn error
   336  
   337  	for _, file := range fileNames {
   338  		// example: hugepages-1048576kB
   339  		val := strings.TrimPrefix(file, "hugepages-")
   340  		if len(val) == len(file) {
   341  			// Unexpected file name: no prefix found, ignore it.
   342  			continue
   343  		}
   344  		// The suffix is always "kB" (as of Linux 5.13). If we find
   345  		// something else, produce an error but keep going.
   346  		eLen := len(val) - 2
   347  		val = strings.TrimSuffix(val, "kB")
   348  		if len(val) != eLen {
   349  			// Highly unlikely.
   350  			if warn == nil {
   351  				warn = errors.New(file + `: invalid suffix (expected "kB")`)
   352  			}
   353  			continue
   354  		}
   355  		size, err := strconv.Atoi(val)
   356  		if err != nil {
   357  			// Highly unlikely.
   358  			if warn == nil {
   359  				warn = fmt.Errorf("%s: %w", file, err)
   360  			}
   361  			continue
   362  		}
   363  		// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
   364  		// but in our case the size is in KB already.
   365  		if size >= (1 << 20) {
   366  			val = strconv.Itoa(size>>20) + "GB"
   367  		} else if size >= (1 << 10) {
   368  			val = strconv.Itoa(size>>10) + "MB"
   369  		} else {
   370  			val += "KB"
   371  		}
   372  		pageSizes = append(pageSizes, val)
   373  	}
   374  
   375  	return pageSizes, warn
   376  }
   377  
// GetPids returns all the PIDs listed in the cgroup.procs file of the cgroup
// directory dir. It is a thin wrapper around readProcsFile and returns
// whatever error that function reports.
func GetPids(dir string) ([]int, error) {
	return readProcsFile(dir)
}
   382  
   383  // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
   384  func WriteCgroupProc(dir string, pid int) error {
   385  	// Normally dir should not be empty, one case is that cgroup subsystem
   386  	// is not mounted, we will get empty dir, and we want it fail here.
   387  	if dir == "" {
   388  		return fmt.Errorf("no such directory for %s", CgroupProcesses)
   389  	}
   390  
   391  	// Dont attach any pid to the cgroup if -1 is specified as a pid
   392  	if pid == -1 {
   393  		return nil
   394  	}
   395  
   396  	file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
   397  	if err != nil {
   398  		return fmt.Errorf("failed to write %v: %w", pid, err)
   399  	}
   400  	defer file.Close()
   401  
   402  	for i := 0; i < 5; i++ {
   403  		_, err = file.WriteString(strconv.Itoa(pid))
   404  		if err == nil {
   405  			return nil
   406  		}
   407  
   408  		// EINVAL might mean that the task being added to cgroup.procs is in state
   409  		// TASK_NEW. We should attempt to do so again.
   410  		if errors.Is(err, unix.EINVAL) {
   411  			time.Sleep(30 * time.Millisecond)
   412  			continue
   413  		}
   414  
   415  		return fmt.Errorf("failed to write %v: %w", pid, err)
   416  	}
   417  	return err
   418  }
   419  
   420  // Since the OCI spec is designed for cgroup v1, in some cases
   421  // there is need to convert from the cgroup v1 configuration to cgroup v2
   422  // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
   423  // convert from [2-262144] to [1-10000]
   424  // 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
   425  func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
   426  	if cpuShares == 0 {
   427  		return 0
   428  	}
   429  	return (1 + ((cpuShares-2)*9999)/262142)
   430  }
   431  
   432  // ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
   433  // for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
   434  // is defined as memory+swap combined, while in cgroup v2 swap is a separate value.
   435  func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
   436  	// for compatibility with cgroup1 controller, set swap to unlimited in
   437  	// case the memory is set to unlimited, and swap is not explicitly set,
   438  	// treating the request as "set both memory and swap to unlimited".
   439  	if memory == -1 && memorySwap == 0 {
   440  		return -1, nil
   441  	}
   442  	if memorySwap == -1 || memorySwap == 0 {
   443  		// -1 is "max", 0 is "unset", so treat as is
   444  		return memorySwap, nil
   445  	}
   446  	// sanity checks
   447  	if memory == 0 || memory == -1 {
   448  		return 0, errors.New("unable to set swap limit without memory limit")
   449  	}
   450  	if memory < 0 {
   451  		return 0, fmt.Errorf("invalid memory value: %d", memory)
   452  	}
   453  	if memorySwap < memory {
   454  		return 0, errors.New("memory+swap limit should be >= memory limit")
   455  	}
   456  
   457  	return memorySwap - memory, nil
   458  }
   459  
   460  // Since the OCI spec is designed for cgroup v1, in some cases
   461  // there is need to convert from the cgroup v1 configuration to cgroup v2
   462  // the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
   463  // convert linearly from [10-1000] to [1-10000]
   464  func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
   465  	if blkIoWeight == 0 {
   466  		return 0
   467  	}
   468  	return 1 + (uint64(blkIoWeight)-10)*9999/990
   469  }
   470  

View as plain text