...

Source file src/k8s.io/kubernetes/pkg/kubelet/cm/cgroup_manager_linux.go

Documentation: k8s.io/kubernetes/pkg/kubelet/cm

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cm
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"os"
    23  	"path"
    24  	"path/filepath"
    25  	"strconv"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    31  	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
    32  	"github.com/opencontainers/runc/libcontainer/cgroups/manager"
    33  	cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
    34  	libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
    35  	v1 "k8s.io/api/core/v1"
    36  	"k8s.io/klog/v2"
    37  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    38  
    39  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    40  	"k8s.io/apimachinery/pkg/util/sets"
    41  	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
    42  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    43  )
    44  
const (
	// systemdSuffix is the cgroup name suffix for systemd
	systemdSuffix string = ".slice"
	// Cgroup2MemoryMin is memory.min for cgroup v2
	Cgroup2MemoryMin string = "memory.min"
	// Cgroup2MemoryHigh is memory.high for cgroup v2
	Cgroup2MemoryHigh      string = "memory.high"
	Cgroup2MaxCpuLimit     string = "max"             // Cgroup2MaxCpuLimit is the cpu.max limit value that this file maps to and from a CPU quota of -1.
	Cgroup2MaxSwapFilename string = "memory.swap.max" // Cgroup2MaxSwapFilename is the memory.swap.max file name for cgroup v2.
)
    55  
// RootCgroupName is the empty CgroupName. Both ToSystemd and ToCgroupfs
// render it as "/".
var RootCgroupName = CgroupName([]string{})
    57  
    58  // NewCgroupName composes a new cgroup name.
    59  // Use RootCgroupName as base to start at the root.
    60  // This function does some basic check for invalid characters at the name.
    61  func NewCgroupName(base CgroupName, components ...string) CgroupName {
    62  	for _, component := range components {
    63  		// Forbit using "_" in internal names. When remapping internal
    64  		// names to systemd cgroup driver, we want to remap "-" => "_",
    65  		// so we forbid "_" so that we can always reverse the mapping.
    66  		if strings.Contains(component, "/") || strings.Contains(component, "_") {
    67  			panic(fmt.Errorf("invalid character in component [%q] of CgroupName", component))
    68  		}
    69  	}
    70  	return CgroupName(append(append([]string{}, base...), components...))
    71  }
    72  
    73  func escapeSystemdCgroupName(part string) string {
    74  	return strings.Replace(part, "-", "_", -1)
    75  }
    76  
    77  func unescapeSystemdCgroupName(part string) string {
    78  	return strings.Replace(part, "_", "-", -1)
    79  }
    80  
    81  // cgroupName.ToSystemd converts the internal cgroup name to a systemd name.
    82  // For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes
    83  // "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice"
    84  // This function always expands the systemd name into the cgroupfs form. If only
    85  // the last part is needed, use path.Base(...) on it to discard the rest.
    86  func (cgroupName CgroupName) ToSystemd() string {
    87  	if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") {
    88  		return "/"
    89  	}
    90  	newparts := []string{}
    91  	for _, part := range cgroupName {
    92  		part = escapeSystemdCgroupName(part)
    93  		newparts = append(newparts, part)
    94  	}
    95  
    96  	result, err := cgroupsystemd.ExpandSlice(strings.Join(newparts, "-") + systemdSuffix)
    97  	if err != nil {
    98  		// Should never happen...
    99  		panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err))
   100  	}
   101  	return result
   102  }
   103  
   104  func ParseSystemdToCgroupName(name string) CgroupName {
   105  	driverName := path.Base(name)
   106  	driverName = strings.TrimSuffix(driverName, systemdSuffix)
   107  	parts := strings.Split(driverName, "-")
   108  	result := []string{}
   109  	for _, part := range parts {
   110  		result = append(result, unescapeSystemdCgroupName(part))
   111  	}
   112  	return CgroupName(result)
   113  }
   114  
   115  func (cgroupName CgroupName) ToCgroupfs() string {
   116  	return "/" + path.Join(cgroupName...)
   117  }
   118  
   119  func ParseCgroupfsToCgroupName(name string) CgroupName {
   120  	components := strings.Split(strings.TrimPrefix(name, "/"), "/")
   121  	if len(components) == 1 && components[0] == "" {
   122  		components = []string{}
   123  	}
   124  	return CgroupName(components)
   125  }
   126  
   127  func IsSystemdStyleName(name string) bool {
   128  	return strings.HasSuffix(name, systemdSuffix)
   129  }
   130  
// CgroupSubsystems holds information about the mounted cgroup subsystems.
// The cgroup manager in this file reads only MountPoints.
type CgroupSubsystems struct {
	// Mounts lists the cgroup subsystem mounts.
	// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
	Mounts []libcontainercgroups.Mount

	// MountPoints maps each cgroup subsystem to its mount location.
	// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
	MountPoints map[string]string
}
   141  
// cgroupManagerImpl implements the CgroupManager interface.
// It is a stateless object which can be used to
// update, create or delete any number of cgroups.
// It relies on runc/libcontainer cgroup managers.
type cgroupManagerImpl struct {
	// subsystems holds information about all the
	// mounted cgroup subsystems on the node
	subsystems *CgroupSubsystems

	// useSystemd tells if systemd cgroup manager should be used.
	useSystemd bool
}
   154  
   155  // Make sure that cgroupManagerImpl implements the CgroupManager interface
   156  var _ CgroupManager = &cgroupManagerImpl{}
   157  
   158  // NewCgroupManager is a factory method that returns a CgroupManager
   159  func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
   160  	return &cgroupManagerImpl{
   161  		subsystems: cs,
   162  		useSystemd: cgroupDriver == "systemd",
   163  	}
   164  }
   165  
   166  // Name converts the cgroup to the driver specific value in cgroupfs form.
   167  // This always returns a valid cgroupfs path even when systemd driver is in use!
   168  func (m *cgroupManagerImpl) Name(name CgroupName) string {
   169  	if m.useSystemd {
   170  		return name.ToSystemd()
   171  	}
   172  	return name.ToCgroupfs()
   173  }
   174  
   175  // CgroupName converts the literal cgroupfs name on the host to an internal identifier.
   176  func (m *cgroupManagerImpl) CgroupName(name string) CgroupName {
   177  	if m.useSystemd {
   178  		return ParseSystemdToCgroupName(name)
   179  	}
   180  	return ParseCgroupfsToCgroupName(name)
   181  }
   182  
   183  // buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
   184  func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string {
   185  	cgroupFsAdaptedName := m.Name(name)
   186  	cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
   187  	for key, val := range m.subsystems.MountPoints {
   188  		cgroupPaths[key] = path.Join(val, cgroupFsAdaptedName)
   189  	}
   190  	return cgroupPaths
   191  }
   192  
   193  // buildCgroupUnifiedPath builds a path to the specified name.
   194  func (m *cgroupManagerImpl) buildCgroupUnifiedPath(name CgroupName) string {
   195  	cgroupFsAdaptedName := m.Name(name)
   196  	return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName)
   197  }
   198  
   199  // libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config.
   200  func (m *cgroupManagerImpl) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup {
   201  	config := &libcontainerconfigs.Cgroup{
   202  		Systemd: m.useSystemd,
   203  	}
   204  	if needResources {
   205  		config.Resources = m.toResources(in.ResourceParameters)
   206  	} else {
   207  		config.Resources = &libcontainerconfigs.Resources{}
   208  	}
   209  
   210  	if !config.Systemd {
   211  		// For fs cgroup manager, we can either set Path or Name and Parent.
   212  		// Setting Path is easier.
   213  		config.Path = in.Name.ToCgroupfs()
   214  
   215  		return config
   216  	}
   217  
   218  	// For systemd, we have to set Name and Parent, as they are needed to talk to systemd.
   219  	// Setting Path is optional as it can be deduced from Name and Parent.
   220  
   221  	// TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead.
   222  	// It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent)
   223  	// and split it appropriately, using essentially the logic below.
   224  	// This was done for cgroupfs in opencontainers/runc#497 but a counterpart
   225  	// for systemd was never introduced.
   226  	dir, base := path.Split(in.Name.ToSystemd())
   227  	if dir == "/" {
   228  		dir = "-.slice"
   229  	} else {
   230  		dir = path.Base(dir)
   231  	}
   232  	config.Parent = dir
   233  	config.Name = base
   234  
   235  	return config
   236  }
   237  
   238  // Validate checks if all subsystem cgroups already exist
   239  func (m *cgroupManagerImpl) Validate(name CgroupName) error {
   240  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   241  		cgroupPath := m.buildCgroupUnifiedPath(name)
   242  		neededControllers := getSupportedUnifiedControllers()
   243  		enabledControllers, err := readUnifiedControllers(cgroupPath)
   244  		if err != nil {
   245  			return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err)
   246  		}
   247  		difference := neededControllers.Difference(enabledControllers)
   248  		if difference.Len() > 0 {
   249  			return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(difference), ", "))
   250  		}
   251  		return nil // valid V2 cgroup
   252  	}
   253  
   254  	// Get map of all cgroup paths on the system for the particular cgroup
   255  	cgroupPaths := m.buildCgroupPaths(name)
   256  
   257  	// the presence of alternative control groups not known to runc confuses
   258  	// the kubelet existence checks.
   259  	// ideally, we would have a mechanism in runc to support Exists() logic
   260  	// scoped to the set control groups it understands.  this is being discussed
   261  	// in https://github.com/opencontainers/runc/issues/1440
   262  	// once resolved, we can remove this code.
   263  	allowlistControllers := sets.New[string]("cpu", "cpuacct", "cpuset", "memory", "systemd", "pids")
   264  
   265  	if _, ok := m.subsystems.MountPoints["hugetlb"]; ok {
   266  		allowlistControllers.Insert("hugetlb")
   267  	}
   268  	var missingPaths []string
   269  	// If even one cgroup path doesn't exist, then the cgroup doesn't exist.
   270  	for controller, path := range cgroupPaths {
   271  		// ignore mounts we don't care about
   272  		if !allowlistControllers.Has(controller) {
   273  			continue
   274  		}
   275  		if !libcontainercgroups.PathExists(path) {
   276  			missingPaths = append(missingPaths, path)
   277  		}
   278  	}
   279  
   280  	if len(missingPaths) > 0 {
   281  		return fmt.Errorf("cgroup %q has some missing paths: %v", name, strings.Join(missingPaths, ", "))
   282  	}
   283  
   284  	return nil // valid V1 cgroup
   285  }
   286  
   287  // Exists checks if all subsystem cgroups already exist
   288  func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
   289  	return m.Validate(name) == nil
   290  }
   291  
   292  // Destroy destroys the specified cgroup
   293  func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error {
   294  	start := time.Now()
   295  	defer func() {
   296  		metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start))
   297  	}()
   298  
   299  	libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, false)
   300  	manager, err := manager.New(libcontainerCgroupConfig)
   301  	if err != nil {
   302  		return err
   303  	}
   304  
   305  	// Delete cgroups using libcontainers Managers Destroy() method
   306  	if err = manager.Destroy(); err != nil {
   307  		return fmt.Errorf("unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, err)
   308  	}
   309  
   310  	return nil
   311  }
   312  
   313  // getCPUWeight converts from the range [2, 262144] to [1, 10000]
   314  func getCPUWeight(cpuShares *uint64) uint64 {
   315  	if cpuShares == nil {
   316  		return 0
   317  	}
   318  	if *cpuShares >= 262144 {
   319  		return 10000
   320  	}
   321  	return 1 + ((*cpuShares-2)*9999)/262142
   322  }
   323  
   324  // readUnifiedControllers reads the controllers available at the specified cgroup
   325  func readUnifiedControllers(path string) (sets.Set[string], error) {
   326  	controllersFileContent, err := os.ReadFile(filepath.Join(path, "cgroup.controllers"))
   327  	if err != nil {
   328  		return nil, err
   329  	}
   330  	controllers := strings.Fields(string(controllersFileContent))
   331  	return sets.New(controllers...), nil
   332  }
   333  
// availableRootControllers memoizes the controllers read from the cgroup root
// by getSupportedUnifiedControllers; availableRootControllersOnce guards that
// one-time read.
var (
	availableRootControllersOnce sync.Once
	availableRootControllers     sets.Set[string]
)
   338  
   339  // getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2
   340  func getSupportedUnifiedControllers() sets.Set[string] {
   341  	// This is the set of controllers used by the Kubelet
   342  	supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids")
   343  	// Memoize the set of controllers that are present in the root cgroup
   344  	availableRootControllersOnce.Do(func() {
   345  		var err error
   346  		availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot)
   347  		if err != nil {
   348  			panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot))
   349  		}
   350  	})
   351  	// Return the set of controllers that are supported both by the Kubelet and by the kernel
   352  	return supportedControllers.Intersection(availableRootControllers)
   353  }
   354  
   355  func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
   356  	resources := &libcontainerconfigs.Resources{
   357  		SkipDevices:     true,
   358  		SkipFreezeOnSet: true,
   359  	}
   360  	if resourceConfig == nil {
   361  		return resources
   362  	}
   363  	if resourceConfig.Memory != nil {
   364  		resources.Memory = *resourceConfig.Memory
   365  	}
   366  	if resourceConfig.CPUShares != nil {
   367  		if libcontainercgroups.IsCgroup2UnifiedMode() {
   368  			resources.CpuWeight = getCPUWeight(resourceConfig.CPUShares)
   369  		} else {
   370  			resources.CpuShares = *resourceConfig.CPUShares
   371  		}
   372  	}
   373  	if resourceConfig.CPUQuota != nil {
   374  		resources.CpuQuota = *resourceConfig.CPUQuota
   375  	}
   376  	if resourceConfig.CPUPeriod != nil {
   377  		resources.CpuPeriod = *resourceConfig.CPUPeriod
   378  	}
   379  	if resourceConfig.PidsLimit != nil {
   380  		resources.PidsLimit = *resourceConfig.PidsLimit
   381  	}
   382  
   383  	m.maybeSetHugetlb(resourceConfig, resources)
   384  
   385  	// Ideally unified is used for all the resources when running on cgroup v2.
   386  	// It doesn't make difference for the memory.max limit, but for e.g. the cpu controller
   387  	// you can specify the correct setting without relying on the conversions performed by the OCI runtime.
   388  	if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() {
   389  		resources.Unified = make(map[string]string)
   390  		for k, v := range resourceConfig.Unified {
   391  			resources.Unified[k] = v
   392  		}
   393  	}
   394  	return resources
   395  }
   396  
   397  func (m *cgroupManagerImpl) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) {
   398  	// Check if hugetlb is supported.
   399  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   400  		if !getSupportedUnifiedControllers().Has("hugetlb") {
   401  			klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
   402  			return
   403  		}
   404  	} else if _, ok := m.subsystems.MountPoints["hugetlb"]; !ok {
   405  		klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
   406  		return
   407  	}
   408  
   409  	// For each page size enumerated, set that value.
   410  	pageSizes := sets.New[string]()
   411  	for pageSize, limit := range resourceConfig.HugePageLimit {
   412  		sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize)
   413  		if err != nil {
   414  			klog.InfoS("Invalid pageSize", "err", err)
   415  			continue
   416  		}
   417  		resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
   418  			Pagesize: sizeString,
   419  			Limit:    uint64(limit),
   420  		})
   421  		pageSizes.Insert(sizeString)
   422  	}
   423  	// for each page size omitted, limit to 0
   424  	for _, pageSize := range libcontainercgroups.HugePageSizes() {
   425  		if pageSizes.Has(pageSize) {
   426  			continue
   427  		}
   428  		resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
   429  			Pagesize: pageSize,
   430  			Limit:    uint64(0),
   431  		})
   432  	}
   433  }
   434  
   435  // Update updates the cgroup with the specified Cgroup Configuration
   436  func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
   437  	start := time.Now()
   438  	defer func() {
   439  		metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start))
   440  	}()
   441  
   442  	libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
   443  	manager, err := manager.New(libcontainerCgroupConfig)
   444  	if err != nil {
   445  		return fmt.Errorf("failed to create cgroup manager: %v", err)
   446  	}
   447  	return manager.Set(libcontainerCgroupConfig.Resources)
   448  }
   449  
   450  // Create creates the specified cgroup
   451  func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
   452  	start := time.Now()
   453  	defer func() {
   454  		metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start))
   455  	}()
   456  
   457  	libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
   458  	manager, err := manager.New(libcontainerCgroupConfig)
   459  	if err != nil {
   460  		return err
   461  	}
   462  
   463  	// Apply(-1) is a hack to create the cgroup directories for each resource
   464  	// subsystem. The function [cgroups.Manager.apply()] applies cgroup
   465  	// configuration to the process with the specified pid.
   466  	// It creates cgroup files for each subsystems and writes the pid
   467  	// in the tasks file. We use the function to create all the required
   468  	// cgroup files but not attach any "real" pid to the cgroup.
   469  	if err := manager.Apply(-1); err != nil {
   470  		return err
   471  	}
   472  
   473  	// it may confuse why we call set after we do apply, but the issue is that runc
   474  	// follows a similar pattern.  it's needed to ensure cpu quota is set properly.
   475  	if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
   476  		utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
   477  	}
   478  
   479  	return nil
   480  }
   481  
   482  // Scans through all subsystems to find pids associated with specified cgroup.
   483  func (m *cgroupManagerImpl) Pids(name CgroupName) []int {
   484  	// we need the driver specific name
   485  	cgroupFsName := m.Name(name)
   486  
   487  	// Get a list of processes that we need to kill
   488  	pidsToKill := sets.New[int]()
   489  	var pids []int
   490  	for _, val := range m.subsystems.MountPoints {
   491  		dir := path.Join(val, cgroupFsName)
   492  		_, err := os.Stat(dir)
   493  		if os.IsNotExist(err) {
   494  			// The subsystem pod cgroup is already deleted
   495  			// do nothing, continue
   496  			continue
   497  		}
   498  		// Get a list of pids that are still charged to the pod's cgroup
   499  		pids, err = getCgroupProcs(dir)
   500  		if err != nil {
   501  			continue
   502  		}
   503  		pidsToKill.Insert(pids...)
   504  
   505  		// WalkFunc which is called for each file and directory in the pod cgroup dir
   506  		visitor := func(path string, info os.FileInfo, err error) error {
   507  			if err != nil {
   508  				klog.V(4).InfoS("Cgroup manager encountered error scanning cgroup path", "path", path, "err", err)
   509  				return filepath.SkipDir
   510  			}
   511  			if !info.IsDir() {
   512  				return nil
   513  			}
   514  			pids, err = getCgroupProcs(path)
   515  			if err != nil {
   516  				klog.V(4).InfoS("Cgroup manager encountered error getting procs for cgroup path", "path", path, "err", err)
   517  				return filepath.SkipDir
   518  			}
   519  			pidsToKill.Insert(pids...)
   520  			return nil
   521  		}
   522  		// Walk through the pod cgroup directory to check if
   523  		// container cgroups haven't been GCed yet. Get attached processes to
   524  		// all such unwanted containers under the pod cgroup
   525  		if err = filepath.Walk(dir, visitor); err != nil {
   526  			klog.V(4).InfoS("Cgroup manager encountered error scanning pids for directory", "path", dir, "err", err)
   527  		}
   528  	}
   529  	return sets.List(pidsToKill)
   530  }
   531  
   532  // ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value
   533  func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error {
   534  	// Set lowest possible CpuShares value for the cgroup
   535  	minimumCPUShares := uint64(MinShares)
   536  	resources := &ResourceConfig{
   537  		CPUShares: &minimumCPUShares,
   538  	}
   539  	containerConfig := &CgroupConfig{
   540  		Name:               cgroupName,
   541  		ResourceParameters: resources,
   542  	}
   543  	return m.Update(containerConfig)
   544  }
   545  
   546  // MemoryUsage returns the current memory usage of the specified cgroup,
   547  // as read from cgroupfs.
   548  func (m *cgroupManagerImpl) MemoryUsage(name CgroupName) (int64, error) {
   549  	var path, file string
   550  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   551  		path = m.buildCgroupUnifiedPath(name)
   552  		file = "memory.current"
   553  	} else {
   554  		mp, ok := m.subsystems.MountPoints["memory"]
   555  		if !ok { // should not happen
   556  			return -1, errors.New("no cgroup v1 mountpoint for memory controller found")
   557  		}
   558  		path = mp + "/" + m.Name(name)
   559  		file = "memory.usage_in_bytes"
   560  	}
   561  	val, err := fscommon.GetCgroupParamUint(path, file)
   562  	return int64(val), err
   563  }
   564  
   565  // Convert cgroup v1 cpu.shares value to cgroup v2 cpu.weight
   566  // https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
   567  func CpuSharesToCpuWeight(cpuShares uint64) uint64 {
   568  	return uint64((((cpuShares - 2) * 9999) / 262142) + 1)
   569  }
   570  
   571  // Convert cgroup v2 cpu.weight value to cgroup v1 cpu.shares
   572  // https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
   573  func CpuWeightToCpuShares(cpuWeight uint64) uint64 {
   574  	return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
   575  }
   576  
   577  func getCgroupv1CpuConfig(cgroupPath string) (*ResourceConfig, error) {
   578  	cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupPath, "cpu.cfs_quota_us")
   579  	if errQ != nil {
   580  		return nil, fmt.Errorf("failed to read CPU quota for cgroup %v: %v", cgroupPath, errQ)
   581  	}
   582  	cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64)
   583  	if errInt != nil {
   584  		return nil, fmt.Errorf("failed to convert CPU quota as integer for cgroup %v: %v", cgroupPath, errInt)
   585  	}
   586  	cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupPath, "cpu.cfs_period_us")
   587  	if errP != nil {
   588  		return nil, fmt.Errorf("failed to read CPU period for cgroup %v: %v", cgroupPath, errP)
   589  	}
   590  	cpuShares, errS := fscommon.GetCgroupParamUint(cgroupPath, "cpu.shares")
   591  	if errS != nil {
   592  		return nil, fmt.Errorf("failed to read CPU shares for cgroup %v: %v", cgroupPath, errS)
   593  	}
   594  	return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuQuota, CPUPeriod: &cpuPeriod}, nil
   595  }
   596  
   597  func getCgroupv2CpuConfig(cgroupPath string) (*ResourceConfig, error) {
   598  	var cpuLimitStr, cpuPeriodStr string
   599  	cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
   600  	if err != nil {
   601  		return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %v", cgroupPath, err)
   602  	}
   603  	numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr)
   604  	if errScan != nil || numItems != 2 {
   605  		return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %v",
   606  			cpuLimitAndPeriod, cgroupPath, errScan)
   607  	}
   608  	cpuLimit := int64(-1)
   609  	if cpuLimitStr != Cgroup2MaxCpuLimit {
   610  		cpuLimit, err = strconv.ParseInt(cpuLimitStr, 10, 64)
   611  		if err != nil {
   612  			return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %v", cgroupPath, err)
   613  		}
   614  	}
   615  	cpuPeriod, errPeriod := strconv.ParseUint(cpuPeriodStr, 10, 64)
   616  	if errPeriod != nil {
   617  		return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %v", cgroupPath, errPeriod)
   618  	}
   619  	cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
   620  	if errWeight != nil {
   621  		return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %v", cgroupPath, errWeight)
   622  	}
   623  	cpuShares := CpuWeightToCpuShares(cpuWeight)
   624  	return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuLimit, CPUPeriod: &cpuPeriod}, nil
   625  }
   626  
   627  func getCgroupCpuConfig(cgroupPath string) (*ResourceConfig, error) {
   628  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   629  		return getCgroupv2CpuConfig(cgroupPath)
   630  	} else {
   631  		return getCgroupv1CpuConfig(cgroupPath)
   632  	}
   633  }
   634  
   635  func getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
   636  	memLimitFile := "memory.limit_in_bytes"
   637  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   638  		memLimitFile = "memory.max"
   639  	}
   640  	memLimit, err := fscommon.GetCgroupParamUint(cgroupPath, memLimitFile)
   641  	if err != nil {
   642  		return nil, fmt.Errorf("failed to read %s for cgroup %v: %v", memLimitFile, cgroupPath, err)
   643  	}
   644  	mLim := int64(memLimit)
   645  	//TODO(vinaykul,InPlacePodVerticalScaling): Add memory request support
   646  	return &ResourceConfig{Memory: &mLim}, nil
   647  
   648  }
   649  
   650  // Get the resource config values applied to the cgroup for specified resource type
   651  func (m *cgroupManagerImpl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
   652  	cgroupPaths := m.buildCgroupPaths(name)
   653  	cgroupResourcePath, found := cgroupPaths[string(resource)]
   654  	if !found {
   655  		return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
   656  	}
   657  	switch resource {
   658  	case v1.ResourceCPU:
   659  		return getCgroupCpuConfig(cgroupResourcePath)
   660  	case v1.ResourceMemory:
   661  		return getCgroupMemoryConfig(cgroupResourcePath)
   662  	}
   663  	return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
   664  }
   665  
   666  func setCgroupv1CpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
   667  	var cpuQuotaStr, cpuPeriodStr, cpuSharesStr string
   668  	if resourceConfig.CPUQuota != nil {
   669  		cpuQuotaStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10)
   670  		if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_quota_us"), []byte(cpuQuotaStr), 0700); err != nil {
   671  			return fmt.Errorf("failed to write %v to %v: %v", cpuQuotaStr, cgroupPath, err)
   672  		}
   673  	}
   674  	if resourceConfig.CPUPeriod != nil {
   675  		cpuPeriodStr = strconv.FormatUint(*resourceConfig.CPUPeriod, 10)
   676  		if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_period_us"), []byte(cpuPeriodStr), 0700); err != nil {
   677  			return fmt.Errorf("failed to write %v to %v: %v", cpuPeriodStr, cgroupPath, err)
   678  		}
   679  	}
   680  	if resourceConfig.CPUShares != nil {
   681  		cpuSharesStr = strconv.FormatUint(*resourceConfig.CPUShares, 10)
   682  		if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.shares"), []byte(cpuSharesStr), 0700); err != nil {
   683  			return fmt.Errorf("failed to write %v to %v: %v", cpuSharesStr, cgroupPath, err)
   684  		}
   685  	}
   686  	return nil
   687  }
   688  
   689  func setCgroupv2CpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
   690  	if resourceConfig.CPUQuota != nil {
   691  		if resourceConfig.CPUPeriod == nil {
   692  			return fmt.Errorf("CpuPeriod must be specified in order to set CpuLimit")
   693  		}
   694  		cpuLimitStr := Cgroup2MaxCpuLimit
   695  		if *resourceConfig.CPUQuota > -1 {
   696  			cpuLimitStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10)
   697  		}
   698  		cpuPeriodStr := strconv.FormatUint(*resourceConfig.CPUPeriod, 10)
   699  		cpuMaxStr := fmt.Sprintf("%s %s", cpuLimitStr, cpuPeriodStr)
   700  		if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.max"), []byte(cpuMaxStr), 0700); err != nil {
   701  			return fmt.Errorf("failed to write %v to %v: %v", cpuMaxStr, cgroupPath, err)
   702  		}
   703  	}
   704  	if resourceConfig.CPUShares != nil {
   705  		cpuWeight := CpuSharesToCpuWeight(*resourceConfig.CPUShares)
   706  		cpuWeightStr := strconv.FormatUint(cpuWeight, 10)
   707  		if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.weight"), []byte(cpuWeightStr), 0700); err != nil {
   708  			return fmt.Errorf("failed to write %v to %v: %v", cpuWeightStr, cgroupPath, err)
   709  		}
   710  	}
   711  	return nil
   712  }
   713  
   714  func setCgroupCpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
   715  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   716  		return setCgroupv2CpuConfig(cgroupPath, resourceConfig)
   717  	} else {
   718  		return setCgroupv1CpuConfig(cgroupPath, resourceConfig)
   719  	}
   720  }
   721  
   722  func setCgroupMemoryConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
   723  	memLimitFile := "memory.limit_in_bytes"
   724  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   725  		memLimitFile = "memory.max"
   726  	}
   727  	memLimit := strconv.FormatInt(*resourceConfig.Memory, 10)
   728  	if err := os.WriteFile(filepath.Join(cgroupPath, memLimitFile), []byte(memLimit), 0700); err != nil {
   729  		return fmt.Errorf("failed to write %v to %v/%v: %v", memLimit, cgroupPath, memLimitFile, err)
   730  	}
   731  	//TODO(vinaykul,InPlacePodVerticalScaling): Add memory request support
   732  	return nil
   733  }
   734  
   735  // Set resource config for the specified resource type on the cgroup
   736  func (m *cgroupManagerImpl) SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error {
   737  	cgroupPaths := m.buildCgroupPaths(name)
   738  	cgroupResourcePath, found := cgroupPaths[string(resource)]
   739  	if !found {
   740  		return fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
   741  	}
   742  	switch resource {
   743  	case v1.ResourceCPU:
   744  		return setCgroupCpuConfig(cgroupResourcePath, resourceConfig)
   745  	case v1.ResourceMemory:
   746  		return setCgroupMemoryConfig(cgroupResourcePath, resourceConfig)
   747  	}
   748  	return nil
   749  }
   750  

View as plain text