...

Source file src/k8s.io/kubernetes/pkg/kubelet/cm/qos_container_manager_linux.go

Documentation: k8s.io/kubernetes/pkg/kubelet/cm

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cm
    18  
    19  import (
    20  	"fmt"
    21  	"strconv"
    22  	"strings"
    23  	"sync"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	"k8s.io/klog/v2"
    28  
    29  	"k8s.io/apimachinery/pkg/util/wait"
    30  
    31  	units "github.com/docker/go-units"
    32  	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    33  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    34  
    35  	"k8s.io/kubernetes/pkg/api/v1/resource"
    36  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    37  	kubefeatures "k8s.io/kubernetes/pkg/features"
    38  )
    39  
const (
	// periodicQOSCgroupUpdateInterval is how often the qos cgroup manager
	// performs a periodic update of the qos level cgroup resource
	// constraints (see the wait.Until loop started in Start).
	periodicQOSCgroupUpdateInterval = 1 * time.Minute
)
    45  
// QOSContainerManager manages the top level QoS cgroups that parent
// pods of the Burstable and BestEffort QoS classes.
type QOSContainerManager interface {
	// Start creates/reconciles the top level QoS cgroups and begins a
	// periodic sync loop. The first argument supplies node allocatable
	// resources; the second lists the pods active on this node.
	Start(func() v1.ResourceList, ActivePodsFunc) error
	// GetQOSContainersInfo returns the names of the top level QoS cgroups.
	GetQOSContainersInfo() QOSContainersInfo
	// UpdateCgroups recomputes and applies the QoS level cgroup settings.
	UpdateCgroups() error
}
    51  
// qosContainerManagerImpl implements QOSContainerManager when per-QoS
// cgroups are enabled. The embedded Mutex serializes UpdateCgroups.
type qosContainerManagerImpl struct {
	sync.Mutex
	// qosContainersInfo holds the top level cgroup name for each QoS class
	// (populated in Start).
	qosContainersInfo QOSContainersInfo
	// subsystems describes the cgroup subsystems; set by the constructor,
	// not read by any method in this file.
	subsystems *CgroupSubsystems
	// cgroupManager creates/updates the QoS cgroups.
	cgroupManager CgroupManager
	// activePods returns the pods active on this node (set in Start).
	activePods ActivePodsFunc
	// getNodeAllocatable returns node allocatable resources (set in Start).
	getNodeAllocatable func() v1.ResourceList
	// cgroupRoot is the parent cgroup under which the QoS cgroups live.
	cgroupRoot CgroupName
	// qosReserved maps a resource name to the percentage of its requests
	// reserved from lower QoS tiers (used when QOSReserved is enabled).
	qosReserved map[v1.ResourceName]int64
}
    62  
    63  func NewQOSContainerManager(subsystems *CgroupSubsystems, cgroupRoot CgroupName, nodeConfig NodeConfig, cgroupManager CgroupManager) (QOSContainerManager, error) {
    64  	if !nodeConfig.CgroupsPerQOS {
    65  		return &qosContainerManagerNoop{
    66  			cgroupRoot: cgroupRoot,
    67  		}, nil
    68  	}
    69  
    70  	return &qosContainerManagerImpl{
    71  		subsystems:    subsystems,
    72  		cgroupManager: cgroupManager,
    73  		cgroupRoot:    cgroupRoot,
    74  		qosReserved:   nodeConfig.QOSReserved,
    75  	}, nil
    76  }
    77  
// GetQOSContainersInfo returns the top level QoS cgroup names recorded by
// Start. Returns the zero value if called before Start completes.
func (m *qosContainerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
	return m.qosContainersInfo
}
    81  
    82  func (m *qosContainerManagerImpl) Start(getNodeAllocatable func() v1.ResourceList, activePods ActivePodsFunc) error {
    83  	cm := m.cgroupManager
    84  	rootContainer := m.cgroupRoot
    85  	if !cm.Exists(rootContainer) {
    86  		return fmt.Errorf("root container %v doesn't exist", rootContainer)
    87  	}
    88  
    89  	// Top level for Qos containers are created only for Burstable
    90  	// and Best Effort classes
    91  	qosClasses := map[v1.PodQOSClass]CgroupName{
    92  		v1.PodQOSBurstable:  NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBurstable))),
    93  		v1.PodQOSBestEffort: NewCgroupName(rootContainer, strings.ToLower(string(v1.PodQOSBestEffort))),
    94  	}
    95  
    96  	// Create containers for both qos classes
    97  	for qosClass, containerName := range qosClasses {
    98  		resourceParameters := &ResourceConfig{}
    99  		// the BestEffort QoS class has a statically configured minShares value
   100  		if qosClass == v1.PodQOSBestEffort {
   101  			minShares := uint64(MinShares)
   102  			resourceParameters.CPUShares = &minShares
   103  		}
   104  
   105  		// containerConfig object stores the cgroup specifications
   106  		containerConfig := &CgroupConfig{
   107  			Name:               containerName,
   108  			ResourceParameters: resourceParameters,
   109  		}
   110  
   111  		// for each enumerated huge page size, the qos tiers are unbounded
   112  		m.setHugePagesUnbounded(containerConfig)
   113  
   114  		// check if it exists
   115  		if !cm.Exists(containerName) {
   116  			if err := cm.Create(containerConfig); err != nil {
   117  				return fmt.Errorf("failed to create top level %v QOS cgroup : %v", qosClass, err)
   118  			}
   119  		} else {
   120  			// to ensure we actually have the right state, we update the config on startup
   121  			if err := cm.Update(containerConfig); err != nil {
   122  				return fmt.Errorf("failed to update top level %v QOS cgroup : %v", qosClass, err)
   123  			}
   124  		}
   125  	}
   126  	// Store the top level qos container names
   127  	m.qosContainersInfo = QOSContainersInfo{
   128  		Guaranteed: rootContainer,
   129  		Burstable:  qosClasses[v1.PodQOSBurstable],
   130  		BestEffort: qosClasses[v1.PodQOSBestEffort],
   131  	}
   132  	m.getNodeAllocatable = getNodeAllocatable
   133  	m.activePods = activePods
   134  
   135  	// update qos cgroup tiers on startup and in periodic intervals
   136  	// to ensure desired state is in sync with actual state.
   137  	go wait.Until(func() {
   138  		err := m.UpdateCgroups()
   139  		if err != nil {
   140  			klog.InfoS("Failed to reserve QoS requests", "err", err)
   141  		}
   142  	}, periodicQOSCgroupUpdateInterval, wait.NeverStop)
   143  
   144  	return nil
   145  }
   146  
   147  // setHugePagesUnbounded ensures hugetlb is effectively unbounded
   148  func (m *qosContainerManagerImpl) setHugePagesUnbounded(cgroupConfig *CgroupConfig) error {
   149  	hugePageLimit := map[int64]int64{}
   150  	for _, pageSize := range libcontainercgroups.HugePageSizes() {
   151  		pageSizeBytes, err := units.RAMInBytes(pageSize)
   152  		if err != nil {
   153  			return err
   154  		}
   155  		hugePageLimit[pageSizeBytes] = int64(1 << 62)
   156  	}
   157  	cgroupConfig.ResourceParameters.HugePageLimit = hugePageLimit
   158  	return nil
   159  }
   160  
   161  func (m *qosContainerManagerImpl) setHugePagesConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
   162  	for _, v := range configs {
   163  		if err := m.setHugePagesUnbounded(v); err != nil {
   164  			return err
   165  		}
   166  	}
   167  	return nil
   168  }
   169  
   170  func (m *qosContainerManagerImpl) setCPUCgroupConfig(configs map[v1.PodQOSClass]*CgroupConfig) error {
   171  	pods := m.activePods()
   172  	burstablePodCPURequest := int64(0)
   173  	reuseReqs := make(v1.ResourceList, 4)
   174  	for i := range pods {
   175  		pod := pods[i]
   176  		qosClass := v1qos.GetPodQOS(pod)
   177  		if qosClass != v1.PodQOSBurstable {
   178  			// we only care about the burstable qos tier
   179  			continue
   180  		}
   181  		req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
   182  		if request, found := req[v1.ResourceCPU]; found {
   183  			burstablePodCPURequest += request.MilliValue()
   184  		}
   185  	}
   186  
   187  	// make sure best effort is always 2 shares
   188  	bestEffortCPUShares := uint64(MinShares)
   189  	configs[v1.PodQOSBestEffort].ResourceParameters.CPUShares = &bestEffortCPUShares
   190  
   191  	// set burstable shares based on current observe state
   192  	burstableCPUShares := MilliCPUToShares(burstablePodCPURequest)
   193  	configs[v1.PodQOSBurstable].ResourceParameters.CPUShares = &burstableCPUShares
   194  	return nil
   195  }
   196  
   197  // getQoSMemoryRequests sums and returns the memory request of all pods for
   198  // guaranteed and burstable qos classes.
   199  func (m *qosContainerManagerImpl) getQoSMemoryRequests() map[v1.PodQOSClass]int64 {
   200  	qosMemoryRequests := map[v1.PodQOSClass]int64{
   201  		v1.PodQOSGuaranteed: 0,
   202  		v1.PodQOSBurstable:  0,
   203  	}
   204  
   205  	// Sum the pod limits for pods in each QOS class
   206  	pods := m.activePods()
   207  	reuseReqs := make(v1.ResourceList, 4)
   208  	for _, pod := range pods {
   209  		podMemoryRequest := int64(0)
   210  		qosClass := v1qos.GetPodQOS(pod)
   211  		if qosClass == v1.PodQOSBestEffort {
   212  			// limits are not set for Best Effort pods
   213  			continue
   214  		}
   215  		req := resource.PodRequests(pod, resource.PodResourcesOptions{Reuse: reuseReqs})
   216  		if request, found := req[v1.ResourceMemory]; found {
   217  			podMemoryRequest += request.Value()
   218  		}
   219  		qosMemoryRequests[qosClass] += podMemoryRequest
   220  	}
   221  
   222  	return qosMemoryRequests
   223  }
   224  
// setMemoryReserve sums the memory requests of all pods in a QOS class,
// calculates QOS class memory limits, and sets those limits in the
// CgroupConfig for each QOS class.
//
// percentReserve is the percentage (0-100) of a tier's request sum that is
// withheld from the limits of lower tiers.
func (m *qosContainerManagerImpl) setMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
	qosMemoryRequests := m.getQoSMemoryRequests()

	// Node allocatable memory is the base the reservations are carved from.
	resources := m.getNodeAllocatable()
	allocatableResource, ok := resources[v1.ResourceMemory]
	if !ok {
		klog.V(2).InfoS("Allocatable memory value could not be determined, not setting QoS memory limits")
		return
	}
	allocatable := allocatableResource.Value()
	if allocatable == 0 {
		klog.V(2).InfoS("Allocatable memory reported as 0, might be in standalone mode, not setting QoS memory limits")
		return
	}

	for qos, limits := range qosMemoryRequests {
		klog.V(2).InfoS("QoS pod memory limit", "qos", qos, "limits", limits, "percentReserve", percentReserve)
	}

	// Calculate QOS memory limits:
	// burstable may use all memory not reserved for guaranteed pods;
	// best-effort may use what burstable leaves after its own reservation.
	burstableLimit := allocatable - (qosMemoryRequests[v1.PodQOSGuaranteed] * percentReserve / 100)
	bestEffortLimit := burstableLimit - (qosMemoryRequests[v1.PodQOSBurstable] * percentReserve / 100)
	configs[v1.PodQOSBurstable].ResourceParameters.Memory = &burstableLimit
	configs[v1.PodQOSBestEffort].ResourceParameters.Memory = &bestEffortLimit
}
   253  
   254  // retrySetMemoryReserve checks for any QoS cgroups over the limit
   255  // that was attempted to be set in the first Update() and adjusts
   256  // their memory limit to the usage to prevent further growth.
   257  func (m *qosContainerManagerImpl) retrySetMemoryReserve(configs map[v1.PodQOSClass]*CgroupConfig, percentReserve int64) {
   258  	// Unreclaimable memory usage may already exceeded the desired limit
   259  	// Attempt to set the limit near the current usage to put pressure
   260  	// on the cgroup and prevent further growth.
   261  	for qos, config := range configs {
   262  		usage, err := m.cgroupManager.MemoryUsage(config.Name)
   263  		if err != nil {
   264  			klog.V(2).InfoS("Failed to get resource stats", "err", err)
   265  			return
   266  		}
   267  
   268  		// Because there is no good way to determine of the original Update()
   269  		// on the memory resource was successful, we determine failure of the
   270  		// first attempt by checking if the usage is above the limit we attempt
   271  		// to set.  If it is, we assume the first attempt to set the limit failed
   272  		// and try again setting the limit to the usage.  Otherwise we leave
   273  		// the CgroupConfig as is.
   274  		if configs[qos].ResourceParameters.Memory != nil && usage > *configs[qos].ResourceParameters.Memory {
   275  			configs[qos].ResourceParameters.Memory = &usage
   276  		}
   277  	}
   278  }
   279  
   280  // setMemoryQoS sums the memory requests of all pods in the Burstable class,
   281  // and set the sum memory as the memory.min in the Unified field of CgroupConfig.
   282  func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*CgroupConfig) {
   283  	qosMemoryRequests := m.getQoSMemoryRequests()
   284  
   285  	// Calculate the memory.min:
   286  	// for burstable(/kubepods/burstable): sum of all burstable pods
   287  	// for guaranteed(/kubepods): sum of all guaranteed and burstable pods
   288  	burstableMin := qosMemoryRequests[v1.PodQOSBurstable]
   289  	guaranteedMin := qosMemoryRequests[v1.PodQOSGuaranteed] + burstableMin
   290  
   291  	if burstableMin > 0 {
   292  		if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
   293  			configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
   294  		}
   295  		configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
   296  		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
   297  	}
   298  
   299  	if guaranteedMin > 0 {
   300  		if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
   301  			configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
   302  		}
   303  		configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
   304  		klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
   305  	}
   306  }
   307  
// UpdateCgroups recomputes the resource settings for all three top level QoS
// cgroups from the currently active pods and applies them via the cgroup
// manager. It is invoked by the periodic sync loop started in Start and may
// be called concurrently; the embedded mutex serializes updates.
func (m *qosContainerManagerImpl) UpdateCgroups() error {
	m.Lock()
	defer m.Unlock()

	// Start from empty resource configs for each tier; the setters below
	// fill them in.
	qosConfigs := map[v1.PodQOSClass]*CgroupConfig{
		v1.PodQOSGuaranteed: {
			Name:               m.qosContainersInfo.Guaranteed,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBurstable: {
			Name:               m.qosContainersInfo.Burstable,
			ResourceParameters: &ResourceConfig{},
		},
		v1.PodQOSBestEffort: {
			Name:               m.qosContainersInfo.BestEffort,
			ResourceParameters: &ResourceConfig{},
		},
	}

	// update the qos level cgroup settings for cpu shares
	if err := m.setCPUCgroupConfig(qosConfigs); err != nil {
		return err
	}

	// update the qos level cgroup settings for huge pages (ensure they remain unbounded)
	if err := m.setHugePagesConfig(qosConfigs); err != nil {
		return err
	}

	// update the qos level cgroups v2 settings of memory qos if feature enabled
	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
		libcontainercgroups.IsCgroup2UnifiedMode() {
		m.setMemoryQoS(qosConfigs)
	}

	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.QOSReserved) {
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.setMemoryReserve(qosConfigs, percentReserve)
			}
		}

		// First attempt: apply all configs. A failure here is not returned;
		// it triggers the adjustment-and-retry path below instead.
		updateSuccess := true
		for _, config := range qosConfigs {
			err := m.cgroupManager.Update(config)
			if err != nil {
				updateSuccess = false
			}
		}
		if updateSuccess {
			klog.V(4).InfoS("Updated QoS cgroup configuration")
			return nil
		}

		// If the resource can adjust the ResourceConfig to increase likelihood of
		// success, call the adjustment function here.  Otherwise, the Update() will
		// be called again with the same values.
		for resource, percentReserve := range m.qosReserved {
			switch resource {
			case v1.ResourceMemory:
				m.retrySetMemoryReserve(qosConfigs, percentReserve)
			}
		}
	}

	// Final (or only) attempt: any error here is surfaced to the caller.
	for _, config := range qosConfigs {
		err := m.cgroupManager.Update(config)
		if err != nil {
			klog.ErrorS(err, "Failed to update QoS cgroup configuration")
			return err
		}
	}

	klog.V(4).InfoS("Updated QoS cgroup configuration")
	return nil
}
   385  
// qosContainerManagerNoop is used when per-QoS cgroups are disabled;
// every operation is a no-op.
type qosContainerManagerNoop struct {
	// cgroupRoot is set by NewQOSContainerManager; not read by any method
	// in this file.
	cgroupRoot CgroupName
}

// compile-time assertion that the noop type satisfies QOSContainerManager
var _ QOSContainerManager = &qosContainerManagerNoop{}
   391  
// GetQOSContainersInfo returns an empty QOSContainersInfo, since no QoS
// cgroups are managed.
func (m *qosContainerManagerNoop) GetQOSContainersInfo() QOSContainersInfo {
	return QOSContainersInfo{}
}
   395  
// Start is a no-op; it performs no setup and always succeeds.
func (m *qosContainerManagerNoop) Start(_ func() v1.ResourceList, _ ActivePodsFunc) error {
	return nil
}
   399  
// UpdateCgroups is a no-op; there are no QoS cgroups to update.
func (m *qosContainerManagerNoop) UpdateCgroups() error {
	return nil
}
   403  

View as plain text