kuberuntime_container_linux.go

Documentation: k8s.io/kubernetes/pkg/kubelet/kuberuntime

     1  //go:build linux
     2  // +build linux
     3  
     4  /*
     5  Copyright 2018 The Kubernetes Authors.
     6  
     7  Licensed under the Apache License, Version 2.0 (the "License");
     8  you may not use this file except in compliance with the License.
     9  You may obtain a copy of the License at
    10  
    11      http://www.apache.org/licenses/LICENSE-2.0
    12  
    13  Unless required by applicable law or agreed to in writing, software
    14  distributed under the License is distributed on an "AS IS" BASIS,
    15  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  See the License for the specific language governing permissions and
    17  limitations under the License.
    18  */
    19  
    20  package kuberuntime
    21  
    22  import (
    23  	"errors"
    24  	"fmt"
    25  	"math"
    26  	"os"
    27  	"path/filepath"
    28  	"strconv"
    29  	"sync"
    30  	"time"
    31  
    32  	"github.com/containerd/cgroups"
    33  	cadvisorv1 "github.com/google/cadvisor/info/v1"
    34  	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    35  
    36  	v1 "k8s.io/api/core/v1"
    37  	"k8s.io/apimachinery/pkg/api/resource"
    38  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    39  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    40  	"k8s.io/klog/v2"
    41  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    42  	kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    43  	kubefeatures "k8s.io/kubernetes/pkg/features"
    44  	"k8s.io/kubernetes/pkg/kubelet/cm"
    45  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    46  	"k8s.io/kubernetes/pkg/kubelet/qos"
    47  	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
    48  )
    49  
// defaultPageSize is the value of os.Getpagesize, captured once at package
// initialization and stored as an int64. The memory.high value computed for
// MemoryQoS in this file is rounded down to a multiple of it.
var defaultPageSize = int64(os.Getpagesize())
    51  
    52  // applyPlatformSpecificContainerConfig applies platform specific configurations to runtimeapi.ContainerConfig.
    53  func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config *runtimeapi.ContainerConfig, container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID) error {
    54  	enforceMemoryQoS := false
    55  	// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
    56  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
    57  		isCgroup2UnifiedMode() {
    58  		enforceMemoryQoS = true
    59  	}
    60  	cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	config.Linux = cl
    65  
    66  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.UserNamespacesSupport) {
    67  		if cl.SecurityContext.NamespaceOptions.UsernsOptions != nil {
    68  			for _, mount := range config.Mounts {
    69  				mount.UidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Uids
    70  				mount.GidMappings = cl.SecurityContext.NamespaceOptions.UsernsOptions.Gids
    71  			}
    72  		}
    73  	}
    74  	return nil
    75  }
    76  
    77  // generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
    78  func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string, nsTarget *kubecontainer.ContainerID, enforceMemoryQoS bool) (*runtimeapi.LinuxContainerConfig, error) {
    79  	sc, err := m.determineEffectiveSecurityContext(pod, container, uid, username)
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  	lc := &runtimeapi.LinuxContainerConfig{
    84  		Resources:       m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
    85  		SecurityContext: sc,
    86  	}
    87  
    88  	if nsTarget != nil && lc.SecurityContext.NamespaceOptions.Pid == runtimeapi.NamespaceMode_CONTAINER {
    89  		lc.SecurityContext.NamespaceOptions.Pid = runtimeapi.NamespaceMode_TARGET
    90  		lc.SecurityContext.NamespaceOptions.TargetId = nsTarget.ID
    91  	}
    92  
    93  	return lc, nil
    94  }
    95  
    96  // generateLinuxContainerResources generates linux container resources config for runtime
    97  func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod, container *v1.Container, enforceMemoryQoS bool) *runtimeapi.LinuxContainerResources {
    98  	// set linux container resources
    99  	var cpuRequest *resource.Quantity
   100  	if _, cpuRequestExists := container.Resources.Requests[v1.ResourceCPU]; cpuRequestExists {
   101  		cpuRequest = container.Resources.Requests.Cpu()
   102  	}
   103  	lcr := m.calculateLinuxResources(cpuRequest, container.Resources.Limits.Cpu(), container.Resources.Limits.Memory())
   104  
   105  	lcr.OomScoreAdj = int64(qos.GetContainerOOMScoreAdjust(pod, container,
   106  		int64(m.machineInfo.MemoryCapacity)))
   107  
   108  	lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
   109  
   110  	// Configure swap for the container
   111  	m.configureContainerSwapResources(lcr, pod, container)
   112  
   113  	// Set memory.min and memory.high to enforce MemoryQoS
   114  	if enforceMemoryQoS {
   115  		unified := map[string]string{}
   116  		memoryRequest := container.Resources.Requests.Memory().Value()
   117  		memoryLimit := container.Resources.Limits.Memory().Value()
   118  		if memoryRequest != 0 {
   119  			unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
   120  		}
   121  
   122  		// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
   123  		// Here, we only check from memory perspective. Hence MemoryQoS feature is disabled on those QoS pods by not setting memory.high.
   124  		if memoryRequest != memoryLimit {
   125  			// The formula for memory.high for container cgroup is modified in Alpha stage of the feature in K8s v1.27.
   126  			// It will be set based on formula:
   127  			// `memory.high=floor[(requests.memory + memory throttling factor * (limits.memory or node allocatable memory - requests.memory))/pageSize] * pageSize`
   128  			// where default value of memory throttling factor is set to 0.9
   129  			// More info: https://git.k8s.io/enhancements/keps/sig-node/2570-memory-qos
   130  			memoryHigh := int64(0)
   131  			if memoryLimit != 0 {
   132  				memoryHigh = int64(math.Floor(
   133  					float64(memoryRequest)+
   134  						(float64(memoryLimit)-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
   135  			} else {
   136  				allocatable := m.getNodeAllocatable()
   137  				allocatableMemory, ok := allocatable[v1.ResourceMemory]
   138  				if ok && allocatableMemory.Value() > 0 {
   139  					memoryHigh = int64(math.Floor(
   140  						float64(memoryRequest)+
   141  							(float64(allocatableMemory.Value())-float64(memoryRequest))*float64(m.memoryThrottlingFactor))/float64(defaultPageSize)) * defaultPageSize
   142  				}
   143  			}
   144  			if memoryHigh != 0 && memoryHigh > memoryRequest {
   145  				unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
   146  			}
   147  		}
   148  		if len(unified) > 0 {
   149  			if lcr.Unified == nil {
   150  				lcr.Unified = unified
   151  			} else {
   152  				for k, v := range unified {
   153  					lcr.Unified[k] = v
   154  				}
   155  			}
   156  			klog.V(4).InfoS("MemoryQoS config for container", "pod", klog.KObj(pod), "containerName", container.Name, "unified", unified)
   157  		}
   158  	}
   159  
   160  	return lcr
   161  }
   162  
   163  // configureContainerSwapResources configures the swap resources for a specified (linux) container.
   164  // Swap is only configured if a swap cgroup controller is available and the NodeSwap feature gate is enabled.
   165  func (m *kubeGenericRuntimeManager) configureContainerSwapResources(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
   166  	if !swapControllerAvailable() {
   167  		klog.InfoS("No swap cgroup controller present", "swapBehavior", m.memorySwapBehavior, "pod", klog.KObj(pod), "containerName", container.Name)
   168  		return
   169  	}
   170  	swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo)
   171  	if m.memorySwapBehavior == kubelettypes.LimitedSwap {
   172  		if !isCgroup2UnifiedMode() {
   173  			swapConfigurationHelper.ConfigureNoSwap(lcr)
   174  			return
   175  		}
   176  	}
   177  
   178  	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
   179  		swapConfigurationHelper.ConfigureNoSwap(lcr)
   180  		return
   181  	}
   182  
   183  	// NOTE(ehashman): Behavior is defined in the opencontainers runtime spec:
   184  	// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
   185  	switch m.memorySwapBehavior {
   186  	case kubelettypes.NoSwap:
   187  		swapConfigurationHelper.ConfigureNoSwap(lcr)
   188  	case kubelettypes.LimitedSwap:
   189  		swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
   190  	default:
   191  		swapConfigurationHelper.ConfigureNoSwap(lcr)
   192  	}
   193  }
   194  
   195  // generateContainerResources generates platform specific (linux) container resources config for runtime
   196  func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, container *v1.Container) *runtimeapi.ContainerResources {
   197  	enforceMemoryQoS := false
   198  	// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
   199  	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
   200  		isCgroup2UnifiedMode() {
   201  		enforceMemoryQoS = true
   202  	}
   203  	return &runtimeapi.ContainerResources{
   204  		Linux: m.generateLinuxContainerResources(pod, container, enforceMemoryQoS),
   205  	}
   206  }
   207  
   208  // calculateLinuxResources will create the linuxContainerResources type based on the provided CPU and memory resource requests, limits
   209  func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit, memoryLimit *resource.Quantity) *runtimeapi.LinuxContainerResources {
   210  	resources := runtimeapi.LinuxContainerResources{}
   211  	var cpuShares int64
   212  
   213  	memLimit := memoryLimit.Value()
   214  
   215  	// If request is not specified, but limit is, we want request to default to limit.
   216  	// API server does this for new containers, but we repeat this logic in Kubelet
   217  	// for containers running on existing Kubernetes clusters.
   218  	if cpuRequest == nil && cpuLimit != nil {
   219  		cpuShares = int64(cm.MilliCPUToShares(cpuLimit.MilliValue()))
   220  	} else {
   221  		// if cpuRequest.Amount is nil, then MilliCPUToShares will return the minimal number
   222  		// of CPU shares.
   223  		cpuShares = int64(cm.MilliCPUToShares(cpuRequest.MilliValue()))
   224  	}
   225  	resources.CpuShares = cpuShares
   226  	if memLimit != 0 {
   227  		resources.MemoryLimitInBytes = memLimit
   228  	}
   229  
   230  	if m.cpuCFSQuota {
   231  		// if cpuLimit.Amount is nil, then the appropriate default value is returned
   232  		// to allow full usage of cpu resource.
   233  		cpuPeriod := int64(quotaPeriod)
   234  		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
   235  			// kubeGenericRuntimeManager.cpuCFSQuotaPeriod is provided in time.Duration,
   236  			// but we need to convert it to number of microseconds which is used by kernel.
   237  			cpuPeriod = int64(m.cpuCFSQuotaPeriod.Duration / time.Microsecond)
   238  		}
   239  		cpuQuota := milliCPUToQuota(cpuLimit.MilliValue(), cpuPeriod)
   240  		resources.CpuQuota = cpuQuota
   241  		resources.CpuPeriod = cpuPeriod
   242  	}
   243  
   244  	// runc requires cgroupv2 for unified mode
   245  	if isCgroup2UnifiedMode() {
   246  		resources.Unified = map[string]string{
   247  			// Ask the kernel to kill all processes in the container cgroup in case of OOM.
   248  			// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
   249  			// more info.
   250  			"memory.oom.group": "1",
   251  		}
   252  	}
   253  	return &resources
   254  }
   255  
   256  // GetHugepageLimitsFromResources returns limits of each hugepages from resources.
   257  func GetHugepageLimitsFromResources(resources v1.ResourceRequirements) []*runtimeapi.HugepageLimit {
   258  	var hugepageLimits []*runtimeapi.HugepageLimit
   259  
   260  	// For each page size, limit to 0.
   261  	for _, pageSize := range libcontainercgroups.HugePageSizes() {
   262  		hugepageLimits = append(hugepageLimits, &runtimeapi.HugepageLimit{
   263  			PageSize: pageSize,
   264  			Limit:    uint64(0),
   265  		})
   266  	}
   267  
   268  	requiredHugepageLimits := map[string]uint64{}
   269  	for resourceObj, amountObj := range resources.Limits {
   270  		if !v1helper.IsHugePageResourceName(resourceObj) {
   271  			continue
   272  		}
   273  
   274  		pageSize, err := v1helper.HugePageSizeFromResourceName(resourceObj)
   275  		if err != nil {
   276  			klog.InfoS("Failed to get hugepage size from resource", "object", resourceObj, "err", err)
   277  			continue
   278  		}
   279  
   280  		sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize.Value())
   281  		if err != nil {
   282  			klog.InfoS("Size is invalid", "object", resourceObj, "err", err)
   283  			continue
   284  		}
   285  		requiredHugepageLimits[sizeString] = uint64(amountObj.Value())
   286  	}
   287  
   288  	for _, hugepageLimit := range hugepageLimits {
   289  		if limit, exists := requiredHugepageLimits[hugepageLimit.PageSize]; exists {
   290  			hugepageLimit.Limit = limit
   291  		}
   292  	}
   293  
   294  	return hugepageLimits
   295  }
   296  
   297  func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *kubecontainer.ContainerResources {
   298  	var cStatusResources *kubecontainer.ContainerResources
   299  	runtimeStatusResources := statusResources.GetLinux()
   300  	if runtimeStatusResources != nil {
   301  		var cpuLimit, memLimit, cpuRequest *resource.Quantity
   302  		if runtimeStatusResources.CpuPeriod > 0 {
   303  			milliCPU := quotaToMilliCPU(runtimeStatusResources.CpuQuota, runtimeStatusResources.CpuPeriod)
   304  			if milliCPU > 0 {
   305  				cpuLimit = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
   306  			}
   307  		}
   308  		if runtimeStatusResources.CpuShares > 0 {
   309  			milliCPU := sharesToMilliCPU(runtimeStatusResources.CpuShares)
   310  			if milliCPU > 0 {
   311  				cpuRequest = resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
   312  			}
   313  		}
   314  		if runtimeStatusResources.MemoryLimitInBytes > 0 {
   315  			memLimit = resource.NewQuantity(runtimeStatusResources.MemoryLimitInBytes, resource.BinarySI)
   316  		}
   317  		if cpuLimit != nil || memLimit != nil || cpuRequest != nil {
   318  			cStatusResources = &kubecontainer.ContainerResources{
   319  				CPULimit:    cpuLimit,
   320  				CPURequest:  cpuRequest,
   321  				MemoryLimit: memLimit,
   322  			}
   323  		}
   324  	}
   325  	return cStatusResources
   326  }
   327  
   328  // Note: this function variable is being added here so it would be possible to mock
   329  // the cgroup version for unit tests by assigning a new mocked function into it. Without it,
   330  // the cgroup version would solely depend on the environment running the test.
   331  var isCgroup2UnifiedMode = func() bool {
   332  	return libcontainercgroups.IsCgroup2UnifiedMode()
   333  }
   334  
   335  var (
   336  	swapControllerAvailability     bool
   337  	swapControllerAvailabilityOnce sync.Once
   338  )
   339  
   340  // Note: this function variable is being added here so it would be possible to mock
   341  // the swap controller availability for unit tests by assigning a new function to it. Without it,
   342  // the swap controller availability would solely depend on the environment running the test.
   343  var swapControllerAvailable = func() bool {
   344  	// See https://github.com/containerd/containerd/pull/7838/
   345  	swapControllerAvailabilityOnce.Do(func() {
   346  		const warn = "Failed to detect the availability of the swap controller, assuming not available"
   347  		p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes"
   348  		if isCgroup2UnifiedMode() {
   349  			// memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup/<SELF>/memory.swap.max
   350  			_, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup")
   351  			if err != nil {
   352  				klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn)
   353  				return
   354  			}
   355  			p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max")
   356  		}
   357  		if _, err := os.Stat(p); err != nil {
   358  			if !errors.Is(err, os.ErrNotExist) {
   359  				klog.V(5).ErrorS(err, warn)
   360  			}
   361  			return
   362  		}
   363  		swapControllerAvailability = true
   364  	})
   365  	return swapControllerAvailability
   366  }
   367  
   368  type swapConfigurationHelper struct {
   369  	machineInfo cadvisorv1.MachineInfo
   370  }
   371  
   372  func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
   373  	return &swapConfigurationHelper{machineInfo: machineInfo}
   374  }
   375  
   376  func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
   377  	podQos := kubeapiqos.GetPodQOS(pod)
   378  	containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
   379  	memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
   380  
   381  	if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
   382  		m.ConfigureNoSwap(lcr)
   383  		return
   384  	}
   385  
   386  	containerMemoryRequest := container.Resources.Requests.Memory()
   387  	swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
   388  
   389  	if err != nil {
   390  		klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
   391  		m.ConfigureNoSwap(lcr)
   392  		return
   393  	}
   394  
   395  	m.configureSwap(lcr, swapLimit)
   396  }
   397  
   398  func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
   399  	if !isCgroup2UnifiedMode() {
   400  		if swapControllerAvailable() {
   401  			// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
   402  			// Some swapping is still possible.
   403  			// Note that if memory limit is 0, memory swap limit is ignored.
   404  			lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
   405  		}
   406  		return
   407  	}
   408  
   409  	m.configureSwap(lcr, 0)
   410  }
   411  
   412  func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
   413  	if !isCgroup2UnifiedMode() {
   414  		klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
   415  		return
   416  	}
   417  
   418  	if lcr.Unified == nil {
   419  		lcr.Unified = map[string]string{}
   420  	}
   421  
   422  	lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
   423  }
   424  
   425  // The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
   426  // For more info, please look at the following KEP: https://kep.k8s.io/2400
   427  func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
   428  	if nodeTotalMemory <= 0 {
   429  		return 0, fmt.Errorf("total node memory is 0")
   430  	}
   431  	if containerMemoryRequest > nodeTotalMemory {
   432  		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
   433  	}
   434  
   435  	containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
   436  	swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)
   437  
   438  	return int64(swapAllocation), nil
   439  }
   440
View as plain text