
Source file src/k8s.io/kubernetes/pkg/kubelet/cm/helpers_linux.go

Documentation: k8s.io/kubernetes/pkg/kubelet/cm

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cm
    18  
    19  import (
    20  	"bufio"
    21  	"fmt"
    22  	"os"
    23  	"path/filepath"
    24  	"strconv"
    25  
    26  	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/types"
    29  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    30  
    31  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    32  	"k8s.io/kubernetes/pkg/api/v1/resource"
    33  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    34  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    35  	kubefeatures "k8s.io/kubernetes/pkg/features"
    36  	"k8s.io/kubernetes/pkg/kubelet/cm/util"
    37  )
    38  
    39  const (
    40  	// These limits are defined in the kernel:
    41  	// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
    42  	MinShares = 2
    43  	MaxShares = 262144
    44  
    45  	SharesPerCPU  = 1024
    46  	MilliCPUToCPU = 1000
    47  
    48  	// 100000 microseconds is equivalent to 100ms
    49  	QuotaPeriod = 100000
    50  	// 1000 microseconds is equivalent to 1ms
    51  	// defined here:
    52  	// https://github.com/torvalds/linux/blob/cac03ac368fabff0122853de2422d4e17a32de08/kernel/sched/core.c#L10546
    53  	MinQuotaPeriod = 1000
    54  )
    55  
    56  // MilliCPUToQuota converts milliCPU to a CFS quota value for the given CFS period.
    57  // The period parameter and the resulting quota are expressed in microseconds.
    58  func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
    59  	// CFS quota is measured in two values:
    60  	//  - cfs_period_us=100ms (the window of time over which usage is measured, given by period)
    61  	//  - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
    62  	// so in the above example, you are limited to 20% of a single CPU
    63  	// for multi-cpu environments, you just scale equivalent amounts
    64  	// see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details
    65  
    66  	if milliCPU == 0 {
    67  		return
    68  	}
    69  
    70  	if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
    71  		period = QuotaPeriod
    72  	}
    73  
    74  	// we then convert your milliCPU to a value normalized over a period
    75  	quota = (milliCPU * period) / MilliCPUToCPU
    76  
    77  	// quota needs to be a minimum of 1ms.
    78  	if quota < MinQuotaPeriod {
    79  		quota = MinQuotaPeriod
    80  	}
    81  	return
    82  }
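
// Worked example (derived from the arithmetic above): with the default 100ms
// period, a limit of 500 milliCPU yields quota = (500 * 100000) / 1000 = 50000
// microseconds, i.e. half of one CPU per period; a very small limit such as
// 1 milliCPU would compute to 100 microseconds and is rounded up to
// MinQuotaPeriod (1000 microseconds).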
    83  
    84  // MilliCPUToShares converts the milliCPU to CFS shares.
    85  func MilliCPUToShares(milliCPU int64) uint64 {
    86  	if milliCPU == 0 {
    87  		// Docker converts zero milliCPU to unset, which maps to the kernel
    88  		// default of 1024 shares. Return MinShares (2) here instead so that
    89  		// zero milliCPU maps to the kernel minimum.
    90  		return MinShares
    91  	}
    92  	// Conceptually (milliCPU / MilliCPUToCPU) * SharesPerCPU, but factored to improve rounding.
    93  	shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
    94  	if shares < MinShares {
    95  		return MinShares
    96  	}
    97  	if shares > MaxShares {
    98  		return MaxShares
    99  	}
   100  	return uint64(shares)
   101  }
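
// Worked examples: 2000 milliCPU maps to (2000 * 1024) / 1000 = 2048 shares,
// 250 milliCPU maps to 256 shares, and 1 milliCPU computes to 1 and is clamped
// up to MinShares (2); anything above 256 CPUs clamps to MaxShares (262144).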
   102  
   103  // HugePageLimits converts the API representation to a map
   104  // from huge page size (in bytes) to huge page limit (in bytes).
   105  func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
   106  	hugePageLimits := map[int64]int64{}
   107  	for k, v := range resourceList {
   108  		if v1helper.IsHugePageResourceName(k) {
   109  			pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
   110  			if value, exists := hugePageLimits[pageSize.Value()]; exists {
   111  				hugePageLimits[pageSize.Value()] = value + v.Value()
   112  			} else {
   113  				hugePageLimits[pageSize.Value()] = v.Value()
   114  			}
   115  		}
   116  	}
   117  	return hugePageLimits
   118  }
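
// Illustrative example: a ResourceList containing "hugepages-2Mi": "4Mi" and
// "hugepages-1Gi": "2Gi" produces map[int64]int64{2097152: 4194304, 1073741824: 2147483648};
// non-hugepage resources in the list are ignored, and entries with the same
// page size are summed.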
   119  
   120  // ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
   121  func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
   122  	inPlacePodVerticalScalingEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.InPlacePodVerticalScaling)
   123  	// sum requests and limits.
   124  	reqs := resource.PodRequests(pod, resource.PodResourcesOptions{
   125  		InPlacePodVerticalScalingEnabled: inPlacePodVerticalScalingEnabled,
   126  	})
    127  	// track whether limits were declared for each resource.
   128  	memoryLimitsDeclared := true
   129  	cpuLimitsDeclared := true
   130  
   131  	limits := resource.PodLimits(pod, resource.PodResourcesOptions{
   132  		InPlacePodVerticalScalingEnabled: inPlacePodVerticalScalingEnabled,
   133  		ContainerFn: func(res v1.ResourceList, containerType podutil.ContainerType) {
   134  			if res.Cpu().IsZero() {
   135  				cpuLimitsDeclared = false
   136  			}
   137  			if res.Memory().IsZero() {
   138  				memoryLimitsDeclared = false
   139  			}
   140  		},
   141  	})
   142  	// map hugepage pagesize (bytes) to limits (bytes)
   143  	hugePageLimits := HugePageLimits(reqs)
   144  
   145  	cpuRequests := int64(0)
   146  	cpuLimits := int64(0)
   147  	memoryLimits := int64(0)
   148  	if request, found := reqs[v1.ResourceCPU]; found {
   149  		cpuRequests = request.MilliValue()
   150  	}
   151  	if limit, found := limits[v1.ResourceCPU]; found {
   152  		cpuLimits = limit.MilliValue()
   153  	}
   154  	if limit, found := limits[v1.ResourceMemory]; found {
   155  		memoryLimits = limit.Value()
   156  	}
   157  
   158  	// convert to CFS values
   159  	cpuShares := MilliCPUToShares(cpuRequests)
   160  	cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))
   161  
   162  	// quota is not capped when cfs quota is disabled
   163  	if !enforceCPULimits {
   164  		cpuQuota = int64(-1)
   165  	}
   166  
   167  	// determine the qos class
   168  	qosClass := v1qos.GetPodQOS(pod)
   169  
   170  	// build the result
   171  	result := &ResourceConfig{}
   172  	if qosClass == v1.PodQOSGuaranteed {
   173  		result.CPUShares = &cpuShares
   174  		result.CPUQuota = &cpuQuota
   175  		result.CPUPeriod = &cpuPeriod
   176  		result.Memory = &memoryLimits
   177  	} else if qosClass == v1.PodQOSBurstable {
   178  		result.CPUShares = &cpuShares
   179  		if cpuLimitsDeclared {
   180  			result.CPUQuota = &cpuQuota
   181  			result.CPUPeriod = &cpuPeriod
   182  		}
   183  		if memoryLimitsDeclared {
   184  			result.Memory = &memoryLimits
   185  		}
   186  	} else {
   187  		shares := uint64(MinShares)
   188  		result.CPUShares = &shares
   189  	}
   190  	result.HugePageLimit = hugePageLimits
   191  
   192  	if enforceMemoryQoS {
   193  		memoryMin := int64(0)
   194  		if request, found := reqs[v1.ResourceMemory]; found {
   195  			memoryMin = request.Value()
   196  		}
   197  		if memoryMin > 0 {
   198  			result.Unified = map[string]string{
   199  				Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
   200  			}
   201  		}
   202  	}
   203  
   204  	return result
   205  }
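
// A rough usage sketch. The quantity parsing below assumes the
// k8s.io/apimachinery/pkg/api/resource package, aliased here as apiresource so
// it does not clash with the resource import above:
//
//	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
//		Name: "app",
//		Resources: v1.ResourceRequirements{
//			Requests: v1.ResourceList{v1.ResourceCPU: apiresource.MustParse("250m")},
//			Limits: v1.ResourceList{
//				v1.ResourceCPU:    apiresource.MustParse("500m"),
//				v1.ResourceMemory: apiresource.MustParse("256Mi"),
//			},
//		},
//	}}}}
//	cfg := ResourceConfigForPod(pod, true, QuotaPeriod, false)
//
// For this Burstable pod, *cfg.CPUShares is 256, *cfg.CPUQuota is 50000,
// *cfg.CPUPeriod is 100000 and *cfg.Memory is 268435456 (256Mi in bytes).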
   206  
   207  // getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
   208  func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
   209  	// get all cgroup mounts.
   210  	allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
   211  	if err != nil {
   212  		return &CgroupSubsystems{}, err
   213  	}
   214  	if len(allCgroups) == 0 {
   215  		return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
   216  	}
   217  	mountPoints := make(map[string]string, len(allCgroups))
   218  	for _, mount := range allCgroups {
    219  		// BEFORE: kubelet used a random mount point per cgroups subsystem;
    220  		// NOW:    more deterministic: kubelet uses the mount point with the shortest path;
    221  		// FUTURE: the expected behavior will be clearly defined in the documentation.
   222  		// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488
   223  
   224  		for _, subsystem := range mount.Subsystems {
   225  			previous := mountPoints[subsystem]
   226  			if previous == "" || len(mount.Mountpoint) < len(previous) {
   227  				mountPoints[subsystem] = mount.Mountpoint
   228  			}
   229  		}
   230  	}
   231  	return &CgroupSubsystems{
   232  		Mounts:      allCgroups,
   233  		MountPoints: mountPoints,
   234  	}, nil
   235  }
   236  
   237  // getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
   238  func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
   239  	controllers, err := libcontainercgroups.GetAllSubsystems()
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	mounts := []libcontainercgroups.Mount{}
   245  	mountPoints := make(map[string]string, len(controllers))
   246  	for _, controller := range controllers {
   247  		mountPoints[controller] = util.CgroupRoot
   248  		m := libcontainercgroups.Mount{
   249  			Mountpoint: util.CgroupRoot,
   250  			Root:       util.CgroupRoot,
   251  			Subsystems: []string{controller},
   252  		}
   253  		mounts = append(mounts, m)
   254  	}
   255  
   256  	return &CgroupSubsystems{
   257  		Mounts:      mounts,
   258  		MountPoints: mountPoints,
   259  	}, nil
   260  }
   261  
   262  // GetCgroupSubsystems returns information about the mounted cgroup subsystems
   263  func GetCgroupSubsystems() (*CgroupSubsystems, error) {
   264  	if libcontainercgroups.IsCgroup2UnifiedMode() {
   265  		return getCgroupSubsystemsV2()
   266  	}
   267  
   268  	return getCgroupSubsystemsV1()
   269  }
   270  
   271  // getCgroupProcs takes a cgroup directory name as an argument,
   272  // reads through the cgroup's procs file, and returns a list of tgids.
   273  // It returns an empty list if the procs file doesn't exist.
   274  func getCgroupProcs(dir string) ([]int, error) {
   275  	procsFile := filepath.Join(dir, "cgroup.procs")
   276  	f, err := os.Open(procsFile)
   277  	if err != nil {
   278  		if os.IsNotExist(err) {
   279  			// The procsFile does not exist, so no pids are attached to this directory
   280  			return []int{}, nil
   281  		}
   282  		return nil, err
   283  	}
   284  	defer f.Close()
   285  
   286  	s := bufio.NewScanner(f)
   287  	out := []int{}
   288  	for s.Scan() {
   289  		if t := s.Text(); t != "" {
   290  			pid, err := strconv.Atoi(t)
   291  			if err != nil {
   292  				return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
   293  			}
   294  			out = append(out, pid)
   295  		}
   296  	}
   297  	return out, nil
   298  }
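
// For example, a cgroup.procs file containing the lines "1234" and "5678"
// yields []int{1234, 5678}; a line that cannot be parsed as an integer
// results in an error.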
   299  
   300  // GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
   301  func GetPodCgroupNameSuffix(podUID types.UID) string {
   302  	return podCgroupNamePrefix + string(podUID)
   303  }
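
// Assuming podCgroupNamePrefix (defined elsewhere in this package) is "pod",
// a pod with UID "1234-abcd" gets the cgroup name suffix "pod1234-abcd".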
   304  
   305  // NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
   306  func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
   307  	nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
   308  	if cgroupsPerQOS {
   309  		nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
   310  	}
   311  	if cgroupDriver == "systemd" {
   312  		return nodeAllocatableRoot.ToSystemd()
   313  	}
   314  	return nodeAllocatableRoot.ToCgroupfs()
   315  }
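
// Illustrative example, assuming defaultNodeAllocatableCgroupName (defined
// elsewhere in this package) is "kubepods": NodeAllocatableRoot("/", true, "systemd")
// returns "/kubepods.slice", while the cgroupfs driver form is "/kubepods".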
   316  
   317  // GetKubeletContainer returns the cgroup the kubelet will use
   318  func GetKubeletContainer(kubeletCgroups string) (string, error) {
   319  	if kubeletCgroups == "" {
   320  		cont, err := getContainer(os.Getpid())
   321  		if err != nil {
   322  			return "", err
   323  		}
   324  		return cont, nil
   325  	}
   326  	return kubeletCgroups, nil
   327  }
   328  
