pod_container_manager_linux.go

Documentation: k8s.io/kubernetes/pkg/kubelet/cm

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package cm
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"os"
    23  	"path"
    24  	"strings"
    25  
    26  	libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/types"
    29  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    30  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    31  	"k8s.io/klog/v2"
    32  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    33  	kubefeatures "k8s.io/kubernetes/pkg/features"
    34  )
    35  
    36  const (
    37  	podCgroupNamePrefix = "pod"
    38  )
    39  
// podContainerManagerImpl implements the PodContainerManager interface.
// It is the general implementation which allows pod level container
// management if qos Cgroup is enabled.
type podContainerManagerImpl struct {
	// qosContainersInfo holds the cgroup names of the top level qos
	// containers (Guaranteed, Burstable and BestEffort); pod cgroups are
	// created as children of one of them.
	qosContainersInfo QOSContainersInfo
	// subsystems stores the mounted cgroup subsystems; its MountPoints are
	// walked when scanning for pod cgroups that exist on disk.
	subsystems *CgroupSubsystems
	// cgroupManager is the cgroup Manager Object responsible for managing all
	// pod cgroups.
	cgroupManager CgroupManager
	// podPidsLimit is the maximum number of pids in a pod. It is only applied
	// to a newly created pod cgroup when it is greater than zero.
	podPidsLimit int64
	// enforceCPULimits controls whether cfs quota is enforced or not
	enforceCPULimits bool
	// cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
	// node for all containers in usec
	cpuCFSQuotaPeriod uint64
}
    59  
    60  // Make sure that podContainerManagerImpl implements the PodContainerManager interface
    61  var _ PodContainerManager = &podContainerManagerImpl{}
    62  
    63  // Exists checks if the pod's cgroup already exists
    64  func (m *podContainerManagerImpl) Exists(pod *v1.Pod) bool {
    65  	podContainerName, _ := m.GetPodContainerName(pod)
    66  	return m.cgroupManager.Exists(podContainerName)
    67  }
    68  
    69  // EnsureExists takes a pod as argument and makes sure that
    70  // pod cgroup exists if qos cgroup hierarchy flag is enabled.
    71  // If the pod level container doesn't already exist it is created.
    72  func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
    73  	// check if container already exist
    74  	alreadyExists := m.Exists(pod)
    75  	if !alreadyExists {
    76  		enforceMemoryQoS := false
    77  		if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
    78  			libcontainercgroups.IsCgroup2UnifiedMode() {
    79  			enforceMemoryQoS = true
    80  		}
    81  		// Create the pod container
    82  		podContainerName, _ := m.GetPodContainerName(pod)
    83  		containerConfig := &CgroupConfig{
    84  			Name:               podContainerName,
    85  			ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
    86  		}
    87  		if m.podPidsLimit > 0 {
    88  			containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
    89  		}
    90  		if enforceMemoryQoS {
    91  			klog.V(4).InfoS("MemoryQoS config for pod", "pod", klog.KObj(pod), "unified", containerConfig.ResourceParameters.Unified)
    92  		}
    93  		if err := m.cgroupManager.Create(containerConfig); err != nil {
    94  			return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
    95  		}
    96  	}
    97  	return nil
    98  }
    99  
   100  // GetPodContainerName returns the CgroupName identifier, and its literal cgroupfs form on the host.
   101  func (m *podContainerManagerImpl) GetPodContainerName(pod *v1.Pod) (CgroupName, string) {
   102  	podQOS := v1qos.GetPodQOS(pod)
   103  	// Get the parent QOS container name
   104  	var parentContainer CgroupName
   105  	switch podQOS {
   106  	case v1.PodQOSGuaranteed:
   107  		parentContainer = m.qosContainersInfo.Guaranteed
   108  	case v1.PodQOSBurstable:
   109  		parentContainer = m.qosContainersInfo.Burstable
   110  	case v1.PodQOSBestEffort:
   111  		parentContainer = m.qosContainersInfo.BestEffort
   112  	}
   113  	podContainer := GetPodCgroupNameSuffix(pod.UID)
   114  
   115  	// Get the absolute path of the cgroup
   116  	cgroupName := NewCgroupName(parentContainer, podContainer)
   117  	// Get the literal cgroupfs name
   118  	cgroupfsName := m.cgroupManager.Name(cgroupName)
   119  
   120  	return cgroupName, cgroupfsName
   121  }
   122  
   123  func (m *podContainerManagerImpl) GetPodCgroupMemoryUsage(pod *v1.Pod) (uint64, error) {
   124  	podCgroupName, _ := m.GetPodContainerName(pod)
   125  	memUsage, err := m.cgroupManager.MemoryUsage(podCgroupName)
   126  	if err != nil {
   127  		return 0, err
   128  	}
   129  	return uint64(memUsage), nil
   130  }
   131  
   132  func (m *podContainerManagerImpl) GetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName) (*ResourceConfig, error) {
   133  	podCgroupName, _ := m.GetPodContainerName(pod)
   134  	return m.cgroupManager.GetCgroupConfig(podCgroupName, resource)
   135  }
   136  
   137  func (m *podContainerManagerImpl) SetPodCgroupConfig(pod *v1.Pod, resource v1.ResourceName, resourceConfig *ResourceConfig) error {
   138  	podCgroupName, _ := m.GetPodContainerName(pod)
   139  	return m.cgroupManager.SetCgroupConfig(podCgroupName, resource, resourceConfig)
   140  }
   141  
   142  // Kill one process ID
   143  func (m *podContainerManagerImpl) killOnePid(pid int) error {
   144  	// os.FindProcess never returns an error on POSIX
   145  	// https://go-review.googlesource.com/c/go/+/19093
   146  	p, _ := os.FindProcess(pid)
   147  	if err := p.Kill(); err != nil {
   148  		// If the process already exited, that's fine.
   149  		if errors.Is(err, os.ErrProcessDone) {
   150  			klog.V(3).InfoS("Process no longer exists", "pid", pid)
   151  			return nil
   152  		}
   153  		return err
   154  	}
   155  	return nil
   156  }
   157  
   158  // Scan through the whole cgroup directory and kill all processes either
   159  // attached to the pod cgroup or to a container cgroup under the pod cgroup
   160  func (m *podContainerManagerImpl) tryKillingCgroupProcesses(podCgroup CgroupName) error {
   161  	pidsToKill := m.cgroupManager.Pids(podCgroup)
   162  	// No pids charged to the terminated pod cgroup return
   163  	if len(pidsToKill) == 0 {
   164  		return nil
   165  	}
   166  
   167  	var errlist []error
   168  	// os.Kill often errors out,
   169  	// We try killing all the pids multiple times
   170  	removed := map[int]bool{}
   171  	for i := 0; i < 5; i++ {
   172  		if i != 0 {
   173  			klog.V(3).InfoS("Attempt failed to kill all unwanted process from cgroup, retrying", "attempt", i, "cgroupName", podCgroup)
   174  		}
   175  		errlist = []error{}
   176  		for _, pid := range pidsToKill {
   177  			if _, ok := removed[pid]; ok {
   178  				continue
   179  			}
   180  			klog.V(3).InfoS("Attempting to kill process from cgroup", "pid", pid, "cgroupName", podCgroup)
   181  			if err := m.killOnePid(pid); err != nil {
   182  				klog.V(3).InfoS("Failed to kill process from cgroup", "pid", pid, "cgroupName", podCgroup, "err", err)
   183  				errlist = append(errlist, err)
   184  			} else {
   185  				removed[pid] = true
   186  			}
   187  		}
   188  		if len(errlist) == 0 {
   189  			klog.V(3).InfoS("Successfully killed all unwanted processes from cgroup", "cgroupName", podCgroup)
   190  			return nil
   191  		}
   192  	}
   193  	return utilerrors.NewAggregate(errlist)
   194  }
   195  
   196  // Destroy destroys the pod container cgroup paths
   197  func (m *podContainerManagerImpl) Destroy(podCgroup CgroupName) error {
   198  	// Try killing all the processes attached to the pod cgroup
   199  	if err := m.tryKillingCgroupProcesses(podCgroup); err != nil {
   200  		klog.InfoS("Failed to kill all the processes attached to cgroup", "cgroupName", podCgroup, "err", err)
   201  		return fmt.Errorf("failed to kill all the processes attached to the %v cgroups : %v", podCgroup, err)
   202  	}
   203  
   204  	// Now its safe to remove the pod's cgroup
   205  	containerConfig := &CgroupConfig{
   206  		Name:               podCgroup,
   207  		ResourceParameters: &ResourceConfig{},
   208  	}
   209  	if err := m.cgroupManager.Destroy(containerConfig); err != nil {
   210  		klog.InfoS("Failed to delete cgroup paths", "cgroupName", podCgroup, "err", err)
   211  		return fmt.Errorf("failed to delete cgroup paths for %v : %v", podCgroup, err)
   212  	}
   213  	return nil
   214  }
   215  
   216  // ReduceCPULimits reduces the CPU CFS values to the minimum amount of shares.
   217  func (m *podContainerManagerImpl) ReduceCPULimits(podCgroup CgroupName) error {
   218  	return m.cgroupManager.ReduceCPULimits(podCgroup)
   219  }
   220  
   221  // IsPodCgroup returns true if the literal cgroupfs name corresponds to a pod
   222  func (m *podContainerManagerImpl) IsPodCgroup(cgroupfs string) (bool, types.UID) {
   223  	// convert the literal cgroupfs form to the driver specific value
   224  	cgroupName := m.cgroupManager.CgroupName(cgroupfs)
   225  	qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
   226  	basePath := ""
   227  	for _, qosContainerName := range qosContainersList {
   228  		// a pod cgroup is a direct child of a qos node, so check if its a match
   229  		if len(cgroupName) == len(qosContainerName)+1 {
   230  			basePath = cgroupName[len(qosContainerName)]
   231  		}
   232  	}
   233  	if basePath == "" {
   234  		return false, types.UID("")
   235  	}
   236  	if !strings.HasPrefix(basePath, podCgroupNamePrefix) {
   237  		return false, types.UID("")
   238  	}
   239  	parts := strings.Split(basePath, podCgroupNamePrefix)
   240  	if len(parts) != 2 {
   241  		return false, types.UID("")
   242  	}
   243  	return true, types.UID(parts[1])
   244  }
   245  
   246  // GetAllPodsFromCgroups scans through all the subsystems of pod cgroups
   247  // Get list of pods whose cgroup still exist on the cgroup mounts
   248  func (m *podContainerManagerImpl) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
   249  	// Map for storing all the found pods on the disk
   250  	foundPods := make(map[types.UID]CgroupName)
   251  	qosContainersList := [3]CgroupName{m.qosContainersInfo.BestEffort, m.qosContainersInfo.Burstable, m.qosContainersInfo.Guaranteed}
   252  	// Scan through all the subsystem mounts
   253  	// and through each QoS cgroup directory for each subsystem mount
   254  	// If a pod cgroup exists in even a single subsystem mount
   255  	// we will attempt to delete it
   256  	for _, val := range m.subsystems.MountPoints {
   257  		for _, qosContainerName := range qosContainersList {
   258  			// get the subsystems QoS cgroup absolute name
   259  			qcConversion := m.cgroupManager.Name(qosContainerName)
   260  			qc := path.Join(val, qcConversion)
   261  			dirInfo, err := os.ReadDir(qc)
   262  			if err != nil {
   263  				if os.IsNotExist(err) {
   264  					continue
   265  				}
   266  				return nil, fmt.Errorf("failed to read the cgroup directory %v : %v", qc, err)
   267  			}
   268  			for i := range dirInfo {
   269  				// its not a directory, so continue on...
   270  				if !dirInfo[i].IsDir() {
   271  					continue
   272  				}
   273  				// convert the concrete cgroupfs name back to an internal identifier
   274  				// this is needed to handle path conversion for systemd environments.
   275  				// we pass the fully qualified path so decoding can work as expected
   276  				// since systemd encodes the path in each segment.
   277  				cgroupfsPath := path.Join(qcConversion, dirInfo[i].Name())
   278  				internalPath := m.cgroupManager.CgroupName(cgroupfsPath)
   279  				// we only care about base segment of the converted path since that
   280  				// is what we are reading currently to know if it is a pod or not.
   281  				basePath := internalPath[len(internalPath)-1]
   282  				if !strings.Contains(basePath, podCgroupNamePrefix) {
   283  					continue
   284  				}
   285  				// we then split the name on the pod prefix to determine the uid
   286  				parts := strings.Split(basePath, podCgroupNamePrefix)
   287  				// the uid is missing, so we log the unexpected cgroup not of form pod<uid>
   288  				if len(parts) != 2 {
   289  					klog.InfoS("Pod cgroup manager ignored unexpected cgroup because it is not a pod", "path", cgroupfsPath)
   290  					continue
   291  				}
   292  				podUID := parts[1]
   293  				foundPods[types.UID(podUID)] = internalPath
   294  			}
   295  		}
   296  	}
   297  	return foundPods, nil
   298  }
   299  
// podContainerManagerNoop implements the PodContainerManager interface.
// It is a no-op implementation and basically does nothing.
// podContainerManagerNoop is used in case the QoS cgroup Hierarchy is not
// enabled, so Exists() returns true always as the cgroupRoot
// is expected to always exist.
type podContainerManagerNoop struct {
	// cgroupRoot is the cgroup name returned by GetPodContainerName for
	// every pod.
	cgroupRoot CgroupName
}
   308  
   309  // Make sure that podContainerManagerStub implements the PodContainerManager interface
   310  var _ PodContainerManager = &podContainerManagerNoop{}
   311  
   312  func (m *podContainerManagerNoop) Exists(_ *v1.Pod) bool {
   313  	return true
   314  }
   315  
   316  func (m *podContainerManagerNoop) EnsureExists(_ *v1.Pod) error {
   317  	return nil
   318  }
   319  
   320  func (m *podContainerManagerNoop) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
   321  	return m.cgroupRoot, ""
   322  }
   323  
   324  func (m *podContainerManagerNoop) GetPodContainerNameForDriver(_ *v1.Pod) string {
   325  	return ""
   326  }
   327  
   328  // Destroy destroys the pod container cgroup paths
   329  func (m *podContainerManagerNoop) Destroy(_ CgroupName) error {
   330  	return nil
   331  }
   332  
   333  func (m *podContainerManagerNoop) ReduceCPULimits(_ CgroupName) error {
   334  	return nil
   335  }
   336  
   337  func (m *podContainerManagerNoop) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
   338  	return nil, nil
   339  }
   340  
   341  func (m *podContainerManagerNoop) IsPodCgroup(cgroupfs string) (bool, types.UID) {
   342  	return false, types.UID("")
   343  }
   344  
   345  func (m *podContainerManagerNoop) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) {
   346  	return 0, nil
   347  }
   348  
   349  func (m *podContainerManagerNoop) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
   350  	return nil, nil
   351  }
   352  
   353  func (m *podContainerManagerNoop) SetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName, _ *ResourceConfig) error {
   354  	return nil
   355  }
   356
View as plain text