...

Source file src/k8s.io/kubernetes/pkg/kubelet/preemption/preemption.go

Documentation: k8s.io/kubernetes/pkg/kubelet/preemption

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package preemption
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  
    23  	v1 "k8s.io/api/core/v1"
    24  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    25  	"k8s.io/client-go/tools/record"
    26  	"k8s.io/klog/v2"
    27  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    28  	"k8s.io/kubernetes/pkg/api/v1/resource"
    29  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    30  	"k8s.io/kubernetes/pkg/features"
    31  	"k8s.io/kubernetes/pkg/kubelet/events"
    32  	"k8s.io/kubernetes/pkg/kubelet/eviction"
    33  	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
    34  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    35  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    36  )
    37  
// message is the human-readable reason recorded both as an event and as the
// pod status message on every pod evicted to make room for a critical pod.
const message = "Preempted in order to admit critical pod"
    39  
// CriticalPodAdmissionHandler is an AdmissionFailureHandler that handles admission failure for Critical Pods.
// If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods
// so that the critical pod can be admitted.  For evictions, the CriticalPodAdmissionHandler evicts a set of pods that
// frees up the required resource requests.  The set of pods is designed to minimize impact, and is prioritized according to the ordering:
// minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods.
// minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods.
// finding the fewest total requests of pods is considered besteffort.
type CriticalPodAdmissionHandler struct {
	getPodsFunc eviction.ActivePodsFunc // lists the pods currently active on the node (the preemption candidates)
	killPodFunc eviction.KillPodFunc    // blocking call that kills a single pod and applies a status mutation
	recorder    record.EventRecorder    // emits a preemption event on each evicted pod
}
    52  
// Compile-time check that CriticalPodAdmissionHandler satisfies lifecycle.AdmissionFailureHandler.
var _ lifecycle.AdmissionFailureHandler = &CriticalPodAdmissionHandler{}
    54  
// NewCriticalPodAdmissionHandler returns a CriticalPodAdmissionHandler wired to the
// given active-pod lister, pod killer, and event recorder.
func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
	return &CriticalPodAdmissionHandler{
		getPodsFunc: getPodsFunc,
		killPodFunc: killPodFunc,
		recorder:    recorder,
	}
}
    62  
    63  // HandleAdmissionFailure gracefully handles admission rejection, and, in some cases,
    64  // to allow admission of the pod despite its previous failure.
    65  func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(admitPod *v1.Pod, failureReasons []lifecycle.PredicateFailureReason) ([]lifecycle.PredicateFailureReason, error) {
    66  	if !kubetypes.IsCriticalPod(admitPod) {
    67  		return failureReasons, nil
    68  	}
    69  	// InsufficientResourceError is not a reason to reject a critical pod.
    70  	// Instead of rejecting, we free up resources to admit it, if no other reasons for rejection exist.
    71  	nonResourceReasons := []lifecycle.PredicateFailureReason{}
    72  	resourceReasons := []*admissionRequirement{}
    73  	for _, reason := range failureReasons {
    74  		if r, ok := reason.(*lifecycle.InsufficientResourceError); ok {
    75  			resourceReasons = append(resourceReasons, &admissionRequirement{
    76  				resourceName: r.ResourceName,
    77  				quantity:     r.GetInsufficientAmount(),
    78  			})
    79  		} else {
    80  			nonResourceReasons = append(nonResourceReasons, reason)
    81  		}
    82  	}
    83  	if len(nonResourceReasons) > 0 {
    84  		// Return only reasons that are not resource related, since critical pods cannot fail admission for resource reasons.
    85  		return nonResourceReasons, nil
    86  	}
    87  	err := c.evictPodsToFreeRequests(admitPod, admissionRequirementList(resourceReasons))
    88  	// if no error is returned, preemption succeeded and the pod is safe to admit.
    89  	return nil, err
    90  }
    91  
    92  // evictPodsToFreeRequests takes a list of insufficient resources, and attempts to free them by evicting pods
    93  // based on requests.  For example, if the only insufficient resource is 200Mb of memory, this function could
    94  // evict a pod with request=250Mb.
    95  func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(admitPod *v1.Pod, insufficientResources admissionRequirementList) error {
    96  	podsToPreempt, err := getPodsToPreempt(admitPod, c.getPodsFunc(), insufficientResources)
    97  	if err != nil {
    98  		return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err)
    99  	}
   100  	for _, pod := range podsToPreempt {
   101  		// record that we are evicting the pod
   102  		c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
   103  		// this is a blocking call and should only return when the pod and its containers are killed.
   104  		klog.V(3).InfoS("Preempting pod to free up resources", "pod", klog.KObj(pod), "podUID", pod.UID, "insufficientResources", insufficientResources)
   105  		err := c.killPodFunc(pod, true, nil, func(status *v1.PodStatus) {
   106  			status.Phase = v1.PodFailed
   107  			status.Reason = events.PreemptContainer
   108  			status.Message = message
   109  			if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
   110  				podutil.UpdatePodCondition(status, &v1.PodCondition{
   111  					Type:    v1.DisruptionTarget,
   112  					Status:  v1.ConditionTrue,
   113  					Reason:  v1.PodReasonTerminationByKubelet,
   114  					Message: "Pod was preempted by Kubelet to accommodate a critical pod.",
   115  				})
   116  			}
   117  		})
   118  		if err != nil {
   119  			klog.ErrorS(err, "Failed to evict pod", "pod", klog.KObj(pod))
   120  			// In future syncPod loops, the kubelet will retry the pod deletion steps that it was stuck on.
   121  			continue
   122  		}
   123  		if len(insufficientResources) > 0 {
   124  			metrics.Preemptions.WithLabelValues(insufficientResources[0].resourceName.String()).Inc()
   125  		} else {
   126  			metrics.Preemptions.WithLabelValues("").Inc()
   127  		}
   128  		klog.InfoS("Pod evicted successfully", "pod", klog.KObj(pod))
   129  	}
   130  	return nil
   131  }
   132  
   133  // getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements
   134  func getPodsToPreempt(pod *v1.Pod, pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
   135  	bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pod, pods)
   136  
   137  	// make sure that pods exist to reclaim the requirements
   138  	unableToMeetRequirements := requirements.subtract(append(append(bestEffortPods, burstablePods...), guaranteedPods...)...)
   139  	if len(unableToMeetRequirements) > 0 {
   140  		return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", unableToMeetRequirements.toString())
   141  	}
   142  	// find the guaranteed pods we would need to evict if we already evicted ALL burstable and besteffort pods.
   143  	guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, requirements.subtract(append(bestEffortPods, burstablePods...)...))
   144  	if err != nil {
   145  		return nil, err
   146  	}
   147  	// Find the burstable pods we would need to evict if we already evicted ALL besteffort pods, and the required guaranteed pods.
   148  	burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...))
   149  	if err != nil {
   150  		return nil, err
   151  	}
   152  	// Find the besteffort pods we would need to evict if we already evicted the required guaranteed and burstable pods.
   153  	bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...))
   154  	if err != nil {
   155  		return nil, err
   156  	}
   157  	return append(append(bestEffortToEvict, burstableToEvict...), guaranteedToEvict...), nil
   158  }
   159  
   160  // getPodsToPreemptByDistance finds the pods that have pod requests >= admission requirements.
   161  // Chooses pods that minimize "distance" to the requirements.
   162  // If more than one pod exists that fulfills the remaining requirements,
   163  // it chooses the pod that has the "smaller resource request"
   164  // This method, by repeatedly choosing the pod that fulfills as much of the requirements as possible,
   165  // attempts to minimize the number of pods returned.
   166  func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
   167  	podsToEvict := []*v1.Pod{}
   168  	// evict pods by shortest distance from remaining requirements, updating requirements every round.
   169  	for len(requirements) > 0 {
   170  		if len(pods) == 0 {
   171  			return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
   172  		}
   173  		// all distances must be less than len(requirements), because the max distance for a single requirement is 1
   174  		bestDistance := float64(len(requirements) + 1)
   175  		bestPodIndex := 0
   176  		// Find the pod with the smallest distance from requirements
   177  		// Or, in the case of two equidistant pods, find the pod with "smaller" resource requests.
   178  		for i, pod := range pods {
   179  			dist := requirements.distance(pod)
   180  			if dist < bestDistance || (bestDistance == dist && smallerResourceRequest(pod, pods[bestPodIndex])) {
   181  				bestDistance = dist
   182  				bestPodIndex = i
   183  			}
   184  		}
   185  		// subtract the pod from requirements, and transfer the pod from input-pods to pods-to-evicted
   186  		requirements = requirements.subtract(pods[bestPodIndex])
   187  		podsToEvict = append(podsToEvict, pods[bestPodIndex])
   188  		pods[bestPodIndex] = pods[len(pods)-1]
   189  		pods = pods[:len(pods)-1]
   190  	}
   191  	return podsToEvict, nil
   192  }
   193  
// admissionRequirement records the shortfall of a single resource: the amount
// of resourceName that must be freed before a critical pod can be admitted.
type admissionRequirement struct {
	resourceName v1.ResourceName
	quantity     int64
}

// admissionRequirementList is the set of resource shortfalls that must all be
// covered (by evicting pods) to admit a critical pod.
type admissionRequirementList []*admissionRequirement
   200  
   201  // distance returns distance of the pods requests from the admissionRequirements.
   202  // The distance is measured by the fraction of the requirement satisfied by the pod,
   203  // so that each requirement is weighted equally, regardless of absolute magnitude.
   204  func (a admissionRequirementList) distance(pod *v1.Pod) float64 {
   205  	dist := float64(0)
   206  	for _, req := range a {
   207  		remainingRequest := float64(req.quantity - resource.GetResourceRequest(pod, req.resourceName))
   208  		if remainingRequest > 0 {
   209  			dist += math.Pow(remainingRequest/float64(req.quantity), 2)
   210  		}
   211  	}
   212  	return dist
   213  }
   214  
   215  // subtract returns a new admissionRequirementList containing remaining requirements if the provided pod
   216  // were to be preempted
   217  func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList {
   218  	newList := []*admissionRequirement{}
   219  	for _, req := range a {
   220  		newQuantity := req.quantity
   221  		for _, pod := range pods {
   222  			newQuantity -= resource.GetResourceRequest(pod, req.resourceName)
   223  			if newQuantity <= 0 {
   224  				break
   225  			}
   226  		}
   227  		if newQuantity > 0 {
   228  			newList = append(newList, &admissionRequirement{
   229  				resourceName: req.resourceName,
   230  				quantity:     newQuantity,
   231  			})
   232  		}
   233  	}
   234  	return newList
   235  }
   236  
   237  func (a admissionRequirementList) toString() string {
   238  	s := "["
   239  	for _, req := range a {
   240  		s += fmt.Sprintf("(res: %v, q: %d), ", req.resourceName, req.quantity)
   241  	}
   242  	return s + "]"
   243  }
   244  
   245  // sortPodsByQOS returns lists containing besteffort, burstable, and guaranteed pods that
   246  // can be preempted by preemptor pod.
   247  func sortPodsByQOS(preemptor *v1.Pod, pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) {
   248  	for _, pod := range pods {
   249  		if kubetypes.Preemptable(preemptor, pod) {
   250  			switch v1qos.GetPodQOS(pod) {
   251  			case v1.PodQOSBestEffort:
   252  				bestEffort = append(bestEffort, pod)
   253  			case v1.PodQOSBurstable:
   254  				burstable = append(burstable, pod)
   255  			case v1.PodQOSGuaranteed:
   256  				guaranteed = append(guaranteed, pod)
   257  			default:
   258  			}
   259  		}
   260  	}
   261  
   262  	return
   263  }
   264  
   265  // smallerResourceRequest returns true if pod1 has a smaller request than pod2
   266  func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
   267  	priorityList := []v1.ResourceName{
   268  		v1.ResourceMemory,
   269  		v1.ResourceCPU,
   270  	}
   271  	for _, res := range priorityList {
   272  		req1 := resource.GetResourceRequest(pod1, res)
   273  		req2 := resource.GetResourceRequest(pod2, res)
   274  		if req1 < req2 {
   275  			return true
   276  		} else if req1 > req2 {
   277  			return false
   278  		}
   279  	}
   280  	return true
   281  }
   282  

View as plain text