eviction_manager.go

Documentation: k8s.io/kubernetes/pkg/kubelet/eviction

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package eviction
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sort"
    23  	"sync"
    24  	"time"
    25  
    26  	"k8s.io/klog/v2"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	"k8s.io/apimachinery/pkg/api/resource"
    30  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    31  	"k8s.io/client-go/tools/record"
    32  	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
    33  	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    34  	"k8s.io/utils/clock"
    35  
    36  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    37  	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
    38  	v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    39  	"k8s.io/kubernetes/pkg/features"
    40  	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    41  	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
    42  	"k8s.io/kubernetes/pkg/kubelet/metrics"
    43  	"k8s.io/kubernetes/pkg/kubelet/server/stats"
    44  	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
    45  )
    46  
// Timing parameters used by waitForPodsCleanup: podCleanupTimeout is the
// overall deadline for waiting on evicted pods to be cleaned up, and
// podCleanupPollFreq is the interval between successive cleanup checks.
const (
	podCleanupTimeout  = 30 * time.Second
	podCleanupPollFreq = time.Second
)
    51  
// Label values recorded on the eviction metric when a pod is evicted for
// exceeding one of its own local storage limits.
const (
	// signalEphemeralContainerFsLimit labels evictions triggered by a container
	// whose disk usage exceeds its ephemeral-storage limit.
	signalEphemeralContainerFsLimit string = "ephemeralcontainerfs.limit"
	// signalEphemeralPodFsLimit labels evictions triggered by a pod whose total
	// ephemeral storage usage exceeds the pod's ephemeral-storage limit.
	signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
	// signalEmptyDirFsLimit labels evictions triggered by an emptyDir volume
	// whose usage exceeds its configured size limit.
	signalEmptyDirFsLimit string = "emptydirfs.limit"
)
    60  
// managerImpl implements Manager. The same object also serves as the pod
// admission handler returned by NewManager.
type managerImpl struct {
	// clock is the time source used for timestamps, timers and tickers.
	clock clock.WithTicker
	// config is how the manager is configured (thresholds, grace periods, ...).
	config Config
	// killPodFunc is the function invoked to kill a pod that is being evicted.
	killPodFunc KillPodFunc
	// imageGC is the interface that knows how to do image garbage collection.
	imageGC ImageGC
	// containerGC is the interface that knows how to do container garbage collection.
	containerGC ContainerGC
	// RWMutex protects access to internal state: synchronize holds the write
	// lock while publishing its results, and the Admit / IsUnder* readers hold
	// the read lock while inspecting nodeConditions.
	sync.RWMutex
	// nodeConditions is the set of node conditions currently reported.
	nodeConditions []v1.NodeConditionType
	// nodeConditionsLastObservedAt captures when a node condition was last
	// observed based on a threshold being met.
	nodeConditionsLastObservedAt nodeConditionsObservedAt
	// nodeRef is a reference to the node; node-level events are recorded against it.
	nodeRef *v1.ObjectReference
	// recorder is used to record events about the node and about evicted pods.
	recorder record.EventRecorder
	// summaryProvider is used to measure usage stats on the system.
	summaryProvider stats.SummaryProvider
	// thresholdsFirstObservedAt records when a threshold was first observed.
	thresholdsFirstObservedAt thresholdsObservedAt
	// thresholdsMet records the set of thresholds that have been met
	// (including grace period) but not yet resolved.
	thresholdsMet []evictionapi.Threshold
	// signalToRankFunc maps a signal to the pod ranking function for that signal.
	signalToRankFunc map[evictionapi.Signal]rankFunc
	// signalToNodeReclaimFuncs maps a signal to an ordered list of functions
	// that know how to reclaim the corresponding resource at node level.
	signalToNodeReclaimFuncs map[evictionapi.Signal]nodeReclaimFuncs
	// lastObservations holds the observations made by the previous synchronize pass.
	lastObservations signalObservations
	// dedicatedImageFs indicates if imagefs is on a separate device from the
	// rootfs. It is nil until synchronize has detected the filesystem layout.
	dedicatedImageFs *bool
	// splitContainerImageFs indicates if containerfs is on a separate device
	// from imagefs. It is nil until synchronize has detected the filesystem layout.
	splitContainerImageFs *bool
	// thresholdNotifiers is a list of memory threshold notifiers, one per
	// memory eviction threshold, created by Start.
	thresholdNotifiers []ThresholdNotifier
	// thresholdsLastUpdated is the last time the thresholdNotifiers were updated.
	thresholdsLastUpdated time.Time
	// localStorageCapacityIsolation indicates whether local storage capacity
	// isolation is supported; when true, synchronize also enforces per-pod
	// local storage limits.
	localStorageCapacityIsolation bool
}

// Compile-time check that *managerImpl satisfies the Manager interface.
var _ Manager = &managerImpl{}
   109  
   110  // NewManager returns a configured Manager and an associated admission handler to enforce eviction configuration.
   111  func NewManager(
   112  	summaryProvider stats.SummaryProvider,
   113  	config Config,
   114  	killPodFunc KillPodFunc,
   115  	imageGC ImageGC,
   116  	containerGC ContainerGC,
   117  	recorder record.EventRecorder,
   118  	nodeRef *v1.ObjectReference,
   119  	clock clock.WithTicker,
   120  	localStorageCapacityIsolation bool,
   121  ) (Manager, lifecycle.PodAdmitHandler) {
   122  	manager := &managerImpl{
   123  		clock:                         clock,
   124  		killPodFunc:                   killPodFunc,
   125  		imageGC:                       imageGC,
   126  		containerGC:                   containerGC,
   127  		config:                        config,
   128  		recorder:                      recorder,
   129  		summaryProvider:               summaryProvider,
   130  		nodeRef:                       nodeRef,
   131  		nodeConditionsLastObservedAt:  nodeConditionsObservedAt{},
   132  		thresholdsFirstObservedAt:     thresholdsObservedAt{},
   133  		dedicatedImageFs:              nil,
   134  		splitContainerImageFs:         nil,
   135  		thresholdNotifiers:            []ThresholdNotifier{},
   136  		localStorageCapacityIsolation: localStorageCapacityIsolation,
   137  	}
   138  	return manager, manager
   139  }
   140  
   141  // Admit rejects a pod if its not safe to admit for node stability.
   142  func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
   143  	m.RLock()
   144  	defer m.RUnlock()
   145  	if len(m.nodeConditions) == 0 {
   146  		return lifecycle.PodAdmitResult{Admit: true}
   147  	}
   148  	// Admit Critical pods even under resource pressure since they are required for system stability.
   149  	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
   150  	if kubelettypes.IsCriticalPod(attrs.Pod) {
   151  		return lifecycle.PodAdmitResult{Admit: true}
   152  	}
   153  
   154  	// Conditions other than memory pressure reject all pods
   155  	nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1
   156  	if nodeOnlyHasMemoryPressureCondition {
   157  		notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)
   158  		if notBestEffort {
   159  			return lifecycle.PodAdmitResult{Admit: true}
   160  		}
   161  
   162  		// When node has memory pressure, check BestEffort Pod's toleration:
   163  		// admit it if tolerates memory pressure taint, fail for other tolerations, e.g. DiskPressure.
   164  		if corev1helpers.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{
   165  			Key:    v1.TaintNodeMemoryPressure,
   166  			Effect: v1.TaintEffectNoSchedule,
   167  		}) {
   168  			return lifecycle.PodAdmitResult{Admit: true}
   169  		}
   170  	}
   171  
   172  	// reject pods when under memory pressure (if pod is best effort), or if under disk pressure.
   173  	klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)
   174  	return lifecycle.PodAdmitResult{
   175  		Admit:   false,
   176  		Reason:  Reason,
   177  		Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
   178  	}
   179  }
   180  
   181  // Start starts the control loop to observe and response to low compute resources.
   182  func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
   183  	thresholdHandler := func(message string) {
   184  		klog.InfoS(message)
   185  		m.synchronize(diskInfoProvider, podFunc)
   186  	}
   187  	if m.config.KernelMemcgNotification {
   188  		for _, threshold := range m.config.Thresholds {
   189  			if threshold.Signal == evictionapi.SignalMemoryAvailable || threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable {
   190  				notifier, err := NewMemoryThresholdNotifier(threshold, m.config.PodCgroupRoot, &CgroupNotifierFactory{}, thresholdHandler)
   191  				if err != nil {
   192  					klog.InfoS("Eviction manager: failed to create memory threshold notifier", "err", err)
   193  				} else {
   194  					go notifier.Start()
   195  					m.thresholdNotifiers = append(m.thresholdNotifiers, notifier)
   196  				}
   197  			}
   198  		}
   199  	}
   200  	// start the eviction manager monitoring
   201  	go func() {
   202  		for {
   203  			evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
   204  			if evictedPods != nil && err == nil {
   205  				klog.InfoS("Eviction manager: pods evicted, waiting for pod to be cleaned up", "pods", klog.KObjSlice(evictedPods))
   206  				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
   207  			} else {
   208  				if err != nil {
   209  					klog.ErrorS(err, "Eviction manager: failed to synchronize")
   210  				}
   211  				time.Sleep(monitoringInterval)
   212  			}
   213  		}
   214  	}()
   215  }
   216  
   217  // IsUnderMemoryPressure returns true if the node is under memory pressure.
   218  func (m *managerImpl) IsUnderMemoryPressure() bool {
   219  	m.RLock()
   220  	defer m.RUnlock()
   221  	return hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure)
   222  }
   223  
   224  // IsUnderDiskPressure returns true if the node is under disk pressure.
   225  func (m *managerImpl) IsUnderDiskPressure() bool {
   226  	m.RLock()
   227  	defer m.RUnlock()
   228  	return hasNodeCondition(m.nodeConditions, v1.NodeDiskPressure)
   229  }
   230  
   231  // IsUnderPIDPressure returns true if the node is under PID pressure.
   232  func (m *managerImpl) IsUnderPIDPressure() bool {
   233  	m.RLock()
   234  	defer m.RUnlock()
   235  	return hasNodeCondition(m.nodeConditions, v1.NodePIDPressure)
   236  }
   237  
   238  // synchronize is the main control loop that enforces eviction thresholds.
   239  // Returns the pod that was killed, or nil if no pod was killed.
   240  func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
   241  	ctx := context.Background()
   242  	// if we have nothing to do, just return
   243  	thresholds := m.config.Thresholds
   244  	if len(thresholds) == 0 && !m.localStorageCapacityIsolation {
   245  		return nil, nil
   246  	}
   247  
   248  	klog.V(3).InfoS("Eviction manager: synchronize housekeeping")
   249  	// build the ranking functions (if not yet known)
   250  	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
   251  	if m.dedicatedImageFs == nil {
   252  		hasImageFs, splitDiskError := diskInfoProvider.HasDedicatedImageFs(ctx)
   253  		if splitDiskError != nil {
   254  			klog.ErrorS(splitDiskError, "Eviction manager: failed to get HasDedicatedImageFs")
   255  			return nil, fmt.Errorf("eviction manager: failed to get HasDedicatedImageFs: %v", splitDiskError)
   256  		}
   257  		m.dedicatedImageFs = &hasImageFs
   258  		splitContainerImageFs := m.containerGC.IsContainerFsSeparateFromImageFs(ctx)
   259  
   260  		// If we are a split filesystem but the feature is turned off
   261  		// we should return an error.
   262  		// This is a bad state.
   263  		if !utilfeature.DefaultFeatureGate.Enabled(features.KubeletSeparateDiskGC) && splitContainerImageFs {
   264  			splitDiskError := fmt.Errorf("KubeletSeparateDiskGC is turned off but we still have a split filesystem")
   265  			return nil, splitDiskError
   266  		}
   267  		thresholds, err := UpdateContainerFsThresholds(m.config.Thresholds, hasImageFs, splitContainerImageFs)
   268  		m.config.Thresholds = thresholds
   269  		if err != nil {
   270  			klog.ErrorS(err, "eviction manager: found conflicting containerfs eviction. Ignoring.")
   271  		}
   272  		m.splitContainerImageFs = &splitContainerImageFs
   273  		m.signalToRankFunc = buildSignalToRankFunc(hasImageFs, splitContainerImageFs)
   274  		m.signalToNodeReclaimFuncs = buildSignalToNodeReclaimFuncs(m.imageGC, m.containerGC, hasImageFs, splitContainerImageFs)
   275  	}
   276  
   277  	klog.V(3).InfoS("FileSystem detection", "DedicatedImageFs", m.dedicatedImageFs, "SplitImageFs", m.splitContainerImageFs)
   278  	activePods := podFunc()
   279  	updateStats := true
   280  	summary, err := m.summaryProvider.Get(ctx, updateStats)
   281  	if err != nil {
   282  		klog.ErrorS(err, "Eviction manager: failed to get summary stats")
   283  		return nil, nil
   284  	}
   285  
   286  	if m.clock.Since(m.thresholdsLastUpdated) > notifierRefreshInterval {
   287  		m.thresholdsLastUpdated = m.clock.Now()
   288  		for _, notifier := range m.thresholdNotifiers {
   289  			if err := notifier.UpdateThreshold(summary); err != nil {
   290  				klog.InfoS("Eviction manager: failed to update notifier", "notifier", notifier.Description(), "err", err)
   291  			}
   292  		}
   293  	}
   294  
   295  	// make observations and get a function to derive pod usage stats relative to those observations.
   296  	observations, statsFunc := makeSignalObservations(summary)
   297  	debugLogObservations("observations", observations)
   298  
   299  	// determine the set of thresholds met independent of grace period
   300  	thresholds = thresholdsMet(thresholds, observations, false)
   301  	debugLogThresholdsWithObservation("thresholds - ignoring grace period", thresholds, observations)
   302  
   303  	// determine the set of thresholds previously met that have not yet satisfied the associated min-reclaim
   304  	if len(m.thresholdsMet) > 0 {
   305  		thresholdsNotYetResolved := thresholdsMet(m.thresholdsMet, observations, true)
   306  		thresholds = mergeThresholds(thresholds, thresholdsNotYetResolved)
   307  	}
   308  	debugLogThresholdsWithObservation("thresholds - reclaim not satisfied", thresholds, observations)
   309  
   310  	// track when a threshold was first observed
   311  	now := m.clock.Now()
   312  	thresholdsFirstObservedAt := thresholdsFirstObservedAt(thresholds, m.thresholdsFirstObservedAt, now)
   313  
   314  	// the set of node conditions that are triggered by currently observed thresholds
   315  	nodeConditions := nodeConditions(thresholds)
   316  	if len(nodeConditions) > 0 {
   317  		klog.V(3).InfoS("Eviction manager: node conditions - observed", "nodeCondition", nodeConditions)
   318  	}
   319  
   320  	// track when a node condition was last observed
   321  	nodeConditionsLastObservedAt := nodeConditionsLastObservedAt(nodeConditions, m.nodeConditionsLastObservedAt, now)
   322  
   323  	// node conditions report true if it has been observed within the transition period window
   324  	nodeConditions = nodeConditionsObservedSince(nodeConditionsLastObservedAt, m.config.PressureTransitionPeriod, now)
   325  	if len(nodeConditions) > 0 {
   326  		klog.V(3).InfoS("Eviction manager: node conditions - transition period not met", "nodeCondition", nodeConditions)
   327  	}
   328  
   329  	// determine the set of thresholds we need to drive eviction behavior (i.e. all grace periods are met)
   330  	thresholds = thresholdsMetGracePeriod(thresholdsFirstObservedAt, now)
   331  	debugLogThresholdsWithObservation("thresholds - grace periods satisfied", thresholds, observations)
   332  
   333  	// update internal state
   334  	m.Lock()
   335  	m.nodeConditions = nodeConditions
   336  	m.thresholdsFirstObservedAt = thresholdsFirstObservedAt
   337  	m.nodeConditionsLastObservedAt = nodeConditionsLastObservedAt
   338  	m.thresholdsMet = thresholds
   339  
   340  	// determine the set of thresholds whose stats have been updated since the last sync
   341  	thresholds = thresholdsUpdatedStats(thresholds, observations, m.lastObservations)
   342  	debugLogThresholdsWithObservation("thresholds - updated stats", thresholds, observations)
   343  
   344  	m.lastObservations = observations
   345  	m.Unlock()
   346  
   347  	// evict pods if there is a resource usage violation from local volume temporary storage
   348  	// If eviction happens in localStorageEviction function, skip the rest of eviction action
   349  	if m.localStorageCapacityIsolation {
   350  		if evictedPods := m.localStorageEviction(activePods, statsFunc); len(evictedPods) > 0 {
   351  			return evictedPods, nil
   352  		}
   353  	}
   354  
   355  	if len(thresholds) == 0 {
   356  		klog.V(3).InfoS("Eviction manager: no resources are starved")
   357  		return nil, nil
   358  	}
   359  
   360  	// rank the thresholds by eviction priority
   361  	sort.Sort(byEvictionPriority(thresholds))
   362  	thresholdToReclaim, resourceToReclaim, foundAny := getReclaimableThreshold(thresholds)
   363  	if !foundAny {
   364  		return nil, nil
   365  	}
   366  	klog.InfoS("Eviction manager: attempting to reclaim", "resourceName", resourceToReclaim)
   367  
   368  	// record an event about the resources we are now attempting to reclaim via eviction
   369  	m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim)
   370  
   371  	// check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods.
   372  	if m.reclaimNodeLevelResources(ctx, thresholdToReclaim.Signal, resourceToReclaim) {
   373  		klog.InfoS("Eviction manager: able to reduce resource pressure without evicting pods.", "resourceName", resourceToReclaim)
   374  		return nil, nil
   375  	}
   376  
   377  	klog.InfoS("Eviction manager: must evict pod(s) to reclaim", "resourceName", resourceToReclaim)
   378  
   379  	// rank the pods for eviction
   380  	rank, ok := m.signalToRankFunc[thresholdToReclaim.Signal]
   381  	if !ok {
   382  		klog.ErrorS(nil, "Eviction manager: no ranking function for signal", "threshold", thresholdToReclaim.Signal)
   383  		return nil, nil
   384  	}
   385  
   386  	// the only candidates viable for eviction are those pods that had anything running.
   387  	if len(activePods) == 0 {
   388  		klog.ErrorS(nil, "Eviction manager: eviction thresholds have been met, but no pods are active to evict")
   389  		return nil, nil
   390  	}
   391  
   392  	// rank the running pods for eviction for the specified resource
   393  	rank(activePods, statsFunc)
   394  
   395  	klog.InfoS("Eviction manager: pods ranked for eviction", "pods", klog.KObjSlice(activePods))
   396  
   397  	//record age of metrics for met thresholds that we are using for evictions.
   398  	for _, t := range thresholds {
   399  		timeObserved := observations[t.Signal].time
   400  		if !timeObserved.IsZero() {
   401  			metrics.EvictionStatsAge.WithLabelValues(string(t.Signal)).Observe(metrics.SinceInSeconds(timeObserved.Time))
   402  		}
   403  	}
   404  
   405  	// we kill at most a single pod during each eviction interval
   406  	for i := range activePods {
   407  		pod := activePods[i]
   408  		gracePeriodOverride := int64(0)
   409  		if !isHardEvictionThreshold(thresholdToReclaim) {
   410  			gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
   411  		}
   412  		message, annotations := evictionMessage(resourceToReclaim, pod, statsFunc, thresholds, observations)
   413  		var condition *v1.PodCondition
   414  		if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
   415  			condition = &v1.PodCondition{
   416  				Type:    v1.DisruptionTarget,
   417  				Status:  v1.ConditionTrue,
   418  				Reason:  v1.PodReasonTerminationByKubelet,
   419  				Message: message,
   420  			}
   421  		}
   422  		if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
   423  			metrics.Evictions.WithLabelValues(string(thresholdToReclaim.Signal)).Inc()
   424  			return []*v1.Pod{pod}, nil
   425  		}
   426  	}
   427  	klog.InfoS("Eviction manager: unable to evict any pods from the node")
   428  	return nil, nil
   429  }
   430  
   431  func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
   432  	timeout := m.clock.NewTimer(podCleanupTimeout)
   433  	defer timeout.Stop()
   434  	ticker := m.clock.NewTicker(podCleanupPollFreq)
   435  	defer ticker.Stop()
   436  	for {
   437  		select {
   438  		case <-timeout.C():
   439  			klog.InfoS("Eviction manager: timed out waiting for pods to be cleaned up", "pods", klog.KObjSlice(pods))
   440  			return
   441  		case <-ticker.C():
   442  			for i, pod := range pods {
   443  				if !podCleanedUpFunc(pod) {
   444  					break
   445  				}
   446  				if i == len(pods)-1 {
   447  					klog.InfoS("Eviction manager: pods successfully cleaned up", "pods", klog.KObjSlice(pods))
   448  					return
   449  				}
   450  			}
   451  		}
   452  	}
   453  }
   454  
   455  // reclaimNodeLevelResources attempts to reclaim node level resources.  returns true if thresholds were satisfied and no pod eviction is required.
   456  func (m *managerImpl) reclaimNodeLevelResources(ctx context.Context, signalToReclaim evictionapi.Signal, resourceToReclaim v1.ResourceName) bool {
   457  	nodeReclaimFuncs := m.signalToNodeReclaimFuncs[signalToReclaim]
   458  	for _, nodeReclaimFunc := range nodeReclaimFuncs {
   459  		// attempt to reclaim the pressured resource.
   460  		if err := nodeReclaimFunc(ctx); err != nil {
   461  			klog.InfoS("Eviction manager: unexpected error when attempting to reduce resource pressure", "resourceName", resourceToReclaim, "err", err)
   462  		}
   463  
   464  	}
   465  	if len(nodeReclaimFuncs) > 0 {
   466  		summary, err := m.summaryProvider.Get(ctx, true)
   467  		if err != nil {
   468  			klog.ErrorS(err, "Eviction manager: failed to get summary stats after resource reclaim")
   469  			return false
   470  		}
   471  
   472  		// make observations and get a function to derive pod usage stats relative to those observations.
   473  		observations, _ := makeSignalObservations(summary)
   474  		debugLogObservations("observations after resource reclaim", observations)
   475  
   476  		// evaluate all thresholds independently of their grace period to see if with
   477  		// the new observations, we think we have met min reclaim goals
   478  		thresholds := thresholdsMet(m.config.Thresholds, observations, true)
   479  		debugLogThresholdsWithObservation("thresholds after resource reclaim - ignoring grace period", thresholds, observations)
   480  
   481  		if len(thresholds) == 0 {
   482  			return true
   483  		}
   484  	}
   485  	return false
   486  }
   487  
   488  // localStorageEviction checks the EmptyDir volume usage for each pod and determine whether it exceeds the specified limit and needs
   489  // to be evicted. It also checks every container in the pod, if the container overlay usage exceeds the limit, the pod will be evicted too.
   490  func (m *managerImpl) localStorageEviction(pods []*v1.Pod, statsFunc statsFunc) []*v1.Pod {
   491  	evicted := []*v1.Pod{}
   492  	for _, pod := range pods {
   493  		podStats, ok := statsFunc(pod)
   494  		if !ok {
   495  			continue
   496  		}
   497  
   498  		if m.emptyDirLimitEviction(podStats, pod) {
   499  			evicted = append(evicted, pod)
   500  			continue
   501  		}
   502  
   503  		if m.podEphemeralStorageLimitEviction(podStats, pod) {
   504  			evicted = append(evicted, pod)
   505  			continue
   506  		}
   507  
   508  		if m.containerEphemeralStorageLimitEviction(podStats, pod) {
   509  			evicted = append(evicted, pod)
   510  		}
   511  	}
   512  
   513  	return evicted
   514  }
   515  
   516  func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   517  	podVolumeUsed := make(map[string]*resource.Quantity)
   518  	for _, volume := range podStats.VolumeStats {
   519  		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
   520  	}
   521  	for i := range pod.Spec.Volumes {
   522  		source := &pod.Spec.Volumes[i].VolumeSource
   523  		if source.EmptyDir != nil {
   524  			size := source.EmptyDir.SizeLimit
   525  			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
   526  			if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
   527  				// the emptyDir usage exceeds the size limit, evict the pod
   528  				if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
   529  					metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
   530  					return true
   531  				}
   532  				return false
   533  			}
   534  		}
   535  	}
   536  
   537  	return false
   538  }
   539  
   540  func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   541  	podLimits := resourcehelper.PodLimits(pod, resourcehelper.PodResourcesOptions{})
   542  	_, found := podLimits[v1.ResourceEphemeralStorage]
   543  	if !found {
   544  		return false
   545  	}
   546  
   547  	// pod stats api summarizes ephemeral storage usage (container, emptyDir, host[etc-hosts, logs])
   548  	podEphemeralStorageTotalUsage := &resource.Quantity{}
   549  	if podStats.EphemeralStorage != nil && podStats.EphemeralStorage.UsedBytes != nil {
   550  		podEphemeralStorageTotalUsage = resource.NewQuantity(int64(*podStats.EphemeralStorage.UsedBytes), resource.BinarySI)
   551  	}
   552  	podEphemeralStorageLimit := podLimits[v1.ResourceEphemeralStorage]
   553  	if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
   554  		// the total usage of pod exceeds the total size limit of containers, evict the pod
   555  		message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
   556  		if m.evictPod(pod, 0, message, nil, nil) {
   557  			metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
   558  			return true
   559  		}
   560  		return false
   561  	}
   562  	return false
   563  }
   564  
   565  func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
   566  	thresholdsMap := make(map[string]*resource.Quantity)
   567  	for _, container := range pod.Spec.Containers {
   568  		ephemeralLimit := container.Resources.Limits.StorageEphemeral()
   569  		if ephemeralLimit != nil && ephemeralLimit.Value() != 0 {
   570  			thresholdsMap[container.Name] = ephemeralLimit
   571  		}
   572  	}
   573  
   574  	for _, containerStat := range podStats.Containers {
   575  		containerUsed := diskUsage(containerStat.Logs)
   576  		if !*m.dedicatedImageFs {
   577  			containerUsed.Add(*diskUsage(containerStat.Rootfs))
   578  		}
   579  
   580  		if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
   581  			if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
   582  				if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
   583  					metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
   584  					return true
   585  				}
   586  				return false
   587  			}
   588  		}
   589  	}
   590  	return false
   591  }
   592  
   593  func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
   594  	// If the pod is marked as critical and static, and support for critical pod annotations is enabled,
   595  	// do not evict such pods. Static pods are not re-admitted after evictions.
   596  	// https://github.com/kubernetes/kubernetes/issues/40573 has more details.
   597  	if kubelettypes.IsCriticalPod(pod) {
   598  		klog.ErrorS(nil, "Eviction manager: cannot evict a critical pod", "pod", klog.KObj(pod))
   599  		return false
   600  	}
   601  	// record that we are evicting the pod
   602  	m.recorder.AnnotatedEventf(pod, annotations, v1.EventTypeWarning, Reason, evictMsg)
   603  	// this is a blocking call and should only return when the pod and its containers are killed.
   604  	klog.V(3).InfoS("Evicting pod", "pod", klog.KObj(pod), "podUID", pod.UID, "message", evictMsg)
   605  	err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
   606  		status.Phase = v1.PodFailed
   607  		status.Reason = Reason
   608  		status.Message = evictMsg
   609  		if condition != nil {
   610  			podutil.UpdatePodCondition(status, condition)
   611  		}
   612  	})
   613  	if err != nil {
   614  		klog.ErrorS(err, "Eviction manager: pod failed to evict", "pod", klog.KObj(pod))
   615  	} else {
   616  		klog.InfoS("Eviction manager: pod is evicted successfully", "pod", klog.KObj(pod))
   617  	}
   618  	return true
   619  }
   620
View as plain text