...

Source file src/k8s.io/kubernetes/pkg/kubelet/eviction/helpers.go

Documentation: k8s.io/kubernetes/pkg/kubelet/eviction

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package eviction
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/api/resource"
    29  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    30  	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
    31  	"k8s.io/klog/v2"
    32  	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
    33  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    34  	v1resource "k8s.io/kubernetes/pkg/api/v1/resource"
    35  	"k8s.io/kubernetes/pkg/features"
    36  	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
    37  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    38  	volumeutils "k8s.io/kubernetes/pkg/volume/util"
    39  )
    40  
    41  const (
    42  	unsupportedEvictionSignal = "unsupported eviction signal %v"
    43  	// Reason is the reason reported back in status.
    44  	Reason = "Evicted"
    45  	// nodeLowMessageFmt is the message for evictions due to resource pressure.
    46  	nodeLowMessageFmt = "The node was low on resource: %v. "
    47  	// nodeConditionMessageFmt is the message for evictions due to resource pressure.
    48  	nodeConditionMessageFmt = "The node had condition: %v. "
    49  	// containerMessageFmt provides additional information for containers exceeding requests
    50  	containerMessageFmt = "Container %s was using %s, request is %s, has larger consumption of %v. "
    51  	// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
    52  	containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
    53  	// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ES limit
    54  	podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
    55  	// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
    56  	emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
    57  	// inodes, number. internal to this module, used to account for local disk inode consumption.
    58  	resourceInodes v1.ResourceName = "inodes"
    59  	// resourcePids, number. internal to this module, used to account for local pid consumption.
    60  	resourcePids v1.ResourceName = "pids"
    61  	// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
    62  	OffendingContainersKey = "offending_containers"
    63  	// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
    64  	OffendingContainersUsageKey = "offending_containers_usage"
    65  	// StarvedResourceKey is the key for the starved resource in eviction event annotations
    66  	StarvedResourceKey = "starved_resource"
    67  	// thresholdMetMessageFmt is the message fragment describing the threshold quantity and observed availability.
    68  	thresholdMetMessageFmt = "Threshold quantity: %v, available: %v. "
    69  )
    70  
    71  var (
    72  	// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
    73  	signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
    74  	// signalToResource maps a Signal to its associated Resource.
    75  	signalToResource map[evictionapi.Signal]v1.ResourceName
    76  )
    77  
    78  func init() {
    79  	// map eviction signals to node conditions
    80  	signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
    81  	signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
    82  	signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
    83  	signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
    84  	signalToNodeCondition[evictionapi.SignalContainerFsAvailable] = v1.NodeDiskPressure
    85  	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
    86  	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
    87  	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
    88  	signalToNodeCondition[evictionapi.SignalContainerFsInodesFree] = v1.NodeDiskPressure
    89  	signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure
    90  
    91  	// map signals to resources (and vice-versa)
    92  	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
    93  	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
    94  	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
    95  	signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
    96  	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
    97  	signalToResource[evictionapi.SignalContainerFsAvailable] = v1.ResourceEphemeralStorage
    98  	signalToResource[evictionapi.SignalContainerFsInodesFree] = resourceInodes
    99  	signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
   100  	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
   101  	signalToResource[evictionapi.SignalPIDAvailable] = resourcePids
   102  }
   103  
   104  // validSignal returns true if the signal is supported.
   105  func validSignal(signal evictionapi.Signal) bool {
   106  	_, found := signalToResource[signal]
   107  	return found
   108  }
   109  
   110  // getReclaimableThreshold finds the threshold and resource to reclaim
   111  func getReclaimableThreshold(thresholds []evictionapi.Threshold) (evictionapi.Threshold, v1.ResourceName, bool) {
   112  	for _, thresholdToReclaim := range thresholds {
   113  		if resourceToReclaim, ok := signalToResource[thresholdToReclaim.Signal]; ok {
   114  			return thresholdToReclaim, resourceToReclaim, true
   115  		}
   116  		klog.V(3).InfoS("Eviction manager: threshold was crossed, but reclaim is not implemented for this threshold.", "threshold", thresholdToReclaim.Signal)
   117  	}
   118  	return evictionapi.Threshold{}, "", false
   119  }
   120  
   121  // ParseThresholdConfig parses the flags for thresholds.
   122  func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim map[string]string) ([]evictionapi.Threshold, error) {
   123  	results := []evictionapi.Threshold{}
   124  	hardThresholds, err := parseThresholdStatements(evictionHard)
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	results = append(results, hardThresholds...)
   129  	softThresholds, err := parseThresholdStatements(evictionSoft)
   130  	if err != nil {
   131  		return nil, err
   132  	}
   133  	gracePeriods, err := parseGracePeriods(evictionSoftGracePeriod)
   134  	if err != nil {
   135  		return nil, err
   136  	}
   137  	minReclaims, err := parseMinimumReclaims(evictionMinimumReclaim)
   138  	if err != nil {
   139  		return nil, err
   140  	}
   141  	for i := range softThresholds {
   142  		signal := softThresholds[i].Signal
   143  		period, found := gracePeriods[signal]
   144  		if !found {
   145  			return nil, fmt.Errorf("grace period must be specified for the soft eviction threshold %v", signal)
   146  		}
   147  		softThresholds[i].GracePeriod = period
   148  	}
   149  	results = append(results, softThresholds...)
   150  	for i := range results {
   151  		if minReclaim, ok := minReclaims[results[i].Signal]; ok {
   152  			results[i].MinReclaim = &minReclaim
   153  		}
   154  	}
   155  	for _, key := range allocatableConfig {
   156  		if key == kubetypes.NodeAllocatableEnforcementKey {
   157  			results = addAllocatableThresholds(results)
   158  			break
   159  		}
   160  	}
   161  	return results, nil
   162  }
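
        // Illustrative sketch (not part of helpers.go): how ParseThresholdConfig might be
        // called with flag-style input, using the same key/value syntax as the kubelet
        // --eviction-hard, --eviction-soft, --eviction-soft-grace-period and
        // --eviction-minimum-reclaim flags. The concrete values below are hypothetical.
        //
        //	hard := map[string]string{"memory.available": "100Mi", "nodefs.available": "10%"}
        //	soft := map[string]string{"memory.available": "300Mi"}
        //	grace := map[string]string{"memory.available": "30s"}
        //	minReclaim := map[string]string{"nodefs.available": "500Mi"}
        //	thresholds, err := ParseThresholdConfig(
        //		[]string{kubetypes.NodeAllocatableEnforcementKey}, hard, soft, grace, minReclaim)
        //	if err != nil {
        //		// a soft threshold without a grace period, an unknown signal, or a
        //		// malformed quantity surfaces here
        //	}
        //	// thresholds now holds the hard thresholds (GracePeriod == 0), the soft memory
        //	// threshold with its 30s grace period, the 500Mi MinReclaim attached to the
        //	// nodefs.available threshold, and a copy of the hard memory threshold under
        //	// SignalAllocatableMemoryAvailable because the allocatable key was passed.
        //	_ = thresholds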
   163  
   164  func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi.Threshold {
   165  	additionalThresholds := []evictionapi.Threshold{}
   166  	for _, threshold := range thresholds {
   167  		if threshold.Signal == evictionapi.SignalMemoryAvailable && isHardEvictionThreshold(threshold) {
   168  			// Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable
   169  			additionalThresholds = append(additionalThresholds, evictionapi.Threshold{
   170  				Signal:     evictionapi.SignalAllocatableMemoryAvailable,
   171  				Operator:   threshold.Operator,
   172  				Value:      threshold.Value,
   173  				MinReclaim: threshold.MinReclaim,
   174  			})
   175  		}
   176  	}
   177  	return append(append([]evictionapi.Threshold{}, thresholds...), additionalThresholds...)
   178  }
   179  
   180  // UpdateContainerFsThresholds adds containerfs hard/soft eviction settings
   181  // based on the container runtime's filesystem configuration. Because thresholds
   182  // are parsed from the evictionHard and evictionSoft limits, any explicit containerfs settings are overridden here.
   183  // If there is a single filesystem, the containerfs settings are the same as nodefs.
   184  // If there is a separate image filesystem holding both containers and images, the containerfs settings are the same as imagefs.
   185  func UpdateContainerFsThresholds(thresholds []evictionapi.Threshold, imageFs, separateContainerImageFs bool) ([]evictionapi.Threshold, error) {
   186  	hardNodeFsDisk := evictionapi.Threshold{}
   187  	softNodeFsDisk := evictionapi.Threshold{}
   188  	hardNodeINodeDisk := evictionapi.Threshold{}
   189  	softNodeINodeDisk := evictionapi.Threshold{}
   190  	hardImageFsDisk := evictionapi.Threshold{}
   191  	softImageFsDisk := evictionapi.Threshold{}
   192  	hardImageINodeDisk := evictionapi.Threshold{}
   193  	softImageINodeDisk := evictionapi.Threshold{}
   194  
   195  	hardContainerFsDisk := -1
   196  	softContainerFsDisk := -1
   197  	hardContainerFsINodes := -1
   198  	softContainerFsINodes := -1
   199  	// Find the imagefs and nodefs thresholds
   200  	var err error = nil
   201  	for idx, threshold := range thresholds {
   202  		if threshold.Signal == evictionapi.SignalImageFsAvailable && isHardEvictionThreshold(threshold) {
   203  			hardImageFsDisk = threshold
   204  		}
   205  		if threshold.Signal == evictionapi.SignalImageFsAvailable && !isHardEvictionThreshold(threshold) {
   206  			softImageFsDisk = threshold
   207  		}
   208  		if threshold.Signal == evictionapi.SignalImageFsInodesFree && isHardEvictionThreshold(threshold) {
   209  			hardImageINodeDisk = threshold
   210  		}
   211  		if threshold.Signal == evictionapi.SignalImageFsInodesFree && !isHardEvictionThreshold(threshold) {
   212  			softImageINodeDisk = threshold
   213  		}
   214  		if threshold.Signal == evictionapi.SignalNodeFsAvailable && isHardEvictionThreshold(threshold) {
   215  			hardNodeFsDisk = threshold
   216  		}
   217  		if threshold.Signal == evictionapi.SignalNodeFsAvailable && !isHardEvictionThreshold(threshold) {
   218  			softNodeFsDisk = threshold
   219  		}
   220  		if threshold.Signal == evictionapi.SignalNodeFsInodesFree && isHardEvictionThreshold(threshold) {
   221  			hardNodeINodeDisk = threshold
   222  		}
   223  		if threshold.Signal == evictionapi.SignalNodeFsInodesFree && !isHardEvictionThreshold(threshold) {
   224  			softNodeINodeDisk = threshold
   225  		}
   226  		// Explicit containerfs thresholds are not supported as user input, so we
   227  		// record an error and override the settings below: the containerfs limits
   228  		// must match either nodefs or imagefs, depending on the filesystem layout.
   229  		if threshold.Signal == evictionapi.SignalContainerFsAvailable && isHardEvictionThreshold(threshold) {
   230  			err = errors.Join(fmt.Errorf("found containerfs.available for hard eviction. ignoring"))
   231  			hardContainerFsDisk = idx
   232  		}
   233  		if threshold.Signal == evictionapi.SignalContainerFsAvailable && !isHardEvictionThreshold(threshold) {
   234  			err = errors.Join(fmt.Errorf("found containerfs.available for soft eviction. ignoring"))
   235  			softContainerFsDisk = idx
   236  		}
   237  		if threshold.Signal == evictionapi.SignalContainerFsInodesFree && isHardEvictionThreshold(threshold) {
   238  			err = errors.Join(fmt.Errorf("found containerfs.inodesFree for hard eviction. ignoring"))
   239  			hardContainerFsINodes = idx
   240  		}
   241  		if threshold.Signal == evictionapi.SignalContainerFsInodesFree && !isHardEvictionThreshold(threshold) {
   242  			err = errors.Join(fmt.Errorf("found containerfs.inodesFree for soft eviction. ignoring"))
   243  			softContainerFsINodes = idx
   244  		}
   245  	}
   246  	// Either split disk case (containerfs=nodefs) or single filesystem
   247  	if (imageFs && separateContainerImageFs) || (!imageFs && !separateContainerImageFs) {
   248  		if hardContainerFsDisk != -1 {
   249  			thresholds[hardContainerFsDisk] = evictionapi.Threshold{
   250  				Signal: evictionapi.SignalContainerFsAvailable, Operator: hardNodeFsDisk.Operator, Value: hardNodeFsDisk.Value, MinReclaim: hardNodeFsDisk.MinReclaim,
   251  			}
   252  		} else {
   253  			thresholds = append(thresholds, evictionapi.Threshold{
   254  				Signal:     evictionapi.SignalContainerFsAvailable,
   255  				Operator:   hardNodeFsDisk.Operator,
   256  				Value:      hardNodeFsDisk.Value,
   257  				MinReclaim: hardNodeFsDisk.MinReclaim,
   258  			})
   259  		}
   260  		if softContainerFsDisk != -1 {
   261  			thresholds[softContainerFsDisk] = evictionapi.Threshold{
   262  				Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softNodeFsDisk.GracePeriod, Operator: softNodeFsDisk.Operator, Value: softNodeFsDisk.Value, MinReclaim: softNodeFsDisk.MinReclaim,
   263  			}
   264  		} else {
   265  			thresholds = append(thresholds, evictionapi.Threshold{
   266  				Signal:      evictionapi.SignalContainerFsAvailable,
   267  				Operator:    softNodeFsDisk.Operator,
   268  				Value:       softNodeFsDisk.Value,
   269  				MinReclaim:  softNodeFsDisk.MinReclaim,
   270  				GracePeriod: softNodeFsDisk.GracePeriod,
   271  			})
   272  		}
   273  		if hardContainerFsINodes != -1 {
   274  			thresholds[hardContainerFsINodes] = evictionapi.Threshold{
   275  				Signal: evictionapi.SignalContainerFsInodesFree, Operator: hardNodeINodeDisk.Operator, Value: hardNodeINodeDisk.Value, MinReclaim: hardNodeINodeDisk.MinReclaim,
   276  			}
   277  		} else {
   278  			thresholds = append(thresholds, evictionapi.Threshold{
   279  				Signal:     evictionapi.SignalContainerFsInodesFree,
   280  				Operator:   hardNodeINodeDisk.Operator,
   281  				Value:      hardNodeINodeDisk.Value,
   282  				MinReclaim: hardNodeINodeDisk.MinReclaim,
   283  			})
   284  		}
   285  		if softContainerFsINodes != -1 {
   286  			thresholds[softContainerFsINodes] = evictionapi.Threshold{
   287  				Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: softNodeINodeDisk.GracePeriod, Operator: softNodeINodeDisk.Operator, Value: softNodeINodeDisk.Value, MinReclaim: softNodeINodeDisk.MinReclaim,
   288  			}
   289  		} else {
   290  			thresholds = append(thresholds, evictionapi.Threshold{
   291  				Signal:      evictionapi.SignalContainerFsInodesFree,
   292  				Operator:    softNodeINodeDisk.Operator,
   293  				Value:       softNodeINodeDisk.Value,
   294  				MinReclaim:  softNodeINodeDisk.MinReclaim,
   295  				GracePeriod: softNodeINodeDisk.GracePeriod,
   296  			})
   297  		}
   298  	}
   299  	// Separate image filesystem case
   300  	if imageFs && !separateContainerImageFs {
   301  		if hardContainerFsDisk != -1 {
   302  			thresholds[hardContainerFsDisk] = evictionapi.Threshold{
   303  				Signal: evictionapi.SignalContainerFsAvailable, Operator: hardImageFsDisk.Operator, Value: hardImageFsDisk.Value, MinReclaim: hardImageFsDisk.MinReclaim,
   304  			}
   305  		} else {
   306  			thresholds = append(thresholds, evictionapi.Threshold{
   307  				Signal:     evictionapi.SignalContainerFsAvailable,
   308  				Operator:   hardImageFsDisk.Operator,
   309  				Value:      hardImageFsDisk.Value,
   310  				MinReclaim: hardImageFsDisk.MinReclaim,
   311  			})
   312  		}
   313  		if softContainerFsDisk != -1 {
   314  			thresholds[softContainerFsDisk] = evictionapi.Threshold{
   315  				Signal: evictionapi.SignalContainerFsAvailable, GracePeriod: softImageFsDisk.GracePeriod, Operator: softImageFsDisk.Operator, Value: softImageFsDisk.Value, MinReclaim: softImageFsDisk.MinReclaim,
   316  			}
   317  		} else {
   318  			thresholds = append(thresholds, evictionapi.Threshold{
   319  				Signal:      evictionapi.SignalContainerFsAvailable,
   320  				Operator:    softImageFsDisk.Operator,
   321  				Value:       softImageFsDisk.Value,
   322  				MinReclaim:  softImageFsDisk.MinReclaim,
   323  				GracePeriod: softImageFsDisk.GracePeriod,
   324  			})
   325  		}
   326  		if hardContainerFsINodes != -1 {
   327  			thresholds[hardContainerFsINodes] = evictionapi.Threshold{
   328  				Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: hardImageINodeDisk.GracePeriod, Operator: hardImageINodeDisk.Operator, Value: hardImageINodeDisk.Value, MinReclaim: hardImageINodeDisk.MinReclaim,
   329  			}
   330  		} else {
   331  			thresholds = append(thresholds, evictionapi.Threshold{
   332  				Signal:     evictionapi.SignalContainerFsInodesFree,
   333  				Operator:   hardImageINodeDisk.Operator,
   334  				Value:      hardImageINodeDisk.Value,
   335  				MinReclaim: hardImageINodeDisk.MinReclaim,
   336  			})
   337  		}
   338  		if softContainerFsINodes != -1 {
   339  			thresholds[softContainerFsINodes] = evictionapi.Threshold{
   340  				Signal: evictionapi.SignalContainerFsInodesFree, GracePeriod: softImageINodeDisk.GracePeriod, Operator: softImageINodeDisk.Operator, Value: softImageINodeDisk.Value, MinReclaim: softImageINodeDisk.MinReclaim,
   341  			}
   342  		} else {
   343  			thresholds = append(thresholds, evictionapi.Threshold{
   344  				Signal:      evictionapi.SignalContainerFsInodesFree,
   345  				Operator:    softImageINodeDisk.Operator,
   346  				Value:       softImageINodeDisk.Value,
   347  				MinReclaim:  softImageINodeDisk.MinReclaim,
   348  				GracePeriod: softImageINodeDisk.GracePeriod,
   349  			})
   350  		}
   351  	}
   352  	return thresholds, err
   353  }
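
        // Illustrative sketch (not part of helpers.go): how containerfs thresholds are
        // derived from already-parsed nodefs/imagefs thresholds. The flag values are
        // hypothetical.
        //
        //	parsed, _ := ParseThresholdConfig(nil,
        //		map[string]string{"nodefs.available": "10%", "imagefs.available": "15%"},
        //		nil, nil, nil)
        //
        //	// Single filesystem (no dedicated imagefs): hard and soft containerfs.available
        //	// and containerfs.inodesFree entries are appended, copying whatever nodefs
        //	// settings exist (zero-valued where no nodefs threshold was configured).
        //	withContainerFs, err := UpdateContainerFsThresholds(parsed, false, false)
        //
        //	// With a dedicated image filesystem holding both images and writable layers
        //	// (imageFs=true, separateContainerImageFs=false), the appended containerfs
        //	// entries would copy the imagefs thresholds instead. Any containerfs.* values
        //	// supplied by the user are overridden and reported via the returned error.
        //	_, _ = withContainerFs, err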
   354  
   355  // parseThresholdStatements parses the input statements into a list of Threshold objects.
   356  func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) {
   357  	if len(statements) == 0 {
   358  		return nil, nil
   359  	}
   360  	results := []evictionapi.Threshold{}
   361  	for signal, val := range statements {
   362  		result, err := parseThresholdStatement(evictionapi.Signal(signal), val)
   363  		if err != nil {
   364  			return nil, err
   365  		}
   366  		if result != nil {
   367  			results = append(results, *result)
   368  		}
   369  	}
   370  	return results, nil
   371  }
   372  
   373  // parseThresholdStatement parses a threshold statement and returns a threshold,
   374  // or nil if the threshold should be ignored.
   375  func parseThresholdStatement(signal evictionapi.Signal, val string) (*evictionapi.Threshold, error) {
   376  	if !validSignal(signal) {
   377  		return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
   378  	}
   379  	operator := evictionapi.OpForSignal[signal]
   380  	if strings.HasSuffix(val, "%") {
   381  		// ignore 0% and 100%
   382  		if val == "0%" || val == "100%" {
   383  			return nil, nil
   384  		}
   385  		percentage, err := parsePercentage(val)
   386  		if err != nil {
   387  			return nil, err
   388  		}
   389  		if percentage < 0 {
   390  			return nil, fmt.Errorf("eviction percentage threshold %v must be >= 0%%: %s", signal, val)
   391  		}
   392  		// percentage is a float and should not be greater than 1 (100%)
   393  		if percentage > 1 {
   394  			return nil, fmt.Errorf("eviction percentage threshold %v must be <= 100%%: %s", signal, val)
   395  		}
   396  		return &evictionapi.Threshold{
   397  			Signal:   signal,
   398  			Operator: operator,
   399  			Value: evictionapi.ThresholdValue{
   400  				Percentage: percentage,
   401  			},
   402  		}, nil
   403  	}
   404  	quantity, err := resource.ParseQuantity(val)
   405  	if err != nil {
   406  		return nil, err
   407  	}
   408  	if quantity.Sign() < 0 || quantity.IsZero() {
   409  		return nil, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
   410  	}
   411  	return &evictionapi.Threshold{
   412  		Signal:   signal,
   413  		Operator: operator,
   414  		Value: evictionapi.ThresholdValue{
   415  			Quantity: &quantity,
   416  		},
   417  	}, nil
   418  }
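
        // Illustrative sketch (not part of helpers.go): the two value forms the parser
        // accepts, as they might be exercised from a test in this package.
        //
        //	// Percentage form: "10%" becomes a ThresholdValue with Percentage 0.1.
        //	pct, _ := parseThresholdStatement(evictionapi.SignalMemoryAvailable, "10%")
        //	// pct.Operator == evictionapi.OpLessThan, pct.Value.Percentage == 0.1
        //
        //	// Quantity form: "100Mi" becomes a ThresholdValue with an absolute Quantity.
        //	abs, _ := parseThresholdStatement(evictionapi.SignalMemoryAvailable, "100Mi")
        //	// abs.Value.Quantity.String() == "100Mi"
        //
        //	// "0%" and "100%" are deliberately ignored (nil threshold, nil error), while
        //	// zero, negative, or >100% values are rejected with an error.
        //	_, _ = pct, abs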
   419  
   420  // parsePercentage parses a string representing a percentage value
   421  func parsePercentage(input string) (float32, error) {
   422  	value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
   423  	if err != nil {
   424  		return 0, err
   425  	}
   426  	return float32(value) / 100, nil
   427  }
   428  
   429  // parseGracePeriods parses the grace period statements
   430  func parseGracePeriods(statements map[string]string) (map[evictionapi.Signal]time.Duration, error) {
   431  	if len(statements) == 0 {
   432  		return nil, nil
   433  	}
   434  	results := map[evictionapi.Signal]time.Duration{}
   435  	for signal, val := range statements {
   436  		signal := evictionapi.Signal(signal)
   437  		if !validSignal(signal) {
   438  			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
   439  		}
   440  		gracePeriod, err := time.ParseDuration(val)
   441  		if err != nil {
   442  			return nil, err
   443  		}
   444  		if gracePeriod < 0 {
   445  			return nil, fmt.Errorf("invalid eviction grace period specified: %v, must be a positive value", val)
   446  		}
   447  		results[signal] = gracePeriod
   448  	}
   449  	return results, nil
   450  }
   451  
   452  // parseMinimumReclaims parses the minimum reclaim statements
   453  func parseMinimumReclaims(statements map[string]string) (map[evictionapi.Signal]evictionapi.ThresholdValue, error) {
   454  	if len(statements) == 0 {
   455  		return nil, nil
   456  	}
   457  	results := map[evictionapi.Signal]evictionapi.ThresholdValue{}
   458  	for signal, val := range statements {
   459  		signal := evictionapi.Signal(signal)
   460  		if !validSignal(signal) {
   461  			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
   462  		}
   463  		if strings.HasSuffix(val, "%") {
   464  			percentage, err := parsePercentage(val)
   465  			if err != nil {
   466  				return nil, err
   467  			}
   468  			if percentage <= 0 {
   469  				return nil, fmt.Errorf("eviction percentage minimum reclaim %v must be positive: %s", signal, val)
   470  			}
   471  			results[signal] = evictionapi.ThresholdValue{
   472  				Percentage: percentage,
   473  			}
   474  			continue
   475  		}
   476  		quantity, err := resource.ParseQuantity(val)
   477  		if err != nil {
   478  			return nil, err
   479  		}
   480  		if quantity.Sign() < 0 {
   481  			return nil, fmt.Errorf("negative eviction minimum reclaim specified for %v", signal)
   482  		}
   483  		results[signal] = evictionapi.ThresholdValue{
   484  			Quantity: &quantity,
   485  		}
   486  	}
   487  	return results, nil
   488  }
   489  
   490  // diskUsage converts used bytes into a resource quantity.
   491  func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
   492  	if fsStats == nil || fsStats.UsedBytes == nil {
   493  		return &resource.Quantity{Format: resource.BinarySI}
   494  	}
   495  	usage := int64(*fsStats.UsedBytes)
   496  	return resource.NewQuantity(usage, resource.BinarySI)
   497  }
   498  
   499  // inodeUsage converts inodes consumed into a resource quantity.
   500  func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
   501  	if fsStats == nil || fsStats.InodesUsed == nil {
   502  		return &resource.Quantity{Format: resource.DecimalSI}
   503  	}
   504  	usage := int64(*fsStats.InodesUsed)
   505  	return resource.NewQuantity(usage, resource.DecimalSI)
   506  }
   507  
   508  // memoryUsage converts working set into a resource quantity.
   509  func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
   510  	if memStats == nil || memStats.WorkingSetBytes == nil {
   511  		return &resource.Quantity{Format: resource.BinarySI}
   512  	}
   513  	usage := int64(*memStats.WorkingSetBytes)
   514  	return resource.NewQuantity(usage, resource.BinarySI)
   515  }
   516  
   517  // processUsage converts process stats into a process count.
   518  func processUsage(processStats *statsapi.ProcessStats) uint64 {
   519  	if processStats == nil || processStats.ProcessCount == nil {
   520  		return 0
   521  	}
   522  	usage := uint64(*processStats.ProcessCount)
   523  	return usage
   524  }
   525  
   526  // localVolumeNames returns the set of volumes for the pod that are local
   527  // TODO: summary API should report what volumes consume local storage rather than hard-code here.
   528  func localVolumeNames(pod *v1.Pod) []string {
   529  	result := []string{}
   530  	for _, volume := range pod.Spec.Volumes {
   531  		if volume.HostPath != nil ||
   532  			volumeutils.IsLocalEphemeralVolume(volume) {
   533  			result = append(result, volume.Name)
   534  		}
   535  	}
   536  	return result
   537  }
   538  
   539  // containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
   540  func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList {
   541  	disk := resource.Quantity{Format: resource.BinarySI}
   542  	inodes := resource.Quantity{Format: resource.DecimalSI}
   543  	for _, container := range podStats.Containers {
   544  		if hasFsStatsType(statsToMeasure, fsStatsRoot) {
   545  			disk.Add(*diskUsage(container.Rootfs))
   546  			inodes.Add(*inodeUsage(container.Rootfs))
   547  		}
   548  		if hasFsStatsType(statsToMeasure, fsStatsLogs) {
   549  			disk.Add(*diskUsage(container.Logs))
   550  			inodes.Add(*inodeUsage(container.Logs))
   551  		}
   552  	}
   553  	return v1.ResourceList{
   554  		v1.ResourceEphemeralStorage: disk,
   555  		resourceInodes:              inodes,
   556  	}
   557  }
   558  
   559  // podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
   560  func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList {
   561  	disk := resource.Quantity{Format: resource.BinarySI}
   562  	inodes := resource.Quantity{Format: resource.DecimalSI}
   563  	for _, volumeName := range volumeNames {
   564  		for _, volumeStats := range podStats.VolumeStats {
   565  			if volumeStats.Name == volumeName {
   566  				disk.Add(*diskUsage(&volumeStats.FsStats))
   567  				inodes.Add(*inodeUsage(&volumeStats.FsStats))
   568  				break
   569  			}
   570  		}
   571  	}
   572  	return v1.ResourceList{
   573  		v1.ResourceEphemeralStorage: disk,
   574  		resourceInodes:              inodes,
   575  	}
   576  }
   577  
   578  // podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
   579  func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
   580  	disk := resource.Quantity{Format: resource.BinarySI}
   581  	inodes := resource.Quantity{Format: resource.DecimalSI}
   582  
   583  	containerUsageList := containerUsage(podStats, statsToMeasure)
   584  	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
   585  	inodes.Add(containerUsageList[resourceInodes])
   586  
   587  	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
   588  		volumeNames := localVolumeNames(pod)
   589  		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
   590  		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
   591  		inodes.Add(podLocalVolumeUsageList[resourceInodes])
   592  	}
   593  	return v1.ResourceList{
   594  		v1.ResourceEphemeralStorage: disk,
   595  		resourceInodes:              inodes,
   596  	}, nil
   597  }
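
        // Illustrative sketch (not part of helpers.go): aggregating a pod's disk and
        // inode usage from summary-API stats, as it might be exercised from a test in
        // this package. The byte counts are hypothetical.
        //
        //	u := func(n uint64) *uint64 { return &n }
        //	podStats := statsapi.PodStats{
        //		Containers: []statsapi.ContainerStats{{
        //			Name:   "app",
        //			Rootfs: &statsapi.FsStats{UsedBytes: u(50 << 20)},
        //			Logs:   &statsapi.FsStats{UsedBytes: u(10 << 20)},
        //		}},
        //	}
        //	usage, _ := podDiskUsage(podStats, &v1.Pod{}, []fsStatsType{fsStatsRoot, fsStatsLogs})
        //	disk := usage[v1.ResourceEphemeralStorage]
        //	// disk.Value() == 60 << 20 (rootfs + logs); the inode total is zero because
        //	// InodesUsed was not populated above.
        //	_ = disk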
   598  
   599  // formatThreshold formats a threshold for logging.
   600  func formatThreshold(threshold evictionapi.Threshold) string {
   601  	return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, threshold.Operator, evictionapi.ThresholdValue(threshold.Value), threshold.GracePeriod)
   602  }
   603  
   604  // cachedStatsFunc returns a statsFunc based on the provided pod stats.
   605  func cachedStatsFunc(podStats []statsapi.PodStats) statsFunc {
   606  	uid2PodStats := map[string]statsapi.PodStats{}
   607  	for i := range podStats {
   608  		uid2PodStats[podStats[i].PodRef.UID] = podStats[i]
   609  	}
   610  	return func(pod *v1.Pod) (statsapi.PodStats, bool) {
   611  		stats, found := uid2PodStats[string(pod.UID)]
   612  		return stats, found
   613  	}
   614  }
   615  
   616  // Cmp compares p1 and p2 and returns:
   617  //
   618  //	-1 if p1 <  p2
   619  //	 0 if p1 == p2
   620  //	+1 if p1 >  p2
   621  type cmpFunc func(p1, p2 *v1.Pod) int
   622  
   623  // multiSorter implements the Sort interface, sorting changes within.
   624  type multiSorter struct {
   625  	pods []*v1.Pod
   626  	cmp  []cmpFunc
   627  }
   628  
   629  // Sort sorts the argument slice according to the less functions passed to OrderedBy.
   630  func (ms *multiSorter) Sort(pods []*v1.Pod) {
   631  	ms.pods = pods
   632  	sort.Sort(ms)
   633  }
   634  
   635  // OrderedBy returns a Sorter that sorts using the cmp functions, in order.
   636  // Call its Sort method to sort the data.
   637  func orderedBy(cmp ...cmpFunc) *multiSorter {
   638  	return &multiSorter{
   639  		cmp: cmp,
   640  	}
   641  }
   642  
   643  // Len is part of sort.Interface.
   644  func (ms *multiSorter) Len() int {
   645  	return len(ms.pods)
   646  }
   647  
   648  // Swap is part of sort.Interface.
   649  func (ms *multiSorter) Swap(i, j int) {
   650  	ms.pods[i], ms.pods[j] = ms.pods[j], ms.pods[i]
   651  }
   652  
   653  // Less is part of sort.Interface.
   654  func (ms *multiSorter) Less(i, j int) bool {
   655  	p1, p2 := ms.pods[i], ms.pods[j]
   656  	var k int
   657  	for k = 0; k < len(ms.cmp)-1; k++ {
   658  		cmpResult := ms.cmp[k](p1, p2)
   659  		// p1 is less than p2
   660  		if cmpResult < 0 {
   661  			return true
   662  		}
   663  		// p1 is greater than p2
   664  		if cmpResult > 0 {
   665  			return false
   666  		}
   667  		// we don't know yet
   668  	}
   669  	// the last cmp func is the final decider
   670  	return ms.cmp[k](p1, p2) < 0
   671  }
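
        // Illustrative sketch (not part of helpers.go): composing comparison functions
        // with orderedBy. byName is a hypothetical tie-breaker shown only for
        // illustration, and pods is an assumed, already-populated []*v1.Pod; the
        // eviction ranking functions below compose priority with usage-based
        // comparators in exactly the same way.
        //
        //	byName := func(p1, p2 *v1.Pod) int { return strings.Compare(p1.Name, p2.Name) }
        //	orderedBy(priority, byName).Sort(pods)
        //	// pods is now ordered lowest priority first (priority returns -1 when p1
        //	// has the lower priority), with ties broken alphabetically by name.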
   672  
   673  // priority compares pods by Priority, if priority is enabled.
   674  func priority(p1, p2 *v1.Pod) int {
   675  	priority1 := corev1helpers.PodPriority(p1)
   676  	priority2 := corev1helpers.PodPriority(p2)
   677  	if priority1 == priority2 {
   678  		return 0
   679  	}
   680  	if priority1 > priority2 {
   681  		return 1
   682  	}
   683  	return -1
   684  }
   685  
   686  // exceedMemoryRequests compares whether or not pods' memory usage exceeds their requests
   687  func exceedMemoryRequests(stats statsFunc) cmpFunc {
   688  	return func(p1, p2 *v1.Pod) int {
   689  		p1Stats, p1Found := stats(p1)
   690  		p2Stats, p2Found := stats(p2)
   691  		if !p1Found || !p2Found {
   692  			// prioritize evicting the pod for which no stats were found
   693  			return cmpBool(!p1Found, !p2Found)
   694  		}
   695  
   696  		p1Memory := memoryUsage(p1Stats.Memory)
   697  		p2Memory := memoryUsage(p2Stats.Memory)
   698  		p1ExceedsRequests := p1Memory.Cmp(v1resource.GetResourceRequestQuantity(p1, v1.ResourceMemory)) == 1
   699  		p2ExceedsRequests := p2Memory.Cmp(v1resource.GetResourceRequestQuantity(p2, v1.ResourceMemory)) == 1
   700  		// prioritize evicting the pod which exceeds its requests
   701  		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
   702  	}
   703  }
   704  
   705  // memory compares pods by largest consumer of memory relative to request.
   706  func memory(stats statsFunc) cmpFunc {
   707  	return func(p1, p2 *v1.Pod) int {
   708  		p1Stats, p1Found := stats(p1)
   709  		p2Stats, p2Found := stats(p2)
   710  		if !p1Found || !p2Found {
   711  			// prioritize evicting the pod for which no stats were found
   712  			return cmpBool(!p1Found, !p2Found)
   713  		}
   714  
   715  		// adjust p1, p2 usage relative to the request (if any)
   716  		p1Memory := memoryUsage(p1Stats.Memory)
   717  		p1Request := v1resource.GetResourceRequestQuantity(p1, v1.ResourceMemory)
   718  		p1Memory.Sub(p1Request)
   719  
   720  		p2Memory := memoryUsage(p2Stats.Memory)
   721  		p2Request := v1resource.GetResourceRequestQuantity(p2, v1.ResourceMemory)
   722  		p2Memory.Sub(p2Request)
   723  
   724  		// prioritize evicting the pod which has the larger consumption of memory
   725  		return p2Memory.Cmp(*p1Memory)
   726  	}
   727  }
   728  
   729  // process compares pods by largest consumer of process number relative to request.
   730  func process(stats statsFunc) cmpFunc {
   731  	return func(p1, p2 *v1.Pod) int {
   732  		p1Stats, p1Found := stats(p1)
   733  		p2Stats, p2Found := stats(p2)
   734  		if !p1Found || !p2Found {
   735  			// prioritize evicting the pod for which no stats were found
   736  			return cmpBool(!p1Found, !p2Found)
   737  		}
   738  
   739  		p1Process := processUsage(p1Stats.ProcessStats)
   740  		p2Process := processUsage(p2Stats.ProcessStats)
   741  		// prioritize evicting the pod which has the larger number of processes
   742  		return int(p2Process - p1Process)
   743  	}
   744  }
   745  
   746  // exceedDiskRequests compares whether or not pods' disk usage exceeds their requests
   747  func exceedDiskRequests(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
   748  	return func(p1, p2 *v1.Pod) int {
   749  		p1Stats, p1Found := stats(p1)
   750  		p2Stats, p2Found := stats(p2)
   751  		if !p1Found || !p2Found {
   752  			// prioritize evicting the pod for which no stats were found
   753  			return cmpBool(!p1Found, !p2Found)
   754  		}
   755  
   756  		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
   757  		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
   758  		if p1Err != nil || p2Err != nil {
   759  			// prioritize evicting the pod which had an error getting stats
   760  			return cmpBool(p1Err != nil, p2Err != nil)
   761  		}
   762  
   763  		p1Disk := p1Usage[diskResource]
   764  		p2Disk := p2Usage[diskResource]
   765  		p1ExceedsRequests := p1Disk.Cmp(v1resource.GetResourceRequestQuantity(p1, diskResource)) == 1
   766  		p2ExceedsRequests := p2Disk.Cmp(v1resource.GetResourceRequestQuantity(p2, diskResource)) == 1
   767  		// prioritize evicting the pod which exceeds its requests
   768  		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
   769  	}
   770  }
   771  
   772  // disk compares pods by largest consumer of disk relative to request for the specified disk resource.
   773  func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
   774  	return func(p1, p2 *v1.Pod) int {
   775  		p1Stats, p1Found := stats(p1)
   776  		p2Stats, p2Found := stats(p2)
   777  		if !p1Found || !p2Found {
   778  			// prioritize evicting the pod for which no stats were found
   779  			return cmpBool(!p1Found, !p2Found)
   780  		}
   781  		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
   782  		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
   783  		if p1Err != nil || p2Err != nil {
   784  			// prioritize evicting the pod which had an error getting stats
   785  			return cmpBool(p1Err != nil, p2Err != nil)
   786  		}
   787  
   788  		// adjust p1, p2 usage relative to the request (if any)
   789  		p1Disk := p1Usage[diskResource]
   790  		p2Disk := p2Usage[diskResource]
   791  		p1Request := v1resource.GetResourceRequestQuantity(p1, v1.ResourceEphemeralStorage)
   792  		p1Disk.Sub(p1Request)
   793  		p2Request := v1resource.GetResourceRequestQuantity(p2, v1.ResourceEphemeralStorage)
   794  		p2Disk.Sub(p2Request)
   795  		// prioritize evicting the pod which has the larger consumption of disk
   796  		return p2Disk.Cmp(p1Disk)
   797  	}
   798  }
   799  
   800  // cmpBool compares booleans, placing true before false
   801  func cmpBool(a, b bool) int {
   802  	if a == b {
   803  		return 0
   804  	}
   805  	if !b {
   806  		return -1
   807  	}
   808  	return 1
   809  }
   810  
   811  // rankMemoryPressure orders the input pods for eviction in response to memory pressure.
   812  // It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
   813  // finally by memory usage above requests.
   814  func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
   815  	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
   816  }
   817  
   818  // rankPIDPressure orders the input pods by priority in response to PID pressure.
   819  func rankPIDPressure(pods []*v1.Pod, stats statsFunc) {
   820  	orderedBy(priority, process(stats)).Sort(pods)
   821  }
   822  
   823  // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
   824  func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
   825  	return func(pods []*v1.Pod, stats statsFunc) {
   826  		orderedBy(exceedDiskRequests(stats, fsStatsToMeasure, diskResource), priority, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
   827  	}
   828  }
   829  
   830  // byEvictionPriority implements sort.Interface for []evictionapi.Threshold.
   831  type byEvictionPriority []evictionapi.Threshold
   832  
   833  func (a byEvictionPriority) Len() int      { return len(a) }
   834  func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
   835  
   836  // Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
   837  func (a byEvictionPriority) Less(i, j int) bool {
   838  	_, jSignalHasResource := signalToResource[a[j].Signal]
   839  	return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
   840  }
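
        // Illustrative sketch (not part of helpers.go): ordering met thresholds so that
        // memory-backed signals are handled first.
        //
        //	thresholds := []evictionapi.Threshold{
        //		{Signal: evictionapi.SignalNodeFsAvailable},
        //		{Signal: evictionapi.SignalMemoryAvailable},
        //	}
        //	sort.Sort(byEvictionPriority(thresholds))
        //	// thresholds[0].Signal == evictionapi.SignalMemoryAvailable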
   841  
   842  // makeSignalObservations derives observations using the specified summary provider.
   843  func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) {
   844  	// build the function to work against for pod stats
   845  	statsFunc := cachedStatsFunc(summary.Pods)
   846  	// build an evaluation context for current eviction signals
   847  	result := signalObservations{}
   848  
   849  	if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
   850  		result[evictionapi.SignalMemoryAvailable] = signalObservation{
   851  			available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
   852  			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
   853  			time:      memory.Time,
   854  		}
   855  	}
   856  	if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
   857  		klog.ErrorS(err, "Eviction manager: failed to construct signal", "signal", evictionapi.SignalAllocatableMemoryAvailable)
   858  	} else {
   859  		if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
   860  			result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
   861  				available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
   862  				capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
   863  				time:      memory.Time,
   864  			}
   865  		}
   866  	}
   867  	if nodeFs := summary.Node.Fs; nodeFs != nil {
   868  		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
   869  			result[evictionapi.SignalNodeFsAvailable] = signalObservation{
   870  				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
   871  				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
   872  				time:      nodeFs.Time,
   873  			}
   874  		}
   875  		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
   876  			result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
   877  				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
   878  				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
   879  				time:      nodeFs.Time,
   880  			}
   881  		}
   882  	}
   883  	if summary.Node.Runtime != nil {
   884  		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
   885  			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
   886  				result[evictionapi.SignalImageFsAvailable] = signalObservation{
   887  					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
   888  					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
   889  					time:      imageFs.Time,
   890  				}
   891  			}
   892  			if imageFs.InodesFree != nil && imageFs.Inodes != nil {
   893  				result[evictionapi.SignalImageFsInodesFree] = signalObservation{
   894  					available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
   895  					capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
   896  					time:      imageFs.Time,
   897  				}
   898  			}
   899  		}
   900  		if containerFs := summary.Node.Runtime.ContainerFs; containerFs != nil {
   901  			if containerFs.AvailableBytes != nil && containerFs.CapacityBytes != nil {
   902  				result[evictionapi.SignalContainerFsAvailable] = signalObservation{
   903  					available: resource.NewQuantity(int64(*containerFs.AvailableBytes), resource.BinarySI),
   904  					capacity:  resource.NewQuantity(int64(*containerFs.CapacityBytes), resource.BinarySI),
   905  					time:      containerFs.Time,
   906  				}
   907  			}
   908  			if containerFs.InodesFree != nil && containerFs.Inodes != nil {
   909  				result[evictionapi.SignalContainerFsInodesFree] = signalObservation{
   910  					available: resource.NewQuantity(int64(*containerFs.InodesFree), resource.DecimalSI),
   911  					capacity:  resource.NewQuantity(int64(*containerFs.Inodes), resource.DecimalSI),
   912  					time:      containerFs.Time,
   913  				}
   914  			}
   915  		}
   916  	}
   917  	if rlimit := summary.Node.Rlimit; rlimit != nil {
   918  		if rlimit.NumOfRunningProcesses != nil && rlimit.MaxPID != nil {
   919  			available := int64(*rlimit.MaxPID) - int64(*rlimit.NumOfRunningProcesses)
   920  			result[evictionapi.SignalPIDAvailable] = signalObservation{
   921  				available: resource.NewQuantity(available, resource.DecimalSI),
   922  				capacity:  resource.NewQuantity(int64(*rlimit.MaxPID), resource.DecimalSI),
   923  				time:      rlimit.Time,
   924  			}
   925  		}
   926  	}
   927  	return result, statsFunc
   928  }
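
        // Illustrative sketch (not part of helpers.go): deriving observations from a
        // summary-API snapshot, as it might be exercised from a test in this package.
        // The byte counts are hypothetical; only the node memory stats are set, so only
        // the memory.available signal is observed.
        //
        //	u := func(n uint64) *uint64 { return &n }
        //	summary := &statsapi.Summary{
        //		Node: statsapi.NodeStats{
        //			Memory: &statsapi.MemoryStats{
        //				AvailableBytes:  u(200 << 20),
        //				WorkingSetBytes: u(800 << 20),
        //			},
        //		},
        //	}
        //	observations, statsFn := makeSignalObservations(summary)
        //	obs := observations[evictionapi.SignalMemoryAvailable]
        //	// obs.available is 200Mi and obs.capacity is available+workingSet (1000Mi);
        //	// statsFn looks up per-pod stats by UID (none were supplied here).
        //	_, _ = obs, statsFn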
   929  
   930  func getSysContainer(sysContainers []statsapi.ContainerStats, name string) (*statsapi.ContainerStats, error) {
   931  	for _, cont := range sysContainers {
   932  		if cont.Name == name {
   933  			return &cont, nil
   934  		}
   935  	}
   936  	return nil, fmt.Errorf("system container %q not found in metrics", name)
   937  }
   938  
   939  // thresholdsMet returns the set of thresholds that were met independent of grace period
   940  func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold {
   941  	results := []evictionapi.Threshold{}
   942  	for i := range thresholds {
   943  		threshold := thresholds[i]
   944  		observed, found := observations[threshold.Signal]
   945  		if !found {
   946  			klog.InfoS("Eviction manager: no observation found for eviction signal", "signal", threshold.Signal)
   947  			continue
   948  		}
   949  		// determine if we have met the specified threshold
   950  		thresholdMet := false
   951  		quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
   952  		// if enforceMinReclaim is specified, we compare relative to value - minreclaim
   953  		if enforceMinReclaim && threshold.MinReclaim != nil {
   954  			quantity.Add(*evictionapi.GetThresholdQuantity(*threshold.MinReclaim, observed.capacity))
   955  		}
   956  		thresholdResult := quantity.Cmp(*observed.available)
   957  		switch threshold.Operator {
   958  		case evictionapi.OpLessThan:
   959  			thresholdMet = thresholdResult > 0
   960  		}
   961  		if thresholdMet {
   962  			results = append(results, threshold)
   963  		}
   964  	}
   965  	return results
   966  }
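
        // Illustrative sketch (not part of helpers.go): evaluating a hard memory
        // threshold against an observation, as it might be exercised from a test in
        // this package. The quantities are hypothetical.
        //
        //	hundredMi := resource.MustParse("100Mi")
        //	threshold := evictionapi.Threshold{
        //		Signal:   evictionapi.SignalMemoryAvailable,
        //		Operator: evictionapi.OpLessThan,
        //		Value:    evictionapi.ThresholdValue{Quantity: &hundredMi},
        //	}
        //	observations := signalObservations{
        //		evictionapi.SignalMemoryAvailable: {
        //			available: resource.NewQuantity(50<<20, resource.BinarySI), // 50Mi observed
        //			capacity:  resource.NewQuantity(4<<30, resource.BinarySI),  // 4Gi total
        //		},
        //	}
        //	met := thresholdsMet([]evictionapi.Threshold{threshold}, observations, false)
        //	// len(met) == 1: 50Mi available is below the 100Mi threshold. With
        //	// enforceMinReclaim=true the comparison uses value+minReclaim instead.
        //	_ = met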
   967  
   968  func debugLogObservations(logPrefix string, observations signalObservations) {
   969  	klogV := klog.V(3)
   970  	if !klogV.Enabled() {
   971  		return
   972  	}
   973  	for k, v := range observations {
   974  		if !v.time.IsZero() {
   975  			klogV.InfoS("Eviction manager:", "log", logPrefix, "signal", k, "resourceName", signalToResource[k], "available", v.available, "capacity", v.capacity, "time", v.time)
   976  		} else {
   977  			klogV.InfoS("Eviction manager:", "log", logPrefix, "signal", k, "resourceName", signalToResource[k], "available", v.available, "capacity", v.capacity)
   978  		}
   979  	}
   980  }
   981  
   982  func debugLogThresholdsWithObservation(logPrefix string, thresholds []evictionapi.Threshold, observations signalObservations) {
   983  	klogV := klog.V(3)
   984  	if !klogV.Enabled() {
   985  		return
   986  	}
   987  	for i := range thresholds {
   988  		threshold := thresholds[i]
   989  		observed, found := observations[threshold.Signal]
   990  		if found {
   991  			quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
   992  			klogV.InfoS("Eviction manager: threshold observed resource", "log", logPrefix, "signal", threshold.Signal, "resourceName", signalToResource[threshold.Signal], "quantity", quantity, "available", observed.available)
   993  		} else {
   994  			klogV.InfoS("Eviction manager: threshold had no observation", "log", logPrefix, "signal", threshold.Signal)
   995  		}
   996  	}
   997  }
   998  
   999  func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold {
  1000  	results := []evictionapi.Threshold{}
  1001  	for i := range thresholds {
  1002  		threshold := thresholds[i]
  1003  		observed, found := observations[threshold.Signal]
  1004  		if !found {
  1005  			klog.InfoS("Eviction manager: no observation found for eviction signal", "signal", threshold.Signal)
  1006  			continue
  1007  		}
  1008  		last, found := lastObservations[threshold.Signal]
  1009  		if !found || observed.time.IsZero() || observed.time.After(last.time.Time) {
  1010  			results = append(results, threshold)
  1011  		}
  1012  	}
  1013  	return results
  1014  }
  1015  
  1016  // thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when the active set of thresholds was initially met.
  1017  func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
  1018  	results := thresholdsObservedAt{}
  1019  	for i := range thresholds {
  1020  		observedAt, found := lastObservedAt[thresholds[i]]
  1021  		if !found {
  1022  			observedAt = now
  1023  		}
  1024  		results[thresholds[i]] = observedAt
  1025  	}
  1026  	return results
  1027  }
  1028  
  1029  // thresholdsMetGracePeriod returns the set of thresholds that have satisfied their associated grace period
  1030  func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
  1031  	results := []evictionapi.Threshold{}
  1032  	for threshold, at := range observedAt {
  1033  		duration := now.Sub(at)
  1034  		if duration < threshold.GracePeriod {
  1035  			klog.V(2).InfoS("Eviction manager: eviction criteria not yet met", "threshold", formatThreshold(threshold), "duration", duration)
  1036  			continue
  1037  		}
  1038  		results = append(results, threshold)
  1039  	}
  1040  	return results
  1041  }
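
        // Illustrative sketch (not part of helpers.go): a soft threshold only "fires"
        // once it has been continuously observed for its grace period.
        //
        //	soft := evictionapi.Threshold{
        //		Signal:      evictionapi.SignalMemoryAvailable,
        //		GracePeriod: 30 * time.Second,
        //	}
        //	observedAt := thresholdsObservedAt{soft: time.Now().Add(-10 * time.Second)}
        //	fired := thresholdsMetGracePeriod(observedAt, time.Now())
        //	// len(fired) == 0 after 10s of pressure; it becomes 1 once 30s have elapsed.
        //	_ = fired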
  1042  
  1043  // nodeConditions returns the set of node conditions associated with a threshold
  1044  func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType {
  1045  	results := []v1.NodeConditionType{}
  1046  	for _, threshold := range thresholds {
  1047  		if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
  1048  			if !hasNodeCondition(results, nodeCondition) {
  1049  				results = append(results, nodeCondition)
  1050  			}
  1051  		}
  1052  	}
  1053  	return results
  1054  }
  1055  
  1056  // nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
  1057  func nodeConditionsLastObservedAt(nodeConditions []v1.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
  1058  	results := nodeConditionsObservedAt{}
  1059  	// the input conditions were observed "now"
  1060  	for i := range nodeConditions {
  1061  		results[nodeConditions[i]] = now
  1062  	}
  1063  	// the conditions that were not observed now are merged in with their old time
  1064  	for key, value := range lastObservedAt {
  1065  		_, found := results[key]
  1066  		if !found {
  1067  			results[key] = value
  1068  		}
  1069  	}
  1070  	return results
  1071  }
  1072  
  1073  // nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
  1074  func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []v1.NodeConditionType {
  1075  	results := []v1.NodeConditionType{}
  1076  	for nodeCondition, at := range observedAt {
  1077  		duration := now.Sub(at)
  1078  		if duration < period {
  1079  			results = append(results, nodeCondition)
  1080  		}
  1081  	}
  1082  	return results
  1083  }
  1084  
  1085  // hasFsStatsType returns true if the fsStat is in the input list
  1086  func hasFsStatsType(inputs []fsStatsType, item fsStatsType) bool {
  1087  	for _, input := range inputs {
  1088  		if input == item {
  1089  			return true
  1090  		}
  1091  	}
  1092  	return false
  1093  }
  1094  
  1095  // hasNodeCondition returns true if the node condition is in the input list
  1096  func hasNodeCondition(inputs []v1.NodeConditionType, item v1.NodeConditionType) bool {
  1097  	for _, input := range inputs {
  1098  		if input == item {
  1099  			return true
  1100  		}
  1101  	}
  1102  	return false
  1103  }
  1104  
  1105  // mergeThresholds will merge both threshold lists eliminating duplicates.
  1106  func mergeThresholds(inputsA []evictionapi.Threshold, inputsB []evictionapi.Threshold) []evictionapi.Threshold {
  1107  	results := inputsA
  1108  	for _, threshold := range inputsB {
  1109  		if !hasThreshold(results, threshold) {
  1110  			results = append(results, threshold)
  1111  		}
  1112  	}
  1113  	return results
  1114  }
  1115  
  1116  // hasThreshold returns true if the threshold is in the input list
  1117  func hasThreshold(inputs []evictionapi.Threshold, item evictionapi.Threshold) bool {
  1118  	for _, input := range inputs {
  1119  		if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) {
  1120  			return true
  1121  		}
  1122  	}
  1123  	return false
  1124  }
  1125  
  1126  // compareThresholdValue returns true if the two thresholdValue objects are logically the same
  1127  func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.ThresholdValue) bool {
  1128  	if a.Quantity != nil {
  1129  		if b.Quantity == nil {
  1130  			return false
  1131  		}
  1132  		return a.Quantity.Cmp(*b.Quantity) == 0
  1133  	}
  1134  	if b.Quantity != nil {
  1135  		return false
  1136  	}
  1137  	return a.Percentage == b.Percentage
  1138  }
  1139  
  1140  // isHardEvictionThreshold returns true if eviction should immediately occur
  1141  func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
  1142  	return threshold.GracePeriod == time.Duration(0)
  1143  }
  1144  
  1145  func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool {
  1146  	return threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable
  1147  }
  1148  
  1149  // buildSignalToRankFunc returns ranking functions associated with resources
  1150  func buildSignalToRankFunc(withImageFs bool, imageContainerSplitFs bool) map[evictionapi.Signal]rankFunc {
  1151  	signalToRankFunc := map[evictionapi.Signal]rankFunc{
  1152  		evictionapi.SignalMemoryAvailable:            rankMemoryPressure,
  1153  		evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
  1154  		evictionapi.SignalPIDAvailable:               rankPIDPressure,
  1155  	}
  1156  	// usage of an imagefs is optional
  1157  	// If there is a dedicated image filesystem (images and containers on the same disk),
  1158  	// we treat it as a plain separate imagefs.
  1159  	if withImageFs && !imageContainerSplitFs {
  1160  		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
  1161  		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
  1162  		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  1163  		// with an imagefs, imagefs pod rank func for eviction includes rootfs and images
  1164  		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, v1.ResourceEphemeralStorage)
  1165  		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsImages}, resourceInodes)
  1166  		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalImageFsAvailable]
  1167  		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalImageFsInodesFree]
  1168  
  1169  		// If imagefs and containerfs are on separate disks,
  1170  		// we track the writable layer in the containerfs (nodefs) signals.
  1171  	} else if withImageFs && imageContainerSplitFs {
  1172  		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
  1173  		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, v1.ResourceEphemeralStorage)
  1174  		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot}, resourceInodes)
  1175  		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable]
  1176  		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree]
  1177  		// with split filesystems, imagefs pod rank func for eviction only includes images
  1178  
  1179  		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, v1.ResourceEphemeralStorage)
  1180  		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages}, resourceInodes)
  1181  		// If the image fs is not on a separate disk from root but the container fs is
  1182  	} else {
  1183  		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
  1184  		// since imagefs and nodefs share a common device, they share common ranking functions.
  1185  		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
  1186  		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  1187  		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
  1188  		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsImages, fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
  1189  		signalToRankFunc[evictionapi.SignalContainerFsAvailable] = signalToRankFunc[evictionapi.SignalNodeFsAvailable]
  1190  		signalToRankFunc[evictionapi.SignalContainerFsInodesFree] = signalToRankFunc[evictionapi.SignalNodeFsInodesFree]
  1191  	}
  1192  	return signalToRankFunc
  1193  }
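
// exampleRankForSignal is a minimal sketch of how a caller might consult the
// map built above when a signal fires; activePods and podStats are
// hypothetical inputs supplied by the eviction manager.
func exampleRankForSignal(activePods []*v1.Pod, podStats statsFunc) {
	// dedicated imagefs that also holds the container writable layer
	ranking := buildSignalToRankFunc(true, false)
	if rank, ok := ranking[evictionapi.SignalNodeFsAvailable]; ok {
		// sorts activePods in place, most-evictable first
		rank(activePods, podStats)
	}
}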
  1194  
  1195  // PodIsEvicted returns true if the reported pod status is due to an eviction.
  1196  func PodIsEvicted(podStatus v1.PodStatus) bool {
  1197  	return podStatus.Phase == v1.PodFailed && podStatus.Reason == Reason
  1198  }
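
// examplePodIsEvicted is a short hypothetical sketch: only pods that failed
// with this package's eviction Reason are reported as evicted.
func examplePodIsEvicted() {
	evicted := v1.PodStatus{Phase: v1.PodFailed, Reason: Reason}
	other := v1.PodStatus{Phase: v1.PodFailed, Reason: "DeadlineExceeded"}
	fmt.Println(PodIsEvicted(evicted), PodIsEvicted(other)) // true false
}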
  1199  
  1200  // buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
  1201  func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool, splitContainerImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
  1202  	signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
  1203  	// usage of an imagefs is optional
  1204  	if withImageFs && !splitContainerImageFs {
  1205  		// with an imagefs, nodefs pressure should just delete logs, so no node-level reclaim functions are registered
  1206  		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
  1207  		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
  1208  		// with an imagefs, imagefs pressure should delete unused images
  1209  		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1210  		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1211  		// when imagefs and container fs are on separate disks:
  1212  		// container gc runs on containerfs pressure
  1213  		// image gc runs on imagefs pressure
  1214  	} else if withImageFs && splitContainerImageFs {
  1215  		// with an imagefs, imagefs pressure should delete unused images
  1216  		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{imageGC.DeleteUnusedImages}
  1217  		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{imageGC.DeleteUnusedImages}
  1218  		// with a split fs and imagefs, containerfs pressure should delete unused containers
  1219  		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers}
  1220  		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers}
  1221  	} else {
  1222  		// without an imagefs, nodefs pressure should delete unused containers and images
  1223  		// since imagefs, containerfs and nodefs share a common device, they share common reclaim functions
  1224  		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1225  		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1226  		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1227  		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
  1228  	}
  1229  	return signalToReclaimFunc
  1230  }
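
// exampleReclaimForSignal is a minimal sketch of how the eviction manager
// might run the node-level reclaim functions registered for a signal before
// falling back to pod eviction. It assumes the package's ImageGC and
// ContainerGC implementations are supplied by the caller, that nodeReclaimFunc
// takes a context and returns an error, and that "context" is imported.
func exampleReclaimForSignal(ctx context.Context, imageGC ImageGC, containerGC ContainerGC) {
	// dedicated imagefs shared with the container writable layer
	reclaimFuncs := buildSignalToNodeReclaimFuncs(imageGC, containerGC, true, false)
	for _, reclaim := range reclaimFuncs[evictionapi.SignalImageFsAvailable] {
		if err := reclaim(ctx); err != nil {
			klog.InfoS("Node-level reclaim failed, continuing", "err", err)
		}
	}
}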
  1231  
  1232  // evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
  1233  func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc, thresholds []evictionapi.Threshold, observations signalObservations) (message string, annotations map[string]string) {
  1234  	annotations = make(map[string]string)
  1235  	message = fmt.Sprintf(nodeLowMessageFmt, resourceToReclaim)
  1236  	quantity, available := getThresholdMetInfo(resourceToReclaim, thresholds, observations)
  1237  	if quantity != nil && available != nil {
  1238  		message += fmt.Sprintf(thresholdMetMessageFmt, quantity, available)
  1239  	}
  1240  	containers := []string{}
  1241  	containerUsage := []string{}
  1242  	podStats, ok := stats(pod)
  1243  	if !ok {
  1244  		return
  1245  	}
  1246  	for _, containerStats := range podStats.Containers {
  1247  		for _, container := range pod.Spec.Containers {
  1248  			if container.Name == containerStats.Name {
  1249  				requests := container.Resources.Requests[resourceToReclaim]
  1250  				if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) &&
  1251  					(resourceToReclaim == v1.ResourceMemory || resourceToReclaim == v1.ResourceCPU) {
  1252  					if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
  1253  						requests = cs.AllocatedResources[resourceToReclaim]
  1254  					}
  1255  				}
  1256  				var usage *resource.Quantity
  1257  				switch resourceToReclaim {
  1258  				case v1.ResourceEphemeralStorage:
  1259  					if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
  1260  						usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
  1261  					}
  1262  				case v1.ResourceMemory:
  1263  					if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
  1264  						usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
  1265  					}
  1266  				}
  1267  				if usage != nil && usage.Cmp(requests) > 0 {
  1268  					message += fmt.Sprintf(containerMessageFmt, container.Name, usage.String(), requests.String(), resourceToReclaim)
  1269  					containers = append(containers, container.Name)
  1270  					containerUsage = append(containerUsage, usage.String())
  1271  				}
  1272  			}
  1273  		}
  1274  	}
  1275  	annotations[OffendingContainersKey] = strings.Join(containers, ",")
  1276  	annotations[OffendingContainersUsageKey] = strings.Join(containerUsage, ",")
  1277  	annotations[StarvedResourceKey] = string(resourceToReclaim)
  1278  	return
  1279  }
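
// exampleEvictionMessage is a hypothetical sketch of building an eviction
// message for memory pressure; the pod, its request, and the stats closure
// are made-up stand-ins for what the eviction manager would supply.
func exampleEvictionMessage() {
	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Name: "app",
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("100Mi")},
		},
	}}}}
	workingSet := uint64(200 * 1024 * 1024) // 200Mi, above the 100Mi request
	stats := func(*v1.Pod) (statsapi.PodStats, bool) {
		return statsapi.PodStats{Containers: []statsapi.ContainerStats{{
			Name:   "app",
			Memory: &statsapi.MemoryStats{WorkingSetBytes: &workingSet},
		}}}, true
	}
	msg, annotations := evictionMessage(v1.ResourceMemory, pod, stats, nil, nil)
	fmt.Println(msg)                                 // names the starved resource and the offending container
	fmt.Println(annotations[OffendingContainersKey]) // "app"
}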
  1280  
  1281  // getThresholdMetInfo gets the threshold quantity and available value for the resource resourceToReclaim
  1282  func getThresholdMetInfo(resourceToReclaim v1.ResourceName, thresholds []evictionapi.Threshold, observations signalObservations) (quantity *resource.Quantity, available *resource.Quantity) {
  1283  	for i := range thresholds {
  1284  		threshold := thresholds[i]
  1285  		if signalToResource[threshold.Signal] == resourceToReclaim {
  1286  			observed, found := observations[threshold.Signal]
  1287  			if found {
  1288  				quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
  1289  				return quantity, observed.available
  1290  			}
  1291  		}
  1292  	}
  1293  	return nil, nil
  1294  }
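
// exampleThresholdQuantity is a worked sketch with hypothetical numbers of the
// resolution step used above: a percentage threshold is converted to an
// absolute quantity against the observed capacity.
func exampleThresholdQuantity() {
	capacity := resource.NewQuantity(10*1024*1024*1024, resource.BinarySI) // 10Gi nodefs capacity
	value := evictionapi.ThresholdValue{Percentage: 0.5}                   // e.g. nodefs.available<50%
	quantity := evictionapi.GetThresholdQuantity(value, capacity)
	fmt.Println(quantity.String()) // 5Gi, i.e. 50% of 10Gi
}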
  1295  
