...

Source file src/k8s.io/kubernetes/pkg/scheduler/framework/interface.go

Documentation: k8s.io/kubernetes/pkg/scheduler/framework

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // This file defines the scheduling framework plugin interfaces.
    18  
    19  package framework
    20  
import (
	"context"
	"errors"
	"math"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/events"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)
    42  
// NodeScoreList is a list of NodeScore entries, i.e. node names paired with
// their scores.
type NodeScoreList []NodeScore
    45  
// NodeScore pairs a node name with the score given to that node.
type NodeScore struct {
	// Name is the node name.
	Name string
	// Score is the score assigned to the node.
	Score int64
}
    51  
// NodeToStatusMap maps a node name to the Status recorded for that node.
type NodeToStatusMap map[string]*Status
    54  
// NodePluginScores holds, for a single node, the individual scores given by
// plugins and extenders together with their total.
type NodePluginScores struct {
	// Name is the node name.
	Name string
	// Scores holds the scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total of the scores in Scores.
	TotalScore int64
}
    64  
// PluginScore pairs a plugin/extender name with the score it produced.
type PluginScore struct {
	// Name is the name of the plugin or extender.
	Name string
	// Score is the score given by that plugin or extender.
	Score int64
}
    71  
    72  // Code is the Status code/type which is returned from plugins.
    73  type Code int
    74  
    75  // These are predefined codes used in a Status.
    76  // Note: when you add a new status, you have to add it in `codes` slice below.
    77  const (
    78  	// Success means that plugin ran correctly and found pod schedulable.
    79  	// NOTE: A nil status is also considered as "Success".
    80  	Success Code = iota
    81  	// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
    82  	// Plugin shouldn't return this code for expected failures, like Unschedulable.
    83  	// Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
    84  	// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
    85  	Error
    86  	// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
    87  	// If it's returned from PreFilter or Filter, the scheduler might attempt to
    88  	// run other postFilter plugins like preemption to get this pod scheduled.
    89  	// Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins.
    90  	// The accompanying status message should explain why the pod is unschedulable.
    91  	//
    92  	// We regard the backoff as a penalty of wasting the scheduling cycle.
    93  	// When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling,
    94  	// the Pod goes through backoff.
    95  	Unschedulable
    96  	// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
    97  	// other postFilter plugins like preemption would not change anything.
    98  	// See the comment on PostFilter interface for more details about how PostFilter should handle this status.
    99  	// Plugins should return Unschedulable if it is possible that the pod can get scheduled
   100  	// after running other postFilter plugins.
   101  	// The accompanying status message should explain why the pod is unschedulable.
   102  	//
   103  	// We regard the backoff as a penalty of wasting the scheduling cycle.
   104  	// When the scheduling queue requeues Pods, which was rejected with UnschedulableAndUnresolvable in the last scheduling,
   105  	// the Pod goes through backoff.
   106  	UnschedulableAndUnresolvable
   107  	// Wait is used when a Permit plugin finds a pod scheduling should wait.
   108  	Wait
   109  	// Skip is used in the following scenarios:
   110  	// - when a Bind plugin chooses to skip binding.
   111  	// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
   112  	// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
   113  	Skip
   114  	// Pending means that the scheduling process is finished successfully,
   115  	// but the plugin wants to stop the scheduling cycle/binding cycle here.
   116  	//
   117  	// For example, the DRA plugin sometimes needs to wait for the external device driver
   118  	// to provision the resource for the Pod.
   119  	// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
   120  	// because in this case, the scheduler decides where the Pod can go successfully,
   121  	// but we need to wait for the external component to do something based on that scheduling result.
   122  	//
   123  	// We regard the backoff as a penalty of wasting the scheduling cycle.
   124  	// In the case of returning Pending, we cannot say the scheduling cycle is wasted
   125  	// because the scheduling result is used to proceed the Pod's scheduling forward,
   126  	// that particular scheduling cycle is failed though.
   127  	// So, Pods rejected by such reasons don't need to suffer a penalty (backoff).
   128  	// When the scheduling queue requeues Pods, which was rejected with Pending in the last scheduling,
   129  	// the Pod goes to activeQ directly ignoring backoff.
   130  	Pending
   131  )
   132  
   133  // This list should be exactly the same as the codes iota defined above in the same order.
   134  var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"}
   135  
   136  func (c Code) String() string {
   137  	return codes[c]
   138  }
   139  
// Bounds for the scores handled by the framework.
const (
	// MaxNodeScore is the maximum score a Score plugin is expected to return.
	MaxNodeScore int64 = 100

	// MinNodeScore is the minimum score a Score plugin is expected to return.
	MinNodeScore int64 = 0

	// MaxTotalScore is the maximum total score (the largest int64 value).
	MaxTotalScore int64 = math.MaxInt64
)
   150  
// PodsToActivateKey is a reserved state key for stashing pods.
// If the stashed pods are present in unschedulablePods or backoffQ, they will be
// activated (i.e., moved to activeQ) in two phases:
// - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated)
// - end of a binding cycle if it succeeds
var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"
   157  
// PodsToActivate stores pods to be activated.
type PodsToActivate struct {
	// Mutex is embedded so that Lock/Unlock are available directly on the value.
	// NOTE(review): presumably callers must hold it while touching Map — confirm against users.
	sync.Mutex
	// Map is keyed with namespaced pod name, and valued with the pod.
	Map map[string]*v1.Pod
}
   164  
// Clone returns the receiver itself rather than a copy, so the original and
// the "clone" refer to the same PodsToActivate (same mutex and same Map).
func (s *PodsToActivate) Clone() StateData {
	return s
}
   169  
   170  // NewPodsToActivate instantiates a PodsToActivate object.
   171  func NewPodsToActivate() *PodsToActivate {
   172  	return &PodsToActivate{Map: make(map[string]*v1.Pod)}
   173  }
   174  
// Status indicates the result of running a plugin. It consists of a code, a
// message, (optionally) an error, and the name of the plugin that caused it.
// When the status code is not Success, the reasons should explain why.
// And, when code is Success, all the other fields should be empty.
// NOTE: A nil Status is also considered as Success.
type Status struct {
	// code is the result code of the status.
	code Code
	// reasons are the messages explaining the status.
	reasons []string
	// err is the optional underlying error.
	err error
	// plugin is an optional field that records the name of the plugin that caused this status.
	// It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending.
	plugin string
}
   188  
// WithError sets the given error on the status (replacing any previous one)
// and returns the same status object, so calls can be chained.
// The receiver must be non-nil.
func (s *Status) WithError(err error) *Status {
	s.err = err
	return s
}
   193  
   194  // Code returns code of the Status.
   195  func (s *Status) Code() Code {
   196  	if s == nil {
   197  		return Success
   198  	}
   199  	return s.code
   200  }
   201  
   202  // Message returns a concatenated message on reasons of the Status.
   203  func (s *Status) Message() string {
   204  	if s == nil {
   205  		return ""
   206  	}
   207  	return strings.Join(s.Reasons(), ", ")
   208  }
   209  
// SetPlugin records the given plugin name in s.plugin, overwriting any
// previous value. The receiver must be non-nil.
func (s *Status) SetPlugin(plugin string) {
	s.plugin = plugin
}
   214  
   215  // WithPlugin sets the given plugin name to s.plugin,
   216  // and returns the given status object.
   217  func (s *Status) WithPlugin(plugin string) *Status {
   218  	s.SetPlugin(plugin)
   219  	return s
   220  }
   221  
   222  // Plugin returns the plugin name which caused this status.
   223  func (s *Status) Plugin() string {
   224  	return s.plugin
   225  }
   226  
   227  // Reasons returns reasons of the Status.
   228  func (s *Status) Reasons() []string {
   229  	if s.err != nil {
   230  		return append([]string{s.err.Error()}, s.reasons...)
   231  	}
   232  	return s.reasons
   233  }
   234  
// AppendReason appends the given reason to the end of the Status's reasons.
// The receiver must be non-nil.
func (s *Status) AppendReason(reason string) {
	s.reasons = append(s.reasons, reason)
}
   239  
// IsSuccess returns true if and only if "Status" is nil or Code is "Success".
// It is safe to call on a nil Status because it goes through Code().
func (s *Status) IsSuccess() bool {
	return s.Code() == Success
}
   244  
// IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".
// A nil Status reports Success through Code(), so it yields false here.
func (s *Status) IsWait() bool {
	return s.Code() == Wait
}
   249  
// IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".
// A nil Status reports Success through Code(), so it yields false here.
func (s *Status) IsSkip() bool {
	return s.Code() == Skip
}
   254  
   255  // IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending).
   256  func (s *Status) IsRejected() bool {
   257  	code := s.Code()
   258  	return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending
   259  }
   260  
   261  // AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object
   262  // with a concatenated message on reasons of the Status.
   263  func (s *Status) AsError() error {
   264  	if s.IsSuccess() || s.IsWait() || s.IsSkip() {
   265  		return nil
   266  	}
   267  	if s.err != nil {
   268  		return s.err
   269  	}
   270  	return errors.New(s.Message())
   271  }
   272  
   273  // Equal checks equality of two statuses. This is useful for testing with
   274  // cmp.Equal.
   275  func (s *Status) Equal(x *Status) bool {
   276  	if s == nil || x == nil {
   277  		return s.IsSuccess() && x.IsSuccess()
   278  	}
   279  	if s.code != x.code {
   280  		return false
   281  	}
   282  	if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) {
   283  		return false
   284  	}
   285  	if !cmp.Equal(s.reasons, x.reasons) {
   286  		return false
   287  	}
   288  	return cmp.Equal(s.plugin, x.plugin)
   289  }
   290  
// String returns the same text as Message, i.e. the concatenated reasons of
// the Status.
func (s *Status) String() string {
	return s.Message()
}
   294  
   295  // NewStatus makes a Status out of the given arguments and returns its pointer.
   296  func NewStatus(code Code, reasons ...string) *Status {
   297  	s := &Status{
   298  		code:    code,
   299  		reasons: reasons,
   300  	}
   301  	return s
   302  }
   303  
   304  // AsStatus wraps an error in a Status.
   305  func AsStatus(err error) *Status {
   306  	if err == nil {
   307  		return nil
   308  	}
   309  	return &Status{
   310  		code: Error,
   311  		err:  err,
   312  	}
   313  }
   314  
// WaitingPod represents a pod currently waiting in the permit phase.
type WaitingPod interface {
	// GetPod returns a reference to the waiting pod.
	GetPod() *v1.Pod
	// GetPendingPlugins returns the names of the pending Permit plugins.
	GetPendingPlugins() []string
	// Allow declares the waiting pod is allowed to be scheduled by the plugin named as "pluginName".
	// If this is the last remaining plugin to allow, then a success signal is delivered
	// to unblock the pod.
	Allow(pluginName string)
	// Reject declares the waiting pod unschedulable.
	Reject(pluginName, msg string)
}
   328  
// Plugin is the parent type for all the scheduling framework plugins.
type Plugin interface {
	// Name returns the name of the plugin.
	Name() string
}
   333  
// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
// involve expensive calls like accessing external endpoints; otherwise it'd block other
// Pods' enqueuing in event handlers.
type PreEnqueuePlugin interface {
	Plugin
	// PreEnqueue is called prior to adding Pods to activeQ.
	PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}
   344  
// LessFunc is the function type used to sort pod infos.
type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool
   347  
// QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins.
// These plugins are used to sort pods in the scheduling queue. Only one queue sort
// plugin may be enabled at a time.
type QueueSortPlugin interface {
	Plugin
	// Less is used to sort pods in the scheduling queue.
	Less(*QueuedPodInfo, *QueuedPodInfo) bool
}
   356  
// EnqueueExtensions is an optional interface that plugins can implement to efficiently
// move unschedulable Pods in internal scheduling queues.
// In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins,
// and Pods rejected by these plugins are requeued based on this extension point.
// Failures from other extension points are regarded as temporary errors (e.g., network failure),
// and the scheduler requeues Pods without this extension point - always requeue Pods to activeQ after backoff.
// This is because such temporary errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that make pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface,
// otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin.
// And, if plugins other than above extension points support this interface, they are just ignored.
type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may make a Pod
	// failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retry of Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// Note: the returned list needs to be static (not depend on configuration parameters);
	// otherwise it would lead to undefined behavior.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister() []ClusterEventWithHint
}
   382  
// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podInfoToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podInfoToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}
   394  
// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When PreFilterResult filters out some Nodes, the framework considers Nodes that are filtered out as getting "UnschedulableAndUnresolvable",
	// i.e., those Nodes will be out of the candidates of the preemption.
	//
	// When it returns Skip status, the returned PreFilterResult and other fields in status are just ignored,
	// and the coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}
   417  
// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it should return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}
   440  
// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework
	// when the scheduling cycle failed at PreFilter or Filter by Unschedulable or UnschedulableAndUnresolvable.
	// NodeToStatusMap has statuses that each Node got in the Filter phase.
	// If this scheduling cycle failed at PreFilter, all Nodes have the status from the rejector PreFilter plugin in NodeToStatusMap.
	// Note that the scheduling framework runs PostFilter plugins even when PreFilter returned UnschedulableAndUnresolvable.
	// In that case, NodeToStatusMap contains all Nodes with UnschedulableAndUnresolvable.
	//
	// Also, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of each PostFilter plugin,
	// meaning NodeToStatusMap could have Nodes with UnschedulableAndUnresolvable
	// and the scheduling framework does call PostFilter even when all Nodes in NodeToStatusMap are UnschedulableAndUnresolvable.
	//
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
}
   467  
// PreScorePlugin is an interface for "PreScore" plugins. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, other fields in status are just ignored,
	// and the coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}
   481  
// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}
   489  
// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if the plugin implements one,
	// or nil if it does not.
	ScoreExtensions() ScoreExtensions
}
   502  
// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
	Plugin
	// Reserve is called by the scheduling framework when the scheduler cache is
	// updated. If this method returns a failed Status, the scheduler will call
	// the Unreserve method for all enabled ReservePlugins.
	Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
	// Unreserve is called by the scheduling framework when a reserved pod was
	// rejected, an error occurred during reservation of subsequent plugins, or
	// in a later phase. The Unreserve method implementation must be idempotent
	// and may be called by the scheduler even if the corresponding Reserve
	// method for the same plugin was not called.
	Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
   522  
// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called before a pod is bound.
type PreBindPlugin interface {
	Plugin
	// PreBind is called before binding a pod. All prebind plugins must return
	// success or the pod will be rejected and won't be sent for binding.
	PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
   531  
// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
	Plugin
	// PostBind is called after a pod is successfully bound. These plugins are
	// informational. A common application of this extension point is for cleaning
	// up. If a plugin needs to clean up its state after a pod is scheduled and
	// bound, PostBind is the extension point that it should register.
	PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
   542  
// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
	Plugin
	// Permit is called before binding a pod (and before prebind plugins). Permit
	// plugins are used to prevent or delay the binding of a Pod. A permit plugin
	// must return success or wait with timeout duration, or the pod will be rejected.
	// The pod will also be rejected if the wait times out or the pod is rejected while
	// waiting. Note that if the plugin returns "wait", the framework will wait only
	// after running the remaining plugins given that no other plugin rejects the pod.
	Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}
   555  
// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
	Plugin
	// Bind is not called until all pre-bind plugins have completed. Each
	// bind plugin is called in the configured order. A bind plugin may choose whether
	// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
	// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
	// it must return Skip in its Status code. If a bind plugin returns an Error, the
	// pod is rejected and will not be bound.
	Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
   568  
   569  // Framework manages the set of plugins in use by the scheduling framework.
   570  // Configured plugins are called at specified points in a scheduling context.
   571  type Framework interface {
   572  	Handle
   573  
   574  	// PreEnqueuePlugins returns the registered preEnqueue plugins.
   575  	PreEnqueuePlugins() []PreEnqueuePlugin
   576  
   577  	// EnqueueExtensions returns the registered Enqueue extensions.
   578  	EnqueueExtensions() []EnqueueExtensions
   579  
   580  	// QueueSortFunc returns the function to sort pods in scheduling queue
   581  	QueueSortFunc() LessFunc
   582  
   583  	// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
   584  	// *Status and its code is set to non-success if any of the plugins returns
   585  	// anything but Success. If a non-success status is returned, then the scheduling
   586  	// cycle is aborted.
   587  	// It also returns a PreFilterResult, which may influence what or how many nodes to
   588  	// evaluate downstream.
   589  	RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status)
   590  
   591  	// RunPostFilterPlugins runs the set of configured PostFilter plugins.
   592  	// PostFilter plugins can either be informational, in which case should be configured
   593  	// to execute first and return Unschedulable status, or ones that try to change the
   594  	// cluster state to make the pod potentially schedulable in a future scheduling cycle.
   595  	RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
   596  
   597  	// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
   598  	// *Status and its code is set to non-success if any of the plugins returns
   599  	// anything but Success. If the Status code is "Unschedulable", it is
   600  	// considered as a scheduling check failure, otherwise, it is considered as an
   601  	// internal error. In either case the pod is not going to be bound.
   602  	RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
   603  
   604  	// RunPostBindPlugins runs the set of configured PostBind plugins.
   605  	RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
   606  
   607  	// RunReservePluginsReserve runs the Reserve method of the set of
   608  	// configured Reserve plugins. If any of these calls returns an error, it
   609  	// does not continue running the remaining ones and returns the error. In
   610  	// such case, pod will not be scheduled.
   611  	RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
   612  
   613  	// RunReservePluginsUnreserve runs the Unreserve method of the set of
   614  	// configured Reserve plugins.
   615  	RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
   616  
   617  	// RunPermitPlugins runs the set of configured Permit plugins. If any of these
   618  	// plugins returns a status other than "Success" or "Wait", it does not continue
   619  	// running the remaining plugins and returns an error. Otherwise, if any of the
   620  	// plugins returns "Wait", then this function will create and add waiting pod
   621  	// to a map of currently waiting pods and return status with "Wait" code.
   622  	// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
   623  	RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
   624  
   625  	// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
   626  	WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status
   627  
   628  	// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
   629  	// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
   630  	// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
   631  	// or "Success". If none of the plugins handled binding, RunBindPlugins returns
   632  	// code=5("skip") status.
   633  	RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
   634  
   635  	// HasFilterPlugins returns true if at least one Filter plugin is defined.
   636  	HasFilterPlugins() bool
   637  
   638  	// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
   639  	HasPostFilterPlugins() bool
   640  
   641  	// HasScorePlugins returns true if at least one Score plugin is defined.
   642  	HasScorePlugins() bool
   643  
   644  	// ListPlugins returns a map of extension point name to list of configured Plugins.
   645  	ListPlugins() *config.Plugins
   646  
   647  	// ProfileName returns the profile name associated to a profile.
   648  	ProfileName() string
   649  
   650  	// PercentageOfNodesToScore returns percentageOfNodesToScore associated to a profile.
   651  	PercentageOfNodesToScore() *int32
   652  
   653  	// SetPodNominator sets the PodNominator
   654  	SetPodNominator(nominator PodNominator)
   655  
   656  	// Close calls Close method of each plugin.
   657  	Close() error
   658  }
   659  
// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
type Handle interface {
	// PodNominator abstracts operations to maintain nominated Pods.
	PodNominator
	// PluginsRunner abstracts operations to run some plugins.
	PluginsRunner
	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
	// is taken at the beginning of a scheduling cycle and remains unchanged until
	// a pod finishes the "Permit" point.
	//
	// It should be used only during the scheduling cycle:
	// - There is no guarantee that the information remains unchanged in the binding phase of scheduling.
	//   So, plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugin);
	//   otherwise, a concurrent read/write error might occur.
	// - There is no guarantee that the information is always up-to-date.
	//   So, plugins shouldn't use it in QueueingHint and PreEnqueue;
	//   otherwise, they might make a decision based on stale information.
	//
	// In those cases, plugins should instead use resources obtained from
	// informers created from SharedInformerFactory().
	SnapshotSharedLister() SharedLister

	// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map,
	// invoking callback for each waiting pod.
	IterateOverWaitingPods(callback func(WaitingPod))

	// GetWaitingPod returns a waiting pod given its UID.
	GetWaitingPod(uid types.UID) WaitingPod

	// RejectWaitingPod rejects a waiting pod given its UID.
	// The return value indicates if the pod is waiting or not.
	RejectWaitingPod(uid types.UID) bool

	// ClientSet returns a kubernetes clientSet.
	ClientSet() clientset.Interface

	// KubeConfig returns the raw kube config.
	KubeConfig() *restclient.Config

	// EventRecorder returns an event recorder.
	EventRecorder() events.EventRecorder

	// SharedInformerFactory returns the shared informer factory. As noted on
	// SnapshotSharedLister, informers created from it are the data source
	// plugins should use where the snapshot must not be used.
	SharedInformerFactory() informers.SharedInformerFactory

	// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
	RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status

	// Extenders returns registered scheduler extenders.
	Extenders() []Extender

	// Parallelizer returns a parallelizer holding parallelism for scheduler.
	Parallelizer() parallelize.Parallelizer
}
   713  
// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
type PreFilterResult struct {
	// NodeNames is the set of nodes that should be considered downstream;
	// if nil then all nodes are eligible. Only a nil set means "all nodes":
	// AllNodes reports false for a non-nil set, even an empty one.
	NodeNames sets.Set[string]
}
   720  
   721  func (p *PreFilterResult) AllNodes() bool {
   722  	return p == nil || p.NodeNames == nil
   723  }
   724  
   725  func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
   726  	if p.AllNodes() && in.AllNodes() {
   727  		return nil
   728  	}
   729  
   730  	r := PreFilterResult{}
   731  	if p.AllNodes() {
   732  		r.NodeNames = in.NodeNames.Clone()
   733  		return &r
   734  	}
   735  	if in.AllNodes() {
   736  		r.NodeNames = p.NodeNames.Clone()
   737  		return &r
   738  	}
   739  
   740  	r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
   741  	return &r
   742  }
   743  
// NominatingMode is the mode carried by a NominatingInfo; its values are the
// Mode* constants below.
type NominatingMode int

const (
	// ModeNoop is the zero value of NominatingMode. It is also what
	// (*NominatingInfo).Mode reports for a nil receiver.
	// NOTE(review): presumably means the pod's existing nomination is left
	// untouched — confirm against the callers that consume the mode.
	ModeNoop NominatingMode = iota
	// ModeOverride is the mode set by NewPostFilterResultWithNominatedNode.
	// NOTE(review): presumably means the pod's nominated node is replaced by
	// NominatingInfo.NominatedNodeName — confirm against the callers.
	ModeOverride
)
   750  
// NominatingInfo pairs a node name (NominatedNodeName) with the
// NominatingMode that accompanies it. Use the Mode method rather than reading
// the NominatingMode field directly when the pointer may be nil: Mode returns
// ModeNoop for a nil *NominatingInfo.
type NominatingInfo struct {
	NominatedNodeName string
	NominatingMode    NominatingMode
}
   755  
// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
//
// It embeds *NominatingInfo, so that type's fields and its Mode method are
// promoted onto PostFilterResult. The embedded pointer may be nil, in which
// case Mode returns ModeNoop.
type PostFilterResult struct {
	*NominatingInfo
}
   760  
   761  func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
   762  	return &PostFilterResult{
   763  		NominatingInfo: &NominatingInfo{
   764  			NominatedNodeName: name,
   765  			NominatingMode:    ModeOverride,
   766  		},
   767  	}
   768  }
   769  
   770  func (ni *NominatingInfo) Mode() NominatingMode {
   771  	if ni == nil {
   772  		return ModeNoop
   773  	}
   774  	return ni.NominatingMode
   775  }
   776  
// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
	// AddNominatedPod adds the given pod to the nominator or
	// updates it if it already exists.
	AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
	// DeleteNominatedPodIfExists deletes the nominated pod from the internal cache.
	// It is a no-op if the pod does not exist there.
	DeleteNominatedPodIfExists(pod *v1.Pod)
	// UpdateNominatedPod updates oldPod with newPodInfo.
	UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
	// NominatedPodsForNode returns the nominated pods on the given node.
	NominatedPodsForNode(nodeName string) []*PodInfo
}
   789  
   790  // PluginsRunner abstracts operations to run some plugins.
   791  // This is used by preemption PostFilter plugins when evaluating the feasibility of
   792  // scheduling the pod on nodes when certain running pods get evicted.
   793  type PluginsRunner interface {
   794  	// RunPreScorePlugins runs the set of configured PreScore plugins. If any
   795  	// of these plugins returns any status other than "Success", the given pod is rejected.
   796  	RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status
   797  	// RunScorePlugins runs the set of configured scoring plugins.
   798  	// It returns a list that stores scores from each plugin and total score for each Node.
   799  	// It also returns *Status, which is set to non-success if any of the plugins returns
   800  	// a non-success status.
   801  	RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status)
   802  	// RunFilterPlugins runs the set of configured Filter plugins for pod on
   803  	// the given node. Note that for the node being evaluated, the passed nodeInfo
   804  	// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
   805  	// considered to be running on the node could be different). For example, during
   806  	// preemption, we may pass a copy of the original nodeInfo object that has some pods
   807  	// removed from it to evaluate the possibility of preempting them to
   808  	// schedule the target pod.
   809  	RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
   810  	// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
   811  	// PreFilter plugins. It returns directly if any of the plugins return any
   812  	// status other than Success.
   813  	RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
   814  	// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
   815  	// PreFilter plugins. It returns directly if any of the plugins return any
   816  	// status other than Success.
   817  	RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
   818  }
   819  

View as plain text