/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file defines the scheduling framework plugin interfaces.

package framework

import (
	"context"
	"errors"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/events"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)

// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore

// NodeScore is a struct with node name and score.
type NodeScore struct {
	// Name is the node name.
	Name string
	// Score is the score associated with the node.
	Score int64
}

// NodeToStatusMap declares map from node name to its status.
type NodeToStatusMap map[string]*Status

// NodePluginScores is a struct with node name and scores for that node.
type NodePluginScores struct {
	// Name is node name.
	Name string
	// Scores is scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total score in Scores.
	TotalScore int64
}

// PluginScore is a struct with plugin/extender name and score.
66 type PluginScore struct { 67 // Name is the name of plugin or extender. 68 Name string 69 Score int64 70 } 71 72 // Code is the Status code/type which is returned from plugins. 73 type Code int 74 75 // These are predefined codes used in a Status. 76 // Note: when you add a new status, you have to add it in `codes` slice below. 77 const ( 78 // Success means that plugin ran correctly and found pod schedulable. 79 // NOTE: A nil status is also considered as "Success". 80 Success Code = iota 81 // Error is one of the failures, used for internal plugin errors, unexpected input, etc. 82 // Plugin shouldn't return this code for expected failures, like Unschedulable. 83 // Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins. 84 // Meaning, the Pod will be requeued to activeQ/backoffQ soon. 85 Error 86 // Unschedulable is one of the failures, used when a plugin finds a pod unschedulable. 87 // If it's returned from PreFilter or Filter, the scheduler might attempt to 88 // run other postFilter plugins like preemption to get this pod scheduled. 89 // Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins. 90 // The accompanying status message should explain why the pod is unschedulable. 91 // 92 // We regard the backoff as a penalty of wasting the scheduling cycle. 93 // When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling, 94 // the Pod goes through backoff. 95 Unschedulable 96 // UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and 97 // other postFilter plugins like preemption would not change anything. 98 // See the comment on PostFilter interface for more details about how PostFilter should handle this status. 99 // Plugins should return Unschedulable if it is possible that the pod can get scheduled 100 // after running other postFilter plugins. 
101 // The accompanying status message should explain why the pod is unschedulable. 102 // 103 // We regard the backoff as a penalty of wasting the scheduling cycle. 104 // When the scheduling queue requeues Pods, which was rejected with UnschedulableAndUnresolvable in the last scheduling, 105 // the Pod goes through backoff. 106 UnschedulableAndUnresolvable 107 // Wait is used when a Permit plugin finds a pod scheduling should wait. 108 Wait 109 // Skip is used in the following scenarios: 110 // - when a Bind plugin chooses to skip binding. 111 // - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped. 112 // - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped. 113 Skip 114 // Pending means that the scheduling process is finished successfully, 115 // but the plugin wants to stop the scheduling cycle/binding cycle here. 116 // 117 // For example, the DRA plugin sometimes needs to wait for the external device driver 118 // to provision the resource for the Pod. 119 // It's different from when to return Unschedulable/UnschedulableAndUnresolvable, 120 // because in this case, the scheduler decides where the Pod can go successfully, 121 // but we need to wait for the external component to do something based on that scheduling result. 122 // 123 // We regard the backoff as a penalty of wasting the scheduling cycle. 124 // In the case of returning Pending, we cannot say the scheduling cycle is wasted 125 // because the scheduling result is used to proceed the Pod's scheduling forward, 126 // that particular scheduling cycle is failed though. 127 // So, Pods rejected by such reasons don't need to suffer a penalty (backoff). 128 // When the scheduling queue requeues Pods, which was rejected with Pending in the last scheduling, 129 // the Pod goes to activeQ directly ignoring backoff. 
130 Pending 131 ) 132 133 // This list should be exactly the same as the codes iota defined above in the same order. 134 var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"} 135 136 func (c Code) String() string { 137 return codes[c] 138 } 139 140 const ( 141 // MaxNodeScore is the maximum score a Score plugin is expected to return. 142 MaxNodeScore int64 = 100 143 144 // MinNodeScore is the minimum score a Score plugin is expected to return. 145 MinNodeScore int64 = 0 146 147 // MaxTotalScore is the maximum total score. 148 MaxTotalScore int64 = math.MaxInt64 149 ) 150 151 // PodsToActivateKey is a reserved state key for stashing pods. 152 // If the stashed pods are present in unschedulablePods or backoffQ,they will be 153 // activated (i.e., moved to activeQ) in two phases: 154 // - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated) 155 // - end of a binding cycle if it succeeds 156 var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate" 157 158 // PodsToActivate stores pods to be activated. 159 type PodsToActivate struct { 160 sync.Mutex 161 // Map is keyed with namespaced pod name, and valued with the pod. 162 Map map[string]*v1.Pod 163 } 164 165 // Clone just returns the same state. 166 func (s *PodsToActivate) Clone() StateData { 167 return s 168 } 169 170 // NewPodsToActivate instantiates a PodsToActivate object. 171 func NewPodsToActivate() *PodsToActivate { 172 return &PodsToActivate{Map: make(map[string]*v1.Pod)} 173 } 174 175 // Status indicates the result of running a plugin. It consists of a code, a 176 // message, (optionally) an error, and a plugin name it fails by. 177 // When the status code is not Success, the reasons should explain why. 178 // And, when code is Success, all the other fields should be empty. 179 // NOTE: A nil Status is also considered as Success. 
180 type Status struct { 181 code Code 182 reasons []string 183 err error 184 // plugin is an optional field that records the plugin name causes this status. 185 // It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending. 186 plugin string 187 } 188 189 func (s *Status) WithError(err error) *Status { 190 s.err = err 191 return s 192 } 193 194 // Code returns code of the Status. 195 func (s *Status) Code() Code { 196 if s == nil { 197 return Success 198 } 199 return s.code 200 } 201 202 // Message returns a concatenated message on reasons of the Status. 203 func (s *Status) Message() string { 204 if s == nil { 205 return "" 206 } 207 return strings.Join(s.Reasons(), ", ") 208 } 209 210 // SetPlugin sets the given plugin name to s.plugin. 211 func (s *Status) SetPlugin(plugin string) { 212 s.plugin = plugin 213 } 214 215 // WithPlugin sets the given plugin name to s.plugin, 216 // and returns the given status object. 217 func (s *Status) WithPlugin(plugin string) *Status { 218 s.SetPlugin(plugin) 219 return s 220 } 221 222 // Plugin returns the plugin name which caused this status. 223 func (s *Status) Plugin() string { 224 return s.plugin 225 } 226 227 // Reasons returns reasons of the Status. 228 func (s *Status) Reasons() []string { 229 if s.err != nil { 230 return append([]string{s.err.Error()}, s.reasons...) 231 } 232 return s.reasons 233 } 234 235 // AppendReason appends given reason to the Status. 236 func (s *Status) AppendReason(reason string) { 237 s.reasons = append(s.reasons, reason) 238 } 239 240 // IsSuccess returns true if and only if "Status" is nil or Code is "Success". 241 func (s *Status) IsSuccess() bool { 242 return s.Code() == Success 243 } 244 245 // IsWait returns true if and only if "Status" is non-nil and its Code is "Wait". 246 func (s *Status) IsWait() bool { 247 return s.Code() == Wait 248 } 249 250 // IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip". 
251 func (s *Status) IsSkip() bool { 252 return s.Code() == Skip 253 } 254 255 // IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending). 256 func (s *Status) IsRejected() bool { 257 code := s.Code() 258 return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending 259 } 260 261 // AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object 262 // with a concatenated message on reasons of the Status. 263 func (s *Status) AsError() error { 264 if s.IsSuccess() || s.IsWait() || s.IsSkip() { 265 return nil 266 } 267 if s.err != nil { 268 return s.err 269 } 270 return errors.New(s.Message()) 271 } 272 273 // Equal checks equality of two statuses. This is useful for testing with 274 // cmp.Equal. 275 func (s *Status) Equal(x *Status) bool { 276 if s == nil || x == nil { 277 return s.IsSuccess() && x.IsSuccess() 278 } 279 if s.code != x.code { 280 return false 281 } 282 if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) { 283 return false 284 } 285 if !cmp.Equal(s.reasons, x.reasons) { 286 return false 287 } 288 return cmp.Equal(s.plugin, x.plugin) 289 } 290 291 func (s *Status) String() string { 292 return s.Message() 293 } 294 295 // NewStatus makes a Status out of the given arguments and returns its pointer. 296 func NewStatus(code Code, reasons ...string) *Status { 297 s := &Status{ 298 code: code, 299 reasons: reasons, 300 } 301 return s 302 } 303 304 // AsStatus wraps an error in a Status. 305 func AsStatus(err error) *Status { 306 if err == nil { 307 return nil 308 } 309 return &Status{ 310 code: Error, 311 err: err, 312 } 313 } 314 315 // WaitingPod represents a pod currently waiting in the permit phase. 316 type WaitingPod interface { 317 // GetPod returns a reference to the waiting pod. 318 GetPod() *v1.Pod 319 // GetPendingPlugins returns a list of pending Permit plugin's name. 
320 GetPendingPlugins() []string 321 // Allow declares the waiting pod is allowed to be scheduled by the plugin named as "pluginName". 322 // If this is the last remaining plugin to allow, then a success signal is delivered 323 // to unblock the pod. 324 Allow(pluginName string) 325 // Reject declares the waiting pod unschedulable. 326 Reject(pluginName, msg string) 327 } 328 329 // Plugin is the parent type for all the scheduling framework plugins. 330 type Plugin interface { 331 Name() string 332 } 333 334 // PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins. 335 // These plugins are called prior to adding Pods to activeQ. 336 // Note: an preEnqueue plugin is expected to be lightweight and efficient, so it's not expected to 337 // involve expensive calls like accessing external endpoints; otherwise it'd block other 338 // Pods' enqueuing in event handlers. 339 type PreEnqueuePlugin interface { 340 Plugin 341 // PreEnqueue is called prior to adding Pods to activeQ. 342 PreEnqueue(ctx context.Context, p *v1.Pod) *Status 343 } 344 345 // LessFunc is the function to sort pod info 346 type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool 347 348 // QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins. 349 // These plugins are used to sort pods in the scheduling queue. Only one queue sort 350 // plugin may be enabled at a time. 351 type QueueSortPlugin interface { 352 Plugin 353 // Less are used to sort pods in the scheduling queue. 354 Less(*QueuedPodInfo, *QueuedPodInfo) bool 355 } 356 357 // EnqueueExtensions is an optional interface that plugins can implement to efficiently 358 // move unschedulable Pods in internal scheduling queues. 359 // In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins, 360 // and Pods rejected by these plugins are requeued based on this extension point. 
// Failures from other extension points are regarded as temporary errors (e.g., network failure),
// and the scheduler requeues Pods without this extension point - it always requeues Pods to activeQ after backoff.
// This is because such temporary errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that make pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface,
// otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin.
// And, if plugins other than above extension points support this interface, they are just ignored.
type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may cause a Pod
	// failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retry of Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// Note: the returned list needs to be static (not depend on configuration parameters);
	// otherwise it would lead to undefined behavior.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister() []ClusterEventWithHint
}

// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podInfoToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podInfoToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When PreFilterResult filters out some Nodes, the framework considers Nodes that are filtered out as getting "UnschedulableAndUnresolvable".
	// i.e., those Nodes will be out of the candidates of the preemption.
	//
	// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
	// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}

// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it will return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}

// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework
	// when the scheduling cycle failed at PreFilter or Filter by Unschedulable or UnschedulableAndUnresolvable.
	// NodeToStatusMap has statuses that each Node got in the Filter phase.
	// If this scheduling cycle failed at PreFilter, all Nodes have the status from the rejector PreFilter plugin in NodeToStatusMap.
	// Note that the scheduling framework runs PostFilter plugins even when PreFilter returned UnschedulableAndUnresolvable.
	// In that case, NodeToStatusMap contains all Nodes with UnschedulableAndUnresolvable.
	//
	// Also, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of each PostFilter plugin,
	// meaning NodeToStatusMap obviously could have Nodes with UnschedulableAndUnresolvable
	// and the scheduling framework does call PostFilter even when all Nodes in NodeToStatusMap are UnschedulableAndUnresolvable.
	//
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
}

// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, other fields in status are just ignored,
	// and coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}

// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}

// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if it does not.
	ScoreExtensions() ScoreExtensions
}

// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
	Plugin
	// Reserve is called by the scheduling framework when the scheduler cache is
	// updated. If this method returns a failed Status, the scheduler will call
	// the Unreserve method for all enabled ReservePlugins.
	Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
	// Unreserve is called by the scheduling framework when a reserved pod was
	// rejected, an error occurred during reservation of subsequent plugins, or
	// in a later phase. The Unreserve method implementation must be idempotent
	// and may be called by the scheduler even if the corresponding Reserve
	// method for the same plugin was not called.
	Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called before a pod is bound.
type PreBindPlugin interface {
	Plugin
	// PreBind is called before binding a pod. All prebind plugins must return
	// success or the pod will be rejected and won't be sent for binding.
	PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
	Plugin
	// PostBind is called after a pod is successfully bound. These plugins are
	// informational. A common application of this extension point is for cleaning
	// up. If a plugin needs to clean-up its state after a pod is scheduled and
	// bound, PostBind is the extension point that it should register.
	PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
	Plugin
	// Permit is called before binding a pod (and before prebind plugins). Permit
	// plugins are used to prevent or delay the binding of a Pod. A permit plugin
	// must return success or wait with timeout duration, or the pod will be rejected.
	// The pod will also be rejected if the wait times out or the pod is rejected while
	// waiting. Note that if the plugin returns "wait", the framework will wait only
	// after running the remaining plugins given that no other plugin rejects the pod.
	Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}

// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
	Plugin
	// Bind plugins will not be called until all pre-bind plugins have completed. Each
	// bind plugin is called in the configured order. A bind plugin may choose whether
	// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
	// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
	// it must return Skip in its Status code. If a bind plugin returns an Error, the
	// pod is rejected and will not be bound.
	Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

// Framework manages the set of plugins in use by the scheduling framework.
// Configured plugins are called at specified points in a scheduling context.
type Framework interface {
	Handle

	// PreEnqueuePlugins returns the registered preEnqueue plugins.
	PreEnqueuePlugins() []PreEnqueuePlugin

	// EnqueueExtensions returns the registered Enqueue extensions.
	EnqueueExtensions() []EnqueueExtensions

	// QueueSortFunc returns the function to sort pods in scheduling queue.
	QueueSortFunc() LessFunc

	// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If a non-success status is returned, then the scheduling
	// cycle is aborted.
	// It also returns a PreFilterResult, which may influence what or how many nodes to
	// evaluate downstream.
	RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status)

	// RunPostFilterPlugins runs the set of configured PostFilter plugins.
	// PostFilter plugins can either be informational, in which case should be configured
	// to execute first and return Unschedulable status, or ones that try to change the
	// cluster state to make the pod potentially schedulable in a future scheduling cycle.
	RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)

	// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If the Status code is "Unschedulable", it is
	// considered as a scheduling check failure, otherwise, it is considered as an
	// internal error. In either case the pod is not going to be bound.
	RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunPostBindPlugins runs the set of configured PostBind plugins.
	RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunReservePluginsReserve runs the Reserve method of the set of
	// configured Reserve plugins. If any of these calls returns an error, it
	// does not continue running the remaining ones and returns the error. In
	// such case, pod will not be scheduled.
	RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunReservePluginsUnreserve runs the Unreserve method of the set of
	// configured Reserve plugins.
	RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunPermitPlugins runs the set of configured Permit plugins. If any of these
	// plugins returns a status other than "Success" or "Wait", it does not continue
	// running the remaining plugins and returns an error. Otherwise, if any of the
	// plugins returns "Wait", then this function will create and add waiting pod
	// to a map of currently waiting pods and return status with "Wait" code.
	// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
	RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
	WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status

	// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
	// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
	// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
	// or "Success". If none of the plugins handled binding, RunBindPlugins returns
	// code=5("skip") status.
	RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// HasFilterPlugins returns true if at least one Filter plugin is defined.
	HasFilterPlugins() bool

	// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
	HasPostFilterPlugins() bool

	// HasScorePlugins returns true if at least one Score plugin is defined.
	HasScorePlugins() bool

	// ListPlugins returns a map of extension point name to list of configured Plugins.
	ListPlugins() *config.Plugins

	// ProfileName returns the profile name associated to a profile.
	ProfileName() string

	// PercentageOfNodesToScore returns percentageOfNodesToScore associated to a profile.
	PercentageOfNodesToScore() *int32

	// SetPodNominator sets the PodNominator.
	SetPodNominator(nominator PodNominator)

	// Close calls Close method of each plugin.
	Close() error
}

// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
type Handle interface {
	// PodNominator abstracts operations to maintain nominated Pods.
	PodNominator
	// PluginsRunner abstracts operations to run some plugins.
	PluginsRunner
	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
	// is taken at the beginning of a scheduling cycle and remains unchanged until
	// a pod finishes "Permit" point.
	//
	// It should be used only during scheduling cycle:
	// - There is no guarantee that the information remains unchanged in the binding phase of scheduling.
	//   So, plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugin)
	//   otherwise, a concurrent read/write error might occur.
	// - There is no guarantee that the information is always up-to-date.
	//   So, plugins shouldn't use it in QueueingHint and PreEnqueue
	//   otherwise, they might make a decision based on stale information.
	//
	// Instead, they should use the resources getting from Informer created from SharedInformerFactory().
	SnapshotSharedLister() SharedLister

	// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
	IterateOverWaitingPods(callback func(WaitingPod))

	// GetWaitingPod returns a waiting pod given its UID.
	GetWaitingPod(uid types.UID) WaitingPod

	// RejectWaitingPod rejects a waiting pod given its UID.
	// The return value indicates if the pod is waiting or not.
	RejectWaitingPod(uid types.UID) bool

	// ClientSet returns a kubernetes clientSet.
	ClientSet() clientset.Interface

	// KubeConfig returns the raw kube config.
	KubeConfig() *restclient.Config

	// EventRecorder returns an event recorder.
	EventRecorder() events.EventRecorder

	// SharedInformerFactory returns a shared informer factory.
	SharedInformerFactory() informers.SharedInformerFactory

	// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
	RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status

	// Extenders returns registered scheduler extenders.
	Extenders() []Extender

	// Parallelizer returns a parallelizer holding parallelism for scheduler.
	Parallelizer() parallelize.Parallelizer
}

// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
type PreFilterResult struct {
	// The set of nodes that should be considered downstream; if nil then
	// all nodes are eligible.
	NodeNames sets.Set[string]
}

// AllNodes reports whether the result places no restriction on nodes,
// i.e. whether p is nil or p.NodeNames is nil.
func (p *PreFilterResult) AllNodes() bool {
	return p == nil || p.NodeNames == nil
}

// Merge combines p and in into a single result.
// It returns nil when both allow all nodes. When exactly one of them allows
// all nodes, the returned result holds a clone of the other's NodeNames.
// Otherwise the returned result holds the intersection of both NodeNames.
func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
	if p.AllNodes() && in.AllNodes() {
		return nil
	}

	r := PreFilterResult{}
	if p.AllNodes() {
		r.NodeNames = in.NodeNames.Clone()
		return &r
	}
	if in.AllNodes() {
		r.NodeNames = p.NodeNames.Clone()
		return &r
	}

	r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
	return &r
}

// NominatingMode is the mode carried by a NominatingInfo.
type NominatingMode int

const (
	// ModeNoop is the zero value; it is also what Mode reports for a nil NominatingInfo.
	// NOTE(review): presumably means the nomination is left untouched — confirm against the consumers.
	ModeNoop NominatingMode = iota
	// ModeOverride is the mode set by NewPostFilterResultWithNominatedNode.
	// NOTE(review): presumably means the nominated node name is overwritten — confirm against the consumers.
	ModeOverride
)

// NominatingInfo pairs a nominated node name with the NominatingMode to apply.
type NominatingInfo struct {
	NominatedNodeName string
	NominatingMode    NominatingMode
}

// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
type PostFilterResult struct {
	*NominatingInfo
}

// NewPostFilterResultWithNominatedNode returns a PostFilterResult whose NominatingInfo
// carries the given node name with ModeOverride.
func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
	return &PostFilterResult{
		NominatingInfo: &NominatingInfo{
			NominatedNodeName: name,
			NominatingMode:    ModeOverride,
		},
	}
}

// Mode returns the NominatingMode of ni, or ModeNoop when ni is nil.
func (ni *NominatingInfo) Mode() NominatingMode {
	if ni == nil {
		return ModeNoop
	}
	return ni.NominatingMode
}

// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
	// AddNominatedPod adds the given pod to the nominator or
	// updates it if it already exists.
	AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
	// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
	DeleteNominatedPodIfExists(pod *v1.Pod)
	// UpdateNominatedPod updates the <oldPod> with <newPod>.
	UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
	// NominatedPodsForNode returns nominatedPods on the given node.
787 NominatedPodsForNode(nodeName string) []*PodInfo 788 } 789 790 // PluginsRunner abstracts operations to run some plugins. 791 // This is used by preemption PostFilter plugins when evaluating the feasibility of 792 // scheduling the pod on nodes when certain running pods get evicted. 793 type PluginsRunner interface { 794 // RunPreScorePlugins runs the set of configured PreScore plugins. If any 795 // of these plugins returns any status other than "Success", the given pod is rejected. 796 RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status 797 // RunScorePlugins runs the set of configured scoring plugins. 798 // It returns a list that stores scores from each plugin and total score for each Node. 799 // It also returns *Status, which is set to non-success if any of the plugins returns 800 // a non-success status. 801 RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status) 802 // RunFilterPlugins runs the set of configured Filter plugins for pod on 803 // the given node. Note that for the node being evaluated, the passed nodeInfo 804 // reference could be different from the one in NodeInfoSnapshot map (e.g., pods 805 // considered to be running on the node could be different). For example, during 806 // preemption, we may pass a copy of the original nodeInfo object that has some pods 807 // removed from it to evaluate the possibility of preempting them to 808 // schedule the target pod. 809 RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status 810 // RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured 811 // PreFilter plugins. It returns directly if any of the plugins return any 812 // status other than Success. 
813 RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status 814 // RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured 815 // PreFilter plugins. It returns directly if any of the plugins return any 816 // status other than Success. 817 RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status 818 } 819