1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package devicemanager 18 19 import ( 20 "time" 21 22 v1 "k8s.io/api/core/v1" 23 "k8s.io/apimachinery/pkg/util/sets" 24 "k8s.io/kubernetes/pkg/kubelet/cm/containermap" 25 "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager" 26 "k8s.io/kubernetes/pkg/kubelet/config" 27 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 28 "k8s.io/kubernetes/pkg/kubelet/lifecycle" 29 "k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache" 30 schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" 31 ) 32 33 // Manager manages all the Device Plugins running on a node. 34 type Manager interface { 35 // Start starts device plugin registration service. 36 Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error 37 38 // Allocate configures and assigns devices to a container in a pod. From 39 // the requested device resources, Allocate will communicate with the 40 // owning device plugin to allow setup procedures to take place, and for 41 // the device plugin to provide runtime settings to use the device 42 // (environment variables, mount points and device files). 43 Allocate(pod *v1.Pod, container *v1.Container) error 44 45 // UpdatePluginResources updates node resources based on devices already 46 // allocated to pods. The node object is provided for the device manager to 47 // update the node capacity to reflect the currently available devices. 48 UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error 49 50 // Stop stops the manager. 51 Stop() error 52 53 // GetDeviceRunContainerOptions checks whether we have cached containerDevices 54 // for the passed-in <pod, container> and returns its DeviceRunContainerOptions 55 // for the found one. An empty struct is returned in case no cached state is found. 56 GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) 57 58 // GetCapacity returns the amount of available device plugin resource capacity, resource allocatable 59 // and inactive device plugin resources previously registered on the node. 60 GetCapacity() (v1.ResourceList, v1.ResourceList, []string) 61 GetWatcherHandler() cache.PluginHandler 62 63 // GetDevices returns information about the devices assigned to pods and containers 64 GetDevices(podUID, containerName string) ResourceDeviceInstances 65 66 // GetAllocatableDevices returns information about all the devices known to the manager 67 GetAllocatableDevices() ResourceDeviceInstances 68 69 // ShouldResetExtendedResourceCapacity returns whether the extended resources should be reset or not, 70 // depending on the checkpoint file availability. Absence of the checkpoint file strongly indicates 71 // the node has been recreated. 72 ShouldResetExtendedResourceCapacity() bool 73 74 // TopologyManager HintProvider provider indicates the Device Manager implements the Topology Manager Interface 75 // and is consulted to make Topology aware resource alignments 76 GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint 77 78 // TopologyManager HintProvider provider indicates the Device Manager implements the Topology Manager Interface 79 // and is consulted to make Topology aware resource alignments per Pod 80 GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint 81 82 // UpdateAllocatedDevices frees any Devices that are bound to terminated pods. 83 UpdateAllocatedDevices() 84 } 85 86 // DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices. 87 type DeviceRunContainerOptions struct { 88 // The environment variables list. 89 Envs []kubecontainer.EnvVar 90 // The mounts for the container. 91 Mounts []kubecontainer.Mount 92 // The host devices mapped into the container. 93 Devices []kubecontainer.DeviceInfo 94 // The Annotations for the container 95 Annotations []kubecontainer.Annotation 96 // CDI Devices for the container 97 CDIDevices []kubecontainer.CDIDevice 98 } 99 100 // TODO: evaluate whether we need this error definition. 101 const ( 102 errEndpointStopped = "endpoint %v has been stopped" 103 ) 104 105 // endpointStopGracePeriod indicates the grace period after an endpoint is stopped 106 // because its device plugin fails. DeviceManager keeps the stopped endpoint in its 107 // cache during this grace period to cover the time gap for the capacity change to 108 // take effect. 109 const endpointStopGracePeriod = time.Duration(5) * time.Minute 110 111 // kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint 112 const kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint" 113