...

Source file src/k8s.io/kubernetes/test/e2e_node/podresources_test.go

Documentation: k8s.io/kubernetes/test/e2e_node

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"os"
    24  	"strings"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/api/resource"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
    31  	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
    32  	kubefeatures "k8s.io/kubernetes/pkg/features"
    33  	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    34  	apisgrpc "k8s.io/kubernetes/pkg/kubelet/apis/grpc"
    35  	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
    36  	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
    37  	"k8s.io/kubernetes/pkg/kubelet/util"
    38  	testutils "k8s.io/kubernetes/test/utils"
    39  	admissionapi "k8s.io/pod-security-admission/api"
    40  	"k8s.io/utils/cpuset"
    41  
    42  	"github.com/onsi/ginkgo/v2"
    43  	"github.com/onsi/gomega"
    44  	"github.com/onsi/gomega/gstruct"
    45  	"github.com/onsi/gomega/types"
    46  	"k8s.io/kubernetes/test/e2e/feature"
    47  	"k8s.io/kubernetes/test/e2e/framework"
    48  	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
    49  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    50  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    51  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    52  	"k8s.io/kubernetes/test/e2e/nodefeature"
    53  )
    54  
    55  const (
    56  	defaultTopologyUnawareResourceName = "example.com/resource"
    57  )
    58  
    59  type podDesc struct {
    60  	podName        string
    61  	cntName        string
    62  	resourceName   string
    63  	resourceAmount int
    64  	cpuRequest     int // cpuRequest is in millicores
    65  	initContainers []initContainerDesc
    66  }
    67  
    68  func (desc podDesc) CpuRequestQty() resource.Quantity {
    69  	qty := resource.NewMilliQuantity(int64(desc.cpuRequest), resource.DecimalSI)
    70  	return *qty
    71  }
    72  
    73  func (desc podDesc) CpuRequestExclusive() int {
    74  	if (desc.cpuRequest % 1000) != 0 {
    75  		// exclusive cpus are requested only if the quantity is integral;
    76  		// hence, explicitly rule out non-integral requests
    77  		return 0
    78  	}
    79  	return desc.cpuRequest / 1000
    80  }
    81  
    82  func (desc podDesc) RequiresCPU() bool {
    83  	return desc.cpuRequest > 0
    84  }
    85  
    86  func (desc podDesc) RequiresDevices() bool {
    87  	return desc.resourceName != "" && desc.resourceAmount > 0
    88  }
    89  
    90  type initContainerDesc struct {
    91  	cntName        string
    92  	resourceName   string
    93  	resourceAmount int
    94  	cpuRequest     int // cpuRequest is in millicores
    95  	restartPolicy  *v1.ContainerRestartPolicy
    96  }
    97  
    98  func (desc initContainerDesc) CPURequestQty() resource.Quantity {
    99  	qty := resource.NewMilliQuantity(int64(desc.cpuRequest), resource.DecimalSI)
   100  	return *qty
   101  }
   102  
   103  func (desc initContainerDesc) CPURequestExclusive() int {
   104  	if (desc.cpuRequest % 1000) != 0 {
   105  		// exclusive cpus are requested only if the quantity is integral;
   106  		// hence, explicitly rule out non-integral requests
   107  		return 0
   108  	}
   109  	return desc.cpuRequest / 1000
   110  }
   111  
   112  func (desc initContainerDesc) RequiresCPU() bool {
   113  	return desc.cpuRequest > 0
   114  }
   115  
   116  func (desc initContainerDesc) RequiresDevices() bool {
   117  	return desc.resourceName != "" && desc.resourceAmount > 0
   118  }
   119  
   120  func makePodResourcesTestPod(desc podDesc) *v1.Pod {
   121  	cnt := v1.Container{
   122  		Name:  desc.cntName,
   123  		Image: busyboxImage,
   124  		Resources: v1.ResourceRequirements{
   125  			Requests: v1.ResourceList{},
   126  			Limits:   v1.ResourceList{},
   127  		},
   128  		Command: []string{"sh", "-c", "sleep 1d"},
   129  	}
   130  	if desc.RequiresCPU() {
   131  		cpuRequestQty := desc.CpuRequestQty()
   132  		cnt.Resources.Requests[v1.ResourceCPU] = cpuRequestQty
   133  		cnt.Resources.Limits[v1.ResourceCPU] = cpuRequestQty
   134  		// the memory size does not really matter; we only need the pod to be in the Guaranteed QoS class
   135  		cnt.Resources.Requests[v1.ResourceMemory] = resource.MustParse("100Mi")
   136  		cnt.Resources.Limits[v1.ResourceMemory] = resource.MustParse("100Mi")
   137  	}
   138  	if desc.RequiresDevices() {
   139  		cnt.Resources.Requests[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
   140  		cnt.Resources.Limits[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
   141  	}
   142  
   143  	var initCnts []v1.Container
   144  	for _, cntDesc := range desc.initContainers {
   145  		initCnt := v1.Container{
   146  			Name:  cntDesc.cntName,
   147  			Image: busyboxImage,
   148  			Resources: v1.ResourceRequirements{
   149  				Requests: v1.ResourceList{},
   150  				Limits:   v1.ResourceList{},
   151  			},
   152  			Command:       []string{"sh", "-c", "sleep 5s"},
   153  			RestartPolicy: cntDesc.restartPolicy,
   154  		}
   155  		if cntDesc.restartPolicy != nil && *cntDesc.restartPolicy == v1.ContainerRestartPolicyAlways {
   156  			initCnt.Command = []string{"sh", "-c", "sleep 1d"}
   157  		}
   158  		if cntDesc.RequiresCPU() {
   159  			cpuRequestQty := cntDesc.CPURequestQty()
   160  			initCnt.Resources.Requests[v1.ResourceCPU] = cpuRequestQty
   161  			initCnt.Resources.Limits[v1.ResourceCPU] = cpuRequestQty
   162  			// we don't really care, we only need to be in guaranteed QoS
   163  			initCnt.Resources.Requests[v1.ResourceMemory] = resource.MustParse("100Mi")
   164  			initCnt.Resources.Limits[v1.ResourceMemory] = resource.MustParse("100Mi")
   165  		}
   166  		if cntDesc.RequiresDevices() {
   167  			initCnt.Resources.Requests[v1.ResourceName(cntDesc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", cntDesc.resourceAmount))
   168  			initCnt.Resources.Limits[v1.ResourceName(cntDesc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", cntDesc.resourceAmount))
   169  		}
   170  		initCnts = append(initCnts, initCnt)
   171  	}
   172  
   173  	return &v1.Pod{
   174  		ObjectMeta: metav1.ObjectMeta{
   175  			Name: desc.podName,
   176  		},
   177  		Spec: v1.PodSpec{
   178  			RestartPolicy:  v1.RestartPolicyNever,
   179  			InitContainers: initCnts,
   180  			Containers: []v1.Container{
   181  				cnt,
   182  			},
   183  		},
   184  	}
   185  }
   186  
   187  func logPodResources(podIdx int, pr *kubeletpodresourcesv1.PodResources) {
   188  	ns := pr.GetNamespace()
   189  	cnts := pr.GetContainers()
   190  	if len(cnts) == 0 {
   191  		framework.Logf("#%02d/%02d/%02d - %s/%s/%s   No containers", podIdx, 0, 0, ns, pr.GetName(), "_")
   192  		return
   193  	}
   194  
   195  	for cntIdx, cnt := range cnts {
   196  		if len(cnt.Devices) == 0 {
   197  			framework.Logf("#%02d/%02d/%02d - %s/%s/%s   cpus -> %v   resources -> none", podIdx, cntIdx, 0, ns, pr.GetName(), cnt.Name, cnt.CpuIds)
   198  			continue
   199  		}
   200  
   201  		for devIdx, dev := range cnt.Devices {
   202  			framework.Logf("#%02d/%02d/%02d - %s/%s/%s   cpus -> %v   %s -> %s", podIdx, cntIdx, devIdx, ns, pr.GetName(), cnt.Name, cnt.CpuIds, dev.ResourceName, strings.Join(dev.DeviceIds, ", "))
   203  		}
   204  	}
   205  }
   206  
   207  type podResMap map[string]map[string]kubeletpodresourcesv1.ContainerResources
   208  
   209  func convertToMap(podsResources []*kubeletpodresourcesv1.PodResources) podResMap {
   210  	res := make(map[string]map[string]kubeletpodresourcesv1.ContainerResources)
   211  	for idx, podResource := range podsResources {
   212  		// to make troubleshooting easier
   213  		logPodResources(idx, podResource)
   214  
   215  		cnts := make(map[string]kubeletpodresourcesv1.ContainerResources)
   216  		for _, cnt := range podResource.GetContainers() {
   217  			cnts[cnt.GetName()] = *cnt
   218  		}
   219  		res[podResource.GetName()] = cnts
   220  	}
   221  	return res
   222  }
   223  
   224  func getPodResourcesValues(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient) (podResMap, error) {
   225  	resp, err := cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
   226  	if err != nil {
   227  		return nil, err
   228  	}
   229  	return convertToMap(resp.GetPodResources()), nil
   230  }
   231  
   232  type testPodData struct {
   233  	PodMap map[string]*v1.Pod
   234  }
   235  
   236  func newTestPodData() *testPodData {
   237  	return &testPodData{
   238  		PodMap: make(map[string]*v1.Pod),
   239  	}
   240  }
   241  
   242  func (tpd *testPodData) createPodsForTest(ctx context.Context, f *framework.Framework, podReqs []podDesc) {
   243  	for _, podReq := range podReqs {
   244  		pod := makePodResourcesTestPod(podReq)
   245  		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
   246  
   247  		framework.Logf("created pod %s", podReq.podName)
   248  		tpd.PodMap[podReq.podName] = pod
   249  	}
   250  }
   251  
   252  /* deletePodsForTest cleans up all the pods created for a testcase. Must ensure proper cleanup */
   253  func (tpd *testPodData) deletePodsForTest(ctx context.Context, f *framework.Framework) {
   254  	deletePodsAsync(ctx, f, tpd.PodMap)
   255  }
   256  
   257  /* deletePod removes a pod during a test. Should do a best-effort cleanup */
   258  func (tpd *testPodData) deletePod(ctx context.Context, f *framework.Framework, podName string) {
   259  	_, ok := tpd.PodMap[podName]
   260  	if !ok {
   261  		return
   262  	}
   263  	deletePodSyncByName(ctx, f, podName)
   264  	delete(tpd.PodMap, podName)
   265  }
   266  
   267  func findContainerDeviceByName(devs []*kubeletpodresourcesv1.ContainerDevices, resourceName string) *kubeletpodresourcesv1.ContainerDevices {
   268  	for _, dev := range devs {
   269  		if dev.ResourceName == resourceName {
   270  			return dev
   271  		}
   272  	}
   273  	return nil
   274  }
   275  
   276  func matchPodDescWithResources(expected []podDesc, found podResMap) error {
   277  	for _, podReq := range expected {
   278  		framework.Logf("matching: %#v", podReq)
   279  
   280  		podInfo, ok := found[podReq.podName]
   281  		if !ok {
   282  			return fmt.Errorf("no pod resources for pod %q", podReq.podName)
   283  		}
   284  		cntInfo, ok := podInfo[podReq.cntName]
   285  		if !ok {
   286  			return fmt.Errorf("no container resources for pod %q container %q", podReq.podName, podReq.cntName)
   287  		}
   288  		if podReq.RequiresCPU() {
   289  			if exclusiveCpus := podReq.CpuRequestExclusive(); exclusiveCpus != len(cntInfo.CpuIds) {
   290  				if exclusiveCpus == 0 {
   291  					return fmt.Errorf("pod %q container %q requested %d expected to be allocated CPUs from shared pool %v", podReq.podName, podReq.cntName, podReq.cpuRequest, cntInfo.CpuIds)
   292  				}
   293  				return fmt.Errorf("pod %q container %q expected %d cpus got %v", podReq.podName, podReq.cntName, exclusiveCpus, cntInfo.CpuIds)
   294  			}
   295  		}
   296  		if podReq.RequiresDevices() {
   297  			dev := findContainerDeviceByName(cntInfo.GetDevices(), podReq.resourceName)
   298  			if dev == nil {
   299  				return fmt.Errorf("pod %q container %q expected data for resource %q not found", podReq.podName, podReq.cntName, podReq.resourceName)
   300  			}
   301  			if len(dev.DeviceIds) != podReq.resourceAmount {
   302  				return fmt.Errorf("pod %q container %q resource %q expected %d items got %v", podReq.podName, podReq.cntName, podReq.resourceName, podReq.resourceAmount, dev.DeviceIds)
   303  			}
   304  		} else {
   305  			devs := cntInfo.GetDevices()
   306  			if len(devs) > 0 {
   307  				return fmt.Errorf("pod %q container %q expected no resources, got %v", podReq.podName, podReq.cntName, devs)
   308  			}
   309  		}
   310  		if cnts, ok := found[defaultTopologyUnawareResourceName]; ok {
   311  			for _, cnt := range cnts {
   312  				for _, cd := range cnt.GetDevices() {
   313  					if cd.ResourceName != defaultTopologyUnawareResourceName {
   314  						continue
   315  					}
   316  					if cd.Topology != nil {
   317  						// we expect nil topology
   318  						return fmt.Errorf("expected nil topology for topology-unaware resource %q, got %v", cd.ResourceName, cd.Topology)
   319  					}
   320  				}
   321  
   322  			}
   323  		}
   324  
   325  		// check init containers
   326  		for _, initCntDesc := range podReq.initContainers {
   327  			if initCntDesc.restartPolicy == nil || *initCntDesc.restartPolicy != v1.ContainerRestartPolicyAlways {
   328  				// If the init container is not restartable, we don't expect it
   329  				// to be reported.
   330  				_, ok := podInfo[initCntDesc.cntName]
   331  				if ok {
   332  					return fmt.Errorf("pod %q regular init container %q should not be reported", podReq.podName, initCntDesc.cntName)
   333  				}
   334  				continue
   335  			}
   336  
   337  			cntInfo, ok := podInfo[initCntDesc.cntName]
   338  			if !ok {
   339  				return fmt.Errorf("no container resources for pod %q container %q", podReq.podName, initCntDesc.cntName)
   340  			}
   341  			if initCntDesc.RequiresCPU() {
   342  				if exclusiveCpus := initCntDesc.CPURequestExclusive(); exclusiveCpus != len(cntInfo.CpuIds) {
   343  					if exclusiveCpus == 0 {
   344  						return fmt.Errorf("pod %q container %q requested %d expected to be allocated CPUs from shared pool %v", podReq.podName, initCntDesc.cntName, initCntDesc.cpuRequest, cntInfo.CpuIds)
   345  					}
   346  					return fmt.Errorf("pod %q container %q expected %d cpus got %v", podReq.podName, initCntDesc.cntName, exclusiveCpus, cntInfo.CpuIds)
   347  				}
   348  			}
   349  			if initCntDesc.RequiresDevices() {
   350  				dev := findContainerDeviceByName(cntInfo.GetDevices(), initCntDesc.resourceName)
   351  				if dev == nil {
   352  					return fmt.Errorf("pod %q container %q expected data for resource %q not found", podReq.podName, initCntDesc.cntName, initCntDesc.resourceName)
   353  				}
   354  				if len(dev.DeviceIds) != initCntDesc.resourceAmount {
   355  					return fmt.Errorf("pod %q container %q resource %q expected %d items got %v", podReq.podName, initCntDesc.cntName, initCntDesc.resourceName, initCntDesc.resourceAmount, dev.DeviceIds)
   356  				}
   357  			} else {
   358  				devs := cntInfo.GetDevices()
   359  				if len(devs) > 0 {
   360  					return fmt.Errorf("pod %q container %q expected no resources, got %v", podReq.podName, initCntDesc.cntName, devs)
   361  				}
   362  			}
   363  			if cnts, ok := found[defaultTopologyUnawareResourceName]; ok {
   364  				for _, cnt := range cnts {
   365  					for _, cd := range cnt.GetDevices() {
   366  						if cd.ResourceName != defaultTopologyUnawareResourceName {
   367  							continue
   368  						}
   369  						if cd.Topology != nil {
   370  							// we expect nil topology
   371  							return fmt.Errorf("expected nil topology for topology-unaware resource %q, got %v", cd.ResourceName, cd.Topology)
   372  						}
   373  					}
   374  				}
   375  			}
   376  		}
   377  	}
   378  	return nil
   379  }
   380  
   381  func expectPodResources(ctx context.Context, offset int, cli kubeletpodresourcesv1.PodResourcesListerClient, expected []podDesc) {
   382  	gomega.EventuallyWithOffset(1+offset, ctx, func(ctx context.Context) error {
   383  		found, err := getPodResourcesValues(ctx, cli)
   384  		if err != nil {
   385  			return err
   386  		}
   387  		return matchPodDescWithResources(expected, found)
   388  	}, time.Minute, 10*time.Second).Should(gomega.Succeed())
   389  }
   390  
   391  func filterOutDesc(descs []podDesc, name string) []podDesc {
   392  	var ret []podDesc
   393  	for _, desc := range descs {
   394  		if desc.podName == name {
   395  			continue
   396  		}
   397  		ret = append(ret, desc)
   398  	}
   399  	return ret
   400  }
   401  
   402  func podresourcesListTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, sidecarContainersEnabled bool) {
   403  	var tpd *testPodData
   404  
   405  	var found podResMap
   406  	var expected []podDesc
   407  	var extra podDesc
   408  
   409  	expectedBasePods := 0 /* nothing but pods we create */
   410  	if sd != nil {
   411  		expectedBasePods = 1 // sriovdp
   412  	}
   413  
   414  	var err error
   415  	ginkgo.By("checking the output when no pods are present")
   416  	found, err = getPodResourcesValues(ctx, cli)
   417  	framework.ExpectNoError(err, "getPodResourcesValues() failed err: %v", err)
   418  	gomega.ExpectWithOffset(1, found).To(gomega.HaveLen(expectedBasePods), "base pod expectation mismatch")
   419  
   420  	tpd = newTestPodData()
   421  	ginkgo.By("checking the output when only pods which don't require resources are present")
   422  	expected = []podDesc{
   423  		{
   424  			podName: "pod-00",
   425  			cntName: "cnt-00",
   426  		},
   427  		{
   428  			podName: "pod-01",
   429  			cntName: "cnt-00",
   430  		},
   431  	}
   432  
   433  	tpd.createPodsForTest(ctx, f, expected)
   434  	expectPodResources(ctx, 1, cli, expected)
   435  	tpd.deletePodsForTest(ctx, f)
   436  
   437  	tpd = newTestPodData()
   438  	ginkgo.By("checking the output when only a subset of pods require resources")
   439  	if sd != nil {
   440  		expected = []podDesc{
   441  			{
   442  				podName: "pod-00",
   443  				cntName: "cnt-00",
   444  			},
   445  			{
   446  				podName:        "pod-01",
   447  				cntName:        "cnt-00",
   448  				resourceName:   sd.resourceName,
   449  				resourceAmount: 1,
   450  				cpuRequest:     1000,
   451  			},
   452  			{
   453  				podName:    "pod-02",
   454  				cntName:    "cnt-00",
   455  				cpuRequest: 1000,
   456  			},
   457  			{
   458  				podName:        "pod-03",
   459  				cntName:        "cnt-00",
   460  				resourceName:   sd.resourceName,
   461  				resourceAmount: 1,
   462  				cpuRequest:     1000,
   463  			},
   464  		}
   465  	} else {
   466  		expected = []podDesc{
   467  			{
   468  				podName: "pod-00",
   469  				cntName: "cnt-00",
   470  			},
   471  			{
   472  				podName:    "pod-01",
   473  				cntName:    "cnt-00",
   474  				cpuRequest: 1000,
   475  			},
   476  			{
   477  				podName:    "pod-02",
   478  				cntName:    "cnt-00",
   479  				cpuRequest: 1000,
   480  			},
   481  			{
   482  				podName:    "pod-03",
   483  				cntName:    "cnt-00",
   484  				cpuRequest: 1000,
   485  			},
   486  		}
   487  
   488  	}
   489  	tpd.createPodsForTest(ctx, f, expected)
   490  	expectPodResources(ctx, 1, cli, expected)
   491  	tpd.deletePodsForTest(ctx, f)
   492  
   493  	tpd = newTestPodData()
   494  	ginkgo.By("checking the output when creating pods which require resources between calls")
   495  	if sd != nil {
   496  		expected = []podDesc{
   497  			{
   498  				podName: "pod-00",
   499  				cntName: "cnt-00",
   500  			},
   501  			{
   502  				podName:        "pod-01",
   503  				cntName:        "cnt-00",
   504  				resourceName:   sd.resourceName,
   505  				resourceAmount: 1,
   506  				cpuRequest:     1000,
   507  			},
   508  			{
   509  				podName:    "pod-02",
   510  				cntName:    "cnt-00",
   511  				cpuRequest: 1000,
   512  			},
   513  		}
   514  	} else {
   515  		expected = []podDesc{
   516  			{
   517  				podName: "pod-00",
   518  				cntName: "cnt-00",
   519  			},
   520  			{
   521  				podName:    "pod-01",
   522  				cntName:    "cnt-00",
   523  				cpuRequest: 1000,
   524  			},
   525  			{
   526  				podName:    "pod-02",
   527  				cntName:    "cnt-00",
   528  				cpuRequest: 1000,
   529  			},
   530  		}
   531  	}
   532  
   533  	tpd.createPodsForTest(ctx, f, expected)
   534  	expectPodResources(ctx, 1, cli, expected)
   535  
   536  	if sd != nil {
   537  		extra = podDesc{
   538  			podName:        "pod-03",
   539  			cntName:        "cnt-00",
   540  			resourceName:   sd.resourceName,
   541  			resourceAmount: 1,
   542  			cpuRequest:     1000,
   543  		}
   544  	} else {
   545  		extra = podDesc{
   546  			podName:    "pod-03",
   547  			cntName:    "cnt-00",
   548  			cpuRequest: 1000,
   549  		}
   550  
   551  	}
   552  
   553  	tpd.createPodsForTest(ctx, f, []podDesc{
   554  		extra,
   555  	})
   556  
   557  	expected = append(expected, extra)
   558  	expectPodResources(ctx, 1, cli, expected)
   559  	tpd.deletePodsForTest(ctx, f)
   560  
   561  	tpd = newTestPodData()
   562  	ginkgo.By("checking the output when deleting pods which require resources between calls")
   563  
   564  	if sd != nil {
   565  		expected = []podDesc{
   566  			{
   567  				podName:    "pod-00",
   568  				cntName:    "cnt-00",
   569  				cpuRequest: 1000,
   570  			},
   571  			{
   572  				podName:        "pod-01",
   573  				cntName:        "cnt-00",
   574  				resourceName:   sd.resourceName,
   575  				resourceAmount: 1,
   576  				cpuRequest:     2000,
   577  			},
   578  			{
   579  				podName: "pod-02",
   580  				cntName: "cnt-00",
   581  			},
   582  			{
   583  				podName:        "pod-03",
   584  				cntName:        "cnt-00",
   585  				resourceName:   sd.resourceName,
   586  				resourceAmount: 1,
   587  				cpuRequest:     1000,
   588  			},
   589  		}
   590  	} else {
   591  		expected = []podDesc{
   592  			{
   593  				podName:    "pod-00",
   594  				cntName:    "cnt-00",
   595  				cpuRequest: 1000,
   596  			},
   597  			{
   598  				podName:    "pod-01",
   599  				cntName:    "cnt-00",
   600  				cpuRequest: 1000,
   601  			},
   602  			{
   603  				podName: "pod-02",
   604  				cntName: "cnt-00",
   605  			},
   606  			{
   607  				podName:    "pod-03",
   608  				cntName:    "cnt-00",
   609  				cpuRequest: 1000,
   610  			},
   611  		}
   612  	}
   613  	tpd.createPodsForTest(ctx, f, expected)
   614  	expectPodResources(ctx, 1, cli, expected)
   615  
   616  	tpd.deletePod(ctx, f, "pod-01")
   617  	expectedPostDelete := filterOutDesc(expected, "pod-01")
   618  	expectPodResources(ctx, 1, cli, expectedPostDelete)
   619  	tpd.deletePodsForTest(ctx, f)
   620  
   621  	tpd = newTestPodData()
   622  	ginkgo.By("checking the output when pods request non-integral CPUs")
   623  	if sd != nil {
   624  		expected = []podDesc{
   625  			{
   626  				podName:    "pod-00",
   627  				cntName:    "cnt-00",
   628  				cpuRequest: 1500,
   629  			},
   630  			{
   631  				podName:        "pod-01",
   632  				cntName:        "cnt-00",
   633  				resourceName:   sd.resourceName,
   634  				resourceAmount: 1,
   635  				cpuRequest:     1500,
   636  			},
   637  		}
   638  	} else {
   639  		expected = []podDesc{
   640  			{
   641  				podName:    "pod-00",
   642  				cntName:    "cnt-00",
   643  				cpuRequest: 1500,
   644  			},
   645  		}
   646  
   647  	}
   648  	tpd.createPodsForTest(ctx, f, expected)
   649  	expectPodResources(ctx, 1, cli, expected)
   650  	tpd.deletePodsForTest(ctx, f)
   651  
   652  	if sidecarContainersEnabled {
   653  		containerRestartPolicyAlways := v1.ContainerRestartPolicyAlways
   654  
   655  		tpd = newTestPodData()
   656  		ginkgo.By("checking the output when pods have init containers")
   657  		if sd != nil {
   658  			expected = []podDesc{
   659  				{
   660  					podName:    "pod-00",
   661  					cntName:    "regular-00",
   662  					cpuRequest: 1000,
   663  					initContainers: []initContainerDesc{
   664  						{
   665  							cntName:        "init-00",
   666  							resourceName:   sd.resourceName,
   667  							resourceAmount: 1,
   668  							cpuRequest:     1000,
   669  						},
   670  					},
   671  				},
   672  				{
   673  					podName:    "pod-01",
   674  					cntName:    "regular-00",
   675  					cpuRequest: 1000,
   676  					initContainers: []initContainerDesc{
   677  						{
   678  							cntName:        "restartable-init-00",
   679  							resourceName:   sd.resourceName,
   680  							resourceAmount: 1,
   681  							cpuRequest:     1000,
   682  							restartPolicy:  &containerRestartPolicyAlways,
   683  						},
   684  					},
   685  				},
   686  			}
   687  		} else {
   688  			expected = []podDesc{
   689  				{
   690  					podName:    "pod-00",
   691  					cntName:    "regular-00",
   692  					cpuRequest: 1000,
   693  					initContainers: []initContainerDesc{
   694  						{
   695  							cntName:    "init-00",
   696  							cpuRequest: 1000,
   697  						},
   698  					},
   699  				},
   700  				{
   701  					podName:    "pod-01",
   702  					cntName:    "regular-00",
   703  					cpuRequest: 1000,
   704  					initContainers: []initContainerDesc{
   705  						{
   706  							cntName:       "restartable-init-00",
   707  							cpuRequest:    1000,
   708  							restartPolicy: &containerRestartPolicyAlways,
   709  						},
   710  					},
   711  				},
   712  			}
   713  		}
   714  
   715  		tpd.createPodsForTest(ctx, f, expected)
   716  		expectPodResources(ctx, 1, cli, expected)
   717  		tpd.deletePodsForTest(ctx, f)
   718  	}
   719  }
   720  
   721  func podresourcesGetAllocatableResourcesTests(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, onlineCPUs, reservedSystemCPUs cpuset.CPUSet) {
   722  	ginkgo.By("checking the devices known to the kubelet")
   723  	resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
   724  	framework.ExpectNoErrorWithOffset(1, err)
   725  	devs := resp.GetDevices()
   726  	var cpus []int
   727  	for _, cpuid := range resp.GetCpuIds() {
   728  		cpus = append(cpus, int(cpuid))
   729  	}
   730  	allocatableCPUs := cpuset.New(cpus...)
   731  
   732  	if onlineCPUs.Size() == 0 {
   733  		ginkgo.By("expecting no CPUs reported")
   734  		gomega.ExpectWithOffset(1, onlineCPUs.Size()).To(gomega.Equal(reservedSystemCPUs.Size()), "with no online CPUs, no CPUs should be reserved")
   735  	} else {
   736  		ginkgo.By(fmt.Sprintf("expecting online CPUs reported - online=%v (%d) reserved=%v (%d)", onlineCPUs, onlineCPUs.Size(), reservedSystemCPUs, reservedSystemCPUs.Size()))
   737  		if reservedSystemCPUs.Size() > onlineCPUs.Size() {
   738  			ginkgo.Fail("more reserved CPUs than online")
   739  		}
   740  		expectedCPUs := onlineCPUs.Difference(reservedSystemCPUs)
   741  
   742  		ginkgo.By(fmt.Sprintf("expecting CPUs '%v'='%v'", allocatableCPUs, expectedCPUs))
   743  		gomega.ExpectWithOffset(1, allocatableCPUs.Equals(expectedCPUs)).To(gomega.BeTrue(), "mismatch expecting CPUs")
   744  	}
   745  
   746  	if sd == nil { // no devices in the environment, so expect no devices
   747  		ginkgo.By("expecting no devices reported")
   748  		gomega.ExpectWithOffset(1, devs).To(gomega.BeEmpty(), fmt.Sprintf("got unexpected devices %#v", devs))
   749  		return
   750  	}
   751  
   752  	ginkgo.By(fmt.Sprintf("expecting some %q devices reported", sd.resourceName))
   753  	gomega.ExpectWithOffset(1, devs).ToNot(gomega.BeEmpty())
   754  	for _, dev := range devs {
   755  		gomega.Expect(dev.ResourceName).To(gomega.Equal(sd.resourceName))
   756  		gomega.ExpectWithOffset(1, dev.DeviceIds).ToNot(gomega.BeEmpty())
   757  	}
   758  }
   759  
   760  func podresourcesGetTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sidecarContainersEnabled bool) {
   761  	//var err error
   762  	ginkgo.By("checking the output when no pods are present")
   763  	expected := []podDesc{}
   764  	resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name})
   765  	podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
   766  	gomega.Expect(err).To(gomega.HaveOccurred(), "pod not found")
   767  	res := convertToMap(podResourceList)
   768  	err = matchPodDescWithResources(expected, res)
   769  	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
   770  
   771  	tpd := newTestPodData()
   772  	ginkgo.By("checking the output when only pods which don't require resources are present")
   773  	expected = []podDesc{
   774  		{
   775  			podName: "pod-00",
   776  			cntName: "cnt-00",
   777  		},
   778  	}
   779  	tpd.createPodsForTest(ctx, f, expected)
   780  	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-00", PodNamespace: f.Namespace.Name})
   781  	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-00")
   782  	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
   783  	res = convertToMap(podResourceList)
   784  	err = matchPodDescWithResources(expected, res)
   785  	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
   786  	tpd.deletePodsForTest(ctx, f)
   787  
   788  	tpd = newTestPodData()
   789  	ginkgo.By("checking the output when only one pod requires CPU")
   790  	expected = []podDesc{
   791  		{
   792  			podName:    "pod-01",
   793  			cntName:    "cnt-00",
   794  			cpuRequest: 1000,
   795  		},
   796  	}
   797  	tpd.createPodsForTest(ctx, f, expected)
   798  	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
   799  	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
   800  	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
   801  	res = convertToMap(podResourceList)
   802  	err = matchPodDescWithResources(expected, res)
   803  	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
   804  	tpd.deletePodsForTest(ctx, f)
   805  
   806  	if sidecarContainersEnabled {
   807  		containerRestartPolicyAlways := v1.ContainerRestartPolicyAlways
   808  
   809  		tpd = newTestPodData()
   810  		ginkgo.By("checking the output when only a pod with init containers requires CPU")
   811  		expected = []podDesc{
   812  			{
   813  				podName:    "pod-01",
   814  				cntName:    "cnt-00",
   815  				cpuRequest: 1000,
   816  				initContainers: []initContainerDesc{
   817  					{
   818  						cntName:    "init-00",
   819  						cpuRequest: 1000,
   820  					},
   821  					{
   822  						cntName:       "restartable-init-01",
   823  						cpuRequest:    1000,
   824  						restartPolicy: &containerRestartPolicyAlways,
   825  					},
   826  				},
   827  			},
   828  		}
   829  		tpd.createPodsForTest(ctx, f, expected)
   830  		resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
   831  		framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
   832  		podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
   833  		res = convertToMap(podResourceList)
   834  		err = matchPodDescWithResources(expected, res)
   835  		framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
   836  		tpd.deletePodsForTest(ctx, f)
   837  	}
   838  }
   839  
   840  // Serial because the test updates kubelet configuration.
   841  var _ = SIGDescribe("POD Resources", framework.WithSerial(), feature.PodResources, nodefeature.PodResources, func() {
   842  	f := framework.NewDefaultFramework("podresources-test")
   843  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   844  
   845  	reservedSystemCPUs := cpuset.New(1)
   846  
   847  	ginkgo.Context("with SRIOV devices in the system", func() {
   848  		ginkgo.BeforeEach(func() {
   849  			requireSRIOVDevices()
   850  		})
   851  
   852  		ginkgo.Context("with CPU manager Static policy", func() {
   853  			ginkgo.BeforeEach(func(ctx context.Context) {
   854  				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
   855  				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)
   856  
   857  				if cpuAlloc < minCoreCount {
   858  					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
   859  				}
   860  			})
   861  
   862  			// empty context to apply kubelet config changes
   863  			ginkgo.Context("", func() {
   864  				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   865  					// Set the CPU Manager policy to static.
   866  					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)
   867  
   868  					// Set the CPU Manager reconcile period to 1 second.
   869  					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
   870  
   871  					cpus := reservedSystemCPUs.String()
   872  					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
   873  					initialConfig.ReservedSystemCPUs = cpus
   874  				})
   875  
   876  				ginkgo.It("should return the expected responses", func(ctx context.Context) {
   877  					onlineCPUs, err := getOnlineCPUs()
   878  					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)
   879  
   880  					configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
   881  					sd := setupSRIOVConfigOrFail(ctx, f, configMap)
   882  					ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)
   883  
   884  					waitForSRIOVResources(ctx, f, sd)
   885  
   886  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   887  					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
   888  
   889  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   890  					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
   891  					defer conn.Close()
   892  
   893  					waitForSRIOVResources(ctx, f, sd)
   894  
   895  					ginkgo.By("checking List()")
   896  					podresourcesListTests(ctx, f, cli, sd, false)
   897  					ginkgo.By("checking GetAllocatableResources()")
   898  					podresourcesGetAllocatableResourcesTests(ctx, cli, sd, onlineCPUs, reservedSystemCPUs)
   899  				})
   900  
   901  				framework.It("should return the expected responses", nodefeature.SidecarContainers, func(ctx context.Context) {
   902  					onlineCPUs, err := getOnlineCPUs()
   903  					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)
   904  
   905  					configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
   906  					sd := setupSRIOVConfigOrFail(ctx, f, configMap)
   907  					ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)
   908  
   909  					waitForSRIOVResources(ctx, f, sd)
   910  
   911  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   912  					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
   913  
   914  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   915  					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
   916  					defer framework.ExpectNoError(conn.Close())
   917  
   918  					waitForSRIOVResources(ctx, f, sd)
   919  
   920  					ginkgo.By("checking List()")
   921  					podresourcesListTests(ctx, f, cli, sd, true)
   922  					ginkgo.By("checking GetAllocatableResources()")
   923  					podresourcesGetAllocatableResourcesTests(ctx, cli, sd, onlineCPUs, reservedSystemCPUs)
   924  				})
   925  			})
   926  		})
   927  
   928  		ginkgo.Context("with CPU manager None policy", func() {
   929  			ginkgo.It("should return the expected responses", func(ctx context.Context) {
   930  				// current default is "none" policy - no need to restart the kubelet
   931  
   932  				requireSRIOVDevices()
   933  
   934  				configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
   935  				sd := setupSRIOVConfigOrFail(ctx, f, configMap)
   936  				ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)
   937  
   938  				waitForSRIOVResources(ctx, f, sd)
   939  
   940  				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   941  				framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
   942  
   943  				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   944  				framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
   945  				defer conn.Close()
   946  
   947  				waitForSRIOVResources(ctx, f, sd)
   948  
   949  				// intentionally passing an empty cpuset instead of onlineCPUs because with the none policy
   950  				// we should get no allocatable cpus - exclusive CPU allocation is only reported with the static policy
   951  				podresourcesGetAllocatableResourcesTests(ctx, cli, sd, cpuset.CPUSet{}, cpuset.CPUSet{})
   952  			})
   953  		})
   954  	})
   955  
   956  	ginkgo.Context("without SRIOV devices in the system", func() {
   957  		ginkgo.BeforeEach(func() {
   958  			requireLackOfSRIOVDevices()
   959  		})
   960  
   961  		ginkgo.Context("with CPU manager Static policy", func() {
   962  			ginkgo.BeforeEach(func(ctx context.Context) {
   963  				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
   964  				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)
   965  
   966  				if cpuAlloc < minCoreCount {
   967  					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
   968  				}
   969  			})
   970  
   971  			// empty context to apply kubelet config changes
   972  			ginkgo.Context("", func() {
   973  				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
   974  					// Set the CPU Manager policy to static.
   975  					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)
   976  
   977  					// Set the CPU Manager reconcile period to 1 second.
   978  					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
   979  
   980  					cpus := reservedSystemCPUs.String()
   981  					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
   982  					initialConfig.ReservedSystemCPUs = cpus
   983  					if initialConfig.FeatureGates == nil {
   984  						initialConfig.FeatureGates = make(map[string]bool)
   985  					}
   986  					initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true
   987  				})
   988  
   989  				ginkgo.It("should return the expected responses", func(ctx context.Context) {
   990  					onlineCPUs, err := getOnlineCPUs()
   991  					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)
   992  
   993  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
   994  					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
   995  
   996  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
   997  					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
   998  					defer conn.Close()
   999  
  1000  					podresourcesListTests(ctx, f, cli, nil, false)
  1001  					podresourcesGetAllocatableResourcesTests(ctx, cli, nil, onlineCPUs, reservedSystemCPUs)
  1002  					podresourcesGetTests(ctx, f, cli, false)
  1003  				})
  1004  
  1005  				framework.It("should return the expected responses", nodefeature.SidecarContainers, func(ctx context.Context) {
  1006  					onlineCPUs, err := getOnlineCPUs()
  1007  					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)
  1008  
  1009  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1010  					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
  1011  
  1012  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1013  					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
  1014  					defer func() {
  1015  						framework.ExpectNoError(conn.Close())
  1016  					}()
  1017  
  1018  					podresourcesListTests(ctx, f, cli, nil, true)
  1019  					podresourcesGetAllocatableResourcesTests(ctx, cli, nil, onlineCPUs, reservedSystemCPUs)
  1020  					podresourcesGetTests(ctx, f, cli, true)
  1021  				})
  1022  				ginkgo.It("should account for resources of pods in terminal phase", func(ctx context.Context) {
  1023  					pd := podDesc{
  1024  						cntName:    "e2e-test-cnt",
  1025  						podName:    "e2e-test-pod",
  1026  						cpuRequest: 1000,
  1027  					}
  1028  					pod := makePodResourcesTestPod(pd)
  1029  					pod.Spec.Containers[0].Command = []string{"sh", "-c", "/bin/true"}
  1030  					pod = e2epod.NewPodClient(f).Create(ctx, pod)
  1031  					defer e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, metav1.DeleteOptions{}, time.Minute)
  1032  					err := e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "Pod Succeeded", time.Minute*2, testutils.PodSucceeded)
  1033  					framework.ExpectNoError(err)
  1034  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1035  					framework.ExpectNoError(err)
  1036  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1037  					framework.ExpectNoError(err)
  1038  					defer conn.Close()
  1039  					// although the pod moved into a terminal state, the PodResources API still lists its cpus
  1040  					expectPodResources(ctx, 1, cli, []podDesc{pd})
  1041  
  1042  				})
  1043  			})
  1044  		})
  1045  
  1046  		ginkgo.Context("with CPU manager None policy", func() {
  1047  			ginkgo.It("should return the expected responses", func(ctx context.Context) {
  1048  				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1049  				framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
  1050  
  1051  				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1052  				framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
  1053  				defer conn.Close()
  1054  
  1055  				// intentionally passing an empty cpuset instead of onlineCPUs because with the none policy
  1056  				// we should get no allocatable cpus - exclusive CPU allocation is only reported with the static policy
  1057  				podresourcesGetAllocatableResourcesTests(ctx, cli, nil, cpuset.CPUSet{}, cpuset.CPUSet{})
  1058  			})
  1059  		})
  1060  
  1061  		ginkgo.Context("with disabled KubeletPodResourcesGet feature gate", func() {
  1062  
  1063  			ginkgo.It("should return the expected error with the feature gate disabled", func(ctx context.Context) {
  1064  				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1065  				framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)
  1066  
  1067  				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1068  				framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
  1069  				defer conn.Close()
  1070  
  1071  				ginkgo.By("checking Get fail if the feature gate is not enabled")
  1072  				getRes, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name})
  1073  				framework.Logf("Get result: %v, err: %v", getRes, err)
  1074  				gomega.Expect(err).To(gomega.HaveOccurred(), "With feature gate disabled, the call must fail")
  1075  			})
  1076  		})
  1077  	})
  1078  
  1079  	ginkgo.Context("with a topology-unaware device plugin, which reports resources w/o hardware topology", func() {
  1080  		ginkgo.Context("with CPU manager Static policy", func() {
  1081  			ginkgo.BeforeEach(func(ctx context.Context) {
  1082  				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
  1083  				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)
  1084  
  1085  				if cpuAlloc < minCoreCount {
  1086  					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
  1087  				}
  1088  			})
  1089  
  1090  			// empty context to apply kubelet config changes
  1091  			ginkgo.Context("", func() {
  1092  				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
  1093  					// Set the CPU Manager policy to static.
  1094  					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)
  1095  
  1096  					// Set the CPU Manager reconcile period to 1 second.
  1097  					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
  1098  
  1099  					cpus := reservedSystemCPUs.String()
  1100  					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
  1101  					initialConfig.ReservedSystemCPUs = cpus
  1102  				})
  1103  
  1104  				ginkgo.It("should return proper podresources the same as before the restart of kubelet", func(ctx context.Context) {
  1105  					dpPod := setupSampleDevicePluginOrFail(ctx, f)
  1106  					ginkgo.DeferCleanup(teardownSampleDevicePluginOrFail, f, dpPod)
  1107  
  1108  					waitForTopologyUnawareResources(ctx, f)
  1109  
  1110  					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1111  					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
  1112  
  1113  					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1114  					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
  1115  					defer conn.Close()
  1116  
  1117  					ginkgo.By("checking List and resources topology unaware resource should be without topology")
  1118  
  1119  					allocatableResponse, _ := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
  1120  					for _, dev := range allocatableResponse.GetDevices() {
  1121  						if dev.ResourceName != defaultTopologyUnawareResourceName {
  1122  							continue
  1123  						}
  1124  						gomega.Expect(dev.Topology).To(gomega.BeNil(), "Topology is expected to be empty for topology unaware resources")
  1125  					}
  1126  
  1127  					desc := podDesc{
  1128  						podName:        "pod-01",
  1129  						cntName:        "cnt-01",
  1130  						resourceName:   defaultTopologyUnawareResourceName,
  1131  						resourceAmount: 1,
  1132  						cpuRequest:     1000,
  1133  					}
  1134  
  1135  					tpd := newTestPodData()
  1136  					tpd.createPodsForTest(ctx, f, []podDesc{
  1137  						desc,
  1138  					})
  1139  
  1140  					expectPodResources(ctx, 1, cli, []podDesc{desc})
  1141  
  1142  					ginkgo.By("Restarting Kubelet")
  1143  					restartKubelet(true)
  1144  
  1145  					// we need to wait for the node to be reported ready before we can safely query
  1146  					// the podresources endpoint again. Otherwise we will have false negatives.
  1147  					ginkgo.By("Wait for node to be ready")
  1148  					waitForTopologyUnawareResources(ctx, f)
  1149  
  1150  					expectPodResources(ctx, 1, cli, []podDesc{desc})
  1151  					tpd.deletePodsForTest(ctx, f)
  1152  				})
  1153  			})
  1154  		})
  1155  	})
  1156  
  1157  	f.Context("when querying /metrics", f.WithNodeConformance(), func() {
  1158  		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
  1159  			if initialConfig.FeatureGates == nil {
  1160  				initialConfig.FeatureGates = make(map[string]bool)
  1161  			}
  1162  			initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true
  1163  		})
  1164  		ginkgo.BeforeEach(func(ctx context.Context) {
  1165  			// ensure APIs have been called at least once
  1166  			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1167  			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)
  1168  
  1169  			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1170  			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
  1171  			defer conn.Close()
  1172  
  1173  			_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
  1174  			framework.ExpectNoError(err, "List() failed err %v", err)
  1175  
  1176  			_, err = cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
  1177  			framework.ExpectNoError(err, "GetAllocatableResources() failed err %v", err)
  1178  
  1179  			desc := podDesc{
  1180  				podName: "pod-01",
  1181  				cntName: "cnt-01",
  1182  			}
  1183  			tpd := newTestPodData()
  1184  			tpd.createPodsForTest(ctx, f, []podDesc{
  1185  				desc,
  1186  			})
  1187  			expectPodResources(ctx, 1, cli, []podDesc{desc})
  1188  
  1189  			expected := []podDesc{}
  1190  			resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
  1191  			framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
  1192  			podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
  1193  			res := convertToMap(podResourceList)
  1194  			err = matchPodDescWithResources(expected, res)
  1195  			framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
  1196  			tpd.deletePodsForTest(ctx, f)
  1197  		})
  1198  
  1199  		ginkgo.It("should report the values for the podresources metrics", func(ctx context.Context) {
  1200  			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
  1201  			// being [Serial], we can also assume no one else but us is running pods.
  1202  			ginkgo.By("Checking the value of the podresources metrics")
  1203  
  1204  			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
  1205  				"kubelet_pod_resources_endpoint_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
  1206  					"": timelessSampleAtLeast(1),
  1207  				}),
  1208  				"kubelet_pod_resources_endpoint_requests_list": gstruct.MatchAllElements(nodeID, gstruct.Elements{
  1209  					"": timelessSampleAtLeast(1),
  1210  				}),
  1211  				"kubelet_pod_resources_endpoint_requests_get_allocatable": gstruct.MatchAllElements(nodeID, gstruct.Elements{
  1212  					"": timelessSampleAtLeast(1),
  1213  				}),
  1214  				"kubelet_pod_resources_endpoint_requests_get": gstruct.MatchAllElements(nodeID, gstruct.Elements{
  1215  					"": timelessSampleAtLeast(1),
  1216  				}),
  1217  				// not checking error counters: the calls have no error conditions yet other than catastrophic ones (e.g. out of memory).
  1218  			})
  1219  
  1220  			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
  1221  			gomega.Eventually(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
  1222  			ginkgo.By("Ensuring the metrics match the expectations a few more times")
  1223  			gomega.Consistently(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
  1224  		})
  1225  	})
  1226  
  1227  	ginkgo.Context("with the builtin rate limit values", func() {
  1228  		ginkgo.It("should hit throttling when calling podresources List in a tight loop", func(ctx context.Context) {
  1229  			// ensure APIs have been called at least once
  1230  			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
  1231  			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)
  1232  
  1233  			ginkgo.By("Connecting to the kubelet endpoint")
  1234  			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
  1235  			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
  1236  			defer conn.Close()
  1237  
  1238  			tries := podresources.DefaultQPS * 2 // This should also be greater than DefaultBurstTokens
  1239  			errs := []error{}
  1240  
  1241  			ginkgo.By(fmt.Sprintf("Issuing %d List() calls in a tight loop", tries))
  1242  			startTime := time.Now()
  1243  			for try := 0; try < tries; try++ {
  1244  				_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
  1245  				errs = append(errs, err)
  1246  			}
  1247  			elapsed := time.Since(startTime)
  1248  
  1249  			ginkgo.By(fmt.Sprintf("Checking return codes for %d List() calls in %v", tries, elapsed))
  1250  
  1251  			framework.ExpectNoError(errs[0], "the first List() call unexpectedly failed with %v", errs[0])
  1252  			// we would expect (burst) successes and then (tries-burst) errors on a clean test environment running with
  1253  			// enough CPU power. CI is usually harsher. So we relax constraints, expecting at least _a_ failure, while
  1254  			// we are likely to get many more. But we can't yet predict how many more to expect, so we prefer to relax
  1255  			// constraints than to risk flakes at this stage.
  1256  			errLimitExceededCount := 0
  1257  			for _, err := range errs[1:] {
  1258  				if errors.Is(err, apisgrpc.ErrorLimitExceeded) {
  1259  					errLimitExceededCount++
  1260  				}
  1261  			}
  1262  			gomega.Expect(errLimitExceededCount).ToNot(gomega.BeZero(), "never hit the rate limit trying %d calls in %v", tries, elapsed)
  1263  
  1264  			framework.Logf("got %d/%d rate limit errors, at least one needed, the more the better", errLimitExceededCount, tries)
  1265  
  1266  			// this is not needed for this test. We're done. But we need to play nice with *other* tests which may run just after,
  1267  			// and which need to query the API. If they run "too fast", they can still be throttled because the throttling period
  1268  			// is not exhausted yet, yielding false negatives, leading to flakes.
  1269  			// We can't reset the rate limit period, so we just wait "long enough" to make sure we absorb the burst
  1270  			// and other queries are not rejected for happening too soon.
  1271  			ginkgo.By("Cooling down to reset the podresources API rate limit")
  1272  			time.Sleep(5 * time.Second)
  1273  		})
  1274  	})
  1275  })
  1276  
  1277  func requireLackOfSRIOVDevices() {
  1278  	if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 {
  1279  		e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device")
  1280  	}
  1281  }
  1282  
  1283  func getOnlineCPUs() (cpuset.CPUSet, error) {
  1284  	onlineCPUList, err := os.ReadFile("/sys/devices/system/cpu/online")
  1285  	if err != nil {
  1286  		return cpuset.CPUSet{}, err
  1287  	}
  1288  	return cpuset.Parse(strings.TrimSpace(string(onlineCPUList)))
  1289  }
  1290  
  1291  func setupSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework) *v1.Pod {
  1292  	e2enode.WaitForNodeToBeReady(ctx, f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
  1293  
  1294  	dp := getSampleDevicePluginPod(kubeletdevicepluginv1beta1.DevicePluginPath)
  1295  	dp.Spec.NodeName = framework.TestContext.NodeName
  1296  
  1297  	ginkgo.By("Create the sample device plugin pod")
  1298  
  1299  	dpPod := e2epod.NewPodClient(f).CreateSync(ctx, dp)
  1300  
  1301  	err := e2epod.WaitForPodCondition(ctx, f.ClientSet, dpPod.Namespace, dpPod.Name, "Ready", 120*time.Second, testutils.PodRunningReady)
  1302  	if err != nil {
  1303  		framework.Logf("Sample Device Pod %v took too long to enter running/ready: %v", dp.Name, err)
  1304  	}
  1305  	framework.ExpectNoError(err, "WaitForPodCondition() failed err: %v", err)
  1306  
  1307  	return dpPod
  1308  }
  1309  
  1310  func teardownSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework, pod *v1.Pod) {
  1311  	gp := int64(0)
  1312  	deleteOptions := metav1.DeleteOptions{
  1313  		GracePeriodSeconds: &gp,
  1314  	}
  1315  	ginkgo.By(fmt.Sprintf("Delete sample device plugin pod %s/%s", pod.Namespace, pod.Name))
  1316  	err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, deleteOptions)
  1317  
  1318  	framework.ExpectNoError(err, "Failed to delete Pod %v in Namespace %v", pod.Name, pod.Namespace)
  1319  	waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)
  1320  }
  1321  
  1322  func waitForTopologyUnawareResources(ctx context.Context, f *framework.Framework) {
  1323  	ginkgo.By(fmt.Sprintf("Waiting for %q resources to become available on the local node", defaultTopologyUnawareResourceName))
  1324  
  1325  	gomega.Eventually(ctx, func(ctx context.Context) bool {
  1326  		node := getLocalNode(ctx, f)
  1327  		resourceAmount := CountSampleDeviceAllocatable(node)
  1328  		return resourceAmount > 0
  1329  	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
  1330  }
  1331  
  1332  func getPodResourcesMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
  1333  	// we are running out of good names, so we need to be unnecessarily specific to avoid clashes
  1334  	ginkgo.By("getting Pod Resources metrics from the metrics API")
  1335  	return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
  1336  }
  1337  
  1338  func timelessSampleAtLeast(lower interface{}) types.GomegaMatcher {
  1339  	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
  1340  		// We already check Metric when matching the Id
  1341  		"Metric":    gstruct.Ignore(),
  1342  		"Value":     gomega.BeNumerically(">=", lower),
  1343  		"Timestamp": gstruct.Ignore(),
  1344  		"Histogram": gstruct.Ignore(),
  1345  	}))
  1346  }
  1347  
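For readers who want to exercise the same endpoint outside the e2e suite, here is a minimal sketch of the plumbing these tests rely on: resolving the kubelet pod-resources socket with util.LocalEndpoint, opening a v1 client with podresources.GetV1Client, and issuing a List() call. This sketch is not part of the file above; the socket directory, timeout, and message-size values are illustrative assumptions standing in for the package-level defaults (defaultPodResourcesPath, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) that the file references but does not define.

package main

import (
	"context"
	"fmt"
	"time"

	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/util"
)

func main() {
	// Assumed socket directory; the tests take this from defaultPodResourcesPath.
	endpoint, err := util.LocalEndpoint("/var/lib/kubelet/pod-resources", podresources.Socket)
	if err != nil {
		panic(err)
	}

	// Assumed timeout and max gRPC message size; the tests use package-level defaults.
	cli, conn, err := podresources.GetV1Client(endpoint, 10*time.Second, 1024*1024)
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// List() returns the per-pod, per-container resource assignments known to the kubelet.
	resp, err := cli.List(context.TODO(), &kubeletpodresourcesv1.ListPodResourcesRequest{})
	if err != nil {
		panic(err)
	}
	for _, pr := range resp.GetPodResources() {
		fmt.Printf("%s/%s: %d container(s)\n", pr.GetNamespace(), pr.GetName(), len(pr.GetContainers()))
	}
}

Because List, Get and GetAllocatableResources are rate limited by the kubelet (see the throttling test above), callers polling this API should space out their requests rather than querying in a tight loop.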
