/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/kubernetes/pkg/kubelet/cm"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	admissionapi "k8s.io/pod-security-admission/api"
)

const (
	hugepagesSize2M = 2048
	hugepagesSize1G = 1048576

	hugepagesDirPrefix    = "/sys/kernel/mm/hugepages/hugepages"
	hugepagesCapacityFile = "nr_hugepages"

	hugepagesResourceName2Mi = "hugepages-2Mi"
	hugepagesResourceName1Gi = "hugepages-1Gi"
	hugepagesCgroup2MB       = "hugetlb.2MB"
	hugepagesCgroup1GB       = "hugetlb.1GB"

	mediumHugepages    = "HugePages"
	mediumHugepages2Mi = "HugePages-2Mi"
	mediumHugepages1Gi = "HugePages-1Gi"
)

var (
	resourceToSize = map[string]int{
		hugepagesResourceName2Mi: hugepagesSize2M,
		hugepagesResourceName1Gi: hugepagesSize1G,
	}
	resourceToCgroup = map[string]string{
		hugepagesResourceName2Mi: hugepagesCgroup2MB,
		hugepagesResourceName1Gi: hugepagesCgroup1GB,
	}
)

// makePodToVerifyHugePages returns a pod that verifies the hugetlb limit of the specified cgroup.
func makePodToVerifyHugePages(baseName string, hugePagesLimit resource.Quantity, hugepagesCgroup string) *v1.Pod {
	// convert the cgroup name to its literal form
	cgroupName := cm.NewCgroupName(cm.RootCgroupName, defaultNodeAllocatableCgroup, baseName)
	cgroupFsName := ""
	if kubeletCfg.CgroupDriver == "systemd" {
		cgroupFsName = cgroupName.ToSystemd()
	} else {
		cgroupFsName = cgroupName.ToCgroupfs()
	}

	hugetlbLimitFile := ""
	// this command takes the expected value and compares it against the actual value in the pod's hugetlb cgroup file (e.g. hugetlb.2MB)
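	// cgroup v2 exposes the hugetlb limit as hugetlb.<size>.max, while cgroup v1 uses hugetlb.<size>.limit_in_bytes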
	if IsCgroup2UnifiedMode() {
		hugetlbLimitFile = fmt.Sprintf("/tmp/%s/%s.max", cgroupFsName, hugepagesCgroup)
	} else {
		hugetlbLimitFile = fmt.Sprintf("/tmp/hugetlb/%s/%s.limit_in_bytes", cgroupFsName, hugepagesCgroup)
	}

	command := fmt.Sprintf("expected=%v; actual=$(cat %v); if [ \"$expected\" -ne \"$actual\" ]; then exit 1; fi; ", hugePagesLimit.Value(), hugetlbLimitFile)
	framework.Logf("Pod to run command: %v", command)
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: "pod" + string(uuid.NewUUID()),
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Image:   busyboxImage,
					Name:    "container" + string(uuid.NewUUID()),
					Command: []string{"sh", "-c", command},
					VolumeMounts: []v1.VolumeMount{
						{
							Name:      "sysfscgroup",
							MountPath: "/tmp",
						},
					},
				},
			},
			Volumes: []v1.Volume{
				{
					Name: "sysfscgroup",
					VolumeSource: v1.VolumeSource{
						HostPath: &v1.HostPathVolumeSource{Path: "/sys/fs/cgroup"},
					},
				},
			},
		},
	}
	return pod
}

// configureHugePages attempts to allocate hugepages of the specified size
func configureHugePages(hugepagesSize int, hugepagesCount int, numaNodeID *int) error {
	// Compact memory to make bigger contiguous blocks of memory available
	// before allocating huge pages.
	// https://www.kernel.org/doc/Documentation/sysctl/vm.txt
	if _, err := os.Stat("/proc/sys/vm/compact_memory"); err == nil {
		if err := exec.Command("/bin/sh", "-c", "echo 1 > /proc/sys/vm/compact_memory").Run(); err != nil {
			return err
		}
	}

	// e.g. hugepages/hugepages-2048kB/nr_hugepages
	hugepagesSuffix := fmt.Sprintf("hugepages/hugepages-%dkB/%s", hugepagesSize, hugepagesCapacityFile)

	// e.g. /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
	hugepagesFile := fmt.Sprintf("/sys/kernel/mm/%s", hugepagesSuffix)
	if numaNodeID != nil {
		// e.g. /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
		hugepagesFile = fmt.Sprintf("/sys/devices/system/node/node%d/%s", *numaNodeID, hugepagesSuffix)
	}

	// Reserve number of hugepages
	// e.g. /bin/sh -c "echo 5 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"
	command := fmt.Sprintf("echo %d > %s", hugepagesCount, hugepagesFile)
	if err := exec.Command("/bin/sh", "-c", command).Run(); err != nil {
		return err
	}
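
	// the kernel reserves hugepages on a best-effort basis and may allocate fewer
	// pages than requested when contiguous memory is fragmented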
	// verify that the number of hugepages was updated
	// e.g. /bin/sh -c "cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"
	command = fmt.Sprintf("cat %s", hugepagesFile)
	outData, err := exec.Command("/bin/sh", "-c", command).Output()
	if err != nil {
		return err
	}

	numHugePages, err := strconv.Atoi(strings.TrimSpace(string(outData)))
	if err != nil {
		return err
	}

	framework.Logf("Hugepages total is set to %v", numHugePages)
	if numHugePages == hugepagesCount {
		return nil
	}

	return fmt.Errorf("expected hugepages %v, but found %v", hugepagesCount, numHugePages)
}

// isHugePageAvailable returns true if hugepages of the specified size are available on the host
func isHugePageAvailable(hugepagesSize int) bool {
	path := fmt.Sprintf("%s-%dkB/%s", hugepagesDirPrefix, hugepagesSize, hugepagesCapacityFile)
	if _, err := os.Stat(path); err != nil {
		return false
	}
	return true
}

func getHugepagesTestPod(f *framework.Framework, limits v1.ResourceList, mounts []v1.VolumeMount, volumes []v1.Volume) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "hugepages-",
			Namespace:    f.Namespace.Name,
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:  "container" + string(uuid.NewUUID()),
					Image: busyboxImage,
					Resources: v1.ResourceRequirements{
						Limits: limits,
					},
					Command:      []string{"sleep", "3600"},
					VolumeMounts: mounts,
				},
			},
			Volumes: volumes,
		},
	}
}

// Serial because the test updates kubelet configuration.
var _ = SIGDescribe("HugePages", framework.WithSerial(), feature.HugePages, "[NodeSpecialFeature:HugePages]", func() {
	f := framework.NewDefaultFramework("hugepages-test")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.It("should remove resources for huge page sizes no longer supported", func(ctx context.Context) {
		ginkgo.By("mimicking support for 9Mi of 3Mi huge page memory by patching the node status")
		patch := []byte(`[{"op": "add", "path": "/status/capacity/hugepages-3Mi", "value": "9Mi"}, {"op": "add", "path": "/status/allocatable/hugepages-3Mi", "value": "9Mi"}]`)
		result := f.ClientSet.CoreV1().RESTClient().Patch(types.JSONPatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(ctx)
		framework.ExpectNoError(result.Error(), "while patching")

		node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
		framework.ExpectNoError(err, "while getting node status")

		ginkgo.By("Verifying that the node now supports huge pages with size 3Mi")
		value, ok := node.Status.Capacity["hugepages-3Mi"]
		if !ok {
			framework.Failf("capacity should contain resource hugepages-3Mi: %v", node.Status.Capacity)
		}
		gomega.Expect(value.String()).To(gomega.Equal("9Mi"), "huge pages with size 3Mi should be supported")

		ginkgo.By("restarting the kubelet and verifying that huge pages with size 3Mi are not supported")
		restartKubelet(true)

		ginkgo.By("verifying that the hugepages-3Mi resource is no longer present")
		gomega.Eventually(ctx, func() bool {
			node, err = f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
			framework.ExpectNoError(err, "while getting node status")
			_, isPresent := node.Status.Capacity["hugepages-3Mi"]
			return isPresent
		}, 30*time.Second, framework.Poll).Should(gomega.BeFalse())
	})

	ginkgo.It("should add resources for new huge page sizes on kubelet restart", func(ctx context.Context) {
		ginkgo.By("Stopping kubelet")
		startKubelet := stopKubelet()

		ginkgo.By(`Patching away support for hugepage resource "hugepages-2Mi"`)
		patch := []byte(`[{"op": "remove", "path": "/status/capacity/hugepages-2Mi"}, {"op": "remove", "path": "/status/allocatable/hugepages-2Mi"}]`)
		result := f.ClientSet.CoreV1().RESTClient().Patch(types.JSONPatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(ctx)
		framework.ExpectNoError(result.Error(), "while patching")

		ginkgo.By("Starting kubelet again")
		startKubelet()

		ginkgo.By("verifying that the hugepages-2Mi resource is present")
		gomega.Eventually(ctx, func() bool {
			node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
			framework.ExpectNoError(err, "while getting node status")
			_, isPresent := node.Status.Capacity["hugepages-2Mi"]
			return isPresent
		}, 30*time.Second, framework.Poll).Should(gomega.BeTrue())
	})

	ginkgo.When("start the pod", func() {
		var (
			testpod   *v1.Pod
			limits    v1.ResourceList
			mounts    []v1.VolumeMount
			volumes   []v1.Volume
			hugepages map[string]int
		)

		setHugepages := func(ctx context.Context) {
			for hugepagesResource, count := range hugepages {
				size := resourceToSize[hugepagesResource]
				ginkgo.By(fmt.Sprintf("Verifying hugepages %d are supported", size))
				if !isHugePageAvailable(size) {
					e2eskipper.Skipf("skipping test because hugepages of size %d not supported", size)
					return
				}

				ginkgo.By(fmt.Sprintf("Configuring the host to reserve %d of pre-allocated hugepages of size %d", count, size))
				gomega.Eventually(ctx, func() error {
					if err := configureHugePages(size, count, nil); err != nil {
						return err
					}
					return nil
				}, 30*time.Second, framework.Poll).Should(gomega.BeNil())
			}
		}

		waitForHugepages := func(ctx context.Context) {
			ginkgo.By("Waiting for hugepages resource to become available on the local node")
			gomega.Eventually(ctx, func(ctx context.Context) error {
				node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
				if err != nil {
					return err
				}

				for hugepagesResource, count := range hugepages {
					capacity, ok := node.Status.Capacity[v1.ResourceName(hugepagesResource)]
					if !ok {
						return fmt.Errorf("the node does not have the resource %s", hugepagesResource)
					}

					size, succeed := capacity.AsInt64()
					if !succeed {
						return fmt.Errorf("failed to convert quantity to int64")
					}

					expectedSize := count * resourceToSize[hugepagesResource] * 1024
					if size != int64(expectedSize) {
						return fmt.Errorf("the actual size %d is different from the expected one %d", size, expectedSize)
					}
				}
				return nil
			}, time.Minute, framework.Poll).Should(gomega.BeNil())
		}

		releaseHugepages := func(ctx context.Context) {
			ginkgo.By("Releasing hugepages")
			gomega.Eventually(ctx, func() error {
				for hugepagesResource := range hugepages {
					command := fmt.Sprintf("echo 0 > %s-%dkB/%s", hugepagesDirPrefix, resourceToSize[hugepagesResource], hugepagesCapacityFile)
					if err := exec.Command("/bin/sh", "-c", command).Run(); err != nil {
						return err
					}
				}
				return nil
			}, 30*time.Second, framework.Poll).Should(gomega.BeNil())
		}

		runHugePagesTests := func() {
			ginkgo.It("should set correct hugetlb mount and limit under the container cgroup", func(ctx context.Context) {
				ginkgo.By("getting mounts for the test pod")
				command := []string{"mount"}
				out := e2epod.ExecCommandInContainer(f, testpod.Name, testpod.Spec.Containers[0].Name, command...)
				for _, mount := range mounts {
					ginkgo.By(fmt.Sprintf("checking that the hugetlb mount %s exists under the container", mount.MountPath))
					gomega.Expect(out).To(gomega.ContainSubstring(mount.MountPath))
				}

				for resourceName := range hugepages {
					verifyPod := makePodToVerifyHugePages(
						"pod"+string(testpod.UID),
						testpod.Spec.Containers[0].Resources.Limits[v1.ResourceName(resourceName)],
						resourceToCgroup[resourceName],
					)
					ginkgo.By("checking if the expected hugetlb settings were applied")
					e2epod.NewPodClient(f).Create(ctx, verifyPod)
					err := e2epod.WaitForPodSuccessInNamespace(ctx, f.ClientSet, verifyPod.Name, f.Namespace.Name)
					framework.ExpectNoError(err)
				}
			})
		}

		// setup
		ginkgo.JustBeforeEach(func(ctx context.Context) {
			setHugepages(ctx)

			ginkgo.By("restarting kubelet to pick up pre-allocated hugepages")
			restartKubelet(true)

			waitForHugepages(ctx)

			pod := getHugepagesTestPod(f, limits, mounts, volumes)

			ginkgo.By("running a test pod that requests hugepages")
			testpod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
		})

		// use JustAfterEach because the framework tears down the client in its AfterEach
		ginkgo.JustAfterEach(func(ctx context.Context) {
			ginkgo.By(fmt.Sprintf("deleting test pod %s", testpod.Name))
			e2epod.NewPodClient(f).DeleteSync(ctx, testpod.Name, metav1.DeleteOptions{}, 2*time.Minute)

			releaseHugepages(ctx)

			ginkgo.By("restarting kubelet to pick up pre-allocated hugepages")
			restartKubelet(true)

			waitForHugepages(ctx)
		})

		ginkgo.Context("with the resources requests that contain only one hugepages resource ", func() {
			ginkgo.Context("with the backward compatible API", func() {
				ginkgo.BeforeEach(func() {
					limits = v1.ResourceList{
						v1.ResourceCPU:           resource.MustParse("10m"),
						v1.ResourceMemory:        resource.MustParse("100Mi"),
						hugepagesResourceName2Mi: resource.MustParse("6Mi"),
					}
					mounts = []v1.VolumeMount{
						{
							Name:      "hugepages",
							MountPath: "/hugepages",
						},
					}
					volumes = []v1.Volume{
						{
							Name: "hugepages",
							VolumeSource: v1.VolumeSource{
								EmptyDir: &v1.EmptyDirVolumeSource{
									Medium: mediumHugepages,
								},
							},
						},
					}
					hugepages = map[string]int{hugepagesResourceName2Mi: 5}
				})
				// run tests
				runHugePagesTests()
			})

			ginkgo.Context("with the new API", func() {
				ginkgo.BeforeEach(func() {
					limits = v1.ResourceList{
						v1.ResourceCPU:           resource.MustParse("10m"),
						v1.ResourceMemory:        resource.MustParse("100Mi"),
						hugepagesResourceName2Mi: resource.MustParse("6Mi"),
					}
					mounts = []v1.VolumeMount{
						{
							Name:      "hugepages-2mi",
							MountPath: "/hugepages-2Mi",
						},
					}
					volumes = []v1.Volume{
						{
							Name: "hugepages-2mi",
							VolumeSource: v1.VolumeSource{
								EmptyDir: &v1.EmptyDirVolumeSource{
									Medium: mediumHugepages2Mi,
								},
							},
						},
					}
					hugepages = map[string]int{hugepagesResourceName2Mi: 5}
				})

				runHugePagesTests()
			})

			ginkgo.JustAfterEach(func() {
				hugepages = map[string]int{hugepagesResourceName2Mi: 0}
			})
		})

		ginkgo.Context("with the resources requests that contain multiple hugepages resources ", func() {
			ginkgo.BeforeEach(func() {
				hugepages = map[string]int{
					hugepagesResourceName2Mi: 5,
					hugepagesResourceName1Gi: 1,
				}
				limits = v1.ResourceList{
					v1.ResourceCPU:           resource.MustParse("10m"),
					v1.ResourceMemory:        resource.MustParse("100Mi"),
					hugepagesResourceName2Mi: resource.MustParse("6Mi"),
					hugepagesResourceName1Gi: resource.MustParse("1Gi"),
				}
				mounts = []v1.VolumeMount{
					{
						Name:      "hugepages-2mi",
						MountPath: "/hugepages-2Mi",
					},
					{
						Name:      "hugepages-1gi",
						MountPath: "/hugepages-1Gi",
					},
				}
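				// one emptyDir volume per hugepage size; the medium of each volume
				// selects the hugepage size that backs the mount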
				volumes = []v1.Volume{
					{
						Name: "hugepages-2mi",
						VolumeSource: v1.VolumeSource{
							EmptyDir: &v1.EmptyDirVolumeSource{
								Medium: mediumHugepages2Mi,
							},
						},
					},
					{
						Name: "hugepages-1gi",
						VolumeSource: v1.VolumeSource{
							EmptyDir: &v1.EmptyDirVolumeSource{
								Medium: mediumHugepages1Gi,
							},
						},
					},
				}
			})

			runHugePagesTests()

			ginkgo.JustAfterEach(func() {
				hugepages = map[string]int{
					hugepagesResourceName2Mi: 0,
					hugepagesResourceName1Gi: 0,
				}
			})
		})
	})
})