...

Source file src/k8s.io/kubernetes/test/e2e/e2e.go

Documentation: k8s.io/kubernetes/test/e2e

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2e
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"os"
    24  	"os/exec"
    25  	"path/filepath"
    26  	"strings"
    27  	"testing"
    28  	"time"
    29  
    30  	"k8s.io/klog/v2"
    31  
    32  	"github.com/onsi/ginkgo/v2"
    33  	"github.com/onsi/gomega"
    34  
    35  	appsv1 "k8s.io/api/apps/v1"
    36  	v1 "k8s.io/api/core/v1"
    37  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    38  	"k8s.io/apimachinery/pkg/util/wait"
    39  	"k8s.io/component-base/logs"
    40  	"k8s.io/component-base/version"
    41  	commontest "k8s.io/kubernetes/test/e2e/common"
    42  	"k8s.io/kubernetes/test/e2e/framework"
    43  	"k8s.io/kubernetes/test/e2e/framework/daemonset"
    44  	e2edebug "k8s.io/kubernetes/test/e2e/framework/debug"
    45  	e2ekubectl "k8s.io/kubernetes/test/e2e/framework/kubectl"
    46  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    47  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    48  	e2ereporters "k8s.io/kubernetes/test/e2e/reporters"
    49  	utilnet "k8s.io/utils/net"
    50  
    51  	clientset "k8s.io/client-go/kubernetes"
    52  	// ensure auth plugins are loaded
    53  	_ "k8s.io/client-go/plugin/pkg/client/auth"
    54  
    55  	// Ensure that logging flags are part of the command line.
    56  	_ "k8s.io/component-base/logs/testinit"
    57  )
    58  
    59  const (
    60  	// namespaceCleanupTimeout is how long to wait for the namespace to be deleted.
    61  	// If there are any orphaned namespaces to clean up, this test is running
    62  // on a long-lived cluster. A long wait here is preferable to spurious test
    63  	// failures caused by leaked resources from a previous test run.
    64  	namespaceCleanupTimeout = 15 * time.Minute
    65  )
    66  
    67  var progressReporter = &e2ereporters.ProgressReporter{}
    68  
    69  var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
    70  	// Reference common test to make the import valid.
    71  	commontest.CurrentSuite = commontest.E2E
    72  	progressReporter.SetStartMsg()
    73  	setupSuite(ctx)
    74  	return nil
    75  }, func(ctx context.Context, data []byte) {
    76  	// Run on all Ginkgo nodes
    77  	setupSuitePerGinkgoNode(ctx)
    78  })
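// A minimal sketch (assumption, for illustration only): the []byte returned by the
// first SynchronizedBeforeSuite function, which runs once on Ginkgo process 1, is
// delivered verbatim to the second function on every parallel process. This suite
// returns nil, but per-run data could be shared roughly like so:
//
//	var _ = ginkgo.SynchronizedBeforeSuite(func(ctx context.Context) []byte {
//		setupSuite(ctx)
//		// Hypothetical: serialize something computed once on process 1.
//		shared, err := json.Marshal(map[string]string{"startedAt": time.Now().Format(time.RFC3339)})
//		framework.ExpectNoError(err)
//		return shared
//	}, func(ctx context.Context, data []byte) {
//		// Every parallel process receives the same bytes.
//		perRun := map[string]string{}
//		framework.ExpectNoError(json.Unmarshal(data, &perRun))
//		setupSuitePerGinkgoNode(ctx)
//	})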
    79  
    80  var _ = ginkgo.SynchronizedAfterSuite(func() {
    81  	progressReporter.SetEndMsg()
    82  }, func(ctx context.Context) {
    83  	AfterSuiteActions(ctx)
    84  })
    85  
    86  // RunE2ETests checks configuration parameters (specified through flags) and then runs
    87  // E2E tests using the Ginkgo runner.
    88  // If a "report directory" is specified, one or more JUnit test reports will be
    89  // generated in this directory, and cluster logs will also be saved.
    90  // This function is called on each Ginkgo node in parallel mode.
    91  func RunE2ETests(t *testing.T) {
    92  	// InitLogs disables contextual logging, without a way to enable it again
    93  	// in the E2E test suite because it has no feature gates. It used to have a
    94  	// misleading --feature-gates parameter but that didn't do what users
    95  	// and developers expected (define which features the cluster supports)
    96  	// and therefore got removed.
    97  	//
    98  	// Because contextual logging is useful and should get tested, it gets
    99  	// re-enabled here unconditionally.
   100  	logs.InitLogs()
   101  	defer logs.FlushLogs()
   102  	klog.EnableContextualLogging(true)
   103  
   104  	progressReporter = e2ereporters.NewProgressReporter(framework.TestContext.ProgressReportURL)
   105  	gomega.RegisterFailHandler(framework.Fail)
   106  
   107  	// Run tests through the Ginkgo runner with output to console + JUnit for Jenkins
   108  	suiteConfig, reporterConfig := framework.CreateGinkgoConfig()
   109  	klog.Infof("Starting e2e run %q on Ginkgo node %d", framework.RunID, suiteConfig.ParallelProcess)
   110  	ginkgo.RunSpecs(t, "Kubernetes e2e suite", suiteConfig, reporterConfig)
   111  }
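// A minimal sketch (assumption, for illustration only) of how a `go test` entry
// point in a separate _test.go file typically invokes RunE2ETests:
//
//	package e2e
//
//	import "testing"
//
//	func TestE2E(t *testing.T) {
//		RunE2ETests(t)
//	}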
   112  
   113  // getDefaultClusterIPFamily obtains the default IP family of the cluster
   114  // using the ClusterIP address of the kubernetes service created in the default namespace.
   115  // This unequivocally identifies the default IP family because services are single-family.
   116  // TODO: dual-stack may support multiple families per service,
   117  // but we can detect if a cluster is dual-stack because pods have two addresses (one per family).
   118  func getDefaultClusterIPFamily(ctx context.Context, c clientset.Interface) string {
   119  	// Get the ClusterIP of the kubernetes service created in the default namespace
   120  	svc, err := c.CoreV1().Services(metav1.NamespaceDefault).Get(ctx, "kubernetes", metav1.GetOptions{})
   121  	if err != nil {
   122  		framework.Failf("Failed to get kubernetes service ClusterIP: %v", err)
   123  	}
   124  
   125  	if utilnet.IsIPv6String(svc.Spec.ClusterIP) {
   126  		return "ipv6"
   127  	}
   128  	return "ipv4"
   129  }
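// A minimal sketch (assumption, for illustration only): the family decision rests
// entirely on the textual form of the ClusterIP, e.g.:
//
//	utilnet.IsIPv6String("10.96.0.1")     // false -> "ipv4"
//	utilnet.IsIPv6String("fd00:10:96::1") // true  -> "ipv6"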
   130  
   131  // waitForDaemonSets waits for all daemonsets in the given namespace to be ready
   132  // (defined as: for each daemonset, all but at most 'allowedNotReadyNodes' of its
   133  // pods are ready).
   134  //
   135  // If allowedNotReadyNodes is -1, this method returns immediately without waiting.
   136  func waitForDaemonSets(ctx context.Context, c clientset.Interface, ns string, allowedNotReadyNodes int32, timeout time.Duration) error {
   137  	if allowedNotReadyNodes == -1 {
   138  		return nil
   139  	}
   140  
   141  	start := time.Now()
   142  	framework.Logf("Waiting up to %v for all daemonsets in namespace '%s' to start",
   143  		timeout, ns)
   144  
   145  	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, true, func(ctx context.Context) (bool, error) {
   146  		dsList, err := c.AppsV1().DaemonSets(ns).List(ctx, metav1.ListOptions{})
   147  		if err != nil {
   148  			framework.Logf("Error getting daemonsets in namespace: '%s': %v", ns, err)
   149  			return false, err
   150  		}
   151  		var notReadyDaemonSets []string
   152  		for _, ds := range dsList.Items {
   153  			framework.Logf("%d / %d pods ready in namespace '%s' in daemonset '%s' (%d seconds elapsed)", ds.Status.NumberReady, ds.Status.DesiredNumberScheduled, ns, ds.ObjectMeta.Name, int(time.Since(start).Seconds()))
   154  			if ds.Status.DesiredNumberScheduled-ds.Status.NumberReady > allowedNotReadyNodes {
   155  				notReadyDaemonSets = append(notReadyDaemonSets, ds.ObjectMeta.Name)
   156  			}
   157  		}
   158  
   159  		if len(notReadyDaemonSets) > 0 {
   160  			framework.Logf("there are not-ready daemonsets: %v", notReadyDaemonSets)
   161  			return false, nil
   162  		}
   163  
   164  		return true, nil
   165  	})
   166  }
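// A minimal usage sketch (assumption, for illustration only): a daemonset is treated
// as ready once DesiredNumberScheduled-NumberReady <= allowedNotReadyNodes, so a
// caller tolerating one not-ready node might write:
//
//	if err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem, 1, 5*time.Minute); err != nil {
//		framework.Logf("WARNING: daemonsets not ready: %v", err)
//	}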
   167  
   168  // setupSuite is the boilerplate that can be used to set up ginkgo test suites on the SynchronizedBeforeSuite step.
   169  // There are certain operations we only want to run once per overall test invocation
   170  // (such as deleting old namespaces or verifying that all system pods are running).
   171  // Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite
   172  // to ensure that these operations only run on the first parallel Ginkgo node.
   173  //
   174  // SynchronizedBeforeSuite itself takes two functions: one which runs only on the first
   175  // Ginkgo node and returns an opaque byte array, and a second which runs on all Ginkgo
   176  // nodes and receives that byte array; setupSuite is the body of the first function.
   177  func setupSuite(ctx context.Context) {
   178  	// Run only on Ginkgo node 1
   179  
   180  	switch framework.TestContext.Provider {
   181  	case "gce", "gke":
   182  		logClusterImageSources()
   183  	}
   184  
   185  	c, err := framework.LoadClientset()
   186  	framework.ExpectNoError(err, "Error loading client")
   187  
   188  	// Delete any namespaces except those created by the system. This ensures no
   189  	// lingering resources are left over from a previous test run.
   190  	if framework.TestContext.CleanStart {
   191  		deleted, err := framework.DeleteNamespaces(ctx, c, nil, /* deleteFilter */
   192  			[]string{
   193  				metav1.NamespaceSystem,
   194  				metav1.NamespaceDefault,
   195  				metav1.NamespacePublic,
   196  				v1.NamespaceNodeLease,
   197  			})
   198  		if err != nil {
   199  			framework.Failf("Error deleting orphaned namespaces: %v", err)
   200  		}
   201  		if err := framework.WaitForNamespacesDeleted(ctx, c, deleted, namespaceCleanupTimeout); err != nil {
   202  			framework.Failf("Failed to delete orphaned namespaces %v: %v", deleted, err)
   203  		}
   204  	}
   205  
   206  	timeouts := framework.NewTimeoutContext()
   207  
   208  	// In large clusters we may get to this point but still have a bunch
   209  	// of nodes without Routes created. Since this would make a node
   210  	// unschedulable, we need to wait until all of them are schedulable.
   211  	framework.ExpectNoError(e2enode.WaitForAllNodesSchedulable(ctx, c, timeouts.NodeSchedulable))
   212  
   213  	// If NumNodes is not specified, auto-detect how many nodes are schedulable and not tainted
   214  	if framework.TestContext.CloudConfig.NumNodes == framework.DefaultNumNodes {
   215  		nodes, err := e2enode.GetReadySchedulableNodes(ctx, c)
   216  		framework.ExpectNoError(err)
   217  		framework.TestContext.CloudConfig.NumNodes = len(nodes.Items)
   218  	}
   219  
   220  	// Ensure all pods are running and ready before starting tests (otherwise,
   221  	// cluster infrastructure pods that are being pulled or started can block
   222  	// test pods from running, and tests that ensure all pods are running and
   223  	// ready will fail).
   224  	//
   225  	// TODO: In large clusters, we often observe non-starting pods due to
   226  	// #41007. To keep those pods from blocking the whole test run (and just
   227  	// wasting it), we allow for some not-ready pods (with the number equal
   228  	// to the number of allowed not-ready nodes).
   229  	if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil {
   230  		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem)
   231  		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf)
   232  		framework.Failf("Error waiting for all pods to be running and ready: %v", err)
   233  	}
   234  
   235  	if err := waitForDaemonSets(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemDaemonsetStartup); err != nil {
   236  		framework.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
   237  	}
   238  
   239  	if framework.TestContext.PrepullImages {
   240  		framework.Logf("Pre-pulling images so that they are cached for the tests.")
   241  		prepullImages(ctx, c)
   242  	}
   243  
   244  	// Log the version of the server and this client.
   245  	framework.Logf("e2e test version: %s", version.Get().GitVersion)
   246  
   247  	dc := c.DiscoveryClient
   248  
   249  	serverVersion, serverErr := dc.ServerVersion()
   250  	if serverErr != nil {
   251  		framework.Logf("Unexpected server error retrieving version: %v", serverErr)
   252  	}
   253  	if serverVersion != nil {
   254  		framework.Logf("kube-apiserver version: %s", serverVersion.GitVersion)
   255  	}
   256  
   257  	if framework.TestContext.NodeKiller.Enabled {
   258  		nodeKiller := e2enode.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
   259  		go nodeKiller.Run(framework.TestContext.NodeKiller.NodeKillerStopCtx)
   260  	}
   261  }
   262  
   263  // logClusterImageSources writes out cluster image sources.
   264  func logClusterImageSources() {
   265  	controlPlaneNodeImg, workerNodeImg, err := lookupClusterImageSources()
   266  	if err != nil {
   267  		framework.Logf("Cluster image sources lookup failed: %v\n", err)
   268  		return
   269  	}
   270  	framework.Logf("cluster-control-plane-node-image: %s", controlPlaneNodeImg)
   271  	framework.Logf("cluster-worker-node-image: %s", workerNodeImg)
   272  
   273  	images := map[string]string{
   274  		"control_plane_node_os_image": controlPlaneNodeImg,
   275  		"worker_node_os_image":        workerNodeImg,
   276  	}
   277  
   278  	outputBytes, _ := json.MarshalIndent(images, "", "  ")
   279  	filePath := filepath.Join(framework.TestContext.ReportDir, "images.json")
   280  	if err := os.WriteFile(filePath, outputBytes, 0644); err != nil {
   281  		framework.Logf("cluster image sources: could not write to %q: %v", filePath, err)
   282  	}
   283  }
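// A sketch of the resulting report file (assumption, for illustration only): with
// json.MarshalIndent as above, <ReportDir>/images.json ends up shaped like
//
//	{
//	  "control_plane_node_os_image": "<control-plane image name>",
//	  "worker_node_os_image": "<worker image name>"
//	}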
   284  
   285  // TODO: These should really just use the GCE API client library or at least use
   286  // better formatted output from the --format flag.
   287  
   288  // lookupClusterImageSources returns the control plane node and worker node image strings, or an error.
   289  func lookupClusterImageSources() (string, string, error) {
   290  	// Given args for a gcloud compute command, run it with the common project and zone/region
   291  	// flags appended, and return the output values split on newlines, commas, or semicolons.
   292  	gcloudf := func(argv ...string) ([]string, error) {
   293  		args := []string{"compute"}
   294  		args = append(args, argv...)
   295  		args = append(args, "--project", framework.TestContext.CloudConfig.ProjectID)
   296  		if framework.TestContext.CloudConfig.MultiMaster {
   297  			args = append(args, "--region", framework.TestContext.CloudConfig.Region)
   298  		} else {
   299  			args = append(args, "--zone", framework.TestContext.CloudConfig.Zone)
   300  		}
   301  		outputBytes, err := exec.Command("gcloud", args...).CombinedOutput()
   302  		str := strings.ReplaceAll(string(outputBytes), ",", "\n")
   303  		str = strings.ReplaceAll(str, ";", "\n")
   304  		lines := strings.Split(str, "\n")
   305  		if err != nil {
   306  			framework.Logf("lookupDiskImageSources: gcloud error with [%#v]; err:%v", argv, err)
   307  			for _, l := range lines {
   308  				framework.Logf(" > %s", l)
   309  			}
   310  		}
   311  		return lines, err
   312  	}
   313  
   314  	// Given a GCE instance, look through its disks, finding one that has a sourceImage
   315  	host2image := func(instance string) (string, error) {
   316  		// gcloud compute instances describe {INSTANCE} --format="get(disks[].source)"
   317  		// gcloud compute disks describe {DISKURL} --format="get(sourceImage)"
   318  		disks, err := gcloudf("instances", "describe", instance, "--format=get(disks[].source)")
   319  		if err != nil {
   320  			return "", err
   321  		} else if len(disks) == 0 {
   322  			return "", fmt.Errorf("instance %q had no findable disks", instance)
   323  		}
   324  		// Loop over disks, looking for the boot disk
   325  		for _, disk := range disks {
   326  			lines, err := gcloudf("disks", "describe", disk, "--format=get(sourceImage)")
   327  			if err != nil {
   328  				return "", err
   329  			} else if len(lines) > 0 && lines[0] != "" {
   330  				return lines[0], nil // break, we're done
   331  			}
   332  		}
   333  		return "", fmt.Errorf("instance %q had no disk with a sourceImage", instance)
   334  	}
   335  
   336  	// gcloud compute instance-groups list-instances {GROUPNAME} --format="get(instance)"
   337  	workerNodeName := ""
   338  	instGroupName := strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",")[0]
   339  	if lines, err := gcloudf("instance-groups", "list-instances", instGroupName, "--format=get(instance)"); err != nil {
   340  		return "", "", err
   341  	} else if len(lines) == 0 {
   342  		return "", "", fmt.Errorf("no instances inside instance-group %q", instGroupName)
   343  	} else {
   344  		workerNodeName = lines[0]
   345  	}
   346  
   347  	workerNodeImg, err := host2image(workerNodeName)
   348  	if err != nil {
   349  		return "", "", err
   350  	}
   351  	frags := strings.Split(workerNodeImg, "/")
   352  	workerNodeImg = frags[len(frags)-1]
   353  
   354  	// For GKE clusters, controlPlaneNodeName will not be defined; we just leave controlPlaneNodeImg blank.
   355  	controlPlaneNodeImg := ""
   356  	if controlPlaneNodeName := framework.TestContext.CloudConfig.MasterName; controlPlaneNodeName != "" {
   357  		img, err := host2image(controlPlaneNodeName)
   358  		if err != nil {
   359  			return "", "", err
   360  		}
   361  		frags = strings.Split(img, "/")
   362  		controlPlaneNodeImg = frags[len(frags)-1]
   363  	}
   364  
   365  	return controlPlaneNodeImg, workerNodeImg, nil
   366  }
   367  
   368  // setupSuitePerGinkgoNode is the boilerplate that can be used to set up ginkgo test suites on the SynchronizedBeforeSuite step.
   369  // There are certain operations we only want to run once per overall test invocation on each Ginkgo node,
   370  // such as making some global variables accessible to all parallel executions.
   371  // Because of the way Ginkgo runs tests in parallel, we must use SynchronizedBeforeSuite.
   372  // Ref: https://onsi.github.io/ginkgo/#parallel-specs
   373  func setupSuitePerGinkgoNode(ctx context.Context) {
   374  	// Obtain the default IP family of the cluster
   375  	// Some e2e tests are designed to work on IPv4 only; this global variable
   376  	// allows adapting those tests to work on both IPv4 and IPv6.
   377  	// TODO: dual-stack
   378  	// Dual-stack clusters can be ipv4,ipv6 or ipv6,ipv4; the order matters,
   379  	// and services use the primary IP family by default.
   380  	c, err := framework.LoadClientset()
   381  	framework.ExpectNoError(err, "Error loading client")
   382  	framework.TestContext.IPFamily = getDefaultClusterIPFamily(ctx, c)
   383  	framework.Logf("Cluster IP family: %s", framework.TestContext.IPFamily)
   384  }
   385  
   386  func prepullImages(ctx context.Context, c clientset.Interface) {
   387  	namespace, err := framework.CreateTestingNS(ctx, "img-puller", c, map[string]string{
   388  		"e2e-framework": "img-puller",
   389  	})
   390  	framework.ExpectNoError(err)
   391  	ns := namespace.Name
   392  	ginkgo.DeferCleanup(c.CoreV1().Namespaces().Delete, ns, metav1.DeleteOptions{})
   393  
   394  	images := commontest.PrePulledImages
   395  	if framework.NodeOSDistroIs("windows") {
   396  		images = commontest.WindowsPrePulledImages
   397  	}
   398  
   399  	label := map[string]string{"app": "prepull-daemonset"}
   400  	var imgPullers []*appsv1.DaemonSet
   401  	for _, img := range images.List() {
   402  		dsName := fmt.Sprintf("img-pull-%s", strings.ReplaceAll(strings.ReplaceAll(img, "/", "-"), ":", "-"))
   403  
   404  		dsSpec := daemonset.NewDaemonSet(dsName, img, label, nil, nil, nil)
   405  		ds, err := c.AppsV1().DaemonSets(ns).Create(ctx, dsSpec, metav1.CreateOptions{})
   406  		framework.ExpectNoError(err)
   407  		imgPullers = append(imgPullers, ds)
   408  	}
   409  
   410  	// this should not be a multiple of 5, because node status updates
   411  	// every 5 seconds. See https://github.com/kubernetes/kubernetes/pull/14915.
   412  	dsRetryPeriod := 9 * time.Second
   413  	dsRetryTimeout := 5 * time.Minute
   414  
   415  	for _, imgPuller := range imgPullers {
   416  		checkDaemonset := func(ctx context.Context) (bool, error) {
   417  			return daemonset.CheckPresentOnNodes(ctx, c, imgPuller, ns, framework.TestContext.CloudConfig.NumNodes)
   418  		}
   419  		framework.Logf("Waiting for %s", imgPuller.Name)
   420  		err := wait.PollUntilContextTimeout(ctx, dsRetryPeriod, dsRetryTimeout, true, checkDaemonset)
   421  		framework.ExpectNoError(err, "error waiting for image to be pulled")
   422  	}
   423  }
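// A minimal sketch (assumption, for illustration only) of the DaemonSet naming above:
// "/" and ":" in the image reference are replaced with "-", so a hypothetical image
// would map as follows:
//
//	img := "registry.k8s.io/pause:3.9"
//	dsName := fmt.Sprintf("img-pull-%s", strings.ReplaceAll(strings.ReplaceAll(img, "/", "-"), ":", "-"))
//	// dsName == "img-pull-registry.k8s.io-pause-3.9"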
   424  
