
Source file src/k8s.io/kubernetes/test/e2e/cloud/gcp/reboot.go

Documentation: k8s.io/kubernetes/test/e2e/cloud/gcp

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package gcp
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"strings"
    23  	"sync"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/apimachinery/pkg/fields"
    29  	"k8s.io/apimachinery/pkg/labels"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	clientset "k8s.io/client-go/kubernetes"
    32  	"k8s.io/kubernetes/test/e2e/feature"
    33  	"k8s.io/kubernetes/test/e2e/framework"
    34  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    35  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    36  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    37  	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
    38  	testutils "k8s.io/kubernetes/test/utils"
    39  	admissionapi "k8s.io/pod-security-admission/api"
    40  
    41  	"github.com/onsi/ginkgo/v2"
    42  )
    43  
    44  const (
    45  	// How long a node is allowed to go from "Ready" to "NotReady" after a
    46  	// reboot is issued before the test is considered failed.
    47  	rebootNodeNotReadyTimeout = 2 * time.Minute
    48  
    49  	// How long a node is allowed to go from "NotReady" to "Ready" after a
    50  	// reboot is issued and it is found to be "NotReady" before the test is
    51  	// considered failed.
    52  	rebootNodeReadyAgainTimeout = 5 * time.Minute
    53  
    54  	// How long pods have to be "ready" after the reboot.
    55  	rebootPodReadyAgainTimeout = 5 * time.Minute
    56  )
    57  
    58  var _ = SIGDescribe("Reboot", framework.WithDisruptive(), feature.Reboot, func() {
    59  	var f *framework.Framework
    60  
    61  	ginkgo.BeforeEach(func() {
    62  		// These tests require SSH to nodes, so the provider check should be identical to the one there
    63  		// (the limiting factor is the implementation of util.go's e2essh.GetSigner(...)).
    64  
    65  		// Cluster must support node reboot
    66  		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
    67  	})
    68  
    69  	ginkgo.AfterEach(func(ctx context.Context) {
    70  		if ginkgo.CurrentSpecReport().Failed() {
    71  			// Most of the reboot tests just make sure that addon/system pods are running, so dump
    72  			// events for the kube-system namespace on failures
    73  			namespaceName := metav1.NamespaceSystem
    74  			ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
    75  			events, err := f.ClientSet.CoreV1().Events(namespaceName).List(ctx, metav1.ListOptions{})
    76  			framework.ExpectNoError(err)
    77  
    78  			for _, e := range events.Items {
    79  				framework.Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
    80  			}
    81  		}
    82  		// In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a
    83  		// rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated.  Most tests
    84  		// make use of some proxy feature to verify functionality. So, if a reboot test runs right before a test
    85  		// that tries to get logs, for example, we may get unlucky and try to use a closed tunnel to a node that
    86  		// was recently rebooted. There's no good way to framework.Poll for proxies being closed, so we sleep.
    87  		//
    88  		// TODO(cjcullen) reduce this sleep (#19314)
    89  		if framework.ProviderIs("gke") {
    90  			ginkgo.By("waiting 5 minutes for all dead tunnels to be dropped")
    91  			time.Sleep(5 * time.Minute)
    92  		}
    93  	})
    94  
    95  	f = framework.NewDefaultFramework("reboot")
    96  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
    97  
    98  	ginkgo.It("each node by ordering clean reboot and ensure they function upon restart", func(ctx context.Context) {
    99  		// clean shutdown and restart
   100  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before the node is rebooted.
   101  		testReboot(ctx, f.ClientSet, "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &", nil)
   102  	})
   103  
   104  	ginkgo.It("each node by ordering unclean reboot and ensure they function upon restart", func(ctx context.Context) {
   105  		// unclean shutdown and restart
   106  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before the node is shut down.
   107  		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
   108  	})
   109  
   110  	ginkgo.It("each node by triggering kernel panic and ensure they function upon restart", func(ctx context.Context) {
   111  		// kernel panic
   112  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before the kernel panic is triggered.
   113  		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
   114  	})
   115  
   116  	ginkgo.It("each node by switching off the network interface and ensure they function upon switch on", func(ctx context.Context) {
   117  		// switch the network interface off for a while to simulate a network outage
   118  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before the network goes down.
   119  		cmd := "nohup sh -c '" +
   120  			"sleep 10; " +
   121  			"echo Shutting down eth0 | sudo tee /dev/kmsg; " +
   122  			"sudo ip link set eth0 down | sudo tee /dev/kmsg; " +
   123  			"sleep 120; " +
   124  			"echo Starting up eth0 | sudo tee /dev/kmsg; " +
   125  			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
   126  			"sleep 10; " +
   127  			"echo Retrying starting up eth0 | sudo tee /dev/kmsg; " +
   128  			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
   129  			"echo Running dhclient | sudo tee /dev/kmsg; " +
   130  			"sudo dhclient | sudo tee /dev/kmsg; " +
   131  			"echo Starting systemd-networkd | sudo tee /dev/kmsg; " +
   132  			"sudo systemctl restart systemd-networkd | sudo tee /dev/kmsg" +
   133  			"' >/dev/null 2>&1 &"
   134  		testReboot(ctx, f.ClientSet, cmd, nil)
   135  	})
   136  
   137  	ginkgo.It("each node by dropping all inbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
   138  		// tell the firewall to drop all inbound packets for a while
   139  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before we start dropping inbound packets.
   140  		// We still accept packets sent from localhost to prevent monit from restarting kubelet.
   141  		tmpLogPath := "/tmp/drop-inbound.log"
   142  		testReboot(ctx, f.ClientSet, dropPacketsScript("INPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
   143  	})
   144  
   145  	ginkgo.It("each node by dropping all outbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
   146  		// tell the firewall to drop all outbound packets for a while
   147  		// We sleep 10 seconds to give the ssh command some time to finish cleanly before we start dropping outbound packets.
   148  		// We still accept packets sent to localhost to prevent monit from restarting kubelet.
   149  		tmpLogPath := "/tmp/drop-outbound.log"
   150  		testReboot(ctx, f.ClientSet, dropPacketsScript("OUTPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
   151  	})
   152  })
   153  
   154  func testReboot(ctx context.Context, c clientset.Interface, rebootCmd string, hook terminationHook) {
   155  	// Get all nodes, and kick off the test on each.
   156  	nodelist, err := e2enode.GetReadySchedulableNodes(ctx, c)
   157  	framework.ExpectNoError(err, "failed to list nodes")
   158  	if hook != nil {
   159  		defer func() {
   160  			framework.Logf("Executing termination hook on nodes")
   161  			hook(framework.TestContext.Provider, nodelist)
   162  		}()
   163  	}
   164  	result := make([]bool, len(nodelist.Items))
   165  	wg := sync.WaitGroup{}
   166  	wg.Add(len(nodelist.Items))
   167  
   168  	for ix := range nodelist.Items {
   169  		go func(ix int) {
   170  			defer ginkgo.GinkgoRecover()
   171  			defer wg.Done()
   172  			n := nodelist.Items[ix]
   173  			result[ix] = rebootNode(ctx, c, framework.TestContext.Provider, n.ObjectMeta.Name, rebootCmd)
   174  		}(ix)
   175  	}
   176  
   177  	// Wait for all to finish and check the final result.
   178  	wg.Wait()
   179  
   180  	// Reading result here is race-free: wg.Wait() guarantees that every
   181  	// goroutine above has finished writing its slot.
   182  	failed := false
   183  	for ix := range nodelist.Items {
   184  		n := nodelist.Items[ix]
   185  		if !result[ix] {
   186  			framework.Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
   187  			failed = true
   188  		}
   189  	}
   190  	if failed {
   191  		framework.Failf("Test failed; at least one node failed to reboot in the time given.")
   192  	}
   193  }
   194  
   195  func printStatusAndLogsForNotReadyPods(ctx context.Context, c clientset.Interface, ns string, podNames []string, pods []*v1.Pod) {
   196  	printFn := func(id, log string, err error, previous bool) {
   197  		prefix := "Retrieving log for container"
   198  		if previous {
   199  			prefix = "Retrieving log for the last terminated container"
   200  		}
   201  		if err != nil {
   202  			framework.Logf("%s %s, err: %v:\n%s\n", prefix, id, err, log)
   203  		} else {
   204  			framework.Logf("%s %s:\n%s\n", prefix, id, log)
   205  		}
   206  	}
   207  	podNameSet := sets.NewString(podNames...)
   208  	for _, p := range pods {
   209  		if p.Namespace != ns {
   210  			continue
   211  		}
   212  		if !podNameSet.Has(p.Name) {
   213  			continue
   214  		}
   215  		if ok, _ := testutils.PodRunningReady(p); ok {
   216  			continue
   217  		}
   218  		framework.Logf("Status for not ready pod %s/%s: %+v", p.Namespace, p.Name, p.Status)
   219  		// Print the logs of the containers if the pod is not running and ready.
   220  		for _, container := range p.Status.ContainerStatuses {
   221  			cIdentifer := fmt.Sprintf("%s/%s/%s", p.Namespace, p.Name, container.Name)
   222  			log, err := e2epod.GetPodLogs(ctx, c, p.Namespace, p.Name, container.Name)
   223  			printFn(cIdentifer, log, err, false)
   224  			if container.RestartCount > 0 { // also fetch the previous container instance's log
   225  				prevLog, prevErr := e2epod.GetPreviousPodLogs(ctx, c, p.Namespace, p.Name, container.Name)
   226  				printFn(cIdentifer, prevLog, prevErr, true)
   227  			}
   228  		}
   229  	}
   230  }
   231  
   232  // rebootNode takes the named node on the given provider through the following steps, using c:
   233  //   - ensures the node is ready
   234  //   - ensures all pods on the node are running and ready
   235  //   - reboots the node (by executing rebootCmd over ssh)
   236  //   - ensures the node reaches some non-ready state
   237  //   - ensures the node becomes ready again
   238  //   - ensures all pods on the node become running and ready again
   239  //
   240  // It returns true only if all of the steps pass; at the first failed step, it
   241  // returns false and does not run the rest.
   242  func rebootNode(ctx context.Context, c clientset.Interface, provider, name, rebootCmd string) bool {
   243  	// Setup
   244  	ns := metav1.NamespaceSystem
   245  	ps, err := testutils.NewPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector("spec.nodeName", name))
   246  	if err != nil {
   247  		framework.Logf("Couldn't initialize pod store: %v", err)
   248  		return false
   249  	}
   250  	defer ps.Stop()
   251  
   252  	// Get the node initially.
   253  	framework.Logf("Getting %s", name)
   254  	node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
   255  	if err != nil {
   256  		framework.Logf("Couldn't get node %s", name)
   257  		return false
   258  	}
   259  
   260  	// Node sanity check: ensure it is "ready".
   261  	if !e2enode.WaitForNodeToBeReady(ctx, c, name, framework.NodeReadyInitialTimeout) {
   262  		return false
   263  	}
   264  
   265  	// Get all the pods on the node that don't have a liveness probe set.
   266  	// A liveness probe may restart a pod during the node reboot, in which case it may not be running afterwards.
   267  	pods := ps.List()
   268  	podNames := []string{}
   269  	for _, p := range pods {
   270  		probe := false
   271  		for _, c := range p.Spec.Containers {
   272  			if c.LivenessProbe != nil {
   273  				probe = true
   274  				break
   275  			}
   276  		}
   277  		if !probe {
   278  			podNames = append(podNames, p.ObjectMeta.Name)
   279  		}
   280  	}
   281  	framework.Logf("Node %s has %d assigned pods with no liveness probes: %v", name, len(podNames), podNames)
   282  
   283  	// For each pod, we do a sanity check to ensure it's running / healthy
   284  	// or succeeded now, as that's what we'll be checking later.
   285  	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, framework.PodReadyBeforeTimeout) {
   286  		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, pods)
   287  		return false
   288  	}
   289  
   290  	// Reboot the node.
   291  	if err = e2essh.IssueSSHCommand(ctx, rebootCmd, provider, node); err != nil {
   292  		framework.Logf("Error while issuing ssh command: %v", err)
   293  		return false
   294  	}
   295  
   296  	// Wait for some kind of "not ready" status.
   297  	if !e2enode.WaitForNodeToBeNotReady(ctx, c, name, rebootNodeNotReadyTimeout) {
   298  		return false
   299  	}
   300  
   301  	// Wait for some kind of "ready" status.
   302  	if !e2enode.WaitForNodeToBeReady(ctx, c, name, rebootNodeReadyAgainTimeout) {
   303  		return false
   304  	}
   305  
   306  	// Ensure all of the pods that we found on this node before the reboot are
   307  	// running / healthy, or succeeded.
   308  	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, rebootPodReadyAgainTimeout) {
   309  		newPods := ps.List()
   310  		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, newPods)
   311  		return false
   312  	}
   313  
   314  	framework.Logf("Reboot successful on node %s", name)
   315  	return true
   316  }
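(An illustrative aside, not part of the original file: rebootNode tracks only the pods bound to the rebooted node by handing testutils.NewPodStore a spec.nodeName field selector. A minimal sketch of the equivalent one-shot query, assuming ctx, c, and name as in rebootNode:

	// List the kube-system pods scheduled to a given node, using the same
	// field selector that rebootNode passes to testutils.NewPodStore.
	opts := metav1.ListOptions{
		FieldSelector: fields.OneTermEqualSelector("spec.nodeName", name).String(),
	}
	podList, err := c.CoreV1().Pods(metav1.NamespaceSystem).List(ctx, opts)
	if err != nil {
		framework.Logf("Couldn't list pods on node %s: %v", name, err)
	} else {
		framework.Logf("Node %s has %d kube-system pods", name, len(podList.Items))
	}

The pod store is preferred in the test itself because it keeps a watch-backed, continuously updated view of the node's pods across the reboot rather than a single snapshot.)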
   317  
   318  type terminationHook func(provider string, nodes *v1.NodeList)
   319  
   320  func catLogHook(ctx context.Context, logPath string) terminationHook {
   321  	return func(provider string, nodes *v1.NodeList) {
   322  		for _, n := range nodes.Items {
   323  			cmd := fmt.Sprintf("cat %v && rm %v", logPath, logPath)
   324  			if _, err := e2essh.IssueSSHCommandWithResult(ctx, cmd, provider, &n); err != nil {
   325  				framework.Logf("Error while issuing ssh command: %v", err)
   326  			}
   327  		}
   328  
   329  	}
   330  }
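(catLogHook is the only hook defined in this file, but any function matching the terminationHook signature can be passed to testReboot. As a hypothetical sketch reusing the same e2essh helper, a hook that logs the tail of each node's kernel ring buffer could look like the following; dmesgHook is not part of the original file:

	// dmesgHook returns a terminationHook that logs the last lines of the
	// kernel log from every node, mirroring the catLogHook pattern.
	func dmesgHook(ctx context.Context, lines int) terminationHook {
		return func(provider string, nodes *v1.NodeList) {
			for _, n := range nodes.Items {
				cmd := fmt.Sprintf("sudo dmesg | tail -n %d", lines)
				result, err := e2essh.IssueSSHCommandWithResult(ctx, cmd, provider, &n)
				if err != nil {
					framework.Logf("Error while issuing ssh command: %v", err)
					continue
				}
				framework.Logf("Kernel log tail for node %s:\n%s", n.Name, result.Stdout)
			}
		}
	}

Such a hook would be wired up exactly like catLogHook, e.g. testReboot(ctx, f.ClientSet, cmd, dmesgHook(ctx, 50)).)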
   331  
   332  func dropPacketsScript(chainName, logPath string) string {
   333  	return strings.Replace(fmt.Sprintf(`
   334  		nohup sh -c '
   335  			set -x
   336  			sleep 10
   337  			while true; do sudo iptables -I ${CHAIN} 1 -s 127.0.0.1 -j ACCEPT && break; done
   338  			while true; do sudo iptables -I ${CHAIN} 2 -j DROP && break; done
   339  			date
   340  			sleep 120
   341  			while true; do sudo iptables -D ${CHAIN} -j DROP && break; done
   342  			while true; do sudo iptables -D ${CHAIN} -s 127.0.0.1 -j ACCEPT && break; done
   343  		' >%v 2>&1 &
   344  		`, logPath), "${CHAIN}", chainName, -1)
   345  }
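For the inbound-packets spec above, dropPacketsScript("INPUT", "/tmp/drop-inbound.log") therefore expands (indentation aside) to roughly:

	nohup sh -c '
		set -x
		sleep 10
		while true; do sudo iptables -I INPUT 1 -s 127.0.0.1 -j ACCEPT && break; done
		while true; do sudo iptables -I INPUT 2 -j DROP && break; done
		date
		sleep 120
		while true; do sudo iptables -D INPUT -j DROP && break; done
		while true; do sudo iptables -D INPUT -s 127.0.0.1 -j ACCEPT && break; done
	' >/tmp/drop-inbound.log 2>&1 &

The 10-second sleep lets the ssh session return cleanly, the leading ACCEPT rule keeps loopback traffic working (so monit does not restart the kubelet) while everything else is dropped for two minutes, the while/break loops retry each iptables call until it succeeds, and all output lands in the log file that catLogHook later prints and removes.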
   346  
