package gcp

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

const (
	// How long a node is allowed to go from "Ready" to "NotReady" after a
	// reboot is issued before the test is considered failed.
	rebootNodeNotReadyTimeout = 2 * time.Minute

	// How long a node is allowed to go from "NotReady" back to "Ready" after
	// a reboot is issued and it has been observed as "NotReady" before the
	// test is considered failed.
	rebootNodeReadyAgainTimeout = 5 * time.Minute

	// How long pods have to become "ready" again after the reboot.
	rebootPodReadyAgainTimeout = 5 * time.Minute
)

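// Reboot tests: each spec disrupts every ready schedulable node in the cluster
// over SSH and then verifies that the node, and the pods on it that have no
// liveness probe, return to a healthy state within the timeouts above.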
var _ = SIGDescribe("Reboot", framework.WithDisruptive(), feature.Reboot, func() {
	var f *framework.Framework

	ginkgo.BeforeEach(func() {
		// These tests require SSH access to the nodes, so only run on providers
		// for which the framework knows how to obtain an SSH signer.

		// The cluster must support node reboot.
		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		if ginkgo.CurrentSpecReport().Failed() {
			// Most of the reboot tests only check that addon/system pods are
			// running, so dump the kube-system events on failure.
			namespaceName := metav1.NamespaceSystem
			ginkgo.By(fmt.Sprintf("Collecting events from namespace %q.", namespaceName))
			events, err := f.ClientSet.CoreV1().Events(namespaceName).List(ctx, metav1.ListOptions{})
			framework.ExpectNoError(err)

			for _, e := range events.Items {
				framework.Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
			}
		}
		// On GKE, the master-to-node tunneling setup can hold on to a broken
		// tunnel to a rebooted node for up to 5 minutes before dead tunnels are
		// dropped and recreated. A test that runs right after a reboot test may
		// therefore try to use a closed tunnel (for example when fetching logs
		// through the proxy). There is no good way to poll for the tunnels
		// being closed, so sleep.
		if framework.ProviderIs("gke") {
			ginkgo.By("waiting 5 minutes for all dead tunnels to be dropped")
			time.Sleep(5 * time.Minute)
		}
	})

	f = framework.NewDefaultFramework("reboot")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.It("each node by ordering clean reboot and ensure they function upon restart", func(ctx context.Context) {
		// Clean shutdown and restart. The sleep gives the ssh command time to
		// finish cleanly before the node goes down.
		testReboot(ctx, f.ClientSet, "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by ordering unclean reboot and ensure they function upon restart", func(ctx context.Context) {
		// Unclean shutdown and restart via the sysrq trigger ("b" reboots the
		// machine immediately, without syncing or unmounting disks).
		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by triggering kernel panic and ensure they function upon restart", func(ctx context.Context) {
		// Trigger a kernel panic via the sysrq trigger ("c" crashes the system).
		// The sleep gives the ssh command time to finish cleanly first.
		testReboot(ctx, f.ClientSet, "nohup sh -c 'echo 1 | sudo tee /proc/sys/kernel/sysrq && sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &", nil)
	})

	ginkgo.It("each node by switching off the network interface and ensure they function upon switch on", func(ctx context.Context) {
		// Take the primary network interface down for two minutes to simulate
		// a network outage, then bring it back up and restore networking.
		cmd := "nohup sh -c '" +
			"sleep 10; " +
			"echo Shutting down eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 down | sudo tee /dev/kmsg; " +
			"sleep 120; " +
			"echo Starting up eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
			"sleep 10; " +
			"echo Retrying starting up eth0 | sudo tee /dev/kmsg; " +
			"sudo ip link set eth0 up | sudo tee /dev/kmsg; " +
			"echo Running dhclient | sudo tee /dev/kmsg; " +
			"sudo dhclient | sudo tee /dev/kmsg; " +
			"echo Starting systemd-networkd | sudo tee /dev/kmsg; " +
			"sudo systemctl restart systemd-networkd | sudo tee /dev/kmsg" +
			"' >/dev/null 2>&1 &"
		testReboot(ctx, f.ClientSet, cmd, nil)
	})

	ginkgo.It("each node by dropping all inbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
		// Tell the firewall to drop all inbound packets for a while. Packets
		// from 127.0.0.1 are still accepted (see dropPacketsScript) so that
		// local node agents keep working. The script output is collected from
		// the node afterwards by the termination hook.
		tmpLogPath := "/tmp/drop-inbound.log"
		testReboot(ctx, f.ClientSet, dropPacketsScript("INPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
	})

	ginkgo.It("each node by dropping all outbound packets for a while and ensure they function afterwards", func(ctx context.Context) {
		// Same as above, but for outbound traffic: drop everything on the
		// OUTPUT chain for a while, except packets sourced from 127.0.0.1.
		tmpLogPath := "/tmp/drop-outbound.log"
		testReboot(ctx, f.ClientSet, dropPacketsScript("OUTPUT", tmpLogPath), catLogHook(ctx, tmpLogPath))
	})
})

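// testReboot reboots (or otherwise disrupts) every ready schedulable node in
// the cluster in parallel by running rebootCmd over SSH, and fails the test if
// any node does not pass rebootNode's checks. If hook is non-nil it is invoked
// against all nodes after the reboots, regardless of the outcome.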
func testReboot(ctx context.Context, c clientset.Interface, rebootCmd string, hook terminationHook) {
	// Get all nodes, and kick off the test on each.
	nodelist, err := e2enode.GetReadySchedulableNodes(ctx, c)
	framework.ExpectNoError(err, "failed to list nodes")
	if hook != nil {
		defer func() {
			framework.Logf("Executing termination hook on nodes")
			hook(framework.TestContext.Provider, nodelist)
		}()
	}
	result := make([]bool, len(nodelist.Items))
	wg := sync.WaitGroup{}
	wg.Add(len(nodelist.Items))

	failed := false
	for ix := range nodelist.Items {
		go func(ix int) {
			defer ginkgo.GinkgoRecover()
			defer wg.Done()
			n := nodelist.Items[ix]
			result[ix] = rebootNode(ctx, c, framework.TestContext.Provider, n.ObjectMeta.Name, rebootCmd)
			if !result[ix] {
				failed = true
			}
		}(ix)
	}

	// Wait for all the reboots to finish, then report which nodes failed.
	wg.Wait()

	if failed {
		for ix := range nodelist.Items {
			n := nodelist.Items[ix]
			if !result[ix] {
				framework.Logf("Node %s failed reboot test.", n.ObjectMeta.Name)
			}
		}
		framework.Failf("Test failed; at least one node failed to reboot in the time given.")
	}
}

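// printStatusAndLogsForNotReadyPods logs the status of every pod in pods
// (restricted to namespace ns and the names in podNames) that is not running
// and ready, together with its container logs.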
func printStatusAndLogsForNotReadyPods(ctx context.Context, c clientset.Interface, ns string, podNames []string, pods []*v1.Pod) {
	printFn := func(id, log string, err error, previous bool) {
		prefix := "Retrieving log for container"
		if previous {
			prefix = "Retrieving log for the last terminated container"
		}
		if err != nil {
			framework.Logf("%s %s, err: %v:\n%s\n", prefix, id, err, log)
		} else {
			framework.Logf("%s %s:\n%s\n", prefix, id, log)
		}
	}
	podNameSet := sets.NewString(podNames...)
	for _, p := range pods {
		if p.Namespace != ns {
			continue
		}
		if !podNameSet.Has(p.Name) {
			continue
		}
		if ok, _ := testutils.PodRunningReady(p); ok {
			continue
		}
		framework.Logf("Status for not ready pod %s/%s: %+v", p.Namespace, p.Name, p.Status)
		// Print the logs of the containers of any pod that is not running and ready.
		for _, container := range p.Status.ContainerStatuses {
			cIdentifer := fmt.Sprintf("%s/%s/%s", p.Namespace, p.Name, container.Name)
			log, err := e2epod.GetPodLogs(ctx, c, p.Namespace, p.Name, container.Name)
			printFn(cIdentifer, log, err, false)
			// If the container has restarted, print the output again, flagged
			// as belonging to the last terminated container.
			if container.RestartCount > 0 {
				printFn(cIdentifer, log, err, true)
			}
		}
	}
}

// rebootNode takes the named node on the given provider through the following
// steps, using client c:
//   - ensures the node is ready
//   - ensures all pods on the node are running and ready (or succeeded)
//   - reboots the node by executing rebootCmd over SSH
//   - ensures the node reaches some non-ready state
//   - ensures the node becomes ready again
//   - ensures all pods on the node become running and ready (or succeeded) again
//
// It returns true only if all of the steps pass; it returns false at the first
// step that fails.
func rebootNode(ctx context.Context, c clientset.Interface, provider, name, rebootCmd string) bool {
	// Set up a pod store watching the system pods assigned to this node.
	ns := metav1.NamespaceSystem
	ps, err := testutils.NewPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector("spec.nodeName", name))
	if err != nil {
		framework.Logf("Couldn't initialize pod store: %v", err)
		return false
	}
	defer ps.Stop()

	// Get the node object initially.
	framework.Logf("Getting %s", name)
	node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		framework.Logf("Couldn't get node %s", name)
		return false
	}

	// Sanity check: the node should be ready before it is disrupted.
	if !e2enode.WaitForNodeToBeReady(ctx, c, name, framework.NodeReadyInitialTimeout) {
		return false
	}

	// Get all the pods assigned to the node that do not have a liveness probe
	// set; a liveness probe may restart a pod during the node reboot, in which
	// case the pod may not be running afterwards.
	pods := ps.List()
	podNames := []string{}
	for _, p := range pods {
		probe := false
		for _, c := range p.Spec.Containers {
			if c.LivenessProbe != nil {
				probe = true
				break
			}
		}
		if !probe {
			podNames = append(podNames, p.ObjectMeta.Name)
		}
	}
	framework.Logf("Node %s has %d assigned pods with no liveness probes: %v", name, len(podNames), podNames)

	// For each of those pods, check that it is running and ready (or has
	// succeeded) now, as that is what will be checked again after the reboot.
	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, framework.PodReadyBeforeTimeout) {
		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, pods)
		return false
	}

	// Reboot the node.
	if err = e2essh.IssueSSHCommand(ctx, rebootCmd, provider, node); err != nil {
		framework.Logf("Error while issuing ssh command: %v", err)
		return false
	}

	// Wait for the node to report some kind of "not ready" status.
	if !e2enode.WaitForNodeToBeNotReady(ctx, c, name, rebootNodeNotReadyTimeout) {
		return false
	}

	// Wait for the node to become ready again.
	if !e2enode.WaitForNodeToBeReady(ctx, c, name, rebootNodeReadyAgainTimeout) {
		return false
	}

	// Ensure all of the pods that were found on this node before the reboot
	// are running and ready (or have succeeded) again.
	if !e2epod.CheckPodsRunningReadyOrSucceeded(ctx, c, ns, podNames, rebootPodReadyAgainTimeout) {
		newPods := ps.List()
		printStatusAndLogsForNotReadyPods(ctx, c, ns, podNames, newPods)
		return false
	}

	framework.Logf("Reboot successful on node %s", name)
	return true
}

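// terminationHook is run against all nodes at the end of testReboot, for
// example to collect log files written on the nodes by the reboot command.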
type terminationHook func(provider string, nodes *v1.NodeList)

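// catLogHook returns a terminationHook that dumps the file at logPath from
// every node over SSH and then removes it. SSH errors are only logged, so the
// hook never fails the surrounding test.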
func catLogHook(ctx context.Context, logPath string) terminationHook {
	return func(provider string, nodes *v1.NodeList) {
		for _, n := range nodes.Items {
			cmd := fmt.Sprintf("cat %v && rm %v", logPath, logPath)
			if _, err := e2essh.IssueSSHCommandWithResult(ctx, cmd, provider, &n); err != nil {
				framework.Logf("Error while issuing ssh command: %v", err)
			}
		}
	}
}

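// dropPacketsScript returns a shell command that, after a short delay, inserts
// iptables rules into chainName that drop all packets except those from
// 127.0.0.1, waits two minutes, and then removes the rules again. All output
// is redirected to logPath on the node so it can be retrieved by catLogHook.
//
// For example, dropPacketsScript("INPUT", "/tmp/drop-inbound.log") produces a
// script whose core (ignoring the retry loops and nohup wrapper) is roughly:
//
//	sudo iptables -I INPUT 1 -s 127.0.0.1 -j ACCEPT
//	sudo iptables -I INPUT 2 -j DROP
//	sleep 120
//	sudo iptables -D INPUT -j DROP
//	sudo iptables -D INPUT -s 127.0.0.1 -j ACCEPT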
func dropPacketsScript(chainName, logPath string) string {
	return strings.Replace(fmt.Sprintf(`
		nohup sh -c '
			set -x
			sleep 10
			while true; do sudo iptables -I ${CHAIN} 1 -s 127.0.0.1 -j ACCEPT && break; done
			while true; do sudo iptables -I ${CHAIN} 2 -j DROP && break; done
			date
			sleep 120
			while true; do sudo iptables -D ${CHAIN} -j DROP && break; done
			while true; do sudo iptables -D ${CHAIN} -s 127.0.0.1 -j ACCEPT && break; done
		' >%v 2>&1 &
		`, logPath), "${CHAIN}", chainName, -1)
}