1
16
17 package apimachinery
18
import (
	"context"
	"net"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/wait"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/test/e2e/apps"
	"k8s.io/kubernetes/test/e2e/framework"
	e2erc "k8s.io/kubernetes/test/e2e/framework/rc"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)
38
39 var _ = SIGDescribe("Etcd failure", framework.WithDisruptive(), func() {
40
41 f := framework.NewDefaultFramework("etcd-failure")
42 f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
43
44 ginkgo.BeforeEach(func(ctx context.Context) {
45
46
47
48
49
50 e2eskipper.SkipUnlessProviderIs("gce")
51 e2eskipper.SkipUnlessSSHKeyPresent()
52
53 err := e2erc.RunRC(ctx, testutils.RCConfig{
54 Client: f.ClientSet,
55 Name: "baz",
56 Namespace: f.Namespace.Name,
57 Image: imageutils.GetPauseImageName(),
58 Replicas: 1,
59 })
60 framework.ExpectNoError(err)
61 })
62
63 ginkgo.It("should recover from network partition with master", func(ctx context.Context) {
64 etcdFailTest(
65 ctx,
66 f,
67 "sudo iptables -A INPUT -p tcp --destination-port 2379 -j DROP",
68 "sudo iptables -D INPUT -p tcp --destination-port 2379 -j DROP",
69 )
70 })
71
72 ginkgo.It("should recover from SIGKILL", func(ctx context.Context) {
73 etcdFailTest(
74 ctx,
75 f,
76 "pgrep etcd | xargs -I {} sudo kill -9 {}",
77 "echo 'do nothing. monit should restart etcd.'",
78 )
79 })
80 })
81
82 func etcdFailTest(ctx context.Context, f *framework.Framework, failCommand, fixCommand string) {
83 doEtcdFailure(ctx, failCommand, fixCommand)
84
85 checkExistingRCRecovers(ctx, f)
86
87 apps.TestReplicationControllerServeImageOrFail(ctx, f, "basic", framework.ServeHostnameImage)
88 }
89
90
91
92
93
94
95 const etcdFailureDuration = 20 * time.Second
96
97 func doEtcdFailure(ctx context.Context, failCommand, fixCommand string) {
98 ginkgo.By("failing etcd")
99
100 masterExec(ctx, failCommand)
101 time.Sleep(etcdFailureDuration)
102 masterExec(ctx, fixCommand)
103 }
104
105 func masterExec(ctx context.Context, cmd string) {
106 host := framework.APIAddress() + ":22"
107 result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
108 framework.ExpectNoError(err, "failed to SSH to host %s on provider %s and run command: %q", host, framework.TestContext.Provider, cmd)
109 if result.Code != 0 {
110 e2essh.LogResult(result)
111 framework.Failf("master exec command returned non-zero")
112 }
113 }
114
115 func checkExistingRCRecovers(ctx context.Context, f *framework.Framework) {
116 ginkgo.By("assert that the pre-existing replication controller recovers")
117 podClient := f.ClientSet.CoreV1().Pods(f.Namespace.Name)
118 rcSelector := labels.Set{"name": "baz"}.AsSelector()
119
120 ginkgo.By("deleting pods from existing replication controller")
121 framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
122 options := metav1.ListOptions{LabelSelector: rcSelector.String()}
123 pods, err := podClient.List(ctx, options)
124 if err != nil {
125 framework.Logf("apiserver returned error, as expected before recovery: %v", err)
126 return false, nil
127 }
128 if len(pods.Items) == 0 {
129 return false, nil
130 }
131 for _, pod := range pods.Items {
132 err = podClient.Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0))
133 framework.ExpectNoError(err, "failed to delete pod %s in namespace: %s", pod.Name, f.Namespace.Name)
134 }
135 framework.Logf("apiserver has recovered")
136 return true, nil
137 }))
138
139 ginkgo.By("waiting for replication controller to recover")
140 framework.ExpectNoError(wait.PollWithContext(ctx, time.Millisecond*500, time.Second*60, func(ctx context.Context) (bool, error) {
141 options := metav1.ListOptions{LabelSelector: rcSelector.String()}
142 pods, err := podClient.List(ctx, options)
143 framework.ExpectNoError(err, "failed to list pods in namespace: %s, that match label selector: %s", f.Namespace.Name, rcSelector.String())
144 for _, pod := range pods.Items {
145 if pod.DeletionTimestamp == nil && podutil.IsPodReady(&pod) {
146 return true, nil
147 }
148 }
149 return false, nil
150 }))
151 }
152
View as plain text