package upgrade

import (
	"context"
	"fmt"
	"os"
	"time"

	"github.com/pkg/errors"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"

	kubeadmapi "k8s.io/kubernetes/cmd/kubeadm/app/apis/kubeadm"
	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
	"k8s.io/kubernetes/cmd/kubeadm/app/images"
	"k8s.io/kubernetes/cmd/kubeadm/app/preflight"
	"k8s.io/kubernetes/cmd/kubeadm/app/util/output"
)
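
// healthCheck couples a named check function with the client and cluster
// configuration it needs, so it can be run through the preflight checker machinery.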
type healthCheck struct {
	name   string
	client clientset.Interface
	cfg    *kubeadmapi.ClusterConfiguration
	// f runs the actual check against the given client and cluster configuration
	// and returns an error if the check fails.
	f func(clientset.Interface, *kubeadmapi.ClusterConfiguration) error
}
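
// Check runs the wrapped check function and returns any error it produces.
// It is part of the preflight.Checker interface.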
func (c *healthCheck) Check() (warnings, errors []error) {
	if err := c.f(c.client, c.cfg); err != nil {
		return nil, []error{err}
	}
	return nil, nil
}
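
// Name returns the name of the health check.
// It is part of the preflight.Checker interface.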
func (c *healthCheck) Name() string {
	return c.name
}
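
// CheckClusterHealth runs a set of cluster health checks before an upgrade:
// that a test Job can be created and completes, that all control-plane Nodes
// are Ready, and that the expected static Pod manifests exist on disk.
// Errors from checks listed in ignoreChecksErrors are treated as warnings.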
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
	_, _ = printer.Println("[upgrade] Running cluster health checks")

	healthChecks := []preflight.Checker{
		&healthCheck{
			name:   "CreateJob",
			client: client,
			cfg:    cfg,
			f:      createJob,
		},
		&healthCheck{
			name:   "ControlPlaneNodesReady",
			client: client,
			f:      controlPlaneNodesReady,
		},
		&healthCheck{
			name: "StaticPodManifest",
			f:    staticPodManifestHealth,
		},
	}

	return preflight.RunChecks(healthChecks, os.Stderr, ignoreChecksErrors)
}
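
// createJob verifies that the cluster can accept a workload: it creates a
// short-lived Job in the kube-system namespace and waits for it to complete.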
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
	const (
		prefix        = "upgrade-health-check"
		fieldSelector = "spec.unschedulable=false"
		ns            = metav1.NamespaceSystem
		timeout       = 15 * time.Second
	)
	var (
		err, lastError error
		ctx            = context.Background()
		nodes          *v1.NodeList
		listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
	)

	// A nil discovery RESTClient means a fake client is in use (for example during
	// dry-run); only print what would be done and skip the check.
	if client.Discovery().RESTClient() == nil {
		fmt.Printf("[upgrade/health] Would create the Job with the prefix %q in namespace %q and wait until it completes\n", prefix, ns)
		return nil
	}

	// Find out whether at least one schedulable Node exists; the Job's Pod needs somewhere to run.
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
		if err != nil {
			klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
			lastError = err
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
	}

	if len(nodes.Items) == 0 {
		klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
		return nil
	}

	// Build a short-lived Job that runs the pause image as a non-root user and
	// tolerates the control-plane NoSchedule taint, so its Pod can also land on
	// control-plane Nodes.
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: prefix + "-",
			Namespace:    ns,
		},
		Spec: batchv1.JobSpec{
			BackoffLimit:            ptr.To[int32](0),
			TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5),
			Template: v1.PodTemplateSpec{
				Spec: v1.PodSpec{
					RestartPolicy: v1.RestartPolicyNever,
					SecurityContext: &v1.PodSecurityContext{
						RunAsUser:    ptr.To[int64](999),
						RunAsGroup:   ptr.To[int64](999),
						RunAsNonRoot: ptr.To(true),
					},
					Tolerations: []v1.Toleration{
						{
							Key:    constants.LabelNodeRoleControlPlane,
							Effect: v1.TaintEffectNoSchedule,
						},
					},
					Containers: []v1.Container{
						{
							Name:  prefix,
							Image: images.GetPauseImage(cfg),
							Args:  []string{"-v"},
						},
					},
				},
			},
		},
	}

	// Create the Job, retrying on transient errors until the timeout expires.
	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
	var jobName string
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
		if err != nil {
			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
			lastError = err
			return false, nil
		}

		jobName = createdJob.Name
		return true, nil
	})
	if err != nil {
		return errors.Wrapf(lastError, "could not create a Job with the prefix %q in the namespace %q", prefix, ns)
	}

	// Wait for the Job to report the JobComplete condition.
	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
		job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
		if err != nil {
			lastError = err
			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
			return false, nil
		}
		for _, cond := range job.Status.Conditions {
			if cond.Type == batchv1.JobComplete {
				return true, nil
			}
		}
		lastError = errors.Errorf("no condition of type %v", batchv1.JobComplete)
		klog.V(2).Infof("Job %q in the namespace %q is not yet complete, retrying", jobName, ns)
		return false, nil
	})
	if err != nil {
		return errors.Wrapf(lastError, "Job %q in the namespace %q did not complete in %v", jobName, ns, timeout)
	}

	klog.V(2).Infof("Job %q in the namespace %q completed", jobName, ns)

	return nil
}
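
// controlPlaneNodesReady lists the Nodes that carry the control-plane role
// label and returns an error if any of them is not Ready.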
func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
	selectorControlPlane := labels.SelectorFromSet(map[string]string{
		constants.LabelNodeRoleControlPlane: "",
	})
	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		LabelSelector: selectorControlPlane.String(),
	})
	if err != nil {
		return errors.Wrapf(err, "could not list nodes labeled with %q", constants.LabelNodeRoleControlPlane)
	}

	notReadyControlPlanes := getNotReadyNodes(nodes.Items)
	if len(notReadyControlPlanes) != 0 {
		return errors.Errorf("there are NotReady control-planes in the cluster: %v", notReadyControlPlanes)
	}
	return nil
}
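
// staticPodManifestHealth checks that the static Pod manifests for all
// control plane components exist on this node's disk.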
func staticPodManifestHealth(_ clientset.Interface, _ *kubeadmapi.ClusterConfiguration) error {
	var nonExistentManifests []string
	for _, component := range constants.ControlPlaneComponents {
		manifestFile := constants.GetStaticPodFilepath(component, constants.GetStaticPodDirectory())
		if _, err := os.Stat(manifestFile); os.IsNotExist(err) {
			nonExistentManifests = append(nonExistentManifests, manifestFile)
		}
	}
	if len(nonExistentManifests) == 0 {
		return nil
	}
	return errors.Errorf("The control plane seems to be Static Pod-hosted, but some of the manifests don't seem to exist on disk. This probably means you're running 'kubeadm upgrade' on a remote machine, which is not supported for a Static Pod-hosted cluster. Manifest files not found: %v", nonExistentManifests)
}
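
// getNotReadyNodes returns the names of the Nodes whose NodeReady condition
// is not True.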
func getNotReadyNodes(nodes []v1.Node) []string {
	var notReadyNodes []string
	for _, node := range nodes {
		for _, condition := range node.Status.Conditions {
			if condition.Type == v1.NodeReady && condition.Status != v1.ConditionTrue {
				notReadyNodes = append(notReadyNodes, node.ObjectMeta.Name)
			}
		}
	}
	return notReadyNodes
}