package e2enode

import (
	"context"
	"fmt"
	"os"
	"path"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/uuid"
	clientset "k8s.io/client-go/kubernetes"
	coreclientset "k8s.io/client-go/kubernetes/typed/core/v1"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/pkg/kubelet/util"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	"k8s.io/kubernetes/test/e2e/nodefeature"
	testutils "k8s.io/kubernetes/test/utils"
)

var _ = SIGDescribe("NodeProblemDetector", nodefeature.NodeProblemDetector, framework.WithSerial(), func() {
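	// Intervals and timeout shared by the Eventually/Consistently polls below.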
	const (
		pollInterval   = 1 * time.Second
		pollConsistent = 5 * time.Second
		pollTimeout    = 5 * time.Minute
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var c clientset.Interface
	var uid string
	var ns, name, configName, eventNamespace string
	var bootTime, nodeTime time.Time
	var image string

	ginkgo.BeforeEach(func() {
		c = f.ClientSet
		ns = f.Namespace.Name
		uid = string(uuid.NewUUID())
		name = "node-problem-detector-" + uid
		configName = "node-problem-detector-config-" + uid

		eventNamespace = metav1.NamespaceDefault
		image = getNodeProblemDetectorImage()
		ginkgo.By(fmt.Sprintf("Using node-problem-detector image: %s", image))
	})

	// Test the SystemLogMonitor of node-problem-detector.
	ginkgo.Describe("SystemLogMonitor", func() {
		const (
			// Use a test-only condition type so the test never touches a real
			// node condition.
			condition = v1.NodeConditionType("TestCondition")

			// File paths used inside the node-problem-detector pod.
			logFile        = "/log/test.log"
			configFile     = "/config/testconfig.json"
			kubeConfigFile = "/config/kubeconfig"
			etcLocaltime   = "/etc/localtime"

			// Volume names referenced by the pod spec below.
			configVolume    = "config"
			logVolume       = "log"
			localtimeVolume = "localtime"

			// Reasons and messages used by the monitor config and the injected
			// log entries.
			defaultReason  = "Default"
			defaultMessage = "default message"
			tempReason     = "Temporary"
			tempMessage    = "temporary error"
			permReason1    = "Permanent1"
			permMessage1   = "permanent error 1"
			permReason2    = "Permanent2"
			permMessage2   = "permanent error 2"
		)
		var source, config, hostLogFile string
		var lookback time.Duration
		var eventListOptions metav1.ListOptions

		ginkgo.BeforeEach(func(ctx context.Context) {
			ginkgo.By("Calculate Lookback duration")
			var err error

			nodeTime = time.Now()
			bootTime, err = util.GetBootTime()
			framework.ExpectNoError(err)

			// Set the lookback duration longer than the node uptime so that
			// every log entry injected after boot is still within range.
			lookback = nodeTime.Sub(bootTime) + time.Hour

			// Randomize the source name so repeated runs do not collide.
			source = "kernel-monitor-" + uid
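			// The SystemLogMonitor config below uses the filelog plugin to tail
			// logFile, parses kernel-style lines with the given regexps, emits a
			// "temporary" event whenever tempMessage matches, and flips the
			// TestCondition node condition when a "permanent" pattern matches.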
			config = `
			{
				"plugin": "filelog",
				"pluginConfig": {
					"timestamp": "^.{15}",
					"message": "kernel: \\[.*\\] (.*)",
					"timestampFormat": "` + time.Stamp + `"
				},
				"logPath": "` + logFile + `",
				"lookback": "` + lookback.String() + `",
				"bufferSize": 10,
				"source": "` + source + `",
				"conditions": [
					{
						"type": "` + string(condition) + `",
						"reason": "` + defaultReason + `",
						"message": "` + defaultMessage + `"
					}
				],
				"rules": [
					{
						"type": "temporary",
						"reason": "` + tempReason + `",
						"pattern": "` + tempMessage + `"
					},
					{
						"type": "permanent",
						"condition": "` + string(condition) + `",
						"reason": "` + permReason1 + `",
						"pattern": "` + permMessage1 + ".*" + `"
					},
					{
						"type": "permanent",
						"condition": "` + string(condition) + `",
						"reason": "` + permReason2 + `",
						"pattern": "` + permMessage2 + ".*" + `"
					}
				]
			}`

			// Build a kubeconfig that lets node-problem-detector authenticate to
			// the test API server with the framework's bearer token.
			kubeConfig := fmt.Sprintf(`
apiVersion: v1
kind: Config
users:
- name: node-problem-detector
  user:
    token: %s
clusters:
- cluster:
    server: %s
    insecure-skip-tls-verify: true
  name: local
contexts:
- context:
    cluster: local
    user: node-problem-detector
  name: local-context
current-context: local-context
`, framework.TestContext.BearerToken, framework.TestContext.Host)

			ginkgo.By("Generate event list options")
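			// Select only events whose involved object is this node and whose
			// source is this test's monitor instance.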
			selector := fields.Set{
				"involvedObject.kind":      "Node",
				"involvedObject.name":      framework.TestContext.NodeName,
				"involvedObject.namespace": metav1.NamespaceAll,
				"source":                   source,
			}.AsSelector().String()
			eventListOptions = metav1.ListOptions{FieldSelector: selector}

			ginkgo.By("Create config map for the node problem detector")
			_, err = c.CoreV1().ConfigMaps(ns).Create(ctx, &v1.ConfigMap{
				ObjectMeta: metav1.ObjectMeta{Name: configName},
				Data: map[string]string{
					path.Base(configFile):     config,
					path.Base(kubeConfigFile): kubeConfig,
				},
			}, metav1.CreateOptions{})
			framework.ExpectNoError(err)

			ginkgo.By("Create the node problem detector")
			hostPathType := new(v1.HostPathType)
			*hostPathType = v1.HostPathFileOrCreate
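			// The pod mounts the config map (monitor config and kubeconfig), an
			// emptyDir for the fake log file, and the host's /etc/localtime so
			// log timestamps are parsed in the node's timezone.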
			pod := e2epod.NewPodClient(f).CreateSync(ctx, &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: name,
				},
				Spec: v1.PodSpec{
					HostNetwork:        true,
					SecurityContext:    &v1.PodSecurityContext{},
					ServiceAccountName: name,
					Volumes: []v1.Volume{
						{
							Name: configVolume,
							VolumeSource: v1.VolumeSource{
								ConfigMap: &v1.ConfigMapVolumeSource{
									LocalObjectReference: v1.LocalObjectReference{Name: configName},
								},
							},
						},
						{
							Name: logVolume,
							VolumeSource: v1.VolumeSource{
								EmptyDir: &v1.EmptyDirVolumeSource{},
							},
						},
						{
							Name: localtimeVolume,
							VolumeSource: v1.VolumeSource{
								HostPath: &v1.HostPathVolumeSource{
									Path: etcLocaltime,
									Type: hostPathType,
								},
							},
						},
					},
					InitContainers: []v1.Container{
						{
							Name:    "init-log-file",
							Image:   "debian",
							Command: []string{"/bin/sh"},
							Args: []string{
								"-c",
								fmt.Sprintf("touch %s", logFile),
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
							},
						},
					},
					Containers: []v1.Container{
						{
							Name:    name,
							Image:   image,
							Command: []string{"/node-problem-detector"},
							Args: []string{
								"--logtostderr",
								fmt.Sprintf("--system-log-monitors=%s", configFile),
								// The ServiceAccount admission controller is disabled
								// in node e2e tests, so in-cluster config is not
								// available; point the detector at the test API server
								// explicitly.
								fmt.Sprintf("--apiserver-override=%s?inClusterConfig=false&auth=%s", framework.TestContext.Host, kubeConfigFile),
							},
							Env: []v1.EnvVar{
								{
									Name: "NODE_NAME",
									ValueFrom: &v1.EnvVarSource{
										FieldRef: &v1.ObjectFieldSelector{
											APIVersion: "v1",
											FieldPath:  "spec.nodeName",
										},
									},
								},
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
								{
									Name:      configVolume,
									MountPath: path.Dir(configFile),
								},
							},
						},
					},
				},
			})

			// Resolve the emptyDir's path on the host so the test can inject log
			// entries into the file the detector is watching.
			hostLogFile = "/var/lib/kubelet/pods/" + string(pod.UID) + "/volumes/kubernetes.io~empty-dir" + logFile
		})

		ginkgo.It("should generate node condition and events for corresponding errors", func(ctx context.Context) {
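			// The cases below run in order and share state: tempEvents and
			// totalEvents are cumulative across cases, and each case verifies
			// both the generated events and the resulting node condition.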
			for _, test := range []struct {
				description      string
				timestamp        time.Time
				message          string
				messageNum       int
				tempEvents       int
				totalEvents      int
				conditionReason  string
				conditionMessage string
				conditionType    v1.ConditionStatus
			}{
				{
					description:      "should generate default node condition",
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not generate events for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not change node condition for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          permMessage1,
					messageNum:       1,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should generate event for old log within lookback duration",
					timestamp:        nodeTime,
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       3,
					totalEvents:      3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should change node condition for old log within lookback duration",
					timestamp:        nodeTime,
					message:          permMessage1,
					messageNum:       1,
					tempEvents:       3,
					totalEvents:      4,
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should generate event for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       6,
					totalEvents:      7,
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should not update node condition with the same reason",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage1 + "different message",
					messageNum:       1,
					tempEvents:       6,
					totalEvents:      7,
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should change node condition for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage2,
					messageNum:       1,
					tempEvents:       6,
					totalEvents:      8,
					conditionReason:  permReason2,
					conditionMessage: permMessage2,
					conditionType:    v1.ConditionTrue,
				},
			} {
				ginkgo.By(test.description)
				if test.messageNum > 0 {
					ginkgo.By(fmt.Sprintf("Inject %d logs: %q", test.messageNum, test.message))
					err := injectLog(hostLogFile, test.timestamp, test.message, test.messageNum)
					framework.ExpectNoError(err)
				}

				ginkgo.By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())

				ginkgo.By(fmt.Sprintf("Make sure node condition %q is set", condition))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure node condition %q is stable", condition))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())
			}
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			if ginkgo.CurrentSpecReport().Failed() && framework.TestContext.DumpLogsOnFailure {
				ginkgo.By("Get node problem detector log")
				log, err := e2epod.GetPodLogs(ctx, c, ns, name, name)
				gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
				framework.Logf("Node Problem Detector logs:\n %s", log)
			}
			ginkgo.By("Delete the node problem detector")
			framework.ExpectNoError(e2epod.NewPodClient(f).Delete(ctx, name, *metav1.NewDeleteOptions(0)))
			ginkgo.By("Wait for the node problem detector to disappear")
			gomega.Expect(e2epod.WaitForPodNotFoundInNamespace(ctx, c, name, ns, pollTimeout)).To(gomega.Succeed())
			ginkgo.By("Delete the config map")
			framework.ExpectNoError(c.CoreV1().ConfigMaps(ns).Delete(ctx, configName, metav1.DeleteOptions{}))
			ginkgo.By("Clean up the events")
			gomega.Expect(c.CoreV1().Events(eventNamespace).DeleteCollection(ctx, *metav1.NewDeleteOptions(0), eventListOptions)).To(gomega.Succeed())
			ginkgo.By("Clean up the node condition")
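			// Remove the test condition from the node status with a strategic
			// merge patch using the "$patch":"delete" directive.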
			patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
			framework.ExpectNoError(c.CoreV1().RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(ctx).Error())
		})
	})
})

// injectLog appends num kernel-style log entries with the given timestamp and
// message to file.
func injectLog(file string, timestamp time.Time, log string, num int) error {
	f, err := os.OpenFile(file, os.O_RDWR|os.O_APPEND, 0666)
	if err != nil {
		return err
	}
	defer f.Close()
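	// Each line mimics a kernel log entry so that it matches the filelog
	// pluginConfig in the test: "<time.Stamp> kernel: [0.000000] <message>".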
	for i := 0; i < num; i++ {
		_, err := f.WriteString(fmt.Sprintf("%s kernel: [0.000000] %s\n", timestamp.Format(time.Stamp), log))
		if err != nil {
			return err
		}
	}
	return nil
}

// verifyEvents verifies that the occurrence count of events matching the given
// reason and message equals num.
func verifyEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
	events, err := e.List(ctx, options)
	if err != nil {
		return err
	}
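	// The API server deduplicates repeated events and increments Count, so sum
	// Count instead of counting list items.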
	count := 0
	for _, event := range events.Items {
		if event.Reason != reason || event.Message != message {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expected %d events with reason %q and message %q, but got %d; events: %v", num, reason, message, count, events.Items)
	}
	return nil
}

// verifyTotalEvents verifies that the total occurrence count across all
// matching events equals num.
func verifyTotalEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
	events, err := e.List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
	for _, event := range events.Items {
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expected %d total events, but got %d; events: %v", num, count, events.Items)
	}
	return nil
}

// verifyNodeCondition verifies that the node condition of the given type has
// the expected status, reason and message.
func verifyNodeCondition(ctx context.Context, n coreclientset.NodeInterface, condition v1.NodeConditionType, status v1.ConditionStatus, reason, message string) error {
	node, err := n.Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	_, c := testutils.GetNodeCondition(&node.Status, condition)
	if c == nil {
		return fmt.Errorf("node condition %q not found", condition)
	}
	if c.Status != status || c.Reason != reason || c.Message != message {
		return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
	}
	return nil
}