...
1#!/bin/bash
2
3# Copyright 2019 The Kubernetes Authors.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17# A small smoke test to run against a just-deployed kube-up cluster with Windows
18# nodes. Performs checks such as:
19# 1) Verifying that all Windows nodes have status Ready.
20# 2) Verifying that no system pods are attempting to run on Windows nodes.
21# 3) Verifying pairwise connectivity between most of the following: Linux
22# pods, Windows pods, K8s services, and the Internet.
23# 4) Verifying that basic DNS resolution works in Windows pods.
24#
25# This script assumes that it is run from the root of the kubernetes repository.
26#
27# TODOs:
28# - Implement the node-to-pod checks.
29# - Capture stdout for each command to a file and only print it when the test
30# fails.
31# - Move copy-pasted code into reusable functions.
32# - Continue running all checks after one fails.
33# - Test service connectivity by running a test pod with an http server and
34# exposing it as a service (rather than curl-ing from existing system
35# services that don't serve http requests).
36# - Add test retries for transient errors, such as:
37# "error: unable to upgrade connection: Authorization error
38# (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)"
39
40# Override this to use a different kubectl binary.
41kubectl=kubectl
42linux_deployment_timeout=60
43windows_deployment_timeout=600
44output_file=/tmp/k8s-smoke-test.out
45
46function check_windows_nodes_are_ready {
47 # kubectl filtering is the worst.
48 statuses=$(${kubectl} get nodes -l kubernetes.io/os=windows \
49 -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}')
50 for status in $statuses; do
51 if [[ $status == "False" ]]; then
52 echo "ERROR: some Windows node has status != Ready"
53 echo "kubectl get nodes -l kubernetes.io/os=windows"
54 ${kubectl} get nodes -l kubernetes.io/os=windows
55 exit 1
56 fi
57 done
58 echo "Verified that all Windows nodes have status Ready"
59}
60
61function untaint_windows_nodes {
62 # Untaint the windows nodes to allow test workloads without tolerations to be
63 # scheduled onto them.
64 WINDOWS_NODES=$(${kubectl} get nodes -l kubernetes.io/os=windows -o name)
65 for node in $WINDOWS_NODES; do
66 ${kubectl} taint node "$node" node.kubernetes.io/os:NoSchedule-
67 done
68}
69
70function check_no_system_pods_on_windows_nodes {
71 windows_system_pods=$(${kubectl} get pods --namespace kube-system \
72 -o wide | grep -E "Pending|windows" | wc -w)
73 if [[ $windows_system_pods -ne 0 ]]; then
74 echo "ERROR: there are kube-system pods trying to run on Windows nodes"
75 echo "kubectl get pods --namespace kube-system -o wide"
76 ${kubectl} get pods --namespace kube-system -o wide
77 exit 1
78 fi
79 echo "Verified that all system pods are running on Linux nodes"
80}
81
82linux_webserver_deployment=linux-nginx
83linux_webserver_pod_label=nginx
84linux_webserver_replicas=1
85
86function deploy_linux_webserver_pod {
87 echo "Writing example deployment to $linux_webserver_deployment.yaml"
88 cat <<EOF > $linux_webserver_deployment.yaml
89apiVersion: apps/v1
90kind: Deployment
91metadata:
92 name: $linux_webserver_deployment
93 labels:
94 app: $linux_webserver_pod_label
95spec:
96 replicas: $linux_webserver_replicas
97 selector:
98 matchLabels:
99 app: $linux_webserver_pod_label
100 template:
101 metadata:
102 labels:
103 app: $linux_webserver_pod_label
104 spec:
105 containers:
106 - name: nginx
107 image: nginx:1.7.9
108 nodeSelector:
109 kubernetes.io/os: linux
110EOF
111
112 if ! ${kubectl} create -f $linux_webserver_deployment.yaml; then
113 echo "kubectl create -f $linux_webserver_deployment.yaml failed"
114 exit 1
115 fi
116
117 timeout=$linux_deployment_timeout
118 while [[ $timeout -gt 0 ]]; do
119 echo "Waiting for $linux_webserver_replicas Linux $linux_webserver_pod_label pods to become Ready"
120 statuses=$(${kubectl} get pods -l app=$linux_webserver_pod_label \
121 -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
122 | grep "True" | wc -w)
123 if [[ $statuses -eq $linux_webserver_replicas ]]; then
124 break
125 else
126 sleep 10
127 (( timeout=timeout-10 ))
128 fi
129 done
130
131 if [[ $timeout -gt 0 ]]; then
132 echo "All $linux_webserver_pod_label pods became Ready"
133 else
134 echo "ERROR: Not all $linux_webserver_pod_label pods became Ready"
135 echo "kubectl get pods -l app=$linux_webserver_pod_label"
136 ${kubectl} get pods -l app=$linux_webserver_pod_label
137 cleanup_deployments
138 exit 1
139 fi
140}
141
142# Returns the IP address of an arbitrary Linux webserver pod.
143function get_linux_webserver_pod_ip {
144 $kubectl get pods -l app="$linux_webserver_pod_label" \
145 -o jsonpath='{.items[0].status.podIP}'
146}
147
148function undeploy_linux_webserver_pod {
149 ${kubectl} delete deployment $linux_webserver_deployment
150}
151
152linux_command_deployment=linux-ubuntu
153linux_command_pod_label=ubuntu
154linux_command_replicas=1
155
156function deploy_linux_command_pod {
157 echo "Writing example deployment to $linux_command_deployment.yaml"
158 cat <<EOF > $linux_command_deployment.yaml
159apiVersion: apps/v1
160kind: Deployment
161metadata:
162 name: $linux_command_deployment
163 labels:
164 app: $linux_command_pod_label
165spec:
166 replicas: $linux_command_replicas
167 selector:
168 matchLabels:
169 app: $linux_command_pod_label
170 template:
171 metadata:
172 labels:
173 app: $linux_command_pod_label
174 spec:
175 containers:
176 - name: ubuntu
177 image: ubuntu
178 command: ["sleep", "123456"]
179 nodeSelector:
180 kubernetes.io/os: linux
181EOF
182
183 if ! ${kubectl} create -f $linux_command_deployment.yaml; then
184 echo "kubectl create -f $linux_command_deployment.yaml failed"
185 exit 1
186 fi
187
188 timeout=$linux_deployment_timeout
189 while [[ $timeout -gt 0 ]]; do
190 echo "Waiting for $linux_command_replicas Linux $linux_command_pod_label pods to become Ready"
191 statuses=$(${kubectl} get pods -l app="$linux_command_pod_label" \
192 -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
193 | grep "True" | wc -w)
194 if [[ $statuses -eq $linux_command_replicas ]]; then
195 break
196 else
197 sleep 10
198 (( timeout=timeout-10 ))
199 fi
200 done
201
202 if [[ $timeout -gt 0 ]]; then
203 echo "All $linux_command_pod_label pods became Ready"
204 else
205 echo "ERROR: Not all $linux_command_pod_label pods became Ready"
206 echo "kubectl get pods -l app=$linux_command_pod_label"
207 ${kubectl} get pods -l app="$linux_command_pod_label"
208 cleanup_deployments
209 exit 1
210 fi
211}
212
213# Returns the name of an arbitrary Linux command pod.
214function get_linux_command_pod_name {
215 $kubectl get pods -l app="$linux_command_pod_label" \
216 -o jsonpath='{.items[0].metadata.name}'
217}
218
219# Installs test executables (ping, curl) in the Linux command pod.
220# NOTE: this assumes that there is only one Linux "command pod".
221# TODO(pjh): fix this.
222function prepare_linux_command_pod {
223 local linux_command_pod
224 linux_command_pod="$(get_linux_command_pod_name)"
225
226 echo "Installing test utilities in Linux command pod, may take a minute"
227 $kubectl exec "$linux_command_pod" -- apt-get update > /dev/null
228 $kubectl exec "$linux_command_pod" -- \
229 apt-get install -y iputils-ping curl > /dev/null
230}
231
232function undeploy_linux_command_pod {
233 ${kubectl} delete deployment $linux_command_deployment
234}
235
236windows_webserver_deployment=windows-agnhost
237windows_webserver_pod_label=agnhost
238# The default port for 'agnhost serve-hostname'. The documentation says that
239# this can be changed but the --port arg does not seem to work.
240windows_webserver_port=9376
241windows_webserver_replicas=1
242
243function deploy_windows_webserver_pod {
244 echo "Writing example deployment to $windows_webserver_deployment.yaml"
245 cat <<EOF > $windows_webserver_deployment.yaml
246# A multi-arch Windows container that runs an HTTP server on port
247# $windows_webserver_port that serves the container's hostname.
248# curl -s http://<pod_ip>:$windows_webserver_port
249apiVersion: apps/v1
250kind: Deployment
251metadata:
252 name: $windows_webserver_deployment
253 labels:
254 app: $windows_webserver_pod_label
255spec:
256 replicas: $windows_webserver_replicas
257 selector:
258 matchLabels:
259 app: $windows_webserver_pod_label
260 template:
261 metadata:
262 labels:
263 app: $windows_webserver_pod_label
264 spec:
265 containers:
266 - name: agnhost
267 image: e2eteam/agnhost:2.26
268 args:
269 - serve-hostname
270 nodeSelector:
271 kubernetes.io/os: windows
272 tolerations:
273 - effect: NoSchedule
274 key: node.kubernetes.io/os
275 operator: Equal
276 value: windows
277EOF
278
279 if ! ${kubectl} create -f $windows_webserver_deployment.yaml; then
280 echo "kubectl create -f $windows_webserver_deployment.yaml failed"
281 exit 1
282 fi
283
284 timeout=$windows_deployment_timeout
285 while [[ $timeout -gt 0 ]]; do
286 echo "Waiting for $windows_webserver_replicas Windows $windows_webserver_pod_label pods to become Ready"
287 statuses=$(${kubectl} get pods -l app=$windows_webserver_pod_label \
288 -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
289 | grep "True" | wc -w)
290 if [[ $statuses -eq $windows_webserver_replicas ]]; then
291 break
292 else
293 sleep 10
294 (( timeout=timeout-10 ))
295 fi
296 done
297
298 if [[ $timeout -gt 0 ]]; then
299 echo "All $windows_webserver_pod_label pods became Ready"
300 else
301 echo "ERROR: Not all $windows_webserver_pod_label pods became Ready"
302 echo "kubectl get pods -l app=$windows_webserver_pod_label"
303 ${kubectl} get pods -l app=$windows_webserver_pod_label
304 cleanup_deployments
305 exit 1
306 fi
307}
308
309function get_windows_webserver_pod_ip {
310 ${kubectl} get pods -l app="$windows_webserver_pod_label" \
311 -o jsonpath='{.items[0].status.podIP}'
312}
313
314function undeploy_windows_webserver_pod {
315 ${kubectl} delete deployment "$windows_webserver_deployment"
316}
317
318windows_command_deployment=windows-powershell
319windows_command_pod_label=powershell
320windows_command_replicas=1
321
322# Deploys a multi-arch Windows pod capable of running PowerShell.
323function deploy_windows_command_pod {
324 echo "Writing example deployment to $windows_command_deployment.yaml"
325 cat <<EOF > $windows_command_deployment.yaml
326apiVersion: apps/v1
327kind: Deployment
328metadata:
329 name: $windows_command_deployment
330 labels:
331 app: $windows_command_pod_label
332spec:
333 replicas: $windows_command_replicas
334 selector:
335 matchLabels:
336 app: $windows_command_pod_label
337 template:
338 metadata:
339 labels:
340 app: $windows_command_pod_label
341 spec:
342 containers:
343 - name: pause-win
344 image: registry.k8s.io/pause:3.9
345 nodeSelector:
346 kubernetes.io/os: windows
347 tolerations:
348 - effect: NoSchedule
349 key: node.kubernetes.io/os
350 operator: Equal
351 value: windows
352EOF
353
354 if ! ${kubectl} create -f $windows_command_deployment.yaml; then
355 echo "kubectl create -f $windows_command_deployment.yaml failed"
356 exit 1
357 fi
358
359 timeout=$windows_deployment_timeout
360 while [[ $timeout -gt 0 ]]; do
361 echo "Waiting for $windows_command_replicas Windows $windows_command_pod_label pods to become Ready"
362 statuses=$(${kubectl} get pods -l app=$windows_command_pod_label \
363 -o jsonpath='{.items[*].status.conditions[?(@.type=="Ready")].status}' \
364 | grep "True" | wc -w)
365 if [[ $statuses -eq $windows_command_replicas ]]; then
366 break
367 else
368 sleep 10
369 (( timeout=timeout-10 ))
370 fi
371 done
372
373 if [[ $timeout -gt 0 ]]; then
374 echo "All $windows_command_pod_label pods became Ready"
375 else
376 echo "ERROR: Not all $windows_command_pod_label pods became Ready"
377 echo "kubectl get pods -l app=$windows_command_pod_label"
378 ${kubectl} get pods -l app=$windows_command_pod_label
379 cleanup_deployments
380 exit 1
381 fi
382}
383
384function get_windows_command_pod_name {
385 $kubectl get pods -l app="$windows_command_pod_label" \
386 -o jsonpath='{.items[0].metadata.name}'
387}
388
389function undeploy_windows_command_pod {
390 ${kubectl} delete deployment "$windows_command_deployment"
391}
392
393function test_linux_node_to_linux_pod {
394 echo "TODO: ${FUNCNAME[0]}"
395}
396
397function test_linux_node_to_windows_pod {
398 echo "TODO: ${FUNCNAME[0]}"
399}
400
401function test_linux_pod_to_linux_pod {
402 echo "TEST: ${FUNCNAME[0]}"
403 local linux_command_pod
404 linux_command_pod="$(get_linux_command_pod_name)"
405 local linux_webserver_pod_ip
406 linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"
407
408 if ! $kubectl exec "$linux_command_pod" -- curl -s -m 20 \
409 "http://$linux_webserver_pod_ip" &> $output_file; then
410 cleanup_deployments
411 echo "Failing output: $(cat $output_file)"
412 echo "FAILED: ${FUNCNAME[0]}"
413 exit 1
414 fi
415}
416
417# TODO(pjh): this test flakily fails on brand-new clusters, not sure why.
418# % Total % Received % Xferd Average Speed Time Time Time Current
419# Dload Upload Total Spent Left Speed
420# 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
421# curl: (6) Could not resolve host:
422# command terminated with exit code 6
423function test_linux_pod_to_windows_pod {
424 echo "TEST: ${FUNCNAME[0]}"
425 local linux_command_pod
426 linux_command_pod="$(get_linux_command_pod_name)"
427 local windows_webserver_pod_ip
428 windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"
429
430 if ! $kubectl exec "$linux_command_pod" -- curl -s -m 20 \
431 "http://$windows_webserver_pod_ip:$windows_webserver_port" &> $output_file; then
432 cleanup_deployments
433 echo "Failing output: $(cat $output_file)"
434 echo "FAILED: ${FUNCNAME[0]}"
435 echo "This test seems to be flaky. TODO(pjh): investigate."
436 exit 1
437 fi
438}
439
440function test_linux_pod_to_k8s_service {
441 echo "TEST: ${FUNCNAME[0]}"
442 local linux_command_pod
443 linux_command_pod="$(get_linux_command_pod_name)"
444 local service="metrics-server"
445 local service_ip
446 service_ip=$($kubectl get service --namespace kube-system $service \
447 -o jsonpath='{.spec.clusterIP}')
448 local service_port
449 service_port=$($kubectl get service --namespace kube-system $service \
450 -o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
451 echo "curl-ing $service address from Linux pod: $service_ip:$service_port"
452
453 # curl-ing the metrics-server service downloads 14 bytes of unprintable binary
454 # data and sets a return code of success (0).
455 if ! $kubectl exec "$linux_command_pod" -- \
456 curl -s -m 20 --insecure "https://$service_ip:$service_port" &> $output_file; then
457 cleanup_deployments
458 echo "Failing output: $(cat $output_file)"
459 echo "FAILED: ${FUNCNAME[0]}"
460 exit 1
461 fi
462}
463
464function test_windows_node_to_linux_pod {
465 echo "TODO: ${FUNCNAME[0]}"
466}
467
468function test_windows_node_to_windows_pod {
469 echo "TODO: ${FUNCNAME[0]}"
470}
471
472# TODO(pjh): this test failed for me once with
473# error: unable to upgrade connection: container not found ("nettest")
474# Maybe the container crashed for some reason? Investigate if it happens more.
475#
476# TODO(pjh): another one-time failure:
477# error: unable to upgrade connection: Authorization error
478# (user=kube-apiserver, verb=create, resource=nodes, subresource=proxy)
479function test_windows_pod_to_linux_pod {
480 echo "TEST: ${FUNCNAME[0]}"
481 local windows_command_pod
482 windows_command_pod="$(get_windows_command_pod_name)"
483 local linux_webserver_pod_ip
484 linux_webserver_pod_ip="$(get_linux_webserver_pod_ip)"
485
486 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
487 "curl -UseBasicParsing http://$linux_webserver_pod_ip" > \
488 $output_file; then
489 cleanup_deployments
490 echo "Failing output: $(cat $output_file)"
491 echo "FAILED: ${FUNCNAME[0]}"
492 exit 1
493 fi
494}
495
496function test_windows_pod_to_windows_pod {
497 echo "TEST: ${FUNCNAME[0]}"
498 local windows_command_pod
499 windows_command_pod="$(get_windows_command_pod_name)"
500 local windows_webserver_pod_ip
501 windows_webserver_pod_ip="$(get_windows_webserver_pod_ip)"
502
503 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
504 "curl -UseBasicParsing http://$windows_webserver_pod_ip:$windows_webserver_port" \
505 > $output_file; then
506 cleanup_deployments
507 echo "Failing output: $(cat $output_file)"
508 echo "FAILED: ${FUNCNAME[0]}"
509 exit 1
510 fi
511}
512
513function test_windows_pod_to_internet {
514 echo "TEST: ${FUNCNAME[0]}"
515 local windows_command_pod
516 windows_command_pod="$(get_windows_command_pod_name)"
517 # A stable (hopefully) HTTP server provided by Cloudflare. If this ever stops
518 # working, we can request from 8.8.8.8 (Google DNS) using https instead.
519 local internet_ip="1.1.1.1"
520
521 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
522 "curl -UseBasicParsing http://$internet_ip" > $output_file; then
523 cleanup_deployments
524 echo "Failing output: $(cat $output_file)"
525 echo "FAILED: ${FUNCNAME[0]}"
526 exit 1
527 fi
528}
529
530function test_windows_pod_to_k8s_service {
531 echo "TEST: ${FUNCNAME[0]}"
532 local windows_command_pod
533 windows_command_pod="$(get_windows_command_pod_name)"
534 local service="metrics-server"
535 local service_ip
536 service_ip=$($kubectl get service --namespace kube-system $service \
537 -o jsonpath='{.spec.clusterIP}')
538 local service_port
539 service_port=$($kubectl get service --namespace kube-system $service \
540 -o jsonpath='{.spec.ports[?(@.protocol=="TCP")].port}')
541 local service_address="$service_ip:$service_port"
542
543 echo "curl-ing $service address from Windows pod: $service_address"
544 # curl-ing the metrics-server service results in a ServerProtocolViolation
545 # ("The server committed a protocol violation. Section=ResponseStatusLine")
546 # exception. Since we don't care about what the metrics-server actually gives
547 # back to us, just that we can reach it, we check that we get the expected
548 # exception code and not some other exception code.
549 # TODO: it might be less fragile to check that we don't get the "Unable to
550 # connect to the remote server" exception code (2) instead of specifically
551 # expecting the protocol-violation exception code (11).
552 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
553 "\$result = try { \`
554 curl -UseBasicParsing http://$service_address -ErrorAction Stop \`
555 } catch [System.Net.WebException] { \`
556 \$_ \`
557 }; \`
558 if ([int]\$result.Exception.Status -eq 11) { \`
559 Write-Host \"curl $service_address got expected exception\"
560 exit 0 \`
561 } else { \`
562 Write-Host \"curl $service_address got unexpected result/exception: \$result\"
563 exit 1 \`
564 }" > $output_file; then
565 cleanup_deployments
566 echo "Failing output: $(cat $output_file)"
567 echo "FAILED: ${FUNCNAME[0]}"
568 exit 1
569 fi
570}
571
572function test_kube_dns_in_windows_pod {
573 echo "TEST: ${FUNCNAME[0]}"
574 local windows_command_pod
575 windows_command_pod="$(get_windows_command_pod_name)"
576 local service="kube-dns"
577 local service_ip
578 service_ip=$($kubectl get service --namespace kube-system $service \
579 -o jsonpath='{.spec.clusterIP}')
580
581 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
582 "Resolve-DnsName www.bing.com -server $service_ip" > $output_file; then
583 cleanup_deployments
584 echo "Failing output: $(cat $output_file)"
585 echo "FAILED: ${FUNCNAME[0]}"
586 exit 1
587 fi
588}
589
590function test_dns_just_works_in_windows_pod {
591 echo "TEST: ${FUNCNAME[0]}"
592 local windows_command_pod
593 windows_command_pod="$(get_windows_command_pod_name)"
594
595 if ! $kubectl exec "$windows_command_pod" -- powershell.exe \
596 "curl -UseBasicParsing http://www.bing.com" > $output_file; then
597 cleanup_deployments
598 echo "Failing output: $(cat $output_file)"
599 echo "FAILED: ${FUNCNAME[0]}"
600 exit 1
601 fi
602}
603
604function cleanup_deployments {
605 undeploy_linux_webserver_pod
606 undeploy_linux_command_pod
607 undeploy_windows_webserver_pod
608 undeploy_windows_command_pod
609}
610
611check_windows_nodes_are_ready
612untaint_windows_nodes
613check_no_system_pods_on_windows_nodes
614
615deploy_linux_webserver_pod
616deploy_linux_command_pod
617deploy_windows_webserver_pod
618deploy_windows_command_pod
619prepare_linux_command_pod
620echo ""
621
622test_linux_node_to_linux_pod
623test_linux_node_to_windows_pod
624test_linux_pod_to_linux_pod
625test_linux_pod_to_windows_pod
626test_linux_pod_to_k8s_service
627
628# Note: test_windows_node_to_k8s_service is not supported at this time.
629# https://docs.microsoft.com/en-us/virtualization/windowscontainers/kubernetes/common-problems#my-windows-node-cannot-access-my-services-using-the-service-ip
630test_windows_node_to_linux_pod
631test_windows_node_to_windows_pod
632test_windows_pod_to_linux_pod
633test_windows_pod_to_windows_pod
634test_windows_pod_to_internet
635test_windows_pod_to_k8s_service
636test_kube_dns_in_windows_pod
637test_dns_just_works_in_windows_pod
638echo ""
639
640cleanup_deployments
641echo "All tests passed!"
642exit 0
View as plain text