#!/usr/bin/env bash
2
3# Copyright 2014 The Kubernetes Authors.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17# Validates that the cluster is healthy.
18# Error codes are:
19# 0 - success
20# 1 - fatal (cluster is unlikely to work)
21# 2 - non-fatal (encountered some errors, but cluster should be working correctly)
22
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Parent of the directory that contains this script.
KUBE_ROOT="$(dirname "${BASH_SOURCE[0]}")/.."

# cluster/env.sh is optional; it is sourced only when it exists.
if [[ -f "${KUBE_ROOT}/cluster/env.sh" ]]; then
  source "${KUBE_ROOT}/cluster/env.sh"
fi

# Shared helper libraries (both are mandatory: a missing file aborts the script).
source "${KUBE_ROOT}/hack/lib/util.sh"
source "${KUBE_ROOT}/cluster/kube-util.sh"
35
# Run kubectl and retry upon failure.
#
# Invokes ${KUBE_ROOT}/cluster/kubectl.sh with the given arguments and makes
# up to 3 attempts in total, sleeping 1 second between attempts.
#
# Globals:
#   KUBE_ROOT (read)
# Arguments:
#   "$@" - forwarded unchanged to kubectl.sh
# Outputs:
#   Whatever kubectl.sh prints; retry and give-up notices go to stderr.
# Returns:
#   0 as soon as one invocation succeeds, 1 once the last attempt has failed.
function kubectl_retry() {
  # Keep the counter function-local so it cannot clobber a caller's variable.
  local tries=3
  while ! "${KUBE_ROOT}/cluster/kubectl.sh" "$@"; do
    tries=$((tries-1))
    if [[ ${tries} -le 0 ]]; then
      echo "('kubectl $*' failed, giving up)" >&2
      return 1
    fi
    echo "(kubectl failed, will retry ${tries} times)" >&2
    sleep 1
  done
}
49
# Tunables, overridable from the environment:
#   ALLOWED_NOTREADY_NODES                - subtracted from the expected node
#                                           count to get the required count.
#   CLUSTER_READY_ADDITIONAL_TIME_SECONDS - extra waiting time, in seconds,
#                                           used later to size the grace period.
: "${ALLOWED_NOTREADY_NODES:=0}"
: "${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:=30}"

# Work out how many nodes the cluster is expected to have.
if [[ "${KUBERNETES_PROVIDER:-}" != "gce" ]]; then
  # Every provider other than gce: take the count straight from NUM_NODES.
  EXPECTED_NUM_NODES="${NUM_NODES}"
else
  if [[ "${KUBE_CREATE_NODES}" != "true" ]]; then
    EXPECTED_NUM_NODES="0"
  else
    EXPECTED_NUM_NODES="$(get-num-nodes)"
  fi
  echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
  # In multizone mode we need to add instances for all nodes in the region:
  # count the instances whose name matches either node prefix in any zone of
  # ${REGION} (one output line per instance).
  if [[ "${MULTIZONE:-}" == "true" ]]; then
    EXPECTED_NUM_NODES=$(
      gcloud -q compute instances list --project="${PROJECT}" --format="[no-heading]" \
        --filter="(name ~ '${NODE_INSTANCE_PREFIX}.*' OR name ~ '${WINDOWS_NODE_INSTANCE_PREFIX}.*') AND zone:($(gcloud -q compute zones list --project="${PROJECT}" --filter=region="${REGION}" --format="csv[no-heading](name)" | tr "\n" "," | sed "s/,$//"))" \
        | wc -l
    )
    echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
  fi
fi

# When masters register their kubelet they show up as nodes too, so add them.
if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
  case "${KUBERNETES_PROVIDER:-}" in
    gce) NUM_MASTERS=$(get-master-replicas-count) ;;
    *) NUM_MASTERS=1 ;;
  esac
  EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES + NUM_MASTERS))
fi

# Minimum number of ready nodes for the cluster to count as usable.
REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
# Wait until the expected number of nodes is registered and ready.
# Make several attempts to deal with slow cluster birth.
#
# Outcomes of this loop:
#   - leaves the loop with return_value=0 when found and ready both equal
#     EXPECTED_NUM_NODES, or (after a warning) when more nodes than expected
#     are found or ready;
#   - leaves the loop with return_value=2 when time runs out but at least
#     REQUIRED_NUM_NODES nodes are ready;
#   - exits the script with status 1 when time runs out with fewer than
#     REQUIRED_NUM_NODES ready nodes, or when nodes cannot be listed.
return_value=0
attempt=0
# Set the timeout to ~25 minutes (100 x 15 seconds) to avoid timeouts for 1000-node clusters.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
# CLUSTER_READY_ADDITIONAL_TIME_SECONDS expressed in loop iterations, rounded up.
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
while true; do
  # Pause between iterations of this large outer loop. The pause must match
  # the value ADDITIONAL_ITERATIONS was derived from, so use the variable.
  if [[ ${attempt} -gt 0 ]]; then
    sleep "${PAUSE_BETWEEN_ITERATIONS_SECONDS}"
  fi
  attempt=$((attempt+1))

  # "kubectl get nodes --no-headers" prints one line per node.
  #
  # Capture the output and gather 2 counts:
  #  - Total number of nodes.
  #  - Number of "ready" nodes.
  #
  # A failing kubectl must not stop the script here: during cluster
  # bootstrapping for clusters where the master node is registered, the
  # apiserver will become available and then get restarted as the kubelet
  # configures the docker bridge. The exit status is therefore stored in the
  # res variable via the "&& ... || ..." form, and a failure only leads to
  # another iteration.
  #
  # Bash command substitution $(kubectl_...) removes all trailing whitespaces
  # which are important for line counting.
  # Use trick from https://unix.stackexchange.com/a/383411 to avoid
  # newline truncation.
  node=$(kubectl_retry get nodes --no-headers; ret=$?; echo .; exit "$ret") && res="$?" || res="$?"
  node="${node%.}"
  if [ "${res}" -ne "0" ]; then
    # last_run is only set once enough nodes are ready; until then the
    # deadline is MAX_ATTEMPTS.
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_red:-} Failed to get nodes.${color_norm:-}"
      exit 1
    else
      continue
    fi
  fi
  found=$(echo -n "${node}" | wc -l)
  # Use grep || true so that empty result doesn't return nonzero exit code.
  ready=$(echo -n "${node}" | grep -c -v "NotReady" || true)

  # The color variables are expanded with a ":-" default throughout so that
  # an undefined color cannot abort validation under "set -o nounset".
  if (( "${found}" == "${EXPECTED_NUM_NODES}" )) && (( "${ready}" == "${EXPECTED_NUM_NODES}")); then
    break
  elif (( "${found}" > "${EXPECTED_NUM_NODES}" )); then
    if [[ "${KUBE_USE_EXISTING_MASTER:-}" != "true" ]]; then
      echo -e "${color_red:-}Found ${found} nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
    fi
    break
  elif (( "${ready}" > "${EXPECTED_NUM_NODES}")); then
    echo -e "${color_red:-}Found ${ready} ready nodes, but expected ${EXPECTED_NUM_NODES}. Your cluster may not behave correctly.${color_norm:-}"
    break
  else
    if [[ "${REQUIRED_NUM_NODES}" -le "${ready}" ]]; then
      echo -e "${color_green:-}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm:-}"
      # Fix the deadline the first time the required count is reached; later
      # iterations keep the already-set value.
      last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
    fi
    if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
      echo -e "${color_yellow:-}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm:-}"
      # Diagnostic listing only: its failure must not pre-empt the
      # fatal / non-fatal decision below.
      kubectl_retry get nodes || true
      if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
        exit 1
      else
        return_value=2
        break
      fi
    else
      echo -e "${color_yellow:-}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm:-}"
    fi
  fi
done
echo "Found ${found} node(s)."
kubectl_retry get nodes
156
# Wait for every reported component to be healthy, then print the final
# verdict and exit with return_value (set by the node check: 0 or 2).
#
# The check is retried up to 5 times with a 30 second pause; if components
# are still unhealthy (or none are reported) after that, the script exits
# with status 1.
attempt=0
while true; do
  # Render one "<type>:<status>" line per component from the first condition
  # of each componentstatus item.
  #
  # Gather 2 counts:
  #  - Total number of componentstatuses.
  #  - Number of "healthy" components.
  #
  # "|| true" keeps a failing kubectl or a zero grep count from aborting the
  # script under errexit; both cases simply yield a count of 0.
  cs_status=$(kubectl_retry get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
  componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
  healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true

  # The color variables are expanded with a ":-" default so that an undefined
  # color cannot abort validation under "set -o nounset".
  if ((componentstatuses > healthy)) || ((componentstatuses == 0)); then
    if ((attempt < 5)); then
      echo -e "${color_yellow:-}Cluster not working yet.${color_norm:-}"
      attempt=$((attempt+1))
      sleep 30
    else
      echo -e " ${color_yellow:-}Validate output:${color_norm:-}"
      # Diagnostic listing only: if it fails, still print the failure
      # message below before exiting.
      kubectl_retry get cs || true
      echo -e "${color_red:-}Validation returned one or more failed components. Cluster is probably broken.${color_norm:-}"
      exit 1
    fi
  else
    break
  fi
done

echo "Validate output:"
kubectl_retry get cs || true
if [[ "${return_value}" == "0" ]]; then
  echo -e "${color_green:-}Cluster validation succeeded${color_norm:-}"
else
  echo -e "${color_yellow:-}Cluster validation encountered some problems, but cluster should be in working order${color_norm:-}"
fi

exit "${return_value}"