1#!/usr/bin/env bash
2
3# Copyright 2017 The Kubernetes Authors.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17# Call this to dump all master and node logs into the folder specified in $1
18# (defaults to _artifacts). Only works if the provider supports SSH.
19
20# TODO(shyamjvs): This script should be moved to test/e2e which is where it ideally belongs.
set -o errexit
set -o nounset
set -o pipefail

# Positional arguments (see header comment): local artifacts directory,
# optional GCS directory for logexporter uploads, and the namespace the
# logexporter daemonset runs in.
readonly report_dir="${1:-_artifacts}"
readonly gcs_artifacts_dir="${2:-}"
readonly logexporter_namespace="${3:-logexporter}"

# In order to more trivially extend log-dump for custom deployments,
# check for a function named log_dump_custom_get_instances. If it's
# defined, we assume the function can be called with one argument, the
# role, which is either "master" or "node".
echo 'Checking for custom logdump instances, if any'
if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
  readonly use_custom_instance_list=yes
else
  readonly use_custom_instance_list=
fi

# Provider allow-lists, matched via regex against ${KUBERNETES_PROVIDER}.
readonly master_ssh_supported_providers="gce aws"
readonly node_ssh_supported_providers="gce gke aws"
readonly gcloud_supported_providers="gce gke"

# Space-separated lists of /var/log file names to fetch. A '*' suffix is
# appended at copy time so rotated logs are picked up too.
readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov"
readonly node_systemd_services="node-problem-detector"
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
readonly aws_logfiles="cloud-init-output.log"
readonly gce_logfiles="startupscript.log"
readonly kern_logfile="kern.log"
readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
# Optional user-supplied extras, all space-separated lists via env vars.
readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
# Log files found in WINDOWS_LOGS_DIR on Windows nodes:
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log docker_images.log csi-proxy.log"
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"

# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
readonly max_dump_processes=25

# Indicator variable whether we experienced a significant failure during
# logexporter creation or execution.
logexporter_failed=0

# Percentage of nodes that must be logexported successfully (otherwise the
# process will exit with a non-zero exit code).
readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"
73
# Print a banner telling users that this in-tree copy of log-dump.sh is
# deprecated in favor of the test-infra repository's version.
function print-deprecation-note() {
  # 100-column dashed separator.
  local -r separator=$(printf '%100s' '' | tr ' ' '-')
  printf '%s\n' "${separator}"
  printf '%s\n' "k/k version of the log-dump.sh script is deprecated!"
  printf '%s\n' "Please migrate your test job to use test-infra's repo version of log-dump.sh!"
  printf '%s\n' "Migration steps can be found in the readme file."
  printf '%s\n' "${separator}"
}
82
# TODO: Get rid of all the sourcing of bash dependencies eventually.
# Source provider utilities and detect cluster parameters. Which files get
# sourced depends on whether a custom instance-list function is in use and on
# the provider; when a custom list is used without gke, LOG_DUMP_SSH_KEY and
# LOG_DUMP_SSH_USER are mandatory and we exit 1 if they are missing.
function setup() {
  KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../..
  if [[ -z "${use_custom_instance_list}" ]]; then
    # Default path: use the provider's own utilities and project detection.
    : "${KUBE_CONFIG_FILE:=config-test.sh}"
    echo 'Sourcing kube-util.sh'
    source "${KUBE_ROOT}/cluster/kube-util.sh"
    echo 'Detecting project'
    detect-project 2>&1
  elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
    # Source the below script for the ssh-to-node utility function.
    # Hack to save and restore the value of the ZONE env as the script overwrites it.
    local gke_zone="${ZONE:-}"
    source "${KUBE_ROOT}/cluster/gce/util.sh"
    ZONE="${gke_zone}"
  elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
    echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
    exit 1
  elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
    echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
    exit 1
  fi
  # Needed for kube::util::ensure-temp-dir used by main().
  source "${KUBE_ROOT}/hack/lib/util.sh"
}
108
# Run a command on a remote host. For gcloud-supported providers this
# delegates to the provider's ssh-to-node helper; otherwise it uses plain
# ssh with the key/user given via LOG_DUMP_SSH_KEY / LOG_DUMP_SSH_USER.
# $1 - host, $2 - command string.
function log-dump-ssh() {
  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    ssh-to-node "$@"
    return
  fi

  local -r target_host="$1"
  local -r remote_cmd="$2"

  ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no \
    -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${target_host}" "${remote_cmd}"
}
120
# Copy all files /var/log/{$3}.log on node $1 into local dir $2.
# $3 and following args are the bare file names; a '*' suffix and a
# '/var/log/' prefix are added here before copying.
# This function shouldn't ever trigger errexit, but doesn't block stderr.
function copy-logs-from-node() {
    local -r node="${1}"
    local -r dir="${2}"
    shift
    shift
    local files=("$@")
    # Append "*"
    # The * at the end is needed to also copy rotated logs (which happens
    # in large clusters and long runs).
    files=( "${files[@]/%/*}" )
    # Prepend "/var/log/"
    files=( "${files[@]/#/\/var\/log\/}" )
    # Comma delimit (even the singleton, or scp does the wrong thing), surround by braces.
    # NOTE(review): printf leaves a trailing comma ("{a*,b*,}"); the resulting
    # empty brace entry appears to be tolerated by scp — confirm.
    local -r scp_files="{$(printf "%s," "${files[@]}")}"

    if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information
      gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
      # For gcloud, pass one "node:path" source argument per file instead of
      # the brace-expansion string used by the plain-scp branches below.
      source_file_args=()
      for single_file in "${files[@]}"; do
        source_file_args+=( "${node}:${single_file}" )
      done
      gcloud compute scp --recurse --project "${PROJECT}" --zone "${ZONE}" "${source_file_args[@]}" "${dir}" > /dev/null || true
    elif  [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
      local ip
      ip=$(get_ssh_hostname "${node}")
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
    elif  [[ -n "${use_custom_instance_list}" ]]; then
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
    else
      echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
    fi
}
157
# Save logs for node $1 into directory $2. Pass in any non-common files in $3.
# Pass in any non-common systemd services in $4.
# $3 and $4 should be a space-separated list of files.
# Set $5 to true to indicate it is on master. Default to false.
# This function shouldn't ever trigger errexit
function save-logs() {
    local -r node_name="${1}"
    local -r dir="${2}"
    # Split the space-separated file list into an array.
    local files=()
    IFS=' ' read -r -a files <<< "$3"
    local opt_systemd_services="${4:-""}"
    local on_master="${5:-"false"}"

    # User-supplied extra files (LOG_DUMP_EXTRA_FILES) are always included.
    local extra=()
    IFS=' ' read -r -a extra <<< "$extra_log_files"
    files+=("${extra[@]}")
    if [[ -n "${use_custom_instance_list}" ]]; then
      # Custom deployments can request additional logs via LOG_DUMP_SAVE_LOGS.
      if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
        local dump=()
        IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
        files+=("${dump[@]}")
      fi
    else
      # Add provider-specific log files.
      local providerlogs=()
      case "${KUBERNETES_PROVIDER}" in
        gce|gke)
          IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
          ;;
        aws)
          IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
          ;;
      esac
      files+=("${providerlogs[@]}")
    fi
    # Merge the common, caller-supplied, and user-supplied systemd unit lists.
    local services
    read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"

    if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
        # systemd host: pull unit logs from the journal.
        if [[ "${on_master}" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
        else
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
        fi
        log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true

        for svc in "${services[@]}"; do
            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
        done

        if [[ "$dump_systemd_journal" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
        fi
    else
        # Non-systemd host: fall back to copying the classic log files.
        local tmpfiles=()
        for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
            IFS=' ' read -r -a tmpfiles <<< "$f"
            files+=("${tmpfiles[@]}")
        done
    fi

    # log where we pull the images from
    log-dump-ssh "${node_name}" "sudo ctr -n k8s.io images ls" > "${dir}/images-containerd.log" || true
    log-dump-ssh "${node_name}" "sudo docker images --all" > "${dir}/images-docker.log" || true

    # Try dumping coverage profiles, if it looks like coverage is enabled in the first place.
    if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then
      if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then
        if [[ "${on_master}" == "true" ]]; then
          run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true
          run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true
          run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true
        else
          run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true
        fi
      else
        echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.'
      fi
    fi

    echo 'Changing logfiles to be world-readable for download'
    log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true

    echo "Copying '${files[*]}' from ${node_name}"
    copy-logs-from-node "${node_name}" "${dir}" "${files[@]}"
}
245
# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1 by running Get-EventLog remotely through gcloud compute ssh.
function export-windows-docker-event-log() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    local attempt
    for attempt in 1 2 3; do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "${powershell_cmd}"; then
        break
      fi
      sleep 10
    done
}
264
# Saves prepulled Windows Docker images list to ${WINDOWS_LOGS_DIR}\docker_images.log
# on node $1 by running 'docker image list' remotely through gcloud compute ssh.
function export-windows-docker-images-list() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    local attempt=1
    while (( attempt <= 3 )); do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "${powershell_cmd}"; then
        break
      fi
      sleep 10
      attempt=$((attempt + 1))
    done
}
283
# Saves log files from diagnostics tool.(https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics)
# Enables the diagnostics agent on node $1 via instance metadata, exports its
# logs to GCS, then downloads and unzips the archive into local dir $2.
function save-windows-logs-via-diagnostics-tool() {
    local node="${1}"
    local dest_dir="${2}"

    gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
    local logs_archive_in_gcs
    # The last output line of the diagnose command is the GCS path of the archive.
    logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
    local temp_local_path="${node}.zip"
    # Poll GCS (up to 20 x 10s): the archive is produced asynchronously.
    for retry in {1..20}; do
      if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}"  > /dev/null 2>&1; then
        echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
        break
      else
        sleep 10
      fi
    done

    # Only unpack if the download above actually succeeded.
    if [[ -f "${temp_local_path}" ]]; then
      unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
      rm -f "${temp_local_path}"
    fi
}
307
# Saves log files from SSH
# Copies the well-known Windows log files from node $1 into local dir $2 using
# gcloud compute scp, after materializing the Docker event log and image list
# on the node.
function save-windows-logs-via-ssh() {
    local node="${1}"
    local dest_dir="${2}"

    # Write docker.log and docker_images.log on the node so they can be
    # copied down with the rest of the files below.
    export-windows-docker-event-log "${node}"
    export-windows-docker-images-list "${node}"

    # windows_node_logfiles / windows_node_otherfiles are space-separated
    # strings, not arrays; split them explicitly. (Expanding "${scalar[@]}"
    # quoted yields the whole string as ONE word, which previously built a
    # single bogus path containing every file name.)
    local logfile_names=()
    IFS=' ' read -r -a logfile_names <<< "${windows_node_logfiles}"
    local other_names=()
    IFS=' ' read -r -a other_names <<< "${windows_node_otherfiles}"

    local remote_files=()
    local file
    for file in "${logfile_names[@]}"; do
      remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
    done
    remote_files+=( "${other_names[@]}" )

    # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
    # same time.
    local remote_file retry
    for remote_file in "${remote_files[@]}"; do
      # Retry up to 3 times to allow ssh keys to be properly propagated and
      # stored.
      for retry in {1..3}; do
        if gcloud compute scp --recurse --project "${PROJECT}" \
          --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
          > /dev/null; then
          break
        else
          sleep 10
        fi
      done
    done
}
338
# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
    local -r win_node="${1}"
    local -r out_dir="${2}"

    # Windows log dumping is implemented only for gcloud-backed providers.
    if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      echo "Not saving logs for ${win_node}, Windows log dumping requires gcloud support"
      return
    fi

    case "${KUBERNETES_PROVIDER}" in
      gke)
        save-windows-logs-via-diagnostics-tool "${win_node}" "${out_dir}"
        ;;
      *)
        save-windows-logs-via-ssh "${win_node}" "${out_dir}"
        ;;
    esac

    # Serial port 1 contains the Windows console output.
    gcloud compute instances get-serial-port-output --project "${PROJECT}" \
      --zone "${ZONE}" --port 1 "${win_node}" > "${out_dir}/serial-1.log" || true
}
361
# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
# Remaining args form the command to run inside the container.
function run-in-docker-container() {
  local -r target_node="$1"
  local -r container_name="$2"
  shift 2
  # Resolve the container ID on the node at execution time, then exec into it.
  local remote_cmd
  remote_cmd="docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container_name} --format \"{{.ID}}\")\" $*"
  log-dump-ssh "${target_node}" "${remote_cmd}"
}
370
# Dump logs from every detected master into ${report_dir}/<master-name>.
# Master names come from (in priority order): log_dump_custom_get_instances,
# KUBEMARK_MASTER_NAME, or detect-master. save-logs runs as background jobs,
# throttled to at most ${max_dump_processes} concurrent sessions.
function dump_masters() {
  local master_names=()
  if [[ -n "${use_custom_instance_list}" ]]; then
    while IFS='' read -r line; do master_names+=("$line"); done < <(log_dump_custom_get_instances master)
  elif [[ ! "${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then
    master_names=( "${KUBEMARK_MASTER_NAME}" )
  else
    # Run detect-master in a subshell so its failure doesn't trip errexit here.
    if ! (detect-master); then
      echo 'Master not detected. Is the cluster up?'
      return
    fi
    master_names=( "${MASTER_NAME}" )
  fi

  if [[ "${#master_names[@]}" == 0 ]]; then
    echo 'No masters found?'
    return
  fi

  proc=${max_dump_processes}
  for master_name in "${master_names[@]}"; do
    master_dir="${report_dir}/${master_name}"
    mkdir -p "${master_dir}"
    save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    # (Bare 'proc' inside [[ ... -eq ... ]] is evaluated arithmetically.)
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}
413
# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
# Honors LOGDUMP_ONLY_N_RANDOM_NODES (sample size), ENABLE_HOLLOW_NODE_LOGS
# (kubemark hollow-node logs) and LOG_DUMP_SSH_TIMEOUT_SECONDS (best-effort
# overall SSH time budget). Work is parallelized up to ${max_dump_processes}.
function dump_nodes() {
  local node_names=()
  local windows_node_names=()
  if [[ -n "${1:-}" ]]; then
    echo 'Dumping logs for nodes provided as args to dump_nodes() function'
    node_names=( "$@" )
  elif [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    while IFS='' read -r line; do node_names+=("$line"); done < <(log_dump_custom_get_instances node)
  elif [[ ! "${node_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Node SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
    if [[ -n "${NODE_NAMES:-}" ]]; then
      node_names=( "${NODE_NAMES[@]}" )
    fi
    if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
      windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
    fi
  fi

  if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
    echo 'No nodes found!'
    return
  fi

  node_logfiles_all="${node_logfiles}"
  if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then
    node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
  fi

  linux_nodes_selected_for_logs=()
  if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
    # We randomly choose 'LOGDUMP_ONLY_N_RANDOM_NODES' many nodes for fetching logs.
    for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}")
    do
      linux_nodes_selected_for_logs+=("${node_names[$index]}")
    done
  else
    linux_nodes_selected_for_logs=( "${node_names[@]}" )
  fi
  # Linux nodes first, then Windows nodes; the index comparison below relies
  # on this ordering to pick the right dump routine.
  all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
  all_selected_nodes+=( "${windows_node_names[@]}" )

  proc=${max_dump_processes}
  start="$(date +%s)"
  # log_dump_ssh_timeout is the maximal number of seconds the log dumping over
  # SSH operation can take. Please note that the logic enforcing the timeout
  # is only a best effort. The actual time of the operation may be longer
  # due to waiting for all the child processes below.
  log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
  for i in "${!all_selected_nodes[@]}"; do
    node_name="${all_selected_nodes[$i]}"
    node_dir="${report_dir}/${node_name}"
    mkdir -p "${node_dir}"
    if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
      # Save logs in the background. This speeds up things when there are
      # many nodes.
      save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
    else
      save-logs-windows "${node_name}" "${node_dir}" &
    fi

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
      # The timeout is only checked at batch boundaries, i.e. every
      # ${max_dump_processes} nodes.
      now="$(date +%s)"
      if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
        echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
        break
      fi
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}
502
# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
#   NODE_NAMES
# Sets:
#   NON_LOGEXPORTED_NODES
# Returns non-zero when the GCS marker-file listing itself fails.
function find_non_logexported_nodes() {
  local -r registry="${gcs_artifacts_dir}/logexported-nodes-registry"
  echo "Listing marker files ($registry) for successful nodes..."
  # succeeded_nodes is intentionally left global (matches original behavior).
  succeeded_nodes=$(gsutil ls "${registry}") || return 1
  echo 'Successfully listed marker files for successful nodes'
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    # A node counts as exported when its name appears anywhere in the listing.
    if [[ "${succeeded_nodes}" =~ ${node} ]]; then
      continue
    fi
    NON_LOGEXPORTED_NODES+=("${node}")
  done
}
525
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
# Deploys a logexporter daemonset that uploads node logs straight to GCS,
# waits (bounded) for all nodes to finish, and falls back to SSH-based
# dumping (dump_nodes) for any nodes the logexporter missed. Sets
# logexporter_failed=1 on significant failures.
function dump_nodes_with_logexporter() {
  if [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    NODE_NAMES=()
    while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
  fi

  if [[ -z "${NODE_NAMES:-}" ]]; then
    echo 'No nodes found!'
    return
  fi

  # Obtain parameters required by logexporter.
  local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
  local -r cloud_provider="${KUBERNETES_PROVIDER}"
  local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
  # Wait budget scales with cluster size: 90s base plus 1s per 3 nodes.
  local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
  if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
    local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
  fi

  # Fill in the parameters in the logexporter daemonset template.
  local -r tmp="${KUBE_TEMP}/logexporter"
  local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
  mkdir -p "${tmp}"
  cp "${KUBE_ROOT}/cluster/log-dump/logexporter-daemonset.yaml" "${manifest_yaml}"

  # '@' is used as the sed delimiter because the substituted values contain '/'.
  sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"

  # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
  KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
  if ! "${KUBECTL}" create -f "${manifest_yaml}"; then
    echo 'Failed to create logexporter daemonset.. falling back to logdump through SSH'
    "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
    dump_nodes "${NODE_NAMES[@]}"
    logexporter_failed=1
    return
  fi

  # Periodically fetch list of already logexported nodes to verify
  # if we aren't already done.
  start="$(date +%s)"
  while true; do
    now="$(date +%s)"
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
      echo 'Waiting for all nodes to be logexported timed out.'
      break
    fi
    if find_non_logexported_nodes; then
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
        break
      fi
    fi
    sleep 15
  done

  # Store logs from logexporter pods to allow debugging log exporting process
  # itself.
  proc=${max_dump_processes}
  # The pipeline runs in a subshell; the trailing 'wait' reaps the kubectl
  # logs jobs started inside it.
  "${KUBECTL}" get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
    echo "Fetching logs from ${pod} running on ${node}"
    mkdir -p "${report_dir}/${node}"
    "${KUBECTL}" logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  # Wait for any remaining processes.
  done; wait)

  # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
  for retry in {1..10}; do
    if find_non_logexported_nodes; then
      break
    else
      echo "Attempt ${retry} failed to list marker files for successful nodes"
      if [[ "${retry}" == 10 ]]; then
        echo 'Final attempt to list marker files failed.. falling back to logdump through SSH'
        "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
        dump_nodes "${NODE_NAMES[@]}"
        logexporter_failed=1
        return
      fi
      sleep 2
    fi
  done

  failed_nodes=()
  # The following if is needed, because defaulting for empty arrays
  # seems to treat them as non-empty with single empty string.
  if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
    for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
      failed_nodes+=("${node}")
    done
  fi

  # If less than a certain ratio of the nodes got logexported, report an error.
  if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
    logexporter_failed=1
  fi

  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
  "${KUBECTL}" get pods --namespace "${logexporter_namespace}" || true
  "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
  if [[ "${#failed_nodes[@]}" != 0 ]]; then
    echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
    dump_nodes "${failed_nodes[@]}"
  fi
}
654
# Print GCE activity-log entries (hostError / automaticRestart events) for
# every managed instance group in the cluster, to surface infrastructure-level
# node failures since each group was created. No-op for providers without
# gcloud support.
function detect_node_failures() {
  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    return
  fi

  detect-node-names
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
    local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
  else
    local all_instance_groups=("${INSTANCE_GROUPS[@]}")
  fi

  # NOTE(review): "${all_instance_groups:-}" expands only element 0 of the
  # array, so this guards against an empty/unset array (but not against one
  # whose first entry happens to be an empty string).
  if [ -z "${all_instance_groups:-}" ]; then
    return
  fi
  for group in "${all_instance_groups[@]}"; do
    local creation_timestamp
    creation_timestamp=$(gcloud compute instance-groups managed describe \
                         "${group}" \
                         --project "${PROJECT}" \
                         --zone "${ZONE}" \
                         --format='value(creationTimestamp)')
    echo "Failures for ${group} (if any):"
    # Query the activity log for host errors and automatic restarts affecting
    # instances of this group since the group was created.
    gcloud logging read --order=asc \
          --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
          --project "${PROJECT}" \
          "resource.type=\"gce_instance\"
           logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
           (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
           jsonPayload.resource.name:\"${group}\"
           timestamp >= \"${creation_timestamp}\""
  done
}
688
# Entry point: dump master logs over SSH, then node logs (via logexporter to
# GCS when gcs_artifacts_dir is set, otherwise over SSH), then scan for
# infrastructure node failures. Returns non-zero when the logexporter failed
# and a minimum success percentage was requested.
function main() {
  print-deprecation-note
  setup
  kube::util::ensure-temp-dir
  # Copy master logs to artifacts dir locally (through SSH).
  echo "Dumping logs from master locally to '${report_dir}'"
  dump_masters
  if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
    echo 'Skipping dumping of node logs'
    return
  fi

  # Copy logs from nodes to GCS directly or to artifacts dir locally (through SSH).
  if [[ -n "${gcs_artifacts_dir}" ]]; then
    echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
    dump_nodes_with_logexporter
  else
    echo "Dumping logs from nodes locally to '${report_dir}'"
    dump_nodes
  fi

  detect_node_failures
  # Only propagate a failure exit code when the caller opted in via
  # LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE.
  if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
    return 1
  fi
}
715
716main