
Text file src/k8s.io/kubernetes/cluster/log-dump/log-dump.sh

Documentation: k8s.io/kubernetes/cluster/log-dump

#!/usr/bin/env bash

# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Call this to dump all master and node logs into the folder specified in $1
# (defaults to _artifacts). Only works if the provider supports SSH.

# TODO(shyamjvs): This script should be moved to test/e2e which is where it ideally belongs.
set -o errexit
set -o nounset
set -o pipefail

readonly report_dir="${1:-_artifacts}"
readonly gcs_artifacts_dir="${2:-}"
readonly logexporter_namespace="${3:-logexporter}"
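
# Example invocation (illustrative; the artifacts path and GCS bucket below are
# hypothetical):
#   ./log-dump.sh /tmp/artifacts                    # dump over SSH into /tmp/artifacts
#   ./log-dump.sh _artifacts gs://my-bucket/run-42  # dump node logs to GCS via logexporter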

# In order to more trivially extend log-dump for custom deployments,
# check for a function named log_dump_custom_get_instances. If it's
# defined, we assume the function can be called with one argument, the
# role, which is either "master" or "node".
echo 'Checking for custom logdump instances, if any'
if [[ $(type -t log_dump_custom_get_instances) == "function" ]]; then
  readonly use_custom_instance_list=yes
else
  readonly use_custom_instance_list=
fi
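
# Example (illustrative sketch): a custom deployment can supply this hook by
# defining a function like the one below before invoking the script; the host
# names here are hypothetical. Instances are emitted one per line, matching
# how dump_masters/dump_nodes read them.
#
#   function log_dump_custom_get_instances() {
#     local -r role="$1"  # "master" or "node"
#     case "${role}" in
#       master) echo "master-vm-1" ;;
#       node)   printf '%s\n' "node-vm-1" "node-vm-2" ;;
#     esac
#   }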

readonly master_ssh_supported_providers="gce aws"
readonly node_ssh_supported_providers="gce gke aws"
readonly gcloud_supported_providers="gce gke"

readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov"
readonly node_systemd_services="node-problem-detector"
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
readonly aws_logfiles="cloud-init-output.log"
readonly gce_logfiles="startupscript.log"
readonly kern_logfile="kern.log"
readonly initd_logfiles="docker/log"
readonly supervisord_logfiles="kubelet.log supervisor/supervisord.log supervisor/kubelet-stdout.log supervisor/kubelet-stderr.log supervisor/docker-stdout.log supervisor/docker-stderr.log"
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
readonly extra_log_files="${LOG_DUMP_EXTRA_FILES:-}"
readonly extra_systemd_services="${LOG_DUMP_SAVE_SERVICES:-}"
readonly dump_systemd_journal="${LOG_DUMP_SYSTEMD_JOURNAL:-false}"
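# Example (illustrative): the env knobs above can be overridden per run, e.g.
#   LOG_DUMP_SYSTEMD_SERVICES="containerd" LOG_DUMP_SYSTEMD_JOURNAL=true \
#     ./log-dump.sh /tmp/artifacts
# (the containerd service name is a hypothetical example).
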
# Log files found in WINDOWS_LOGS_DIR on Windows nodes:
readonly windows_node_logfiles="kubelet.log kube-proxy.log docker.log docker_images.log csi-proxy.log"
# Log files found in other directories on Windows nodes:
readonly windows_node_otherfiles="C:\\Windows\\MEMORY.dmp"

# Limit the number of concurrent node connections so that we don't run out of
# file descriptors for large clusters.
readonly max_dump_processes=25

# Indicates whether we experienced a significant failure during logexporter
# creation or execution.
logexporter_failed=0

# Percentage of nodes that must be logexported successfully (otherwise the
# process will exit with a non-zero exit code).
readonly log_dump_expected_success_percentage="${LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE:-0}"
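# For example, LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=90 fails the run unless at
# least 90% of the nodes were logexported successfully; the default of 0 never
# fails the run on logexporter coverage alone.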

function print-deprecation-note() {
  local -r dashline=$(printf -- '-%.0s' {1..100})
  echo "${dashline}"
  echo "k/k version of the log-dump.sh script is deprecated!"
  echo "Please migrate your test job to use test-infra's repo version of log-dump.sh!"
  echo "Migration steps can be found in the readme file."
  echo "${dashline}"
}

# TODO: Get rid of all the sourcing of bash dependencies eventually.
function setup() {
  KUBE_ROOT=$(dirname "${BASH_SOURCE[0]}")/../..
  if [[ -z "${use_custom_instance_list}" ]]; then
    : "${KUBE_CONFIG_FILE:=config-test.sh}"
    echo 'Sourcing kube-util.sh'
    source "${KUBE_ROOT}/cluster/kube-util.sh"
    echo 'Detecting project'
    detect-project 2>&1
  elif [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
    echo "Using 'use_custom_instance_list' with gke, skipping check for LOG_DUMP_SSH_KEY and LOG_DUMP_SSH_USER"
    # Source the script below for the ssh-to-node utility function.
    # Hack to save and restore the value of the ZONE env var, as the script overwrites it.
    local gke_zone="${ZONE:-}"
    source "${KUBE_ROOT}/cluster/gce/util.sh"
    ZONE="${gke_zone}"
  elif [[ -z "${LOG_DUMP_SSH_KEY:-}" ]]; then
    echo 'LOG_DUMP_SSH_KEY not set, but required when using log_dump_custom_get_instances'
    exit 1
  elif [[ -z "${LOG_DUMP_SSH_USER:-}" ]]; then
    echo 'LOG_DUMP_SSH_USER not set, but required when using log_dump_custom_get_instances'
    exit 1
  fi
  source "${KUBE_ROOT}/hack/lib/util.sh"
}

function log-dump-ssh() {
  if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    ssh-to-node "$@"
    return
  fi

  local host="$1"
  local cmd="$2"

  ssh -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${host}" "${cmd}"
}

# Copy all files matching the names given in $3 (and onward) from /var/log on
# node $1 into local dir $2. $3 and any further arguments are individual file names.
# This function shouldn't ever trigger errexit, but it doesn't suppress stderr.
function copy-logs-from-node() {
    local -r node="${1}"
    local -r dir="${2}"
    shift
    shift
    local files=("$@")
    # Append "*" to each name: the wildcard also picks up rotated logs (which
    # appear in large clusters and long runs).
    files=( "${files[@]/%/*}" )
    # Prepend "/var/log/"
    files=( "${files[@]/#/\/var\/log\/}" )
    # Comma-delimit the list (even a single file, or scp does the wrong thing)
    # and surround it with braces.
    local -r scp_files="{$(printf "%s," "${files[@]}")}"
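    # For example, files=("kube-proxy.log" "fluentd.log") yields
    # scp_files="{/var/log/kube-proxy.log*,/var/log/fluentd.log*,}"
    # (note the trailing comma emitted by printf).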

    if [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      # get-serial-port-output lets you ask for ports 1-4, but currently (11/21/2016) only port 1 contains useful information
      gcloud compute instances get-serial-port-output --project "${PROJECT}" --zone "${ZONE}" --port 1 "${node}" > "${dir}/serial-1.log" || true
      source_file_args=()
      for single_file in "${files[@]}"; do
        source_file_args+=( "${node}:${single_file}" )
      done
      gcloud compute scp --recurse --project "${PROJECT}" --zone "${ZONE}" "${source_file_args[@]}" "${dir}" > /dev/null || true
    elif [[ "${KUBERNETES_PROVIDER}" == "aws" ]]; then
      local ip
      ip=$(get_ssh_hostname "${node}")
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${AWS_SSH_KEY}" "${SSH_USER}@${ip}:${scp_files}" "${dir}" > /dev/null || true
    elif [[ -n "${use_custom_instance_list}" ]]; then
      scp -oLogLevel=quiet -oConnectTimeout=30 -oStrictHostKeyChecking=no -i "${LOG_DUMP_SSH_KEY}" "${LOG_DUMP_SSH_USER}@${node}:${scp_files}" "${dir}" > /dev/null || true
    else
      echo "Unknown cloud-provider '${KUBERNETES_PROVIDER}' and use_custom_instance_list is unset too - skipping logdump for '${node}'"
    fi
}

# Save logs for node $1 into directory $2. Pass any non-common files in $3 and
# any non-common systemd services in $4; both are space-separated lists.
# Set $5 to true when dumping from the master (defaults to false).
# This function shouldn't ever trigger errexit.
function save-logs() {
    local -r node_name="${1}"
    local -r dir="${2}"
    local files=()
    IFS=' ' read -r -a files <<< "$3"
    local opt_systemd_services="${4:-""}"
    local on_master="${5:-"false"}"

    local extra=()
    IFS=' ' read -r -a extra <<< "$extra_log_files"
    files+=("${extra[@]}")
    if [[ -n "${use_custom_instance_list}" ]]; then
      if [[ -n "${LOG_DUMP_SAVE_LOGS:-}" ]]; then
        local dump=()
        IFS=' ' read -r -a dump <<< "${LOG_DUMP_SAVE_LOGS:-}"
        files+=("${dump[@]}")
      fi
    else
      local providerlogs=()
      case "${KUBERNETES_PROVIDER}" in
        gce|gke)
          IFS=' ' read -r -a providerlogs <<< "${gce_logfiles}"
          ;;
        aws)
          IFS=' ' read -r -a providerlogs <<< "${aws_logfiles}"
          ;;
      esac
      files+=("${providerlogs[@]}")
    fi
    local services
    read -r -a services <<< "${systemd_services} ${opt_systemd_services} ${extra_systemd_services}"

    if log-dump-ssh "${node_name}" "command -v journalctl" &> /dev/null; then
        if [[ "${on_master}" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-installation.service" > "${dir}/kube-master-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-master-configuration.service" > "${dir}/kube-master-configuration.log" || true
        else
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-installation.service" > "${dir}/kube-node-installation.log" || true
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u kube-node-configuration.service" > "${dir}/kube-node-configuration.log" || true
        fi
        log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -k" > "${dir}/kern.log" || true

        for svc in "${services[@]}"; do
            log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise -u ${svc}.service" > "${dir}/${svc}.log" || true
        done

        if [[ "$dump_systemd_journal" == "true" ]]; then
          log-dump-ssh "${node_name}" "sudo journalctl --output=short-precise" > "${dir}/systemd.log" || true
        fi
    else
        local tmpfiles=()
        for f in "${kern_logfile}" "${initd_logfiles}" "${supervisord_logfiles}"; do
            IFS=' ' read -r -a tmpfiles <<< "$f"
            files+=("${tmpfiles[@]}")
        done
    fi

    # Log where we pull the images from.
    log-dump-ssh "${node_name}" "sudo ctr -n k8s.io images ls" > "${dir}/images-containerd.log" || true
    log-dump-ssh "${node_name}" "sudo docker images --all" > "${dir}/images-docker.log" || true

    # Try dumping coverage profiles, if it looks like coverage is enabled in the first place.
    if log-dump-ssh "${node_name}" "stat /var/log/kubelet.cov" &> /dev/null; then
      if log-dump-ssh "${node_name}" "command -v docker" &> /dev/null; then
        if [[ "${on_master}" == "true" ]]; then
          run-in-docker-container "${node_name}" "kube-apiserver" "cat /tmp/k8s-kube-apiserver.cov" > "${dir}/kube-apiserver.cov" || true
          run-in-docker-container "${node_name}" "kube-scheduler" "cat /tmp/k8s-kube-scheduler.cov" > "${dir}/kube-scheduler.cov" || true
          run-in-docker-container "${node_name}" "kube-controller-manager" "cat /tmp/k8s-kube-controller-manager.cov" > "${dir}/kube-controller-manager.cov" || true
        else
          run-in-docker-container "${node_name}" "kube-proxy" "cat /tmp/k8s-kube-proxy.cov" > "${dir}/kube-proxy.cov" || true
        fi
      else
        echo 'Coverage profiles seem to exist, but cannot be retrieved from inside containers.'
      fi
    fi

    echo 'Changing logfiles to be world-readable for download'
    log-dump-ssh "${node_name}" "sudo chmod -R a+r /var/log" || true

    echo "Copying '${files[*]}' from ${node_name}"
    copy-logs-from-node "${node_name}" "${dir}" "${files[@]}"
}

# Saves a copy of the Windows Docker event log to ${WINDOWS_LOGS_DIR}\docker.log
# on node $1.
function export-windows-docker-event-log() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(Get-EventLog -LogName Application -Source Docker | Format-Table -Property TimeGenerated, EntryType, Message -Wrap); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "$powershell_cmd"; then
        break
      else
        sleep 10
      fi
    done
}

# Saves the list of prepulled Windows Docker images to
# ${WINDOWS_LOGS_DIR}\docker_images.log on node $1.
function export-windows-docker-images-list() {
    local -r node="${1}"

    local -r powershell_cmd="powershell.exe -Command \"\$logs=\$(docker image list); \$logs | Out-File -FilePath '${WINDOWS_LOGS_DIR}\\docker_images.log'\""

    # Retry up to 3 times to allow ssh keys to be properly propagated and
    # stored.
    for retry in {1..3}; do
      if gcloud compute ssh --project "${PROJECT}" --zone "${ZONE}" "${node}" \
        --command "$powershell_cmd"; then
        break
      else
        sleep 10
      fi
    done
}

# Saves log files produced by the diagnostics tool
# (https://github.com/GoogleCloudPlatform/compute-image-tools/tree/master/cli_tools/diagnostics).
function save-windows-logs-via-diagnostics-tool() {
    local node="${1}"
    local dest_dir="${2}"

    gcloud compute instances add-metadata "${node}" --metadata enable-diagnostics=true --project="${PROJECT}" --zone="${ZONE}"
    local logs_archive_in_gcs
    logs_archive_in_gcs=$(gcloud alpha compute diagnose export-logs "${node}" "--zone=${ZONE}" "--project=${PROJECT}" | tail -n 1)
    local temp_local_path="${node}.zip"
    for retry in {1..20}; do
      if gsutil mv "${logs_archive_in_gcs}" "${temp_local_path}"  > /dev/null 2>&1; then
        echo "Downloaded diagnostics log from ${logs_archive_in_gcs}"
        break
      else
        sleep 10
      fi
    done

    if [[ -f "${temp_local_path}" ]]; then
      unzip "${temp_local_path}" -d "${dest_dir}" > /dev/null
      rm -f "${temp_local_path}"
    fi
}

# Saves log files via SSH.
function save-windows-logs-via-ssh() {
    local node="${1}"
    local dest_dir="${2}"

    export-windows-docker-event-log "${node}"
    export-windows-docker-images-list "${node}"

    local remote_files=()
    for file in "${windows_node_logfiles[@]}"; do
      remote_files+=( "${WINDOWS_LOGS_DIR}\\${file}" )
    done
    remote_files+=( "${windows_node_otherfiles[@]}" )

    # TODO(pjh, yujuhong): handle rotated logs and copying multiple files at the
    # same time.
    for remote_file in "${remote_files[@]}"; do
      # Retry up to 3 times to allow ssh keys to be properly propagated and
      # stored.
      for retry in {1..3}; do
        if gcloud compute scp --recurse --project "${PROJECT}" \
          --zone "${ZONE}" "${node}:${remote_file}" "${dest_dir}" \
          > /dev/null; then
          break
        else
          sleep 10
        fi
      done
    done
}

# Save log files and serial console output from Windows node $1 into local
# directory $2.
# This function shouldn't ever trigger errexit.
function save-logs-windows() {
    local -r node="${1}"
    local -r dest_dir="${2}"

    if [[ ! "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
      echo "Not saving logs for ${node}, Windows log dumping requires gcloud support"
      return
    fi

    if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then
      save-windows-logs-via-diagnostics-tool "${node}" "${dest_dir}"
    else
      save-windows-logs-via-ssh "${node}" "${dest_dir}"
    fi

    # Serial port 1 contains the Windows console output.
    gcloud compute instances get-serial-port-output --project "${PROJECT}" \
      --zone "${ZONE}" --port 1 "${node}" > "${dest_dir}/serial-1.log" || true
}

# Execute a command in container $2 on node $1.
# Uses docker because the container may not ordinarily permit direct execution.
function run-in-docker-container() {
  local node_name="$1"
  local container="$2"
  shift 2
  log-dump-ssh "${node_name}" "docker exec \"\$(docker ps -f label=io.kubernetes.container.name=${container} --format \"{{.ID}}\")\" $*"
}
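# For example, the coverage dump in save-logs invokes roughly:
#   run-in-docker-container "${node}" kube-proxy cat /tmp/k8s-kube-proxy.cov
# which executes, on the node:
#   docker exec "$(docker ps -f label=io.kubernetes.container.name=kube-proxy --format "{{.ID}}")" cat /tmp/k8s-kube-proxy.cov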

function dump_masters() {
  local master_names=()
  if [[ -n "${use_custom_instance_list}" ]]; then
    while IFS='' read -r line; do master_names+=("$line"); done < <(log_dump_custom_get_instances master)
  elif [[ ! "${master_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  elif [[ -n "${KUBEMARK_MASTER_NAME:-}" ]]; then
    master_names=( "${KUBEMARK_MASTER_NAME}" )
  else
    if ! (detect-master); then
      echo 'Master not detected. Is the cluster up?'
      return
    fi
    master_names=( "${MASTER_NAME}" )
  fi

  if [[ "${#master_names[@]}" == 0 ]]; then
    echo 'No masters found?'
    return
  fi

  proc=${max_dump_processes}
  for master_name in "${master_names[@]}"; do
    master_dir="${report_dir}/${master_name}"
    mkdir -p "${master_dir}"
    save-logs "${master_name}" "${master_dir}" "${master_logfiles}" "" "true" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Dumps logs from nodes in the cluster. Linux nodes to dump logs from can be
# specified via $1 or $use_custom_instance_list. If not specified then the nodes
# to dump logs for will be detected using detect-node-names(); if Windows nodes
# are present then they will be detected and their logs will be dumped too.
function dump_nodes() {
  local node_names=()
  local windows_node_names=()
  if [[ -n "${1:-}" ]]; then
    echo 'Dumping logs for nodes provided as args to dump_nodes() function'
    node_names=( "$@" )
  elif [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    while IFS='' read -r line; do node_names+=("$line"); done < <(log_dump_custom_get_instances node)
  elif [[ ! "${node_ssh_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    echo "Node SSH not supported for ${KUBERNETES_PROVIDER}"
    return
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
    if [[ -n "${NODE_NAMES:-}" ]]; then
      node_names=( "${NODE_NAMES[@]}" )
    fi
    if [[ -n "${WINDOWS_NODE_NAMES:-}" ]]; then
      windows_node_names=( "${WINDOWS_NODE_NAMES[@]}" )
    fi
  fi

  if [[ "${#node_names[@]}" == 0 && "${#windows_node_names[@]}" == 0 ]]; then
    echo 'No nodes found!'
    return
  fi

  node_logfiles_all="${node_logfiles}"
  if [[ "${ENABLE_HOLLOW_NODE_LOGS:-}" == "true" ]]; then
    node_logfiles_all="${node_logfiles_all} ${hollow_node_logfiles}"
  fi

  linux_nodes_selected_for_logs=()
  if [[ -n "${LOGDUMP_ONLY_N_RANDOM_NODES:-}" ]]; then
    # Randomly choose LOGDUMP_ONLY_N_RANDOM_NODES nodes to fetch logs from.
    for index in $(shuf -i 0-$(( ${#node_names[*]} - 1 )) -n "${LOGDUMP_ONLY_N_RANDOM_NODES}")
    do
      linux_nodes_selected_for_logs+=("${node_names[$index]}")
    done
  else
    linux_nodes_selected_for_logs=( "${node_names[@]}" )
  fi
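  # For example, with 5 candidate nodes and LOGDUMP_ONLY_N_RANDOM_NODES=2, the
  # selection above runs `shuf -i 0-4 -n 2`, which might emit indices such as
  # "3" and "0" (a hypothetical draw).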
  all_selected_nodes=( "${linux_nodes_selected_for_logs[@]}" )
  all_selected_nodes+=( "${windows_node_names[@]}" )

  proc=${max_dump_processes}
  start="$(date +%s)"
  # log_dump_ssh_timeout_seconds is the maximum number of seconds the log
  # dumping over SSH may take. Note that the logic enforcing the timeout is
  # only best effort: the actual time of the operation may be longer
  # due to waiting for all the child processes below.
  log_dump_ssh_timeout_seconds="${LOG_DUMP_SSH_TIMEOUT_SECONDS:-}"
  for i in "${!all_selected_nodes[@]}"; do
    node_name="${all_selected_nodes[$i]}"
    node_dir="${report_dir}/${node_name}"
    mkdir -p "${node_dir}"
    if [[ "${i}" -lt "${#linux_nodes_selected_for_logs[@]}" ]]; then
      # Save logs in the background. This speeds up things when there are
      # many nodes.
      save-logs "${node_name}" "${node_dir}" "${node_logfiles_all}" "${node_systemd_services}" &
    else
      save-logs-windows "${node_name}" "${node_dir}" &
    fi

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
      now="$(date +%s)"
      if [[ -n "${log_dump_ssh_timeout_seconds}" && $((now - start)) -gt ${log_dump_ssh_timeout_seconds} ]]; then
        echo "WARNING: Hit timeout after ${log_dump_ssh_timeout_seconds} seconds, finishing log dumping over SSH shortly"
        break
      fi
    fi
  done
  # Wait for any remaining processes.
  if [[ proc -gt 0 && proc -lt ${max_dump_processes} ]]; then
    wait
  fi
}

# Collect names of nodes which didn't run logexporter successfully.
# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
#
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
#   NODE_NAMES
# Sets:
#   NON_LOGEXPORTED_NODES
function find_non_logexported_nodes() {
  local file="${gcs_artifacts_dir}/logexported-nodes-registry"
  echo "Listing marker files ($file) for successful nodes..."
  succeeded_nodes=$(gsutil ls "${file}") || return 1
  echo 'Successfully listed marker files for successful nodes'
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    if [[ ! "${succeeded_nodes}" =~ ${node} ]]; then
      NON_LOGEXPORTED_NODES+=("${node}")
    fi
  done
}
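# For example, with gcs_artifacts_dir="gs://my-bucket/run-42" (a hypothetical
# path), the gsutil listing above enumerates marker objects under
# gs://my-bucket/run-42/logexported-nodes-registry, one per node whose
# logexporter succeeded; any NODE_NAMES entry missing from that listing is
# queued for SSH-based dumping.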

# This function examines NODE_NAMES but not WINDOWS_NODE_NAMES since logexporter
# does not run on Windows nodes.
function dump_nodes_with_logexporter() {
  if [[ -n "${use_custom_instance_list}" ]]; then
    echo 'Dumping logs for nodes provided by log_dump_custom_get_instances() function'
    NODE_NAMES=()
    while IFS='' read -r line; do NODE_NAMES+=("$line"); done < <(log_dump_custom_get_instances node)
  else
    echo 'Detecting nodes in the cluster'
    detect-node-names &> /dev/null
  fi

  if [[ -z "${NODE_NAMES:-}" ]]; then
    echo 'No nodes found!'
    return
  fi

  # Obtain parameters required by logexporter.
  local -r service_account_credentials="$(base64 "${GOOGLE_APPLICATION_CREDENTIALS}" | tr -d '\n')"
  local -r cloud_provider="${KUBERNETES_PROVIDER}"
  local -r enable_hollow_node_logs="${ENABLE_HOLLOW_NODE_LOGS:-false}"
  local -r logexport_sleep_seconds="$(( 90 + NUM_NODES / 3 ))"
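  # For example, with NUM_NODES=300 the wait budget is 90 + 300/3 = 190 seconds.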
  if [[ -z "${ZONE_NODE_SELECTOR_DISABLED:-}" ]]; then
    local -r node_selector="${ZONE_NODE_SELECTOR_LABEL:-topology.kubernetes.io/zone}: ${ZONE}"
  fi

  # Fill in the parameters in the logexporter daemonset template.
  local -r tmp="${KUBE_TEMP}/logexporter"
  local -r manifest_yaml="${tmp}/logexporter-daemonset.yaml"
  mkdir -p "${tmp}"
  cp "${KUBE_ROOT}/cluster/log-dump/logexporter-daemonset.yaml" "${manifest_yaml}"

  sed -i'' -e "s@{{.NodeSelector}}@${node_selector:-}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.LogexporterNamespace}}@${logexporter_namespace}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ServiceAccountCredentials}}@${service_account_credentials}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.CloudProvider}}@${cloud_provider}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.GCSPath}}@${gcs_artifacts_dir}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.EnableHollowNodeLogs}}@${enable_hollow_node_logs}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.DumpSystemdJournal}}@${dump_systemd_journal}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraLogFiles}}@${extra_log_files}@g" "${manifest_yaml}"
  sed -i'' -e "s@{{.ExtraSystemdServices}}@${extra_systemd_services}@g" "${manifest_yaml}"
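  # For example, a template line such as (hypothetical excerpt):
  #   namespace: {{.LogexporterNamespace}}
  # becomes, with the default namespace:
  #   namespace: logexporter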

  # Create the logexporter namespace, service-account secret and the logexporter daemonset within that namespace.
  KUBECTL="${KUBE_ROOT}/cluster/kubectl.sh"
  if ! "${KUBECTL}" create -f "${manifest_yaml}"; then
    echo 'Failed to create logexporter daemonset... falling back to logdump through SSH'
    "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
    dump_nodes "${NODE_NAMES[@]}"
    logexporter_failed=1
    return
  fi

  # Periodically fetch the list of already-logexported nodes to check whether
  # we're already done.
  start="$(date +%s)"
  while true; do
    now="$(date +%s)"
    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
      echo 'Waiting for all nodes to be logexported timed out.'
      break
    fi
    if find_non_logexported_nodes; then
      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
        break
      fi
    fi
    sleep 15
  done

  # Store logs from logexporter pods to allow debugging the log exporting
  # process itself.
  proc=${max_dump_processes}
  "${KUBECTL}" get pods -n "${logexporter_namespace}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.nodeName}{"\n"}{end}' | (while read -r pod node; do
    echo "Fetching logs from ${pod} running on ${node}"
    mkdir -p "${report_dir}/${node}"
    "${KUBECTL}" logs -n "${logexporter_namespace}" "${pod}" > "${report_dir}/${node}/${pod}.log" &

    # We don't want to run more than ${max_dump_processes} at a time, so
    # wait once we hit that many nodes. This isn't ideal, since one might
    # take much longer than the others, but it should help.
    proc=$((proc - 1))
    if [[ proc -eq 0 ]]; then
      proc=${max_dump_processes}
      wait
    fi
  # Wait for any remaining processes.
  done; wait)

  # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
  for retry in {1..10}; do
    if find_non_logexported_nodes; then
      break
    else
      echo "Attempt ${retry} failed to list marker files for successful nodes"
      if [[ "${retry}" == 10 ]]; then
        echo 'Final attempt to list marker files failed... falling back to logdump through SSH'
        "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
        dump_nodes "${NODE_NAMES[@]}"
        logexporter_failed=1
        return
      fi
      sleep 2
    fi
  done

  failed_nodes=()
  # The following check is needed because expanding an empty array with a
  # default ("${arr[@]:-}") yields a single empty string rather than nothing.
  if [[ -n "${NON_LOGEXPORTED_NODES:-}" ]]; then
    for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
      failed_nodes+=("${node}")
    done
  fi

  # If the percentage of successfully logexported nodes is below the expected
  # threshold, report an error.
  if [[ $(((${#NODE_NAMES[@]} - ${#failed_nodes[@]}) * 100)) -lt $((${#NODE_NAMES[@]} * log_dump_expected_success_percentage )) ]]; then
    logexporter_failed=1
  fi
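  # For example, with 100 nodes, 10 failures, and
  # LOG_DUMP_EXPECTED_SUCCESS_PERCENTAGE=95: (100 - 10) * 100 = 9000 is less
  # than 100 * 95 = 9500, so logexporter_failed is set.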

  # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
  "${KUBECTL}" get pods --namespace "${logexporter_namespace}" || true
  "${KUBECTL}" delete namespace "${logexporter_namespace}" || true
  if [[ "${#failed_nodes[@]}" != 0 ]]; then
    echo -e "Dumping logs through SSH for the following nodes:\n${failed_nodes[*]}"
    dump_nodes "${failed_nodes[@]}"
  fi
}

function detect_node_failures() {
  if ! [[ "${gcloud_supported_providers}" =~ ${KUBERNETES_PROVIDER} ]]; then
    return
  fi

  detect-node-names
  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
    local all_instance_groups=("${INSTANCE_GROUPS[@]}" "${WINDOWS_INSTANCE_GROUPS[@]}")
  else
    local all_instance_groups=("${INSTANCE_GROUPS[@]}")
  fi

  if [ -z "${all_instance_groups:-}" ]; then
    return
  fi
  for group in "${all_instance_groups[@]}"; do
    local creation_timestamp
    creation_timestamp=$(gcloud compute instance-groups managed describe \
                         "${group}" \
                         --project "${PROJECT}" \
                         --zone "${ZONE}" \
                         --format='value(creationTimestamp)')
    echo "Failures for ${group} (if any):"
    gcloud logging read --order=asc \
          --format='table(timestamp,jsonPayload.resource.name,jsonPayload.event_subtype)' \
          --project "${PROJECT}" \
          "resource.type=\"gce_instance\"
           logName=\"projects/${PROJECT}/logs/compute.googleapis.com%2Factivity_log\"
           (jsonPayload.event_subtype=\"compute.instances.hostError\" OR jsonPayload.event_subtype=\"compute.instances.automaticRestart\")
           jsonPayload.resource.name:\"${group}\"
           timestamp >= \"${creation_timestamp}\""
  done
}

function main() {
  print-deprecation-note
  setup
  kube::util::ensure-temp-dir
  # Copy master logs to artifacts dir locally (through SSH).
  echo "Dumping logs from master locally to '${report_dir}'"
  dump_masters
  if [[ "${DUMP_ONLY_MASTER_LOGS:-}" == "true" ]]; then
    echo 'Skipping dumping of node logs'
    return
  fi

  # Copy logs from nodes to GCS directly or to artifacts dir locally (through SSH).
  if [[ -n "${gcs_artifacts_dir}" ]]; then
    echo "Dumping logs from nodes to GCS directly at '${gcs_artifacts_dir}' using logexporter"
    dump_nodes_with_logexporter
  else
    echo "Dumping logs from nodes locally to '${report_dir}'"
    dump_nodes
  fi

  detect_node_failures
  if [[ ${logexporter_failed} -ne 0 && ${log_dump_expected_success_percentage} -gt 0 ]]; then
    return 1
  fi
}

main
