...

Source file src/github.com/linkerd/linkerd2/pkg/healthcheck/healthcheck.go

Documentation: github.com/linkerd/linkerd2/pkg/healthcheck

     1  package healthcheck
     2  
     3  import (
     4  	"bufio"
     5  	"context"
     6  	"crypto/x509"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"net"
    11  	"os"
    12  	"sort"
    13  	"strconv"
    14  	"strings"
    15  	"time"
    16  
    17  	controllerK8s "github.com/linkerd/linkerd2/controller/k8s"
    18  	l5dcharts "github.com/linkerd/linkerd2/pkg/charts/linkerd2"
    19  	"github.com/linkerd/linkerd2/pkg/config"
    20  	"github.com/linkerd/linkerd2/pkg/identity"
    21  	"github.com/linkerd/linkerd2/pkg/issuercerts"
    22  	"github.com/linkerd/linkerd2/pkg/k8s"
    23  	"github.com/linkerd/linkerd2/pkg/tls"
    24  	"github.com/linkerd/linkerd2/pkg/util"
    25  	"github.com/linkerd/linkerd2/pkg/version"
    26  	log "github.com/sirupsen/logrus"
    27  	admissionRegistration "k8s.io/api/admissionregistration/v1"
    28  	appsv1 "k8s.io/api/apps/v1"
    29  	corev1 "k8s.io/api/core/v1"
    30  	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
    31  	kerrors "k8s.io/apimachinery/pkg/api/errors"
    32  	"k8s.io/apimachinery/pkg/api/meta"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    35  	"k8s.io/apimachinery/pkg/labels"
    36  	"k8s.io/apimachinery/pkg/runtime"
    37  	"k8s.io/apimachinery/pkg/runtime/schema"
    38  	yamlDecoder "k8s.io/apimachinery/pkg/util/yaml"
    39  	k8sVersion "k8s.io/apimachinery/pkg/version"
    40  	"k8s.io/client-go/kubernetes"
    41  	apiregistrationv1client "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1"
    42  	"sigs.k8s.io/yaml"
    43  )
    44  
// CategoryID is an identifier for the types of health checks. Every Category
// carries one of these IDs, and NewHealthChecker enables categories by
// matching the IDs it is given against the IDs of the known categories.
type CategoryID string
    47  
    48  const (
    49  	// KubernetesAPIChecks adds a series of checks to validate that the caller is
    50  	// configured to interact with a working Kubernetes cluster.
    51  	KubernetesAPIChecks CategoryID = "kubernetes-api"
    52  
    53  	// KubernetesVersionChecks validate that the cluster meets the minimum version
    54  	// requirements.
    55  	KubernetesVersionChecks CategoryID = "kubernetes-version"
    56  
    57  	// LinkerdPreInstall* checks enabled by `linkerd check --pre`
    58  
    59  	// LinkerdPreInstallChecks adds checks to validate that the control plane
    60  	// namespace does not already exist, and that the user can create cluster-wide
    61  	// resources, including ClusterRole, ClusterRoleBinding, and
    62  	// CustomResourceDefinition, as well as namespace-wide resources, including
    63  	// Service, Deployment, and ConfigMap. This check only runs as part of the set
    64  	// of pre-install checks.
    65  	// This check is dependent on the output of KubernetesAPIChecks, so those
    66  	// checks must be added first.
    67  	LinkerdPreInstallChecks CategoryID = "pre-kubernetes-setup"
    68  
    69  	// LinkerdCRDChecks adds checks to validate that the control plane CRDs
    70  	// exist. These checks can be run after installing the control plane CRDs
    71  	// but before installing the control plane itself.
    72  	LinkerdCRDChecks CategoryID = "linkerd-crd"
    73  
    74  	// LinkerdConfigChecks enabled by `linkerd check config`
    75  
    76  	// LinkerdConfigChecks adds a series of checks to validate that the Linkerd
    77  	// namespace, RBAC, ServiceAccounts, and CRDs were successfully created.
    78  	// These checks specifically validate that the `linkerd install config`
    79  	// command succeeded in a multi-stage install, but also applies to a default
    80  	// `linkerd install`.
    81  	// These checks are dependent on the output of KubernetesAPIChecks, so those
    82  	// checks must be added first.
    83  	LinkerdConfigChecks CategoryID = "linkerd-config"
    84  
    85  	// LinkerdIdentity Checks the integrity of the mTLS certificates
    86  	// that the control plane is configured with
    87  	LinkerdIdentity CategoryID = "linkerd-identity"
    88  
    89  	// LinkerdWebhooksAndAPISvcTLS the integrity of the mTLS certificates
    90  	// that of the for the injector and sp webhooks and the tap api svc
    91  	LinkerdWebhooksAndAPISvcTLS CategoryID = "linkerd-webhooks-and-apisvc-tls"
    92  
    93  	// LinkerdIdentityDataPlane checks that integrity of the mTLS
    94  	// certificates that the proxies are configured with and tries to
    95  	// report useful information with respect to whether the configuration
    96  	// is compatible with the one of the control plane
    97  	LinkerdIdentityDataPlane CategoryID = "linkerd-identity-data-plane"
    98  
    99  	// LinkerdControlPlaneExistenceChecks adds a series of checks to validate that
   100  	// the control plane namespace and controller pod exist.
   101  	// These checks are dependent on the output of KubernetesAPIChecks, so those
   102  	// checks must be added first.
   103  	LinkerdControlPlaneExistenceChecks CategoryID = "linkerd-existence"
   104  
   105  	// LinkerdVersionChecks adds a series of checks to query for the latest
   106  	// version, and validate the CLI is up to date.
   107  	LinkerdVersionChecks CategoryID = "linkerd-version"
   108  
   109  	// LinkerdControlPlaneVersionChecks adds a series of checks to validate that
   110  	// the control plane is running the latest available version.
   111  	// These checks are dependent on the following:
   112  	// 1) `latestVersions` from LinkerdVersionChecks
   113  	// 2) `serverVersion` from `LinkerdControlPlaneExistenceChecks`
   114  	LinkerdControlPlaneVersionChecks CategoryID = "control-plane-version"
   115  
   116  	// LinkerdDataPlaneChecks adds data plane checks to validate that the
   117  	// data plane namespace exists, and that the proxy containers are in a
   118  	// ready state and running the latest available version.  These checks
   119  	// are dependent on the output of KubernetesAPIChecks and
   120  	// `latestVersions` from LinkerdVersionChecks, so those checks must be
   121  	// added first.
   122  	LinkerdDataPlaneChecks CategoryID = "linkerd-data-plane"
   123  
   124  	// LinkerdControlPlaneProxyChecks adds data plane checks to validate the
   125  	// control-plane proxies. The checkers include running and version checks
   126  	LinkerdControlPlaneProxyChecks CategoryID = "linkerd-control-plane-proxy"
   127  
   128  	// LinkerdHAChecks adds checks to validate that the HA configuration
   129  	// is correct. These checks are no ops if linkerd is not in HA mode
   130  	LinkerdHAChecks CategoryID = "linkerd-ha-checks"
   131  
   132  	// LinkerdCNIPluginChecks adds checks to validate that the CNI
   133  	/// plugin is installed and ready
   134  	LinkerdCNIPluginChecks CategoryID = "linkerd-cni-plugin"
   135  
   136  	// LinkerdOpaquePortsDefinitionChecks adds checks to validate that the
   137  	// "opaque ports" annotation has been defined both in the service and the
   138  	// corresponding pods
   139  	LinkerdOpaquePortsDefinitionChecks CategoryID = "linkerd-opaque-ports-definition"
   140  
   141  	// LinkerdExtensionChecks adds checks to validate configuration for all
   142  	// extensions discovered in the cluster at runtime
   143  	LinkerdExtensionChecks CategoryID = "linkerd-extension-checks"
   144  
   145  	// LinkerdCNIResourceLabel is the label key that is used to identify
   146  	// whether a Kubernetes resource is related to the install-cni command
   147  	// The value is expected to be "true", "false" or "", where "false" and
   148  	// "" are equal, making "false" the default
   149  	LinkerdCNIResourceLabel = "linkerd.io/cni-resource"
   150  
   151  	linkerdCNIDisabledSkipReason = "skipping check because CNI is not enabled"
   152  	linkerdCNIResourceName       = "linkerd-cni"
   153  	linkerdCNIConfigMapName      = "linkerd-cni-config"
   154  
   155  	podCIDRUnavailableSkipReason    = "skipping check because the nodes aren't exposing podCIDR"
   156  	configMapDoesNotExistSkipReason = "skipping check because ConigMap does not exist"
   157  
   158  	proxyInjectorOldTLSSecretName = "linkerd-proxy-injector-tls"
   159  	proxyInjectorTLSSecretName    = "linkerd-proxy-injector-k8s-tls"
   160  
   161  	spValidatorOldTLSSecretName = "linkerd-sp-validator-tls"
   162  	spValidatorTLSSecretName    = "linkerd-sp-validator-k8s-tls"
   163  
   164  	policyValidatorTLSSecretName = "linkerd-policy-validator-k8s-tls"
   165  	certOldKeyName               = "crt.pem"
   166  	certKeyName                  = "tls.crt"
   167  	keyOldKeyName                = "key.pem"
   168  	keyKeyName                   = "tls.key"
   169  )
   170  
// AllowedClockSkew sets the allowed skew in clock synchronization
// between the system running inject command and the node(s). It is the
// assumed node heartbeat interval (5 minutes) plus the default TLS clock
// skew allowance (tls.DefaultClockSkewAllowance).
//
// TODO: Make this default value overridable, e.g. by CLI flag
const AllowedClockSkew = 5*time.Minute + tls.DefaultClockSkewAllowance
   178  
// linkerdHAControlPlaneComponents lists the names of the control plane
// components relevant to HA mode.
// NOTE(review): presumably consulted by the LinkerdHAChecks category; the
// usage is outside this part of the file — confirm.
var linkerdHAControlPlaneComponents = []string{
	"linkerd-destination",
	"linkerd-identity",
	"linkerd-proxy-injector",
}
   184  
// ExpectedServiceAccountNames is a list of the service accounts that a healthy
// Linkerd installation should have. Note that linkerd-heartbeat is optional,
// so it doesn't appear here; it is verified by its own check, which is a
// no-op when the heartbeat is disabled.
var ExpectedServiceAccountNames = []string{
	"linkerd-destination",
	"linkerd-identity",
	"linkerd-proxy-injector",
}
   193  
var (
	// retryWindow is a 5 second interval.
	// NOTE(review): presumably the pause between retries of a failing check;
	// its usage is outside this part of the file — confirm.
	retryWindow = 5 * time.Second
	// RequestTimeout is the time it takes for a request to timeout. It is the
	// timeout handed to k8s.NewAPI when the Kubernetes client is created.
	RequestTimeout = 30 * time.Second
)
   199  
   200  // Resource provides a way to describe a Kubernetes object, kind, and name.
   201  // TODO: Consider sharing with the inject package's ResourceConfig.workload
   202  // struct, as it wraps both runtime.Object and metav1.TypeMeta.
   203  type Resource struct {
   204  	groupVersionKind schema.GroupVersionKind
   205  	name             string
   206  }
   207  
   208  // String outputs the resource in kind.group/name format, intended for
   209  // `linkerd install`.
   210  func (r *Resource) String() string {
   211  	return fmt.Sprintf("%s/%s", strings.ToLower(r.groupVersionKind.GroupKind().String()), r.name)
   212  }
   213  
   214  // ResourceError provides a custom error type for resource existence checks,
   215  // useful in printing detailed error messages in `linkerd check` and
   216  // `linkerd install`.
   217  type ResourceError struct {
   218  	resourceName string
   219  	Resources    []Resource
   220  }
   221  
   222  // Error satisfies the error interface for ResourceError. The output is intended
   223  // for `linkerd check`.
   224  func (e ResourceError) Error() string {
   225  	names := []string{}
   226  	for _, res := range e.Resources {
   227  		names = append(names, res.name)
   228  	}
   229  	return fmt.Sprintf("%s found but should not exist: %s", e.resourceName, strings.Join(names, " "))
   230  }
   231  
   232  // CategoryError provides a custom error type that also contains check category that emitted the error,
   233  // useful when needed to distinguish between errors from multiple categories
   234  type CategoryError struct {
   235  	Category CategoryID
   236  	Err      error
   237  }
   238  
   239  // Error satisfies the error interface for CategoryError.
   240  func (e CategoryError) Error() string {
   241  	return e.Err.Error()
   242  }
   243  
   244  // IsCategoryError returns true if passed in error is of type CategoryError and belong to the given category
   245  func IsCategoryError(err error, categoryID CategoryID) bool {
   246  	var ce CategoryError
   247  	if errors.As(err, &ce) {
   248  		return ce.Category == categoryID
   249  	}
   250  	return false
   251  }
   252  
   253  // SkipError is returned by a check in case this check needs to be ignored.
   254  type SkipError struct {
   255  	Reason string
   256  }
   257  
   258  // Error satisfies the error interface for SkipError.
   259  func (e SkipError) Error() string {
   260  	return e.Reason
   261  }
   262  
   263  // VerboseSuccess implements the error interface but represents a success with
   264  // a message.
   265  type VerboseSuccess struct {
   266  	Message string
   267  }
   268  
   269  // Error satisfies the error interface for VerboseSuccess.  Since VerboseSuccess
   270  // does not actually represent a failure, this returns the empty string.
   271  func (e VerboseSuccess) Error() string {
   272  	return ""
   273  }
   274  
   275  // Checker is a smallest unit performing a single check
   276  type Checker struct {
   277  	// description is the short description that's printed to the command line
   278  	// when the check is executed
   279  	description string
   280  
   281  	// hintAnchor, when appended to `HintBaseURL`, provides a URL to more
   282  	// information about the check
   283  	hintAnchor string
   284  
   285  	// fatal indicates that all remaining checks should be aborted if this check
   286  	// fails; it should only be used if subsequent checks cannot possibly succeed
   287  	// (default false)
   288  	fatal bool
   289  
   290  	// warning indicates that if this check fails, it should be reported, but it
   291  	// should not impact the overall outcome of the health check (default false)
   292  	warning bool
   293  
   294  	// retryDeadline establishes a deadline before which this check should be
   295  	// retried; if the deadline has passed, the check fails (default: no retries)
   296  	retryDeadline time.Time
   297  
   298  	// surfaceErrorOnRetry indicates that the error message should be displayed
   299  	// even if the check will be retried.  This is useful if the error message
   300  	// contains the current status of the check.
   301  	surfaceErrorOnRetry bool
   302  
   303  	// check is the function that's called to execute the check; if the function
   304  	// returns an error, the check fails
   305  	check func(context.Context) error
   306  }
   307  
   308  // NewChecker returns a new instance of checker type
   309  func NewChecker(description string) *Checker {
   310  	return &Checker{
   311  		description:   description,
   312  		retryDeadline: time.Time{},
   313  	}
   314  }
   315  
   316  // WithHintAnchor returns a checker with the given hint anchor
   317  func (c *Checker) WithHintAnchor(hint string) *Checker {
   318  	c.hintAnchor = hint
   319  	return c
   320  }
   321  
   322  // Fatal returns a checker with the fatal field set
   323  func (c *Checker) Fatal() *Checker {
   324  	c.fatal = true
   325  	return c
   326  }
   327  
   328  // Warning returns a checker with the warning field set
   329  func (c *Checker) Warning() *Checker {
   330  	c.warning = true
   331  	return c
   332  }
   333  
   334  // WithRetryDeadline returns a checker with the provided retry timeout
   335  func (c *Checker) WithRetryDeadline(retryDeadLine time.Time) *Checker {
   336  	c.retryDeadline = retryDeadLine
   337  	return c
   338  }
   339  
   340  // SurfaceErrorOnRetry returns a checker with the surfaceErrorOnRetry set
   341  func (c *Checker) SurfaceErrorOnRetry() *Checker {
   342  	c.surfaceErrorOnRetry = true
   343  	return c
   344  }
   345  
   346  // WithCheck returns a checker with the provided check func
   347  func (c *Checker) WithCheck(check func(context.Context) error) *Checker {
   348  	c.check = check
   349  	return c
   350  }
   351  
// CheckResult encapsulates a check's identifying information and output.
// It names the category and description of the check, a hint URL, whether
// the result is a retry or a warning, and the error (if any).
// NOTE(review): how each field is populated is decided by the code that runs
// the checks, which is outside this part of the file.
// Note there exists an analogous user-facing type, `cmd.check`, for output via
// `linkerd check -o json`.
type CheckResult struct {
	Category    CategoryID
	Description string
	HintURL     string
	Retry       bool
	Warning     bool
	Err         error
}
   363  
// CheckObserver receives the results of each check. It is the callback type
// accepted by Runner.RunChecks.
type CheckObserver func(*CheckResult)
   366  
   367  // Category is a group of checkers, to check a particular component or use-case
   368  type Category struct {
   369  	ID       CategoryID
   370  	checkers []Checker
   371  	enabled  bool
   372  	// hintBaseURL provides a base URL with more information
   373  	// about the check
   374  	hintBaseURL string
   375  }
   376  
   377  // NewCategory returns an instance of Category with the specified data
   378  func NewCategory(id CategoryID, checkers []Checker, enabled bool) *Category {
   379  	return &Category{
   380  		ID:          id,
   381  		checkers:    checkers,
   382  		enabled:     enabled,
   383  		hintBaseURL: HintBaseURL(version.Version),
   384  	}
   385  }
   386  
   387  // WithHintBaseURL returns a Category with the provided hintBaseURL
   388  func (c *Category) WithHintBaseURL(hintBaseURL string) *Category {
   389  	c.hintBaseURL = hintBaseURL
   390  	return c
   391  }
   392  
// Options specifies configuration for a HealthChecker.
//
// Notes on fields whose use is visible in this file:
//   - KubeConfig, KubeContext, Impersonate and ImpersonateGroup are passed to
//     k8s.NewAPI when the Kubernetes client is initialized.
//   - ControlPlaneNamespace is the namespace the control plane checks query.
//   - RetryDeadline is copied into the checkers that support retries.
//   - CRDManifest is passed to CheckCustomResourceDefinitions.
//   - CNIEnabled is overwritten by InitializeLinkerdGlobalConfig with the value
//     found in the linkerd config, when one is available.
//
// The remaining fields are consumed outside this part of the file.
type Options struct {
	IsMainCheckCommand    bool
	ControlPlaneNamespace string
	CNINamespace          string
	DataPlaneNamespace    string
	KubeConfig            string
	KubeContext           string
	Impersonate           string
	ImpersonateGroup      []string
	APIAddr               string
	VersionOverride       string
	RetryDeadline         time.Time
	CNIEnabled            bool
	InstallManifest       string
	CRDManifest           string
	ChartValues           *l5dcharts.Values
}
   411  
// HealthChecker encapsulates all health check checkers, and clients required to
// perform those checks.
//
// Options is embedded by pointer, so option fields are reachable directly on
// the HealthChecker, and writes to them (for example CNIEnabled in
// InitializeLinkerdGlobalConfig) modify the Options value supplied by the
// caller.
type HealthChecker struct {
	categories []*Category
	*Options

	// these fields are set in the process of running checks
	kubeAPI          *k8s.KubernetesAPI
	kubeVersion      *k8sVersion.Info
	controlPlanePods []corev1.Pod
	LatestVersions   version.Channels
	serverVersion    string
	linkerdConfig    *l5dcharts.Values
	uuid             string
	issuerCert       *tls.Cred
	trustAnchors     []*x509.Certificate
	cniDaemonSet     *appsv1.DaemonSet
}
   430  
// Runner is implemented by any health-checkers that can be triggered with RunChecks()
//
// RunChecks reports each result to the observer and returns two booleans.
// NOTE(review): the meaning of the two return values is not defined in this
// part of the file (presumably overall success and absence of warnings) —
// confirm against the implementation.
type Runner interface {
	RunChecks(observer CheckObserver) (bool, bool)
}
   435  
   436  // NewHealthChecker returns an initialized HealthChecker
   437  func NewHealthChecker(categoryIDs []CategoryID, options *Options) *HealthChecker {
   438  	hc := &HealthChecker{
   439  		Options: options,
   440  	}
   441  
   442  	hc.categories = hc.allCategories()
   443  
   444  	checkMap := map[CategoryID]struct{}{}
   445  	for _, category := range categoryIDs {
   446  		checkMap[category] = struct{}{}
   447  	}
   448  	for i := range hc.categories {
   449  		if _, ok := checkMap[hc.categories[i].ID]; ok {
   450  			hc.categories[i].enabled = true
   451  		}
   452  	}
   453  
   454  	return hc
   455  }
   456  
   457  func NewWithCoreChecks(options *Options) *HealthChecker {
   458  	checks := []CategoryID{KubernetesAPIChecks, LinkerdControlPlaneExistenceChecks}
   459  	return NewHealthChecker(checks, options)
   460  }
   461  
   462  // InitializeKubeAPIClient creates a client for the HealthChecker. It avoids
   463  // having to require the KubernetesAPIChecks check to run in order for the
   464  // HealthChecker to run other checks.
   465  func (hc *HealthChecker) InitializeKubeAPIClient() error {
   466  	k8sAPI, err := k8s.NewAPI(hc.KubeConfig, hc.KubeContext, hc.Impersonate, hc.ImpersonateGroup, RequestTimeout)
   467  	if err != nil {
   468  		return err
   469  	}
   470  	hc.kubeAPI = k8sAPI
   471  
   472  	return nil
   473  }
   474  
   475  // InitializeLinkerdGlobalConfig populates the linkerd config object in the
   476  // healthchecker. It avoids having to require the LinkerdControlPlaneExistenceChecks
   477  // check to run before running other checks
   478  func (hc *HealthChecker) InitializeLinkerdGlobalConfig(ctx context.Context) error {
   479  	uuid, l5dConfig, err := hc.checkLinkerdConfigConfigMap(ctx)
   480  	if err != nil {
   481  		return err
   482  	}
   483  
   484  	if l5dConfig != nil {
   485  		hc.CNIEnabled = l5dConfig.CNIEnabled
   486  	}
   487  	hc.uuid = uuid
   488  	hc.linkerdConfig = l5dConfig
   489  
   490  	return nil
   491  }
   492  
// AppendCategories appends the provided categories after the existing ones
// and returns the same HealthChecker, so calls can be chained. The categories
// are stored as given; their enabled flags are not changed.
func (hc *HealthChecker) AppendCategories(categories ...*Category) *HealthChecker {
	hc.categories = append(hc.categories, categories...)
	return hc
}
   498  
// GetCategories returns all the categories, enabled or not. The returned
// slice is the HealthChecker's own slice of category pointers, not a copy.
func (hc *HealthChecker) GetCategories() []*Category {
	return hc.categories
}
   503  
   504  // allCategories is the global, ordered list of all checkers, grouped by
   505  // category. This method is attached to the HealthChecker struct because the
   506  // checkers directly reference other members of the struct, such as kubeAPI,
   507  // controlPlanePods, etc.
   508  //
   509  // Ordering is important because checks rely on specific `HealthChecker` members
   510  // getting populated by earlier checks, such as kubeAPI, controlPlanePods, etc.
   511  //
   512  // Note that all checks should include a `hintAnchor` with a corresponding section
   513  // in the linkerd check faq:
   514  // https://linkerd.io/{major-version}/checks/#
   515  func (hc *HealthChecker) allCategories() []*Category {
   516  	return []*Category{
   517  		NewCategory(
   518  			KubernetesAPIChecks,
   519  			[]Checker{
   520  				{
   521  					description: "can initialize the client",
   522  					hintAnchor:  "k8s-api",
   523  					fatal:       true,
   524  					check: func(context.Context) (err error) {
   525  						err = hc.InitializeKubeAPIClient()
   526  						return
   527  					},
   528  				},
   529  				{
   530  					description: "can query the Kubernetes API",
   531  					hintAnchor:  "k8s-api",
   532  					fatal:       true,
   533  					check: func(ctx context.Context) (err error) {
   534  						hc.kubeVersion, err = hc.kubeAPI.GetVersionInfo()
   535  						return
   536  					},
   537  				},
   538  			},
   539  			false,
   540  		),
   541  		NewCategory(
   542  			KubernetesVersionChecks,
   543  			[]Checker{
   544  				{
   545  					description: "is running the minimum Kubernetes API version",
   546  					hintAnchor:  "k8s-version",
   547  					check: func(context.Context) error {
   548  						return hc.kubeAPI.CheckVersion(hc.kubeVersion)
   549  					},
   550  				},
   551  			},
   552  			false,
   553  		),
   554  		NewCategory(
   555  			LinkerdPreInstallChecks,
   556  			[]Checker{
   557  				{
   558  					description: "control plane namespace does not already exist",
   559  					hintAnchor:  "pre-ns",
   560  					check: func(ctx context.Context) error {
   561  						return hc.CheckNamespace(ctx, hc.ControlPlaneNamespace, false)
   562  					},
   563  				},
   564  				{
   565  					description: "can create non-namespaced resources",
   566  					hintAnchor:  "pre-k8s-cluster-k8s",
   567  					check: func(ctx context.Context) error {
   568  						return hc.checkCanCreateNonNamespacedResources(ctx)
   569  					},
   570  				},
   571  				{
   572  					description: "can create ServiceAccounts",
   573  					hintAnchor:  "pre-k8s",
   574  					check: func(ctx context.Context) error {
   575  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "serviceaccounts")
   576  					},
   577  				},
   578  				{
   579  					description: "can create Services",
   580  					hintAnchor:  "pre-k8s",
   581  					check: func(ctx context.Context) error {
   582  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "services")
   583  					},
   584  				},
   585  				{
   586  					description: "can create Deployments",
   587  					hintAnchor:  "pre-k8s",
   588  					check: func(ctx context.Context) error {
   589  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "apps", "v1", "deployments")
   590  					},
   591  				},
   592  				{
   593  					description: "can create CronJobs",
   594  					hintAnchor:  "pre-k8s",
   595  					check: func(ctx context.Context) error {
   596  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "batch", "v1beta1", "cronjobs")
   597  					},
   598  				},
   599  				{
   600  					description: "can create ConfigMaps",
   601  					hintAnchor:  "pre-k8s",
   602  					check: func(ctx context.Context) error {
   603  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "configmaps")
   604  					},
   605  				},
   606  				{
   607  					description: "can create Secrets",
   608  					hintAnchor:  "pre-k8s",
   609  					check: func(ctx context.Context) error {
   610  						return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "secrets")
   611  					},
   612  				},
   613  				{
   614  					description: "can read Secrets",
   615  					hintAnchor:  "pre-k8s",
   616  					check: func(ctx context.Context) error {
   617  						return hc.checkCanGet(ctx, hc.ControlPlaneNamespace, "", "v1", "secrets")
   618  					},
   619  				},
   620  				{
   621  					description: "can read extension-apiserver-authentication configmap",
   622  					hintAnchor:  "pre-k8s",
   623  					check: func(ctx context.Context) error {
   624  						return hc.checkExtensionAPIServerAuthentication(ctx)
   625  					},
   626  				},
   627  				{
   628  					description: "no clock skew detected",
   629  					hintAnchor:  "pre-k8s-clock-skew",
   630  					warning:     true,
   631  					check: func(ctx context.Context) error {
   632  						return hc.checkClockSkew(ctx)
   633  					},
   634  				},
   635  			},
   636  			false,
   637  		),
   638  		NewCategory(
   639  			LinkerdCRDChecks,
   640  			[]Checker{
   641  				{
   642  					description:   "control plane CustomResourceDefinitions exist",
   643  					hintAnchor:    "l5d-existence-crd",
   644  					fatal:         true,
   645  					retryDeadline: hc.RetryDeadline,
   646  					check: func(ctx context.Context) error {
   647  						return CheckCustomResourceDefinitions(ctx, hc.kubeAPI, hc.CRDManifest)
   648  					},
   649  				},
   650  			},
   651  			false,
   652  		),
   653  		NewCategory(
   654  			LinkerdControlPlaneExistenceChecks,
   655  			[]Checker{
   656  				{
   657  					description: "'linkerd-config' config map exists",
   658  					hintAnchor:  "l5d-existence-linkerd-config",
   659  					fatal:       true,
   660  					check: func(ctx context.Context) (err error) {
   661  						err = hc.InitializeLinkerdGlobalConfig(ctx)
   662  						return
   663  					},
   664  				},
   665  				{
   666  					description: "heartbeat ServiceAccount exist",
   667  					hintAnchor:  "l5d-existence-sa",
   668  					fatal:       true,
   669  					check: func(ctx context.Context) error {
   670  						if hc.isHeartbeatDisabled() {
   671  							return nil
   672  						}
   673  						return hc.checkServiceAccounts(ctx, []string{"linkerd-heartbeat"}, hc.ControlPlaneNamespace, controlPlaneComponentsSelector())
   674  					},
   675  				},
   676  				{
   677  					description:   "control plane replica sets are ready",
   678  					hintAnchor:    "l5d-existence-replicasets",
   679  					retryDeadline: hc.RetryDeadline,
   680  					fatal:         true,
   681  					check: func(ctx context.Context) error {
   682  						controlPlaneReplicaSet, err := hc.kubeAPI.GetReplicaSets(ctx, hc.ControlPlaneNamespace)
   683  						if err != nil {
   684  							return err
   685  						}
   686  						return checkControlPlaneReplicaSets(controlPlaneReplicaSet)
   687  					},
   688  				},
   689  				{
   690  					description:         "no unschedulable pods",
   691  					hintAnchor:          "l5d-existence-unschedulable-pods",
   692  					retryDeadline:       hc.RetryDeadline,
   693  					surfaceErrorOnRetry: true,
   694  					warning:             true,
   695  					check: func(ctx context.Context) error {
   696  						// do not save this into hc.controlPlanePods, as this check may
   697  						// succeed prior to all expected control plane pods being up
   698  						controlPlanePods, err := hc.kubeAPI.GetPodsByNamespace(ctx, hc.ControlPlaneNamespace)
   699  						if err != nil {
   700  							return err
   701  						}
   702  						return checkUnschedulablePods(controlPlanePods)
   703  					},
   704  				},
   705  				{
   706  					description:         "control plane pods are ready",
   707  					hintAnchor:          "l5d-api-control-ready",
   708  					retryDeadline:       hc.RetryDeadline,
   709  					surfaceErrorOnRetry: true,
   710  					fatal:               true,
   711  					check: func(ctx context.Context) error {
   712  						var err error
   713  						podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{
   714  							LabelSelector: k8s.ControllerComponentLabel,
   715  						})
   716  						if err != nil {
   717  							return err
   718  						}
   719  						hc.controlPlanePods = podList.Items
   720  						return validateControlPlanePods(hc.controlPlanePods)
   721  					},
   722  				},
   723  				{
   724  					description: "cluster networks contains all node podCIDRs",
   725  					hintAnchor:  "l5d-cluster-networks-cidr",
   726  					check: func(ctx context.Context) error {
   727  						// We explicitly initialize the config here so that we dont rely on the "l5d-existence-linkerd-config"
   728  						// check to set the clusterNetworks value, since `linkerd check config` will skip that check.
   729  						err := hc.InitializeLinkerdGlobalConfig(ctx)
   730  						if err != nil {
   731  							return err
   732  						}
   733  						return hc.checkClusterNetworks(ctx)
   734  					},
   735  				},
   736  				{
   737  					description: "cluster networks contains all pods",
   738  					hintAnchor:  "l5d-cluster-networks-pods",
   739  					check: func(ctx context.Context) error {
   740  						return hc.checkClusterNetworksContainAllPods(ctx)
   741  					},
   742  				},
   743  				{
   744  					description: "cluster networks contains all services",
   745  					hintAnchor:  "l5d-cluster-networks-pods",
   746  					check: func(ctx context.Context) error {
   747  						return hc.checkClusterNetworksContainAllServices(ctx)
   748  					},
   749  				},
   750  			},
   751  			false,
   752  		),
   753  		NewCategory(
   754  			LinkerdConfigChecks,
   755  			[]Checker{
   756  				{
   757  					description: "control plane Namespace exists",
   758  					hintAnchor:  "l5d-existence-ns",
   759  					fatal:       true,
   760  					check: func(ctx context.Context) error {
   761  						return hc.CheckNamespace(ctx, hc.ControlPlaneNamespace, true)
   762  					},
   763  				},
   764  				{
   765  					description: "control plane ClusterRoles exist",
   766  					hintAnchor:  "l5d-existence-cr",
   767  					fatal:       true,
   768  					check: func(ctx context.Context) error {
   769  						return hc.checkClusterRoles(ctx, true, hc.expectedRBACNames(), controlPlaneComponentsSelector())
   770  					},
   771  				},
   772  				{
   773  					description: "control plane ClusterRoleBindings exist",
   774  					hintAnchor:  "l5d-existence-crb",
   775  					fatal:       true,
   776  					check: func(ctx context.Context) error {
   777  						return hc.checkClusterRoleBindings(ctx, true, hc.expectedRBACNames(), controlPlaneComponentsSelector())
   778  					},
   779  				},
   780  				{
   781  					description: "control plane ServiceAccounts exist",
   782  					hintAnchor:  "l5d-existence-sa",
   783  					fatal:       true,
   784  					check: func(ctx context.Context) error {
   785  						return hc.checkServiceAccounts(ctx, ExpectedServiceAccountNames, hc.ControlPlaneNamespace, controlPlaneComponentsSelector())
   786  					},
   787  				},
   788  				{
   789  					description: "control plane CustomResourceDefinitions exist",
   790  					hintAnchor:  "l5d-existence-crd",
   791  					fatal:       true,
   792  					check: func(ctx context.Context) error {
   793  						return CheckCustomResourceDefinitions(ctx, hc.kubeAPI, hc.CRDManifest)
   794  					},
   795  				},
   796  				{
   797  					description: "control plane MutatingWebhookConfigurations exist",
   798  					hintAnchor:  "l5d-existence-mwc",
   799  					fatal:       true,
   800  					check: func(ctx context.Context) error {
   801  						return hc.checkMutatingWebhookConfigurations(ctx, true)
   802  					},
   803  				},
   804  				{
   805  					description: "control plane ValidatingWebhookConfigurations exist",
   806  					hintAnchor:  "l5d-existence-vwc",
   807  					fatal:       true,
   808  					check: func(ctx context.Context) error {
   809  						return hc.checkValidatingWebhookConfigurations(ctx, true)
   810  					},
   811  				},
   812  				{
   813  					description: "proxy-init container runs as root user if docker container runtime is used",
   814  					hintAnchor:  "l5d-proxy-init-run-as-root",
   815  					fatal:       false,
   816  					check: func(ctx context.Context) error {
   817  						// We explicitly initialize the config here so that we dont rely on the "l5d-existence-linkerd-config"
   818  						// check to set the clusterNetworks value, since `linkerd check config` will skip that check.
   819  						err := hc.InitializeLinkerdGlobalConfig(ctx)
   820  						if err != nil {
   821  							if kerrors.IsNotFound(err) {
   822  								return SkipError{Reason: configMapDoesNotExistSkipReason}
   823  							}
   824  							return err
   825  						}
   826  						config := hc.LinkerdConfig()
   827  						runAsRoot := config != nil && config.ProxyInit != nil && config.ProxyInit.RunAsRoot
   828  						if !runAsRoot {
   829  							return CheckNodesHaveNonDockerRuntime(ctx, hc.KubeAPIClient())
   830  						}
   831  						return nil
   832  					},
   833  				},
   834  			},
   835  			false,
   836  		),
   837  		NewCategory(
   838  			LinkerdCNIPluginChecks,
   839  			[]Checker{
   840  				{
   841  					description: "cni plugin ConfigMap exists",
   842  					hintAnchor:  "cni-plugin-cm-exists",
   843  					fatal:       true,
   844  					check: func(ctx context.Context) error {
   845  						if !hc.CNIEnabled {
   846  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   847  						}
   848  						_, err := hc.kubeAPI.CoreV1().ConfigMaps(hc.CNINamespace).Get(ctx, linkerdCNIConfigMapName, metav1.GetOptions{})
   849  						return err
   850  					},
   851  				},
   852  				{
   853  					description: "cni plugin ClusterRole exists",
   854  					hintAnchor:  "cni-plugin-cr-exists",
   855  					fatal:       true,
   856  					check: func(ctx context.Context) error {
   857  						if !hc.CNIEnabled {
   858  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   859  						}
   860  						_, err := hc.kubeAPI.RbacV1().ClusterRoles().Get(ctx, linkerdCNIResourceName, metav1.GetOptions{})
   861  						if kerrors.IsNotFound(err) {
   862  							return fmt.Errorf("missing ClusterRole: %s", linkerdCNIResourceName)
   863  						}
   864  						return err
   865  					},
   866  				},
   867  				{
   868  					description: "cni plugin ClusterRoleBinding exists",
   869  					hintAnchor:  "cni-plugin-crb-exists",
   870  					fatal:       true,
   871  					check: func(ctx context.Context) error {
   872  						if !hc.CNIEnabled {
   873  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   874  						}
   875  						_, err := hc.kubeAPI.RbacV1().ClusterRoleBindings().Get(ctx, linkerdCNIResourceName, metav1.GetOptions{})
   876  						if kerrors.IsNotFound(err) {
   877  							return fmt.Errorf("missing ClusterRoleBinding: %s", linkerdCNIResourceName)
   878  						}
   879  						return err
   880  					},
   881  				},
   882  				{
   883  					description: "cni plugin ServiceAccount exists",
   884  					hintAnchor:  "cni-plugin-sa-exists",
   885  					fatal:       true,
   886  					check: func(ctx context.Context) error {
   887  						if !hc.CNIEnabled {
   888  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   889  						}
   890  						_, err := hc.kubeAPI.CoreV1().ServiceAccounts(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{})
   891  						if kerrors.IsNotFound(err) {
   892  							return fmt.Errorf("missing ServiceAccount: %s", linkerdCNIResourceName)
   893  						}
   894  						return err
   895  					},
   896  				},
   897  				{
   898  					description: "cni plugin DaemonSet exists",
   899  					hintAnchor:  "cni-plugin-ds-exists",
   900  					fatal:       true,
   901  					check: func(ctx context.Context) (err error) {
   902  						if !hc.CNIEnabled {
   903  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   904  						}
   905  						hc.cniDaemonSet, err = hc.kubeAPI.Interface.AppsV1().DaemonSets(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{})
   906  						if kerrors.IsNotFound(err) {
   907  							return fmt.Errorf("missing DaemonSet: %s", linkerdCNIResourceName)
   908  						}
   909  						return err
   910  					},
   911  				},
   912  				{
   913  					description:         "cni plugin pod is running on all nodes",
   914  					hintAnchor:          "cni-plugin-ready",
   915  					retryDeadline:       hc.RetryDeadline,
   916  					surfaceErrorOnRetry: true,
   917  					fatal:               true,
   918  					check: func(ctx context.Context) (err error) {
   919  						if !hc.CNIEnabled {
   920  							return SkipError{Reason: linkerdCNIDisabledSkipReason}
   921  						}
   922  						hc.cniDaemonSet, err = hc.kubeAPI.Interface.AppsV1().DaemonSets(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{})
   923  						if kerrors.IsNotFound(err) {
   924  							return fmt.Errorf("missing DaemonSet: %s", linkerdCNIResourceName)
   925  						}
   926  						scheduled := hc.cniDaemonSet.Status.DesiredNumberScheduled
   927  						ready := hc.cniDaemonSet.Status.NumberReady
   928  						if scheduled != ready {
   929  							return fmt.Errorf("number ready: %d, number scheduled: %d", ready, scheduled)
   930  						}
   931  						return nil
   932  					},
   933  				},
   934  			},
   935  			false,
   936  		),
   937  		NewCategory(
   938  			LinkerdIdentity,
   939  			[]Checker{
   940  				{
   941  					description: "certificate config is valid",
   942  					hintAnchor:  "l5d-identity-cert-config-valid",
   943  					fatal:       true,
   944  					check: func(ctx context.Context) (err error) {
   945  						hc.issuerCert, hc.trustAnchors, err = hc.checkCertificatesConfig(ctx)
   946  						return
   947  					},
   948  				},
   949  				{
   950  					description: "trust anchors are using supported crypto algorithm",
   951  					hintAnchor:  "l5d-identity-trustAnchors-use-supported-crypto",
   952  					fatal:       true,
   953  					check: func(context.Context) error {
   954  						var invalidAnchors []string
   955  						for _, anchor := range hc.trustAnchors {
   956  							if err := issuercerts.CheckTrustAnchorAlgoRequirements(anchor); err != nil {
   957  								invalidAnchors = append(invalidAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
   958  							}
   959  						}
   960  						if len(invalidAnchors) > 0 {
   961  							return fmt.Errorf("Invalid trustAnchors:\n\t%s", strings.Join(invalidAnchors, "\n\t"))
   962  						}
   963  						return nil
   964  					},
   965  				},
   966  				{
   967  					description: "trust anchors are within their validity period",
   968  					hintAnchor:  "l5d-identity-trustAnchors-are-time-valid",
   969  					fatal:       true,
   970  					check: func(ctx context.Context) error {
   971  						var expiredAnchors []string
   972  						for _, anchor := range hc.trustAnchors {
   973  							if err := issuercerts.CheckCertValidityPeriod(anchor); err != nil {
   974  								expiredAnchors = append(expiredAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
   975  							}
   976  						}
   977  						if len(expiredAnchors) > 0 {
   978  							return fmt.Errorf("Invalid anchors:\n\t%s", strings.Join(expiredAnchors, "\n\t"))
   979  						}
   980  
   981  						return nil
   982  					},
   983  				},
   984  				{
   985  					description: "trust anchors are valid for at least 60 days",
   986  					hintAnchor:  "l5d-identity-trustAnchors-not-expiring-soon",
   987  					warning:     true,
   988  					check: func(ctx context.Context) error {
   989  						var expiringAnchors []string
   990  						for _, anchor := range hc.trustAnchors {
   991  							if err := issuercerts.CheckExpiringSoon(anchor); err != nil {
   992  								expiringAnchors = append(expiringAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
   993  							}
   994  						}
   995  						if len(expiringAnchors) > 0 {
   996  							return fmt.Errorf("Anchors expiring soon:\n\t%s", strings.Join(expiringAnchors, "\n\t"))
   997  						}
   998  						return nil
   999  					},
  1000  				},
  1001  				{
  1002  					description: "issuer cert is using supported crypto algorithm",
  1003  					hintAnchor:  "l5d-identity-issuer-cert-uses-supported-crypto",
  1004  					fatal:       true,
  1005  					check: func(context.Context) error {
  1006  						if err := issuercerts.CheckIssuerCertAlgoRequirements(hc.issuerCert.Certificate); err != nil {
  1007  							return fmt.Errorf("issuer certificate %w", err)
  1008  						}
  1009  						return nil
  1010  					},
  1011  				},
  1012  				{
  1013  					description: "issuer cert is within its validity period",
  1014  					hintAnchor:  "l5d-identity-issuer-cert-is-time-valid",
  1015  					fatal:       true,
  1016  					check: func(ctx context.Context) error {
  1017  						if err := issuercerts.CheckCertValidityPeriod(hc.issuerCert.Certificate); err != nil {
  1018  							return fmt.Errorf("issuer certificate is %w", err)
  1019  						}
  1020  						return nil
  1021  					},
  1022  				},
  1023  				{
  1024  					description: "issuer cert is valid for at least 60 days",
  1025  					warning:     true,
  1026  					hintAnchor:  "l5d-identity-issuer-cert-not-expiring-soon",
  1027  					check: func(context.Context) error {
  1028  						if err := issuercerts.CheckExpiringSoon(hc.issuerCert.Certificate); err != nil {
  1029  							return fmt.Errorf("issuer certificate %w", err)
  1030  						}
  1031  						return nil
  1032  					},
  1033  				},
  1034  				{
  1035  					description: "issuer cert is issued by the trust anchor",
  1036  					hintAnchor:  "l5d-identity-issuer-cert-issued-by-trust-anchor",
  1037  					check: func(ctx context.Context) error {
  1038  						return hc.issuerCert.Verify(tls.CertificatesToPool(hc.trustAnchors), "", time.Time{})
  1039  					},
  1040  				},
  1041  			},
  1042  			false,
  1043  		),
  1044  		NewCategory(
  1045  			LinkerdWebhooksAndAPISvcTLS,
  1046  			[]Checker{
  1047  				{
  1048  					description: "proxy-injector webhook has valid cert",
  1049  					hintAnchor:  "l5d-proxy-injector-webhook-cert-valid",
  1050  					fatal:       true,
  1051  					check: func(ctx context.Context) (err error) {
  1052  						anchors, err := hc.fetchProxyInjectorCaBundle(ctx)
  1053  						if err != nil {
  1054  							return err
  1055  						}
  1056  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorTLSSecretName)
  1057  						if kerrors.IsNotFound(err) {
  1058  							cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorOldTLSSecretName)
  1059  						}
  1060  						if err != nil {
  1061  							return err
  1062  						}
  1063  
  1064  						identityName := fmt.Sprintf("linkerd-proxy-injector.%s.svc", hc.ControlPlaneNamespace)
  1065  						return hc.CheckCertAndAnchors(cert, anchors, identityName)
  1066  					},
  1067  				},
  1068  				{
  1069  					description: "proxy-injector cert is valid for at least 60 days",
  1070  					warning:     true,
  1071  					hintAnchor:  "l5d-proxy-injector-webhook-cert-not-expiring-soon",
  1072  					check: func(ctx context.Context) error {
  1073  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorTLSSecretName)
  1074  						if kerrors.IsNotFound(err) {
  1075  							cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorOldTLSSecretName)
  1076  						}
  1077  						if err != nil {
  1078  							return err
  1079  						}
  1080  						return hc.CheckCertAndAnchorsExpiringSoon(cert)
  1081  
  1082  					},
  1083  				},
  1084  				{
  1085  					description: "sp-validator webhook has valid cert",
  1086  					hintAnchor:  "l5d-sp-validator-webhook-cert-valid",
  1087  					fatal:       true,
  1088  					check: func(ctx context.Context) (err error) {
  1089  						anchors, err := hc.fetchWebhookCaBundle(ctx, k8s.SPValidatorWebhookConfigName)
  1090  						if err != nil {
  1091  							return err
  1092  						}
  1093  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, spValidatorTLSSecretName)
  1094  						if kerrors.IsNotFound(err) {
  1095  							cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, spValidatorOldTLSSecretName)
  1096  						}
  1097  						if err != nil {
  1098  							return err
  1099  						}
  1100  						identityName := fmt.Sprintf("linkerd-sp-validator.%s.svc", hc.ControlPlaneNamespace)
  1101  						return hc.CheckCertAndAnchors(cert, anchors, identityName)
  1102  					},
  1103  				},
  1104  				{
  1105  					description: "sp-validator cert is valid for at least 60 days",
  1106  					warning:     true,
  1107  					hintAnchor:  "l5d-sp-validator-webhook-cert-not-expiring-soon",
  1108  					check: func(ctx context.Context) error {
  1109  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, spValidatorTLSSecretName)
  1110  						if kerrors.IsNotFound(err) {
  1111  							cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, spValidatorOldTLSSecretName)
  1112  						}
  1113  						if err != nil {
  1114  							return err
  1115  						}
  1116  						return hc.CheckCertAndAnchorsExpiringSoon(cert)
  1117  
  1118  					},
  1119  				},
  1120  				{
  1121  					description: "policy-validator webhook has valid cert",
  1122  					hintAnchor:  "l5d-policy-validator-webhook-cert-valid",
  1123  					fatal:       true,
  1124  					check: func(ctx context.Context) (err error) {
  1125  						anchors, err := hc.fetchWebhookCaBundle(ctx, k8s.PolicyValidatorWebhookConfigName)
  1126  						if kerrors.IsNotFound(err) {
  1127  							return SkipError{Reason: "policy-validator not installed"}
  1128  						}
  1129  						if err != nil {
  1130  							return err
  1131  						}
  1132  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, policyValidatorTLSSecretName)
  1133  						if kerrors.IsNotFound(err) {
  1134  							return SkipError{Reason: "policy-validator not installed"}
  1135  						}
  1136  						if err != nil {
  1137  							return err
  1138  						}
  1139  						identityName := fmt.Sprintf("linkerd-policy-validator.%s.svc", hc.ControlPlaneNamespace)
  1140  						return hc.CheckCertAndAnchors(cert, anchors, identityName)
  1141  					},
  1142  				},
  1143  				{
  1144  					description: "policy-validator cert is valid for at least 60 days",
  1145  					warning:     true,
  1146  					hintAnchor:  "l5d-policy-validator-webhook-cert-not-expiring-soon",
  1147  					check: func(ctx context.Context) error {
  1148  						cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, policyValidatorTLSSecretName)
  1149  						if kerrors.IsNotFound(err) {
  1150  							return SkipError{Reason: "policy-validator not installed"}
  1151  						}
  1152  						if err != nil {
  1153  							return err
  1154  						}
  1155  						return hc.CheckCertAndAnchorsExpiringSoon(cert)
  1156  
  1157  					},
  1158  				},
  1159  			},
  1160  			false,
  1161  		),
  1162  		NewCategory(
  1163  			LinkerdIdentityDataPlane,
  1164  			[]Checker{
  1165  				{
  1166  					description: "data plane proxies certificate match CA",
  1167  					hintAnchor:  "l5d-identity-data-plane-proxies-certs-match-ca",
  1168  					warning:     true,
  1169  					check: func(ctx context.Context) error {
  1170  						return hc.checkDataPlaneProxiesCertificate(ctx)
  1171  					},
  1172  				},
  1173  			},
  1174  			false,
  1175  		),
  1176  		NewCategory(
  1177  			LinkerdVersionChecks,
  1178  			[]Checker{
  1179  				{
  1180  					description: "can determine the latest version",
  1181  					hintAnchor:  "l5d-version-latest",
  1182  					warning:     true,
  1183  					check: func(ctx context.Context) (err error) {
  1184  						if hc.VersionOverride != "" {
  1185  							hc.LatestVersions, err = version.NewChannels(hc.VersionOverride)
  1186  						} else {
  1187  							uuid := "unknown"
  1188  							if hc.uuid != "" {
  1189  								uuid = hc.uuid
  1190  							}
  1191  							hc.LatestVersions, err = version.GetLatestVersions(ctx, uuid, "cli")
  1192  						}
  1193  						return
  1194  					},
  1195  				},
  1196  				{
  1197  					description: "cli is up-to-date",
  1198  					hintAnchor:  "l5d-version-cli",
  1199  					warning:     true,
  1200  					check: func(context.Context) error {
  1201  						return hc.LatestVersions.Match(version.Version)
  1202  					},
  1203  				},
  1204  			},
  1205  			false,
  1206  		),
  1207  		NewCategory(
  1208  			LinkerdControlPlaneVersionChecks,
  1209  			[]Checker{
  1210  				{
  1211  					description:   "can retrieve the control plane version",
  1212  					hintAnchor:    "l5d-version-control",
  1213  					retryDeadline: hc.RetryDeadline,
  1214  					fatal:         true,
  1215  					check: func(ctx context.Context) (err error) {
  1216  						hc.serverVersion, err = GetServerVersion(ctx, hc.ControlPlaneNamespace, hc.kubeAPI)
  1217  						return
  1218  					},
  1219  				},
  1220  				{
  1221  					description: "control plane is up-to-date",
  1222  					hintAnchor:  "l5d-version-control",
  1223  					warning:     true,
  1224  					check: func(context.Context) error {
  1225  						return hc.LatestVersions.Match(hc.serverVersion)
  1226  					},
  1227  				},
  1228  				{
  1229  					description: "control plane and cli versions match",
  1230  					hintAnchor:  "l5d-version-control",
  1231  					warning:     true,
  1232  					check: func(context.Context) error {
  1233  						if hc.serverVersion != version.Version {
  1234  							return fmt.Errorf("control plane running %s but cli running %s", hc.serverVersion, version.Version)
  1235  						}
  1236  						return nil
  1237  					},
  1238  				},
  1239  			},
  1240  			false,
  1241  		),
  1242  		NewCategory(
  1243  			LinkerdControlPlaneProxyChecks,
  1244  			[]Checker{
  1245  				{
  1246  					description:         "control plane proxies are healthy",
  1247  					hintAnchor:          "l5d-cp-proxy-healthy",
  1248  					retryDeadline:       hc.RetryDeadline,
  1249  					surfaceErrorOnRetry: true,
  1250  					fatal:               true,
  1251  					check: func(ctx context.Context) error {
  1252  						return hc.CheckProxyHealth(ctx, hc.ControlPlaneNamespace, hc.ControlPlaneNamespace)
  1253  					},
  1254  				},
  1255  				{
  1256  					description: "control plane proxies are up-to-date",
  1257  					hintAnchor:  "l5d-cp-proxy-version",
  1258  					warning:     true,
  1259  					check: func(ctx context.Context) error {
  1260  						podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel})
  1261  						if err != nil {
  1262  							return err
  1263  						}
  1264  
  1265  						return hc.CheckProxyVersionsUpToDate(podList.Items)
  1266  					},
  1267  				},
  1268  				{
  1269  					description: "control plane proxies and cli versions match",
  1270  					hintAnchor:  "l5d-cp-proxy-cli-version",
  1271  					warning:     true,
  1272  					check: func(ctx context.Context) error {
  1273  						podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel})
  1274  						if err != nil {
  1275  							return err
  1276  						}
  1277  
  1278  						return CheckIfProxyVersionsMatchWithCLI(podList.Items)
  1279  					},
  1280  				},
  1281  			},
  1282  			false,
  1283  		),
  1284  		NewCategory(
  1285  			LinkerdDataPlaneChecks,
  1286  			[]Checker{
  1287  				{
  1288  					description: "data plane namespace exists",
  1289  					hintAnchor:  "l5d-data-plane-exists",
  1290  					fatal:       true,
  1291  					check: func(ctx context.Context) error {
  1292  						if hc.DataPlaneNamespace == "" {
  1293  							// when checking proxies in all namespaces, this check is a no-op
  1294  							return nil
  1295  						}
  1296  						return hc.CheckNamespace(ctx, hc.DataPlaneNamespace, true)
  1297  					},
  1298  				},
  1299  				{
  1300  					description:   "data plane proxies are ready",
  1301  					hintAnchor:    "l5d-data-plane-ready",
  1302  					retryDeadline: hc.RetryDeadline,
  1303  					fatal:         true,
  1304  					check: func(ctx context.Context) error {
  1305  						pods, err := hc.GetDataPlanePods(ctx)
  1306  						if err != nil {
  1307  							return err
  1308  						}
  1309  						return CheckPodsRunning(pods, hc.DataPlaneNamespace)
  1310  					},
  1311  				},
  1312  				{
  1313  					description: "data plane is up-to-date",
  1314  					hintAnchor:  "l5d-data-plane-version",
  1315  					warning:     true,
  1316  					check: func(ctx context.Context) error {
  1317  						pods, err := hc.GetDataPlanePods(ctx)
  1318  						if err != nil {
  1319  							return err
  1320  						}
  1321  
  1322  						return hc.CheckProxyVersionsUpToDate(pods)
  1323  					},
  1324  				},
  1325  				{
  1326  					description: "data plane and cli versions match",
  1327  					hintAnchor:  "l5d-data-plane-cli-version",
  1328  					warning:     true,
  1329  					check: func(ctx context.Context) error {
  1330  						pods, err := hc.GetDataPlanePods(ctx)
  1331  						if err != nil {
  1332  							return err
  1333  						}
  1334  
  1335  						return CheckIfProxyVersionsMatchWithCLI(pods)
  1336  					},
  1337  				},
  1338  				{
  1339  					description: "data plane pod labels are configured correctly",
  1340  					hintAnchor:  "l5d-data-plane-pod-labels",
  1341  					warning:     true,
  1342  					check: func(ctx context.Context) error {
  1343  						pods, err := hc.GetDataPlanePods(ctx)
  1344  						if err != nil {
  1345  							return err
  1346  						}
  1347  
  1348  						return checkMisconfiguredPodsLabels(pods)
  1349  					},
  1350  				},
  1351  				{
  1352  					description: "data plane service labels are configured correctly",
  1353  					hintAnchor:  "l5d-data-plane-services-labels",
  1354  					warning:     true,
  1355  					check: func(ctx context.Context) error {
  1356  						services, err := hc.GetServices(ctx)
  1357  						if err != nil {
  1358  							return err
  1359  						}
  1360  
  1361  						return checkMisconfiguredServiceLabels(services)
  1362  					},
  1363  				},
  1364  				{
  1365  					description: "data plane service annotations are configured correctly",
  1366  					hintAnchor:  "l5d-data-plane-services-annotations",
  1367  					warning:     true,
  1368  					check: func(ctx context.Context) error {
  1369  						services, err := hc.GetServices(ctx)
  1370  						if err != nil {
  1371  							return err
  1372  						}
  1373  
  1374  						return checkMisconfiguredServiceAnnotations(services)
  1375  					},
  1376  				},
  1377  				{
  1378  					description: "opaque ports are properly annotated",
  1379  					hintAnchor:  "linkerd-opaque-ports-definition",
  1380  					warning:     true,
  1381  					check: func(ctx context.Context) error {
  1382  						return hc.checkMisconfiguredOpaquePortAnnotations(ctx)
  1383  					},
  1384  				},
  1385  			},
  1386  			false,
  1387  		),
  1388  		NewCategory(
  1389  			LinkerdHAChecks,
  1390  			[]Checker{
  1391  				{
  1392  					description:   "multiple replicas of control plane pods",
  1393  					hintAnchor:    "l5d-control-plane-replicas",
  1394  					retryDeadline: hc.RetryDeadline,
  1395  					warning:       true,
  1396  					check: func(ctx context.Context) error {
  1397  						if hc.isHA() {
  1398  							return hc.checkMinReplicasAvailable(ctx)
  1399  						}
  1400  						return SkipError{Reason: "not run for non HA installs"}
  1401  					},
  1402  				},
  1403  			},
  1404  			false,
  1405  		),
  1406  		NewCategory(
  1407  			LinkerdExtensionChecks,
  1408  			[]Checker{
  1409  				{
  1410  					description: "namespace configuration for extensions",
  1411  					warning:     true,
  1412  					hintAnchor:  "l5d-extension-namespaces",
  1413  					check: func(ctx context.Context) error {
  1414  						return hc.checkExtensionNsLabels(ctx)
  1415  					},
  1416  				},
  1417  			},
  1418  			false,
  1419  		),
  1420  	}
  1421  }
  1422  
  1423  // CheckProxyVersionsUpToDate checks if all the proxies are on the latest
  1424  // installed version
  1425  func (hc *HealthChecker) CheckProxyVersionsUpToDate(pods []corev1.Pod) error {
  1426  	return CheckProxyVersionsUpToDate(pods, hc.LatestVersions)
  1427  }
  1428  
  1429  // CheckProxyVersionsUpToDate checks if all the proxies are on the latest
  1430  // installed version
  1431  func CheckProxyVersionsUpToDate(pods []corev1.Pod, versions version.Channels) error {
  1432  	outdatedPods := []string{}
  1433  	for _, pod := range pods {
  1434  		status := k8s.GetPodStatus(pod)
  1435  		if status == string(corev1.PodRunning) {
  1436  			proxyVersion := k8s.GetProxyVersion(pod)
  1437  			if proxyVersion == "" {
  1438  				continue
  1439  			}
  1440  			if err := versions.Match(proxyVersion); err != nil {
  1441  				outdatedPods = append(outdatedPods, fmt.Sprintf("\t* %s (%s)", pod.Name, proxyVersion))
  1442  			}
  1443  		}
  1444  	}
  1445  	if versions.Empty() {
  1446  		return errors.New("unable to determine version channel")
  1447  	}
  1448  	if len(outdatedPods) > 0 {
  1449  		podList := strings.Join(outdatedPods, "\n")
  1450  		return fmt.Errorf("some proxies are not running the current version:\n%s", podList)
  1451  	}
  1452  	return nil
  1453  }
  1454  
  1455  // CheckIfProxyVersionsMatchWithCLI checks if the latest proxy version
  1456  // matches that of the CLI
  1457  func CheckIfProxyVersionsMatchWithCLI(pods []corev1.Pod) error {
  1458  	for _, pod := range pods {
  1459  		status := k8s.GetPodStatus(pod)
  1460  		proxyVersion := k8s.GetProxyVersion(pod)
  1461  		if status == string(corev1.PodRunning) && proxyVersion != "" && proxyVersion != version.Version {
  1462  			return fmt.Errorf("%s running %s but cli running %s", pod.Name, proxyVersion, version.Version)
  1463  		}
  1464  	}
  1465  	return nil
  1466  }
  1467  
  1468  // CheckCertAndAnchors checks if the given cert and anchors are valid
  1469  func (hc *HealthChecker) CheckCertAndAnchors(cert *tls.Cred, trustAnchors []*x509.Certificate, identityName string) error {
  1470  
  1471  	// check anchors time validity
  1472  	var expiredAnchors []string
  1473  	for _, anchor := range trustAnchors {
  1474  		if err := issuercerts.CheckCertValidityPeriod(anchor); err != nil {
  1475  			expiredAnchors = append(expiredAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
  1476  		}
  1477  	}
  1478  	if len(expiredAnchors) > 0 {
  1479  		return fmt.Errorf("anchors not within their validity period:\n\t%s", strings.Join(expiredAnchors, "\n\t"))
  1480  	}
  1481  
  1482  	// check cert validity
  1483  	if err := issuercerts.CheckCertValidityPeriod(cert.Certificate); err != nil {
  1484  		return fmt.Errorf("certificate is %w", err)
  1485  	}
  1486  
  1487  	if err := cert.Verify(tls.CertificatesToPool(trustAnchors), identityName, time.Time{}); err != nil {
  1488  		return fmt.Errorf("cert is not issued by the trust anchor: %w", err)
  1489  	}
  1490  
  1491  	return nil
  1492  }
  1493  
  1494  // CheckProxyHealth checks for the data-plane proxies health in the given namespace
  1495  // These checks consist of status and identity
  1496  func (hc *HealthChecker) CheckProxyHealth(ctx context.Context, controlPlaneNamespace, namespace string) error {
  1497  	podList, err := hc.kubeAPI.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel})
  1498  	if err != nil {
  1499  		return err
  1500  	}
  1501  
  1502  	// Validate the status of the pods
  1503  	err = CheckPodsRunning(podList.Items, controlPlaneNamespace)
  1504  	if err != nil {
  1505  		return err
  1506  	}
  1507  
  1508  	// Check proxy certificates
  1509  	return checkPodsProxiesCertificate(ctx, *hc.kubeAPI, namespace, controlPlaneNamespace)
  1510  }
  1511  
  1512  // CheckCertAndAnchorsExpiringSoon checks if the given cert and anchors expire soon, and returns an
  1513  // error if they do.
  1514  func (hc *HealthChecker) CheckCertAndAnchorsExpiringSoon(cert *tls.Cred) error {
  1515  	// check anchors not expiring soon
  1516  	var expiringAnchors []string
  1517  	for _, anchor := range cert.TrustChain {
  1518  		anchor := anchor
  1519  		if err := issuercerts.CheckExpiringSoon(anchor); err != nil {
  1520  			expiringAnchors = append(expiringAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
  1521  		}
  1522  	}
  1523  	if len(expiringAnchors) > 0 {
  1524  		return fmt.Errorf("Anchors expiring soon:\n\t%s", strings.Join(expiringAnchors, "\n\t"))
  1525  	}
  1526  
  1527  	// check cert not expiring soon
  1528  	if err := issuercerts.CheckExpiringSoon(cert.Certificate); err != nil {
  1529  		return fmt.Errorf("certificate %w", err)
  1530  	}
  1531  	return nil
  1532  }
  1533  
  1534  // CheckAPIService checks the status of the given API Service and returns an error if it's not running
  1535  func (hc *HealthChecker) CheckAPIService(ctx context.Context, serviceName string) error {
  1536  	apiServiceClient, err := apiregistrationv1client.NewForConfig(hc.kubeAPI.Config)
  1537  	if err != nil {
  1538  		return err
  1539  	}
  1540  
  1541  	apiStatus, err := apiServiceClient.APIServices().Get(ctx, serviceName, metav1.GetOptions{})
  1542  	if err != nil {
  1543  		return err
  1544  	}
  1545  
  1546  	for _, condition := range apiStatus.Status.Conditions {
  1547  		if condition.Type == "Available" {
  1548  			if condition.Status == "True" {
  1549  				return nil
  1550  			}
  1551  			return fmt.Errorf("%s: %s", condition.Reason, condition.Message)
  1552  		}
  1553  	}
  1554  
  1555  	return fmt.Errorf("%s service not available", apiStatus.Name)
  1556  }
  1557  
  1558  func (hc *HealthChecker) checkMinReplicasAvailable(ctx context.Context) error {
  1559  	faulty := []string{}
  1560  
  1561  	for _, component := range linkerdHAControlPlaneComponents {
  1562  		conf, err := hc.kubeAPI.AppsV1().Deployments(hc.ControlPlaneNamespace).Get(ctx, component, metav1.GetOptions{})
  1563  		if err != nil {
  1564  			return err
  1565  		}
  1566  
  1567  		if conf.Status.AvailableReplicas <= 1 {
  1568  			faulty = append(faulty, component)
  1569  		}
  1570  	}
  1571  
  1572  	if len(faulty) > 0 {
  1573  		return fmt.Errorf("not enough replicas available for %v", faulty)
  1574  	}
  1575  	return nil
  1576  }
  1577  
  1578  // RunChecks runs all configured checkers, and passes the results of each
  1579  // check to the observer. If a check fails and is marked as fatal, then all
  1580  // remaining checks are skipped. If at least one check fails, RunChecks returns
  1581  // false; if all checks passed, RunChecks returns true.  Checks which are
  1582  // designated as warnings will not cause RunCheck to return false, however.
  1583  func (hc *HealthChecker) RunChecks(observer CheckObserver) (bool, bool) {
  1584  	success := true
  1585  	warning := false
  1586  	for _, c := range hc.categories {
  1587  		if c.enabled {
  1588  			for _, checker := range c.checkers {
  1589  				checker := checker // pin
  1590  				if checker.check != nil {
  1591  					if !hc.runCheck(c, &checker, observer) {
  1592  						if !checker.warning {
  1593  							success = false
  1594  						} else {
  1595  							warning = true
  1596  						}
  1597  						if checker.fatal {
  1598  							return success, warning
  1599  						}
  1600  					}
  1601  				}
  1602  			}
  1603  		}
  1604  	}
  1605  
  1606  	return success, warning
  1607  }
  1608  
  1609  func (hc *HealthChecker) RunWithExitOnError() (bool, bool) {
  1610  	return hc.RunChecks(func(result *CheckResult) {
  1611  		if result.Retry {
  1612  			fmt.Fprintln(os.Stderr, "Waiting for control plane to become available")
  1613  			return
  1614  		}
  1615  
  1616  		if result.Err != nil && !result.Warning {
  1617  			var msg string
  1618  			switch result.Category {
  1619  			case KubernetesAPIChecks:
  1620  				msg = "Cannot connect to Kubernetes"
  1621  			case LinkerdControlPlaneExistenceChecks:
  1622  				msg = "Cannot find Linkerd"
  1623  			}
  1624  			fmt.Fprintf(os.Stderr, "%s: %s\nValidate the install with: 'linkerd check'\n",
  1625  				msg, result.Err)
  1626  			os.Exit(1)
  1627  		}
  1628  	})
  1629  }
  1630  
// LinkerdConfig returns the Linkerd chart values held in the HealthChecker's
// linkerdConfig field. The stored pointer is returned as-is (no copy).
// NOTE(review): presumably nil until a check has populated the field — confirm
// against the code that assigns hc.linkerdConfig.
func (hc *HealthChecker) LinkerdConfig() *l5dcharts.Values {
	return hc.linkerdConfig
}
  1635  
  1636  func (hc *HealthChecker) runCheck(category *Category, c *Checker, observer CheckObserver) bool {
  1637  	for {
  1638  		ctx, cancel := context.WithTimeout(context.Background(), RequestTimeout)
  1639  		err := c.check(ctx)
  1640  		cancel()
  1641  		var se SkipError
  1642  		if errors.As(err, &se) {
  1643  			log.Debugf("Skipping check: %s. Reason: %s", c.description, se.Reason)
  1644  			return true
  1645  		}
  1646  
  1647  		checkResult := &CheckResult{
  1648  			Category:    category.ID,
  1649  			Description: c.description,
  1650  			Warning:     c.warning,
  1651  			HintURL:     fmt.Sprintf("%s%s", category.hintBaseURL, c.hintAnchor),
  1652  		}
  1653  		var vs VerboseSuccess
  1654  		if errors.As(err, &vs) {
  1655  			checkResult.Description = fmt.Sprintf("%s\n%s", checkResult.Description, vs.Message)
  1656  		} else if err != nil {
  1657  			checkResult.Err = CategoryError{category.ID, err}
  1658  		}
  1659  
  1660  		if checkResult.Err != nil && time.Now().Before(c.retryDeadline) {
  1661  			checkResult.Retry = true
  1662  			if !c.surfaceErrorOnRetry {
  1663  				checkResult.Err = errors.New("waiting for check to complete")
  1664  			}
  1665  			log.Debugf("Retrying on error: %s", err)
  1666  
  1667  			observer(checkResult)
  1668  			time.Sleep(retryWindow)
  1669  			continue
  1670  		}
  1671  
  1672  		observer(checkResult)
  1673  		return checkResult.Err == nil
  1674  	}
  1675  }
  1676  
// controlPlaneComponentsSelector builds the label selector string
// "<k8s.ControllerNSLabel>,!<LinkerdCNIResourceLabel>". Callers in this file
// pass it as ListOptions.LabelSelector; in Kubernetes selector syntax a bare
// key requires the label to be present and "!key" requires it to be absent.
func controlPlaneComponentsSelector() string {
	return fmt.Sprintf("%s,!%s", k8s.ControllerNSLabel, LinkerdCNIResourceLabel)
}
  1680  
// KubeAPIClient returns the Kubernetes API client stored on the HealthChecker
// (its kubeAPI field). Per the original note, this client is only configured
// if the KubernetesAPIChecks are configured and run first; the pointer is
// returned without any check here, so callers should be prepared for that.
func (hc *HealthChecker) KubeAPIClient() *k8s.KubernetesAPI {
	return hc.kubeAPI
}
  1686  
// UUID returns the installation UUID stored on the HealthChecker (its uuid
// field).
// NOTE(review): presumably empty until the check that loads the Linkerd
// config has run — confirm against the code that assigns hc.uuid.
func (hc *HealthChecker) UUID() string {
	return hc.uuid
}
  1691  
  1692  func (hc *HealthChecker) checkLinkerdConfigConfigMap(ctx context.Context) (string, *l5dcharts.Values, error) {
  1693  	configMap, values, err := FetchCurrentConfiguration(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
  1694  	if err != nil {
  1695  		return "", nil, err
  1696  	}
  1697  
  1698  	return string(configMap.GetUID()), values, nil
  1699  }
  1700  
  1701  // Checks whether the configuration of the linkerd-identity-issuer is correct. This means:
  1702  // 1. There is a config map present with identity context
  1703  // 2. The scheme in the identity context corresponds to the format of the issuer secret
  1704  // 3. The trust anchors (if scheme == kubernetes.io/tls) in the secret equal the ones in config
  1705  // 4. The certs and key are parsable
  1706  func (hc *HealthChecker) checkCertificatesConfig(ctx context.Context) (*tls.Cred, []*x509.Certificate, error) {
  1707  	_, values, err := FetchCurrentConfiguration(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
  1708  	if err != nil {
  1709  		return nil, nil, err
  1710  	}
  1711  
  1712  	var data *issuercerts.IssuerCertData
  1713  
  1714  	if values.Identity.Issuer.Scheme == "" || values.Identity.Issuer.Scheme == k8s.IdentityIssuerSchemeLinkerd {
  1715  		data, err = issuercerts.FetchIssuerData(ctx, hc.kubeAPI, values.IdentityTrustAnchorsPEM, hc.ControlPlaneNamespace)
  1716  	} else {
  1717  		data, err = issuercerts.FetchExternalIssuerData(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
  1718  	}
  1719  
  1720  	if err != nil {
  1721  		return nil, nil, err
  1722  	}
  1723  
  1724  	issuerCreds, err := tls.ValidateAndCreateCreds(data.IssuerCrt, data.IssuerKey)
  1725  	if err != nil {
  1726  		return nil, nil, err
  1727  	}
  1728  
  1729  	anchors, err := tls.DecodePEMCertificates(data.TrustAnchors)
  1730  	if err != nil {
  1731  		return nil, nil, err
  1732  	}
  1733  
  1734  	return issuerCreds, anchors, nil
  1735  }
  1736  
  1737  // FetchCurrentConfiguration retrieves the current Linkerd configuration
  1738  func FetchCurrentConfiguration(ctx context.Context, k kubernetes.Interface, controlPlaneNamespace string) (*corev1.ConfigMap, *l5dcharts.Values, error) {
  1739  	// Get the linkerd-config values if present.
  1740  	configMap, err := config.FetchLinkerdConfigMap(ctx, k, controlPlaneNamespace)
  1741  	if err != nil {
  1742  		return nil, nil, err
  1743  	}
  1744  
  1745  	rawValues := configMap.Data["values"]
  1746  	if rawValues == "" {
  1747  		return configMap, nil, nil
  1748  	}
  1749  
  1750  	// Convert into latest values, where global field is removed.
  1751  	rawValuesBytes, err := config.RemoveGlobalFieldIfPresent([]byte(rawValues))
  1752  	if err != nil {
  1753  		return nil, nil, err
  1754  	}
  1755  	rawValues = string(rawValuesBytes)
  1756  	var fullValues l5dcharts.Values
  1757  
  1758  	err = yaml.Unmarshal([]byte(rawValues), &fullValues)
  1759  	if err != nil {
  1760  		return nil, nil, err
  1761  	}
  1762  	return configMap, &fullValues, nil
  1763  }
  1764  
  1765  func (hc *HealthChecker) fetchProxyInjectorCaBundle(ctx context.Context) ([]*x509.Certificate, error) {
  1766  	mwh, err := hc.getProxyInjectorMutatingWebhook(ctx)
  1767  	if err != nil {
  1768  		return nil, err
  1769  	}
  1770  
  1771  	caBundle, err := tls.DecodePEMCertificates(string(mwh.ClientConfig.CABundle))
  1772  	if err != nil {
  1773  		return nil, err
  1774  	}
  1775  	return caBundle, nil
  1776  }
  1777  
  1778  func (hc *HealthChecker) fetchWebhookCaBundle(ctx context.Context, webhook string) ([]*x509.Certificate, error) {
  1779  	vwc, err := hc.kubeAPI.AdmissionregistrationV1().ValidatingWebhookConfigurations().Get(ctx, webhook, metav1.GetOptions{})
  1780  	if err != nil {
  1781  		return nil, err
  1782  	}
  1783  
  1784  	if len(vwc.Webhooks) != 1 {
  1785  		return nil, fmt.Errorf("expected 1 webhooks, found %d", len(vwc.Webhooks))
  1786  	}
  1787  
  1788  	caBundle, err := tls.DecodePEMCertificates(string(vwc.Webhooks[0].ClientConfig.CABundle))
  1789  	if err != nil {
  1790  		return nil, err
  1791  	}
  1792  	return caBundle, nil
  1793  }
  1794  
  1795  // FetchTrustBundle retrieves the ca-bundle from the config-map linkerd-identity-trust-roots
  1796  func FetchTrustBundle(ctx context.Context, kubeAPI k8s.KubernetesAPI, controlPlaneNamespace string) (string, error) {
  1797  	configMap, err := kubeAPI.CoreV1().ConfigMaps(controlPlaneNamespace).Get(ctx, "linkerd-identity-trust-roots", metav1.GetOptions{})
  1798  
  1799  	return configMap.Data["ca-bundle.crt"], err
  1800  }
  1801  
  1802  // FetchCredsFromSecret retrieves the TLS creds given a secret name
  1803  func (hc *HealthChecker) FetchCredsFromSecret(ctx context.Context, namespace string, secretName string) (*tls.Cred, error) {
  1804  	secret, err := hc.kubeAPI.CoreV1().Secrets(namespace).Get(ctx, secretName, metav1.GetOptions{})
  1805  	if err != nil {
  1806  		return nil, err
  1807  	}
  1808  
  1809  	crt, ok := secret.Data[certKeyName]
  1810  	if !ok {
  1811  		return nil, fmt.Errorf("key %s needs to exist in secret %s", certKeyName, secretName)
  1812  	}
  1813  
  1814  	key, ok := secret.Data[keyKeyName]
  1815  	if !ok {
  1816  		return nil, fmt.Errorf("key %s needs to exist in secret %s", keyKeyName, secretName)
  1817  	}
  1818  
  1819  	cred, err := tls.ValidateAndCreateCreds(string(crt), string(key))
  1820  	if err != nil {
  1821  		return nil, err
  1822  	}
  1823  
  1824  	return cred, nil
  1825  }
  1826  
  1827  // FetchCredsFromOldSecret function can be removed in later versions, once either all webhook secrets are recreated for each update
  1828  // (see https://github.com/linkerd/linkerd2/issues/4813)
  1829  // or later releases are only expected to update from the new names.
  1830  func (hc *HealthChecker) FetchCredsFromOldSecret(ctx context.Context, namespace string, secretName string) (*tls.Cred, error) {
  1831  	secret, err := hc.kubeAPI.CoreV1().Secrets(namespace).Get(ctx, secretName, metav1.GetOptions{})
  1832  	if err != nil {
  1833  		return nil, err
  1834  	}
  1835  
  1836  	crt, ok := secret.Data[certOldKeyName]
  1837  	if !ok {
  1838  		return nil, fmt.Errorf("key %s needs to exist in secret %s", certOldKeyName, secretName)
  1839  	}
  1840  
  1841  	key, ok := secret.Data[keyOldKeyName]
  1842  	if !ok {
  1843  		return nil, fmt.Errorf("key %s needs to exist in secret %s", keyOldKeyName, secretName)
  1844  	}
  1845  
  1846  	cred, err := tls.ValidateAndCreateCreds(string(crt), string(key))
  1847  	if err != nil {
  1848  		return nil, err
  1849  	}
  1850  
  1851  	return cred, nil
  1852  }
  1853  
  1854  // CheckNamespace checks whether the given namespace exists, and returns an
  1855  // error if it does not match `shouldExist`.
  1856  func (hc *HealthChecker) CheckNamespace(ctx context.Context, namespace string, shouldExist bool) error {
  1857  	exists, err := hc.kubeAPI.NamespaceExists(ctx, namespace)
  1858  	if err != nil {
  1859  		return err
  1860  	}
  1861  	if shouldExist && !exists {
  1862  		return fmt.Errorf("The \"%s\" namespace does not exist", namespace)
  1863  	}
  1864  	if !shouldExist && exists {
  1865  		return fmt.Errorf("The \"%s\" namespace already exists", namespace)
  1866  	}
  1867  	return nil
  1868  }
  1869  
  1870  func (hc *HealthChecker) checkClusterNetworks(ctx context.Context) error {
  1871  	nodes, err := hc.kubeAPI.GetNodes(ctx)
  1872  	if err != nil {
  1873  		return err
  1874  	}
  1875  	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
  1876  	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
  1877  	for i, clusterNetwork := range clusterNetworks {
  1878  		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
  1879  		if err != nil {
  1880  			return err
  1881  		}
  1882  	}
  1883  	var badPodCIDRS []string
  1884  	var podCIDRExists bool
  1885  	for _, node := range nodes {
  1886  		podCIDR := node.Spec.PodCIDR
  1887  		if podCIDR == "" {
  1888  			continue
  1889  		}
  1890  		podCIDRExists = true
  1891  		podIP, podIPNet, err := net.ParseCIDR(podCIDR)
  1892  		if err != nil {
  1893  			return err
  1894  		}
  1895  		exists := cluterNetworksContainCIDR(clusterIPNets, podIPNet, podIP)
  1896  		if !exists {
  1897  			badPodCIDRS = append(badPodCIDRS, podCIDR)
  1898  		}
  1899  	}
  1900  	// If none of the nodes exposed a podCIDR then we cannot verify the clusterNetworks.
  1901  	if !podCIDRExists {
  1902  		// DigitalOcean for example, doesn't expose spec.podCIDR (#6398)
  1903  		return SkipError{Reason: podCIDRUnavailableSkipReason}
  1904  	}
  1905  	if len(badPodCIDRS) > 0 {
  1906  		sort.Strings(badPodCIDRS)
  1907  		return fmt.Errorf("node has podCIDR(s) %v which are not contained in the Linkerd clusterNetworks.\n\tTry installing linkerd via --set clusterNetworks=\"%s\"",
  1908  			badPodCIDRS, strings.Join(badPodCIDRS, "\\,"))
  1909  	}
  1910  	return nil
  1911  }
  1912  
  1913  func cluterNetworksContainCIDR(clusterIPNets []*net.IPNet, podIPNet *net.IPNet, podIP net.IP) bool {
  1914  	for _, clusterIPNet := range clusterIPNets {
  1915  		clusterIPMaskOnes, _ := clusterIPNet.Mask.Size()
  1916  		podCIDRMaskOnes, _ := podIPNet.Mask.Size()
  1917  		if clusterIPNet.Contains(podIP) && podCIDRMaskOnes >= clusterIPMaskOnes {
  1918  			return true
  1919  		}
  1920  	}
  1921  	return false
  1922  }
  1923  
  1924  func clusterNetworksContainIP(clusterIPNets []*net.IPNet, ip string) bool {
  1925  	for _, clusterIPNet := range clusterIPNets {
  1926  		if clusterIPNet.Contains(net.ParseIP(ip)) {
  1927  			return true
  1928  		}
  1929  	}
  1930  	return false
  1931  }
  1932  
  1933  func (hc *HealthChecker) checkClusterNetworksContainAllPods(ctx context.Context) error {
  1934  	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
  1935  	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
  1936  	var err error
  1937  	for i, clusterNetwork := range clusterNetworks {
  1938  		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
  1939  		if err != nil {
  1940  			return err
  1941  		}
  1942  	}
  1943  	pods, err := hc.kubeAPI.CoreV1().Pods(corev1.NamespaceAll).List(ctx, metav1.ListOptions{})
  1944  	if err != nil {
  1945  		return err
  1946  	}
  1947  	for _, pod := range pods.Items {
  1948  		if pod.Spec.HostNetwork {
  1949  			continue
  1950  		}
  1951  		if len(pod.Status.PodIP) == 0 {
  1952  			continue
  1953  		}
  1954  		if !clusterNetworksContainIP(clusterIPNets, pod.Status.PodIP) {
  1955  			return fmt.Errorf("the Linkerd clusterNetworks [%q] do not include pod %s/%s (%s)", hc.linkerdConfig.ClusterNetworks, pod.Namespace, pod.Name, pod.Status.PodIP)
  1956  		}
  1957  	}
  1958  	return nil
  1959  }
  1960  
  1961  func (hc *HealthChecker) checkClusterNetworksContainAllServices(ctx context.Context) error {
  1962  	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
  1963  	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
  1964  	var err error
  1965  	for i, clusterNetwork := range clusterNetworks {
  1966  		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
  1967  		if err != nil {
  1968  			return err
  1969  		}
  1970  	}
  1971  	svcs, err := hc.kubeAPI.CoreV1().Services(corev1.NamespaceAll).List(ctx, metav1.ListOptions{})
  1972  	if err != nil {
  1973  		return err
  1974  	}
  1975  	for _, svc := range svcs.Items {
  1976  		clusterIP := svc.Spec.ClusterIP
  1977  		if clusterIP != "" && clusterIP != "None" && !clusterNetworksContainIP(clusterIPNets, svc.Spec.ClusterIP) {
  1978  			return fmt.Errorf("the Linkerd clusterNetworks [%q] do not include svc %s/%s (%s)", hc.linkerdConfig.ClusterNetworks, svc.Namespace, svc.Name, svc.Spec.ClusterIP)
  1979  		}
  1980  	}
  1981  	return nil
  1982  }
  1983  
  1984  func (hc *HealthChecker) expectedRBACNames() []string {
  1985  	return []string{
  1986  		fmt.Sprintf("linkerd-%s-identity", hc.ControlPlaneNamespace),
  1987  		fmt.Sprintf("linkerd-%s-proxy-injector", hc.ControlPlaneNamespace),
  1988  	}
  1989  }
  1990  
// checkClusterRoles is the method form of CheckClusterRoles: it forwards its
// arguments unchanged and supplies hc.kubeAPI as the API client.
func (hc *HealthChecker) checkClusterRoles(ctx context.Context, shouldExist bool, expectedNames []string, labelSelector string) error {
	return CheckClusterRoles(ctx, hc.kubeAPI, shouldExist, expectedNames, labelSelector)
}
  1994  
  1995  // CheckClusterRoles checks that the expected ClusterRoles exist.
  1996  func CheckClusterRoles(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, expectedNames []string, labelSelector string) error {
  1997  	options := metav1.ListOptions{
  1998  		LabelSelector: labelSelector,
  1999  	}
  2000  	crList, err := kubeAPI.RbacV1().ClusterRoles().List(ctx, options)
  2001  	if err != nil {
  2002  		return err
  2003  	}
  2004  
  2005  	objects := []runtime.Object{}
  2006  
  2007  	for _, item := range crList.Items {
  2008  		item := item // pin
  2009  		objects = append(objects, &item)
  2010  	}
  2011  
  2012  	return checkResources("ClusterRoles", objects, expectedNames, shouldExist)
  2013  }
  2014  
// checkClusterRoleBindings is the method form of CheckClusterRoleBindings: it
// forwards its arguments unchanged and supplies hc.kubeAPI as the API client.
func (hc *HealthChecker) checkClusterRoleBindings(ctx context.Context, shouldExist bool, expectedNames []string, labelSelector string) error {
	return CheckClusterRoleBindings(ctx, hc.kubeAPI, shouldExist, expectedNames, labelSelector)
}
  2018  
  2019  // CheckClusterRoleBindings checks that the expected ClusterRoleBindings exist.
  2020  func CheckClusterRoleBindings(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, expectedNames []string, labelSelector string) error {
  2021  	options := metav1.ListOptions{
  2022  		LabelSelector: labelSelector,
  2023  	}
  2024  	crbList, err := kubeAPI.RbacV1().ClusterRoleBindings().List(ctx, options)
  2025  	if err != nil {
  2026  		return err
  2027  	}
  2028  
  2029  	objects := []runtime.Object{}
  2030  
  2031  	for _, item := range crbList.Items {
  2032  		item := item // pin
  2033  		objects = append(objects, &item)
  2034  	}
  2035  
  2036  	return checkResources("ClusterRoleBindings", objects, expectedNames, shouldExist)
  2037  }
  2038  
  2039  // CheckConfigMaps checks that the expected ConfigMaps  exist.
  2040  func CheckConfigMaps(ctx context.Context, kubeAPI *k8s.KubernetesAPI, namespace string, shouldExist bool, expectedNames []string, labelSelector string) error {
  2041  	options := metav1.ListOptions{
  2042  		LabelSelector: labelSelector,
  2043  	}
  2044  	crbList, err := kubeAPI.CoreV1().ConfigMaps(namespace).List(ctx, options)
  2045  	if err != nil {
  2046  		return err
  2047  	}
  2048  
  2049  	objects := []runtime.Object{}
  2050  
  2051  	for _, item := range crbList.Items {
  2052  		item := item // pin
  2053  		objects = append(objects, &item)
  2054  	}
  2055  
  2056  	return checkResources("ConfigMaps", objects, expectedNames, shouldExist)
  2057  }
  2058  
// isHA returns the HighAvailability flag of the stored Linkerd config.
// NOTE(review): hc.linkerdConfig is dereferenced without a nil check, so this
// presumably must only be called after the config has been loaded — confirm.
func (hc *HealthChecker) isHA() bool {
	return hc.linkerdConfig.HighAvailability
}
  2062  
// isHeartbeatDisabled returns the DisableHeartBeat flag of the stored Linkerd
// config.
// NOTE(review): hc.linkerdConfig is dereferenced without a nil check, so this
// presumably must only be called after the config has been loaded — confirm.
func (hc *HealthChecker) isHeartbeatDisabled() bool {
	return hc.linkerdConfig.DisableHeartBeat
}
  2066  
// checkServiceAccounts is the method form of CheckServiceAccounts: it forwards
// its arguments unchanged and supplies hc.kubeAPI as the API client.
func (hc *HealthChecker) checkServiceAccounts(ctx context.Context, saNames []string, ns, labelSelector string) error {
	return CheckServiceAccounts(ctx, hc.kubeAPI, saNames, ns, labelSelector)
}
  2070  
  2071  // CheckServiceAccounts check for serviceaccounts
  2072  func CheckServiceAccounts(ctx context.Context, api *k8s.KubernetesAPI, saNames []string, ns, labelSelector string) error {
  2073  	options := metav1.ListOptions{
  2074  		LabelSelector: labelSelector,
  2075  	}
  2076  	saList, err := api.CoreV1().ServiceAccounts(ns).List(ctx, options)
  2077  	if err != nil {
  2078  		return err
  2079  	}
  2080  
  2081  	objects := []runtime.Object{}
  2082  
  2083  	for _, item := range saList.Items {
  2084  		item := item // pin
  2085  		objects = append(objects, &item)
  2086  	}
  2087  
  2088  	return checkResources("ServiceAccounts", objects, saNames, true)
  2089  }
  2090  
  2091  // CheckIfLinkerdExists checks if Linkerd exists
  2092  func CheckIfLinkerdExists(ctx context.Context, kubeAPI *k8s.KubernetesAPI, controlPlaneNamespace string) (bool, error) {
  2093  	_, err := kubeAPI.CoreV1().Namespaces().Get(ctx, controlPlaneNamespace, metav1.GetOptions{})
  2094  	if err != nil {
  2095  		if kerrors.IsNotFound(err) {
  2096  			return false, nil
  2097  		}
  2098  		return false, err
  2099  	}
  2100  
  2101  	_, _, err = FetchCurrentConfiguration(ctx, kubeAPI, controlPlaneNamespace)
  2102  	if err != nil {
  2103  		if kerrors.IsNotFound(err) {
  2104  			return false, nil
  2105  		}
  2106  		return false, err
  2107  	}
  2108  
  2109  	return true, nil
  2110  }
  2111  
  2112  func (hc *HealthChecker) getProxyInjectorMutatingWebhook(ctx context.Context) (*admissionRegistration.MutatingWebhook, error) {
  2113  	mwc, err := hc.kubeAPI.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(ctx, k8s.ProxyInjectorWebhookConfigName, metav1.GetOptions{})
  2114  	if err != nil {
  2115  		return nil, err
  2116  	}
  2117  	if len(mwc.Webhooks) != 1 {
  2118  		return nil, fmt.Errorf("expected 1 webhooks, found %d", len(mwc.Webhooks))
  2119  	}
  2120  	return &mwc.Webhooks[0], nil
  2121  }
  2122  
  2123  func (hc *HealthChecker) checkMutatingWebhookConfigurations(ctx context.Context, shouldExist bool) error {
  2124  	options := metav1.ListOptions{
  2125  		LabelSelector: controlPlaneComponentsSelector(),
  2126  	}
  2127  	mwc, err := hc.kubeAPI.AdmissionregistrationV1().MutatingWebhookConfigurations().List(ctx, options)
  2128  	if err != nil {
  2129  		return err
  2130  	}
  2131  
  2132  	objects := []runtime.Object{}
  2133  	for _, item := range mwc.Items {
  2134  		item := item // pin
  2135  		objects = append(objects, &item)
  2136  	}
  2137  
  2138  	return checkResources("MutatingWebhookConfigurations", objects, []string{k8s.ProxyInjectorWebhookConfigName}, shouldExist)
  2139  }
  2140  
  2141  func (hc *HealthChecker) checkValidatingWebhookConfigurations(ctx context.Context, shouldExist bool) error {
  2142  	options := metav1.ListOptions{
  2143  		LabelSelector: controlPlaneComponentsSelector(),
  2144  	}
  2145  	vwc, err := hc.kubeAPI.AdmissionregistrationV1().ValidatingWebhookConfigurations().List(ctx, options)
  2146  	if err != nil {
  2147  		return err
  2148  	}
  2149  
  2150  	objects := []runtime.Object{}
  2151  	for _, item := range vwc.Items {
  2152  		item := item // pin
  2153  		objects = append(objects, &item)
  2154  	}
  2155  
  2156  	return checkResources("ValidatingWebhookConfigurations", objects, []string{k8s.SPValidatorWebhookConfigName}, shouldExist)
  2157  }
  2158  
  2159  // CheckCustomResourceDefinitions checks that all of the Linkerd CRDs are
  2160  // installed on the cluster.
  2161  func CheckCustomResourceDefinitions(ctx context.Context, k8sAPI *k8s.KubernetesAPI, expectedCRDManifests string) error {
  2162  
  2163  	crdYamls := strings.Split(expectedCRDManifests, "\n---\n")
  2164  	crdVersions := []struct{ name, version string }{}
  2165  	for _, crdYaml := range crdYamls {
  2166  		var crd apiextv1.CustomResourceDefinition
  2167  		err := yaml.Unmarshal([]byte(crdYaml), &crd)
  2168  		if err != nil {
  2169  			return err
  2170  		}
  2171  		if len(crd.Spec.Versions) == 0 {
  2172  			continue
  2173  		}
  2174  		versionIndex := len(crd.Spec.Versions) - 1
  2175  		crdVersions = append(crdVersions, struct{ name, version string }{
  2176  			name:    crd.Name,
  2177  			version: crd.Spec.Versions[versionIndex].Name,
  2178  		})
  2179  	}
  2180  
  2181  	errMsgs := []string{}
  2182  
  2183  	for _, crdVersion := range crdVersions {
  2184  		name := crdVersion.name
  2185  		version := crdVersion.version
  2186  
  2187  		crd, err := k8sAPI.Apiextensions.ApiextensionsV1().CustomResourceDefinitions().Get(ctx, name, metav1.GetOptions{})
  2188  		if err != nil && kerrors.IsNotFound(err) {
  2189  			errMsgs = append(errMsgs, fmt.Sprintf("missing %s", name))
  2190  			continue
  2191  		} else if err != nil {
  2192  			return err
  2193  		}
  2194  		if !crdHasVersion(crd, version) {
  2195  			errMsgs = append(errMsgs, fmt.Sprintf("CRD %s is missing version %s", name, version))
  2196  		}
  2197  	}
  2198  	if len(errMsgs) > 0 {
  2199  		return errors.New(strings.Join(errMsgs, ", "))
  2200  	}
  2201  	return nil
  2202  }
  2203  
  2204  func crdHasVersion(crd *apiextv1.CustomResourceDefinition, version string) bool {
  2205  	for _, crdVersion := range crd.Spec.Versions {
  2206  		if crdVersion.Name == version {
  2207  			return true
  2208  		}
  2209  	}
  2210  	return false
  2211  }
  2212  
  2213  // CheckNodesHaveNonDockerRuntime checks that each node has a non-Docker
  2214  // runtime. This check is only called if proxyInit is not running as root
  2215  // which is a problem for clusters with a Docker container runtime.
  2216  func CheckNodesHaveNonDockerRuntime(ctx context.Context, k8sAPI *k8s.KubernetesAPI) error {
  2217  	hasDockerNodes := false
  2218  	continueToken := ""
  2219  	for {
  2220  		nodes, err := k8sAPI.CoreV1().Nodes().List(ctx, metav1.ListOptions{Continue: continueToken})
  2221  		if err != nil {
  2222  			return err
  2223  		}
  2224  		continueToken = nodes.Continue
  2225  		for _, node := range nodes.Items {
  2226  			crv := node.Status.NodeInfo.ContainerRuntimeVersion
  2227  			if strings.HasPrefix(crv, "docker:") {
  2228  				hasDockerNodes = true
  2229  				break
  2230  			}
  2231  		}
  2232  		if continueToken == "" {
  2233  			break
  2234  		}
  2235  	}
  2236  	if hasDockerNodes {
  2237  		return fmt.Errorf("there are nodes using the docker container runtime and proxy-init container must run as root user.\ntry installing linkerd via --set proxyInit.runAsRoot=true")
  2238  	}
  2239  	return nil
  2240  }
  2241  
// MeshedPodIdentityData contains meshed pod details + trust anchors of the proxy
type MeshedPodIdentityData struct {
	// Name is the pod's name.
	Name      string
	// Namespace is the namespace the pod lives in.
	Namespace string
	// Anchors holds the trust anchors configured on the pod's proxy container,
	// taken from its identity.EnvTrustAnchors environment variable with
	// surrounding whitespace trimmed (see GetMeshedPodsIdentityData).
	Anchors   string
}
  2248  
  2249  // GetMeshedPodsIdentityData obtains the identity data (trust anchors) for all meshed pods
  2250  func GetMeshedPodsIdentityData(ctx context.Context, api kubernetes.Interface, dataPlaneNamespace string) ([]MeshedPodIdentityData, error) {
  2251  	podList, err := api.CoreV1().Pods(dataPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel})
  2252  	if err != nil {
  2253  		return nil, err
  2254  	}
  2255  	if len(podList.Items) == 0 {
  2256  		return nil, nil
  2257  	}
  2258  	pods := []MeshedPodIdentityData{}
  2259  	for _, pod := range podList.Items {
  2260  		containers := append(pod.Spec.InitContainers, pod.Spec.Containers...)
  2261  		for _, containerSpec := range containers {
  2262  			if containerSpec.Name != k8s.ProxyContainerName {
  2263  				continue
  2264  			}
  2265  			for _, envVar := range containerSpec.Env {
  2266  				if envVar.Name != identity.EnvTrustAnchors {
  2267  					continue
  2268  				}
  2269  				pods = append(pods, MeshedPodIdentityData{
  2270  					pod.Name, pod.Namespace, strings.TrimSpace(envVar.Value),
  2271  				})
  2272  			}
  2273  		}
  2274  	}
  2275  	return pods, nil
  2276  }
  2277  
// checkDataPlaneProxiesCertificate runs checkPodsProxiesCertificate against
// the HealthChecker's data plane namespace, comparing pod trust anchors with
// the trust bundle of the control plane namespace. The API client is passed by
// value (dereferenced from hc.kubeAPI).
func (hc *HealthChecker) checkDataPlaneProxiesCertificate(ctx context.Context) error {
	return checkPodsProxiesCertificate(ctx, *hc.kubeAPI, hc.DataPlaneNamespace, hc.ControlPlaneNamespace)
}
  2281  
  2282  func checkPodsProxiesCertificate(ctx context.Context, kubeAPI k8s.KubernetesAPI, targetNamespace, controlPlaneNamespace string) error {
  2283  	meshedPods, err := GetMeshedPodsIdentityData(ctx, kubeAPI, targetNamespace)
  2284  	if err != nil {
  2285  		return err
  2286  	}
  2287  
  2288  	trustAnchorsPem, err := FetchTrustBundle(ctx, kubeAPI, controlPlaneNamespace)
  2289  	if err != nil {
  2290  		return err
  2291  	}
  2292  
  2293  	offendingPods := []string{}
  2294  	for _, pod := range meshedPods {
  2295  		// Skip control plane pods since they load their trust anchors from the linkerd-identity-trust-anchors configmap.
  2296  		if pod.Namespace == controlPlaneNamespace {
  2297  			continue
  2298  		}
  2299  		if strings.TrimSpace(pod.Anchors) != strings.TrimSpace(trustAnchorsPem) {
  2300  			if targetNamespace == "" {
  2301  				offendingPods = append(offendingPods, fmt.Sprintf("* %s/%s", pod.Namespace, pod.Name))
  2302  			} else {
  2303  				offendingPods = append(offendingPods, fmt.Sprintf("* %s", pod.Name))
  2304  			}
  2305  		}
  2306  	}
  2307  	if len(offendingPods) == 0 {
  2308  		return nil
  2309  	}
  2310  	return fmt.Errorf("Some pods do not have the current trust bundle and must be restarted:\n\t%s", strings.Join(offendingPods, "\n\t"))
  2311  }
  2312  
  2313  func checkResources(resourceName string, objects []runtime.Object, expectedNames []string, shouldExist bool) error {
  2314  	if !shouldExist {
  2315  		if len(objects) > 0 {
  2316  			resources := []Resource{}
  2317  			for _, obj := range objects {
  2318  				m, err := meta.Accessor(obj)
  2319  				if err != nil {
  2320  					return err
  2321  				}
  2322  
  2323  				res := Resource{name: m.GetName()}
  2324  				gvks, _, err := k8s.ObjectKinds(obj)
  2325  				if err == nil && len(gvks) > 0 {
  2326  					res.groupVersionKind = gvks[0]
  2327  				}
  2328  				resources = append(resources, res)
  2329  			}
  2330  			return ResourceError{resourceName, resources}
  2331  		}
  2332  		return nil
  2333  	}
  2334  
  2335  	expected := map[string]bool{}
  2336  	for _, name := range expectedNames {
  2337  		expected[name] = false
  2338  	}
  2339  
  2340  	for _, obj := range objects {
  2341  		metaObj, err := meta.Accessor(obj)
  2342  		if err != nil {
  2343  			return err
  2344  		}
  2345  
  2346  		if _, ok := expected[metaObj.GetName()]; ok {
  2347  			expected[metaObj.GetName()] = true
  2348  		}
  2349  	}
  2350  
  2351  	missing := []string{}
  2352  	for name, found := range expected {
  2353  		if !found {
  2354  			missing = append(missing, name)
  2355  		}
  2356  	}
  2357  	if len(missing) > 0 {
  2358  		sort.Strings(missing)
  2359  		return fmt.Errorf("missing %s: %s", resourceName, strings.Join(missing, ", "))
  2360  	}
  2361  
  2362  	return nil
  2363  }
  2364  
  2365  // Check if there's a pod with the "opaque ports" annotation defined but a
  2366  // service selecting the aforementioned pod doesn't define it
  2367  func (hc *HealthChecker) checkMisconfiguredOpaquePortAnnotations(ctx context.Context) error {
  2368  	// Initialize and sync the kubernetes API
  2369  	// This is used instead of `hc.kubeAPI` to limit multiple k8s API requests
  2370  	// and use the caching logic in the shared informers
  2371  	// TODO: move the shared informer code out of `controller/`, and into `pkg` to simplify the dependency tree.
  2372  	kubeAPI := controllerK8s.NewClusterScopedAPI(hc.kubeAPI, nil, nil, "local", controllerK8s.Endpoint, controllerK8s.Pod, controllerK8s.Svc)
  2373  	kubeAPI.Sync(ctx.Done())
  2374  
  2375  	services, err := kubeAPI.Svc().Lister().Services(hc.DataPlaneNamespace).List(labels.Everything())
  2376  	if err != nil {
  2377  		return err
  2378  	}
  2379  
  2380  	var errStrings []string
  2381  	for _, service := range services {
  2382  		if service.Spec.ClusterIP == "None" {
  2383  			// skip headless services; they're handled differently
  2384  			continue
  2385  		}
  2386  
  2387  		endpoints, err := kubeAPI.Endpoint().Lister().Endpoints(service.Namespace).Get(service.Name)
  2388  		if err != nil {
  2389  			return err
  2390  		}
  2391  
  2392  		pods, err := getEndpointsPods(endpoints, kubeAPI, service.Namespace)
  2393  		if err != nil {
  2394  			return err
  2395  		}
  2396  
  2397  		for pod := range pods {
  2398  			err := misconfiguredOpaqueAnnotation(service, pod)
  2399  			if err != nil {
  2400  				errStrings = append(errStrings, fmt.Sprintf("\t* %s", err.Error()))
  2401  			}
  2402  		}
  2403  	}
  2404  
  2405  	if len(errStrings) >= 1 {
  2406  		return fmt.Errorf(strings.Join(errStrings, "\n    "))
  2407  	}
  2408  
  2409  	return nil
  2410  }
  2411  
  2412  // getEndpointsPods takes a collection of endpoints and returns the set of all
  2413  // the pods that they target.
  2414  func getEndpointsPods(endpoints *corev1.Endpoints, kubeAPI *controllerK8s.API, namespace string) (map[*corev1.Pod]struct{}, error) {
  2415  	pods := make(map[*corev1.Pod]struct{})
  2416  	for _, subset := range endpoints.Subsets {
  2417  		for _, addr := range subset.Addresses {
  2418  			if addr.TargetRef != nil && addr.TargetRef.Kind == "Pod" {
  2419  				pod, err := kubeAPI.Pod().Lister().Pods(namespace).Get(addr.TargetRef.Name)
  2420  				if err != nil {
  2421  					return nil, err
  2422  				}
  2423  				if _, ok := pods[pod]; !ok {
  2424  					pods[pod] = struct{}{}
  2425  				}
  2426  			}
  2427  		}
  2428  	}
  2429  	return pods, nil
  2430  }
  2431  
  2432  func misconfiguredOpaqueAnnotation(service *corev1.Service, pod *corev1.Pod) error {
  2433  	var svcPorts, podPorts []string
  2434  	if v, ok := service.Annotations[k8s.ProxyOpaquePortsAnnotation]; ok {
  2435  		svcPorts = strings.Split(v, ",")
  2436  	}
  2437  	if v, ok := pod.Annotations[k8s.ProxyOpaquePortsAnnotation]; ok {
  2438  		podPorts = strings.Split(v, ",")
  2439  	}
  2440  
  2441  	// First loop through the services opaque ports and assert that if the pod
  2442  	// exposes a port that is targeted by one of these ports, then it is
  2443  	// marked as opaque on the pod.
  2444  	for _, p := range svcPorts {
  2445  		port, err := strconv.Atoi(p)
  2446  		if err != nil {
  2447  			return fmt.Errorf("failed to convert %s to port number for pod %s", p, pod.Name)
  2448  		}
  2449  		err = checkPodPorts(service, pod, podPorts, port)
  2450  		if err != nil {
  2451  			return err
  2452  		}
  2453  	}
  2454  
  2455  	// Next loop through the pod's opaque ports and assert that if one of
  2456  	// the ports is targeted by a service port, then it is marked as opaque
  2457  	// on the service.
  2458  	for _, p := range podPorts {
  2459  		if util.ContainsString(p, svcPorts) {
  2460  			// The service exposes p and is marked as opaque.
  2461  			continue
  2462  		}
  2463  		port, err := strconv.Atoi(p)
  2464  		if err != nil {
  2465  			return fmt.Errorf("failed to convert %s to port number for pod %s", p, pod.Name)
  2466  		}
  2467  
  2468  		// p is marked as opaque on the pod, but the service that selects it
  2469  		// does not have it marked as opaque. We first check if the service
  2470  		// exposes it as a service or integer targetPort.
  2471  		ok, err := checkServiceIntPorts(service, svcPorts, port)
  2472  		if err != nil {
  2473  			return err
  2474  		}
  2475  		if ok {
  2476  			// The service targets the port as an integer and is marked as
  2477  			// opaque so continue checking other pod ports.
  2478  			continue
  2479  		}
  2480  
  2481  		// The service does not expose p as a service or integer targetPort.
  2482  		// We now check if it targets it as a named port, and if so, that the
  2483  		// service port is marked as opaque.
  2484  		err = checkServiceNamePorts(service, pod, port, svcPorts)
  2485  		if err != nil {
  2486  			return err
  2487  		}
  2488  	}
  2489  	return nil
  2490  }
  2491  
  2492  func checkPodPorts(service *corev1.Service, pod *corev1.Pod, podPorts []string, port int) error {
  2493  	for _, sp := range service.Spec.Ports {
  2494  		if int(sp.Port) == port {
  2495  			for _, c := range pod.Spec.Containers {
  2496  				for _, cp := range c.Ports {
  2497  					if cp.ContainerPort == sp.TargetPort.IntVal || cp.Name == sp.TargetPort.StrVal {
  2498  						// The pod exposes a container port that would be
  2499  						// targeted by this service port
  2500  						var strPort string
  2501  						if sp.TargetPort.Type == 0 {
  2502  							strPort = strconv.Itoa(int(sp.TargetPort.IntVal))
  2503  						} else {
  2504  							strPort = strconv.Itoa(int(cp.ContainerPort))
  2505  						}
  2506  						if util.ContainsString(strPort, podPorts) {
  2507  							return nil
  2508  						}
  2509  						return fmt.Errorf("service %s expects target port %s to be opaque; add it to pod %s %s annotation", service.Name, strPort, pod.Name, k8s.ProxyOpaquePortsAnnotation)
  2510  					}
  2511  				}
  2512  			}
  2513  		}
  2514  	}
  2515  	return nil
  2516  }
  2517  
  2518  func checkServiceIntPorts(service *corev1.Service, svcPorts []string, port int) (bool, error) {
  2519  	for _, p := range service.Spec.Ports {
  2520  		if p.TargetPort.Type == 0 && p.TargetPort.IntVal == 0 {
  2521  			if int(p.Port) == port {
  2522  				// The service does not have a target port, so its service
  2523  				// port should be marked as opaque.
  2524  				return false, fmt.Errorf("service %s targets the opaque port %d; add it to its %s annotation", service.Name, port, k8s.ProxyOpaquePortsAnnotation)
  2525  			}
  2526  		}
  2527  		if int(p.TargetPort.IntVal) == port {
  2528  			svcPort := strconv.Itoa(int(p.Port))
  2529  			if util.ContainsString(svcPort, svcPorts) {
  2530  				// The service exposes svcPort which targets p and svcPort
  2531  				// is properly as opaque.
  2532  				return true, nil
  2533  			}
  2534  			return false, fmt.Errorf("service %s targets the opaque port %d through %d; add %d to its %s annotation", service.Name, port, p.Port, p.Port, k8s.ProxyOpaquePortsAnnotation)
  2535  		}
  2536  	}
  2537  	return false, nil
  2538  }
  2539  
  2540  func checkServiceNamePorts(service *corev1.Service, pod *corev1.Pod, port int, svcPorts []string) error {
  2541  	for _, p := range service.Spec.Ports {
  2542  		if p.TargetPort.StrVal == "" {
  2543  			// The target port is not named so there is no named container
  2544  			// port to check.
  2545  			continue
  2546  		}
  2547  		for _, c := range pod.Spec.Containers {
  2548  			for _, cp := range c.Ports {
  2549  				if int(cp.ContainerPort) == port {
  2550  					// This is the containerPort that maps to the opaque port
  2551  					// we are currently checking.
  2552  					if cp.Name == p.TargetPort.StrVal {
  2553  						svcPort := strconv.Itoa(int(p.Port))
  2554  						if util.ContainsString(svcPort, svcPorts) {
  2555  							// The service targets the container port by name
  2556  							// and is marked as opaque.
  2557  							return nil
  2558  						}
  2559  						return fmt.Errorf("service %s targets the opaque port %s through %d; add %d to its %s annotation", service.Name, cp.Name, p.Port, p.Port, k8s.ProxyOpaquePortsAnnotation)
  2560  					}
  2561  				}
  2562  			}
  2563  		}
  2564  	}
  2565  	return nil
  2566  }
  2567  
  2568  // GetDataPlanePods returns all the pods with data plane
  2569  func (hc *HealthChecker) GetDataPlanePods(ctx context.Context) ([]corev1.Pod, error) {
  2570  	selector := fmt.Sprintf("%s=%s", k8s.ControllerNSLabel, hc.ControlPlaneNamespace)
  2571  	podList, err := hc.kubeAPI.CoreV1().Pods(hc.DataPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
  2572  	if err != nil {
  2573  		return nil, err
  2574  	}
  2575  	return podList.Items, nil
  2576  }
  2577  
  2578  // GetServices returns all services within data plane namespace
  2579  func (hc *HealthChecker) GetServices(ctx context.Context) ([]corev1.Service, error) {
  2580  	svcList, err := hc.kubeAPI.CoreV1().Services(hc.DataPlaneNamespace).List(ctx, metav1.ListOptions{})
  2581  	if err != nil {
  2582  		return nil, err
  2583  	}
  2584  	return svcList.Items, nil
  2585  }
  2586  
  2587  func (hc *HealthChecker) checkCanCreate(ctx context.Context, namespace, group, version, resource string) error {
  2588  	return CheckCanPerformAction(ctx, hc.kubeAPI, "create", namespace, group, version, resource)
  2589  }
  2590  
  2591  func (hc *HealthChecker) checkCanCreateNonNamespacedResources(ctx context.Context) error {
  2592  	var errs []string
  2593  	dryRun := metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}}
  2594  
  2595  	// Iterate over all resources in install manifest
  2596  	installManifestReader := strings.NewReader(hc.Options.InstallManifest)
  2597  	yamlReader := yamlDecoder.NewYAMLReader(bufio.NewReader(installManifestReader))
  2598  	for {
  2599  		// Read single object YAML
  2600  		objYAML, err := yamlReader.Read()
  2601  		if err != nil {
  2602  			if errors.Is(err, io.EOF) {
  2603  				break
  2604  			}
  2605  			return fmt.Errorf("error reading install manifest: %w", err)
  2606  		}
  2607  
  2608  		// Create unstructured object from YAML
  2609  		objMap := map[string]interface{}{}
  2610  		err = yaml.Unmarshal(objYAML, &objMap)
  2611  		if err != nil {
  2612  			return fmt.Errorf("error unmarshaling yaml object %s: %w", objYAML, err)
  2613  		}
  2614  		if len(objMap) == 0 {
  2615  			// Ignore header blocks with only comments
  2616  			continue
  2617  		}
  2618  		obj := &unstructured.Unstructured{Object: objMap}
  2619  
  2620  		// Skip namespaced resources (dry-run requires namespace to exist)
  2621  		if obj.GetNamespace() != "" {
  2622  			continue
  2623  		}
  2624  		// Attempt to create resource using dry-run
  2625  		resource, _ := meta.UnsafeGuessKindToResource(obj.GroupVersionKind())
  2626  		_, err = hc.kubeAPI.DynamicClient.Resource(resource).Create(ctx, obj, dryRun)
  2627  		if err != nil {
  2628  			errs = append(errs, fmt.Sprintf("cannot create %s/%s: %v", obj.GetKind(), obj.GetName(), err))
  2629  		}
  2630  	}
  2631  
  2632  	if len(errs) > 0 {
  2633  		return errors.New(strings.Join(errs, "\n    "))
  2634  	}
  2635  	return nil
  2636  }
  2637  
  2638  func (hc *HealthChecker) checkCanGet(ctx context.Context, namespace, group, version, resource string) error {
  2639  	return CheckCanPerformAction(ctx, hc.kubeAPI, "get", namespace, group, version, resource)
  2640  }
  2641  
  2642  func (hc *HealthChecker) checkExtensionAPIServerAuthentication(ctx context.Context) error {
  2643  	if hc.kubeAPI == nil {
  2644  		return fmt.Errorf("unexpected error: Kubernetes ClientSet not initialized")
  2645  	}
  2646  	m, err := hc.kubeAPI.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, k8s.ExtensionAPIServerAuthenticationConfigMapName, metav1.GetOptions{})
  2647  	if err != nil {
  2648  		return err
  2649  	}
  2650  	if v, exists := m.Data[k8s.ExtensionAPIServerAuthenticationRequestHeaderClientCAFileKey]; !exists || v == "" {
  2651  		return fmt.Errorf("--%s is not configured", k8s.ExtensionAPIServerAuthenticationRequestHeaderClientCAFileKey)
  2652  	}
  2653  	return nil
  2654  }
  2655  func (hc *HealthChecker) checkClockSkew(ctx context.Context) error {
  2656  	if hc.kubeAPI == nil {
  2657  		// we should never get here
  2658  		return errors.New("unexpected error: Kubernetes ClientSet not initialized")
  2659  	}
  2660  
  2661  	var clockSkewNodes []string
  2662  
  2663  	nodeList, err := hc.kubeAPI.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
  2664  	if err != nil {
  2665  		return err
  2666  	}
  2667  
  2668  	for _, node := range nodeList.Items {
  2669  		for _, condition := range node.Status.Conditions {
  2670  			// we want to check only KubeletReady condition and only execute if the node is ready
  2671  			if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
  2672  				since := time.Since(condition.LastHeartbeatTime.Time)
  2673  				if (since > AllowedClockSkew) || (since < -AllowedClockSkew) {
  2674  					clockSkewNodes = append(clockSkewNodes, node.Name)
  2675  				}
  2676  			}
  2677  		}
  2678  	}
  2679  
  2680  	if len(clockSkewNodes) > 0 {
  2681  		return fmt.Errorf("clock skew detected for node(s): %s", strings.Join(clockSkewNodes, ", "))
  2682  	}
  2683  
  2684  	return nil
  2685  }
  2686  
  2687  func (hc *HealthChecker) checkExtensionNsLabels(ctx context.Context) error {
  2688  	if hc.kubeAPI == nil {
  2689  		// oops something wrong happened
  2690  		return errors.New("unexpected error: Kubernetes ClientSet not initialized")
  2691  	}
  2692  
  2693  	namespaces, err := hc.kubeAPI.GetAllNamespacesWithExtensionLabel(ctx)
  2694  	if err != nil {
  2695  		return fmt.Errorf("unexpected error when retrieving namespaces: %w", err)
  2696  	}
  2697  
  2698  	freq := make(map[string][]string)
  2699  	for _, ns := range namespaces {
  2700  		// We can guarantee the namespace has the extension label since we used
  2701  		// a label selector when retrieving namespaces
  2702  		ext := ns.Labels[k8s.LinkerdExtensionLabel]
  2703  		// To make it easier to print, store already error-formatted namespace
  2704  		// in freq table
  2705  		freq[ext] = append(freq[ext], fmt.Sprintf("\t\t* %s", ns.Name))
  2706  	}
  2707  
  2708  	errs := []string{}
  2709  	for ext, namespaces := range freq {
  2710  		if len(namespaces) == 1 {
  2711  			continue
  2712  		}
  2713  		errs = append(errs, fmt.Sprintf("\t* label \"%s=%s\" is present on more than one namespace:\n%s", k8s.LinkerdExtensionLabel, ext, strings.Join(namespaces, "\n")))
  2714  	}
  2715  
  2716  	if len(errs) > 0 {
  2717  		return errors.New(strings.Join(
  2718  			append([]string{"some extensions have invalid configuration"}, errs...), "\n"))
  2719  	}
  2720  
  2721  	return nil
  2722  }
  2723  
  2724  // CheckRoles checks that the expected roles exist.
  2725  func CheckRoles(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, namespace string, expectedNames []string, labelSelector string) error {
  2726  	options := metav1.ListOptions{
  2727  		LabelSelector: labelSelector,
  2728  	}
  2729  	crList, err := kubeAPI.RbacV1().Roles(namespace).List(ctx, options)
  2730  	if err != nil {
  2731  		return err
  2732  	}
  2733  
  2734  	objects := []runtime.Object{}
  2735  
  2736  	for _, item := range crList.Items {
  2737  		item := item // pin
  2738  		objects = append(objects, &item)
  2739  	}
  2740  
  2741  	return checkResources("Roles", objects, expectedNames, shouldExist)
  2742  }
  2743  
  2744  // CheckRoleBindings checks that the expected RoleBindings exist.
  2745  func CheckRoleBindings(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, namespace string, expectedNames []string, labelSelector string) error {
  2746  	options := metav1.ListOptions{
  2747  		LabelSelector: labelSelector,
  2748  	}
  2749  	crbList, err := kubeAPI.RbacV1().RoleBindings(namespace).List(ctx, options)
  2750  	if err != nil {
  2751  		return err
  2752  	}
  2753  
  2754  	objects := []runtime.Object{}
  2755  
  2756  	for _, item := range crbList.Items {
  2757  		item := item // pin
  2758  		objects = append(objects, &item)
  2759  	}
  2760  
  2761  	return checkResources("RoleBindings", objects, expectedNames, shouldExist)
  2762  }
  2763  
  2764  // CheckCanPerformAction checks if a given k8s client is authorized to perform a given action.
  2765  func CheckCanPerformAction(ctx context.Context, api *k8s.KubernetesAPI, verb, namespace, group, version, resource string) error {
  2766  	if api == nil {
  2767  		// we should never get here
  2768  		return fmt.Errorf("unexpected error: Kubernetes ClientSet not initialized")
  2769  	}
  2770  
  2771  	return k8s.ResourceAuthz(
  2772  		ctx,
  2773  		api,
  2774  		namespace,
  2775  		verb,
  2776  		group,
  2777  		version,
  2778  		resource,
  2779  		"",
  2780  	)
  2781  }
  2782  
  2783  // getPodStatuses returns a map of all Linkerd container statuses:
  2784  // component =>
  2785  //
  2786  //	pod name =>
  2787  //	  container statuses
  2788  func getPodStatuses(pods []corev1.Pod) map[string]map[string][]corev1.ContainerStatus {
  2789  	statuses := make(map[string]map[string][]corev1.ContainerStatus)
  2790  
  2791  	for _, pod := range pods {
  2792  		if pod.Status.Phase == corev1.PodRunning && strings.HasPrefix(pod.Name, "linkerd-") {
  2793  			parts := strings.Split(pod.Name, "-")
  2794  			// All control plane pods should have a name that results in at least 4
  2795  			// substrings when string.Split on '-'
  2796  			if len(parts) >= 4 {
  2797  				name := strings.Join(parts[1:len(parts)-2], "-")
  2798  				if _, found := statuses[name]; !found {
  2799  					statuses[name] = make(map[string][]corev1.ContainerStatus)
  2800  				}
  2801  				statuses[name][pod.Name] = pod.Status.ContainerStatuses
  2802  			}
  2803  		}
  2804  	}
  2805  
  2806  	return statuses
  2807  }
  2808  
  2809  func validateControlPlanePods(pods []corev1.Pod) error {
  2810  	statuses := getPodStatuses(pods)
  2811  
  2812  	names := []string{"destination", "identity", "proxy-injector"}
  2813  
  2814  	for _, name := range names {
  2815  		pods, found := statuses[name]
  2816  		if !found {
  2817  			return fmt.Errorf("No running pods for \"linkerd-%s\"", name)
  2818  		}
  2819  		var err error
  2820  		var ready bool
  2821  		for pod, containers := range pods {
  2822  			containersReady := true
  2823  			for _, container := range containers {
  2824  				if !container.Ready {
  2825  					// TODO: Save this as a warning, allow check to pass but let the user
  2826  					// know there is at least one pod not ready. This might imply
  2827  					// restructuring health checks to allow individual checks to return
  2828  					// either fatal or warning, rather than setting this property at
  2829  					// compile time.
  2830  					err = fmt.Errorf("pod/%s container %s is not ready", pod, container.Name)
  2831  					containersReady = false
  2832  				}
  2833  			}
  2834  			if containersReady {
  2835  				// at least one pod has all containers ready
  2836  				ready = true
  2837  				break
  2838  			}
  2839  		}
  2840  		if !ready {
  2841  			return err
  2842  		}
  2843  	}
  2844  
  2845  	return nil
  2846  }
  2847  
  2848  func checkUnschedulablePods(pods []corev1.Pod) error {
  2849  	for _, pod := range pods {
  2850  		for _, condition := range pod.Status.Conditions {
  2851  			if condition.Reason == corev1.PodReasonUnschedulable {
  2852  				return fmt.Errorf("%s: %s", pod.Name, condition.Message)
  2853  			}
  2854  		}
  2855  	}
  2856  
  2857  	return nil
  2858  }
  2859  
  2860  func checkControlPlaneReplicaSets(rst []appsv1.ReplicaSet) error {
  2861  	var errors []string
  2862  	for _, rs := range rst {
  2863  		for _, r := range rs.Status.Conditions {
  2864  			if r.Type == appsv1.ReplicaSetReplicaFailure && r.Status == corev1.ConditionTrue {
  2865  				errors = append(errors, fmt.Sprintf("%s: %s", r.Reason, r.Message))
  2866  			}
  2867  		}
  2868  	}
  2869  
  2870  	if len(errors) > 0 {
  2871  		return fmt.Errorf("%s", strings.Join(errors, "\n   "))
  2872  	}
  2873  
  2874  	return nil
  2875  }
  2876  
  2877  // CheckForPods checks if the given deployments have pod resources present
  2878  func CheckForPods(pods []corev1.Pod, deployNames []string) error {
  2879  	exists := make(map[string]bool)
  2880  
  2881  	for _, pod := range pods {
  2882  		for label, value := range pod.Labels {
  2883  			// When the label value is `linkerd.io/control-plane-component` or
  2884  			// `component`, we'll take its value as the name of the deployment
  2885  			// that the pod is part of
  2886  			if label == k8s.ControllerComponentLabel || label == "component" {
  2887  				exists[value] = true
  2888  			}
  2889  		}
  2890  	}
  2891  
  2892  	for _, expected := range deployNames {
  2893  		if !exists[expected] {
  2894  			return fmt.Errorf("Could not find pods for deployment %s", expected)
  2895  		}
  2896  	}
  2897  
  2898  	return nil
  2899  }
  2900  
  2901  // CheckPodsRunning checks if the given pods are in running state
  2902  // along with containers to be in ready state
  2903  func CheckPodsRunning(pods []corev1.Pod, namespace string) error {
  2904  	if len(pods) == 0 {
  2905  		msg := fmt.Sprintf("no \"%s\" containers found", k8s.ProxyContainerName)
  2906  		if namespace != "" {
  2907  			msg += fmt.Sprintf(" in the \"%s\" namespace", namespace)
  2908  		}
  2909  		return fmt.Errorf(msg)
  2910  	}
  2911  	for _, pod := range pods {
  2912  		status := k8s.GetPodStatus(pod)
  2913  
  2914  		// Skip validating pods that have a status which indicates there would
  2915  		// be no running proxy container.
  2916  		switch status {
  2917  		case "Completed", "NodeShutdown", "Shutdown", "Terminated":
  2918  			continue
  2919  		}
  2920  		if status != string(corev1.PodRunning) && status != "Evicted" {
  2921  			return fmt.Errorf("pod \"%s\" status is %s", pod.Name, pod.Status.Phase)
  2922  		}
  2923  		if !k8s.GetProxyReady(pod) {
  2924  			return fmt.Errorf("container \"%s\" in pod \"%s\" is not ready", k8s.ProxyContainerName, pod.Name)
  2925  		}
  2926  	}
  2927  	return nil
  2928  }
  2929  
  2930  // CheckIfDataPlanePodsExist checks if the proxy is present in the given pods
  2931  func CheckIfDataPlanePodsExist(pods []corev1.Pod) error {
  2932  	for _, pod := range pods {
  2933  		if !containsProxy(pod) {
  2934  			return fmt.Errorf("could not find proxy container for %s pod", pod.Name)
  2935  		}
  2936  	}
  2937  	return nil
  2938  }
  2939  
  2940  func containsProxy(pod corev1.Pod) bool {
  2941  	containers := append(pod.Spec.InitContainers, pod.Spec.Containers...)
  2942  	for _, containerSpec := range containers {
  2943  		if containerSpec.Name == k8s.ProxyContainerName {
  2944  			return true
  2945  		}
  2946  	}
  2947  	return false
  2948  }
  2949  

View as plain text