package healthcheck

import (
	"bufio"
	"context"
	"crypto/x509"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"sort"
	"strconv"
	"strings"
	"time"

	controllerK8s "github.com/linkerd/linkerd2/controller/k8s"
	l5dcharts "github.com/linkerd/linkerd2/pkg/charts/linkerd2"
	"github.com/linkerd/linkerd2/pkg/config"
	"github.com/linkerd/linkerd2/pkg/identity"
	"github.com/linkerd/linkerd2/pkg/issuercerts"
	"github.com/linkerd/linkerd2/pkg/k8s"
	"github.com/linkerd/linkerd2/pkg/tls"
	"github.com/linkerd/linkerd2/pkg/util"
	"github.com/linkerd/linkerd2/pkg/version"
	log "github.com/sirupsen/logrus"
	admissionRegistration "k8s.io/api/admissionregistration/v1"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	kerrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	yamlDecoder "k8s.io/apimachinery/pkg/util/yaml"
	k8sVersion "k8s.io/apimachinery/pkg/version"
	"k8s.io/client-go/kubernetes"
	apiregistrationv1client "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1"
	"sigs.k8s.io/yaml"
)

// CategoryID is an identifier for the types of health checks.
type CategoryID string

const (
	// KubernetesAPIChecks adds a series of checks to validate that the caller is
	// configured to interact with a working Kubernetes cluster.
	KubernetesAPIChecks CategoryID = "kubernetes-api"

	// KubernetesVersionChecks validate that the cluster meets the minimum version
	// requirements.
	KubernetesVersionChecks CategoryID = "kubernetes-version"

	// LinkerdPreInstall* checks enabled by `linkerd check --pre`

	// LinkerdPreInstallChecks adds checks to validate that the control plane
	// namespace does not already exist, and that the user can create cluster-wide
	// resources, including ClusterRole, ClusterRoleBinding, and
	// CustomResourceDefinition, as well as namespace-wide resources, including
	// Service, Deployment, and ConfigMap. This check only runs as part of the set
	// of pre-install checks.
	// This check is dependent on the output of KubernetesAPIChecks, so those
	// checks must be added first.
	LinkerdPreInstallChecks CategoryID = "pre-kubernetes-setup"

	// LinkerdCRDChecks adds checks to validate that the control plane CRDs
	// exist. These checks can be run after installing the control plane CRDs
	// but before installing the control plane itself.
	LinkerdCRDChecks CategoryID = "linkerd-crd"

	// LinkerdConfigChecks enabled by `linkerd check config`

	// LinkerdConfigChecks adds a series of checks to validate that the Linkerd
	// namespace, RBAC, ServiceAccounts, and CRDs were successfully created.
	// These checks specifically validate that the `linkerd install config`
	// command succeeded in a multi-stage install, but also applies to a default
	// `linkerd install`.
	// These checks are dependent on the output of KubernetesAPIChecks, so those
	// checks must be added first.
	LinkerdConfigChecks CategoryID = "linkerd-config"

	// LinkerdIdentity checks the integrity of the mTLS certificates
	// that the control plane is configured with
	LinkerdIdentity CategoryID = "linkerd-identity"

	// LinkerdWebhooksAndAPISvcTLS checks the integrity of the TLS certificates
	// used by the injector and sp-validator webhooks and the tap API service
	LinkerdWebhooksAndAPISvcTLS CategoryID = "linkerd-webhooks-and-apisvc-tls"

	// LinkerdIdentityDataPlane checks the integrity of the mTLS
	// certificates that the proxies are configured with and tries to
	// report useful information with respect to whether the configuration
	// is compatible with the one of the control plane
	LinkerdIdentityDataPlane CategoryID = "linkerd-identity-data-plane"

	// LinkerdControlPlaneExistenceChecks adds a series of checks to validate that
	// the control plane namespace and controller pod exist.
	// These checks are dependent on the output of KubernetesAPIChecks, so those
	// checks must be added first.
	LinkerdControlPlaneExistenceChecks CategoryID = "linkerd-existence"

	// LinkerdVersionChecks adds a series of checks to query for the latest
	// version, and validate the CLI is up to date.
	LinkerdVersionChecks CategoryID = "linkerd-version"

	// LinkerdControlPlaneVersionChecks adds a series of checks to validate that
	// the control plane is running the latest available version.
	// These checks are dependent on the following:
	// 1) `latestVersions` from LinkerdVersionChecks
	// 2) `serverVersion` from `LinkerdControlPlaneExistenceChecks`
	LinkerdControlPlaneVersionChecks CategoryID = "control-plane-version"

	// LinkerdDataPlaneChecks adds data plane checks to validate that the
	// data plane namespace exists, and that the proxy containers are in a
	// ready state and running the latest available version. These checks
	// are dependent on the output of KubernetesAPIChecks and
	// `latestVersions` from LinkerdVersionChecks, so those checks must be
	// added first.
	LinkerdDataPlaneChecks CategoryID = "linkerd-data-plane"

	// LinkerdControlPlaneProxyChecks adds data plane checks to validate the
	// control-plane proxies. The checkers include running and version checks
	LinkerdControlPlaneProxyChecks CategoryID = "linkerd-control-plane-proxy"

	// LinkerdHAChecks adds checks to validate that the HA configuration
	// is correct. These checks are no ops if linkerd is not in HA mode
	LinkerdHAChecks CategoryID = "linkerd-ha-checks"

	// LinkerdCNIPluginChecks adds checks to validate that the CNI
	// plugin is installed and ready
	LinkerdCNIPluginChecks CategoryID = "linkerd-cni-plugin"

	// LinkerdOpaquePortsDefinitionChecks adds checks to validate that the
	// "opaque ports" annotation has been defined both in the service and the
	// corresponding pods
	LinkerdOpaquePortsDefinitionChecks CategoryID = "linkerd-opaque-ports-definition"

	// LinkerdExtensionChecks adds checks to validate configuration for all
	// extensions discovered in the cluster at runtime
	LinkerdExtensionChecks CategoryID = "linkerd-extension-checks"

	// LinkerdCNIResourceLabel is the label key that is used to identify
	// whether a Kubernetes resource is related to the install-cni command
	// The value is expected to be "true", "false" or "", where "false" and
	// "" are equal, making "false" the default
	LinkerdCNIResourceLabel = "linkerd.io/cni-resource"

	linkerdCNIDisabledSkipReason = "skipping check because CNI is not enabled"
	linkerdCNIResourceName       = "linkerd-cni"
	linkerdCNIConfigMapName      = "linkerd-cni-config"
	podCIDRUnavailableSkipReason = "skipping check because the nodes aren't exposing podCIDR"

	// Fixed typo: the message previously read "ConigMap".
	configMapDoesNotExistSkipReason = "skipping check because ConfigMap does not exist"

	proxyInjectorOldTLSSecretName = "linkerd-proxy-injector-tls"
	proxyInjectorTLSSecretName    = "linkerd-proxy-injector-k8s-tls"

	spValidatorOldTLSSecretName = "linkerd-sp-validator-tls"
	spValidatorTLSSecretName    = "linkerd-sp-validator-k8s-tls"

	policyValidatorTLSSecretName = "linkerd-policy-validator-k8s-tls"

	certOldKeyName = "crt.pem"
	certKeyName    = "tls.crt"

	keyOldKeyName = "key.pem"
	keyKeyName    = "tls.key"
)

// AllowedClockSkew sets the allowed skew in clock synchronization
// between the system running inject command and the node(s), being
// based on assumed node's heartbeat interval (5 minutes) plus default TLS
// clock skew allowance.
//
// TODO: Make this default value overridable, e.g. by CLI flag
const AllowedClockSkew = 5*time.Minute + tls.DefaultClockSkewAllowance

// linkerdHAControlPlaneComponents are the deployments expected to carry
// multiple replicas when linkerd runs in HA mode.
var linkerdHAControlPlaneComponents = []string{
	"linkerd-destination",
	"linkerd-identity",
	"linkerd-proxy-injector",
}

// ExpectedServiceAccountNames is a list of the service accounts that a healthy
// Linkerd installation should have. Note that linkerd-heartbeat is optional,
// so it doesn't appear here.
var ExpectedServiceAccountNames = []string{
	"linkerd-destination",
	"linkerd-identity",
	"linkerd-proxy-injector",
}

var (
	retryWindow = 5 * time.Second
	// RequestTimeout is the time it takes for a request to timeout
	RequestTimeout = 30 * time.Second
)

// Resource provides a way to describe a Kubernetes object, kind, and name.
// TODO: Consider sharing with the inject package's ResourceConfig.workload
// struct, as it wraps both runtime.Object and metav1.TypeMeta.
type Resource struct {
	groupVersionKind schema.GroupVersionKind
	name             string
}

// String outputs the resource in kind.group/name format, intended for
// `linkerd install`.
func (r *Resource) String() string {
	return fmt.Sprintf("%s/%s", strings.ToLower(r.groupVersionKind.GroupKind().String()), r.name)
}

// ResourceError provides a custom error type for resource existence checks,
// useful in printing detailed error messages in `linkerd check` and
// `linkerd install`.
type ResourceError struct {
	resourceName string
	Resources    []Resource
}

// Error satisfies the error interface for ResourceError. The output is intended
// for `linkerd check`.
func (e ResourceError) Error() string { names := []string{} for _, res := range e.Resources { names = append(names, res.name) } return fmt.Sprintf("%s found but should not exist: %s", e.resourceName, strings.Join(names, " ")) } // CategoryError provides a custom error type that also contains check category that emitted the error, // useful when needed to distinguish between errors from multiple categories type CategoryError struct { Category CategoryID Err error } // Error satisfies the error interface for CategoryError. func (e CategoryError) Error() string { return e.Err.Error() } // IsCategoryError returns true if passed in error is of type CategoryError and belong to the given category func IsCategoryError(err error, categoryID CategoryID) bool { var ce CategoryError if errors.As(err, &ce) { return ce.Category == categoryID } return false } // SkipError is returned by a check in case this check needs to be ignored. type SkipError struct { Reason string } // Error satisfies the error interface for SkipError. func (e SkipError) Error() string { return e.Reason } // VerboseSuccess implements the error interface but represents a success with // a message. type VerboseSuccess struct { Message string } // Error satisfies the error interface for VerboseSuccess. Since VerboseSuccess // does not actually represent a failure, this returns the empty string. 
func (e VerboseSuccess) Error() string { return "" } // Checker is a smallest unit performing a single check type Checker struct { // description is the short description that's printed to the command line // when the check is executed description string // hintAnchor, when appended to `HintBaseURL`, provides a URL to more // information about the check hintAnchor string // fatal indicates that all remaining checks should be aborted if this check // fails; it should only be used if subsequent checks cannot possibly succeed // (default false) fatal bool // warning indicates that if this check fails, it should be reported, but it // should not impact the overall outcome of the health check (default false) warning bool // retryDeadline establishes a deadline before which this check should be // retried; if the deadline has passed, the check fails (default: no retries) retryDeadline time.Time // surfaceErrorOnRetry indicates that the error message should be displayed // even if the check will be retried. This is useful if the error message // contains the current status of the check. 
surfaceErrorOnRetry bool // check is the function that's called to execute the check; if the function // returns an error, the check fails check func(context.Context) error } // NewChecker returns a new instance of checker type func NewChecker(description string) *Checker { return &Checker{ description: description, retryDeadline: time.Time{}, } } // WithHintAnchor returns a checker with the given hint anchor func (c *Checker) WithHintAnchor(hint string) *Checker { c.hintAnchor = hint return c } // Fatal returns a checker with the fatal field set func (c *Checker) Fatal() *Checker { c.fatal = true return c } // Warning returns a checker with the warning field set func (c *Checker) Warning() *Checker { c.warning = true return c } // WithRetryDeadline returns a checker with the provided retry timeout func (c *Checker) WithRetryDeadline(retryDeadLine time.Time) *Checker { c.retryDeadline = retryDeadLine return c } // SurfaceErrorOnRetry returns a checker with the surfaceErrorOnRetry set func (c *Checker) SurfaceErrorOnRetry() *Checker { c.surfaceErrorOnRetry = true return c } // WithCheck returns a checker with the provided check func func (c *Checker) WithCheck(check func(context.Context) error) *Checker { c.check = check return c } // CheckResult encapsulates a check's identifying information and output // Note there exists an analogous user-facing type, `cmd.check`, for output via // `linkerd check -o json`. type CheckResult struct { Category CategoryID Description string HintURL string Retry bool Warning bool Err error } // CheckObserver receives the results of each check. 
type CheckObserver func(*CheckResult)

// Category is a group of checkers, to check a particular component or use-case
type Category struct {
	// ID identifies the category; used to select which categories run.
	ID CategoryID
	// checkers are executed in order when the category is enabled.
	checkers []Checker
	// enabled indicates whether this category's checkers should run.
	enabled bool
	// hintBaseURL provides a base URL with more information
	// about the check
	hintBaseURL string
}

// NewCategory returns an instance of Category with the specified data.
// The hintBaseURL defaults to the one derived from the CLI's own version.
func NewCategory(id CategoryID, checkers []Checker, enabled bool) *Category {
	return &Category{
		ID:          id,
		checkers:    checkers,
		enabled:     enabled,
		hintBaseURL: HintBaseURL(version.Version),
	}
}

// WithHintBaseURL returns a Category with the provided hintBaseURL
func (c *Category) WithHintBaseURL(hintBaseURL string) *Category {
	c.hintBaseURL = hintBaseURL
	return c
}

// Options specifies configuration for a HealthChecker.
type Options struct {
	IsMainCheckCommand    bool
	ControlPlaneNamespace string
	CNINamespace          string
	DataPlaneNamespace    string
	KubeConfig            string
	KubeContext           string
	Impersonate           string
	ImpersonateGroup      []string
	APIAddr               string
	VersionOverride       string
	RetryDeadline         time.Time
	CNIEnabled            bool
	InstallManifest       string
	CRDManifest           string
	ChartValues           *l5dcharts.Values
}

// HealthChecker encapsulates all health check checkers, and clients required to
// perform those checks.
type HealthChecker struct { categories []*Category *Options // these fields are set in the process of running checks kubeAPI *k8s.KubernetesAPI kubeVersion *k8sVersion.Info controlPlanePods []corev1.Pod LatestVersions version.Channels serverVersion string linkerdConfig *l5dcharts.Values uuid string issuerCert *tls.Cred trustAnchors []*x509.Certificate cniDaemonSet *appsv1.DaemonSet } // Runner is implemented by any health-checkers that can be triggered with RunChecks() type Runner interface { RunChecks(observer CheckObserver) (bool, bool) } // NewHealthChecker returns an initialized HealthChecker func NewHealthChecker(categoryIDs []CategoryID, options *Options) *HealthChecker { hc := &HealthChecker{ Options: options, } hc.categories = hc.allCategories() checkMap := map[CategoryID]struct{}{} for _, category := range categoryIDs { checkMap[category] = struct{}{} } for i := range hc.categories { if _, ok := checkMap[hc.categories[i].ID]; ok { hc.categories[i].enabled = true } } return hc } func NewWithCoreChecks(options *Options) *HealthChecker { checks := []CategoryID{KubernetesAPIChecks, LinkerdControlPlaneExistenceChecks} return NewHealthChecker(checks, options) } // InitializeKubeAPIClient creates a client for the HealthChecker. It avoids // having to require the KubernetesAPIChecks check to run in order for the // HealthChecker to run other checks. func (hc *HealthChecker) InitializeKubeAPIClient() error { k8sAPI, err := k8s.NewAPI(hc.KubeConfig, hc.KubeContext, hc.Impersonate, hc.ImpersonateGroup, RequestTimeout) if err != nil { return err } hc.kubeAPI = k8sAPI return nil } // InitializeLinkerdGlobalConfig populates the linkerd config object in the // healthchecker. 
It avoids having to require the LinkerdControlPlaneExistenceChecks // check to run before running other checks func (hc *HealthChecker) InitializeLinkerdGlobalConfig(ctx context.Context) error { uuid, l5dConfig, err := hc.checkLinkerdConfigConfigMap(ctx) if err != nil { return err } if l5dConfig != nil { hc.CNIEnabled = l5dConfig.CNIEnabled } hc.uuid = uuid hc.linkerdConfig = l5dConfig return nil } // AppendCategories returns a HealthChecker instance appending the provided Categories func (hc *HealthChecker) AppendCategories(categories ...*Category) *HealthChecker { hc.categories = append(hc.categories, categories...) return hc } // GetCategories returns all the categories func (hc *HealthChecker) GetCategories() []*Category { return hc.categories } // allCategories is the global, ordered list of all checkers, grouped by // category. This method is attached to the HealthChecker struct because the // checkers directly reference other members of the struct, such as kubeAPI, // controlPlanePods, etc. // // Ordering is important because checks rely on specific `HealthChecker` members // getting populated by earlier checks, such as kubeAPI, controlPlanePods, etc. 
// // Note that all checks should include a `hintAnchor` with a corresponding section // in the linkerd check faq: // https://linkerd.io/{major-version}/checks/# func (hc *HealthChecker) allCategories() []*Category { return []*Category{ NewCategory( KubernetesAPIChecks, []Checker{ { description: "can initialize the client", hintAnchor: "k8s-api", fatal: true, check: func(context.Context) (err error) { err = hc.InitializeKubeAPIClient() return }, }, { description: "can query the Kubernetes API", hintAnchor: "k8s-api", fatal: true, check: func(ctx context.Context) (err error) { hc.kubeVersion, err = hc.kubeAPI.GetVersionInfo() return }, }, }, false, ), NewCategory( KubernetesVersionChecks, []Checker{ { description: "is running the minimum Kubernetes API version", hintAnchor: "k8s-version", check: func(context.Context) error { return hc.kubeAPI.CheckVersion(hc.kubeVersion) }, }, }, false, ), NewCategory( LinkerdPreInstallChecks, []Checker{ { description: "control plane namespace does not already exist", hintAnchor: "pre-ns", check: func(ctx context.Context) error { return hc.CheckNamespace(ctx, hc.ControlPlaneNamespace, false) }, }, { description: "can create non-namespaced resources", hintAnchor: "pre-k8s-cluster-k8s", check: func(ctx context.Context) error { return hc.checkCanCreateNonNamespacedResources(ctx) }, }, { description: "can create ServiceAccounts", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "serviceaccounts") }, }, { description: "can create Services", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "services") }, }, { description: "can create Deployments", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "apps", "v1", "deployments") }, }, { description: "can create CronJobs", hintAnchor: "pre-k8s", check: func(ctx 
context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "batch", "v1beta1", "cronjobs") }, }, { description: "can create ConfigMaps", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "configmaps") }, }, { description: "can create Secrets", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanCreate(ctx, hc.ControlPlaneNamespace, "", "v1", "secrets") }, }, { description: "can read Secrets", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkCanGet(ctx, hc.ControlPlaneNamespace, "", "v1", "secrets") }, }, { description: "can read extension-apiserver-authentication configmap", hintAnchor: "pre-k8s", check: func(ctx context.Context) error { return hc.checkExtensionAPIServerAuthentication(ctx) }, }, { description: "no clock skew detected", hintAnchor: "pre-k8s-clock-skew", warning: true, check: func(ctx context.Context) error { return hc.checkClockSkew(ctx) }, }, }, false, ), NewCategory( LinkerdCRDChecks, []Checker{ { description: "control plane CustomResourceDefinitions exist", hintAnchor: "l5d-existence-crd", fatal: true, retryDeadline: hc.RetryDeadline, check: func(ctx context.Context) error { return CheckCustomResourceDefinitions(ctx, hc.kubeAPI, hc.CRDManifest) }, }, }, false, ), NewCategory( LinkerdControlPlaneExistenceChecks, []Checker{ { description: "'linkerd-config' config map exists", hintAnchor: "l5d-existence-linkerd-config", fatal: true, check: func(ctx context.Context) (err error) { err = hc.InitializeLinkerdGlobalConfig(ctx) return }, }, { description: "heartbeat ServiceAccount exist", hintAnchor: "l5d-existence-sa", fatal: true, check: func(ctx context.Context) error { if hc.isHeartbeatDisabled() { return nil } return hc.checkServiceAccounts(ctx, []string{"linkerd-heartbeat"}, hc.ControlPlaneNamespace, controlPlaneComponentsSelector()) }, }, { description: "control plane replica sets are 
ready", hintAnchor: "l5d-existence-replicasets", retryDeadline: hc.RetryDeadline, fatal: true, check: func(ctx context.Context) error { controlPlaneReplicaSet, err := hc.kubeAPI.GetReplicaSets(ctx, hc.ControlPlaneNamespace) if err != nil { return err } return checkControlPlaneReplicaSets(controlPlaneReplicaSet) }, }, { description: "no unschedulable pods", hintAnchor: "l5d-existence-unschedulable-pods", retryDeadline: hc.RetryDeadline, surfaceErrorOnRetry: true, warning: true, check: func(ctx context.Context) error { // do not save this into hc.controlPlanePods, as this check may // succeed prior to all expected control plane pods being up controlPlanePods, err := hc.kubeAPI.GetPodsByNamespace(ctx, hc.ControlPlaneNamespace) if err != nil { return err } return checkUnschedulablePods(controlPlanePods) }, }, { description: "control plane pods are ready", hintAnchor: "l5d-api-control-ready", retryDeadline: hc.RetryDeadline, surfaceErrorOnRetry: true, fatal: true, check: func(ctx context.Context) error { var err error podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{ LabelSelector: k8s.ControllerComponentLabel, }) if err != nil { return err } hc.controlPlanePods = podList.Items return validateControlPlanePods(hc.controlPlanePods) }, }, { description: "cluster networks contains all node podCIDRs", hintAnchor: "l5d-cluster-networks-cidr", check: func(ctx context.Context) error { // We explicitly initialize the config here so that we dont rely on the "l5d-existence-linkerd-config" // check to set the clusterNetworks value, since `linkerd check config` will skip that check. 
err := hc.InitializeLinkerdGlobalConfig(ctx) if err != nil { return err } return hc.checkClusterNetworks(ctx) }, }, { description: "cluster networks contains all pods", hintAnchor: "l5d-cluster-networks-pods", check: func(ctx context.Context) error { return hc.checkClusterNetworksContainAllPods(ctx) }, }, { description: "cluster networks contains all services", hintAnchor: "l5d-cluster-networks-pods", check: func(ctx context.Context) error { return hc.checkClusterNetworksContainAllServices(ctx) }, }, }, false, ), NewCategory( LinkerdConfigChecks, []Checker{ { description: "control plane Namespace exists", hintAnchor: "l5d-existence-ns", fatal: true, check: func(ctx context.Context) error { return hc.CheckNamespace(ctx, hc.ControlPlaneNamespace, true) }, }, { description: "control plane ClusterRoles exist", hintAnchor: "l5d-existence-cr", fatal: true, check: func(ctx context.Context) error { return hc.checkClusterRoles(ctx, true, hc.expectedRBACNames(), controlPlaneComponentsSelector()) }, }, { description: "control plane ClusterRoleBindings exist", hintAnchor: "l5d-existence-crb", fatal: true, check: func(ctx context.Context) error { return hc.checkClusterRoleBindings(ctx, true, hc.expectedRBACNames(), controlPlaneComponentsSelector()) }, }, { description: "control plane ServiceAccounts exist", hintAnchor: "l5d-existence-sa", fatal: true, check: func(ctx context.Context) error { return hc.checkServiceAccounts(ctx, ExpectedServiceAccountNames, hc.ControlPlaneNamespace, controlPlaneComponentsSelector()) }, }, { description: "control plane CustomResourceDefinitions exist", hintAnchor: "l5d-existence-crd", fatal: true, check: func(ctx context.Context) error { return CheckCustomResourceDefinitions(ctx, hc.kubeAPI, hc.CRDManifest) }, }, { description: "control plane MutatingWebhookConfigurations exist", hintAnchor: "l5d-existence-mwc", fatal: true, check: func(ctx context.Context) error { return hc.checkMutatingWebhookConfigurations(ctx, true) }, }, { description: 
"control plane ValidatingWebhookConfigurations exist", hintAnchor: "l5d-existence-vwc", fatal: true, check: func(ctx context.Context) error { return hc.checkValidatingWebhookConfigurations(ctx, true) }, }, { description: "proxy-init container runs as root user if docker container runtime is used", hintAnchor: "l5d-proxy-init-run-as-root", fatal: false, check: func(ctx context.Context) error { // We explicitly initialize the config here so that we dont rely on the "l5d-existence-linkerd-config" // check to set the clusterNetworks value, since `linkerd check config` will skip that check. err := hc.InitializeLinkerdGlobalConfig(ctx) if err != nil { if kerrors.IsNotFound(err) { return SkipError{Reason: configMapDoesNotExistSkipReason} } return err } config := hc.LinkerdConfig() runAsRoot := config != nil && config.ProxyInit != nil && config.ProxyInit.RunAsRoot if !runAsRoot { return CheckNodesHaveNonDockerRuntime(ctx, hc.KubeAPIClient()) } return nil }, }, }, false, ), NewCategory( LinkerdCNIPluginChecks, []Checker{ { description: "cni plugin ConfigMap exists", hintAnchor: "cni-plugin-cm-exists", fatal: true, check: func(ctx context.Context) error { if !hc.CNIEnabled { return SkipError{Reason: linkerdCNIDisabledSkipReason} } _, err := hc.kubeAPI.CoreV1().ConfigMaps(hc.CNINamespace).Get(ctx, linkerdCNIConfigMapName, metav1.GetOptions{}) return err }, }, { description: "cni plugin ClusterRole exists", hintAnchor: "cni-plugin-cr-exists", fatal: true, check: func(ctx context.Context) error { if !hc.CNIEnabled { return SkipError{Reason: linkerdCNIDisabledSkipReason} } _, err := hc.kubeAPI.RbacV1().ClusterRoles().Get(ctx, linkerdCNIResourceName, metav1.GetOptions{}) if kerrors.IsNotFound(err) { return fmt.Errorf("missing ClusterRole: %s", linkerdCNIResourceName) } return err }, }, { description: "cni plugin ClusterRoleBinding exists", hintAnchor: "cni-plugin-crb-exists", fatal: true, check: func(ctx context.Context) error { if !hc.CNIEnabled { return SkipError{Reason: 
linkerdCNIDisabledSkipReason} } _, err := hc.kubeAPI.RbacV1().ClusterRoleBindings().Get(ctx, linkerdCNIResourceName, metav1.GetOptions{}) if kerrors.IsNotFound(err) { return fmt.Errorf("missing ClusterRoleBinding: %s", linkerdCNIResourceName) } return err }, }, { description: "cni plugin ServiceAccount exists", hintAnchor: "cni-plugin-sa-exists", fatal: true, check: func(ctx context.Context) error { if !hc.CNIEnabled { return SkipError{Reason: linkerdCNIDisabledSkipReason} } _, err := hc.kubeAPI.CoreV1().ServiceAccounts(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{}) if kerrors.IsNotFound(err) { return fmt.Errorf("missing ServiceAccount: %s", linkerdCNIResourceName) } return err }, }, { description: "cni plugin DaemonSet exists", hintAnchor: "cni-plugin-ds-exists", fatal: true, check: func(ctx context.Context) (err error) { if !hc.CNIEnabled { return SkipError{Reason: linkerdCNIDisabledSkipReason} } hc.cniDaemonSet, err = hc.kubeAPI.Interface.AppsV1().DaemonSets(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{}) if kerrors.IsNotFound(err) { return fmt.Errorf("missing DaemonSet: %s", linkerdCNIResourceName) } return err }, }, { description: "cni plugin pod is running on all nodes", hintAnchor: "cni-plugin-ready", retryDeadline: hc.RetryDeadline, surfaceErrorOnRetry: true, fatal: true, check: func(ctx context.Context) (err error) { if !hc.CNIEnabled { return SkipError{Reason: linkerdCNIDisabledSkipReason} } hc.cniDaemonSet, err = hc.kubeAPI.Interface.AppsV1().DaemonSets(hc.CNINamespace).Get(ctx, linkerdCNIResourceName, metav1.GetOptions{}) if kerrors.IsNotFound(err) { return fmt.Errorf("missing DaemonSet: %s", linkerdCNIResourceName) } scheduled := hc.cniDaemonSet.Status.DesiredNumberScheduled ready := hc.cniDaemonSet.Status.NumberReady if scheduled != ready { return fmt.Errorf("number ready: %d, number scheduled: %d", ready, scheduled) } return nil }, }, }, false, ), NewCategory( LinkerdIdentity, []Checker{ { description: 
"certificate config is valid", hintAnchor: "l5d-identity-cert-config-valid", fatal: true, check: func(ctx context.Context) (err error) { hc.issuerCert, hc.trustAnchors, err = hc.checkCertificatesConfig(ctx) return }, }, { description: "trust anchors are using supported crypto algorithm", hintAnchor: "l5d-identity-trustAnchors-use-supported-crypto", fatal: true, check: func(context.Context) error { var invalidAnchors []string for _, anchor := range hc.trustAnchors { if err := issuercerts.CheckTrustAnchorAlgoRequirements(anchor); err != nil { invalidAnchors = append(invalidAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err)) } } if len(invalidAnchors) > 0 { return fmt.Errorf("Invalid trustAnchors:\n\t%s", strings.Join(invalidAnchors, "\n\t")) } return nil }, }, { description: "trust anchors are within their validity period", hintAnchor: "l5d-identity-trustAnchors-are-time-valid", fatal: true, check: func(ctx context.Context) error { var expiredAnchors []string for _, anchor := range hc.trustAnchors { if err := issuercerts.CheckCertValidityPeriod(anchor); err != nil { expiredAnchors = append(expiredAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err)) } } if len(expiredAnchors) > 0 { return fmt.Errorf("Invalid anchors:\n\t%s", strings.Join(expiredAnchors, "\n\t")) } return nil }, }, { description: "trust anchors are valid for at least 60 days", hintAnchor: "l5d-identity-trustAnchors-not-expiring-soon", warning: true, check: func(ctx context.Context) error { var expiringAnchors []string for _, anchor := range hc.trustAnchors { if err := issuercerts.CheckExpiringSoon(anchor); err != nil { expiringAnchors = append(expiringAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err)) } } if len(expiringAnchors) > 0 { return fmt.Errorf("Anchors expiring soon:\n\t%s", strings.Join(expiringAnchors, "\n\t")) } return nil }, }, { description: "issuer cert is using supported crypto 
algorithm", hintAnchor: "l5d-identity-issuer-cert-uses-supported-crypto", fatal: true, check: func(context.Context) error { if err := issuercerts.CheckIssuerCertAlgoRequirements(hc.issuerCert.Certificate); err != nil { return fmt.Errorf("issuer certificate %w", err) } return nil }, }, { description: "issuer cert is within its validity period", hintAnchor: "l5d-identity-issuer-cert-is-time-valid", fatal: true, check: func(ctx context.Context) error { if err := issuercerts.CheckCertValidityPeriod(hc.issuerCert.Certificate); err != nil { return fmt.Errorf("issuer certificate is %w", err) } return nil }, }, { description: "issuer cert is valid for at least 60 days", warning: true, hintAnchor: "l5d-identity-issuer-cert-not-expiring-soon", check: func(context.Context) error { if err := issuercerts.CheckExpiringSoon(hc.issuerCert.Certificate); err != nil { return fmt.Errorf("issuer certificate %w", err) } return nil }, }, { description: "issuer cert is issued by the trust anchor", hintAnchor: "l5d-identity-issuer-cert-issued-by-trust-anchor", check: func(ctx context.Context) error { return hc.issuerCert.Verify(tls.CertificatesToPool(hc.trustAnchors), "", time.Time{}) }, }, }, false, ), NewCategory( LinkerdWebhooksAndAPISvcTLS, []Checker{ { description: "proxy-injector webhook has valid cert", hintAnchor: "l5d-proxy-injector-webhook-cert-valid", fatal: true, check: func(ctx context.Context) (err error) { anchors, err := hc.fetchProxyInjectorCaBundle(ctx) if err != nil { return err } cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorTLSSecretName) if kerrors.IsNotFound(err) { cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorOldTLSSecretName) } if err != nil { return err } identityName := fmt.Sprintf("linkerd-proxy-injector.%s.svc", hc.ControlPlaneNamespace) return hc.CheckCertAndAnchors(cert, anchors, identityName) }, }, { description: "proxy-injector cert is valid for at least 60 days", warning: true, 
hintAnchor: "l5d-proxy-injector-webhook-cert-not-expiring-soon", check: func(ctx context.Context) error { cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorTLSSecretName) if kerrors.IsNotFound(err) { cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, proxyInjectorOldTLSSecretName) } if err != nil { return err } return hc.CheckCertAndAnchorsExpiringSoon(cert) }, }, { description: "sp-validator webhook has valid cert", hintAnchor: "l5d-sp-validator-webhook-cert-valid", fatal: true, check: func(ctx context.Context) (err error) { anchors, err := hc.fetchWebhookCaBundle(ctx, k8s.SPValidatorWebhookConfigName) if err != nil { return err } cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, spValidatorTLSSecretName) if kerrors.IsNotFound(err) { cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, spValidatorOldTLSSecretName) } if err != nil { return err } identityName := fmt.Sprintf("linkerd-sp-validator.%s.svc", hc.ControlPlaneNamespace) return hc.CheckCertAndAnchors(cert, anchors, identityName) }, }, { description: "sp-validator cert is valid for at least 60 days", warning: true, hintAnchor: "l5d-sp-validator-webhook-cert-not-expiring-soon", check: func(ctx context.Context) error { cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, spValidatorTLSSecretName) if kerrors.IsNotFound(err) { cert, err = hc.FetchCredsFromOldSecret(ctx, hc.ControlPlaneNamespace, spValidatorOldTLSSecretName) } if err != nil { return err } return hc.CheckCertAndAnchorsExpiringSoon(cert) }, }, { description: "policy-validator webhook has valid cert", hintAnchor: "l5d-policy-validator-webhook-cert-valid", fatal: true, check: func(ctx context.Context) (err error) { anchors, err := hc.fetchWebhookCaBundle(ctx, k8s.PolicyValidatorWebhookConfigName) if kerrors.IsNotFound(err) { return SkipError{Reason: "policy-validator not installed"} } if err != nil { return err } cert, err := 
hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, policyValidatorTLSSecretName) if kerrors.IsNotFound(err) { return SkipError{Reason: "policy-validator not installed"} } if err != nil { return err } identityName := fmt.Sprintf("linkerd-policy-validator.%s.svc", hc.ControlPlaneNamespace) return hc.CheckCertAndAnchors(cert, anchors, identityName) }, }, { description: "policy-validator cert is valid for at least 60 days", warning: true, hintAnchor: "l5d-policy-validator-webhook-cert-not-expiring-soon", check: func(ctx context.Context) error { cert, err := hc.FetchCredsFromSecret(ctx, hc.ControlPlaneNamespace, policyValidatorTLSSecretName) if kerrors.IsNotFound(err) { return SkipError{Reason: "policy-validator not installed"} } if err != nil { return err } return hc.CheckCertAndAnchorsExpiringSoon(cert) }, }, }, false, ), NewCategory( LinkerdIdentityDataPlane, []Checker{ { description: "data plane proxies certificate match CA", hintAnchor: "l5d-identity-data-plane-proxies-certs-match-ca", warning: true, check: func(ctx context.Context) error { return hc.checkDataPlaneProxiesCertificate(ctx) }, }, }, false, ), NewCategory( LinkerdVersionChecks, []Checker{ { description: "can determine the latest version", hintAnchor: "l5d-version-latest", warning: true, check: func(ctx context.Context) (err error) { if hc.VersionOverride != "" { hc.LatestVersions, err = version.NewChannels(hc.VersionOverride) } else { uuid := "unknown" if hc.uuid != "" { uuid = hc.uuid } hc.LatestVersions, err = version.GetLatestVersions(ctx, uuid, "cli") } return }, }, { description: "cli is up-to-date", hintAnchor: "l5d-version-cli", warning: true, check: func(context.Context) error { return hc.LatestVersions.Match(version.Version) }, }, }, false, ), NewCategory( LinkerdControlPlaneVersionChecks, []Checker{ { description: "can retrieve the control plane version", hintAnchor: "l5d-version-control", retryDeadline: hc.RetryDeadline, fatal: true, check: func(ctx context.Context) (err error) { 
hc.serverVersion, err = GetServerVersion(ctx, hc.ControlPlaneNamespace, hc.kubeAPI) return }, }, { description: "control plane is up-to-date", hintAnchor: "l5d-version-control", warning: true, check: func(context.Context) error { return hc.LatestVersions.Match(hc.serverVersion) }, }, { description: "control plane and cli versions match", hintAnchor: "l5d-version-control", warning: true, check: func(context.Context) error { if hc.serverVersion != version.Version { return fmt.Errorf("control plane running %s but cli running %s", hc.serverVersion, version.Version) } return nil }, }, }, false, ), NewCategory( LinkerdControlPlaneProxyChecks, []Checker{ { description: "control plane proxies are healthy", hintAnchor: "l5d-cp-proxy-healthy", retryDeadline: hc.RetryDeadline, surfaceErrorOnRetry: true, fatal: true, check: func(ctx context.Context) error { return hc.CheckProxyHealth(ctx, hc.ControlPlaneNamespace, hc.ControlPlaneNamespace) }, }, { description: "control plane proxies are up-to-date", hintAnchor: "l5d-cp-proxy-version", warning: true, check: func(ctx context.Context) error { podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel}) if err != nil { return err } return hc.CheckProxyVersionsUpToDate(podList.Items) }, }, { description: "control plane proxies and cli versions match", hintAnchor: "l5d-cp-proxy-cli-version", warning: true, check: func(ctx context.Context) error { podList, err := hc.kubeAPI.CoreV1().Pods(hc.ControlPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel}) if err != nil { return err } return CheckIfProxyVersionsMatchWithCLI(podList.Items) }, }, }, false, ), NewCategory( LinkerdDataPlaneChecks, []Checker{ { description: "data plane namespace exists", hintAnchor: "l5d-data-plane-exists", fatal: true, check: func(ctx context.Context) error { if hc.DataPlaneNamespace == "" { // when checking proxies in all namespaces, this check is a no-op 
return nil } return hc.CheckNamespace(ctx, hc.DataPlaneNamespace, true) }, }, { description: "data plane proxies are ready", hintAnchor: "l5d-data-plane-ready", retryDeadline: hc.RetryDeadline, fatal: true, check: func(ctx context.Context) error { pods, err := hc.GetDataPlanePods(ctx) if err != nil { return err } return CheckPodsRunning(pods, hc.DataPlaneNamespace) }, }, { description: "data plane is up-to-date", hintAnchor: "l5d-data-plane-version", warning: true, check: func(ctx context.Context) error { pods, err := hc.GetDataPlanePods(ctx) if err != nil { return err } return hc.CheckProxyVersionsUpToDate(pods) }, }, { description: "data plane and cli versions match", hintAnchor: "l5d-data-plane-cli-version", warning: true, check: func(ctx context.Context) error { pods, err := hc.GetDataPlanePods(ctx) if err != nil { return err } return CheckIfProxyVersionsMatchWithCLI(pods) }, }, { description: "data plane pod labels are configured correctly", hintAnchor: "l5d-data-plane-pod-labels", warning: true, check: func(ctx context.Context) error { pods, err := hc.GetDataPlanePods(ctx) if err != nil { return err } return checkMisconfiguredPodsLabels(pods) }, }, { description: "data plane service labels are configured correctly", hintAnchor: "l5d-data-plane-services-labels", warning: true, check: func(ctx context.Context) error { services, err := hc.GetServices(ctx) if err != nil { return err } return checkMisconfiguredServiceLabels(services) }, }, { description: "data plane service annotations are configured correctly", hintAnchor: "l5d-data-plane-services-annotations", warning: true, check: func(ctx context.Context) error { services, err := hc.GetServices(ctx) if err != nil { return err } return checkMisconfiguredServiceAnnotations(services) }, }, { description: "opaque ports are properly annotated", hintAnchor: "linkerd-opaque-ports-definition", warning: true, check: func(ctx context.Context) error { return hc.checkMisconfiguredOpaquePortAnnotations(ctx) }, }, }, false, 
), NewCategory( LinkerdHAChecks, []Checker{ { description: "multiple replicas of control plane pods", hintAnchor: "l5d-control-plane-replicas", retryDeadline: hc.RetryDeadline, warning: true, check: func(ctx context.Context) error { if hc.isHA() { return hc.checkMinReplicasAvailable(ctx) } return SkipError{Reason: "not run for non HA installs"} }, }, }, false, ), NewCategory( LinkerdExtensionChecks, []Checker{ { description: "namespace configuration for extensions", warning: true, hintAnchor: "l5d-extension-namespaces", check: func(ctx context.Context) error { return hc.checkExtensionNsLabels(ctx) }, }, }, false, ), } } // CheckProxyVersionsUpToDate checks if all the proxies are on the latest // installed version func (hc *HealthChecker) CheckProxyVersionsUpToDate(pods []corev1.Pod) error { return CheckProxyVersionsUpToDate(pods, hc.LatestVersions) } // CheckProxyVersionsUpToDate checks if all the proxies are on the latest // installed version func CheckProxyVersionsUpToDate(pods []corev1.Pod, versions version.Channels) error { outdatedPods := []string{} for _, pod := range pods { status := k8s.GetPodStatus(pod) if status == string(corev1.PodRunning) { proxyVersion := k8s.GetProxyVersion(pod) if proxyVersion == "" { continue } if err := versions.Match(proxyVersion); err != nil { outdatedPods = append(outdatedPods, fmt.Sprintf("\t* %s (%s)", pod.Name, proxyVersion)) } } } if versions.Empty() { return errors.New("unable to determine version channel") } if len(outdatedPods) > 0 { podList := strings.Join(outdatedPods, "\n") return fmt.Errorf("some proxies are not running the current version:\n%s", podList) } return nil } // CheckIfProxyVersionsMatchWithCLI checks if the latest proxy version // matches that of the CLI func CheckIfProxyVersionsMatchWithCLI(pods []corev1.Pod) error { for _, pod := range pods { status := k8s.GetPodStatus(pod) proxyVersion := k8s.GetProxyVersion(pod) if status == string(corev1.PodRunning) && proxyVersion != "" && proxyVersion != 
version.Version { return fmt.Errorf("%s running %s but cli running %s", pod.Name, proxyVersion, version.Version) } } return nil } // CheckCertAndAnchors checks if the given cert and anchors are valid func (hc *HealthChecker) CheckCertAndAnchors(cert *tls.Cred, trustAnchors []*x509.Certificate, identityName string) error { // check anchors time validity var expiredAnchors []string for _, anchor := range trustAnchors { if err := issuercerts.CheckCertValidityPeriod(anchor); err != nil { expiredAnchors = append(expiredAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err)) } } if len(expiredAnchors) > 0 { return fmt.Errorf("anchors not within their validity period:\n\t%s", strings.Join(expiredAnchors, "\n\t")) } // check cert validity if err := issuercerts.CheckCertValidityPeriod(cert.Certificate); err != nil { return fmt.Errorf("certificate is %w", err) } if err := cert.Verify(tls.CertificatesToPool(trustAnchors), identityName, time.Time{}); err != nil { return fmt.Errorf("cert is not issued by the trust anchor: %w", err) } return nil } // CheckProxyHealth checks for the data-plane proxies health in the given namespace // These checks consist of status and identity func (hc *HealthChecker) CheckProxyHealth(ctx context.Context, controlPlaneNamespace, namespace string) error { podList, err := hc.kubeAPI.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel}) if err != nil { return err } // Validate the status of the pods err = CheckPodsRunning(podList.Items, controlPlaneNamespace) if err != nil { return err } // Check proxy certificates return checkPodsProxiesCertificate(ctx, *hc.kubeAPI, namespace, controlPlaneNamespace) } // CheckCertAndAnchorsExpiringSoon checks if the given cert and anchors expire soon, and returns an // error if they do. 
func (hc *HealthChecker) CheckCertAndAnchorsExpiringSoon(cert *tls.Cred) error {
	// check anchors not expiring soon
	var expiringAnchors []string
	for _, anchor := range cert.TrustChain {
		anchor := anchor // pin loop variable for the closure-free but pointer-taking call below
		if err := issuercerts.CheckExpiringSoon(anchor); err != nil {
			expiringAnchors = append(expiringAnchors, fmt.Sprintf("* %v %s %s", anchor.SerialNumber, anchor.Subject.CommonName, err))
		}
	}
	// Report every expiring anchor at once rather than failing on the first.
	if len(expiringAnchors) > 0 {
		return fmt.Errorf("Anchors expiring soon:\n\t%s", strings.Join(expiringAnchors, "\n\t"))
	}

	// check cert not expiring soon
	if err := issuercerts.CheckExpiringSoon(cert.Certificate); err != nil {
		return fmt.Errorf("certificate %w", err)
	}
	return nil
}

// CheckAPIService checks the status of the given API Service and returns an error if it's not running
func (hc *HealthChecker) CheckAPIService(ctx context.Context, serviceName string) error {
	apiServiceClient, err := apiregistrationv1client.NewForConfig(hc.kubeAPI.Config)
	if err != nil {
		return err
	}

	apiStatus, err := apiServiceClient.APIServices().Get(ctx, serviceName, metav1.GetOptions{})
	if err != nil {
		return err
	}

	// Only the "Available" condition is consulted; its Reason/Message are
	// surfaced verbatim when the service is not available.
	for _, condition := range apiStatus.Status.Conditions {
		if condition.Type == "Available" {
			if condition.Status == "True" {
				return nil
			}
			return fmt.Errorf("%s: %s", condition.Reason, condition.Message)
		}
	}

	// No "Available" condition was present at all.
	return fmt.Errorf("%s service not available", apiStatus.Name)
}

// checkMinReplicasAvailable verifies that every HA control-plane deployment
// has more than one available replica, returning an error listing the
// deployments that do not.
func (hc *HealthChecker) checkMinReplicasAvailable(ctx context.Context) error {
	faulty := []string{}

	for _, component := range linkerdHAControlPlaneComponents {
		conf, err := hc.kubeAPI.AppsV1().Deployments(hc.ControlPlaneNamespace).Get(ctx, component, metav1.GetOptions{})
		if err != nil {
			return err
		}

		// HA requires at least two available replicas per component.
		if conf.Status.AvailableReplicas <= 1 {
			faulty = append(faulty, component)
		}
	}

	if len(faulty) > 0 {
		return fmt.Errorf("not enough replicas available for %v", faulty)
	}
	return nil
}

// RunChecks runs all configured checkers, and passes the results of each
// check to the observer. If a check fails and is marked as fatal, then all
// remaining checks are skipped. If at least one check fails, RunChecks returns
// false; if all checks passed, RunChecks returns true. Checks which are
// designated as warnings will not cause RunCheck to return false, however.
func (hc *HealthChecker) RunChecks(observer CheckObserver) (bool, bool) {
	success := true
	warning := false
	for _, c := range hc.categories {
		if c.enabled {
			for _, checker := range c.checkers {
				checker := checker // pin
				if checker.check != nil {
					if !hc.runCheck(c, &checker, observer) {
						if !checker.warning {
							success = false
						} else {
							warning = true
						}
						// Fatal failure: abort the remaining checks immediately.
						if checker.fatal {
							return success, warning
						}
					}
				}
			}
		}
	}

	return success, warning
}

// RunWithExitOnError runs the checks and terminates the process (exit code 1)
// on the first non-warning failure, printing a category-specific hint to
// stderr. Retry notifications are reported but do not cause an exit.
func (hc *HealthChecker) RunWithExitOnError() (bool, bool) {
	return hc.RunChecks(func(result *CheckResult) {
		if result.Retry {
			fmt.Fprintln(os.Stderr, "Waiting for control plane to become available")
			return
		}

		if result.Err != nil && !result.Warning {
			var msg string
			switch result.Category {
			case KubernetesAPIChecks:
				msg = "Cannot connect to Kubernetes"
			case LinkerdControlPlaneExistenceChecks:
				msg = "Cannot find Linkerd"
			}
			fmt.Fprintf(os.Stderr, "%s: %s\nValidate the install with: 'linkerd check'\n", msg, result.Err)
			os.Exit(1)
		}
	})
}

// LinkerdConfig gets the Linkerd configuration values.
func (hc *HealthChecker) LinkerdConfig() *l5dcharts.Values {
	return hc.linkerdConfig
}

// runCheck executes a single checker, retrying until its retryDeadline on
// failure, and reports each attempt's result to the observer. It returns true
// when the check ultimately succeeds (or is skipped).
func (hc *HealthChecker) runCheck(category *Category, c *Checker, observer CheckObserver) bool {
	for {
		// Each attempt gets its own bounded context.
		ctx, cancel := context.WithTimeout(context.Background(), RequestTimeout)
		err := c.check(ctx)
		cancel()
		var se SkipError
		if errors.As(err, &se) {
			log.Debugf("Skipping check: %s. Reason: %s", c.description, se.Reason)
			return true
		}

		checkResult := &CheckResult{
			Category:    category.ID,
			Description: c.description,
			Warning:     c.warning,
			HintURL:     fmt.Sprintf("%s%s", category.hintBaseURL, c.hintAnchor),
		}
		// A VerboseSuccess "error" is actually a success carrying extra output.
		var vs VerboseSuccess
		if errors.As(err, &vs) {
			checkResult.Description = fmt.Sprintf("%s\n%s", checkResult.Description, vs.Message)
		} else if err != nil {
			checkResult.Err = CategoryError{category.ID, err}
		}

		if checkResult.Err != nil && time.Now().Before(c.retryDeadline) {
			checkResult.Retry = true
			if !c.surfaceErrorOnRetry {
				// Hide the underlying error while retrying unless configured otherwise.
				checkResult.Err = errors.New("waiting for check to complete")
			}
			log.Debugf("Retrying on error: %s", err)

			observer(checkResult)
			time.Sleep(retryWindow)
			continue
		}

		observer(checkResult)
		return checkResult.Err == nil
	}
}

// controlPlaneComponentsSelector returns a label selector matching control
// plane components while excluding CNI resources.
func controlPlaneComponentsSelector() string {
	return fmt.Sprintf("%s,!%s", k8s.ControllerNSLabel, LinkerdCNIResourceLabel)
}

// KubeAPIClient returns a fully configured k8s API client. This client is
// only configured if the KubernetesAPIChecks are configured and run first.
func (hc *HealthChecker) KubeAPIClient() *k8s.KubernetesAPI {
	return hc.kubeAPI
}

// UUID returns the UUID of the installation
func (hc *HealthChecker) UUID() string {
	return hc.uuid
}

// checkLinkerdConfigConfigMap fetches the linkerd-config ConfigMap and returns
// its UID alongside the parsed chart values.
func (hc *HealthChecker) checkLinkerdConfigConfigMap(ctx context.Context) (string, *l5dcharts.Values, error) {
	configMap, values, err := FetchCurrentConfiguration(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
	if err != nil {
		return "", nil, err
	}

	return string(configMap.GetUID()), values, nil
}

// Checks whether the configuration of the linkerd-identity-issuer is correct. This means:
// 1. There is a config map present with identity context
// 2. The scheme in the identity context corresponds to the format of the issuer secret
// 3. The trust anchors (if scheme == kubernetes.io/tls) in the secret equal the ones in config
// 4. The certs and key are parsable
func (hc *HealthChecker) checkCertificatesConfig(ctx context.Context) (*tls.Cred, []*x509.Certificate, error) {
	_, values, err := FetchCurrentConfiguration(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
	if err != nil {
		return nil, nil, err
	}

	var data *issuercerts.IssuerCertData

	// An empty scheme is treated the same as the linkerd.io scheme.
	if values.Identity.Issuer.Scheme == "" || values.Identity.Issuer.Scheme == k8s.IdentityIssuerSchemeLinkerd {
		data, err = issuercerts.FetchIssuerData(ctx, hc.kubeAPI, values.IdentityTrustAnchorsPEM, hc.ControlPlaneNamespace)
	} else {
		data, err = issuercerts.FetchExternalIssuerData(ctx, hc.kubeAPI, hc.ControlPlaneNamespace)
	}

	if err != nil {
		return nil, nil, err
	}

	issuerCreds, err := tls.ValidateAndCreateCreds(data.IssuerCrt, data.IssuerKey)
	if err != nil {
		return nil, nil, err
	}

	anchors, err := tls.DecodePEMCertificates(data.TrustAnchors)
	if err != nil {
		return nil, nil, err
	}

	return issuerCreds, anchors, nil
}

// FetchCurrentConfiguration retrieves the current Linkerd configuration
func FetchCurrentConfiguration(ctx context.Context, k kubernetes.Interface, controlPlaneNamespace string) (*corev1.ConfigMap, *l5dcharts.Values, error) {
	// Get the linkerd-config values if present.
	configMap, err := config.FetchLinkerdConfigMap(ctx, k, controlPlaneNamespace)
	if err != nil {
		return nil, nil, err
	}

	rawValues := configMap.Data["values"]
	if rawValues == "" {
		// A ConfigMap with no "values" entry yields nil values, not an error.
		return configMap, nil, nil
	}

	// Convert into latest values, where global field is removed.
	rawValuesBytes, err := config.RemoveGlobalFieldIfPresent([]byte(rawValues))
	if err != nil {
		return nil, nil, err
	}
	rawValues = string(rawValuesBytes)
	var fullValues l5dcharts.Values

	err = yaml.Unmarshal([]byte(rawValues), &fullValues)
	if err != nil {
		return nil, nil, err
	}
	return configMap, &fullValues, nil
}

// fetchProxyInjectorCaBundle returns the PEM-decoded CA bundle from the
// proxy-injector mutating webhook's client config.
func (hc *HealthChecker) fetchProxyInjectorCaBundle(ctx context.Context) ([]*x509.Certificate, error) {
	mwh, err := hc.getProxyInjectorMutatingWebhook(ctx)
	if err != nil {
		return nil, err
	}

	caBundle, err := tls.DecodePEMCertificates(string(mwh.ClientConfig.CABundle))
	if err != nil {
		return nil, err
	}
	return caBundle, nil
}

// fetchWebhookCaBundle returns the PEM-decoded CA bundle from the named
// validating webhook configuration, which is expected to hold exactly one
// webhook entry.
func (hc *HealthChecker) fetchWebhookCaBundle(ctx context.Context, webhook string) ([]*x509.Certificate, error) {
	vwc, err := hc.kubeAPI.AdmissionregistrationV1().ValidatingWebhookConfigurations().Get(ctx, webhook, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	if len(vwc.Webhooks) != 1 {
		return nil, fmt.Errorf("expected 1 webhooks, found %d", len(vwc.Webhooks))
	}

	caBundle, err := tls.DecodePEMCertificates(string(vwc.Webhooks[0].ClientConfig.CABundle))
	if err != nil {
		return nil, err
	}
	return caBundle, nil
}

// FetchTrustBundle retrieves the ca-bundle from the config-map linkerd-identity-trust-roots
// NOTE(review): Data is read before err is checked; this relies on client-go
// returning a non-nil zero-value ConfigMap on error — confirm that assumption
// holds for all client configurations.
func FetchTrustBundle(ctx context.Context, kubeAPI k8s.KubernetesAPI, controlPlaneNamespace string) (string, error) {
	configMap, err := kubeAPI.CoreV1().ConfigMaps(controlPlaneNamespace).Get(ctx, "linkerd-identity-trust-roots", metav1.GetOptions{})

	return configMap.Data["ca-bundle.crt"], err
}

// FetchCredsFromSecret retrieves the TLS creds given a secret name
func (hc *HealthChecker) FetchCredsFromSecret(ctx context.Context, namespace string, secretName string) (*tls.Cred, error) {
	secret, err := hc.kubeAPI.CoreV1().Secrets(namespace).Get(ctx, secretName, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	crt, ok := secret.Data[certKeyName]
	if !ok {
		return nil, fmt.Errorf("key %s needs to exist in secret %s", certKeyName, secretName)
	}

	key, ok := secret.Data[keyKeyName]
	if !ok {
		return nil, fmt.Errorf("key %s needs to exist in secret %s", keyKeyName, secretName)
	}

	cred, err := tls.ValidateAndCreateCreds(string(crt), string(key))
	if err != nil {
		return nil, err
	}

	return cred, nil
}

// FetchCredsFromOldSecret function can be removed in later versions, once either all webhook secrets are recreated for each update
// (see https://github.com/linkerd/linkerd2/issues/4813)
// or later releases are only expected to update from the new names.
func (hc *HealthChecker) FetchCredsFromOldSecret(ctx context.Context, namespace string, secretName string) (*tls.Cred, error) {
	secret, err := hc.kubeAPI.CoreV1().Secrets(namespace).Get(ctx, secretName, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	crt, ok := secret.Data[certOldKeyName]
	if !ok {
		return nil, fmt.Errorf("key %s needs to exist in secret %s", certOldKeyName, secretName)
	}

	key, ok := secret.Data[keyOldKeyName]
	if !ok {
		return nil, fmt.Errorf("key %s needs to exist in secret %s", keyOldKeyName, secretName)
	}

	cred, err := tls.ValidateAndCreateCreds(string(crt), string(key))
	if err != nil {
		return nil, err
	}

	return cred, nil
}

// CheckNamespace checks whether the given namespace exists, and returns an
// error if it does not match `shouldExist`.
func (hc *HealthChecker) CheckNamespace(ctx context.Context, namespace string, shouldExist bool) error {
	exists, err := hc.kubeAPI.NamespaceExists(ctx, namespace)
	if err != nil {
		return err
	}
	if shouldExist && !exists {
		return fmt.Errorf("The \"%s\" namespace does not exist", namespace)
	}
	if !shouldExist && exists {
		return fmt.Errorf("The \"%s\" namespace already exists", namespace)
	}
	return nil
}

// checkClusterNetworks verifies that every node's podCIDR is contained in the
// configured Linkerd clusterNetworks. It returns a SkipError when no node
// exposes spec.podCIDR, since the check cannot be performed in that case.
func (hc *HealthChecker) checkClusterNetworks(ctx context.Context) error {
	nodes, err := hc.kubeAPI.GetNodes(ctx)
	if err != nil {
		return err
	}
	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
	for i, clusterNetwork := range clusterNetworks {
		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
		if err != nil {
			return err
		}
	}
	var badPodCIDRS []string
	var podCIDRExists bool
	for _, node := range nodes {
		podCIDR := node.Spec.PodCIDR
		if podCIDR == "" {
			continue
		}
		podCIDRExists = true
		podIP, podIPNet, err := net.ParseCIDR(podCIDR)
		if err != nil {
			return err
		}
		exists := cluterNetworksContainCIDR(clusterIPNets, podIPNet, podIP)
		if !exists {
			badPodCIDRS = append(badPodCIDRS, podCIDR)
		}
	}
	// If none of the nodes exposed a podCIDR then we cannot verify the clusterNetworks.
	if !podCIDRExists {
		// DigitalOcean for example, doesn't expose spec.podCIDR (#6398)
		return SkipError{Reason: podCIDRUnavailableSkipReason}
	}
	if len(badPodCIDRS) > 0 {
		// Sorted for deterministic error output.
		sort.Strings(badPodCIDRS)
		return fmt.Errorf("node has podCIDR(s) %v which are not contained in the Linkerd clusterNetworks.\n\tTry installing linkerd via --set clusterNetworks=\"%s\"", badPodCIDRS, strings.Join(badPodCIDRS, "\\,"))
	}
	return nil
}

// cluterNetworksContainCIDR reports whether the pod CIDR is fully contained in
// one of the cluster networks: the network must contain the pod IP and have a
// mask no longer than the pod CIDR's.
// NOTE(review): the name is missing an "s" in "cluster" — consider renaming
// (verify no sibling files in this package reference it first).
func cluterNetworksContainCIDR(clusterIPNets []*net.IPNet, podIPNet *net.IPNet, podIP net.IP) bool {
	for _, clusterIPNet := range clusterIPNets {
		clusterIPMaskOnes, _ := clusterIPNet.Mask.Size()
		podCIDRMaskOnes, _ := podIPNet.Mask.Size()
		if clusterIPNet.Contains(podIP) && podCIDRMaskOnes >= clusterIPMaskOnes {
			return true
		}
	}
	return false
}

// clusterNetworksContainIP reports whether any of the cluster networks
// contains the given IP (parsed from its string form).
func clusterNetworksContainIP(clusterIPNets []*net.IPNet, ip string) bool {
	for _, clusterIPNet := range clusterIPNets {
		if clusterIPNet.Contains(net.ParseIP(ip)) {
			return true
		}
	}
	return false
}

// checkClusterNetworksContainAllPods verifies that every non-host-network pod
// with an assigned IP falls inside the configured clusterNetworks.
func (hc *HealthChecker) checkClusterNetworksContainAllPods(ctx context.Context) error {
	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
	var err error
	for i, clusterNetwork := range clusterNetworks {
		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
		if err != nil {
			return err
		}
	}
	pods, err := hc.kubeAPI.CoreV1().Pods(corev1.NamespaceAll).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, pod := range pods.Items {
		// Host-network pods use node IPs and are exempt from this check.
		if pod.Spec.HostNetwork {
			continue
		}
		// Pods without an assigned IP (e.g. still pending) cannot be checked.
		if len(pod.Status.PodIP) == 0 {
			continue
		}
		if !clusterNetworksContainIP(clusterIPNets, pod.Status.PodIP) {
			return fmt.Errorf("the Linkerd clusterNetworks [%q] do not include pod %s/%s (%s)", hc.linkerdConfig.ClusterNetworks, pod.Namespace, pod.Name, pod.Status.PodIP)
		}
	}
	return nil
}

// checkClusterNetworksContainAllServices verifies that every service with a
// real ClusterIP (i.e. not headless) falls inside the configured
// clusterNetworks.
func (hc *HealthChecker) checkClusterNetworksContainAllServices(ctx context.Context) error {
	clusterNetworks := strings.Split(hc.linkerdConfig.ClusterNetworks, ",")
	clusterIPNets := make([]*net.IPNet, len(clusterNetworks))
	var err error
	for i, clusterNetwork := range clusterNetworks {
		_, clusterIPNets[i], err = net.ParseCIDR(clusterNetwork)
		if err != nil {
			return err
		}
	}
	svcs, err := hc.kubeAPI.CoreV1().Services(corev1.NamespaceAll).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, svc := range svcs.Items {
		clusterIP := svc.Spec.ClusterIP
		// "None" marks a headless service, which has no ClusterIP to verify.
		if clusterIP != "" && clusterIP != "None" && !clusterNetworksContainIP(clusterIPNets, svc.Spec.ClusterIP) {
			return fmt.Errorf("the Linkerd clusterNetworks [%q] do not include svc %s/%s (%s)", hc.linkerdConfig.ClusterNetworks, svc.Namespace, svc.Name, svc.Spec.ClusterIP)
		}
	}
	return nil
}

// expectedRBACNames lists the namespace-qualified cluster-scoped RBAC resource
// names the control plane is expected to own.
func (hc *HealthChecker) expectedRBACNames() []string {
	return []string{
		fmt.Sprintf("linkerd-%s-identity", hc.ControlPlaneNamespace),
		fmt.Sprintf("linkerd-%s-proxy-injector", hc.ControlPlaneNamespace),
	}
}

// checkClusterRoles delegates to CheckClusterRoles using this checker's API client.
func (hc *HealthChecker) checkClusterRoles(ctx context.Context, shouldExist bool, expectedNames []string, labelSelector string) error {
	return CheckClusterRoles(ctx, hc.kubeAPI, shouldExist, expectedNames, labelSelector)
}

// CheckClusterRoles checks that the expected ClusterRoles exist.
func CheckClusterRoles(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, expectedNames []string, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	crList, err := kubeAPI.RbacV1().ClusterRoles().List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}

	for _, item := range crList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("ClusterRoles", objects, expectedNames, shouldExist)
}

// checkClusterRoleBindings delegates to CheckClusterRoleBindings using this
// checker's API client.
func (hc *HealthChecker) checkClusterRoleBindings(ctx context.Context, shouldExist bool, expectedNames []string, labelSelector string) error {
	return CheckClusterRoleBindings(ctx, hc.kubeAPI, shouldExist, expectedNames, labelSelector)
}

// CheckClusterRoleBindings checks that the expected ClusterRoleBindings exist.
func CheckClusterRoleBindings(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, expectedNames []string, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	crbList, err := kubeAPI.RbacV1().ClusterRoleBindings().List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}

	for _, item := range crbList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("ClusterRoleBindings", objects, expectedNames, shouldExist)
}

// CheckConfigMaps checks that the expected ConfigMaps exist.
func CheckConfigMaps(ctx context.Context, kubeAPI *k8s.KubernetesAPI, namespace string, shouldExist bool, expectedNames []string, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	crbList, err := kubeAPI.CoreV1().ConfigMaps(namespace).List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}

	for _, item := range crbList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("ConfigMaps", objects, expectedNames, shouldExist)
}

// isHA reports whether the installation was configured for high availability.
func (hc *HealthChecker) isHA() bool {
	return hc.linkerdConfig.HighAvailability
}

// isHeartbeatDisabled reports whether the heartbeat cronjob was disabled at install time.
func (hc *HealthChecker) isHeartbeatDisabled() bool {
	return hc.linkerdConfig.DisableHeartBeat
}

// checkServiceAccounts delegates to CheckServiceAccounts using this checker's API client.
func (hc *HealthChecker) checkServiceAccounts(ctx context.Context, saNames []string, ns, labelSelector string) error {
	return CheckServiceAccounts(ctx, hc.kubeAPI, saNames, ns, labelSelector)
}

// CheckServiceAccounts check for serviceaccounts
func CheckServiceAccounts(ctx context.Context, api *k8s.KubernetesAPI, saNames []string, ns, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	saList, err := api.CoreV1().ServiceAccounts(ns).List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}

	for _, item := range saList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("ServiceAccounts", objects, saNames, true)
}

// CheckIfLinkerdExists checks if Linkerd exists
func CheckIfLinkerdExists(ctx context.Context, kubeAPI *k8s.KubernetesAPI, controlPlaneNamespace string) (bool, error) {
	_, err := kubeAPI.CoreV1().Namespaces().Get(ctx, controlPlaneNamespace, metav1.GetOptions{})
	if err != nil {
		if kerrors.IsNotFound(err) {
			// A missing namespace means "not installed", not a failure.
			return false, nil
		}
		return false, err
	}

	_, _, err = FetchCurrentConfiguration(ctx, kubeAPI, controlPlaneNamespace)
	if err != nil {
		if kerrors.IsNotFound(err) {
			// The namespace exists but the config does not: treated as not installed.
			return false, nil
		}
		return false, err
	}

	return true, nil
}

// getProxyInjectorMutatingWebhook returns the single webhook entry of the
// proxy-injector MutatingWebhookConfiguration, erroring if there is not
// exactly one.
func (hc *HealthChecker) getProxyInjectorMutatingWebhook(ctx context.Context) (*admissionRegistration.MutatingWebhook, error) {
	mwc, err := hc.kubeAPI.AdmissionregistrationV1().MutatingWebhookConfigurations().Get(ctx, k8s.ProxyInjectorWebhookConfigName, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	if len(mwc.Webhooks) != 1 {
		return nil, fmt.Errorf("expected 1 webhooks, found %d", len(mwc.Webhooks))
	}
	return &mwc.Webhooks[0], nil
}

// checkMutatingWebhookConfigurations verifies the presence (or absence) of the
// proxy-injector MutatingWebhookConfiguration among control-plane components.
func (hc *HealthChecker) checkMutatingWebhookConfigurations(ctx context.Context, shouldExist bool) error {
	options := metav1.ListOptions{
		LabelSelector: controlPlaneComponentsSelector(),
	}
	mwc, err := hc.kubeAPI.AdmissionregistrationV1().MutatingWebhookConfigurations().List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}
	for _, item := range mwc.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("MutatingWebhookConfigurations", objects, []string{k8s.ProxyInjectorWebhookConfigName}, shouldExist)
}

// checkValidatingWebhookConfigurations verifies the presence (or absence) of
// the sp-validator ValidatingWebhookConfiguration among control-plane
// components.
func (hc *HealthChecker) checkValidatingWebhookConfigurations(ctx context.Context, shouldExist bool) error {
	options := metav1.ListOptions{
		LabelSelector: controlPlaneComponentsSelector(),
	}
	vwc, err := hc.kubeAPI.AdmissionregistrationV1().ValidatingWebhookConfigurations().List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}
	for _, item := range vwc.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("ValidatingWebhookConfigurations", objects, []string{k8s.SPValidatorWebhookConfigName}, shouldExist)
}

// CheckCustomResourceDefinitions checks that all of the Linkerd CRDs are
// installed on the cluster.
func CheckCustomResourceDefinitions(ctx context.Context, k8sAPI *k8s.KubernetesAPI, expectedCRDManifests string) error {
	// Expected CRDs are parsed from the multi-document YAML manifest.
	crdYamls := strings.Split(expectedCRDManifests, "\n---\n")
	crdVersions := []struct{ name, version string }{}
	for _, crdYaml := range crdYamls {
		var crd apiextv1.CustomResourceDefinition
		err := yaml.Unmarshal([]byte(crdYaml), &crd)
		if err != nil {
			return err
		}
		if len(crd.Spec.Versions) == 0 {
			continue
		}
		// Only the last listed version of each CRD is required to be present.
		versionIndex := len(crd.Spec.Versions) - 1
		crdVersions = append(crdVersions, struct{ name, version string }{
			name:    crd.Name,
			version: crd.Spec.Versions[versionIndex].Name,
		})
	}

	errMsgs := []string{}

	for _, crdVersion := range crdVersions {
		name := crdVersion.name
		version := crdVersion.version

		crd, err := k8sAPI.Apiextensions.ApiextensionsV1().CustomResourceDefinitions().Get(ctx, name, metav1.GetOptions{})
		if err != nil && kerrors.IsNotFound(err) {
			errMsgs = append(errMsgs, fmt.Sprintf("missing %s", name))
			continue
		} else if err != nil {
			return err
		}
		if !crdHasVersion(crd, version) {
			errMsgs = append(errMsgs, fmt.Sprintf("CRD %s is missing version %s", name, version))
		}
	}

	if len(errMsgs) > 0 {
		return errors.New(strings.Join(errMsgs, ", "))
	}

	return nil
}

// crdHasVersion reports whether the CRD declares the given version name.
func crdHasVersion(crd *apiextv1.CustomResourceDefinition, version string) bool {
	for _, crdVersion := range crd.Spec.Versions {
		if crdVersion.Name == version {
			return true
		}
	}
	return false
}

// CheckNodesHaveNonDockerRuntime checks that each node has a non-Docker
// runtime. This check is only called if proxyInit is not running as root
// which is a problem for clusters with a Docker container runtime.
func CheckNodesHaveNonDockerRuntime(ctx context.Context, k8sAPI *k8s.KubernetesAPI) error { hasDockerNodes := false continueToken := "" for { nodes, err := k8sAPI.CoreV1().Nodes().List(ctx, metav1.ListOptions{Continue: continueToken}) if err != nil { return err } continueToken = nodes.Continue for _, node := range nodes.Items { crv := node.Status.NodeInfo.ContainerRuntimeVersion if strings.HasPrefix(crv, "docker:") { hasDockerNodes = true break } } if continueToken == "" { break } } if hasDockerNodes { return fmt.Errorf("there are nodes using the docker container runtime and proxy-init container must run as root user.\ntry installing linkerd via --set proxyInit.runAsRoot=true") } return nil } // MeshedPodIdentityData contains meshed pod details + trust anchors of the proxy type MeshedPodIdentityData struct { Name string Namespace string Anchors string } // GetMeshedPodsIdentityData obtains the identity data (trust anchors) for all meshed pods func GetMeshedPodsIdentityData(ctx context.Context, api kubernetes.Interface, dataPlaneNamespace string) ([]MeshedPodIdentityData, error) { podList, err := api.CoreV1().Pods(dataPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: k8s.ControllerNSLabel}) if err != nil { return nil, err } if len(podList.Items) == 0 { return nil, nil } pods := []MeshedPodIdentityData{} for _, pod := range podList.Items { containers := append(pod.Spec.InitContainers, pod.Spec.Containers...) 
for _, containerSpec := range containers { if containerSpec.Name != k8s.ProxyContainerName { continue } for _, envVar := range containerSpec.Env { if envVar.Name != identity.EnvTrustAnchors { continue } pods = append(pods, MeshedPodIdentityData{ pod.Name, pod.Namespace, strings.TrimSpace(envVar.Value), }) } } } return pods, nil } func (hc *HealthChecker) checkDataPlaneProxiesCertificate(ctx context.Context) error { return checkPodsProxiesCertificate(ctx, *hc.kubeAPI, hc.DataPlaneNamespace, hc.ControlPlaneNamespace) } func checkPodsProxiesCertificate(ctx context.Context, kubeAPI k8s.KubernetesAPI, targetNamespace, controlPlaneNamespace string) error { meshedPods, err := GetMeshedPodsIdentityData(ctx, kubeAPI, targetNamespace) if err != nil { return err } trustAnchorsPem, err := FetchTrustBundle(ctx, kubeAPI, controlPlaneNamespace) if err != nil { return err } offendingPods := []string{} for _, pod := range meshedPods { // Skip control plane pods since they load their trust anchors from the linkerd-identity-trust-anchors configmap. 
if pod.Namespace == controlPlaneNamespace { continue } if strings.TrimSpace(pod.Anchors) != strings.TrimSpace(trustAnchorsPem) { if targetNamespace == "" { offendingPods = append(offendingPods, fmt.Sprintf("* %s/%s", pod.Namespace, pod.Name)) } else { offendingPods = append(offendingPods, fmt.Sprintf("* %s", pod.Name)) } } } if len(offendingPods) == 0 { return nil } return fmt.Errorf("Some pods do not have the current trust bundle and must be restarted:\n\t%s", strings.Join(offendingPods, "\n\t")) } func checkResources(resourceName string, objects []runtime.Object, expectedNames []string, shouldExist bool) error { if !shouldExist { if len(objects) > 0 { resources := []Resource{} for _, obj := range objects { m, err := meta.Accessor(obj) if err != nil { return err } res := Resource{name: m.GetName()} gvks, _, err := k8s.ObjectKinds(obj) if err == nil && len(gvks) > 0 { res.groupVersionKind = gvks[0] } resources = append(resources, res) } return ResourceError{resourceName, resources} } return nil } expected := map[string]bool{} for _, name := range expectedNames { expected[name] = false } for _, obj := range objects { metaObj, err := meta.Accessor(obj) if err != nil { return err } if _, ok := expected[metaObj.GetName()]; ok { expected[metaObj.GetName()] = true } } missing := []string{} for name, found := range expected { if !found { missing = append(missing, name) } } if len(missing) > 0 { sort.Strings(missing) return fmt.Errorf("missing %s: %s", resourceName, strings.Join(missing, ", ")) } return nil } // Check if there's a pod with the "opaque ports" annotation defined but a // service selecting the aforementioned pod doesn't define it func (hc *HealthChecker) checkMisconfiguredOpaquePortAnnotations(ctx context.Context) error { // Initialize and sync the kubernetes API // This is used instead of `hc.kubeAPI` to limit multiple k8s API requests // and use the caching logic in the shared informers // TODO: move the shared informer code out of `controller/`, and into 
`pkg` to simplify the dependency tree. kubeAPI := controllerK8s.NewClusterScopedAPI(hc.kubeAPI, nil, nil, "local", controllerK8s.Endpoint, controllerK8s.Pod, controllerK8s.Svc) kubeAPI.Sync(ctx.Done()) services, err := kubeAPI.Svc().Lister().Services(hc.DataPlaneNamespace).List(labels.Everything()) if err != nil { return err } var errStrings []string for _, service := range services { if service.Spec.ClusterIP == "None" { // skip headless services; they're handled differently continue } endpoints, err := kubeAPI.Endpoint().Lister().Endpoints(service.Namespace).Get(service.Name) if err != nil { return err } pods, err := getEndpointsPods(endpoints, kubeAPI, service.Namespace) if err != nil { return err } for pod := range pods { err := misconfiguredOpaqueAnnotation(service, pod) if err != nil { errStrings = append(errStrings, fmt.Sprintf("\t* %s", err.Error())) } } } if len(errStrings) >= 1 { return fmt.Errorf(strings.Join(errStrings, "\n ")) } return nil } // getEndpointsPods takes a collection of endpoints and returns the set of all // the pods that they target. 
// getEndpointsPods resolves every endpoint address with a Pod targetRef
// through the shared-informer pod lister, returning the de-duplicated set.
func getEndpointsPods(endpoints *corev1.Endpoints, kubeAPI *controllerK8s.API, namespace string) (map[*corev1.Pod]struct{}, error) {
	pods := make(map[*corev1.Pod]struct{})
	for _, subset := range endpoints.Subsets {
		for _, addr := range subset.Addresses {
			// Only addresses that explicitly reference a Pod are considered.
			if addr.TargetRef != nil && addr.TargetRef.Kind == "Pod" {
				pod, err := kubeAPI.Pod().Lister().Pods(namespace).Get(addr.TargetRef.Name)
				if err != nil {
					return nil, err
				}
				if _, ok := pods[pod]; !ok {
					pods[pod] = struct{}{}
				}
			}
		}
	}
	return pods, nil
}

// misconfiguredOpaqueAnnotation checks, in both directions, that the opaque
// ports annotations of a service and one of the pods it selects agree:
// every service opaque port targeting the pod must be opaque on the pod, and
// every pod opaque port exposed by the service must be opaque on the service.
func misconfiguredOpaqueAnnotation(service *corev1.Service, pod *corev1.Pod) error {
	var svcPorts, podPorts []string
	if v, ok := service.Annotations[k8s.ProxyOpaquePortsAnnotation]; ok {
		svcPorts = strings.Split(v, ",")
	}
	if v, ok := pod.Annotations[k8s.ProxyOpaquePortsAnnotation]; ok {
		podPorts = strings.Split(v, ",")
	}

	// First loop through the services opaque ports and assert that if the pod
	// exposes a port that is targeted by one of these ports, then it is
	// marked as opaque on the pod.
	for _, p := range svcPorts {
		port, err := strconv.Atoi(p)
		if err != nil {
			return fmt.Errorf("failed to convert %s to port number for pod %s", p, pod.Name)
		}
		err = checkPodPorts(service, pod, podPorts, port)
		if err != nil {
			return err
		}
	}

	// Next loop through the pod's opaque ports and assert that if one of
	// the ports is targeted by a service port, then it is marked as opaque
	// on the service.
	for _, p := range podPorts {
		if util.ContainsString(p, svcPorts) {
			// The service exposes p and is marked as opaque.
			continue
		}
		port, err := strconv.Atoi(p)
		if err != nil {
			return fmt.Errorf("failed to convert %s to port number for pod %s", p, pod.Name)
		}

		// p is marked as opaque on the pod, but the service that selects it
		// does not have it marked as opaque. We first check if the service
		// exposes it as a service or integer targetPort.
		ok, err := checkServiceIntPorts(service, svcPorts, port)
		if err != nil {
			return err
		}
		if ok {
			// The service targets the port as an integer and is marked as
			// opaque so continue checking other pod ports.
			continue
		}

		// The service does not expose p as a service or integer targetPort.
		// We now check if it targets it as a named port, and if so, that the
		// service port is marked as opaque.
		err = checkServiceNamePorts(service, pod, port, svcPorts)
		if err != nil {
			return err
		}
	}
	return nil
}

// checkPodPorts verifies that when a service opaque port targets a container
// port on the pod, that target port also appears in the pod's opaque ports
// annotation; returns nil when no service port matches.
func checkPodPorts(service *corev1.Service, pod *corev1.Pod, podPorts []string, port int) error {
	for _, sp := range service.Spec.Ports {
		if int(sp.Port) == port {
			for _, c := range pod.Spec.Containers {
				for _, cp := range c.Ports {
					if cp.ContainerPort == sp.TargetPort.IntVal || cp.Name == sp.TargetPort.StrVal {
						// The pod exposes a container port that would be
						// targeted by this service port
						var strPort string
						// TargetPort.Type == 0 is the integer variant
						// (intstr.Int); otherwise the target is named and
						// the matched containerPort number is used instead.
						if sp.TargetPort.Type == 0 {
							strPort = strconv.Itoa(int(sp.TargetPort.IntVal))
						} else {
							strPort = strconv.Itoa(int(cp.ContainerPort))
						}
						if util.ContainsString(strPort, podPorts) {
							return nil
						}
						return fmt.Errorf("service %s expects target port %s to be opaque; add it to pod %s %s annotation", service.Name, strPort, pod.Name, k8s.ProxyOpaquePortsAnnotation)
					}
				}
			}
		}
	}
	return nil
}

// checkServiceIntPorts reports whether the service targets the given pod
// opaque port via an integer (or defaulted) targetPort. It returns true when
// the service both targets the port and marks the corresponding service port
// as opaque; false with a nil error when the port is not integer-targeted.
func checkServiceIntPorts(service *corev1.Service, svcPorts []string, port int) (bool, error) {
	for _, p := range service.Spec.Ports {
		if p.TargetPort.Type == 0 && p.TargetPort.IntVal == 0 {
			if int(p.Port) == port {
				// The service does not have a target port, so its service
				// port should be marked as opaque.
				return false, fmt.Errorf("service %s targets the opaque port %d; add it to its %s annotation", service.Name, port, k8s.ProxyOpaquePortsAnnotation)
			}
		}
		if int(p.TargetPort.IntVal) == port {
			svcPort := strconv.Itoa(int(p.Port))
			if util.ContainsString(svcPort, svcPorts) {
				// The service exposes svcPort which targets p and svcPort
				// is properly marked as opaque.
				return true, nil
			}
			return false, fmt.Errorf("service %s targets the opaque port %d through %d; add %d to its %s annotation", service.Name, port, p.Port, p.Port, k8s.ProxyOpaquePortsAnnotation)
		}
	}
	return false, nil
}

// checkServiceNamePorts verifies that when the service targets the pod's
// opaque port through a named targetPort, the corresponding service port is
// listed in the service's opaque ports annotation.
func checkServiceNamePorts(service *corev1.Service, pod *corev1.Pod, port int, svcPorts []string) error {
	for _, p := range service.Spec.Ports {
		if p.TargetPort.StrVal == "" {
			// The target port is not named so there is no named container
			// port to check.
			continue
		}
		for _, c := range pod.Spec.Containers {
			for _, cp := range c.Ports {
				if int(cp.ContainerPort) == port {
					// This is the containerPort that maps to the opaque port
					// we are currently checking.
					if cp.Name == p.TargetPort.StrVal {
						svcPort := strconv.Itoa(int(p.Port))
						if util.ContainsString(svcPort, svcPorts) {
							// The service targets the container port by name
							// and is marked as opaque.
							return nil
						}
						return fmt.Errorf("service %s targets the opaque port %s through %d; add %d to its %s annotation", service.Name, cp.Name, p.Port, p.Port, k8s.ProxyOpaquePortsAnnotation)
					}
				}
			}
		}
	}
	return nil
}

// GetDataPlanePods returns all the pods with data plane
func (hc *HealthChecker) GetDataPlanePods(ctx context.Context) ([]corev1.Pod, error) {
	// Meshed pods are identified by the controller-namespace label pointing
	// at this health checker's control plane namespace.
	selector := fmt.Sprintf("%s=%s", k8s.ControllerNSLabel, hc.ControlPlaneNamespace)
	podList, err := hc.kubeAPI.CoreV1().Pods(hc.DataPlaneNamespace).List(ctx, metav1.ListOptions{LabelSelector: selector})
	if err != nil {
		return nil, err
	}
	return podList.Items, nil
}

// GetServices returns all services within data plane namespace
func (hc *HealthChecker) GetServices(ctx context.Context) ([]corev1.Service, error) {
	svcList, err := hc.kubeAPI.CoreV1().Services(hc.DataPlaneNamespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	return svcList.Items, nil
}

// checkCanCreate checks the client is authorized to create the given resource.
func (hc *HealthChecker) checkCanCreate(ctx context.Context, namespace, group, version, resource string) error {
	return CheckCanPerformAction(ctx, hc.kubeAPI, "create", namespace, group, version, resource)
}

// checkCanCreateNonNamespacedResources dry-run-creates every cluster-scoped
// resource in the install manifest to verify the caller's permissions.
func (hc *HealthChecker) checkCanCreateNonNamespacedResources(ctx context.Context) error {
	var errs []string
	dryRun := metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}}

	// Iterate over all resources in install manifest
	installManifestReader := strings.NewReader(hc.Options.InstallManifest)
	yamlReader := yamlDecoder.NewYAMLReader(bufio.NewReader(installManifestReader))
	for {
		// Read single object YAML
		objYAML, err := yamlReader.Read()
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			return fmt.Errorf("error reading install manifest: %w", err)
		}

		// Create unstructured object from YAML
		objMap := map[string]interface{}{}
		err = yaml.Unmarshal(objYAML, &objMap)
		if err != nil {
			return fmt.Errorf("error unmarshaling yaml object %s: %w", objYAML, err)
		}
		if len(objMap) == 0 {
			// Ignore header blocks with only comments
			continue
		}
		obj := &unstructured.Unstructured{Object: objMap}

		// Skip namespaced resources (dry-run requires namespace to exist)
		if obj.GetNamespace() != "" {
			continue
		}
		// Attempt to create resource using dry-run
		resource, _ := meta.UnsafeGuessKindToResource(obj.GroupVersionKind())
		_, err = hc.kubeAPI.DynamicClient.Resource(resource).Create(ctx, obj, dryRun)
		if err != nil {
			errs = append(errs, fmt.Sprintf("cannot create %s/%s: %v", obj.GetKind(), obj.GetName(), err))
		}
	}

	if len(errs) > 0 {
		return errors.New(strings.Join(errs, "\n "))
	}
	return nil
}

// checkCanGet checks the client is authorized to get the given resource.
func (hc *HealthChecker) checkCanGet(ctx context.Context, namespace, group, version, resource string) error {
	return CheckCanPerformAction(ctx, hc.kubeAPI, "get", namespace, group, version, resource)
}

// checkExtensionAPIServerAuthentication verifies that the cluster's
// extension-apiserver-authentication configmap configures a request-header
// client CA file, which extension API servers need for client verification.
func (hc *HealthChecker) checkExtensionAPIServerAuthentication(ctx context.Context) error {
	if hc.kubeAPI == nil {
		return fmt.Errorf("unexpected error: Kubernetes ClientSet not initialized")
	}
	m, err := hc.kubeAPI.CoreV1().ConfigMaps(metav1.NamespaceSystem).Get(ctx, k8s.ExtensionAPIServerAuthenticationConfigMapName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	if v, exists := m.Data[k8s.ExtensionAPIServerAuthenticationRequestHeaderClientCAFileKey]; !exists || v == "" {
		return fmt.Errorf("--%s is not configured", k8s.ExtensionAPIServerAuthenticationRequestHeaderClientCAFileKey)
	}
	return nil
}

// checkClockSkew flags nodes whose last kubelet heartbeat is further from now
// than AllowedClockSkew in either direction (a proxy for clock drift).
func (hc *HealthChecker) checkClockSkew(ctx context.Context) error {
	if hc.kubeAPI == nil {
		// we should never get here
		return errors.New("unexpected error: Kubernetes ClientSet not initialized")
	}

	var clockSkewNodes []string

	nodeList, err := hc.kubeAPI.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}

	for _, node := range nodeList.Items {
		for _, condition := range node.Status.Conditions {
			// we want to check only KubeletReady condition and only execute if the node is ready
			if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
				since := time.Since(condition.LastHeartbeatTime.Time)
				if (since > AllowedClockSkew) || (since < -AllowedClockSkew) {
					clockSkewNodes = append(clockSkewNodes, node.Name)
				}
			}
		}
	}
	if len(clockSkewNodes) > 0 {
		return fmt.Errorf("clock skew detected for node(s): %s", strings.Join(clockSkewNodes, ", "))
	}
	return nil
}

// checkExtensionNsLabels errors when the same linkerd extension label value
// appears on more than one namespace.
func (hc *HealthChecker) checkExtensionNsLabels(ctx context.Context) error {
	if hc.kubeAPI == nil {
		// oops something wrong happened
		return errors.New("unexpected error: Kubernetes ClientSet not initialized")
	}

	namespaces, err := hc.kubeAPI.GetAllNamespacesWithExtensionLabel(ctx)
	if err != nil {
		return fmt.Errorf("unexpected error when retrieving namespaces: %w", err)
	}

	freq := make(map[string][]string)
	for _, ns := range namespaces {
		// We can guarantee the namespace has the extension label since we used
		// a label selector when retrieving namespaces
		ext := ns.Labels[k8s.LinkerdExtensionLabel]
		// To make it easier to print, store already error-formatted namespace
		// in freq table
		freq[ext] = append(freq[ext], fmt.Sprintf("\t\t* %s", ns.Name))
	}

	errs := []string{}
	for ext, namespaces := range freq {
		if len(namespaces) == 1 {
			continue
		}
		errs = append(errs, fmt.Sprintf("\t* label \"%s=%s\" is present on more than one namespace:\n%s", k8s.LinkerdExtensionLabel, ext, strings.Join(namespaces, "\n")))
	}
	if len(errs) > 0 {
		return errors.New(strings.Join(
			append([]string{"some extensions have invalid configuration"}, errs...),
			"\n"))
	}
	return nil
}

// CheckRoles checks that the expected roles exist.
func CheckRoles(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, namespace string, expectedNames []string, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	crList, err := kubeAPI.RbacV1().Roles(namespace).List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}
	for _, item := range crList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("Roles", objects, expectedNames, shouldExist)
}

// CheckRoleBindings checks that the expected RoleBindings exist.
func CheckRoleBindings(ctx context.Context, kubeAPI *k8s.KubernetesAPI, shouldExist bool, namespace string, expectedNames []string, labelSelector string) error {
	options := metav1.ListOptions{
		LabelSelector: labelSelector,
	}
	crbList, err := kubeAPI.RbacV1().RoleBindings(namespace).List(ctx, options)
	if err != nil {
		return err
	}

	objects := []runtime.Object{}
	for _, item := range crbList.Items {
		item := item // pin
		objects = append(objects, &item)
	}

	return checkResources("RoleBindings", objects, expectedNames, shouldExist)
}

// CheckCanPerformAction checks if a given k8s client is authorized to perform a given action.
func CheckCanPerformAction(ctx context.Context, api *k8s.KubernetesAPI, verb, namespace, group, version, resource string) error { if api == nil { // we should never get here return fmt.Errorf("unexpected error: Kubernetes ClientSet not initialized") } return k8s.ResourceAuthz( ctx, api, namespace, verb, group, version, resource, "", ) } // getPodStatuses returns a map of all Linkerd container statuses: // component => // // pod name => // container statuses func getPodStatuses(pods []corev1.Pod) map[string]map[string][]corev1.ContainerStatus { statuses := make(map[string]map[string][]corev1.ContainerStatus) for _, pod := range pods { if pod.Status.Phase == corev1.PodRunning && strings.HasPrefix(pod.Name, "linkerd-") { parts := strings.Split(pod.Name, "-") // All control plane pods should have a name that results in at least 4 // substrings when string.Split on '-' if len(parts) >= 4 { name := strings.Join(parts[1:len(parts)-2], "-") if _, found := statuses[name]; !found { statuses[name] = make(map[string][]corev1.ContainerStatus) } statuses[name][pod.Name] = pod.Status.ContainerStatuses } } } return statuses } func validateControlPlanePods(pods []corev1.Pod) error { statuses := getPodStatuses(pods) names := []string{"destination", "identity", "proxy-injector"} for _, name := range names { pods, found := statuses[name] if !found { return fmt.Errorf("No running pods for \"linkerd-%s\"", name) } var err error var ready bool for pod, containers := range pods { containersReady := true for _, container := range containers { if !container.Ready { // TODO: Save this as a warning, allow check to pass but let the user // know there is at least one pod not ready. This might imply // restructuring health checks to allow individual checks to return // either fatal or warning, rather than setting this property at // compile time. 
err = fmt.Errorf("pod/%s container %s is not ready", pod, container.Name) containersReady = false } } if containersReady { // at least one pod has all containers ready ready = true break } } if !ready { return err } } return nil } func checkUnschedulablePods(pods []corev1.Pod) error { for _, pod := range pods { for _, condition := range pod.Status.Conditions { if condition.Reason == corev1.PodReasonUnschedulable { return fmt.Errorf("%s: %s", pod.Name, condition.Message) } } } return nil } func checkControlPlaneReplicaSets(rst []appsv1.ReplicaSet) error { var errors []string for _, rs := range rst { for _, r := range rs.Status.Conditions { if r.Type == appsv1.ReplicaSetReplicaFailure && r.Status == corev1.ConditionTrue { errors = append(errors, fmt.Sprintf("%s: %s", r.Reason, r.Message)) } } } if len(errors) > 0 { return fmt.Errorf("%s", strings.Join(errors, "\n ")) } return nil } // CheckForPods checks if the given deployments have pod resources present func CheckForPods(pods []corev1.Pod, deployNames []string) error { exists := make(map[string]bool) for _, pod := range pods { for label, value := range pod.Labels { // When the label value is `linkerd.io/control-plane-component` or // `component`, we'll take its value as the name of the deployment // that the pod is part of if label == k8s.ControllerComponentLabel || label == "component" { exists[value] = true } } } for _, expected := range deployNames { if !exists[expected] { return fmt.Errorf("Could not find pods for deployment %s", expected) } } return nil } // CheckPodsRunning checks if the given pods are in running state // along with containers to be in ready state func CheckPodsRunning(pods []corev1.Pod, namespace string) error { if len(pods) == 0 { msg := fmt.Sprintf("no \"%s\" containers found", k8s.ProxyContainerName) if namespace != "" { msg += fmt.Sprintf(" in the \"%s\" namespace", namespace) } return fmt.Errorf(msg) } for _, pod := range pods { status := k8s.GetPodStatus(pod) // Skip validating pods 
that have a status which indicates there would // be no running proxy container. switch status { case "Completed", "NodeShutdown", "Shutdown", "Terminated": continue } if status != string(corev1.PodRunning) && status != "Evicted" { return fmt.Errorf("pod \"%s\" status is %s", pod.Name, pod.Status.Phase) } if !k8s.GetProxyReady(pod) { return fmt.Errorf("container \"%s\" in pod \"%s\" is not ready", k8s.ProxyContainerName, pod.Name) } } return nil } // CheckIfDataPlanePodsExist checks if the proxy is present in the given pods func CheckIfDataPlanePodsExist(pods []corev1.Pod) error { for _, pod := range pods { if !containsProxy(pod) { return fmt.Errorf("could not find proxy container for %s pod", pod.Name) } } return nil } func containsProxy(pod corev1.Pod) bool { containers := append(pod.Spec.InitContainers, pod.Spec.Containers...) for _, containerSpec := range containers { if containerSpec.Name == k8s.ProxyContainerName { return true } } return false }