...

Source file src/k8s.io/kubernetes/pkg/kubelet/kuberuntime/kuberuntime_gc.go

Documentation: k8s.io/kubernetes/pkg/kubelet/kuberuntime

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kuberuntime
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"path/filepath"
    24  	"sort"
    25  	"time"
    26  
    27  	"go.opentelemetry.io/otel/trace"
    28  	"k8s.io/apimachinery/pkg/types"
    29  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    30  	"k8s.io/apimachinery/pkg/util/sets"
    31  	internalapi "k8s.io/cri-api/pkg/apis"
    32  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    33  	"k8s.io/klog/v2"
    34  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    35  )
    36  
// containerGC is the manager of garbage collection. It bundles the
// collaborators that the GC routines in this file call into.
type containerGC struct {
	// client is the CRI runtime service; removeSandbox uses it to stop and
	// remove pod sandboxes.
	client internalapi.RuntimeService
	// manager is the runtime manager used to list kubelet containers and
	// sandboxes, to kill and remove containers, and to reach the OS
	// interface and the pod logs directory.
	manager *kubeGenericRuntimeManager
	// podStateProvider is consulted to decide whether a pod's content or
	// runtime resources should be removed.
	podStateProvider podStateProvider
	// tracer starts the span that wraps each GarbageCollect call.
	tracer trace.Tracer
}
    44  
    45  // NewContainerGC creates a new containerGC.
    46  func newContainerGC(client internalapi.RuntimeService, podStateProvider podStateProvider, manager *kubeGenericRuntimeManager, tracer trace.Tracer) *containerGC {
    47  	return &containerGC{
    48  		client:           client,
    49  		manager:          manager,
    50  		podStateProvider: podStateProvider,
    51  		tracer:           tracer,
    52  	}
    53  }
    54  
// containerGCInfo is the internal information kept for containers being
// considered for GC. One value is built per non-running container that is
// older than the policy's minimum age (see evictableContainers).
type containerGCInfo struct {
	// The ID of the container, as reported by the runtime.
	id string
	// The name of the container, taken from the container metadata.
	name string
	// Creation time for the container; used to order candidates so that the
	// oldest are removed first.
	createTime time.Time
	// If true, the container is in unknown state. Garbage collector should try
	// to stop containers before removal.
	unknown bool
}
    67  
// sandboxGCInfo is the internal information kept for sandboxes being
// considered for GC.
type sandboxGCInfo struct {
	// The ID of the sandbox, as reported by the runtime.
	id string
	// Creation time for the sandbox; used to order candidates so that the
	// oldest are removed first.
	createTime time.Time
	// If true, the sandbox is ready or still has containers. Active sandboxes
	// are never removed by removeOldestNSandboxes.
	active bool
}
    77  
// evictUnit is the key by which containers are grouped for eviction: a
// (pod UID, container name) pair. Per-unit limits are applied to all dead
// containers that share the same key.
type evictUnit struct {
	// UID of the pod, read from the container's labels.
	uid types.UID
	// Name of the container in the pod.
	name string
}
    85  
// containersByEvictUnit groups GC candidate containers by evict unit, i.e.
// by (pod UID, container name).
type containersByEvictUnit map[evictUnit][]containerGCInfo

// sandboxesByPodUID groups GC candidate sandboxes by the UID of their pod.
type sandboxesByPodUID map[types.UID][]sandboxGCInfo
    88  
    89  // NumContainers returns the number of containers in this map.
    90  func (cu containersByEvictUnit) NumContainers() int {
    91  	num := 0
    92  	for key := range cu {
    93  		num += len(cu[key])
    94  	}
    95  	return num
    96  }
    97  
// NumEvictUnits returns the number of evict units, i.e. distinct
// (pod UID, container name) keys, in this map.
func (cu containersByEvictUnit) NumEvictUnits() int {
	return len(cu)
}
   102  
   103  // Newest first.
   104  type byCreated []containerGCInfo
   105  
   106  func (a byCreated) Len() int           { return len(a) }
   107  func (a byCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   108  func (a byCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
   109  
   110  // Newest first.
   111  type sandboxByCreated []sandboxGCInfo
   112  
   113  func (a sandboxByCreated) Len() int           { return len(a) }
   114  func (a sandboxByCreated) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
   115  func (a sandboxByCreated) Less(i, j int) bool { return a[i].createTime.After(a[j].createTime) }
   116  
   117  // enforceMaxContainersPerEvictUnit enforces MaxPerPodContainer for each evictUnit.
   118  func (cgc *containerGC) enforceMaxContainersPerEvictUnit(ctx context.Context, evictUnits containersByEvictUnit, MaxContainers int) {
   119  	for key := range evictUnits {
   120  		toRemove := len(evictUnits[key]) - MaxContainers
   121  
   122  		if toRemove > 0 {
   123  			evictUnits[key] = cgc.removeOldestN(ctx, evictUnits[key], toRemove)
   124  		}
   125  	}
   126  }
   127  
   128  // removeOldestN removes the oldest toRemove containers and returns the resulting slice.
   129  func (cgc *containerGC) removeOldestN(ctx context.Context, containers []containerGCInfo, toRemove int) []containerGCInfo {
   130  	// Remove from oldest to newest (last to first).
   131  	numToKeep := len(containers) - toRemove
   132  	if numToKeep > 0 {
   133  		sort.Sort(byCreated(containers))
   134  	}
   135  	for i := len(containers) - 1; i >= numToKeep; i-- {
   136  		if containers[i].unknown {
   137  			// Containers in known state could be running, we should try
   138  			// to stop it before removal.
   139  			id := kubecontainer.ContainerID{
   140  				Type: cgc.manager.runtimeName,
   141  				ID:   containers[i].id,
   142  			}
   143  			message := "Container is in unknown state, try killing it before removal"
   144  			if err := cgc.manager.killContainer(ctx, nil, id, containers[i].name, message, reasonUnknown, nil, nil); err != nil {
   145  				klog.ErrorS(err, "Failed to stop container", "containerID", containers[i].id)
   146  				continue
   147  			}
   148  		}
   149  		if err := cgc.manager.removeContainer(ctx, containers[i].id); err != nil {
   150  			klog.ErrorS(err, "Failed to remove container", "containerID", containers[i].id)
   151  		}
   152  	}
   153  
   154  	// Assume we removed the containers so that we're not too aggressive.
   155  	return containers[:numToKeep]
   156  }
   157  
   158  // removeOldestNSandboxes removes the oldest inactive toRemove sandboxes and
   159  // returns the resulting slice.
   160  func (cgc *containerGC) removeOldestNSandboxes(ctx context.Context, sandboxes []sandboxGCInfo, toRemove int) {
   161  	numToKeep := len(sandboxes) - toRemove
   162  	if numToKeep > 0 {
   163  		sort.Sort(sandboxByCreated(sandboxes))
   164  	}
   165  	// Remove from oldest to newest (last to first).
   166  	for i := len(sandboxes) - 1; i >= numToKeep; i-- {
   167  		if !sandboxes[i].active {
   168  			cgc.removeSandbox(ctx, sandboxes[i].id)
   169  		}
   170  	}
   171  }
   172  
   173  // removeSandbox removes the sandbox by sandboxID.
   174  func (cgc *containerGC) removeSandbox(ctx context.Context, sandboxID string) {
   175  	klog.V(4).InfoS("Removing sandbox", "sandboxID", sandboxID)
   176  	// In normal cases, kubelet should've already called StopPodSandbox before
   177  	// GC kicks in. To guard against the rare cases where this is not true, try
   178  	// stopping the sandbox before removing it.
   179  	if err := cgc.client.StopPodSandbox(ctx, sandboxID); err != nil {
   180  		klog.ErrorS(err, "Failed to stop sandbox before removing", "sandboxID", sandboxID)
   181  		return
   182  	}
   183  	if err := cgc.client.RemovePodSandbox(ctx, sandboxID); err != nil {
   184  		klog.ErrorS(err, "Failed to remove sandbox", "sandboxID", sandboxID)
   185  	}
   186  }
   187  
   188  // evictableContainers gets all containers that are evictable. Evictable containers are: not running
   189  // and created more than MinAge ago.
   190  func (cgc *containerGC) evictableContainers(ctx context.Context, minAge time.Duration) (containersByEvictUnit, error) {
   191  	containers, err := cgc.manager.getKubeletContainers(ctx, true)
   192  	if err != nil {
   193  		return containersByEvictUnit{}, err
   194  	}
   195  
   196  	evictUnits := make(containersByEvictUnit)
   197  	newestGCTime := time.Now().Add(-minAge)
   198  	for _, container := range containers {
   199  		// Prune out running containers.
   200  		if container.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
   201  			continue
   202  		}
   203  
   204  		createdAt := time.Unix(0, container.CreatedAt)
   205  		if newestGCTime.Before(createdAt) {
   206  			continue
   207  		}
   208  
   209  		labeledInfo := getContainerInfoFromLabels(container.Labels)
   210  		containerInfo := containerGCInfo{
   211  			id:         container.Id,
   212  			name:       container.Metadata.Name,
   213  			createTime: createdAt,
   214  			unknown:    container.State == runtimeapi.ContainerState_CONTAINER_UNKNOWN,
   215  		}
   216  		key := evictUnit{
   217  			uid:  labeledInfo.PodUID,
   218  			name: containerInfo.name,
   219  		}
   220  		evictUnits[key] = append(evictUnits[key], containerInfo)
   221  	}
   222  
   223  	return evictUnits, nil
   224  }
   225  
   226  // evict all containers that are evictable
   227  func (cgc *containerGC) evictContainers(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
   228  	// Separate containers by evict units.
   229  	evictUnits, err := cgc.evictableContainers(ctx, gcPolicy.MinAge)
   230  	if err != nil {
   231  		return err
   232  	}
   233  
   234  	// Remove deleted pod containers if all sources are ready.
   235  	if allSourcesReady {
   236  		for key, unit := range evictUnits {
   237  			if cgc.podStateProvider.ShouldPodContentBeRemoved(key.uid) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(key.uid)) {
   238  				cgc.removeOldestN(ctx, unit, len(unit)) // Remove all.
   239  				delete(evictUnits, key)
   240  			}
   241  		}
   242  	}
   243  
   244  	// Enforce max containers per evict unit.
   245  	if gcPolicy.MaxPerPodContainer >= 0 {
   246  		cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, gcPolicy.MaxPerPodContainer)
   247  	}
   248  
   249  	// Enforce max total number of containers.
   250  	if gcPolicy.MaxContainers >= 0 && evictUnits.NumContainers() > gcPolicy.MaxContainers {
   251  		// Leave an equal number of containers per evict unit (min: 1).
   252  		numContainersPerEvictUnit := gcPolicy.MaxContainers / evictUnits.NumEvictUnits()
   253  		if numContainersPerEvictUnit < 1 {
   254  			numContainersPerEvictUnit = 1
   255  		}
   256  		cgc.enforceMaxContainersPerEvictUnit(ctx, evictUnits, numContainersPerEvictUnit)
   257  
   258  		// If we still need to evict, evict oldest first.
   259  		numContainers := evictUnits.NumContainers()
   260  		if numContainers > gcPolicy.MaxContainers {
   261  			flattened := make([]containerGCInfo, 0, numContainers)
   262  			for key := range evictUnits {
   263  				flattened = append(flattened, evictUnits[key]...)
   264  			}
   265  			sort.Sort(byCreated(flattened))
   266  
   267  			cgc.removeOldestN(ctx, flattened, numContainers-gcPolicy.MaxContainers)
   268  		}
   269  	}
   270  	return nil
   271  }
   272  
   273  // evictSandboxes remove all evictable sandboxes. An evictable sandbox must
   274  // meet the following requirements:
   275  //  1. not in ready state
   276  //  2. contains no containers.
   277  //  3. belong to a non-existent (i.e., already removed) pod, or is not the
   278  //     most recently created sandbox for the pod.
   279  func (cgc *containerGC) evictSandboxes(ctx context.Context, evictNonDeletedPods bool) error {
   280  	containers, err := cgc.manager.getKubeletContainers(ctx, true)
   281  	if err != nil {
   282  		return err
   283  	}
   284  
   285  	sandboxes, err := cgc.manager.getKubeletSandboxes(ctx, true)
   286  	if err != nil {
   287  		return err
   288  	}
   289  
   290  	// collect all the PodSandboxId of container
   291  	sandboxIDs := sets.NewString()
   292  	for _, container := range containers {
   293  		sandboxIDs.Insert(container.PodSandboxId)
   294  	}
   295  
   296  	sandboxesByPod := make(sandboxesByPodUID, len(sandboxes))
   297  	for _, sandbox := range sandboxes {
   298  		podUID := types.UID(sandbox.Metadata.Uid)
   299  		sandboxInfo := sandboxGCInfo{
   300  			id:         sandbox.Id,
   301  			createTime: time.Unix(0, sandbox.CreatedAt),
   302  		}
   303  
   304  		// Set ready sandboxes and sandboxes that still have containers to be active.
   305  		if sandbox.State == runtimeapi.PodSandboxState_SANDBOX_READY || sandboxIDs.Has(sandbox.Id) {
   306  			sandboxInfo.active = true
   307  		}
   308  
   309  		sandboxesByPod[podUID] = append(sandboxesByPod[podUID], sandboxInfo)
   310  	}
   311  
   312  	for podUID, sandboxes := range sandboxesByPod {
   313  		if cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) || (evictNonDeletedPods && cgc.podStateProvider.ShouldPodRuntimeBeRemoved(podUID)) {
   314  			// Remove all evictable sandboxes if the pod has been removed.
   315  			// Note that the latest dead sandbox is also removed if there is
   316  			// already an active one.
   317  			cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes))
   318  		} else {
   319  			// Keep latest one if the pod still exists.
   320  			cgc.removeOldestNSandboxes(ctx, sandboxes, len(sandboxes)-1)
   321  		}
   322  	}
   323  	return nil
   324  }
   325  
   326  // evictPodLogsDirectories evicts all evictable pod logs directories. Pod logs directories
   327  // are evictable if there are no corresponding pods.
   328  func (cgc *containerGC) evictPodLogsDirectories(ctx context.Context, allSourcesReady bool) error {
   329  	osInterface := cgc.manager.osInterface
   330  	podLogsDirectory := cgc.manager.podLogsDirectory
   331  	if allSourcesReady {
   332  		// Only remove pod logs directories when all sources are ready.
   333  		dirs, err := osInterface.ReadDir(podLogsDirectory)
   334  		if err != nil {
   335  			return fmt.Errorf("failed to read podLogsDirectory %q: %w", podLogsDirectory, err)
   336  		}
   337  		for _, dir := range dirs {
   338  			name := dir.Name()
   339  			podUID := parsePodUIDFromLogsDirectory(name)
   340  			if !cgc.podStateProvider.ShouldPodContentBeRemoved(podUID) {
   341  				continue
   342  			}
   343  			klog.V(4).InfoS("Removing pod logs", "podUID", podUID)
   344  			err := osInterface.RemoveAll(filepath.Join(podLogsDirectory, name))
   345  			if err != nil {
   346  				klog.ErrorS(err, "Failed to remove pod logs directory", "path", name)
   347  			}
   348  		}
   349  	}
   350  
   351  	// Remove dead container log symlinks.
   352  	// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
   353  	logSymlinks, _ := osInterface.Glob(filepath.Join(legacyContainerLogsDir, fmt.Sprintf("*.%s", legacyLogSuffix)))
   354  	for _, logSymlink := range logSymlinks {
   355  		if _, err := osInterface.Stat(logSymlink); os.IsNotExist(err) {
   356  			if containerID, err := getContainerIDFromLegacyLogSymlink(logSymlink); err == nil {
   357  				resp, err := cgc.manager.runtimeService.ContainerStatus(ctx, containerID, false)
   358  				if err != nil {
   359  					// TODO: we should handle container not found (i.e. container was deleted) case differently
   360  					// once https://github.com/kubernetes/kubernetes/issues/63336 is resolved
   361  					klog.InfoS("Error getting ContainerStatus for containerID", "containerID", containerID, "err", err)
   362  				} else {
   363  					status := resp.GetStatus()
   364  					if status == nil {
   365  						klog.V(4).InfoS("Container status is nil")
   366  						continue
   367  					}
   368  					if status.State != runtimeapi.ContainerState_CONTAINER_EXITED {
   369  						// Here is how container log rotation works (see containerLogManager#rotateLatestLog):
   370  						//
   371  						// 1. rename current log to rotated log file whose filename contains current timestamp (fmt.Sprintf("%s.%s", log, timestamp))
   372  						// 2. reopen the container log
   373  						// 3. if #2 fails, rename rotated log file back to container log
   374  						//
   375  						// There is small but indeterministic amount of time during which log file doesn't exist (between steps #1 and #2, between #1 and #3).
   376  						// Hence the symlink may be deemed unhealthy during that period.
   377  						// See https://github.com/kubernetes/kubernetes/issues/52172
   378  						//
   379  						// We only remove unhealthy symlink for dead containers
   380  						klog.V(5).InfoS("Container is still running, not removing symlink", "containerID", containerID, "path", logSymlink)
   381  						continue
   382  					}
   383  				}
   384  			} else {
   385  				klog.V(4).InfoS("Unable to obtain container ID", "err", err)
   386  			}
   387  			err := osInterface.Remove(logSymlink)
   388  			if err != nil {
   389  				klog.ErrorS(err, "Failed to remove container log dead symlink", "path", logSymlink)
   390  			} else {
   391  				klog.V(4).InfoS("Removed symlink", "path", logSymlink)
   392  			}
   393  		}
   394  	}
   395  	return nil
   396  }
   397  
   398  // GarbageCollect removes dead containers using the specified container gc policy.
   399  // Note that gc policy is not applied to sandboxes. Sandboxes are only removed when they are
   400  // not ready and containing no containers.
   401  //
   402  // GarbageCollect consists of the following steps:
   403  // * gets evictable containers which are not active and created more than gcPolicy.MinAge ago.
   404  // * removes oldest dead containers for each pod by enforcing gcPolicy.MaxPerPodContainer.
   405  // * removes oldest dead containers by enforcing gcPolicy.MaxContainers.
   406  // * gets evictable sandboxes which are not ready and contains no containers.
   407  // * removes evictable sandboxes.
   408  func (cgc *containerGC) GarbageCollect(ctx context.Context, gcPolicy kubecontainer.GCPolicy, allSourcesReady bool, evictNonDeletedPods bool) error {
   409  	ctx, otelSpan := cgc.tracer.Start(ctx, "Containers/GarbageCollect")
   410  	defer otelSpan.End()
   411  	errors := []error{}
   412  	// Remove evictable containers
   413  	if err := cgc.evictContainers(ctx, gcPolicy, allSourcesReady, evictNonDeletedPods); err != nil {
   414  		errors = append(errors, err)
   415  	}
   416  
   417  	// Remove sandboxes with zero containers
   418  	if err := cgc.evictSandboxes(ctx, evictNonDeletedPods); err != nil {
   419  		errors = append(errors, err)
   420  	}
   421  
   422  	// Remove pod sandbox log directory
   423  	if err := cgc.evictPodLogsDirectories(ctx, allSourcesReady); err != nil {
   424  		errors = append(errors, err)
   425  	}
   426  	return utilerrors.NewAggregate(errors)
   427  }
   428  

View as plain text