...

Source file src/github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/pod.go

Documentation: github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1

     1  //go:build windows
     2  
     3  package main
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/Microsoft/hcsshim/internal/log"
    14  	"github.com/Microsoft/hcsshim/internal/oci"
    15  	"github.com/Microsoft/hcsshim/internal/uvm"
    16  	"github.com/Microsoft/hcsshim/osversion"
    17  	"github.com/Microsoft/hcsshim/pkg/annotations"
    18  	eventstypes "github.com/containerd/containerd/api/events"
    19  	"github.com/containerd/containerd/errdefs"
    20  	"github.com/containerd/containerd/runtime"
    21  	"github.com/containerd/containerd/runtime/v2/task"
    22  	"github.com/opencontainers/runtime-spec/specs-go"
    23  	"github.com/pkg/errors"
    24  	"golang.org/x/sync/errgroup"
    25  )
    26  
// shimPod represents the logical grouping of all tasks in a single set of
// shared namespaces. The pod sandbox (container) is represented by the task
// whose id matches `shimPod.ID()`.
type shimPod interface {
	// ID is the id of the task representing the pause (sandbox) container.
	ID() string
	// CreateTask creates a workload task within this pod named `req.ID` with
	// settings `s`.
	//
	// If `req.ID==ID()` or `req.ID` is the same as any other task in this pod,
	// this pod MUST return `errdefs.ErrAlreadyExists`.
	CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (shimTask, error)
	// GetTask returns a task in this pod that matches `tid`.
	//
	// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
	GetTask(tid string) (shimTask, error)
	// ListTasks returns every task in the pod, including the sandbox task.
	//
	// If a task cannot be loaded, this will return an error.
	ListTasks() ([]shimTask, error)
	// KillTask sends `signal` to the task that matches `tid`.
	//
	// If `tid` is not found, this pod MUST return `errdefs.ErrNotFound`.
	//
	// If `tid==ID() && eid == "" && all == true` this pod will send `signal` to
	// all tasks in the pod and lastly send `signal` to the sandbox itself.
	//
	// If `all == true && eid != ""` this pod MUST return
	// `errdefs.ErrFailedPrecondition`.
	//
	// A call to `KillTask` is only valid when the exec found by `tid,eid` is in
	// the `shimExecStateRunning, shimExecStateExited` states. If the exec is
	// not in this state this pod MUST return `errdefs.ErrFailedPrecondition`.
	KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error
	// DeleteTask removes a task from being tracked by this pod, and cleans up
	// the resources the shim allocated for the task.
	//
	// The task's init exec (eid == "") must be either `shimExecStateCreated` or
	// `shimExecStateExited`. If the exec is not in this state this pod MUST
	// return `errdefs.ErrFailedPrecondition`. Deleting the pod's sandbox task
	// is a no-op.
	DeleteTask(ctx context.Context, tid string) error
}
    70  
    71  func createPod(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (_ shimPod, err error) {
    72  	log.G(ctx).WithField("tid", req.ID).Debug("createPod")
    73  
    74  	if osversion.Build() < osversion.RS5 {
    75  		return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "pod support is not available on Windows versions previous to RS5 (%d)", osversion.RS5)
    76  	}
    77  
    78  	ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
    79  	if err != nil {
    80  		return nil, err
    81  	}
    82  	if ct != oci.KubernetesContainerTypeSandbox {
    83  		return nil, errors.Wrapf(
    84  			errdefs.ErrFailedPrecondition,
    85  			"expected annotation: '%s': '%s' got '%s'",
    86  			annotations.KubernetesContainerType,
    87  			oci.KubernetesContainerTypeSandbox,
    88  			ct)
    89  	}
    90  	if sid != req.ID {
    91  		return nil, errors.Wrapf(
    92  			errdefs.ErrFailedPrecondition,
    93  			"expected annotation '%s': '%s' got '%s'",
    94  			annotations.KubernetesSandboxID,
    95  			req.ID,
    96  			sid)
    97  	}
    98  
    99  	owner := filepath.Base(os.Args[0])
   100  	isWCOW := oci.IsWCOW(s)
   101  
   102  	p := pod{
   103  		events: events,
   104  		id:     req.ID,
   105  		spec:   s,
   106  	}
   107  
   108  	var parent *uvm.UtilityVM
   109  	var lopts *uvm.OptionsLCOW
   110  	if oci.IsIsolated(s) {
   111  		// Create the UVM parent
   112  		opts, err := oci.SpecToUVMCreateOpts(ctx, s, fmt.Sprintf("%s@vm", req.ID), owner)
   113  		if err != nil {
   114  			return nil, err
   115  		}
   116  		switch opts.(type) {
   117  		case *uvm.OptionsLCOW:
   118  			lopts = (opts).(*uvm.OptionsLCOW)
   119  			lopts.BundleDirectory = req.Bundle
   120  			parent, err = uvm.CreateLCOW(ctx, lopts)
   121  			if err != nil {
   122  				return nil, err
   123  			}
   124  		case *uvm.OptionsWCOW:
   125  			wopts := (opts).(*uvm.OptionsWCOW)
   126  
   127  			// In order for the UVM sandbox.vhdx not to collide with the actual
   128  			// nested Argon sandbox.vhdx we append the \vm folder to the last
   129  			// entry in the list.
   130  			layersLen := len(s.Windows.LayerFolders)
   131  			layers := make([]string, layersLen)
   132  			copy(layers, s.Windows.LayerFolders)
   133  
   134  			vmPath := filepath.Join(layers[layersLen-1], "vm")
   135  			err := os.MkdirAll(vmPath, 0)
   136  			if err != nil {
   137  				return nil, err
   138  			}
   139  			layers[layersLen-1] = vmPath
   140  			wopts.LayerFolders = layers
   141  
   142  			parent, err = uvm.CreateWCOW(ctx, wopts)
   143  			if err != nil {
   144  				return nil, err
   145  			}
   146  		}
   147  		err = parent.Start(ctx)
   148  		if err != nil {
   149  			parent.Close()
   150  			return nil, err
   151  		}
   152  
   153  	} else if oci.IsJobContainer(s) {
   154  		// If we're making a job container fake a task (i.e reuse the wcowPodSandbox logic)
   155  		p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent, "")
   156  		if err := events.publishEvent(
   157  			ctx,
   158  			runtime.TaskCreateEventTopic,
   159  			&eventstypes.TaskCreate{
   160  				ContainerID: req.ID,
   161  				Bundle:      req.Bundle,
   162  				Rootfs:      req.Rootfs,
   163  				IO: &eventstypes.TaskIO{
   164  					Stdin:    req.Stdin,
   165  					Stdout:   req.Stdout,
   166  					Stderr:   req.Stderr,
   167  					Terminal: req.Terminal,
   168  				},
   169  				Checkpoint: "",
   170  				Pid:        0,
   171  			}); err != nil {
   172  			return nil, err
   173  		}
   174  		p.jobContainer = true
   175  		return &p, nil
   176  	} else if !isWCOW {
   177  		return nil, errors.Wrap(errdefs.ErrFailedPrecondition, "oci spec does not contain WCOW or LCOW spec")
   178  	}
   179  
   180  	defer func() {
   181  		// clean up the uvm if we fail any further operations
   182  		if err != nil && parent != nil {
   183  			parent.Close()
   184  		}
   185  	}()
   186  
   187  	p.host = parent
   188  	if parent != nil {
   189  		cid := req.ID
   190  		if id, ok := s.Annotations[annotations.NcproxyContainerID]; ok {
   191  			cid = id
   192  		}
   193  		caAddr := fmt.Sprintf(uvm.ComputeAgentAddrFmt, cid)
   194  		if err := parent.CreateAndAssignNetworkSetup(ctx, caAddr, cid); err != nil {
   195  			return nil, err
   196  		}
   197  	}
   198  
   199  	// TODO: JTERRY75 - There is a bug in the compartment activation for Windows
   200  	// Process isolated that requires us to create the real pause container to
   201  	// hold the network compartment open. This is not required for Windows
   202  	// Hypervisor isolated. When we have a build that supports this for Windows
   203  	// Process isolated make sure to move back to this model.
   204  
   205  	// For WCOW we fake out the init task since we dont need it. We only
   206  	// need to provision the guest network namespace if this is hypervisor
   207  	// isolated. Process isolated WCOW gets the namespace endpoints
   208  	// automatically.
   209  	nsid := ""
   210  	if isWCOW && parent != nil {
   211  		if s.Windows != nil && s.Windows.Network != nil {
   212  			nsid = s.Windows.Network.NetworkNamespace
   213  		}
   214  
   215  		if nsid != "" {
   216  			if err := parent.ConfigureNetworking(ctx, nsid); err != nil {
   217  				return nil, errors.Wrapf(err, "failed to setup networking for pod %q", req.ID)
   218  			}
   219  		}
   220  		p.sandboxTask = newWcowPodSandboxTask(ctx, events, req.ID, req.Bundle, parent, nsid)
   221  		// Publish the created event. We only do this for a fake WCOW task. A
   222  		// HCS Task will event itself based on actual process lifetime.
   223  		if err := events.publishEvent(
   224  			ctx,
   225  			runtime.TaskCreateEventTopic,
   226  			&eventstypes.TaskCreate{
   227  				ContainerID: req.ID,
   228  				Bundle:      req.Bundle,
   229  				Rootfs:      req.Rootfs,
   230  				IO: &eventstypes.TaskIO{
   231  					Stdin:    req.Stdin,
   232  					Stdout:   req.Stdout,
   233  					Stderr:   req.Stderr,
   234  					Terminal: req.Terminal,
   235  				},
   236  				Checkpoint: "",
   237  				Pid:        0,
   238  			}); err != nil {
   239  			return nil, err
   240  		}
   241  	} else {
   242  		if isWCOW {
   243  			defaultArgs := "c:\\windows\\system32\\cmd.exe"
   244  			// For the default pause image, the  entrypoint
   245  			// used is pause.exe
   246  			// If the default pause image is not used for pause containers,
   247  			// the activation will immediately exit on Windows
   248  			// because there is no command. We forcibly update the command here
   249  			// to keep it alive only for non-default pause images.
   250  			// TODO: This override can be completely removed from containerd/1.7
   251  			if (len(s.Process.Args) == 1 && strings.EqualFold(s.Process.Args[0], defaultArgs)) ||
   252  				strings.EqualFold(s.Process.CommandLine, defaultArgs) {
   253  				log.G(ctx).Warning("Detected CMD override for pause container entrypoint." +
   254  					"Please consider switching to a pause image with an explicit cmd set")
   255  				s.Process.CommandLine = "cmd /c ping -t 127.0.0.1 > nul"
   256  			}
   257  		}
   258  		// LCOW (and WCOW Process Isolated for the time being) requires a real
   259  		// task for the sandbox.
   260  		lt, err := newHcsTask(ctx, events, parent, true, req, s)
   261  		if err != nil {
   262  			return nil, err
   263  		}
   264  		p.sandboxTask = lt
   265  	}
   266  	return &p, nil
   267  }
   268  
// Compile-time check that *pod implements shimPod.
var _ = (shimPod)(&pod{})

// pod is the shimPod implementation returned by createPod.
type pod struct {
	// events is the publisher handed to every task created in this pod.
	events publisher
	// id is the id of the sandbox task when the pod is created.
	//
	// It MUST be treated as read only in the lifetime of the pod.
	id string
	// sandboxTask is the task that represents the sandbox.
	//
	// Note: The invariant `id==sandboxTask.ID()` MUST be true.
	//
	// It MUST be treated as read only in the lifetime of the pod.
	sandboxTask shimTask
	// host is the UtilityVM that is hosting `sandboxTask` if the task is
	// hypervisor isolated.
	//
	// It MUST be treated as read only in the lifetime of the pod.
	host *uvm.UtilityVM

	// jobContainer specifies whether this pod is for WCOW job containers only.
	//
	// It MUST be treated as read only in the lifetime of the pod.
	jobContainer bool

	// spec is the OCI runtime specification for the pod sandbox container.
	spec *specs.Spec

	// workloadTasks holds the workload tasks of this pod, keyed by task id
	// with `shimTask` values. The sandbox task is not stored here.
	workloadTasks sync.Map
}
   299  
// ID returns the id the pod was created with, which is also the id of its
// sandbox task.
func (p *pod) ID() string {
	return p.id
}
   303  
   304  func (p *pod) CreateTask(ctx context.Context, req *task.CreateTaskRequest, s *specs.Spec) (_ shimTask, err error) {
   305  	if req.ID == p.id {
   306  		return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists", req.ID)
   307  	}
   308  	e, _ := p.sandboxTask.GetExec("")
   309  	if e.State() != shimExecStateRunning {
   310  		return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "task with id: '%s' cannot be created in pod: '%s' which is not running", req.ID, p.id)
   311  	}
   312  
   313  	_, ok := p.workloadTasks.Load(req.ID)
   314  	if ok {
   315  		return nil, errors.Wrapf(errdefs.ErrAlreadyExists, "task with id: '%s' already exists id pod: '%s'", req.ID, p.id)
   316  	}
   317  
   318  	if p.jobContainer {
   319  		// This is a short circuit to make sure that all containers in a pod will have
   320  		// the same IP address/be added to the same compartment.
   321  		//
   322  		// There will need to be OS work needed to support this scenario, so for now we need to block on
   323  		// this.
   324  		if !oci.IsJobContainer(s) {
   325  			return nil, errors.New("cannot create a normal process isolated container if the pod sandbox is a job container")
   326  		}
   327  		// Pass through some annotations from the pod spec that if specified will need to be made available
   328  		// to every container as well. Kubernetes only passes annotations to RunPodSandbox so there needs to be
   329  		// a way for individual containers to get access to these.
   330  		oci.SandboxAnnotationsPassThrough(
   331  			p.spec.Annotations,
   332  			s.Annotations,
   333  			annotations.HostProcessInheritUser,
   334  			annotations.HostProcessRootfsLocation,
   335  		)
   336  	}
   337  
   338  	ct, sid, err := oci.GetSandboxTypeAndID(s.Annotations)
   339  	if err != nil {
   340  		return nil, err
   341  	}
   342  	if ct != oci.KubernetesContainerTypeContainer {
   343  		return nil, errors.Wrapf(
   344  			errdefs.ErrFailedPrecondition,
   345  			"expected annotation: '%s': '%s' got '%s'",
   346  			annotations.KubernetesContainerType,
   347  			oci.KubernetesContainerTypeContainer,
   348  			ct)
   349  	}
   350  	if sid != p.id {
   351  		return nil, errors.Wrapf(
   352  			errdefs.ErrFailedPrecondition,
   353  			"expected annotation '%s': '%s' got '%s'",
   354  			annotations.KubernetesSandboxID,
   355  			p.id,
   356  			sid)
   357  	}
   358  
   359  	st, err := newHcsTask(ctx, p.events, p.host, false, req, s)
   360  	if err != nil {
   361  		return nil, err
   362  	}
   363  
   364  	p.workloadTasks.Store(req.ID, st)
   365  	return st, nil
   366  }
   367  
   368  func (p *pod) GetTask(tid string) (shimTask, error) {
   369  	if tid == p.id {
   370  		return p.sandboxTask, nil
   371  	}
   372  	raw, loaded := p.workloadTasks.Load(tid)
   373  	if !loaded {
   374  		return nil, errors.Wrapf(errdefs.ErrNotFound, "task with id: '%s' not found", tid)
   375  	}
   376  	return raw.(shimTask), nil
   377  }
   378  
   379  func (p *pod) ListTasks() (_ []shimTask, err error) {
   380  	tasks := []shimTask{p.sandboxTask}
   381  	p.workloadTasks.Range(func(key, value interface{}) bool {
   382  		wt, loaded := value.(shimTask)
   383  		if !loaded {
   384  			err = fmt.Errorf("failed to load tasks %s", key)
   385  			return false
   386  		}
   387  		tasks = append(tasks, wt)
   388  		// Iterate all. Returning false stops the iteration. See:
   389  		// https://pkg.go.dev/sync#Map.Range
   390  		return true
   391  	})
   392  	if err != nil {
   393  		return nil, err
   394  	}
   395  	return tasks, nil
   396  }
   397  
   398  func (p *pod) KillTask(ctx context.Context, tid, eid string, signal uint32, all bool) error {
   399  	t, err := p.GetTask(tid)
   400  	if err != nil {
   401  		return err
   402  	}
   403  	if all && eid != "" {
   404  		return errors.Wrapf(errdefs.ErrFailedPrecondition, "cannot signal all with non empty ExecID: '%s'", eid)
   405  	}
   406  	eg := errgroup.Group{}
   407  	if all && tid == p.id {
   408  		// We are in a kill all on the sandbox task. Signal everything.
   409  		p.workloadTasks.Range(func(key, value interface{}) bool {
   410  			wt := value.(shimTask)
   411  			eg.Go(func() error {
   412  				return wt.KillExec(ctx, eid, signal, all)
   413  			})
   414  
   415  			// Iterate all. Returning false stops the iteration. See:
   416  			// https://pkg.go.dev/sync#Map.Range
   417  			return true
   418  		})
   419  	}
   420  	eg.Go(func() error {
   421  		return t.KillExec(ctx, eid, signal, all)
   422  	})
   423  	return eg.Wait()
   424  }
   425  
   426  func (p *pod) DeleteTask(ctx context.Context, tid string) error {
   427  	// Deleting the sandbox task is a no-op, since the service should delete its
   428  	// reference to the sandbox task or pod, and `p.sandboxTask != nil` is an
   429  	// invariant that is relied on elsewhere.
   430  	// However, still get the init exec for all tasks to ensure that they have
   431  	// been properly stopped.
   432  
   433  	t, err := p.GetTask(tid)
   434  	if err != nil {
   435  		return errors.Wrap(err, "could not find task to delete")
   436  	}
   437  
   438  	e, err := t.GetExec("")
   439  	if err != nil {
   440  		return errors.Wrap(err, "could not get initial exec")
   441  	}
   442  	if e.State() == shimExecStateRunning {
   443  		return errors.Wrap(errdefs.ErrFailedPrecondition, "cannot delete task with running exec")
   444  	}
   445  
   446  	if p.id != tid {
   447  		p.workloadTasks.Delete(tid)
   448  	}
   449  
   450  	return nil
   451  }
   452  

View as plain text