...

Source file src/github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/task_hcs.go

Documentation: github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1

     1  //go:build windows
     2  
     3  package main
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"strings"
    11  	"sync"
    12  	"time"
    13  
    14  	eventstypes "github.com/containerd/containerd/api/events"
    15  	"github.com/containerd/containerd/errdefs"
    16  	"github.com/containerd/containerd/runtime"
    17  	"github.com/containerd/containerd/runtime/v2/task"
    18  	"github.com/containerd/typeurl"
    19  	"github.com/opencontainers/runtime-spec/specs-go"
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  	"go.opencensus.io/trace"
    23  
    24  	"github.com/Microsoft/go-winio/pkg/fs"
    25  	runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
    26  	"github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
    27  	"github.com/Microsoft/hcsshim/internal/cmd"
    28  	"github.com/Microsoft/hcsshim/internal/cow"
    29  	"github.com/Microsoft/hcsshim/internal/guestpath"
    30  	"github.com/Microsoft/hcsshim/internal/hcs"
    31  	"github.com/Microsoft/hcsshim/internal/hcs/resourcepaths"
    32  	"github.com/Microsoft/hcsshim/internal/hcs/schema1"
    33  	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
    34  	"github.com/Microsoft/hcsshim/internal/hcsoci"
    35  	"github.com/Microsoft/hcsshim/internal/jobcontainers"
    36  	"github.com/Microsoft/hcsshim/internal/log"
    37  	"github.com/Microsoft/hcsshim/internal/memory"
    38  	"github.com/Microsoft/hcsshim/internal/oc"
    39  	"github.com/Microsoft/hcsshim/internal/oci"
    40  	"github.com/Microsoft/hcsshim/internal/processorinfo"
    41  	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
    42  	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
    43  	"github.com/Microsoft/hcsshim/internal/resources"
    44  	"github.com/Microsoft/hcsshim/internal/shimdiag"
    45  	"github.com/Microsoft/hcsshim/internal/uvm"
    46  	"github.com/Microsoft/hcsshim/osversion"
    47  	"github.com/Microsoft/hcsshim/pkg/annotations"
    48  	"github.com/Microsoft/hcsshim/pkg/ctrdtaskapi"
    49  )
    50  
    51  func newHcsStandaloneTask(ctx context.Context, events publisher, req *task.CreateTaskRequest, s *specs.Spec) (shimTask, error) {
    52  	log.G(ctx).WithField("tid", req.ID).Debug("newHcsStandaloneTask")
    53  
    54  	ct, _, err := oci.GetSandboxTypeAndID(s.Annotations)
    55  	if err != nil {
    56  		return nil, err
    57  	}
    58  	if ct != oci.KubernetesContainerTypeNone {
    59  		return nil, errors.Wrapf(
    60  			errdefs.ErrFailedPrecondition,
    61  			"cannot create standalone task, expected no annotation: '%s': got '%s'",
    62  			annotations.KubernetesContainerType,
    63  			ct)
    64  	}
    65  
    66  	owner := filepath.Base(os.Args[0])
    67  
    68  	var parent *uvm.UtilityVM
    69  	if osversion.Build() >= osversion.RS5 && oci.IsIsolated(s) {
    70  		// Create the UVM parent
    71  		opts, err := oci.SpecToUVMCreateOpts(ctx, s, fmt.Sprintf("%s@vm", req.ID), owner)
    72  		if err != nil {
    73  			return nil, err
    74  		}
    75  		switch opts.(type) {
    76  		case *uvm.OptionsLCOW:
    77  			lopts := (opts).(*uvm.OptionsLCOW)
    78  			parent, err = uvm.CreateLCOW(ctx, lopts)
    79  			if err != nil {
    80  				return nil, err
    81  			}
    82  		case *uvm.OptionsWCOW:
    83  			wopts := (opts).(*uvm.OptionsWCOW)
    84  
    85  			// In order for the UVM sandbox.vhdx not to collide with the actual
    86  			// nested Argon sandbox.vhdx we append the \vm folder to the last
    87  			// entry in the list.
    88  			layersLen := len(s.Windows.LayerFolders)
    89  			layers := make([]string, layersLen)
    90  			copy(layers, s.Windows.LayerFolders)
    91  
    92  			vmPath := filepath.Join(layers[layersLen-1], "vm")
    93  			err := os.MkdirAll(vmPath, 0)
    94  			if err != nil {
    95  				return nil, err
    96  			}
    97  			layers[layersLen-1] = vmPath
    98  			wopts.LayerFolders = layers
    99  
   100  			parent, err = uvm.CreateWCOW(ctx, wopts)
   101  			if err != nil {
   102  				return nil, err
   103  			}
   104  		}
   105  		if err = parent.Start(ctx); err != nil {
   106  			parent.Close()
   107  			return nil, err
   108  		}
   109  	} else if !oci.IsWCOW(s) {
   110  		return nil, errors.Wrap(errdefs.ErrFailedPrecondition, "oci spec does not contain WCOW or LCOW spec")
   111  	}
   112  
   113  	shim, err := newHcsTask(ctx, events, parent, true, req, s)
   114  	if err != nil {
   115  		if parent != nil {
   116  			parent.Close()
   117  		}
   118  		return nil, err
   119  	}
   120  	return shim, nil
   121  }
   122  
   123  // createContainer returns either a process/hypervisor isolated container or a job
   124  // container, based on what is set in the OCI spec.
   125  func createContainer(ctx context.Context, id, owner, netNS string, s *specs.Spec, parent *uvm.UtilityVM, shimOpts *runhcsopts.Options) (cow.Container, *resources.Resources, error) {
   126  	var (
   127  		err       error
   128  		container cow.Container
   129  		resources *resources.Resources
   130  	)
   131  
   132  	if oci.IsJobContainer(s) {
   133  		container, resources, err = jobcontainers.Create(ctx, id, s)
   134  		if err != nil {
   135  			return nil, nil, err
   136  		}
   137  	} else {
   138  		opts := &hcsoci.CreateOptions{
   139  			ID:               id,
   140  			Owner:            owner,
   141  			Spec:             s,
   142  			HostingSystem:    parent,
   143  			NetworkNamespace: netNS,
   144  		}
   145  		if shimOpts != nil {
   146  			opts.ScaleCPULimitsToSandbox = shimOpts.ScaleCpuLimitsToSandbox
   147  		}
   148  		container, resources, err = hcsoci.CreateContainer(ctx, opts)
   149  		if err != nil {
   150  			return nil, nil, err
   151  		}
   152  	}
   153  	return container, resources, nil
   154  }
   155  
   156  // newHcsTask creates a container within `parent` and its init exec process in
   157  // the `shimExecCreated` state and returns the task that tracks its lifetime.
   158  //
   159  // If `parent == nil` the container is created on the host.
   160  func newHcsTask(
   161  	ctx context.Context,
   162  	events publisher,
   163  	parent *uvm.UtilityVM,
   164  	ownsParent bool,
   165  	req *task.CreateTaskRequest,
   166  	s *specs.Spec) (_ shimTask, err error) {
   167  	log.G(ctx).WithFields(logrus.Fields{
   168  		"tid":        req.ID,
   169  		"ownsParent": ownsParent,
   170  	}).Debug("newHcsTask")
   171  
   172  	owner := filepath.Base(os.Args[0])
   173  
   174  	var netNS string
   175  	if s.Windows != nil &&
   176  		s.Windows.Network != nil {
   177  		netNS = s.Windows.Network.NetworkNamespace
   178  	}
   179  
   180  	var shimOpts *runhcsopts.Options
   181  	if req.Options != nil {
   182  		v, err := typeurl.UnmarshalAny(req.Options)
   183  		if err != nil {
   184  			return nil, err
   185  		}
   186  		shimOpts = v.(*runhcsopts.Options)
   187  	}
   188  
   189  	// Default to an infinite timeout (zero value)
   190  	var ioRetryTimeout time.Duration
   191  	if shimOpts != nil {
   192  		ioRetryTimeout = time.Duration(shimOpts.IoRetryTimeoutInSec) * time.Second
   193  	}
   194  	io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal, ioRetryTimeout)
   195  	if err != nil {
   196  		return nil, err
   197  	}
   198  
   199  	container, resources, err := createContainer(ctx, req.ID, owner, netNS, s, parent, shimOpts)
   200  	if err != nil {
   201  		return nil, err
   202  	}
   203  
   204  	ht := &hcsTask{
   205  		events:         events,
   206  		id:             req.ID,
   207  		isWCOW:         oci.IsWCOW(s),
   208  		c:              container,
   209  		cr:             resources,
   210  		ownsHost:       ownsParent,
   211  		host:           parent,
   212  		closed:         make(chan struct{}),
   213  		taskSpec:       s,
   214  		ioRetryTimeout: ioRetryTimeout,
   215  	}
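        	// The init exec is created with the task ID: req.ID is passed below as both
        	// the task ID and the exec ID.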
   216  	ht.init = newHcsExec(
   217  		ctx,
   218  		events,
   219  		req.ID,
   220  		parent,
   221  		container,
   222  		req.ID,
   223  		req.Bundle,
   224  		ht.isWCOW,
   225  		s.Process,
   226  		io,
   227  	)
   228  
   229  	if parent != nil {
   230  		// We have a parent UVM. Listen for its exit and forcibly close this
   231  		// task. This is not expected but in the event of a UVM crash we need to
   232  		// handle this case.
   233  		go ht.waitForHostExit()
   234  	}
   235  
   236  	go ht.waitInitExit()
   237  
   238  	// Publish the created event
   239  	if err := ht.events.publishEvent(
   240  		ctx,
   241  		runtime.TaskCreateEventTopic,
   242  		&eventstypes.TaskCreate{
   243  			ContainerID: req.ID,
   244  			Bundle:      req.Bundle,
   245  			Rootfs:      req.Rootfs,
   246  			IO: &eventstypes.TaskIO{
   247  				Stdin:    req.Stdin,
   248  				Stdout:   req.Stdout,
   249  				Stderr:   req.Stderr,
   250  				Terminal: req.Terminal,
   251  			},
   252  			Checkpoint: "",
   253  			Pid:        uint32(ht.init.Pid()),
   254  		}); err != nil {
   255  		return nil, err
   256  	}
   257  	return ht, nil
   258  }
   259  
   260  var _ = (shimTask)(&hcsTask{})
   261  
   262  // hcsTask is a generic task that represents a WCOW container (process or
   263  // hypervisor isolated) or an LCOW container. This task MAY own the UVM the
   264  // container is in, but in the case of a POD it may just track the UVM for
   265  // container lifetime management. In the case of ownership, when the init
   266  // task/exec is stopped the UVM itself will be stopped as well.
   267  type hcsTask struct {
   268  	events publisher
   269  	// id is the id of this task when it is created.
   270  	//
   271  	// It MUST be treated as read only in the lifetime of the task.
   272  	id string
   273  	// isWCOW is set to `true` if this is a task representing a Windows container.
   274  	//
   275  	// It MUST be treated as read only in the lifetime of the task.
   276  	isWCOW bool
   277  	// c is the container backing this task.
   278  	//
   279  	// It MUST be treated as read only in the lifetime of this task EXCEPT after
   280  	// a Kill to the init task, in which case it must be shut down.
   281  	c cow.Container
   282  	// cr is the container resources this task is holding.
   283  	//
   284  	// It MUST be treated as read only in the lifetime of this task EXCEPT after
   285  	// a Kill to the init task, in which case all resources must be released.
   286  	cr *resources.Resources
   287  	// init is the init process of the container.
   288  	//
   289  	// Note: the invariant `container state == init.State()` MUST be true, i.e.
   290  	// if the init process exits, the container as a whole and all execs MUST
   291  	// exit.
   292  	//
   293  	// It MUST be treated as read only in the lifetime of the task.
   294  	init shimExec
   295  	// ownsHost is `true` if this task owns `host`. If so when this tasks init
   296  	// exec shuts down it is required that `host` be shut down as well.
   297  	ownsHost bool
   298  	// host is the hosting VM for this exec if hypervisor isolated. If
   299  	// `host==nil` this is an Argon task so no UVM cleanup is required.
   300  	//
   301  	// NOTE: if `osversion.Build() < osversion.RS5` this will always be
   302  	// `nil`.
   303  	host *uvm.UtilityVM
   304  
   305  	// ecl is the exec create lock for all non-init execs and MUST be held
   306  	// during create to prevent ID duplication.
   307  	ecl   sync.Mutex
   308  	execs sync.Map
   309  
   310  	closed    chan struct{}
   311  	closeOnce sync.Once
   312  	// closeHostOnce is used to close `host`. This will only be used if
   313  	// `ownsHost==true` and `host != nil`.
   314  	closeHostOnce sync.Once
   315  
   316  	// taskSpec represents the spec/configuration for this task.
   317  	taskSpec *specs.Spec
   318  
   319  	// ioRetryTimeout is how long to keep trying to reconnect to the stdio pipes from containerd.
   320  	ioRetryTimeout time.Duration
   321  }
   322  
   323  func (ht *hcsTask) ID() string {
   324  	return ht.id
   325  }
   326  
   327  func (ht *hcsTask) CreateExec(ctx context.Context, req *task.ExecProcessRequest, spec *specs.Process) error {
   328  	ht.ecl.Lock()
   329  	defer ht.ecl.Unlock()
   330  
   331  	// Fail if the exec already exists, or if the request is for "" which
   332  	// refers to the init exec.
   333  	if _, loaded := ht.execs.Load(req.ExecID); loaded || req.ExecID == "" {
   334  		return errors.Wrapf(errdefs.ErrAlreadyExists, "exec: '%s' in task: '%s' already exists", req.ExecID, ht.id)
   335  	}
   336  
   337  	if ht.init.State() != shimExecStateRunning {
   338  		return errors.Wrapf(errdefs.ErrFailedPrecondition, "exec: '' in task: '%s' must be running to create additional execs", ht.id)
   339  	}
   340  
   341  	io, err := cmd.NewUpstreamIO(ctx, req.ID, req.Stdout, req.Stderr, req.Stdin, req.Terminal, ht.ioRetryTimeout)
   342  	if err != nil {
   343  		return err
   344  	}
   345  
   346  	he := newHcsExec(
   347  		ctx,
   348  		ht.events,
   349  		ht.id,
   350  		ht.host,
   351  		ht.c,
   352  		req.ExecID,
   353  		ht.init.Status().Bundle,
   354  		ht.isWCOW,
   355  		spec,
   356  		io,
   357  	)
   358  
   359  	ht.execs.Store(req.ExecID, he)
   360  
   361  	// Publish the created event
   362  	return ht.events.publishEvent(
   363  		ctx,
   364  		runtime.TaskExecAddedEventTopic,
   365  		&eventstypes.TaskExecAdded{
   366  			ContainerID: ht.id,
   367  			ExecID:      req.ExecID,
   368  		})
   369  }
   370  
   371  func (ht *hcsTask) GetExec(eid string) (shimExec, error) {
   372  	if eid == "" {
   373  		return ht.init, nil
   374  	}
   375  	raw, loaded := ht.execs.Load(eid)
   376  	if !loaded {
   377  		return nil, errors.Wrapf(errdefs.ErrNotFound, "exec: '%s' in task: '%s' not found", eid, ht.id)
   378  	}
   379  	return raw.(shimExec), nil
   380  }
   381  
   382  func (ht *hcsTask) ListExecs() (_ []shimExec, err error) {
   383  	var execs []shimExec
   384  	ht.execs.Range(func(key, value interface{}) bool {
   385  		wt, ok := value.(shimExec)
   386  		if !ok {
   387  			err = fmt.Errorf("failed to load exec %q", key)
   388  			return false
   389  		}
   390  		execs = append(execs, wt)
   391  		return true
   392  	})
   393  	if err != nil {
   394  		return nil, err
   395  	}
   396  	return execs, nil
   397  }
   398  
   399  func (ht *hcsTask) KillExec(ctx context.Context, eid string, signal uint32, all bool) error {
   400  	e, err := ht.GetExec(eid)
   401  	if err != nil {
   402  		return err
   403  	}
   404  	if all && eid != "" {
   405  		return errors.Wrapf(errdefs.ErrFailedPrecondition, "cannot signal all for non-empty exec: '%s'", eid)
   406  	}
   407  	if all {
   408  		// We are in a kill all on the init task. Signal everything.
   409  		ht.execs.Range(func(key, value interface{}) bool {
   410  			err := value.(shimExec).Kill(ctx, signal)
   411  			if err != nil {
   412  				log.G(ctx).WithFields(logrus.Fields{
   413  					"eid":           key,
   414  					logrus.ErrorKey: err,
   415  				}).Warn("failed to kill exec in task")
   416  			}
   417  
   418  			// Iterate all. Returning false stops the iteration. See:
   419  			// https://pkg.go.dev/sync#Map.Range
   420  			return true
   421  		})
   422  	}
   423  	if signal == 0x9 && eid == "" && ht.host != nil {
   424  		// If this is a SIGKILL against the init process we start a background
   425  		// timer and wait on either the timer expiring or the process exiting
   426  		// cleanly. If the timer expires first we forcibly close the UVM as we
   427  		// assume the guest is misbehaving for some reason.
   428  		go func() {
   429  			t := time.NewTimer(30 * time.Second)
   430  			execExited := make(chan struct{})
   431  			go func() {
   432  				e.Wait()
   433  				close(execExited)
   434  			}()
   435  			select {
   436  			case <-execExited:
   437  				t.Stop()
   438  			case <-t.C:
   439  				// Safe to call multiple times if called previously on
   440  				// successful shutdown.
   441  				ht.host.Close()
   442  			}
   443  		}()
   444  	}
   445  	return e.Kill(ctx, signal)
   446  }
   447  
   448  func (ht *hcsTask) DeleteExec(ctx context.Context, eid string) (int, uint32, time.Time, error) {
   449  	e, err := ht.GetExec(eid)
   450  	if err != nil {
   451  		return 0, 0, time.Time{}, err
   452  	}
   453  	if eid == "" {
   454  		// We are deleting the init exec. Forcibly exit any additional execs.
   455  		ht.execs.Range(func(key, value interface{}) bool {
   456  			ex := value.(shimExec)
   457  			if s := ex.State(); s != shimExecStateExited {
   458  				ex.ForceExit(ctx, 1)
   459  			}
   460  
   461  			// Iterate all. Returning false stops the iteration. See:
   462  			// https://pkg.go.dev/sync#Map.Range
   463  			return true
   464  		})
   465  	}
   466  	switch state := e.State(); state {
   467  	case shimExecStateCreated:
   468  		e.ForceExit(ctx, 0)
   469  	case shimExecStateRunning:
   470  		return 0, 0, time.Time{}, newExecInvalidStateError(ht.id, eid, state, "delete")
   471  	}
   472  
   473  	if eid == "" {
   474  		// We are killing the init task, so we expect the container to be
   475  		// stopped after this.
   476  		//
   477  		// The task process may have already exited, and the status set to
   478  		// shimExecStateExited, but resources may still be in the process
   479  		// of being cleaned up. Wait for ht.closed to be closed. This signals
   480  		// that waitInitExit() has finished destroying container resources,
   481  		// and layers were unmounted.
   482  		// If the shim exits before resources are cleaned up, those resources
   483  		// will remain locked and untracked, which leads to lingering sandboxes
   484  		// and container resources like base vhdx.
   485  		select {
   486  		case <-time.After(30 * time.Second):
   487  			log.G(ctx).Error("timed out waiting for resource cleanup")
   488  			return 0, 0, time.Time{}, errors.Wrap(hcs.ErrTimeout, "waiting for container resource cleanup")
   489  		case <-ht.closed:
   490  		}
   491  
   492  		// The init task has now exited. A ForceExit() has already been sent to
   493  		// execs. Cleanup execs and continue.
   494  		ht.execs.Range(func(key, value interface{}) bool {
   495  			if key == "" {
   496  				// Iterate next.
   497  				return true
   498  			}
   499  			ht.execs.Delete(key)
   500  
   501  			// Iterate all. Returning false stops the iteration. See:
   502  			// https://pkg.go.dev/sync#Map.Range
   503  			return true
   504  		})
   505  
   506  		// cleanup the container directories inside the UVM if required.
   507  		if ht.host != nil {
   508  			if err := ht.host.DeleteContainerState(ctx, ht.id); err != nil {
   509  				log.G(ctx).WithError(err).Errorf("failed to delete container state")
   510  			}
   511  		}
   512  	}
   513  
   514  	status := e.Status()
   515  	if eid != "" {
   516  		ht.execs.Delete(eid)
   517  	}
   518  
   519  	// Publish the deleted event
   520  	if err := ht.events.publishEvent(
   521  		ctx,
   522  		runtime.TaskDeleteEventTopic,
   523  		&eventstypes.TaskDelete{
   524  			ContainerID: ht.id,
   525  			ID:          eid,
   526  			Pid:         status.Pid,
   527  			ExitStatus:  status.ExitStatus,
   528  			ExitedAt:    status.ExitedAt,
   529  		}); err != nil {
   530  		return 0, 0, time.Time{}, err
   531  	}
   532  
   533  	return int(status.Pid), status.ExitStatus, status.ExitedAt, nil
   534  }
   535  
   536  func (ht *hcsTask) Pids(ctx context.Context) ([]runhcsopts.ProcessDetails, error) {
   537  	// Map all user-created execs to pid/exec-id
   538  	pidMap := make(map[int]string)
   539  	ht.execs.Range(func(key, value interface{}) bool {
   540  		ex := value.(shimExec)
   541  		pidMap[ex.Pid()] = ex.ID()
   542  
   543  		// Iterate all. Returning false stops the iteration. See:
   544  		// https://pkg.go.dev/sync#Map.Range
   545  		return true
   546  	})
   547  	pidMap[ht.init.Pid()] = ht.init.ID()
   548  
   549  	// Get the guest pids
   550  	props, err := ht.c.Properties(ctx, schema1.PropertyTypeProcessList)
   551  	if err != nil {
   552  		if isStatsNotFound(err) {
   553  			return nil, errors.Wrapf(errdefs.ErrNotFound, "failed to fetch pids: %s", err)
   554  		}
   555  		return nil, err
   556  	}
   557  
   558  	// Copy to pid/exec-id pairs
   559  	pairs := make([]runhcsopts.ProcessDetails, len(props.ProcessList))
   560  	for i, p := range props.ProcessList {
   561  		pairs[i].ImageName = p.ImageName
   562  		pairs[i].CreatedAt = p.CreateTimestamp
   563  		pairs[i].KernelTime_100Ns = p.KernelTime100ns
   564  		pairs[i].MemoryCommitBytes = p.MemoryCommitBytes
   565  		pairs[i].MemoryWorkingSetPrivateBytes = p.MemoryWorkingSetPrivateBytes
   566  		pairs[i].MemoryWorkingSetSharedBytes = p.MemoryWorkingSetSharedBytes
   567  		pairs[i].ProcessID = p.ProcessId
   568  		pairs[i].UserTime_100Ns = p.UserTime100ns
   569  
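        		// Guest processes that were not started via an exec (e.g. processes spawned
        		// inside the container) have no entry in pidMap and keep an empty ExecID.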
   570  		if eid, ok := pidMap[int(p.ProcessId)]; ok {
   571  			pairs[i].ExecID = eid
   572  		}
   573  	}
   574  	return pairs, nil
   575  }
   576  
   577  func (ht *hcsTask) Wait() *task.StateResponse {
   578  	<-ht.closed
   579  	return ht.init.Wait()
   580  }
   581  
   582  func (ht *hcsTask) waitInitExit() {
   583  	ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitInitExit")
   584  	defer span.End()
   585  	span.AddAttributes(trace.StringAttribute("tid", ht.id))
   586  
   587  	// Wait for it to exit on its own
   588  	ht.init.Wait()
   589  
   590  	// Close the host and event the exit
   591  	ht.close(ctx)
   592  }
   593  
   594  // waitForHostExit waits for the host virtual machine to exit. Once it has
   595  // exited, all additional execs in this task are forcibly exited.
   596  //
   597  // This MUST be called via a goroutine to wait on a background thread.
   598  //
   599  // Note: For Windows process isolated containers there is no host virtual
   600  // machine so this should not be called.
   601  func (ht *hcsTask) waitForHostExit() {
   602  	ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitForHostExit")
   603  	defer span.End()
   604  	span.AddAttributes(trace.StringAttribute("tid", ht.id))
   605  
   606  	err := ht.host.Wait()
   607  	if err != nil {
   608  		log.G(ctx).WithError(err).Error("failed to wait for host virtual machine exit")
   609  	} else {
   610  		log.G(ctx).Debug("host virtual machine exited")
   611  	}
   612  
   613  	ht.execs.Range(func(key, value interface{}) bool {
   614  		ex := value.(shimExec)
   615  		ex.ForceExit(ctx, 1)
   616  
   617  		// Iterate all. Returning false stops the iteration. See:
   618  		// https://pkg.go.dev/sync#Map.Range
   619  		return true
   620  	})
   621  	ht.init.ForceExit(ctx, 1)
   622  	ht.closeHost(ctx)
   623  }
   624  
   625  // close shuts down the container that is owned by this task and if
   626  // `ht.ownsHost` will shutdown the hosting VM the container was placed in.
   627  //
   628  // NOTE: For Windows process isolated containers `ht.ownsHost==true && ht.host
   629  // == nil`.
   630  func (ht *hcsTask) close(ctx context.Context) {
   631  	ht.closeOnce.Do(func() {
   632  		log.G(ctx).Debug("hcsTask::closeOnce")
   633  
   634  		// ht.c should never be nil for a real task but in testing we stub
   635  		// this to avoid a nil dereference. We really should introduce a
   636  		// method or interface for ht.c operations that we can stub for
   637  		// testing.
   638  		if ht.c != nil {
   639  			// Do our best attempt to tear down the container.
   640  			var werr error
   641  			ch := make(chan struct{})
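        			// Wait for the container to exit in the background so that the Shutdown and
        			// Terminate paths below can be bounded by a timeout.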
   642  			go func() {
   643  				werr = ht.c.Wait()
   644  				close(ch)
   645  			}()
   646  			err := ht.c.Shutdown(ctx)
   647  			if err != nil {
   648  				log.G(ctx).WithError(err).Error("failed to shutdown container")
   649  			} else {
   650  				t := time.NewTimer(time.Second * 30)
   651  				select {
   652  				case <-ch:
   653  					err = werr
   654  					t.Stop()
   655  					if err != nil {
   656  						log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
   657  					}
   658  				case <-t.C:
   659  					err = hcs.ErrTimeout
   660  					log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
   661  				}
   662  			}
   663  
   664  			if err != nil {
   665  				err = ht.c.Terminate(ctx)
   666  				if err != nil {
   667  					log.G(ctx).WithError(err).Error("failed to terminate container")
   668  				} else {
   669  					t := time.NewTimer(time.Second * 30)
   670  					select {
   671  					case <-ch:
   672  						err = werr
   673  						t.Stop()
   674  						if err != nil {
   675  							log.G(ctx).WithError(err).Error("failed to wait for container terminate")
   676  						}
   677  					case <-t.C:
   678  						log.G(ctx).WithError(hcs.ErrTimeout).Error("failed to wait for container terminate")
   679  					}
   680  				}
   681  			}
   682  
   683  			// Release any resources associated with the container.
   684  			if err := resources.ReleaseResources(ctx, ht.cr, ht.host, true); err != nil {
   685  				log.G(ctx).WithError(err).Error("failed to release container resources")
   686  			}
   687  
   688  			// Close the container handle invalidating all future access.
   689  			if err := ht.c.Close(); err != nil {
   690  				log.G(ctx).WithError(err).Error("failed to close container")
   691  			}
   692  		}
   693  		ht.closeHost(ctx)
   694  	})
   695  }
   696  
   697  // closeHost safely closes the hosting UVM if this task is the owner. Once
   698  // closed and all resources released it events the `runtime.TaskExitEventTopic`
   699  // for all upstream listeners.
   700  //
   701  // Note: If this is a process isolated task the hosting UVM is simply a `noop`.
   702  //
   703  // This call is idempotent and safe to call multiple times.
   704  func (ht *hcsTask) closeHost(ctx context.Context) {
   705  	ht.closeHostOnce.Do(func() {
   706  		log.G(ctx).Debug("hcsTask::closeHostOnce")
   707  
   708  		if ht.ownsHost && ht.host != nil {
   709  			if err := ht.host.Close(); err != nil {
   710  				log.G(ctx).WithError(err).Error("failed host vm shutdown")
   711  			}
   712  		}
   713  		// Send the `init` exec exit notification always.
   714  		exit := ht.init.Status()
   715  
   716  		if err := ht.events.publishEvent(
   717  			ctx,
   718  			runtime.TaskExitEventTopic,
   719  			&eventstypes.TaskExit{
   720  				ContainerID: ht.id,
   721  				ID:          exit.ID,
   722  				Pid:         uint32(exit.Pid),
   723  				ExitStatus:  exit.ExitStatus,
   724  				ExitedAt:    exit.ExitedAt,
   725  			}); err != nil {
   726  			log.G(ctx).WithError(err).Error("failed to publish TaskExitEventTopic")
   727  		}
   728  		close(ht.closed)
   729  	})
   730  }
   731  
   732  func (ht *hcsTask) ExecInHost(ctx context.Context, req *shimdiag.ExecProcessRequest) (int, error) {
   733  	cmdReq := &cmd.CmdProcessRequest{
   734  		Args:     req.Args,
   735  		Workdir:  req.Workdir,
   736  		Terminal: req.Terminal,
   737  		Stdin:    req.Stdin,
   738  		Stdout:   req.Stdout,
   739  		Stderr:   req.Stderr,
   740  	}
   741  
   742  	if ht.host == nil {
   743  		return cmd.ExecInShimHost(ctx, cmdReq)
   744  	}
   745  	return cmd.ExecInUvm(ctx, ht.host, cmdReq)
   746  }
   747  
   748  func (ht *hcsTask) DumpGuestStacks(ctx context.Context) string {
   749  	if ht.host != nil {
   750  		stacks, err := ht.host.DumpStacks(ctx)
   751  		if err != nil {
   752  			log.G(ctx).WithError(err).Warn("failed to capture guest stacks")
   753  		} else {
   754  			return stacks
   755  		}
   756  	}
   757  	return ""
   758  }
   759  
   760  func (ht *hcsTask) Share(ctx context.Context, req *shimdiag.ShareRequest) error {
   761  	if ht.host == nil {
   762  		return errTaskNotIsolated
   763  	}
   764  	return ht.host.Share(ctx, req.HostPath, req.UvmPath, req.ReadOnly)
   765  }
   766  
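        // hcsPropertiesToWindowsStats converts HCS container properties into the
        // Windows container statistics message used in the shim's Stats response.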
   767  func hcsPropertiesToWindowsStats(props *hcsschema.Properties) *stats.Statistics_Windows {
   768  	wcs := &stats.Statistics_Windows{Windows: &stats.WindowsContainerStatistics{}}
   769  	if props.Statistics != nil {
   770  		wcs.Windows.Timestamp = props.Statistics.Timestamp
   771  		wcs.Windows.ContainerStartTime = props.Statistics.ContainerStartTime
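        		// HCS reports durations in 100ns units; multiply by 100 to convert to nanoseconds.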
   772  		wcs.Windows.UptimeNS = props.Statistics.Uptime100ns * 100
   773  		if props.Statistics.Processor != nil {
   774  			wcs.Windows.Processor = &stats.WindowsContainerProcessorStatistics{
   775  				TotalRuntimeNS:  props.Statistics.Processor.TotalRuntime100ns * 100,
   776  				RuntimeUserNS:   props.Statistics.Processor.RuntimeUser100ns * 100,
   777  				RuntimeKernelNS: props.Statistics.Processor.RuntimeKernel100ns * 100,
   778  			}
   779  		}
   780  		if props.Statistics.Memory != nil {
   781  			wcs.Windows.Memory = &stats.WindowsContainerMemoryStatistics{
   782  				MemoryUsageCommitBytes:            props.Statistics.Memory.MemoryUsageCommitBytes,
   783  				MemoryUsageCommitPeakBytes:        props.Statistics.Memory.MemoryUsageCommitPeakBytes,
   784  				MemoryUsagePrivateWorkingSetBytes: props.Statistics.Memory.MemoryUsagePrivateWorkingSetBytes,
   785  			}
   786  		}
   787  		if props.Statistics.Storage != nil {
   788  			wcs.Windows.Storage = &stats.WindowsContainerStorageStatistics{
   789  				ReadCountNormalized:  props.Statistics.Storage.ReadCountNormalized,
   790  				ReadSizeBytes:        props.Statistics.Storage.ReadSizeBytes,
   791  				WriteCountNormalized: props.Statistics.Storage.WriteCountNormalized,
   792  				WriteSizeBytes:       props.Statistics.Storage.WriteSizeBytes,
   793  			}
   794  		}
   795  	}
   796  	return wcs
   797  }
   798  
   799  func (ht *hcsTask) Stats(ctx context.Context) (*stats.Statistics, error) {
   800  	s := &stats.Statistics{}
   801  	props, err := ht.c.PropertiesV2(ctx, hcsschema.PTStatistics)
   802  	if err != nil {
   803  		if isStatsNotFound(err) {
   804  			return nil, errors.Wrapf(errdefs.ErrNotFound, "failed to fetch stats: %s", err)
   805  		}
   806  		return nil, err
   807  	}
   808  
   809  	if props != nil {
   810  		if ht.isWCOW {
   811  			s.Container = hcsPropertiesToWindowsStats(props)
   812  		} else {
   813  			s.Container = &stats.Statistics_Linux{Linux: props.Metrics}
   814  		}
   815  	}
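        	// Include UVM statistics only when this task owns the hosting VM.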
   816  	if ht.ownsHost && ht.host != nil {
   817  		vmStats, err := ht.host.Stats(ctx)
   818  		if err != nil && !isStatsNotFound(err) {
   819  			return nil, err
   820  		}
   821  		s.VM = vmStats
   822  	}
   823  	return s, nil
   824  }
   825  
   826  func (ht *hcsTask) Update(ctx context.Context, req *task.UpdateTaskRequest) error {
   827  	resources, err := typeurl.UnmarshalAny(req.Resources)
   828  	if err != nil {
   829  		return errors.Wrapf(err, "failed to unmarshal resources for container %s update request", req.ID)
   830  	}
   831  
   832  	if err := verifyTaskUpdateResourcesType(resources); err != nil {
   833  		return err
   834  	}
   835  
   836  	if ht.ownsHost && ht.host != nil {
   837  		return ht.host.Update(ctx, resources, req.Annotations)
   838  	}
   839  
   840  	return ht.updateTaskContainerResources(ctx, resources, req.Annotations)
   841  }
   842  
   843  func (ht *hcsTask) updateTaskContainerResources(ctx context.Context, data interface{}, annotations map[string]string) error {
   844  	if ht.isWCOW {
   845  		switch resources := data.(type) {
   846  		case *specs.WindowsResources:
   847  			return ht.updateWCOWResources(ctx, resources, annotations)
   848  		case *ctrdtaskapi.ContainerMount:
   849  			// Adding a mount to a running container is currently only supported for Windows containers
   850  			return ht.updateWCOWContainerMount(ctx, resources, annotations)
   851  		default:
   852  			return errNotSupportedResourcesRequest
   853  		}
   854  	}
   855  
   856  	return ht.updateLCOWResources(ctx, data, annotations)
   857  }
   858  
   859  func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.WindowsCPUResources) error {
   860  	// If the host is 20H2+ we can make the request directly to HCS.
   861  	if osversion.Get().Build >= osversion.V20H2 {
   862  		req := &hcsschema.Processor{}
   863  		if cpu.Count != nil {
   864  			procCount := int32(*cpu.Count)
   865  			hostProcs := processorinfo.ProcessorCount()
   866  			if ht.host != nil {
   867  				hostProcs = ht.host.ProcessorCount()
   868  			}
   869  			req.Count = hcsoci.NormalizeProcessorCount(ctx, ht.id, procCount, hostProcs)
   870  		}
   871  		if cpu.Maximum != nil {
   872  			req.Maximum = int32(*cpu.Maximum)
   873  		}
   874  		if cpu.Shares != nil {
   875  			req.Weight = int32(*cpu.Shares)
   876  		}
   877  		return ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req)
   878  	}
   879  
   880  	return errdefs.ErrNotImplemented
   881  }
   882  
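        // isValidWindowsCPUResources reports whether exactly one of Count, Shares, or
        // Maximum is set; the three CPU limit types are mutually exclusive in an update.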
   883  func isValidWindowsCPUResources(c *specs.WindowsCPUResources) bool {
   884  	return (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) ||
   885  		(c.Shares != nil && (c.Count == nil && c.Maximum == nil)) ||
   886  		(c.Maximum != nil && (c.Count == nil && c.Shares == nil))
   887  }
   888  
   889  func (ht *hcsTask) updateWCOWResources(ctx context.Context, resources *specs.WindowsResources, annotations map[string]string) error {
   890  	if resources.Memory != nil && resources.Memory.Limit != nil {
   891  		newMemorySizeInMB := *resources.Memory.Limit / memory.MiB
   892  		memoryLimit := hcsoci.NormalizeMemorySize(ctx, ht.id, newMemorySizeInMB)
   893  		if err := ht.requestUpdateContainer(ctx, resourcepaths.SiloMemoryResourcePath, memoryLimit); err != nil {
   894  			return err
   895  		}
   896  	}
   897  	if resources.CPU != nil {
   898  		if !isValidWindowsCPUResources(resources.CPU) {
   899  			return fmt.Errorf("invalid cpu resources request for container %s: %v", ht.id, resources.CPU)
   900  		}
   901  		if err := ht.updateWCOWContainerCPU(ctx, resources.CPU); err != nil {
   902  			return err
   903  		}
   904  	}
   905  	return nil
   906  }
   907  
   908  func (ht *hcsTask) updateLCOWResources(ctx context.Context, data interface{}, annotations map[string]string) error {
   909  	resources, ok := data.(*specs.LinuxResources)
   910  	if !ok || resources == nil {
   911  		return errors.New("resources must be non-nil and of type *specs.LinuxResources when updating an LCOW container")
   912  	}
   913  	settings := guestresource.LCOWContainerConstraints{
   914  		Linux: *resources,
   915  	}
   916  	return ht.requestUpdateContainer(ctx, "", settings)
   917  }
   918  
   919  func (ht *hcsTask) requestUpdateContainer(ctx context.Context, resourcePath string, settings interface{}) error {
   920  	var modification interface{}
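        	// WCOW updates go through an HCS ModifySettingRequest addressed by resource
        	// path, while LCOW updates are sent to the guest as a container-constraints
        	// modification request.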
   921  	if ht.isWCOW {
   922  		modification = &hcsschema.ModifySettingRequest{
   923  			ResourcePath: resourcePath,
   924  			RequestType:  guestrequest.RequestTypeUpdate,
   925  			Settings:     settings,
   926  		}
   927  	} else {
   928  		modification = guestrequest.ModificationRequest{
   929  			ResourceType: guestresource.ResourceTypeContainerConstraints,
   930  			RequestType:  guestrequest.RequestTypeUpdate,
   931  			Settings:     settings,
   932  		}
   933  	}
   934  	return ht.c.Modify(ctx, modification)
   935  }
   936  
   937  func (ht *hcsTask) ProcessorInfo(ctx context.Context) (*processorInfo, error) {
   938  	if ht.host == nil {
   939  		return nil, errTaskNotIsolated
   940  	}
   941  	if !ht.ownsHost {
   942  		return nil, errors.New("not implemented")
   943  	}
   944  	return &processorInfo{
   945  		count: ht.host.ProcessorCount(),
   946  	}, nil
   947  }
   948  
   949  func (ht *hcsTask) requestAddContainerMount(ctx context.Context, resourcePath string, settings interface{}) error {
   950  	modification := &hcsschema.ModifySettingRequest{
   951  		ResourcePath: resourcePath,
   952  		RequestType:  guestrequest.RequestTypeAdd,
   953  		Settings:     settings,
   954  	}
   955  	return ht.c.Modify(ctx, modification)
   956  }
   957  
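        // isMountTypeSupported reports whether a mount of the given type and host path
        // can be added to a running container: only plain host directory mounts (an
        // empty mount type) whose host path is not a sandbox, hugepages, or pipe path
        // are supported.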
   958  func isMountTypeSupported(hostPath, mountType string) bool {
   959  	// currently we only support mounting of host volumes/directories
   960  	switch mountType {
   961  	case hcsoci.MountTypeBind, hcsoci.MountTypePhysicalDisk,
   962  		hcsoci.MountTypeVirtualDisk, hcsoci.MountTypeExtensibleVirtualDisk:
   963  		return false
   964  	default:
   965  		// Ensure that the host path is not a sandbox://, hugepages://, or \\.\pipe path
   966  		if strings.HasPrefix(hostPath, guestpath.SandboxMountPrefix) ||
   967  			strings.HasPrefix(hostPath, guestpath.HugePagesMountPrefix) ||
   968  			strings.HasPrefix(hostPath, guestpath.PipePrefix) {
   969  			return false
   970  		} else {
   971  			// hcsshim treats mountType == "" as a normal directory mount
   972  			// and this is supported
   973  			return mountType == ""
   974  		}
   975  	}
   976  }
   977  
   978  func (ht *hcsTask) updateWCOWContainerMount(ctx context.Context, resources *ctrdtaskapi.ContainerMount, annotations map[string]string) error {
   979  	// hcsschema v2 support is required to add a mount to a running container
   980  	if osversion.Build() < osversion.RS5 {
   981  		// OS versions < RS5 only support hcsschema v1
   982  		return fmt.Errorf("hcsschema v1 unsupported")
   983  	}
   984  
   985  	if resources.HostPath == "" || resources.ContainerPath == "" {
   986  		return fmt.Errorf("invalid OCI spec - a mount must have both host and container path set")
   987  	}
   988  
   989  	// Check for valid mount type
   990  	if !isMountTypeSupported(resources.HostPath, resources.Type) {
   991  		return fmt.Errorf("invalid mount type %v. Currently only host volumes/directories can be mounted to running containers", resources.Type)
   992  	}
   993  
   994  	if ht.host == nil {
   995  		// HCS has a bug where it does not correctly resolve file (not dir) paths
   996  		// if the path includes a symlink. Therefore, we resolve the path here before
   997  		// passing it in. The issue does not occur with VSMB, so we don't need to worry
   998  		// about the isolated case.
   999  		hostPath, err := fs.ResolvePath(resources.HostPath)
  1000  		if err != nil {
  1001  			return errors.Wrapf(err, "failed to resolve path for hostPath %s", resources.HostPath)
  1002  		}
  1003  
  1004  		// process isolated windows container
  1005  		settings := hcsschema.MappedDirectory{
  1006  			HostPath:      hostPath,
  1007  			ContainerPath: resources.ContainerPath,
  1008  			ReadOnly:      resources.ReadOnly,
  1009  		}
  1010  		if err := ht.requestAddContainerMount(ctx, resourcepaths.SiloMappedDirectoryResourcePath, settings); err != nil {
  1011  			return errors.Wrapf(err, "failed to add mount to process isolated container")
  1012  		}
  1013  	} else {
  1014  		// If this is a mount request for a running Hyper-V isolated WCOW container, we first mount the volume
  1015  		// into the UVM as a VSMB share and then mount it into the running container using the path as seen by the UVM.
  1016  		vsmbShare, guestPath, err := ht.host.AddVsmbAndGetSharePath(ctx, resources.HostPath, resources.ContainerPath, resources.ReadOnly)
  1017  		if err != nil {
  1018  			return err
  1019  		}
  1020  		// Add mount to list of resources to be released on container cleanup
  1021  		ht.cr.Add(vsmbShare)
  1022  
  1023  		settings := hcsschema.MappedDirectory{
  1024  			HostPath:      guestPath,
  1025  			ContainerPath: resources.ContainerPath,
  1026  			ReadOnly:      resources.ReadOnly,
  1027  		}
  1028  		if err := ht.requestAddContainerMount(ctx, resourcepaths.SiloMappedDirectoryResourcePath, settings); err != nil {
  1029  			return errors.Wrapf(err, "failed to add mount to hyperV container")
  1030  		}
  1031  	}
  1032  	return nil
  1033  }
  1034  
