
Source file src/github.com/Microsoft/hcsshim/internal/jobcontainers/jobcontainer.go

Documentation: github.com/Microsoft/hcsshim/internal/jobcontainers

     1  //go:build windows
     2  
     3  package jobcontainers
     4  
     5  import (
     6  	"context"
     7  	"fmt"
     8  	"os"
     9  	"path/filepath"
    10  	"regexp"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  	"unsafe"
    15  
    16  	"github.com/Microsoft/go-winio/pkg/guid"
    17  	"github.com/Microsoft/hcsshim/internal/conpty"
    18  	"github.com/Microsoft/hcsshim/internal/cow"
    19  	"github.com/Microsoft/hcsshim/internal/exec"
    20  	"github.com/Microsoft/hcsshim/internal/hcs"
    21  	"github.com/Microsoft/hcsshim/internal/hcs/schema1"
    22  	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
    23  	"github.com/Microsoft/hcsshim/internal/jobobject"
    24  	"github.com/Microsoft/hcsshim/internal/log"
    25  	"github.com/Microsoft/hcsshim/internal/queue"
    26  	"github.com/Microsoft/hcsshim/internal/resources"
    27  	"github.com/Microsoft/hcsshim/internal/winapi"
    28  	specs "github.com/opencontainers/runtime-spec/specs-go"
    29  	"github.com/pkg/errors"
    30  	"golang.org/x/sys/windows"
    31  )
    32  
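        // fileBindingSupport records whether the host supports file/directory binding (bindflt);
        // checkBindSupportOnce guards the one-time detection performed in Create.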
    33  var (
    34  	fileBindingSupport   bool
    35  	checkBindSupportOnce sync.Once
    36  )
    37  
    38  const (
    39  	// jobContainerNameFmt is the naming format that job objects for job containers will follow.
    40  	jobContainerNameFmt = "JobContainer_%s"
    41  	// Environment variable set in every process in the job, detailing where the container's
    42  	// volume is mounted on the host.
    43  	sandboxMountPointEnvVar = "CONTAINER_SANDBOX_MOUNT_POINT"
    44  )
    45  
    46  // splitArgs splits a command line into arguments, ignoring spaces inside quotes.
    47  //
    48  // For example, instead of:
    49  // "\"Hello good\" morning world" --> ["\"Hello", "good\"", "morning", "world"]
    50  // we get ["\"Hello good\"", "morning", "world"]
    51  func splitArgs(cmdLine string) []string {
    52  	r := regexp.MustCompile(`[^\s"]+|"([^"]*)"`)
    53  	return r.FindAllString(cmdLine, -1)
    54  }
    55  
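        // initProc tracks the container's init (first) process. initBlock is closed once proc
        // has been assigned, so waiters know it is safe to access it.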
    56  type initProc struct {
    57  	initDoOnce sync.Once
    58  	proc       *JobProcess
    59  	initBlock  chan struct{}
    60  }
    61  
    62  // JobContainer represents a lightweight container composed from a job object.
    63  type JobContainer struct {
    64  	id string
    65  	// OCI spec used to create the container.
    66  	spec *specs.Spec
    67  	// The job object the container owns.
    68  	job *jobobject.JobObject
    69  	// Path to where the rootfs is located on the host
    70  	// if no file binding support is available, or in the
    71  	// silo if it is.
    72  	rootfsLocation string
    73  
    74  	closedWaitOnce   sync.Once
    75  	init             initProc
    76  	token            windows.Token
    77  	localUserAccount string
    78  	startTimestamp   time.Time
    79  	exited           chan struct{}
    80  	waitBlock        chan struct{}
    81  	waitError        error
    82  }
    83  
    84  // Compile-time checks for interface adherence.
    85  var (
    86  	_ cow.ProcessHost = &JobContainer{}
    87  	_ cow.Container   = &JobContainer{}
    88  )
    89  
    90  func newJobContainer(id string, s *specs.Spec) *JobContainer {
    91  	return &JobContainer{
    92  		id:        id,
    93  		spec:      s,
    94  		waitBlock: make(chan struct{}),
    95  		exited:    make(chan struct{}),
    96  		init:      initProc{initBlock: make(chan struct{})},
    97  	}
    98  }
    99  
   100  // Create creates a new JobContainer from the OCI runtime spec `s`.
   101  func Create(ctx context.Context, id string, s *specs.Spec) (_ cow.Container, _ *resources.Resources, err error) {
   102  	log.G(ctx).WithField("id", id).Debug("Creating job container")
   103  
   104  	if s == nil {
   105  		return nil, nil, errors.New("Spec must be supplied")
   106  	}
   107  
   108  	if id == "" {
   109  		g, err := guid.NewV4()
   110  		if err != nil {
   111  			return nil, nil, err
   112  		}
   113  		id = g.String()
   114  	}
   115  
   116  	container := newJobContainer(id, s)
   117  
   118  	// Create the job object all processes will run in.
   119  	options := &jobobject.Options{
   120  		Name:             fmt.Sprintf(jobContainerNameFmt, id),
   121  		Notifications:    true,
   122  		EnableIOTracking: true,
   123  	}
   124  	container.job, err = jobobject.Create(ctx, options)
   125  	if err != nil {
   126  		return nil, nil, fmt.Errorf("failed to create job object: %w", err)
   127  	}
   128  
   129  	// Parity with how we handle process-isolated containers. We set the same flag, which
   130  	// behaves the same way for a server silo.
   131  	if err := container.job.SetTerminateOnLastHandleClose(); err != nil {
   132  		return nil, nil, fmt.Errorf("failed to set terminate on last handle close on job container: %w", err)
   133  	}
   134  
   135  	r := resources.NewContainerResources(id)
   136  	defer func() {
   137  		if err != nil {
   138  			container.Close()
   139  			_ = resources.ReleaseResources(ctx, r, nil, true)
   140  		}
   141  	}()
   142  
   143  	// Check if we support file binding once to avoid needing to stat for the dll on
   144  	// every container creation.
   145  	//
   146  	// If file/directory binding support is available on the host, there's a lot of new functionality we
   147  	// can make use of that improves the UX for volume mounts and where the container's rootfs
   148  	// shows up on the host. The exhaustive list of differences in functionality is:
   149  	//
   150  	// 1. The container's job object is now upgraded to a silo. This is so we can make use of
   151  	// some functionality for silos that allows you to bind in a filesystem path and have it
   152  	// be unique to that silo and not viewable outside of the silo or in any other silos. This
   153  	// is the building block for the other changes below.
   154  	//
   155  	// 2. Directory and file mounts will now show up exactly where the container_path is
   156  	// pointing to. For example, with the below mount C:\path would show up in the container
   157  	// at C:\path\in\container just as you'd expect.
   158  	//
   159  	// {"host_path": "C:\path", "container_path": "C:\path\in\container"}
   160  	//
   161  	// Without file binding support, mounts will be symlinks under a relative path in the container's
   162  	// rootfs location on the host. For example, using the same request as above, C:\path\in\container
   163  	// would end up being placed at C:\<rootfslocation>\path\in\container. This is due to
   164  	// there being no way for us to have the path be unique in the face of multiple containers, or just
   165  	// the same file existing on the host. If two containers asked for two different paths to show up
   166  	// at C:\path\in\container, we can't symlink them both to that location. Note, however, that as a
   167  	// backwards compatibility measure for machines that don't have file binding support
   168  	// (WS2019 at the moment), we *also* bind the path under the container's rootfs location, so checking
   169  	// for your mount in either the old or the new location will work.
   170  	//
   171  	// 3. The container's rootfs location (C:\ in a typical Windows Server Container) can now be the
   172  	// same path in every container, and the default location is C:\hpc. This is possible because of the
   173  	// same per-silo file binding support mentioned above: we can take the unioned view of the container's
   174  	// layers and bind them to C:\hpc in the container and have a unique C:\hpc in every one. On machines
   175  	// where file binding isn't available the path has to be unique, as there is no form of filesystem
   176  	// virtualization or namespacing available to regular job objects. The format for machines with no
   177  	// file binding support is C:\hpc\<ContainerID>.
   178  	//
   179  	// 4. The user's working directory will be respected instead of being taken as a relative path under C:\<rootfslocation>.
   180  	// On machines without file binding there was no way to know beforehand where the rootfs for the container would
   181  	// show up, as you would need to know the container's ID before you launched it. Now that the
   182  	// rootfs location can be static, a user can easily supply C:\hpc\rest\of\path as their work dir and still
   183  	// supply anything outside of C:\hpc if they want another location on the host.
   184  	checkBindSupportOnce.Do(func() {
   185  		bindDLL := `C:\windows\system32\bindfltapi.dll`
   186  		if _, err := os.Stat(bindDLL); err == nil {
   187  			fileBindingSupport = true
   188  		}
   189  	})
   190  
   191  	var closer resources.ResourceCloser
   192  	if fileBindingSupport {
   193  		closer, err = container.bindSetup(ctx, s)
   194  	} else {
   195  		closer, err = container.fallbackSetup(ctx, s)
   196  	}
   197  	if err != nil {
   198  		return nil, nil, err
   199  	}
   200  	r.SetLayers(closer)
   201  
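        	// Validate that the spec's Root.Path is a volume GUID path (\\?\Volume{GUID}\) before
        	// applying resource limits and returning the container.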
   202  	volumeGUIDRegex := `^\\\\\?\\(Volume)\{{0,1}[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}(\}){0,1}\}(|\\)$`
   203  	if matched, err := regexp.MatchString(volumeGUIDRegex, s.Root.Path); !matched || err != nil {
   204  		return nil, nil, fmt.Errorf(`invalid container spec - Root.Path '%s' must be a volume GUID path in the format '\\?\Volume{GUID}\'`, s.Root.Path)
   205  	}
   206  
   207  	limits, err := specToLimits(ctx, id, s)
   208  	if err != nil {
   209  		return nil, nil, fmt.Errorf("failed to convert OCI spec to job object limits: %w", err)
   210  	}
   211  
   212  	// Set resource limits on the job object based off of oci spec.
   213  	if err := container.job.SetResourceLimits(limits); err != nil {
   214  		return nil, nil, fmt.Errorf("failed to set resource limits: %w", err)
   215  	}
   216  
   217  	go container.waitBackground(ctx)
   218  	return container, r, nil
   219  }
   220  
   221  // CreateProcess creates a process on the host, starts it, adds it to the container's
   222  // job object and then waits for it to exit.
   223  func (c *JobContainer) CreateProcess(ctx context.Context, config interface{}) (_ cow.Process, err error) {
   224  	conf, ok := config.(*hcsschema.ProcessParameters)
   225  	if !ok {
   226  		return nil, errors.New("unsupported process config passed in")
   227  	}
   228  
   229  	removeDriveLetter := func(name string) string {
   230  		// If the path is just the drive letter and colon (C:), replace it with a single backslash. Otherwise trim the drive letter and
   231  		// leave the rest of the path.
   232  		if len(name) == 2 && name[1] == ':' {
   233  			name = "\\"
   234  		} else if len(name) > 2 && name[1] == ':' {
   235  			name = name[2:]
   236  		}
   237  		return name
   238  	}
   239  
   240  	workDir := c.rootfsLocation
   241  	if conf.WorkingDirectory != "" {
   242  		var changed bool
   243  		// The call to replaceWithMountPoint below replaces any occurrences of the environment variable that points to where the container image
   244  		// volume is mounted. This is useful on machines without bindflt, as the volume is at a random location not known prior to launching the
   245  		// container, so %CONTAINER_SANDBOX_MOUNT_POINT% will place you at the root of the container's filesystem.
   246  		workDir, changed = c.replaceWithMountPoint(conf.WorkingDirectory)
   247  		// On non-bindflt machines we join the working directory requested with where the sandbox volume is located. It's expected that the
   248  		// default behavior would be to treat all paths as relative to the volume.
   249  		//
   250  		// For example:
   251  		// A working directory of C:\ would become C:\hpc\12345678\
   252  		// A working directory of C:\work\dir would become C:\hpc\12345678\work\dir
   253  		//
   254  		// If the working directory was changed, that means the user supplied %CONTAINER_SANDBOX_MOUNT_POINT%\\my\dir or something similar.
   255  		// In that case there's nothing left to do, as we don't want to join it with the mount point again. If it *wasn't* changed, and there's
   256  		// no bindflt support, then we need to join it with the mount point, as it's some normal path.
   257  		if !changed && !fileBindingSupport {
   258  			workDir = filepath.Join(c.rootfsLocation, removeDriveLetter(workDir))
   259  		}
   260  	}
   261  
   262  	// Make sure the working directory exists.
   263  	if _, err := os.Stat(workDir); os.IsNotExist(err) {
   264  		if err := os.MkdirAll(workDir, 0700); err != nil {
   265  			return nil, err
   266  		}
   267  	}
   268  
   269  	// If we haven't grabbed a token yet, this is the init process being launched. Skip grabbing another token afterwards if we've already
   270  	// done the work (c.token != 0); this would typically be for an exec being launched.
   271  	if c.token == 0 {
   272  		if inheritUserTokenIsSet(c.spec.Annotations) {
   273  			c.token, err = openCurrentProcessToken()
   274  			if err != nil {
   275  				return nil, err
   276  			}
   277  		} else {
   278  			c.token, err = c.processToken(ctx, conf.User)
   279  			if err != nil {
   280  				return nil, fmt.Errorf("failed to create user process token: %w", err)
   281  			}
   282  		}
   283  	}
   284  
   285  	env, err := defaultEnvBlock(c.token)
   286  	if err != nil {
   287  		return nil, errors.Wrap(err, "failed to get default environment block")
   288  	}
   289  
   290  	// Convert environment map to a slice of environment variables in the form [Key1=val1, key2=val2]
   291  	var envs []string
   292  	for k, v := range conf.Environment {
   293  		expanded, _ := c.replaceWithMountPoint(v)
   294  		envs = append(envs, k+"="+expanded)
   295  	}
   296  	env = append(env, envs...)
   297  	env = append(env, sandboxMountPointEnvVar+"="+c.rootfsLocation)
   298  
   299  	var path string
   300  	for idx, envVar := range env {
   301  		ev := strings.TrimSpace(envVar)
   302  		if strings.HasPrefix(strings.ToLower(ev), "path=") {
   303  			// Add the rootfs location to PATH so you can run things from the root of the image.
   304  			rootfsLoc := c.rootfsLocation
   305  			if rune(ev[len(ev)-1]) != ';' {
   306  				rootfsLoc = ";" + rootfsLoc
   307  			}
   308  			// Additionally add in the default locations of PowerShell and the WMI cache. PowerShell is a
   309  			// very common choice for this container type, and folks re-using a Windows Server image with
   310  			// the path unknowingly adjusted in the image might run into some headaches here.
   311  			extraPaths := `;C:\WINDOWS\System32\WindowsPowerShell\v1.0\;C:\WINDOWS\System32\Wbem`
   312  			path = ev + rootfsLoc + extraPaths
   313  			env[idx] = path
   314  		}
   315  	}
   316  
   317  	// Replace any occurrences of the sandbox mount env variable in the commandline.
   318  	// For example: %CONTAINER_SANDBOX_MOUNT_POINT%\mybinary.exe -> C:\<rootfslocation>\mybinary.exe.
   319  	commandLine, _ := c.replaceWithMountPoint(conf.CommandLine)
   320  
   321  	// This is to workaround a rather unfortunate outcome with launching a process in a silo that
   322  	// has bound files.
   323  	//
   324  	// If a user requested to launch a program at C:\<rootfslocation>\mybinary.exe because they
   325  	// expect C:\<rootfslocation>\mybinary.exe to exist once the file bindings are done, this
   326  	// won't work. This is because the executable is searched for using the parent process's filesystem view
   327  	// and not the container's/silo's, which has access to these bound-in files. Our containerd shim is not
   328  	// running in the container's silo, and by virtue of this we won't be able to find the process being asked
   329  	// for as C:\<rootfslocation> is not viewable to processes outside of the silo. Deep down in the depths
   330  	// of CreateProcessW the culprit is a NtQueryAttributesFile call on the binary we're asking to run that
   331  	// fails as it doesn't have any context surrounding paths available to our silo.
   332  	//
   333  	// A way to get around this is to launch a process that will always exist (cmd) and is in our
   334  	// path, and then just invoke the program with the cmdline supplied. This works as the process
   335  	// (cmd in this case) after launch can now see C:\<rootfslocation> as it's in the silo. We could
   336  	// also add a new mode/flag for the shim where it's just a dummy process launcher, so we can invoke
   337  	// the shim instead of cmd and have more control over things.
   338  	if fileBindingSupport {
   339  		commandLine = "cmd /c " + commandLine
   340  	}
   341  
   342  	// Reassign commandline here in case it needed to be quoted. For example if "foo bar baz" was supplied, and
   343  	// "foo bar.exe" exists, then return: "\"foo bar\" baz"
   344  	absPath, commandLine, err := getApplicationName(commandLine, workDir, strings.Trim(path, "PATH="))
   345  	if err != nil {
   346  		return nil, errors.Wrapf(err, "failed to get application name from commandline %q", conf.CommandLine)
   347  	}
   348  
   349  	// exec.Cmd internally does its own path resolution and, as part of this, checks some well-known file extensions on the file given (e.g. if
   350  	// the user just provided /path/to/mybinary). CreateProcess is perfectly capable of launching an executable that doesn't have the .exe extension,
   351  	// so this adds an empty entry to the end of the extensions Go checks against so that a binary with no extension can be launched.
   352  	// The extensions are checked in order, so that if mybinary.exe and mybinary both existed in the same directory, mybinary.exe would be chosen.
   353  	// This is mostly to handle a common Kubernetes test image named agnhost that has the main entrypoint as a binary named agnhost with no extension.
   354  	// https://github.com/kubernetes/kubernetes/blob/d64e91878517b1208a0bce7e2b7944645ace8ede/test/images/agnhost/Dockerfile_windows
   355  	if err := os.Setenv("PATHEXT", ".COM;.EXE;.BAT;.CMD; "); err != nil {
   356  		return nil, errors.Wrap(err, "failed to set PATHEXT")
   357  	}
   358  
   359  	var cpty *conpty.Pty
   360  	if conf.EmulateConsole {
   361  		height := int16(25)
   362  		width := int16(80)
   363  		// ConsoleSize, if supplied, is a two-element slice. The first element is expected to
   364  		// be the height, the second the width.
   365  		if len(conf.ConsoleSize) == 2 {
   366  			if conf.ConsoleSize[0] != 0 {
   367  				height = int16(conf.ConsoleSize[0])
   368  			}
   369  			if conf.ConsoleSize[1] != 0 {
   370  				width = int16(conf.ConsoleSize[1])
   371  			}
   372  		}
   373  
   374  		cpty, err = conpty.Create(width, height, 0)
   375  		if err != nil {
   376  			return nil, err
   377  		}
   378  	}
   379  
   380  	cmd, err := exec.New(
   381  		absPath,
   382  		commandLine,
   383  		exec.WithDir(workDir),
   384  		exec.WithEnv(env),
   385  		exec.WithToken(c.token),
   386  		exec.WithJobObject(c.job),
   387  		exec.WithConPty(cpty),
   388  		exec.WithProcessFlags(windows.CREATE_BREAKAWAY_FROM_JOB),
   389  		exec.WithStdio(conf.CreateStdOutPipe, conf.CreateStdErrPipe, conf.CreateStdInPipe),
   390  	)
   391  	if err != nil {
   392  		return nil, err
   393  	}
   394  	process := newProcess(cmd, cpty)
   395  
   396  	// Create process pipes if asked for.
   397  	if conf.CreateStdInPipe {
   398  		process.stdin = process.cmd.Stdin()
   399  	}
   400  
   401  	if conf.CreateStdOutPipe {
   402  		process.stdout = process.cmd.Stdout()
   403  	}
   404  
   405  	if conf.CreateStdErrPipe {
   406  		process.stderr = process.cmd.Stderr()
   407  	}
   408  
   409  	defer func() {
   410  		if err != nil {
   411  			process.Close()
   412  		}
   413  	}()
   414  
   415  	if err = process.Start(); err != nil {
   416  		return nil, errors.Wrap(err, "failed to start host process")
   417  	}
   418  
   419  	// Assign the first process made as the init process of the container.
   420  	c.init.initDoOnce.Do(func() {
   421  		c.init.proc = process
   422  		close(c.init.initBlock)
   423  	})
   424  
   425  	// Wait for process exit
   426  	go c.pollJobMsgs(ctx)
   427  	go process.waitBackground(ctx)
   428  	return process, nil
   429  }
   430  
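        // Modify is not supported for job containers and always returns an error.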
   431  func (c *JobContainer) Modify(ctx context.Context, config interface{}) (err error) {
   432  	return errors.New("modify not supported for job containers")
   433  }
   434  
   435  // Start starts the container. There's nothing to "start" for job containers, so this just
   436  // sets the start timestamp.
   437  func (c *JobContainer) Start(ctx context.Context) error {
   438  	c.startTimestamp = time.Now()
   439  	return nil
   440  }
   441  
   442  // Close frees up any resources (handles, temporary accounts).
   443  func (c *JobContainer) Close() error {
   444  	// Do not return the first error so we can finish cleaning up.
   445  
   446  	var closeErr bool
   447  	if err := c.job.Close(); err != nil {
   448  		log.G(context.Background()).WithError(err).WithField("cid", c.id).Warning("failed to close job object")
   449  		closeErr = true
   450  	}
   451  
   452  	if err := c.token.Close(); err != nil {
   453  		log.G(context.Background()).WithError(err).WithField("cid", c.id).Warning("failed to close token")
   454  		closeErr = true
   455  	}
   456  
   457  	// Delete the container's local account if one was created.
   458  	if c.localUserAccount != "" {
   459  		if err := winapi.NetUserDel("", c.localUserAccount); err != nil {
   460  			log.G(context.Background()).WithError(err).WithField("cid", c.id).Warning("failed to delete local account")
   461  			closeErr = true
   462  		}
   463  	}
   464  
   465  	c.closedWaitOnce.Do(func() {
   466  		c.waitError = hcs.ErrAlreadyClosed
   467  		close(c.waitBlock)
   468  	})
   469  	if closeErr {
   470  		return errors.New("failed to close one or more job container resources")
   471  	}
   472  	return nil
   473  }
   474  
   475  // ID returns the ID of the container. This is the name used to create the job object.
   476  func (c *JobContainer) ID() string {
   477  	return c.id
   478  }
   479  
   480  // Shutdown gracefully shuts down the container.
   481  func (c *JobContainer) Shutdown(ctx context.Context) error {
   482  	log.G(ctx).WithField("id", c.id).Debug("shutting down job container")
   483  
   484  	ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   485  	defer cancel()
   486  	return c.shutdown(ctx)
   487  }
   488  
   489  // shutdown will loop through all the pids in the container and send each process a signal to exit.
   490  // If there are no processes in the container it will return nil early.
   491  // If the "all processes exited" message is not received within the context timeout set, it will
   492  // terminate the job.
   493  func (c *JobContainer) shutdown(ctx context.Context) error {
   494  	pids, err := c.job.Pids()
   495  	if err != nil {
   496  		return errors.Wrap(err, "failed to get pids in container")
   497  	}
   498  
   499  	if len(pids) == 0 {
   500  		return nil
   501  	}
   502  
   503  	for _, pid := range pids {
   504  		// If any process can't be signaled just wait until the timeout hits
   505  		if err := signalProcess(pid, windows.CTRL_SHUTDOWN_EVENT); err != nil {
   506  			log.G(ctx).WithField("pid", pid).Error("failed to signal process in job container")
   507  		}
   508  	}
   509  
   510  	select {
   511  	case <-c.exited:
   512  	case <-ctx.Done():
   513  		return c.Terminate(ctx)
   514  	}
   515  	return nil
   516  }
   517  
   518  // PropertiesV2 returns properties relating to the job container. This is an HCS construct but
   519  // to adhere to the interface for containers on Windows it is partially implemented. The only
   520  // supported property is schema2.PTStatistics.
   521  func (c *JobContainer) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) {
   522  	if len(types) == 0 {
   523  		return nil, errors.New("no property types supplied for PropertiesV2 call")
   524  	}
   525  	if types[0] != hcsschema.PTStatistics {
   526  		return nil, errors.New("PTStatistics is the only supported property type for job containers")
   527  	}
   528  
   529  	// Take the timestamp before we grab the stats, to match HCS' behavior.
   530  	timestamp := time.Now()
   531  
   532  	memInfo, err := c.job.QueryMemoryStats()
   533  	if err != nil {
   534  		return nil, errors.Wrap(err, "failed to query for job containers memory information")
   535  	}
   536  
   537  	processorInfo, err := c.job.QueryProcessorStats()
   538  	if err != nil {
   539  		return nil, errors.Wrap(err, "failed to query for job containers processor information")
   540  	}
   541  
   542  	storageInfo, err := c.job.QueryStorageStats()
   543  	if err != nil {
   544  		return nil, errors.Wrap(err, "failed to query for job containers storage information")
   545  	}
   546  
   547  	privateWorkingSet, err := c.job.QueryPrivateWorkingSet()
   548  	if err != nil {
   549  		return nil, fmt.Errorf("failed to get private working set for container: %w", err)
   550  	}
   551  
   552  	return &hcsschema.Properties{
   553  		Statistics: &hcsschema.Statistics{
   554  			Timestamp:          timestamp,
   555  			Uptime100ns:        uint64(time.Since(c.startTimestamp).Nanoseconds()) / 100,
   556  			ContainerStartTime: c.startTimestamp,
   557  			Memory: &hcsschema.MemoryStats{
   558  				MemoryUsageCommitBytes:            memInfo.JobMemory,
   559  				MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
   560  				MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
   561  			},
   562  			Processor: &hcsschema.ProcessorStats{
   563  				RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
   564  				RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
   565  				TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
   566  			},
   567  			Storage: &hcsschema.StorageStats{
   568  				ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
   569  				ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
   570  				WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
   571  				WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
   572  			},
   573  		},
   574  	}, nil
   575  }
   576  
   577  // Properties returns properties relating to the job container. This is an HCS construct but
   578  // to adhere to the interface for containers on Windows it is partially implemented. The only
   579  // supported property is schema1.PropertyTypeProcessList.
   580  func (c *JobContainer) Properties(ctx context.Context, types ...schema1.PropertyType) (*schema1.ContainerProperties, error) {
   581  	if len(types) == 0 {
   582  		return nil, errors.New("no property types supplied for Properties call")
   583  	}
   584  	if types[0] != schema1.PropertyTypeProcessList {
   585  		return nil, errors.New("ProcessList is the only supported property type for job containers")
   586  	}
   587  
   588  	var processList []schema1.ProcessListItem
   589  	err := forEachProcessInfo(c.job, func(procInfo *winapi.SYSTEM_PROCESS_INFORMATION) {
   590  		proc := schema1.ProcessListItem{
   591  			CreateTimestamp:              time.Unix(0, procInfo.CreateTime),
   592  			ProcessId:                    uint32(procInfo.UniqueProcessID),
   593  			ImageName:                    procInfo.ImageName.String(),
   594  			UserTime100ns:                uint64(procInfo.UserTime),
   595  			KernelTime100ns:              uint64(procInfo.KernelTime),
   596  			MemoryCommitBytes:            uint64(procInfo.PrivatePageCount),
   597  			MemoryWorkingSetPrivateBytes: uint64(procInfo.WorkingSetPrivateSize),
   598  			MemoryWorkingSetSharedBytes:  uint64(procInfo.WorkingSetSize) - uint64(procInfo.WorkingSetPrivateSize),
   599  		}
   600  		processList = append(processList, proc)
   601  	})
   602  	if err != nil {
   603  		return nil, errors.Wrap(err, "failed to get process information")
   604  	}
   605  
   606  	return &schema1.ContainerProperties{ProcessList: processList}, nil
   607  }
   608  
   609  // Terminate terminates the job object (kills every process in the job).
   610  func (c *JobContainer) Terminate(ctx context.Context) error {
   611  	log.G(ctx).WithField("id", c.id).Debug("terminating job container")
   612  
   613  	if err := c.job.Terminate(1); err != nil {
   614  		return errors.Wrap(err, "failed to terminate job container")
   615  	}
   616  	return nil
   617  }
   618  
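        // WaitChannel returns the channel that is closed once the container has exited or been closed.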
   619  func (c *JobContainer) WaitChannel() <-chan struct{} {
   620  	return c.waitBlock
   621  }
   622  
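        // WaitError returns the error (if any) recorded for the container's exit. It is only valid
        // once the channel returned by WaitChannel has been closed.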
   623  func (c *JobContainer) WaitError() error {
   624  	return c.waitError
   625  }
   626  
   627  // Wait synchronously waits for the container to shut down or terminate. If
   628  // the container has already exited, it returns the previous error (if any).
   629  func (c *JobContainer) Wait() error {
   630  	<-c.WaitChannel()
   631  	return c.WaitError()
   632  }
   633  
   634  func (c *JobContainer) waitBackground(ctx context.Context) {
   635  	// Wait for there to be an init process assigned.
   636  	<-c.init.initBlock
   637  
   638  	// Once the init process finishes, if there are any other processes in the container we need to signal
   639  	// them to exit.
   640  	<-c.init.proc.waitBlock
   641  
   642  	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
   643  	defer cancel()
   644  	if err := c.Shutdown(ctx); err != nil {
   645  		_ = c.Terminate(ctx)
   646  	}
   647  
   648  	c.closedWaitOnce.Do(func() {
   649  		c.waitError = c.init.proc.waitError
   650  		close(c.waitBlock)
   651  	})
   652  }
   653  
   654  // Polls for notifications from the job object's assigned IO completion port.
   655  func (c *JobContainer) pollJobMsgs(ctx context.Context) {
   656  	for {
   657  		notif, err := c.job.PollNotification()
   658  		if err != nil {
   659  			// The queue is closed or we somehow aren't registered to receive notifications. There won't be
   660  			// any notifications arriving, so we're safe to return.
   661  			if err == queue.ErrQueueClosed || err == jobobject.ErrNotRegistered {
   662  				return
   663  			}
   664  			log.G(ctx).WithError(err).Warn("error while polling for job container notification")
   665  		}
   666  
   667  		switch msg := notif.(type) {
   668  		// All processes have exited. Close the waitblock so we can clean up and then return.
   669  		case jobobject.MsgAllProcessesExited:
   670  			close(c.exited)
   671  			return
   672  		case jobobject.MsgUnimplemented:
   673  		default:
   674  			log.G(ctx).WithField("message", msg).Warn("unknown job object notification encountered")
   675  		}
   676  	}
   677  }
   678  
   679  // IsOCI returns false, just to satisfy the cow.ProcessHost interface; this follows the WCOW behavior.
   680  func (c *JobContainer) IsOCI() bool {
   681  	return false
   682  }
   683  
   684  // OS returns the operating system name as a string. This should always be windows.
   685  func (c *JobContainer) OS() string {
   686  	return "windows"
   687  }
   688  
   689  // For every process in the job `job`, run the function `work`. This can be used to grab/filter the SYSTEM_PROCESS_INFORMATION
   690  // data from every process in a job.
   691  func forEachProcessInfo(job *jobobject.JobObject, work func(*winapi.SYSTEM_PROCESS_INFORMATION)) error {
   692  	procInfos, err := systemProcessInformation()
   693  	if err != nil {
   694  		return err
   695  	}
   696  
   697  	pids, err := job.Pids()
   698  	if err != nil {
   699  		return err
   700  	}
   701  
   702  	pidsMap := make(map[uint32]struct{})
   703  	for _, pid := range pids {
   704  		pidsMap[pid] = struct{}{}
   705  	}
   706  
   707  	for _, procInfo := range procInfos {
   708  		if _, ok := pidsMap[uint32(procInfo.UniqueProcessID)]; ok {
   709  			work(procInfo)
   710  		}
   711  	}
   712  	return nil
   713  }
   714  
   715  // Get a slice of SYSTEM_PROCESS_INFORMATION for all of the processes running on the system.
   716  func systemProcessInformation() ([]*winapi.SYSTEM_PROCESS_INFORMATION, error) {
   717  	var (
   718  		systemProcInfo *winapi.SYSTEM_PROCESS_INFORMATION
   719  		procInfos      []*winapi.SYSTEM_PROCESS_INFORMATION
   720  		// This happens to be the buffer size HCS uses, but there's really no hard need to keep it
   721  		// the same; it's just a sane default.
   722  		size   = uint32(1024 * 512)
   723  		bounds uintptr
   724  	)
   725  	for {
   726  		b := make([]byte, size)
   727  		systemProcInfo = (*winapi.SYSTEM_PROCESS_INFORMATION)(unsafe.Pointer(&b[0]))
   728  		status := winapi.NtQuerySystemInformation(
   729  			winapi.SystemProcessInformation,
   730  			unsafe.Pointer(systemProcInfo),
   731  			size,
   732  			&size,
   733  		)
   734  		if winapi.NTSuccess(status) {
   735  			// Cache the address of the end of our buffer so we can check we don't go past this
   736  			// in some odd case.
   737  			bounds = uintptr(unsafe.Pointer(&b[len(b)-1]))
   738  			break
   739  		} else if status != winapi.STATUS_INFO_LENGTH_MISMATCH {
   740  			return nil, winapi.RtlNtStatusToDosError(status)
   741  		}
   742  	}
   743  
   744  	for {
   745  		if uintptr(unsafe.Pointer(systemProcInfo))+uintptr(systemProcInfo.NextEntryOffset) >= bounds {
   746  			// The next entry is outside of the bounds of our buffer somehow, abort.
   747  			return nil, errors.New("system process info entry exceeds allocated buffer")
   748  		}
   749  		procInfos = append(procInfos, systemProcInfo)
   750  		if systemProcInfo.NextEntryOffset == 0 {
   751  			break
   752  		}
   753  		systemProcInfo = (*winapi.SYSTEM_PROCESS_INFORMATION)(unsafe.Pointer(uintptr(unsafe.Pointer(systemProcInfo)) + uintptr(systemProcInfo.NextEntryOffset)))
   754  	}
   755  
   756  	return procInfos, nil
   757  }
   758  
   759  // Takes a string and replaces any occurrences of CONTAINER_SANDBOX_MOUNT_POINT with where the container's volume is mounted, as well as returning
   760  // whether the string actually contained the environment variable.
   761  func (c *JobContainer) replaceWithMountPoint(str string) (string, bool) {
   762  	mountPoint := c.rootfsLocation
   763  	newStr := strings.ReplaceAll(str, "%"+sandboxMountPointEnvVar+"%", mountPoint[:len(mountPoint)-1])
   764  	newStr = strings.ReplaceAll(newStr, "$env:"+sandboxMountPointEnvVar, mountPoint[:len(mountPoint)-1])
   765  	return newStr, str != newStr
   766  }
   767  
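        // bindSetup handles container setup on hosts with file binding (bindflt) support. It upgrades
        // the job object to a silo, mounts the container's layers, and binds the rootfs and any
        // requested mounts into the silo.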
   768  func (c *JobContainer) bindSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) {
   769  	// Must be upgraded to a silo so we can get per-silo bindings for the container.
   770  	if err := c.job.PromoteToSilo(); err != nil {
   771  		return nil, err
   772  	}
   773  	// Union the container layers.
   774  	closer, err := c.mountLayers(ctx, c.id, s, "")
   775  	if err != nil {
   776  		return nil, fmt.Errorf("failed to mount container layers: %w", err)
   777  	}
   778  	defer func() {
   779  		if err != nil {
   780  			_ = closer.Release(ctx)
   781  		}
   782  	}()
   783  
   784  	rootfsLocation := defaultSiloRootfsLocation
   785  	if loc := customRootfsLocation(s.Annotations); loc != "" {
   786  		rootfsLocation = loc
   787  	}
   788  
   789  	if err := c.setupRootfsBinding(rootfsLocation, s.Root.Path); err != nil {
   790  		return nil, err
   791  	}
   792  	c.rootfsLocation = rootfsLocation
   793  	if err := c.setupMounts(ctx, s); err != nil {
   794  		return nil, err
   795  	}
   796  	return closer, nil
   797  }
   798  
   799  // This handles the fallback case where bind mounting isn't available on the machine. This mounts the
   800  // container layers on the host and sets up any mounts present in the OCI runtime spec.
   801  func (c *JobContainer) fallbackSetup(ctx context.Context, s *specs.Spec) (_ resources.ResourceCloser, err error) {
   802  	rootfsLocation := fmt.Sprintf(fallbackRootfsFormat, c.id)
   803  	if loc := customRootfsLocation(s.Annotations); loc != "" {
   804  		rootfsLocation = filepath.Join(loc, c.id)
   805  	}
   806  	closer, err := c.mountLayers(ctx, c.id, s, rootfsLocation)
   807  	if err != nil {
   808  		return nil, fmt.Errorf("failed to mount container layers: %w", err)
   809  	}
   810  	defer func() {
   811  		if err != nil {
   812  			_ = closer.Release(ctx)
   813  		}
   814  	}()
   815  	c.rootfsLocation = rootfsLocation
   816  	if err := fallbackMountSetup(s, c.rootfsLocation); err != nil {
   817  		return nil, err
   818  	}
   819  	return closer, nil
   820  }
   821  
