...

Source file src/github.com/Microsoft/hcsshim/internal/uvm/create_lcow.go

Documentation: github.com/Microsoft/hcsshim/internal/uvm

     1  //go:build windows
     2  
     3  package uvm
     4  
     5  import (
     6  	"context"
     7  	"encoding/base64"
     8  	"fmt"
     9  	"io"
    10  	"net"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  
    15  	"github.com/Microsoft/go-winio"
    16  	"github.com/Microsoft/go-winio/pkg/guid"
    17  	"github.com/Microsoft/hcsshim/internal/security"
    18  	"github.com/Microsoft/hcsshim/pkg/securitypolicy"
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  	"go.opencensus.io/trace"
    22  
    23  	"github.com/Microsoft/hcsshim/internal/gcs"
    24  	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
    25  	"github.com/Microsoft/hcsshim/internal/log"
    26  	"github.com/Microsoft/hcsshim/internal/logfields"
    27  	"github.com/Microsoft/hcsshim/internal/oc"
    28  	"github.com/Microsoft/hcsshim/internal/processorinfo"
    29  	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
    30  	"github.com/Microsoft/hcsshim/internal/schemaversion"
    31  	"github.com/Microsoft/hcsshim/osversion"
    32  )
    33  
    34  // General information about how this works at a high level.
    35  //
    36  // The purpose is to start an LCOW Utility VM or UVM using the Host Compute Service, an API to create and manipulate running virtual machines
    37  // HCS takes json descriptions of the work to be done.
    38  //
    39  // When a pod (there is a one to one mapping of pod to UVM) is to be created various annotations and defaults are combined into an options object which is
    40  // passed to CreateLCOW (see below) where the options are transformed into a json document to be presented to the HCS VM creation code.
    41  //
    42  // There are two paths in CreateLCOW to creating the json document. The most flexible case is makeLCOWDoc which is used where no specialist hardware security
    43  // applies, then there is makeLCOWSecurityDoc which is used in the case of AMD SEV-SNP memory encryption and integrity protection. There is quite
    44  // a lot of difference between the two paths, for example the regular path has options about the type of kernel and initrd binary whereas the AMD SEV-SNP
    45  // path has only one file but there are many other detail differences, so the code is split for clarity.
    46  //
    47  // makeLCOW*Doc returns an instance of hcsschema.ComputeSystem. That is then serialised to the json string provided to the flat C api. A similar scheme is used
    48  // for later adjustments, for example adding a network adapter.
    49  //
    50  // Examples of the eventual json are inline as comments by these two functions to show the eventual effect of the code.
    51  //
    52  // Note that the schema files, ie the Go objects that represent the json, are generated outside of the local build process.
    53  
// PreferredRootFSType selects which kind of root file system the UVM should
// boot from.
type PreferredRootFSType int

const (
	// PreferredRootFSTypeInitRd boots the root file system from an initrd
	// image.
	PreferredRootFSTypeInitRd PreferredRootFSType = iota
	// PreferredRootFSTypeVHD boots the root file system from a VHD, attached
	// over VPMem when VPMem devices are available and over SCSI otherwise.
	PreferredRootFSTypeVHD
	// PreferredRootFSTypeNA means no rootfs preference applies. It is the only
	// value accepted when booting from a VMGS file, because the rootfs is then
	// supplied inside that file.
	PreferredRootFSTypeNA

	// entropyVsockPort is the vsock port over which initial entropy is
	// injected during init launch.
	entropyVsockPort  = 1
	// linuxLogVsockPort is the vsock port used by vsockexec to forward
	// stdout/stderr logging.
	linuxLogVsockPort = 109
)
    64  
// OutputHandler is used to process the output from the program run in the UVM.
// It is called with an io.Reader from which that output can be read.
type OutputHandler func(io.Reader)
    67  
// Default file names for the files used to boot an LCOW utility VM.
const (
	// InitrdFile is the default file name for an initrd.img used to boot LCOW.
	InitrdFile = "initrd.img"
	// VhdFile is the default file name for a rootfs.vhd used to boot LCOW.
	VhdFile = "rootfs.vhd"
	// KernelFile is the default file name for a kernel used to boot LCOW.
	KernelFile = "kernel"
	// UncompressedKernelFile is the default file name for an uncompressed
	// kernel used to boot LCOW with KernelDirect.
	UncompressedKernelFile = "vmlinux"
	// GuestStateFile is the default file name for a vmgs (VM Guest State) file
	// which combines kernel and initrd and is used to boot from in the SNP case.
	GuestStateFile = "kernelinitrd.vmgs"
	// UVMReferenceInfoFile is the default file name for a COSE_Sign1
	// reference UVM info, which can be made available to workload containers
	// and can be used for validation purposes.
	UVMReferenceInfoFile = "reference_info.cose"
)
    86  
// ConfidentialOptions groups the options that only apply when the utility VM
// is booted from a VMGS guest state file and/or with a security policy (the
// SNP case). It is embedded in OptionsLCOW.
type ConfidentialOptions struct {
	GuestStateFile         string // The vmgs file to load; resolved relative to `BootFilesPath`
	UseGuestStateFile      bool   // Use a vmgs file that contains a kernel and initrd, required for SNP
	SecurityPolicy         string // Optional security policy
	SecurityPolicyEnabled  bool   // Set when there is a security policy to apply on actual SNP hardware, use this rather than checking the string length
	SecurityPolicyEnforcer string // Set which security policy enforcer to use (open door, standard or rego). This allows for better fallback mechanic.
	UVMReferenceInfoFile   string // Filename under `BootFilesPath` for (potentially signed) UVM image reference information.
	BundleDirectory        string // pod bundle directory; the working copy of the vmgs file is created here
}
    96  
// OptionsLCOW are the set of options passed to CreateLCOW() to create a utility vm.
//
// The defaults mentioned on the fields below are the values filled in by
// NewDefaultOptionsLCOW.
type OptionsLCOW struct {
	*Options
	*ConfidentialOptions

	BootFilesPath           string              // Folder in which kernel and root file system reside. Defaults to `LinuxBootFiles` next to the running executable or, if that does not exist, `%ProgramFiles%\Linux Containers`
	KernelFile              string              // Filename under `BootFilesPath` for the kernel. Defaults to `kernel`, or `vmlinux` when KernelDirect is supported and that file exists
	KernelDirect            bool                // Skip UEFI and boot directly to `kernel`
	RootFSFile              string              // Filename under `BootFilesPath` for the UVMs root file system. Defaults to `InitrdFile`, or `VhdFile` when that file exists under `BootFilesPath`
	KernelBootOptions       string              // Additional boot options for the kernel
	EnableGraphicsConsole   bool                // If true, enable a graphics console for the utility VM
	ConsolePipe             string              // The named pipe path to use for the serial console.  eg \\.\pipe\vmpipe
	UseGuestConnection      bool                // Whether the HCS should connect to the UVM's GCS. Defaults to true
	ExecCommandLine         string              // The command line to exec from init. Defaults to GCS
	ForwardStdout           bool                // Whether stdout will be forwarded from the executed program. Defaults to false
	ForwardStderr           bool                // Whether stderr will be forwarded from the executed program. Defaults to true
	OutputHandler           OutputHandler       `json:"-"` // Controls how output received over HVSocket from the UVM is handled. Defaults to parsing output as logrus messages
	VPMemDeviceCount        uint32              // Number of VPMem devices. Defaults to `DefaultVPMEMCount`. Limit at 128. If booting UVM from VHD, device 0 is taken.
	VPMemSizeBytes          uint64              // Size of the VPMem devices. Defaults to `DefaultVPMemSizeBytes`.
	VPMemNoMultiMapping     bool                // Disables LCOW layer multi mapping. Defaults to true on builds older than 19H1
	PreferredRootFSType     PreferredRootFSType // If `RootFSFile` is `InitrdFile` use `PreferredRootFSTypeInitRd`. If `RootFSFile` is `VhdFile` use `PreferredRootFSTypeVHD`
	EnableColdDiscardHint   bool                // Whether the HCS should use cold discard hints. Defaults to false
	VPCIEnabled             bool                // Whether the kernel should enable pci
	EnableScratchEncryption bool                // Whether the scratch should be encrypted
	DisableTimeSyncService  bool                // Disables the time synchronization service
}
   123  
   124  // defaultLCOWOSBootFilesPath returns the default path used to locate the LCOW
   125  // OS kernel and root FS files. This default is the subdirectory
   126  // `LinuxBootFiles` in the directory of the executable that started the current
   127  // process; or, if it does not exist, `%ProgramFiles%\Linux Containers`.
   128  func defaultLCOWOSBootFilesPath() string {
   129  	localDirPath := filepath.Join(filepath.Dir(os.Args[0]), "LinuxBootFiles")
   130  	if _, err := os.Stat(localDirPath); err == nil {
   131  		return localDirPath
   132  	}
   133  	return filepath.Join(os.Getenv("ProgramFiles"), "Linux Containers")
   134  }
   135  
   136  // NewDefaultOptionsLCOW creates the default options for a bootable version of
   137  // LCOW.
   138  //
   139  // `id` the ID of the compute system. If not passed will generate a new GUID.
   140  //
   141  // `owner` the owner of the compute system. If not passed will use the
   142  // executable files name.
   143  func NewDefaultOptionsLCOW(id, owner string) *OptionsLCOW {
   144  	// Use KernelDirect boot by default on all builds that support it.
   145  	kernelDirectSupported := osversion.Build() >= 18286
   146  	opts := &OptionsLCOW{
   147  		Options:                 newDefaultOptions(id, owner),
   148  		BootFilesPath:           defaultLCOWOSBootFilesPath(),
   149  		KernelFile:              KernelFile,
   150  		KernelDirect:            kernelDirectSupported,
   151  		RootFSFile:              InitrdFile,
   152  		KernelBootOptions:       "",
   153  		EnableGraphicsConsole:   false,
   154  		ConsolePipe:             "",
   155  		UseGuestConnection:      true,
   156  		ExecCommandLine:         fmt.Sprintf("/bin/gcs -v4 -log-format json -loglevel %s", logrus.StandardLogger().Level.String()),
   157  		ForwardStdout:           false,
   158  		ForwardStderr:           true,
   159  		OutputHandler:           parseLogrus(id),
   160  		VPMemDeviceCount:        DefaultVPMEMCount,
   161  		VPMemSizeBytes:          DefaultVPMemSizeBytes,
   162  		VPMemNoMultiMapping:     osversion.Get().Build < osversion.V19H1,
   163  		PreferredRootFSType:     PreferredRootFSTypeInitRd,
   164  		EnableColdDiscardHint:   false,
   165  		VPCIEnabled:             false,
   166  		EnableScratchEncryption: false,
   167  		DisableTimeSyncService:  false,
   168  		ConfidentialOptions: &ConfidentialOptions{
   169  			SecurityPolicyEnabled: false,
   170  			UVMReferenceInfoFile:  UVMReferenceInfoFile,
   171  		},
   172  	}
   173  
   174  	if _, err := os.Stat(filepath.Join(opts.BootFilesPath, VhdFile)); err == nil {
   175  		// We have a rootfs.vhd in the boot files path. Use it over an initrd.img
   176  		opts.RootFSFile = VhdFile
   177  		opts.PreferredRootFSType = PreferredRootFSTypeVHD
   178  	}
   179  
   180  	if kernelDirectSupported {
   181  		// KernelDirect supports uncompressed kernel if the kernel is present.
   182  		// Default to uncompressed if on box. NOTE: If `kernel` is already
   183  		// uncompressed and simply named 'kernel' it will still be used
   184  		// uncompressed automatically.
   185  		if _, err := os.Stat(filepath.Join(opts.BootFilesPath, UncompressedKernelFile)); err == nil {
   186  			opts.KernelFile = UncompressedKernelFile
   187  		}
   188  	}
   189  	return opts
   190  }
   191  
   192  // Get an acceptable number of processors given option and actual constraints.
   193  func fetchProcessor(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (*hcsschema.Processor2, error) {
   194  	processorTopology, err := processorinfo.HostProcessorInfo(ctx)
   195  	if err != nil {
   196  		return nil, fmt.Errorf("failed to get host processor information: %s", err)
   197  	}
   198  
   199  	// To maintain compatibility with Docker we need to automatically downgrade
   200  	// a user CPU count if the setting is not possible.
   201  	uvm.processorCount = uvm.normalizeProcessorCount(ctx, opts.ProcessorCount, processorTopology)
   202  
   203  	processor := &hcsschema.Processor2{
   204  		Count:  uvm.processorCount,
   205  		Limit:  opts.ProcessorLimit,
   206  		Weight: opts.ProcessorWeight,
   207  	}
   208  	// We can set a cpu group for the VM at creation time in recent builds.
   209  	if opts.CPUGroupID != "" {
   210  		if osversion.Build() < osversion.V21H1 {
   211  			return nil, errCPUGroupCreateNotSupported
   212  		}
   213  		processor.CpuGroup = &hcsschema.CpuGroup{Id: opts.CPUGroupID}
   214  	}
   215  	return processor, nil
   216  }
   217  
   218  /*
   219  Example JSON document produced once the hcsschema.ComputeSystem returned by makeLCOWSecurityDoc is serialised:
   220  {
   221      "Owner": "containerd-shim-runhcs-v1.exe",
   222      "SchemaVersion": {
   223          "Major": 2,
   224          "Minor": 5
   225      },
   226      "ShouldTerminateOnLastHandleClosed": true,
   227      "VirtualMachine": {
   228          "Chipset": {
   229              "Uefi": {
   230                  "ApplySecureBootTemplate": "Apply",
   231                  "SecureBootTemplateId": "1734c6e8-3154-4dda-ba5f-a874cc483422"
   232              }
   233          },
   234          "ComputeTopology": {
   235              "Memory": {
   236                  "SizeInMB": 1024
   237              },
   238              "Processor": {
   239                  "Count": 2
   240              }
   241          },
   242          "Devices": {
   243              "Scsi" : { "0" : {} },
   244              "HvSocket": {
   245                  "HvSocketConfig": {
   246                      "DefaultBindSecurityDescriptor":  "D:P(A;;FA;;;WD)",
   247                      "DefaultConnectSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)",
   248                      "ServiceTable" : {
   249                           "00000808-facb-11e6-bd58-64006a7986d3" :  {
   250                               "AllowWildcardBinds" : true,
   251                               "BindSecurityDescriptor":   "D:P(A;;FA;;;WD)",
   252                               "ConnectSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
   253                           },
   254                           "0000006d-facb-11e6-bd58-64006a7986d3" :  {
   255                               "AllowWildcardBinds" : true,
   256                               "BindSecurityDescriptor":   "D:P(A;;FA;;;WD)",
   257                               "ConnectSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
   258                           },
   259                           "00000001-facb-11e6-bd58-64006a7986d3" :  {
   260                               "AllowWildcardBinds" : true,
   261                               "BindSecurityDescriptor":   "D:P(A;;FA;;;WD)",
   262                               "ConnectSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
   263                           },
   264                           "40000000-facb-11e6-bd58-64006a7986d3" :  {
   265                               "AllowWildcardBinds" : true,
   266                               "BindSecurityDescriptor":  "D:P(A;;FA;;;WD)",
   267                               "ConnectSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
   268                           }
   269                       }
   270                  }
   271              },
   272              "Plan9": {}
   273          },
   274          "GuestState": {
   275              "GuestStateFilePath": "d:\\ken\\aug27\\gcsinitnew.vmgs",
   276              "GuestStateFileType": "FileMode",
   277  			"ForceTransientState": true
   278          },
   279          "SecuritySettings": {
   280              "Isolation": {
   281                  "IsolationType": "SecureNestedPaging",
   282                  "LaunchData": "kBifgKNijdHjxdSUshmavrNofo2B01LiIi1cr8R4ytI="
   283              }
   284          },
   285          "Version": {
   286              "Major": 254,
   287              "Minor": 0
   288          }
   289      }
   290  }
   291  */
   292  
   293  // A large part of difference between the SNP case and the usual kernel+option+initrd case is to do with booting
   294  // from a VMGS file. The VMGS part may be used other than with SNP so is split out here.
   295  
// makeLCOWVMGSDoc makes a hcsschema.ComputeSystem with the parts that target
// booting from a VMGS file.
//
// The template VMGS file `opts.GuestStateFile` under `opts.BootFilesPath` is
// copied into `opts.BundleDirectory`, the copy is granted VM group access, and
// the returned document's GuestState points at that copy. If the function
// fails after the copy has been created, the copy is removed again.
//
// As a side effect, fetchProcessor stores the normalized processor count in
// uvm.processorCount.
//
// An error is returned when the template file does not exist, when
// `opts.PreferredRootFSType` is anything other than PreferredRootFSTypeNA,
// when the processor settings cannot be determined, or when creating, copying
// or granting access to the VMGS copy fails.
func makeLCOWVMGSDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) {
	// Kernel and initrd are combined into a single vmgs file.
	vmgsTemplatePath := filepath.Join(opts.BootFilesPath, opts.GuestStateFile)
	if _, err := os.Stat(vmgsTemplatePath); os.IsNotExist(err) {
		return nil, fmt.Errorf("the GuestState vmgs file '%s' was not found", vmgsTemplatePath)
	}

	// The rootfs must be provided as an initrd within the VMGS file.
	// Raise an error if instructed to use a particular sort of rootfs.
	if opts.PreferredRootFSType != PreferredRootFSTypeNA {
		return nil, fmt.Errorf("cannot override rootfs when using VMGS file")
	}

	var processor *hcsschema.Processor2
	processor, err = fetchProcessor(ctx, opts, uvm)
	if err != nil {
		return nil, err
	}

	// Create the per-pod working copy of the VMGS file in the bundle directory.
	vmgsFile, err := os.Create(filepath.Join(opts.BundleDirectory, opts.GuestStateFile))
	if err != nil {
		return nil, fmt.Errorf("failed to create temporary VMGS file: %w", err)
	}
	// Always close the copy; additionally delete it when the function is
	// returning an error. `err` is the named result, so it reflects whatever
	// the returning statement set, including the shadowed errors below.
	defer func() {
		_ = vmgsFile.Close()
		if err != nil {
			if rmErr := os.RemoveAll(vmgsFile.Name()); rmErr != nil {
				log.G(ctx).WithError(rmErr).Error("failed to remove temporary VMGS file")
			}
		}
	}()

	templateFile, err := os.Open(vmgsTemplatePath)
	if err != nil {
		return nil, fmt.Errorf("failed to open template VMGS file for copy: %w", err)
	}
	defer templateFile.Close()

	if _, err := io.Copy(vmgsFile, templateFile); err != nil {
		return nil, fmt.Errorf("failed to copy template VMGS file: %w", err)
	}

	if err := security.GrantVmGroupAccessWithMask(vmgsFile.Name(), security.AccessMaskAll); err != nil {
		return nil, fmt.Errorf("failed to grant VM group access ALL: %w", err)
	}

	// Align the requested memory size.
	memorySizeInMB := uvm.normalizeMemorySize(ctx, opts.MemorySizeInMB)

	doc := &hcsschema.ComputeSystem{
		Owner:                             uvm.owner,
		SchemaVersion:                     schemaversion.SchemaV25(),
		ShouldTerminateOnLastHandleClosed: true,
		VirtualMachine: &hcsschema.VirtualMachine{
			StopOnReset: true,
			Chipset:     &hcsschema.Chipset{},
			ComputeTopology: &hcsschema.Topology{
				Memory: &hcsschema.Memory2{
					SizeInMB:              memorySizeInMB,
					AllowOvercommit:       opts.AllowOvercommit,
					EnableDeferredCommit:  opts.EnableDeferredCommit,
					EnableColdDiscardHint: opts.EnableColdDiscardHint,
					LowMMIOGapInMB:        opts.LowMMIOGapInMB,
					HighMMIOBaseInMB:      opts.HighMMIOBaseInMB,
					HighMMIOGapInMB:       opts.HighMMIOGapInMB,
				},
				Processor: processor,
			},
			Devices: &hcsschema.Devices{
				HvSocket: &hcsschema.HvSocket2{
					HvSocketConfig: &hcsschema.HvSocketSystemConfig{
						// Default security descriptors for vsock sockets, so
						// that we can create a GCS log socket.
						// NOTE(review): unlike makeLCOWDoc, which binds with
						// "D:P(A;;FA;;;SY)(A;;FA;;;BA)", the bind descriptor
						// here names WD (presumably Everyone per the SDDL SID
						// strings - confirm), not just administrators and
						// SYSTEM.
						DefaultBindSecurityDescriptor:    "D:P(A;;FA;;;WD)", // Differs for SNP
						DefaultConnectSecurityDescriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)",
						ServiceTable:                     make(map[string]hcsschema.HvSocketServiceConfig),
					},
				},
				Plan9: &hcsschema.Plan9{},
			},
		},
	}

	// Set permissions for the VSock ports:
	//		entropyVsockPort - 1 is the entropy port,
	//		linuxLogVsockPort - 109 used by vsockexec to log stdout/stderr logging,
	//		gcs.LinuxGcsVsockPort is registered as well,
	//		0x40000000 + 1 (LinuxGcsVsockPort + 1) is the bridge (see guestconnection.go)
	hvSockets := [...]uint32{entropyVsockPort, linuxLogVsockPort, gcs.LinuxGcsVsockPort, gcs.LinuxGcsVsockPort + 1}
	for _, whichSocket := range hvSockets {
		key := fmt.Sprintf("%08x-facb-11e6-bd58-64006a7986d3", whichSocket) // format of a linux hvsock GUID is port#-facb-11e6-bd58-64006a7986d3
		doc.VirtualMachine.Devices.HvSocket.HvSocketConfig.ServiceTable[key] = hcsschema.HvSocketServiceConfig{
			AllowWildcardBinds:        true,
			BindSecurityDescriptor:    "D:P(A;;FA;;;WD)",
			ConnectSecurityDescriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)",
		}
	}

	// Handle StorageQoS if set
	if opts.StorageQoSBandwidthMaximum > 0 || opts.StorageQoSIopsMaximum > 0 {
		doc.VirtualMachine.StorageQoS = &hcsschema.StorageQoS{
			IopsMaximum:      opts.StorageQoSIopsMaximum,
			BandwidthMaximum: opts.StorageQoSBandwidthMaximum,
		}
	}

	// Add one SCSI controller entry, with an empty attachment map, per
	// configured controller.
	if uvm.scsiControllerCount > 0 {
		doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{}
		for i := 0; i < int(uvm.scsiControllerCount); i++ {
			doc.VirtualMachine.Devices.Scsi[guestrequest.ScsiControllerGuids[i]] = hcsschema.Scsi{
				Attachments: make(map[string]hcsschema.Attachment),
			}
		}
	}

	// Required by HCS for the isolated boot scheme, see also https://docs.microsoft.com/en-us/windows-server/virtualization/hyper-v/learn-more/generation-2-virtual-machine-security-settings-for-hyper-v
	// A complete explanation of the why's and wherefores of starting an encrypted, isolated VM are beyond the scope of these comments.
	doc.VirtualMachine.Chipset.Uefi = &hcsschema.Uefi{
		ApplySecureBootTemplate: "Apply",
		SecureBootTemplateId:    "1734c6e8-3154-4dda-ba5f-a874cc483422", // aka MicrosoftWindowsSecureBootTemplateGUID equivalent to "Microsoft Windows" template from Get-VMHost | select SecureBootTemplates,

	}

	// Point at the file that contains the linux kernel and initrd images.
	doc.VirtualMachine.GuestState = &hcsschema.GuestState{
		GuestStateFilePath:  vmgsFile.Name(),
		GuestStateFileType:  "FileMode",
		ForceTransientState: true, // tell HCS that this is just the source of the images, not ongoing state
	}

	return doc, nil
}
   428  
   429  // Programatically make the hcsschema.ComputeSystem document for the SNP case.
   430  // This is done prior to json seriaisation and sending to the HCS layer to actually do the work of creating the VM.
   431  // Many details are quite different (see the typical JSON examples), in particular it boots from a VMGS file
   432  // which contains both the kernel and initrd as well as kernel boot options.
   433  func makeLCOWSecurityDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) {
   434  	doc, vmgsErr := makeLCOWVMGSDoc(ctx, opts, uvm)
   435  	if vmgsErr != nil {
   436  		return nil, vmgsErr
   437  	}
   438  
   439  	// Part of the protocol to ensure that the rules in the user's Security Policy are
   440  	// respected is to provide a hash of the policy to the hardware. This is immutable
   441  	// and can be used to check that the policy used by opengcs is the required one as
   442  	// a condition of releasing secrets to the container.
   443  
   444  	policyDigest, err := securitypolicy.NewSecurityPolicyDigest(opts.SecurityPolicy)
   445  	if err != nil {
   446  		return nil, err
   447  	}
   448  	// HCS API expect a base64 encoded string as LaunchData. Internally it
   449  	// decodes it to bytes. SEV later returns the decoded byte blob as HostData
   450  	// field of the report.
   451  	hostData := base64.StdEncoding.EncodeToString(policyDigest)
   452  
   453  	// Put the measurement into the LaunchData field of the HCS creation command.
   454  	// This will end-up in HOST_DATA of SNP_LAUNCH_FINISH command the and ATTESTATION_REPORT
   455  	// retrieved by the guest later.
   456  	doc.VirtualMachine.SecuritySettings = &hcsschema.SecuritySettings{
   457  		EnableTpm: false,
   458  		Isolation: &hcsschema.IsolationSettings{
   459  			IsolationType: "SecureNestedPaging",
   460  			LaunchData:    hostData,
   461  			// HclEnabled:    true, /* Not available in schema 2.5 - REQUIRED when using BlockStorage in 2.6 */
   462  		},
   463  	}
   464  
   465  	return doc, nil
   466  }
   467  
   468  /*
   469  Example JSON document produced once the hcsschema.ComputeSystem returned by makeLCOWDoc is serialised. Note that the boot scheme is entirely different.
   470  {
   471      "Owner": "containerd-shim-runhcs-v1.exe",
   472      "SchemaVersion": {
   473          "Major": 2,
   474          "Minor": 1
   475      },
   476      "VirtualMachine": {
   477          "StopOnReset": true,
   478          "Chipset": {
   479              "LinuxKernelDirect": {
   480                  "KernelFilePath": "C:\\ContainerPlat\\LinuxBootFiles\\vmlinux",
   481                  "InitRdPath": "C:\\ContainerPlat\\LinuxBootFiles\\initrd.img",
   482                  "KernelCmdLine": " 8250_core.nr_uarts=0 panic=-1 quiet pci=off nr_cpus=2 brd.rd_nr=0 pmtmr=0 -- -e 1 /bin/vsockexec -e 109 /bin/gcs -v4 -log-format json -loglevel debug"
   483              }
   484          },
   485          "ComputeTopology": {
   486              "Memory": {
   487                  "SizeInMB": 1024,
   488                  "AllowOvercommit": true
   489              },
   490              "Processor": {
   491                  "Count": 2
   492              }
   493          },
   494          "Devices": {
   495              "Scsi": {
   496                  "0": {}
   497              },
   498              "HvSocket": {
   499                  "HvSocketConfig": {
   500                      "DefaultBindSecurityDescriptor": "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
   501                  }
   502              },
   503              "Plan9": {}
   504          }
   505      },
   506      "ShouldTerminateOnLastHandleClosed": true
   507  }
   508  */
   509  
   510  // Make the ComputeSystem document object that will be serialised to json to be presented to the HCS api.
   511  func makeLCOWDoc(ctx context.Context, opts *OptionsLCOW, uvm *UtilityVM) (_ *hcsschema.ComputeSystem, err error) {
   512  	logrus.Tracef("makeLCOWDoc %v\n", opts)
   513  
   514  	kernelFullPath := filepath.Join(opts.BootFilesPath, opts.KernelFile)
   515  	if _, err := os.Stat(kernelFullPath); os.IsNotExist(err) {
   516  		return nil, fmt.Errorf("kernel: '%s' not found", kernelFullPath)
   517  	}
   518  	rootfsFullPath := filepath.Join(opts.BootFilesPath, opts.RootFSFile)
   519  	if _, err := os.Stat(rootfsFullPath); os.IsNotExist(err) {
   520  		return nil, fmt.Errorf("boot file: '%s' not found", rootfsFullPath)
   521  	}
   522  
   523  	var processor *hcsschema.Processor2
   524  	processor, err = fetchProcessor(ctx, opts, uvm) // must happen after the file existence tests above.
   525  	if err != nil {
   526  		return nil, err
   527  	}
   528  
   529  	// Align the requested memory size.
   530  	memorySizeInMB := uvm.normalizeMemorySize(ctx, opts.MemorySizeInMB)
   531  
   532  	doc := &hcsschema.ComputeSystem{
   533  		Owner:                             uvm.owner,
   534  		SchemaVersion:                     schemaversion.SchemaV21(),
   535  		ShouldTerminateOnLastHandleClosed: true,
   536  		VirtualMachine: &hcsschema.VirtualMachine{
   537  			StopOnReset: true,
   538  			Chipset:     &hcsschema.Chipset{},
   539  			ComputeTopology: &hcsschema.Topology{
   540  				Memory: &hcsschema.Memory2{
   541  					SizeInMB:              memorySizeInMB,
   542  					AllowOvercommit:       opts.AllowOvercommit,
   543  					EnableDeferredCommit:  opts.EnableDeferredCommit,
   544  					EnableColdDiscardHint: opts.EnableColdDiscardHint,
   545  					LowMMIOGapInMB:        opts.LowMMIOGapInMB,
   546  					HighMMIOBaseInMB:      opts.HighMMIOBaseInMB,
   547  					HighMMIOGapInMB:       opts.HighMMIOGapInMB,
   548  				},
   549  				Processor: processor,
   550  			},
   551  			Devices: &hcsschema.Devices{
   552  				HvSocket: &hcsschema.HvSocket2{
   553  					HvSocketConfig: &hcsschema.HvSocketSystemConfig{
   554  						// Allow administrators and SYSTEM to bind to vsock sockets
   555  						// so that we can create a GCS log socket.
   556  						DefaultBindSecurityDescriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)",
   557  					},
   558  				},
   559  				Plan9: &hcsschema.Plan9{},
   560  			},
   561  		},
   562  	}
   563  
   564  	// Handle StorageQoS if set
   565  	if opts.StorageQoSBandwidthMaximum > 0 || opts.StorageQoSIopsMaximum > 0 {
   566  		doc.VirtualMachine.StorageQoS = &hcsschema.StorageQoS{
   567  			IopsMaximum:      opts.StorageQoSIopsMaximum,
   568  			BandwidthMaximum: opts.StorageQoSBandwidthMaximum,
   569  		}
   570  	}
   571  
   572  	if uvm.scsiControllerCount > 0 {
   573  		doc.VirtualMachine.Devices.Scsi = map[string]hcsschema.Scsi{}
   574  		for i := 0; i < int(uvm.scsiControllerCount); i++ {
   575  			doc.VirtualMachine.Devices.Scsi[guestrequest.ScsiControllerGuids[i]] = hcsschema.Scsi{
   576  				Attachments: make(map[string]hcsschema.Attachment),
   577  			}
   578  		}
   579  	}
   580  
   581  	if uvm.vpmemMaxCount > 0 {
   582  		doc.VirtualMachine.Devices.VirtualPMem = &hcsschema.VirtualPMemController{
   583  			MaximumCount:     uvm.vpmemMaxCount,
   584  			MaximumSizeBytes: uvm.vpmemMaxSizeBytes,
   585  		}
   586  	}
   587  
   588  	var kernelArgs string
   589  	switch opts.PreferredRootFSType {
   590  	case PreferredRootFSTypeInitRd:
   591  		if !opts.KernelDirect {
   592  			kernelArgs = "initrd=/" + opts.RootFSFile
   593  		}
   594  	case PreferredRootFSTypeVHD:
   595  		if uvm.vpmemMaxCount > 0 {
   596  			// Support for VPMem VHD(X) booting rather than initrd..
   597  			kernelArgs = "root=/dev/pmem0 ro rootwait init=/init"
   598  			imageFormat := "Vhd1"
   599  			if strings.ToLower(filepath.Ext(opts.RootFSFile)) == "vhdx" {
   600  				imageFormat = "Vhdx"
   601  			}
   602  			doc.VirtualMachine.Devices.VirtualPMem.Devices = map[string]hcsschema.VirtualPMemDevice{
   603  				"0": {
   604  					HostPath:    rootfsFullPath,
   605  					ReadOnly:    true,
   606  					ImageFormat: imageFormat,
   607  				},
   608  			}
   609  			if uvm.vpmemMultiMapping {
   610  				pmem := newPackedVPMemDevice()
   611  				pmem.maxMappedDeviceCount = 1
   612  
   613  				st, stErr := os.Stat(rootfsFullPath)
   614  				if stErr != nil {
   615  					return nil, errors.Wrapf(stErr, "failed to stat rootfs: %q", rootfsFullPath)
   616  				}
   617  				devSize := pageAlign(uint64(st.Size()))
   618  				memReg, pErr := pmem.Allocate(devSize)
   619  				if pErr != nil {
   620  					return nil, errors.Wrap(pErr, "failed to allocate memory for rootfs")
   621  				}
   622  				defer func() {
   623  					if err != nil {
   624  						if err = pmem.Release(memReg); err != nil {
   625  							log.G(ctx).WithError(err).Debug("failed to release memory region")
   626  						}
   627  					}
   628  				}()
   629  
   630  				dev := newVPMemMappedDevice(opts.RootFSFile, "/", devSize, memReg)
   631  				if err := pmem.mapVHDLayer(ctx, dev); err != nil {
   632  					return nil, errors.Wrapf(err, "failed to save internal state for a multi-mapped rootfs device")
   633  				}
   634  				uvm.vpmemDevicesMultiMapped[0] = pmem
   635  			} else {
   636  				dev := newDefaultVPMemInfo(opts.RootFSFile, "/")
   637  				uvm.vpmemDevicesDefault[0] = dev
   638  			}
   639  		} else {
   640  			kernelArgs = "root=/dev/sda ro rootwait init=/init"
   641  			doc.VirtualMachine.Devices.Scsi[guestrequest.ScsiControllerGuids[0]].Attachments["0"] = hcsschema.Attachment{
   642  				Type_:    "VirtualDisk",
   643  				Path:     rootfsFullPath,
   644  				ReadOnly: true,
   645  			}
   646  			uvm.scsiLocations[0][0] = newSCSIMount(uvm, rootfsFullPath, "/", "VirtualDisk", "", 1, 0, 0, true, false)
   647  		}
   648  	}
   649  
   650  	vmDebugging := false
   651  	if opts.ConsolePipe != "" {
   652  		vmDebugging = true
   653  		kernelArgs += " 8250_core.nr_uarts=1 8250_core.skip_txen_test=1 console=ttyS0,115200"
   654  		doc.VirtualMachine.Devices.ComPorts = map[string]hcsschema.ComPort{
   655  			"0": { // Which is actually COM1
   656  				NamedPipe: opts.ConsolePipe,
   657  			},
   658  		}
   659  	} else {
   660  		kernelArgs += " 8250_core.nr_uarts=0"
   661  	}
   662  
   663  	if opts.EnableGraphicsConsole {
   664  		vmDebugging = true
   665  		kernelArgs += " console=tty"
   666  		doc.VirtualMachine.Devices.Keyboard = &hcsschema.Keyboard{}
   667  		doc.VirtualMachine.Devices.EnhancedModeVideo = &hcsschema.EnhancedModeVideo{}
   668  		doc.VirtualMachine.Devices.VideoMonitor = &hcsschema.VideoMonitor{}
   669  	}
   670  
   671  	if !vmDebugging {
   672  		// Terminate the VM if there is a kernel panic.
   673  		kernelArgs += " panic=-1 quiet"
   674  	}
   675  
   676  	// Add Kernel Boot options
   677  	if opts.KernelBootOptions != "" {
   678  		kernelArgs += " " + opts.KernelBootOptions
   679  	}
   680  
   681  	if !opts.VPCIEnabled {
   682  		kernelArgs += ` pci=off`
   683  	}
   684  
   685  	// Inject initial entropy over vsock during init launch.
   686  	entropyArgs := fmt.Sprintf("-e %d", entropyVsockPort)
   687  
   688  	// With default options, run GCS with stderr pointing to the vsock port
   689  	// created below in order to forward guest logs to logrus.
   690  	execCmdArgs := "/bin/vsockexec"
   691  
   692  	if opts.ForwardStdout {
   693  		execCmdArgs += fmt.Sprintf(" -o %d", linuxLogVsockPort)
   694  	}
   695  
   696  	if opts.ForwardStderr {
   697  		execCmdArgs += fmt.Sprintf(" -e %d", linuxLogVsockPort)
   698  	}
   699  
   700  	if opts.DisableTimeSyncService {
   701  		opts.ExecCommandLine = fmt.Sprintf("%s -disable-time-sync", opts.ExecCommandLine)
   702  	}
   703  
   704  	if log.IsScrubbingEnabled() {
   705  		opts.ExecCommandLine += " -scrub-logs"
   706  	}
   707  
   708  	execCmdArgs += " " + opts.ExecCommandLine
   709  
   710  	if opts.ProcessDumpLocation != "" {
   711  		execCmdArgs += " -core-dump-location " + opts.ProcessDumpLocation
   712  	}
   713  
   714  	initArgs := fmt.Sprintf("%s %s", entropyArgs, execCmdArgs)
   715  	if vmDebugging {
   716  		// Launch a shell on the console.
   717  		initArgs = entropyArgs + ` sh -c "` + execCmdArgs + ` & exec sh"`
   718  	}
   719  
   720  	kernelArgs += fmt.Sprintf(" nr_cpus=%d", opts.ProcessorCount)
   721  	kernelArgs += ` brd.rd_nr=0 pmtmr=0 -- ` + initArgs
   722  
   723  	if !opts.KernelDirect {
   724  		doc.VirtualMachine.Chipset.Uefi = &hcsschema.Uefi{
   725  			BootThis: &hcsschema.UefiBootEntry{
   726  				DevicePath:    `\` + opts.KernelFile,
   727  				DeviceType:    "VmbFs",
   728  				VmbFsRootPath: opts.BootFilesPath,
   729  				OptionalData:  kernelArgs,
   730  			},
   731  		}
   732  	} else {
   733  		doc.VirtualMachine.Chipset.LinuxKernelDirect = &hcsschema.LinuxKernelDirect{
   734  			KernelFilePath: kernelFullPath,
   735  			KernelCmdLine:  kernelArgs,
   736  		}
   737  		if opts.PreferredRootFSType == PreferredRootFSTypeInitRd {
   738  			doc.VirtualMachine.Chipset.LinuxKernelDirect.InitRdPath = rootfsFullPath
   739  		}
   740  	}
   741  	return doc, nil
   742  }
   743  
   744  // CreateLCOW creates an HCS compute system representing a utility VM. It
   745  // consumes a set of options derived from various defaults and options
   746  // expressed as annotations.
   747  func CreateLCOW(ctx context.Context, opts *OptionsLCOW) (_ *UtilityVM, err error) {
   748  	ctx, span := oc.StartSpan(ctx, "uvm::CreateLCOW")
   749  	defer span.End()
   750  	defer func() { oc.SetSpanStatus(span, err) }()
   751  
   752  	if opts.ID == "" {
   753  		g, err := guid.NewV4()
   754  		if err != nil {
   755  			return nil, err
   756  		}
   757  		opts.ID = g.String()
   758  	}
   759  
   760  	span.AddAttributes(trace.StringAttribute(logfields.UVMID, opts.ID))
   761  	log.G(ctx).WithField("options", fmt.Sprintf("%+v", opts)).Debug("uvm::CreateLCOW options")
   762  
   763  	// We dont serialize OutputHandler so if it is missing we need to put it back to the default.
   764  	if opts.OutputHandler == nil {
   765  		opts.OutputHandler = parseLogrus(opts.ID)
   766  	}
   767  
   768  	uvm := &UtilityVM{
   769  		id:                      opts.ID,
   770  		owner:                   opts.Owner,
   771  		operatingSystem:         "linux",
   772  		scsiControllerCount:     opts.SCSIControllerCount,
   773  		vpmemMaxCount:           opts.VPMemDeviceCount,
   774  		vpmemMaxSizeBytes:       opts.VPMemSizeBytes,
   775  		vpciDevices:             make(map[VPCIDeviceKey]*VPCIDevice),
   776  		physicallyBacked:        !opts.AllowOvercommit,
   777  		devicesPhysicallyBacked: opts.FullyPhysicallyBacked,
   778  		createOpts:              opts,
   779  		vpmemMultiMapping:       !opts.VPMemNoMultiMapping,
   780  		encryptScratch:          opts.EnableScratchEncryption,
   781  		noWritableFileShares:    opts.NoWritableFileShares,
   782  		confidentialUVMOptions:  opts.ConfidentialOptions,
   783  	}
   784  
   785  	defer func() {
   786  		if err != nil {
   787  			uvm.Close()
   788  		}
   789  	}()
   790  
   791  	// vpmemMaxCount has been set to 0 which means we are going to need multiple SCSI controllers
   792  	// to support lots of layers.
   793  	if osversion.Build() >= osversion.RS5 && uvm.vpmemMaxCount == 0 {
   794  		uvm.scsiControllerCount = 4
   795  	}
   796  
   797  	if err = verifyOptions(ctx, opts); err != nil {
   798  		return nil, errors.Wrap(err, errBadUVMOpts.Error())
   799  	}
   800  
   801  	// HCS config for SNP isolated vm is quite different to the usual case
   802  	var doc *hcsschema.ComputeSystem
   803  	if opts.SecurityPolicyEnabled {
   804  		doc, err = makeLCOWSecurityDoc(ctx, opts, uvm)
   805  		log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWSecurityDoc result doc: %v err %v", doc, err)
   806  	} else {
   807  		doc, err = makeLCOWDoc(ctx, opts, uvm)
   808  		log.G(ctx).Tracef("create_lcow::CreateLCOW makeLCOWDoc result doc: %v err %v", doc, err)
   809  	}
   810  	if err != nil {
   811  		return nil, err
   812  	}
   813  
   814  	if err = uvm.create(ctx, doc); err != nil {
   815  		return nil, fmt.Errorf("error while creating the compute system: %w", err)
   816  	}
   817  	log.G(ctx).WithField("uvm", uvm).Trace("create_lcow::CreateLCOW uvm.create result")
   818  
   819  	// Create a socket to inject entropy during boot.
   820  	uvm.entropyListener, err = uvm.listenVsock(entropyVsockPort)
   821  	if err != nil {
   822  		return nil, err
   823  	}
   824  
   825  	// Create a socket that the executed program can send to. This is usually
   826  	// used by GCS to send log data.
   827  	if opts.ForwardStdout || opts.ForwardStderr {
   828  		uvm.outputHandler = opts.OutputHandler
   829  		uvm.outputProcessingDone = make(chan struct{})
   830  		uvm.outputListener, err = uvm.listenVsock(linuxLogVsockPort)
   831  		if err != nil {
   832  			return nil, err
   833  		}
   834  	}
   835  
   836  	if opts.UseGuestConnection {
   837  		log.G(ctx).WithField("vmID", uvm.runtimeID).Debug("Using external GCS bridge")
   838  		l, err := uvm.listenVsock(gcs.LinuxGcsVsockPort)
   839  		if err != nil {
   840  			return nil, err
   841  		}
   842  		uvm.gcListener = l
   843  	}
   844  
   845  	uvm.ncProxyClientAddress = opts.NetworkConfigProxy
   846  
   847  	return uvm, nil
   848  }
   849  
   850  func (uvm *UtilityVM) listenVsock(port uint32) (net.Listener, error) {
   851  	return winio.ListenHvsock(&winio.HvsockAddr{
   852  		VMID:      uvm.runtimeID,
   853  		ServiceID: winio.VsockServiceID(port),
   854  	})
   855  }
   856  

View as plain text