...

Source file src/github.com/opencontainers/runc/libcontainer/container_linux.go

Documentation: github.com/opencontainers/runc/libcontainer

     1  package libcontainer
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/json"
     6  	"errors"
     7  	"fmt"
     8  	"io"
     9  	"net"
    10  	"os"
    11  	"os/exec"
    12  	"path"
    13  	"path/filepath"
    14  	"reflect"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/checkpoint-restore/go-criu/v5"
    21  	criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
    22  	securejoin "github.com/cyphar/filepath-securejoin"
    23  	"github.com/opencontainers/runtime-spec/specs-go"
    24  	"github.com/sirupsen/logrus"
    25  	"github.com/vishvananda/netlink/nl"
    26  	"golang.org/x/sys/unix"
    27  	"google.golang.org/protobuf/proto"
    28  
    29  	"github.com/opencontainers/runc/libcontainer/cgroups"
    30  	"github.com/opencontainers/runc/libcontainer/configs"
    31  	"github.com/opencontainers/runc/libcontainer/intelrdt"
    32  	"github.com/opencontainers/runc/libcontainer/system"
    33  	"github.com/opencontainers/runc/libcontainer/utils"
    34  )
    35  
// stdioFdCount is the number of standard I/O descriptors (stdin, stdout and
// stderr). It is the offset added to an index into exec.Cmd.ExtraFiles to
// obtain the descriptor number advertised to the child through the
// _LIBCONTAINER_* environment variables.
const stdioFdCount = 3
    37  
// linuxContainer is the Linux container object whose methods are defined in
// this file.
//
// Field notes, as far as the code in this file shows:
//   - id is returned by ID; root is the directory holding the exec fifo and is
//     passed to the child as _LIBCONTAINER_STATEDIR.
//   - config is the active configuration; Set replaces it on success.
//   - cgroupManager is used unconditionally; intelRdtManager may be nil and is
//     checked before use.
//   - initPath and initArgs form the command line built by commandTemplate.
//   - initProcess is assigned by newInitProcess.
//   - criuPath is handed to the CRIU client; criuVersion caches the detected
//     version, with 0 meaning "not determined yet".
//   - m is locked by exported methods such as Status, Set, Start and Signal.
//   - state receives destroy and transition calls.
//   - fifo is the O_PATH handle on the exec fifo opened by includeExecFifo and
//     closed by start once the init process has been started.
type linuxContainer struct {
	id                   string
	root                 string
	config               *configs.Config
	cgroupManager        cgroups.Manager
	intelRdtManager      *intelrdt.Manager
	initPath             string
	initArgs             []string
	initProcess          parentProcess
	initProcessStartTime uint64
	criuPath             string
	newuidmapPath        string
	newgidmapPath        string
	m                    sync.Mutex
	criuVersion          int
	state                containerState
	created              time.Time
	fifo                 *os.File
}
    57  
// State represents a running container's state: the platform-independent
// BaseState plus the Linux-specific fields below. The struct tags give the
// JSON key used for each field.
type State struct {
	BaseState

	// Platform specific fields below here

	// Rootless records whether the container was started in rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// CgroupPaths holds the paths to all the container's cgroups, as returned
	// by (*cgroups.Manager).GetPaths.
	//
	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
	// to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors lists the container's standard descriptors
	// (std{in,out,err}), needed for checkpoint and restore. The key is omitted
	// from the JSON output when the slice is empty.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// IntelRdtPath is the Intel RDT "resource control" filesystem path.
	IntelRdtPath string `json:"intel_rdt_path"`
}
    86  
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause pauses the execution of any user processes.
	//
	// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
	// the execution of any user processes. Asynchronously, when the container finished being paused the
	// state is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	//
	// NOTE(review): the linuxContainer implementation in this file returns
	// ErrNotRunning for every state other than RUNNING or CREATED, PAUSED
	// included, rather than silently doing nothing.
	Pause() error

	// Resume resumes the execution of any user processes.
	//
	// If the Container state is PAUSED, resumes the execution of any user processes in the
	// Container before setting the Container state to RUNNING.
	// If the Container state is RUNNING, do nothing.
	//
	// NOTE(review): the linuxContainer implementation in this file returns
	// ErrNotPaused for every state other than PAUSED, RUNNING included.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
   120  
// ID returns the container's unique ID.
//
// The value is read directly from c.id; the container mutex is not taken.
func (c *linuxContainer) ID() string {
	return c.id
}
   125  
// Config returns the container's configuration.
//
// The result is a by-value copy obtained by dereferencing c.config, so it is a
// shallow copy: pointer, slice and map fields still refer to the container's
// own data. The container mutex is not taken.
func (c *linuxContainer) Config() configs.Config {
	return *c.config
}
   130  
// Status returns the container's current status.
//
// It holds the container mutex for the duration of the call and delegates to
// currentStatus.
func (c *linuxContainer) Status() (Status, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentStatus()
}
   136  
// State returns the container's current state.
//
// It holds the container mutex for the duration of the call and delegates to
// currentState.
func (c *linuxContainer) State() (*State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentState()
}
   142  
// OCIState returns the container's current state in runtime-spec form
// (*specs.State).
//
// It holds the container mutex for the duration of the call and delegates to
// currentOCIState.
func (c *linuxContainer) OCIState() (*specs.State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentOCIState()
}
   148  
   149  // ignoreCgroupError filters out cgroup-related errors that can be ignored,
   150  // because the container is stopped and its cgroup is gone.
   151  func (c *linuxContainer) ignoreCgroupError(err error) error {
   152  	if err == nil {
   153  		return nil
   154  	}
   155  	if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
   156  		return nil
   157  	}
   158  	return err
   159  }
   160  
   161  func (c *linuxContainer) Processes() ([]int, error) {
   162  	pids, err := c.cgroupManager.GetAllPids()
   163  	if err = c.ignoreCgroupError(err); err != nil {
   164  		return nil, fmt.Errorf("unable to get all container pids: %w", err)
   165  	}
   166  	return pids, nil
   167  }
   168  
   169  func (c *linuxContainer) Stats() (*Stats, error) {
   170  	var (
   171  		err   error
   172  		stats = &Stats{}
   173  	)
   174  	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
   175  		return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
   176  	}
   177  	if c.intelRdtManager != nil {
   178  		if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
   179  			return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
   180  		}
   181  	}
   182  	for _, iface := range c.config.Networks {
   183  		switch iface.Type {
   184  		case "veth":
   185  			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
   186  			if err != nil {
   187  				return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
   188  			}
   189  			stats.Interfaces = append(stats.Interfaces, istats)
   190  		}
   191  	}
   192  	return stats, nil
   193  }
   194  
   195  func (c *linuxContainer) Set(config configs.Config) error {
   196  	c.m.Lock()
   197  	defer c.m.Unlock()
   198  	status, err := c.currentStatus()
   199  	if err != nil {
   200  		return err
   201  	}
   202  	if status == Stopped {
   203  		return ErrNotRunning
   204  	}
   205  	if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
   206  		// Set configs back
   207  		if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
   208  			logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
   209  		}
   210  		return err
   211  	}
   212  	if c.intelRdtManager != nil {
   213  		if err := c.intelRdtManager.Set(&config); err != nil {
   214  			// Set configs back
   215  			if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
   216  				logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
   217  			}
   218  			if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
   219  				logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
   220  			}
   221  			return err
   222  		}
   223  	}
   224  	// After config setting succeed, update config and states
   225  	c.config = &config
   226  	_, err = c.updateState(nil)
   227  	return err
   228  }
   229  
   230  func (c *linuxContainer) Start(process *Process) error {
   231  	c.m.Lock()
   232  	defer c.m.Unlock()
   233  	if c.config.Cgroups.Resources.SkipDevices {
   234  		return errors.New("can't start container with SkipDevices set")
   235  	}
   236  	if process.Init {
   237  		if err := c.createExecFifo(); err != nil {
   238  			return err
   239  		}
   240  	}
   241  	if err := c.start(process); err != nil {
   242  		if process.Init {
   243  			c.deleteExecFifo()
   244  		}
   245  		return err
   246  	}
   247  	return nil
   248  }
   249  
   250  func (c *linuxContainer) Run(process *Process) error {
   251  	if err := c.Start(process); err != nil {
   252  		return err
   253  	}
   254  	if process.Init {
   255  		return c.exec()
   256  	}
   257  	return nil
   258  }
   259  
// Exec is the exported, locked entry point for exec: it takes the container
// mutex and then delegates to c.exec, returning its result.
func (c *linuxContainer) Exec() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.exec()
}
   265  
   266  func (c *linuxContainer) exec() error {
   267  	path := filepath.Join(c.root, execFifoFilename)
   268  	pid := c.initProcess.pid()
   269  	blockingFifoOpenCh := awaitFifoOpen(path)
   270  	for {
   271  		select {
   272  		case result := <-blockingFifoOpenCh:
   273  			return handleFifoResult(result)
   274  
   275  		case <-time.After(time.Millisecond * 100):
   276  			stat, err := system.Stat(pid)
   277  			if err != nil || stat.State == system.Zombie {
   278  				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
   279  				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
   280  				if err := handleFifoResult(fifoOpen(path, false)); err != nil {
   281  					return errors.New("container process is already dead")
   282  				}
   283  				return nil
   284  			}
   285  		}
   286  	}
   287  }
   288  
   289  func readFromExecFifo(execFifo io.Reader) error {
   290  	data, err := io.ReadAll(execFifo)
   291  	if err != nil {
   292  		return err
   293  	}
   294  	if len(data) <= 0 {
   295  		return errors.New("cannot start an already running container")
   296  	}
   297  	return nil
   298  }
   299  
   300  func awaitFifoOpen(path string) <-chan openResult {
   301  	fifoOpened := make(chan openResult)
   302  	go func() {
   303  		result := fifoOpen(path, true)
   304  		fifoOpened <- result
   305  	}()
   306  	return fifoOpened
   307  }
   308  
   309  func fifoOpen(path string, block bool) openResult {
   310  	flags := os.O_RDONLY
   311  	if !block {
   312  		flags |= unix.O_NONBLOCK
   313  	}
   314  	f, err := os.OpenFile(path, flags, 0)
   315  	if err != nil {
   316  		return openResult{err: fmt.Errorf("exec fifo: %w", err)}
   317  	}
   318  	return openResult{file: f}
   319  }
   320  
   321  func handleFifoResult(result openResult) error {
   322  	if result.err != nil {
   323  		return result.err
   324  	}
   325  	f := result.file
   326  	defer f.Close()
   327  	if err := readFromExecFifo(f); err != nil {
   328  		return err
   329  	}
   330  	return os.Remove(f.Name())
   331  }
   332  
// openResult is the outcome of an attempt to open the exec fifo, as produced
// by fifoOpen: exactly one of file and err is set.
type openResult struct {
	file *os.File // the opened fifo; nil when err is set
	err  error    // the wrapped open error; nil on success
}
   337  
// start builds the parent process for process, launches it, and for an init
// process runs the configured poststart hooks.
//
// The named result retErr exists so that the deferred log-forwarding wait can
// report a forwarding failure when the function would otherwise succeed.
//
// Callers in this file (Start) invoke it with the container mutex held.
func (c *linuxContainer) start(process *Process) (retErr error) {
	parent, err := c.newParentProcess(process)
	if err != nil {
		return fmt.Errorf("unable to create new parent process: %w", err)
	}

	logsDone := parent.forwardChildLogs()
	if logsDone != nil {
		defer func() {
			// Wait for log forwarder to finish. This depends on
			// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
			err := <-logsDone
			// An earlier error takes precedence over a forwarding error.
			if err != nil && retErr == nil {
				retErr = fmt.Errorf("unable to forward init logs: %w", err)
			}
		}()
	}

	// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
	// to make sure we don't leak any files into "runc init". Any files to be
	// passed to "runc init" through ExtraFiles will get dup2'd by the Go
	// runtime and thus their O_CLOEXEC flag will be cleared. This is some
	// additional protection against attacks like CVE-2024-21626, by making
	// sure we never leak files to "runc init" we didn't intend to.
	if err := utils.CloseExecFrom(3); err != nil {
		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
	}
	if err := parent.start(); err != nil {
		return fmt.Errorf("unable to start container process: %w", err)
	}

	if process.Init {
		// The parent's handle on the exec fifo (opened by includeExecFifo)
		// is no longer needed once the child has been started.
		c.fifo.Close()
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				return err
			}

			if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
				// NOTE(review): the inner err shadows the hook error, so the
				// warning below reports the terminate failure under a
				// "poststart hook" label; the hook error itself is what is
				// returned.
				if err := ignoreTerminateErrors(parent.terminate()); err != nil {
					logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
				}
				return err
			}
		}
	}
	return nil
}
   387  
   388  func (c *linuxContainer) Signal(s os.Signal, all bool) error {
   389  	c.m.Lock()
   390  	defer c.m.Unlock()
   391  	status, err := c.currentStatus()
   392  	if err != nil {
   393  		return err
   394  	}
   395  	if all {
   396  		if status == Stopped && !c.cgroupManager.Exists() {
   397  			// Avoid calling signalAllProcesses which may print
   398  			// a warning trying to freeze a non-existing cgroup.
   399  			return nil
   400  		}
   401  		return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s))
   402  	}
   403  	// to avoid a PID reuse attack
   404  	if status == Running || status == Created || status == Paused {
   405  		if err := c.initProcess.signal(s); err != nil {
   406  			return fmt.Errorf("unable to signal init: %w", err)
   407  		}
   408  		if status == Paused {
   409  			// For cgroup v1, killing a process in a frozen cgroup
   410  			// does nothing until it's thawed. Only thaw the cgroup
   411  			// for SIGKILL.
   412  			if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
   413  				_ = c.cgroupManager.Freeze(configs.Thawed)
   414  			}
   415  		}
   416  		return nil
   417  	}
   418  	return ErrNotRunning
   419  }
   420  
   421  func (c *linuxContainer) createExecFifo() error {
   422  	rootuid, err := c.Config().HostRootUID()
   423  	if err != nil {
   424  		return err
   425  	}
   426  	rootgid, err := c.Config().HostRootGID()
   427  	if err != nil {
   428  		return err
   429  	}
   430  
   431  	fifoName := filepath.Join(c.root, execFifoFilename)
   432  	if _, err := os.Stat(fifoName); err == nil {
   433  		return fmt.Errorf("exec fifo %s already exists", fifoName)
   434  	}
   435  	oldMask := unix.Umask(0o000)
   436  	if err := unix.Mkfifo(fifoName, 0o622); err != nil {
   437  		unix.Umask(oldMask)
   438  		return err
   439  	}
   440  	unix.Umask(oldMask)
   441  	return os.Chown(fifoName, rootuid, rootgid)
   442  }
   443  
   444  func (c *linuxContainer) deleteExecFifo() {
   445  	fifoName := filepath.Join(c.root, execFifoFilename)
   446  	os.Remove(fifoName)
   447  }
   448  
   449  // includeExecFifo opens the container's execfifo as a pathfd, so that the
   450  // container cannot access the statedir (and the FIFO itself remains
   451  // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
   452  // fd, with _LIBCONTAINER_FIFOFD set to its fd number.
   453  func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
   454  	fifoName := filepath.Join(c.root, execFifoFilename)
   455  	fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
   456  	if err != nil {
   457  		return err
   458  	}
   459  	c.fifo = fifo
   460  
   461  	cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
   462  	cmd.Env = append(cmd.Env,
   463  		"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
   464  	return nil
   465  }
   466  
   467  func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
   468  	parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
   469  	if err != nil {
   470  		return nil, fmt.Errorf("unable to create init pipe: %w", err)
   471  	}
   472  	messageSockPair := filePair{parentInitPipe, childInitPipe}
   473  
   474  	parentLogPipe, childLogPipe, err := os.Pipe()
   475  	if err != nil {
   476  		return nil, fmt.Errorf("unable to create log pipe: %w", err)
   477  	}
   478  	logFilePair := filePair{parentLogPipe, childLogPipe}
   479  
   480  	cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
   481  	if !p.Init {
   482  		return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
   483  	}
   484  
   485  	// We only set up fifoFd if we're not doing a `runc exec`. The historic
   486  	// reason for this is that previously we would pass a dirfd that allowed
   487  	// for container rootfs escape (and not doing it in `runc exec` avoided
   488  	// that problem), but we no longer do that. However, there's no need to do
   489  	// this for `runc exec` so we just keep it this way to be safe.
   490  	if err := c.includeExecFifo(cmd); err != nil {
   491  		return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
   492  	}
   493  	return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
   494  }
   495  
   496  func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
   497  	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
   498  	cmd.Args[0] = c.initArgs[0]
   499  	cmd.Stdin = p.Stdin
   500  	cmd.Stdout = p.Stdout
   501  	cmd.Stderr = p.Stderr
   502  	cmd.Dir = c.config.Rootfs
   503  	if cmd.SysProcAttr == nil {
   504  		cmd.SysProcAttr = &unix.SysProcAttr{}
   505  	}
   506  	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
   507  	cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
   508  	if p.ConsoleSocket != nil {
   509  		cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
   510  		cmd.Env = append(cmd.Env,
   511  			"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
   512  		)
   513  	}
   514  	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
   515  	cmd.Env = append(cmd.Env,
   516  		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
   517  		"_LIBCONTAINER_STATEDIR="+c.root,
   518  	)
   519  
   520  	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
   521  	cmd.Env = append(cmd.Env,
   522  		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
   523  		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
   524  	)
   525  
   526  	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
   527  	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
   528  	// even with the parent still running.
   529  	if c.config.ParentDeathSignal > 0 {
   530  		cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
   531  	}
   532  	return cmd
   533  }
   534  
   535  // shouldSendMountSources says whether the child process must setup bind mounts with
   536  // the source pre-opened (O_PATH) in the host user namespace.
   537  // See https://github.com/opencontainers/runc/issues/2484
   538  func (c *linuxContainer) shouldSendMountSources() bool {
   539  	// Passing the mount sources via SCM_RIGHTS is only necessary when
   540  	// both userns and mntns are active.
   541  	if !c.config.Namespaces.Contains(configs.NEWUSER) ||
   542  		!c.config.Namespaces.Contains(configs.NEWNS) {
   543  		return false
   544  	}
   545  
   546  	// nsexec.c send_mountsources() requires setns(mntns) capabilities
   547  	// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
   548  	if c.config.RootlessEUID {
   549  		return false
   550  	}
   551  
   552  	// We need to send sources if there are bind-mounts.
   553  	for _, m := range c.config.Mounts {
   554  		if m.IsBind() {
   555  			return true
   556  		}
   557  	}
   558  
   559  	return false
   560  }
   561  
   562  func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
   563  	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
   564  	nsMaps := make(map[configs.NamespaceType]string)
   565  	for _, ns := range c.config.Namespaces {
   566  		if ns.Path != "" {
   567  			nsMaps[ns.Type] = ns.Path
   568  		}
   569  	}
   570  	_, sharePidns := nsMaps[configs.NEWPID]
   571  	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
   572  	if err != nil {
   573  		return nil, err
   574  	}
   575  
   576  	if c.shouldSendMountSources() {
   577  		// Elements on this slice will be paired with mounts (see StartInitialization() and
   578  		// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
   579  		mountFds := make([]int, len(c.config.Mounts))
   580  		for i, m := range c.config.Mounts {
   581  			if !m.IsBind() {
   582  				// Non bind-mounts do not use an fd.
   583  				mountFds[i] = -1
   584  				continue
   585  			}
   586  
   587  			// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
   588  			// to allocate a fd so that we know the number to pass in the environment variable. The fd
   589  			// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
   590  			// lifecycle of that fd is already taken care of.
   591  			cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
   592  			mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
   593  		}
   594  
   595  		mountFdsJson, err := json.Marshal(mountFds)
   596  		if err != nil {
   597  			return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
   598  		}
   599  
   600  		cmd.Env = append(cmd.Env,
   601  			"_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
   602  		)
   603  	}
   604  
   605  	init := &initProcess{
   606  		cmd:             cmd,
   607  		messageSockPair: messageSockPair,
   608  		logFilePair:     logFilePair,
   609  		manager:         c.cgroupManager,
   610  		intelRdtManager: c.intelRdtManager,
   611  		config:          c.newInitConfig(p),
   612  		container:       c,
   613  		process:         p,
   614  		bootstrapData:   data,
   615  		sharePidns:      sharePidns,
   616  	}
   617  	c.initProcess = init
   618  	return init, nil
   619  }
   620  
   621  func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
   622  	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
   623  	state, err := c.currentState()
   624  	if err != nil {
   625  		return nil, fmt.Errorf("unable to get container state: %w", err)
   626  	}
   627  	// for setns process, we don't have to set cloneflags as the process namespaces
   628  	// will only be set via setns syscall
   629  	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
   630  	if err != nil {
   631  		return nil, err
   632  	}
   633  	proc := &setnsProcess{
   634  		cmd:             cmd,
   635  		cgroupPaths:     state.CgroupPaths,
   636  		rootlessCgroups: c.config.RootlessCgroups,
   637  		intelRdtPath:    state.IntelRdtPath,
   638  		messageSockPair: messageSockPair,
   639  		logFilePair:     logFilePair,
   640  		manager:         c.cgroupManager,
   641  		config:          c.newInitConfig(p),
   642  		process:         p,
   643  		bootstrapData:   data,
   644  		initProcessPid:  state.InitProcessPid,
   645  	}
   646  	if len(p.SubCgroupPaths) > 0 {
   647  		if add, ok := p.SubCgroupPaths[""]; ok {
   648  			// cgroup v1: using the same path for all controllers.
   649  			// cgroup v2: the only possible way.
   650  			for k := range proc.cgroupPaths {
   651  				subPath := path.Join(proc.cgroupPaths[k], add)
   652  				if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
   653  					return nil, fmt.Errorf("%s is not a sub cgroup path", add)
   654  				}
   655  				proc.cgroupPaths[k] = subPath
   656  			}
   657  			// cgroup v2: do not try to join init process's cgroup
   658  			// as a fallback (see (*setnsProcess).start).
   659  			proc.initProcessPid = 0
   660  		} else {
   661  			// Per-controller paths.
   662  			for ctrl, add := range p.SubCgroupPaths {
   663  				if val, ok := proc.cgroupPaths[ctrl]; ok {
   664  					subPath := path.Join(val, add)
   665  					if !strings.HasPrefix(subPath, val) {
   666  						return nil, fmt.Errorf("%s is not a sub cgroup path", add)
   667  					}
   668  					proc.cgroupPaths[ctrl] = subPath
   669  				} else {
   670  					return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
   671  				}
   672  			}
   673  		}
   674  	}
   675  	return proc, nil
   676  }
   677  
   678  func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
   679  	cfg := &initConfig{
   680  		Config:           c.config,
   681  		Args:             process.Args,
   682  		Env:              process.Env,
   683  		User:             process.User,
   684  		AdditionalGroups: process.AdditionalGroups,
   685  		Cwd:              process.Cwd,
   686  		Capabilities:     process.Capabilities,
   687  		PassedFilesCount: len(process.ExtraFiles),
   688  		ContainerId:      c.ID(),
   689  		NoNewPrivileges:  c.config.NoNewPrivileges,
   690  		RootlessEUID:     c.config.RootlessEUID,
   691  		RootlessCgroups:  c.config.RootlessCgroups,
   692  		AppArmorProfile:  c.config.AppArmorProfile,
   693  		ProcessLabel:     c.config.ProcessLabel,
   694  		Rlimits:          c.config.Rlimits,
   695  		CreateConsole:    process.ConsoleSocket != nil,
   696  		ConsoleWidth:     process.ConsoleWidth,
   697  		ConsoleHeight:    process.ConsoleHeight,
   698  	}
   699  	if process.NoNewPrivileges != nil {
   700  		cfg.NoNewPrivileges = *process.NoNewPrivileges
   701  	}
   702  	if process.AppArmorProfile != "" {
   703  		cfg.AppArmorProfile = process.AppArmorProfile
   704  	}
   705  	if process.Label != "" {
   706  		cfg.ProcessLabel = process.Label
   707  	}
   708  	if len(process.Rlimits) > 0 {
   709  		cfg.Rlimits = process.Rlimits
   710  	}
   711  	if cgroups.IsCgroup2UnifiedMode() {
   712  		cfg.Cgroup2Path = c.cgroupManager.Path("")
   713  	}
   714  
   715  	return cfg
   716  }
   717  
// Destroy destroys the container by delegating to the destroy method of its
// current state object, while holding the container mutex.
func (c *linuxContainer) Destroy() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.state.destroy()
}
   723  
   724  func (c *linuxContainer) Pause() error {
   725  	c.m.Lock()
   726  	defer c.m.Unlock()
   727  	status, err := c.currentStatus()
   728  	if err != nil {
   729  		return err
   730  	}
   731  	switch status {
   732  	case Running, Created:
   733  		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
   734  			return err
   735  		}
   736  		return c.state.transition(&pausedState{
   737  			c: c,
   738  		})
   739  	}
   740  	return ErrNotRunning
   741  }
   742  
   743  func (c *linuxContainer) Resume() error {
   744  	c.m.Lock()
   745  	defer c.m.Unlock()
   746  	status, err := c.currentStatus()
   747  	if err != nil {
   748  		return err
   749  	}
   750  	if status != Paused {
   751  		return ErrNotPaused
   752  	}
   753  	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
   754  		return err
   755  	}
   756  	return c.state.transition(&runningState{
   757  		c: c,
   758  	})
   759  }
   760  
   761  func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
   762  	// XXX(cyphar): This requires cgroups.
   763  	if c.config.RootlessCgroups {
   764  		logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
   765  	}
   766  	path := c.cgroupManager.Path("memory")
   767  	if cgroups.IsCgroup2UnifiedMode() {
   768  		return notifyOnOOMV2(path)
   769  	}
   770  	return notifyOnOOM(path)
   771  }
   772  
   773  func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
   774  	// XXX(cyphar): This requires cgroups.
   775  	if c.config.RootlessCgroups {
   776  		logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
   777  	}
   778  	return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
   779  }
   780  
// criuFeatures holds the feature set reported by the most recent CRIU feature
// check. checkCriuFeatures resets it to nil before issuing a request and reads
// it after the request returns.
// NOTE(review): presumably assigned by criuSwrk, which is defined outside this
// excerpt; as package-level state it is shared by all containers.
var criuFeatures *criurpc.CriuFeatures
   782  
   783  func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
   784  	t := criurpc.CriuReqType_FEATURE_CHECK
   785  
   786  	// make sure the features we are looking for are really not from
   787  	// some previous check
   788  	criuFeatures = nil
   789  
   790  	req := &criurpc.CriuReq{
   791  		Type: &t,
   792  		// Theoretically this should not be necessary but CRIU
   793  		// segfaults if Opts is empty.
   794  		// Fixed in CRIU  2.12
   795  		Opts:     rpcOpts,
   796  		Features: criuFeat,
   797  	}
   798  
   799  	err := c.criuSwrk(nil, req, criuOpts, nil)
   800  	if err != nil {
   801  		logrus.Debugf("%s", err)
   802  		return errors.New("CRIU feature check failed")
   803  	}
   804  
   805  	missingFeatures := false
   806  
   807  	// The outer if checks if the fields actually exist
   808  	if (criuFeat.MemTrack != nil) &&
   809  		(criuFeatures.MemTrack != nil) {
   810  		// The inner if checks if they are set to true
   811  		if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
   812  			missingFeatures = true
   813  			logrus.Debugf("CRIU does not support MemTrack")
   814  		}
   815  	}
   816  
   817  	// This needs to be repeated for every new feature check.
   818  	// Is there a way to put this in a function. Reflection?
   819  	if (criuFeat.LazyPages != nil) &&
   820  		(criuFeatures.LazyPages != nil) {
   821  		if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
   822  			missingFeatures = true
   823  			logrus.Debugf("CRIU does not support LazyPages")
   824  		}
   825  	}
   826  
   827  	if missingFeatures {
   828  		return errors.New("CRIU is missing features")
   829  	}
   830  
   831  	return nil
   832  }
   833  
   834  func compareCriuVersion(criuVersion int, minVersion int) error {
   835  	// simple function to perform the actual version compare
   836  	if criuVersion < minVersion {
   837  		return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
   838  	}
   839  
   840  	return nil
   841  }
   842  
   843  // checkCriuVersion checks Criu version greater than or equal to minVersion
   844  func (c *linuxContainer) checkCriuVersion(minVersion int) error {
   845  	// If the version of criu has already been determined there is no need
   846  	// to ask criu for the version again. Use the value from c.criuVersion.
   847  	if c.criuVersion != 0 {
   848  		return compareCriuVersion(c.criuVersion, minVersion)
   849  	}
   850  
   851  	criu := criu.MakeCriu()
   852  	criu.SetCriuPath(c.criuPath)
   853  	var err error
   854  	c.criuVersion, err = criu.GetCriuVersion()
   855  	if err != nil {
   856  		return fmt.Errorf("CRIU version check failed: %w", err)
   857  	}
   858  
   859  	return compareCriuVersion(c.criuVersion, minVersion)
   860  }
   861  
// descriptorsFilename is the name of the file inside the checkpoint images
// directory to which Checkpoint writes the JSON-encoded external descriptors
// of the init process and from which Restore reads them back.
const descriptorsFilename = "descriptors.json"
   863  
   864  func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
   865  	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
   866  	if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
   867  		mountDest = dest[len(c.config.Rootfs):]
   868  	}
   869  	extMnt := &criurpc.ExtMountMap{
   870  		Key: proto.String(mountDest),
   871  		Val: proto.String(mountDest),
   872  	}
   873  	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
   874  }
   875  
   876  func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
   877  	for _, path := range c.config.MaskPaths {
   878  		fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
   879  		if err != nil {
   880  			if os.IsNotExist(err) {
   881  				continue
   882  			}
   883  			return err
   884  		}
   885  		if fi.IsDir() {
   886  			continue
   887  		}
   888  
   889  		extMnt := &criurpc.ExtMountMap{
   890  			Key: proto.String(path),
   891  			Val: proto.String("/dev/null"),
   892  		}
   893  		req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
   894  	}
   895  	return nil
   896  }
   897  
   898  func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
   899  	// CRIU will evaluate a configuration starting with release 3.11.
   900  	// Settings in the configuration file will overwrite RPC settings.
   901  	// Look for annotations. The annotation 'org.criu.config'
   902  	// specifies if CRIU should use a different, container specific
   903  	// configuration file.
   904  	_, annotations := utils.Annotations(c.config.Labels)
   905  	configFile, exists := annotations["org.criu.config"]
   906  	if exists {
   907  		// If the annotation 'org.criu.config' exists and is set
   908  		// to a non-empty string, tell CRIU to use that as a
   909  		// configuration file. If the file does not exist, CRIU
   910  		// will just ignore it.
   911  		if configFile != "" {
   912  			rpcOpts.ConfigFile = proto.String(configFile)
   913  		}
   914  		// If 'org.criu.config' exists and is set to an empty
   915  		// string, a runc specific CRIU configuration file will
   916  		// be not set at all.
   917  	} else {
   918  		// If the mentioned annotation has not been found, specify
   919  		// a default CRIU configuration file.
   920  		rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
   921  	}
   922  }
   923  
   924  func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
   925  	var minVersion int
   926  	switch t {
   927  	case configs.NEWNET:
   928  		// CRIU supports different external namespace with different released CRIU versions.
   929  		// For network namespaces to work we need at least criu 3.11.0 => 31100.
   930  		minVersion = 31100
   931  	case configs.NEWPID:
   932  		// For PID namespaces criu 31500 is needed.
   933  		minVersion = 31500
   934  	default:
   935  		return false
   936  	}
   937  	return c.checkCriuVersion(minVersion) == nil
   938  }
   939  
   940  func criuNsToKey(t configs.NamespaceType) string {
   941  	return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
   942  }
   943  
   944  func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
   945  	if !c.criuSupportsExtNS(t) {
   946  		return nil
   947  	}
   948  
   949  	nsPath := c.config.Namespaces.PathOf(t)
   950  	if nsPath == "" {
   951  		return nil
   952  	}
   953  	// CRIU expects the information about an external namespace
   954  	// like this: --external <TYPE>[<inode>]:<key>
   955  	// This <key> is always 'extRoot<TYPE>NS'.
   956  	var ns unix.Stat_t
   957  	if err := unix.Stat(nsPath, &ns); err != nil {
   958  		return err
   959  	}
   960  	criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
   961  	rpcOpts.External = append(rpcOpts.External, criuExternal)
   962  
   963  	return nil
   964  }
   965  
   966  func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
   967  	for _, ns := range c.config.Namespaces {
   968  		switch ns.Type {
   969  		case configs.NEWNET, configs.NEWPID:
   970  			// If the container is running in a network or PID namespace and has
   971  			// a path to the network or PID namespace configured, we will dump
   972  			// that network or PID namespace as an external namespace and we
   973  			// will expect that the namespace exists during restore.
   974  			// This basically means that CRIU will ignore the namespace
   975  			// and expect it to be setup correctly.
   976  			if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
   977  				return err
   978  			}
   979  		default:
   980  			// For all other namespaces except NET and PID CRIU has
   981  			// a simpler way of joining the existing namespace if set
   982  			nsPath := c.config.Namespaces.PathOf(ns.Type)
   983  			if nsPath == "" {
   984  				continue
   985  			}
   986  			if ns.Type == configs.NEWCGROUP {
   987  				// CRIU has no code to handle NEWCGROUP
   988  				return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
   989  			}
   990  			// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
   991  
   992  			// CRIU will issue a warning for NEWUSER:
   993  			// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
   994  			rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
   995  				Ns:     proto.String(configs.NsName(ns.Type)),
   996  				NsFile: proto.String(nsPath),
   997  			})
   998  		}
   999  	}
  1000  
  1001  	return nil
  1002  }
  1003  
  1004  func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
  1005  	if !c.criuSupportsExtNS(t) {
  1006  		return nil
  1007  	}
  1008  
  1009  	nsPath := c.config.Namespaces.PathOf(t)
  1010  	if nsPath == "" {
  1011  		return nil
  1012  	}
  1013  	// CRIU wants the information about an existing namespace
  1014  	// like this: --inherit-fd fd[<fd>]:<key>
  1015  	// The <key> needs to be the same as during checkpointing.
  1016  	// We are always using 'extRoot<TYPE>NS' as the key in this.
  1017  	nsFd, err := os.Open(nsPath)
  1018  	if err != nil {
  1019  		logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
  1020  		return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
  1021  	}
  1022  	inheritFd := &criurpc.InheritFd{
  1023  		Key: proto.String(criuNsToKey(t)),
  1024  		// The offset of four is necessary because 0, 1, 2 and 3 are
  1025  		// already used by stdin, stdout, stderr, 'criu swrk' socket.
  1026  		Fd: proto.Int32(int32(4 + len(*extraFiles))),
  1027  	}
  1028  	rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
  1029  	// All open FDs need to be transferred to CRIU via extraFiles
  1030  	*extraFiles = append(*extraFiles, nsFd)
  1031  
  1032  	return nil
  1033  }
  1034  
  1035  func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
  1036  	c.m.Lock()
  1037  	defer c.m.Unlock()
  1038  
  1039  	// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  1040  	// (CLI prints a warning)
  1041  	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
  1042  	//               support for doing unprivileged dumps, but the setup of
  1043  	//               rootless containers might make this complicated.
  1044  
  1045  	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
  1046  	if err := c.checkCriuVersion(30000); err != nil {
  1047  		return err
  1048  	}
  1049  
  1050  	if criuOpts.ImagesDirectory == "" {
  1051  		return errors.New("invalid directory to save checkpoint")
  1052  	}
  1053  
  1054  	// Since a container can be C/R'ed multiple times,
  1055  	// the checkpoint directory may already exist.
  1056  	if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
  1057  		return err
  1058  	}
  1059  
  1060  	imageDir, err := os.Open(criuOpts.ImagesDirectory)
  1061  	if err != nil {
  1062  		return err
  1063  	}
  1064  	defer imageDir.Close()
  1065  
  1066  	rpcOpts := criurpc.CriuOpts{
  1067  		ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
  1068  		LogLevel:        proto.Int32(4),
  1069  		LogFile:         proto.String("dump.log"),
  1070  		Root:            proto.String(c.config.Rootfs),
  1071  		ManageCgroups:   proto.Bool(true),
  1072  		NotifyScripts:   proto.Bool(true),
  1073  		Pid:             proto.Int32(int32(c.initProcess.pid())),
  1074  		ShellJob:        proto.Bool(criuOpts.ShellJob),
  1075  		LeaveRunning:    proto.Bool(criuOpts.LeaveRunning),
  1076  		TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
  1077  		ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
  1078  		FileLocks:       proto.Bool(criuOpts.FileLocks),
  1079  		EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
  1080  		OrphanPtsMaster: proto.Bool(true),
  1081  		AutoDedup:       proto.Bool(criuOpts.AutoDedup),
  1082  		LazyPages:       proto.Bool(criuOpts.LazyPages),
  1083  	}
  1084  
  1085  	// if criuOpts.WorkDirectory is not set, criu default is used.
  1086  	if criuOpts.WorkDirectory != "" {
  1087  		if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
  1088  			return err
  1089  		}
  1090  		workDir, err := os.Open(criuOpts.WorkDirectory)
  1091  		if err != nil {
  1092  			return err
  1093  		}
  1094  		defer workDir.Close()
  1095  		rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
  1096  	}
  1097  
  1098  	c.handleCriuConfigurationFile(&rpcOpts)
  1099  
  1100  	// If the container is running in a network namespace and has
  1101  	// a path to the network namespace configured, we will dump
  1102  	// that network namespace as an external namespace and we
  1103  	// will expect that the namespace exists during restore.
  1104  	// This basically means that CRIU will ignore the namespace
  1105  	// and expect to be setup correctly.
  1106  	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
  1107  		return err
  1108  	}
  1109  
  1110  	// Same for possible external PID namespaces
  1111  	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
  1112  		return err
  1113  	}
  1114  
  1115  	// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
  1116  	// is not set, CRIU uses ptrace() to pause the processes.
  1117  	// Note cgroup v2 freezer is only supported since CRIU release 3.14.
  1118  	if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
  1119  		if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
  1120  			rpcOpts.FreezeCgroup = proto.String(fcg)
  1121  		}
  1122  	}
  1123  
  1124  	// append optional criu opts, e.g., page-server and port
  1125  	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
  1126  		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
  1127  			Address: proto.String(criuOpts.PageServer.Address),
  1128  			Port:    proto.Int32(criuOpts.PageServer.Port),
  1129  		}
  1130  	}
  1131  
  1132  	// pre-dump may need parentImage param to complete iterative migration
  1133  	if criuOpts.ParentImage != "" {
  1134  		rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
  1135  		rpcOpts.TrackMem = proto.Bool(true)
  1136  	}
  1137  
  1138  	// append optional manage cgroups mode
  1139  	if criuOpts.ManageCgroupsMode != 0 {
  1140  		mode := criuOpts.ManageCgroupsMode
  1141  		rpcOpts.ManageCgroupsMode = &mode
  1142  	}
  1143  
  1144  	var t criurpc.CriuReqType
  1145  	if criuOpts.PreDump {
  1146  		feat := criurpc.CriuFeatures{
  1147  			MemTrack: proto.Bool(true),
  1148  		}
  1149  
  1150  		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  1151  			return err
  1152  		}
  1153  
  1154  		t = criurpc.CriuReqType_PRE_DUMP
  1155  	} else {
  1156  		t = criurpc.CriuReqType_DUMP
  1157  	}
  1158  
  1159  	if criuOpts.LazyPages {
  1160  		// lazy migration requested; check if criu supports it
  1161  		feat := criurpc.CriuFeatures{
  1162  			LazyPages: proto.Bool(true),
  1163  		}
  1164  		if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  1165  			return err
  1166  		}
  1167  
  1168  		if fd := criuOpts.StatusFd; fd != -1 {
  1169  			// check that the FD is valid
  1170  			flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
  1171  			if err != nil {
  1172  				return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
  1173  			}
  1174  			// and writable
  1175  			if flags&unix.O_WRONLY == 0 {
  1176  				return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
  1177  			}
  1178  
  1179  			if c.checkCriuVersion(31500) != nil {
  1180  				// For criu 3.15+, use notifications (see case "status-ready"
  1181  				// in criuNotifications). Otherwise, rely on criu status fd.
  1182  				rpcOpts.StatusFd = proto.Int32(int32(fd))
  1183  			}
  1184  		}
  1185  	}
  1186  
  1187  	req := &criurpc.CriuReq{
  1188  		Type: &t,
  1189  		Opts: &rpcOpts,
  1190  	}
  1191  
  1192  	// no need to dump all this in pre-dump
  1193  	if !criuOpts.PreDump {
  1194  		hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
  1195  		for _, m := range c.config.Mounts {
  1196  			switch m.Device {
  1197  			case "bind":
  1198  				c.addCriuDumpMount(req, m)
  1199  			case "cgroup":
  1200  				if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
  1201  					// real mount(s)
  1202  					continue
  1203  				}
  1204  				// a set of "external" bind mounts
  1205  				binds, err := getCgroupMounts(m)
  1206  				if err != nil {
  1207  					return err
  1208  				}
  1209  				for _, b := range binds {
  1210  					c.addCriuDumpMount(req, b)
  1211  				}
  1212  			}
  1213  		}
  1214  
  1215  		if err := c.addMaskPaths(req); err != nil {
  1216  			return err
  1217  		}
  1218  
  1219  		for _, node := range c.config.Devices {
  1220  			m := &configs.Mount{Destination: node.Path, Source: node.Path}
  1221  			c.addCriuDumpMount(req, m)
  1222  		}
  1223  
  1224  		// Write the FD info to a file in the image directory
  1225  		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
  1226  		if err != nil {
  1227  			return err
  1228  		}
  1229  
  1230  		err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
  1231  		if err != nil {
  1232  			return err
  1233  		}
  1234  	}
  1235  
  1236  	err = c.criuSwrk(nil, req, criuOpts, nil)
  1237  	if err != nil {
  1238  		return err
  1239  	}
  1240  	return nil
  1241  }
  1242  
  1243  func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
  1244  	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
  1245  	if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
  1246  		mountDest = dest[len(c.config.Rootfs):]
  1247  	}
  1248  	extMnt := &criurpc.ExtMountMap{
  1249  		Key: proto.String(mountDest),
  1250  		Val: proto.String(m.Source),
  1251  	}
  1252  	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  1253  }
  1254  
  1255  func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
  1256  	for _, iface := range c.config.Networks {
  1257  		switch iface.Type {
  1258  		case "veth":
  1259  			veth := new(criurpc.CriuVethPair)
  1260  			veth.IfOut = proto.String(iface.HostInterfaceName)
  1261  			veth.IfIn = proto.String(iface.Name)
  1262  			req.Opts.Veths = append(req.Opts.Veths, veth)
  1263  		case "loopback":
  1264  			// Do nothing
  1265  		}
  1266  	}
  1267  	for _, i := range criuOpts.VethPairs {
  1268  		veth := new(criurpc.CriuVethPair)
  1269  		veth.IfOut = proto.String(i.HostInterfaceName)
  1270  		veth.IfIn = proto.String(i.ContainerInterfaceName)
  1271  		req.Opts.Veths = append(req.Opts.Veths, veth)
  1272  	}
  1273  }
  1274  
  1275  // makeCriuRestoreMountpoints makes the actual mountpoints for the
  1276  // restore using CRIU. This function is inspired from the code in
  1277  // rootfs_linux.go
  1278  func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
  1279  	switch m.Device {
  1280  	case "cgroup":
  1281  		// No mount point(s) need to be created:
  1282  		//
  1283  		// * for v1, mount points are saved by CRIU because
  1284  		//   /sys/fs/cgroup is a tmpfs mount
  1285  		//
  1286  		// * for v2, /sys/fs/cgroup is a real mount, but
  1287  		//   the mountpoint appears as soon as /sys is mounted
  1288  		return nil
  1289  	case "bind":
  1290  		// The prepareBindMount() function checks if source
  1291  		// exists. So it cannot be used for other filesystem types.
  1292  		// TODO: pass something else than nil? Not sure if criu is
  1293  		// impacted by issue #2484
  1294  		if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
  1295  			return err
  1296  		}
  1297  	default:
  1298  		// for all other filesystems just create the mountpoints
  1299  		dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
  1300  		if err != nil {
  1301  			return err
  1302  		}
  1303  		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
  1304  			return err
  1305  		}
  1306  		if err := os.MkdirAll(dest, 0o755); err != nil {
  1307  			return err
  1308  		}
  1309  	}
  1310  	return nil
  1311  }
  1312  
  1313  // isPathInPrefixList is a small function for CRIU restore to make sure
  1314  // mountpoints, which are on a tmpfs, are not created in the roofs
  1315  func isPathInPrefixList(path string, prefix []string) bool {
  1316  	for _, p := range prefix {
  1317  		if strings.HasPrefix(path, p+"/") {
  1318  			return true
  1319  		}
  1320  	}
  1321  	return false
  1322  }
  1323  
  1324  // prepareCriuRestoreMounts tries to set up the rootfs of the
  1325  // container to be restored in the same way runc does it for
  1326  // initial container creation. Even for a read-only rootfs container
  1327  // runc modifies the rootfs to add mountpoints which do not exist.
  1328  // This function also creates missing mountpoints as long as they
  1329  // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
  1330  func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
  1331  	// First get a list of a all tmpfs mounts
  1332  	tmpfs := []string{}
  1333  	for _, m := range mounts {
  1334  		switch m.Device {
  1335  		case "tmpfs":
  1336  			tmpfs = append(tmpfs, m.Destination)
  1337  		}
  1338  	}
  1339  	// Now go through all mounts and create the mountpoints
  1340  	// if the mountpoints are not on a tmpfs, as CRIU will
  1341  	// restore the complete tmpfs content from its checkpoint.
  1342  	umounts := []string{}
  1343  	defer func() {
  1344  		for _, u := range umounts {
  1345  			_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
  1346  				if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
  1347  					if e != unix.EINVAL { //nolint:errorlint // unix errors are bare
  1348  						// Ignore EINVAL as it means 'target is not a mount point.'
  1349  						// It probably has already been unmounted.
  1350  						logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
  1351  					}
  1352  				}
  1353  				return nil
  1354  			})
  1355  		}
  1356  	}()
  1357  	for _, m := range mounts {
  1358  		if !isPathInPrefixList(m.Destination, tmpfs) {
  1359  			if err := c.makeCriuRestoreMountpoints(m); err != nil {
  1360  				return err
  1361  			}
  1362  			// If the mount point is a bind mount, we need to mount
  1363  			// it now so that runc can create the necessary mount
  1364  			// points for mounts in bind mounts.
  1365  			// This also happens during initial container creation.
  1366  			// Without this CRIU restore will fail
  1367  			// See: https://github.com/opencontainers/runc/issues/2748
  1368  			// It is also not necessary to order the mount points
  1369  			// because during initial container creation mounts are
  1370  			// set up in the order they are configured.
  1371  			if m.Device == "bind" {
  1372  				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
  1373  					if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
  1374  						return err
  1375  					}
  1376  					return nil
  1377  				}); err != nil {
  1378  					return err
  1379  				}
  1380  				umounts = append(umounts, m.Destination)
  1381  			}
  1382  		}
  1383  	}
  1384  	return nil
  1385  }
  1386  
  1387  func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
  1388  	c.m.Lock()
  1389  	defer c.m.Unlock()
  1390  
  1391  	var extraFiles []*os.File
  1392  
  1393  	// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  1394  	// (CLI prints a warning)
  1395  	// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
  1396  	//               support for unprivileged restore at the moment.
  1397  
  1398  	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
  1399  	if err := c.checkCriuVersion(30000); err != nil {
  1400  		return err
  1401  	}
  1402  	if criuOpts.ImagesDirectory == "" {
  1403  		return errors.New("invalid directory to restore checkpoint")
  1404  	}
  1405  	imageDir, err := os.Open(criuOpts.ImagesDirectory)
  1406  	if err != nil {
  1407  		return err
  1408  	}
  1409  	defer imageDir.Close()
  1410  	// CRIU has a few requirements for a root directory:
  1411  	// * it must be a mount point
  1412  	// * its parent must not be overmounted
  1413  	// c.config.Rootfs is bind-mounted to a temporary directory
  1414  	// to satisfy these requirements.
  1415  	root := filepath.Join(c.root, "criu-root")
  1416  	if err := os.Mkdir(root, 0o755); err != nil {
  1417  		return err
  1418  	}
  1419  	defer os.Remove(root)
  1420  	root, err = filepath.EvalSymlinks(root)
  1421  	if err != nil {
  1422  		return err
  1423  	}
  1424  	err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
  1425  	if err != nil {
  1426  		return err
  1427  	}
  1428  	defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
  1429  	t := criurpc.CriuReqType_RESTORE
  1430  	req := &criurpc.CriuReq{
  1431  		Type: &t,
  1432  		Opts: &criurpc.CriuOpts{
  1433  			ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
  1434  			EvasiveDevices:  proto.Bool(true),
  1435  			LogLevel:        proto.Int32(4),
  1436  			LogFile:         proto.String("restore.log"),
  1437  			RstSibling:      proto.Bool(true),
  1438  			Root:            proto.String(root),
  1439  			ManageCgroups:   proto.Bool(true),
  1440  			NotifyScripts:   proto.Bool(true),
  1441  			ShellJob:        proto.Bool(criuOpts.ShellJob),
  1442  			ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
  1443  			TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
  1444  			FileLocks:       proto.Bool(criuOpts.FileLocks),
  1445  			EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
  1446  			OrphanPtsMaster: proto.Bool(true),
  1447  			AutoDedup:       proto.Bool(criuOpts.AutoDedup),
  1448  			LazyPages:       proto.Bool(criuOpts.LazyPages),
  1449  		},
  1450  	}
  1451  
  1452  	if criuOpts.LsmProfile != "" {
  1453  		// CRIU older than 3.16 has a bug which breaks the possibility
  1454  		// to set a different LSM profile.
  1455  		if err := c.checkCriuVersion(31600); err != nil {
  1456  			return errors.New("--lsm-profile requires at least CRIU 3.16")
  1457  		}
  1458  		req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
  1459  	}
  1460  	if criuOpts.LsmMountContext != "" {
  1461  		if err := c.checkCriuVersion(31600); err != nil {
  1462  			return errors.New("--lsm-mount-context requires at least CRIU 3.16")
  1463  		}
  1464  		req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
  1465  	}
  1466  
  1467  	if criuOpts.WorkDirectory != "" {
  1468  		// Since a container can be C/R'ed multiple times,
  1469  		// the work directory may already exist.
  1470  		if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
  1471  			return err
  1472  		}
  1473  		workDir, err := os.Open(criuOpts.WorkDirectory)
  1474  		if err != nil {
  1475  			return err
  1476  		}
  1477  		defer workDir.Close()
  1478  		req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
  1479  	}
  1480  	c.handleCriuConfigurationFile(req.Opts)
  1481  
  1482  	if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
  1483  		return err
  1484  	}
  1485  
  1486  	// This will modify the rootfs of the container in the same way runc
  1487  	// modifies the container during initial creation.
  1488  	if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
  1489  		return err
  1490  	}
  1491  
  1492  	hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
  1493  	for _, m := range c.config.Mounts {
  1494  		switch m.Device {
  1495  		case "bind":
  1496  			c.addCriuRestoreMount(req, m)
  1497  		case "cgroup":
  1498  			if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
  1499  				continue
  1500  			}
  1501  			// cgroup v1 is a set of bind mounts, unless cgroupns is used
  1502  			binds, err := getCgroupMounts(m)
  1503  			if err != nil {
  1504  				return err
  1505  			}
  1506  			for _, b := range binds {
  1507  				c.addCriuRestoreMount(req, b)
  1508  			}
  1509  		}
  1510  	}
  1511  
  1512  	if len(c.config.MaskPaths) > 0 {
  1513  		m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
  1514  		c.addCriuRestoreMount(req, m)
  1515  	}
  1516  
  1517  	for _, node := range c.config.Devices {
  1518  		m := &configs.Mount{Destination: node.Path, Source: node.Path}
  1519  		c.addCriuRestoreMount(req, m)
  1520  	}
  1521  
  1522  	if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
  1523  		c.restoreNetwork(req, criuOpts)
  1524  	}
  1525  
  1526  	// append optional manage cgroups mode
  1527  	if criuOpts.ManageCgroupsMode != 0 {
  1528  		mode := criuOpts.ManageCgroupsMode
  1529  		req.Opts.ManageCgroupsMode = &mode
  1530  	}
  1531  
  1532  	var (
  1533  		fds    []string
  1534  		fdJSON []byte
  1535  	)
  1536  	if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
  1537  		return err
  1538  	}
  1539  
  1540  	if err := json.Unmarshal(fdJSON, &fds); err != nil {
  1541  		return err
  1542  	}
  1543  	for i := range fds {
  1544  		if s := fds[i]; strings.Contains(s, "pipe:") {
  1545  			inheritFd := new(criurpc.InheritFd)
  1546  			inheritFd.Key = proto.String(s)
  1547  			inheritFd.Fd = proto.Int32(int32(i))
  1548  			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
  1549  		}
  1550  	}
  1551  	err = c.criuSwrk(process, req, criuOpts, extraFiles)
  1552  
  1553  	// Now that CRIU is done let's close all opened FDs CRIU needed.
  1554  	for _, fd := range extraFiles {
  1555  		fd.Close()
  1556  	}
  1557  
  1558  	return err
  1559  }
  1560  
  1561  func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
  1562  	// need to apply cgroups only on restore
  1563  	if req.GetType() != criurpc.CriuReqType_RESTORE {
  1564  		return nil
  1565  	}
  1566  
  1567  	// XXX: Do we need to deal with this case? AFAIK criu still requires root.
  1568  	if err := c.cgroupManager.Apply(pid); err != nil {
  1569  		return err
  1570  	}
  1571  
  1572  	if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
  1573  		return err
  1574  	}
  1575  
  1576  	if cgroups.IsCgroup2UnifiedMode() {
  1577  		return nil
  1578  	}
  1579  	// the stuff below is cgroupv1-specific
  1580  
  1581  	path := fmt.Sprintf("/proc/%d/cgroup", pid)
  1582  	cgroupsPaths, err := cgroups.ParseCgroupFile(path)
  1583  	if err != nil {
  1584  		return err
  1585  	}
  1586  
  1587  	for c, p := range cgroupsPaths {
  1588  		cgroupRoot := &criurpc.CgroupRoot{
  1589  			Ctrl: proto.String(c),
  1590  			Path: proto.String(p),
  1591  		}
  1592  		req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
  1593  	}
  1594  
  1595  	return nil
  1596  }
  1597  
  1598  func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
  1599  	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
  1600  	if err != nil {
  1601  		return err
  1602  	}
  1603  
  1604  	var logPath string
  1605  	if opts != nil {
  1606  		logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
  1607  	} else {
  1608  		// For the VERSION RPC 'opts' is set to 'nil' and therefore
  1609  		// opts.WorkDirectory does not exist. Set logPath to "".
  1610  		logPath = ""
  1611  	}
  1612  	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
  1613  	criuClientFileCon, err := net.FileConn(criuClient)
  1614  	criuClient.Close()
  1615  	if err != nil {
  1616  		return err
  1617  	}
  1618  
  1619  	criuClientCon := criuClientFileCon.(*net.UnixConn)
  1620  	defer criuClientCon.Close()
  1621  
  1622  	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
  1623  	defer criuServer.Close()
  1624  
  1625  	args := []string{"swrk", "3"}
  1626  	if c.criuVersion != 0 {
  1627  		// If the CRIU Version is still '0' then this is probably
  1628  		// the initial CRIU run to detect the version. Skip it.
  1629  		logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
  1630  	}
  1631  	cmd := exec.Command(c.criuPath, args...)
  1632  	if process != nil {
  1633  		cmd.Stdin = process.Stdin
  1634  		cmd.Stdout = process.Stdout
  1635  		cmd.Stderr = process.Stderr
  1636  	}
  1637  	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
  1638  	if extraFiles != nil {
  1639  		cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
  1640  	}
  1641  
  1642  	if err := cmd.Start(); err != nil {
  1643  		return err
  1644  	}
  1645  	// we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
  1646  	criuServer.Close()
  1647  	// cmd.Process will be replaced by a restored init.
  1648  	criuProcess := cmd.Process
  1649  
  1650  	var criuProcessState *os.ProcessState
  1651  	defer func() {
  1652  		if criuProcessState == nil {
  1653  			criuClientCon.Close()
  1654  			_, err := criuProcess.Wait()
  1655  			if err != nil {
  1656  				logrus.Warnf("wait on criuProcess returned %v", err)
  1657  			}
  1658  		}
  1659  	}()
  1660  
  1661  	if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
  1662  		return err
  1663  	}
  1664  
  1665  	var extFds []string
  1666  	if process != nil {
  1667  		extFds, err = getPipeFds(criuProcess.Pid)
  1668  		if err != nil {
  1669  			return err
  1670  		}
  1671  	}
  1672  
  1673  	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
  1674  	// In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
  1675  	// should be empty. For older CRIU versions it still will be
  1676  	// available but empty. criurpc.CriuReqType_VERSION actually
  1677  	// has no req.GetOpts().
  1678  	if logrus.GetLevel() >= logrus.DebugLevel &&
  1679  		!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
  1680  			req.GetType() == criurpc.CriuReqType_VERSION) {
  1681  
  1682  		val := reflect.ValueOf(req.GetOpts())
  1683  		v := reflect.Indirect(val)
  1684  		for i := 0; i < v.NumField(); i++ {
  1685  			st := v.Type()
  1686  			name := st.Field(i).Name
  1687  			if 'A' <= name[0] && name[0] <= 'Z' {
  1688  				value := val.MethodByName("Get" + name).Call([]reflect.Value{})
  1689  				logrus.Debugf("CRIU option %s with value %v", name, value[0])
  1690  			}
  1691  		}
  1692  	}
  1693  	data, err := proto.Marshal(req)
  1694  	if err != nil {
  1695  		return err
  1696  	}
  1697  	_, err = criuClientCon.Write(data)
  1698  	if err != nil {
  1699  		return err
  1700  	}
  1701  
  1702  	buf := make([]byte, 10*4096)
  1703  	oob := make([]byte, 4096)
  1704  	for {
  1705  		n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
  1706  		if req.Opts != nil && req.Opts.StatusFd != nil {
  1707  			// Close status_fd as soon as we got something back from criu,
  1708  			// assuming it has consumed (reopened) it by this time.
  1709  			// Otherwise it will might be left open forever and whoever
  1710  			// is waiting on it will wait forever.
  1711  			fd := int(*req.Opts.StatusFd)
  1712  			_ = unix.Close(fd)
  1713  			req.Opts.StatusFd = nil
  1714  		}
  1715  		if err != nil {
  1716  			return err
  1717  		}
  1718  		if n == 0 {
  1719  			return errors.New("unexpected EOF")
  1720  		}
  1721  		if n == len(buf) {
  1722  			return errors.New("buffer is too small")
  1723  		}
  1724  
  1725  		resp := new(criurpc.CriuResp)
  1726  		err = proto.Unmarshal(buf[:n], resp)
  1727  		if err != nil {
  1728  			return err
  1729  		}
  1730  		if !resp.GetSuccess() {
  1731  			typeString := req.GetType().String()
  1732  			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
  1733  		}
  1734  
  1735  		t := resp.GetType()
  1736  		switch {
  1737  		case t == criurpc.CriuReqType_FEATURE_CHECK:
  1738  			logrus.Debugf("Feature check says: %s", resp)
  1739  			criuFeatures = resp.GetFeatures()
  1740  		case t == criurpc.CriuReqType_NOTIFY:
  1741  			if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
  1742  				return err
  1743  			}
  1744  			t = criurpc.CriuReqType_NOTIFY
  1745  			req = &criurpc.CriuReq{
  1746  				Type:          &t,
  1747  				NotifySuccess: proto.Bool(true),
  1748  			}
  1749  			data, err = proto.Marshal(req)
  1750  			if err != nil {
  1751  				return err
  1752  			}
  1753  			_, err = criuClientCon.Write(data)
  1754  			if err != nil {
  1755  				return err
  1756  			}
  1757  			continue
  1758  		case t == criurpc.CriuReqType_RESTORE:
  1759  		case t == criurpc.CriuReqType_DUMP:
  1760  		case t == criurpc.CriuReqType_PRE_DUMP:
  1761  		default:
  1762  			return fmt.Errorf("unable to parse the response %s", resp.String())
  1763  		}
  1764  
  1765  		break
  1766  	}
  1767  
  1768  	_ = criuClientCon.CloseWrite()
  1769  	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
  1770  	// Here we want to wait only the CRIU process.
  1771  	criuProcessState, err = criuProcess.Wait()
  1772  	if err != nil {
  1773  		return err
  1774  	}
  1775  
  1776  	// In pre-dump mode CRIU is in a loop and waits for
  1777  	// the final DUMP command.
  1778  	// The current runc pre-dump approach, however, is
  1779  	// start criu in PRE_DUMP once for a single pre-dump
  1780  	// and not the whole series of pre-dump, pre-dump, ...m, dump
  1781  	// If we got the message CriuReqType_PRE_DUMP it means
  1782  	// CRIU was successful and we need to forcefully stop CRIU
  1783  	if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
  1784  		return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
  1785  	}
  1786  	return nil
  1787  }
  1788  
  1789  // block any external network activity
  1790  func lockNetwork(config *configs.Config) error {
  1791  	for _, config := range config.Networks {
  1792  		strategy, err := getStrategy(config.Type)
  1793  		if err != nil {
  1794  			return err
  1795  		}
  1796  
  1797  		if err := strategy.detach(config); err != nil {
  1798  			return err
  1799  		}
  1800  	}
  1801  	return nil
  1802  }
  1803  
  1804  func unlockNetwork(config *configs.Config) error {
  1805  	for _, config := range config.Networks {
  1806  		strategy, err := getStrategy(config.Type)
  1807  		if err != nil {
  1808  			return err
  1809  		}
  1810  		if err = strategy.attach(config); err != nil {
  1811  			return err
  1812  		}
  1813  	}
  1814  	return nil
  1815  }
  1816  
  1817  func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
  1818  	notify := resp.GetNotify()
  1819  	if notify == nil {
  1820  		return fmt.Errorf("invalid response: %s", resp.String())
  1821  	}
  1822  	script := notify.GetScript()
  1823  	logrus.Debugf("notify: %s\n", script)
  1824  	switch script {
  1825  	case "post-dump":
  1826  		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
  1827  		if err != nil {
  1828  			return err
  1829  		}
  1830  		f.Close()
  1831  	case "network-unlock":
  1832  		if err := unlockNetwork(c.config); err != nil {
  1833  			return err
  1834  		}
  1835  	case "network-lock":
  1836  		if err := lockNetwork(c.config); err != nil {
  1837  			return err
  1838  		}
  1839  	case "setup-namespaces":
  1840  		if c.config.Hooks != nil {
  1841  			s, err := c.currentOCIState()
  1842  			if err != nil {
  1843  				return nil
  1844  			}
  1845  			s.Pid = int(notify.GetPid())
  1846  
  1847  			if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
  1848  				return err
  1849  			}
  1850  			if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
  1851  				return err
  1852  			}
  1853  		}
  1854  	case "post-restore":
  1855  		pid := notify.GetPid()
  1856  
  1857  		p, err := os.FindProcess(int(pid))
  1858  		if err != nil {
  1859  			return err
  1860  		}
  1861  		cmd.Process = p
  1862  
  1863  		r, err := newRestoredProcess(cmd, fds)
  1864  		if err != nil {
  1865  			return err
  1866  		}
  1867  		process.ops = r
  1868  		if err := c.state.transition(&restoredState{
  1869  			imageDir: opts.ImagesDirectory,
  1870  			c:        c,
  1871  		}); err != nil {
  1872  			return err
  1873  		}
  1874  		// create a timestamp indicating when the restored checkpoint was started
  1875  		c.created = time.Now().UTC()
  1876  		if _, err := c.updateState(r); err != nil {
  1877  			return err
  1878  		}
  1879  		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
  1880  			if !os.IsNotExist(err) {
  1881  				logrus.Error(err)
  1882  			}
  1883  		}
  1884  	case "orphan-pts-master":
  1885  		scm, err := unix.ParseSocketControlMessage(oob)
  1886  		if err != nil {
  1887  			return err
  1888  		}
  1889  		fds, err := unix.ParseUnixRights(&scm[0])
  1890  		if err != nil {
  1891  			return err
  1892  		}
  1893  
  1894  		master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
  1895  		defer master.Close()
  1896  
  1897  		// While we can access console.master, using the API is a good idea.
  1898  		if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
  1899  			return err
  1900  		}
  1901  	case "status-ready":
  1902  		if opts.StatusFd != -1 {
  1903  			// write \0 to status fd to notify that lazy page server is ready
  1904  			_, err := unix.Write(opts.StatusFd, []byte{0})
  1905  			if err != nil {
  1906  				logrus.Warnf("can't write \\0 to status fd: %v", err)
  1907  			}
  1908  			_ = unix.Close(opts.StatusFd)
  1909  			opts.StatusFd = -1
  1910  		}
  1911  	}
  1912  	return nil
  1913  }
  1914  
  1915  func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
  1916  	if process != nil {
  1917  		c.initProcess = process
  1918  	}
  1919  	state, err := c.currentState()
  1920  	if err != nil {
  1921  		return nil, err
  1922  	}
  1923  	err = c.saveState(state)
  1924  	if err != nil {
  1925  		return nil, err
  1926  	}
  1927  	return state, nil
  1928  }
  1929  
  1930  func (c *linuxContainer) saveState(s *State) (retErr error) {
  1931  	tmpFile, err := os.CreateTemp(c.root, "state-")
  1932  	if err != nil {
  1933  		return err
  1934  	}
  1935  
  1936  	defer func() {
  1937  		if retErr != nil {
  1938  			tmpFile.Close()
  1939  			os.Remove(tmpFile.Name())
  1940  		}
  1941  	}()
  1942  
  1943  	err = utils.WriteJSON(tmpFile, s)
  1944  	if err != nil {
  1945  		return err
  1946  	}
  1947  	err = tmpFile.Close()
  1948  	if err != nil {
  1949  		return err
  1950  	}
  1951  
  1952  	stateFilePath := filepath.Join(c.root, stateFilename)
  1953  	return os.Rename(tmpFile.Name(), stateFilePath)
  1954  }
  1955  
  1956  func (c *linuxContainer) currentStatus() (Status, error) {
  1957  	if err := c.refreshState(); err != nil {
  1958  		return -1, err
  1959  	}
  1960  	return c.state.status(), nil
  1961  }
  1962  
  1963  // refreshState needs to be called to verify that the current state on the
  1964  // container is what is true.  Because consumers of libcontainer can use it
  1965  // out of process we need to verify the container's status based on runtime
  1966  // information and not rely on our in process info.
  1967  func (c *linuxContainer) refreshState() error {
  1968  	paused, err := c.isPaused()
  1969  	if err != nil {
  1970  		return err
  1971  	}
  1972  	if paused {
  1973  		return c.state.transition(&pausedState{c: c})
  1974  	}
  1975  	t := c.runType()
  1976  	switch t {
  1977  	case Created:
  1978  		return c.state.transition(&createdState{c: c})
  1979  	case Running:
  1980  		return c.state.transition(&runningState{c: c})
  1981  	}
  1982  	return c.state.transition(&stoppedState{c: c})
  1983  }
  1984  
  1985  func (c *linuxContainer) runType() Status {
  1986  	if c.initProcess == nil {
  1987  		return Stopped
  1988  	}
  1989  	pid := c.initProcess.pid()
  1990  	stat, err := system.Stat(pid)
  1991  	if err != nil {
  1992  		return Stopped
  1993  	}
  1994  	if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
  1995  		return Stopped
  1996  	}
  1997  	// We'll create exec fifo and blocking on it after container is created,
  1998  	// and delete it after start container.
  1999  	if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
  2000  		return Created
  2001  	}
  2002  	return Running
  2003  }
  2004  
  2005  func (c *linuxContainer) isPaused() (bool, error) {
  2006  	state, err := c.cgroupManager.GetFreezerState()
  2007  	if err != nil {
  2008  		return false, err
  2009  	}
  2010  	return state == configs.Frozen, nil
  2011  }
  2012  
  2013  func (c *linuxContainer) currentState() (*State, error) {
  2014  	var (
  2015  		startTime           uint64
  2016  		externalDescriptors []string
  2017  		pid                 = -1
  2018  	)
  2019  	if c.initProcess != nil {
  2020  		pid = c.initProcess.pid()
  2021  		startTime, _ = c.initProcess.startTime()
  2022  		externalDescriptors = c.initProcess.externalDescriptors()
  2023  	}
  2024  
  2025  	intelRdtPath := ""
  2026  	if c.intelRdtManager != nil {
  2027  		intelRdtPath = c.intelRdtManager.GetPath()
  2028  	}
  2029  	state := &State{
  2030  		BaseState: BaseState{
  2031  			ID:                   c.ID(),
  2032  			Config:               *c.config,
  2033  			InitProcessPid:       pid,
  2034  			InitProcessStartTime: startTime,
  2035  			Created:              c.created,
  2036  		},
  2037  		Rootless:            c.config.RootlessEUID && c.config.RootlessCgroups,
  2038  		CgroupPaths:         c.cgroupManager.GetPaths(),
  2039  		IntelRdtPath:        intelRdtPath,
  2040  		NamespacePaths:      make(map[configs.NamespaceType]string),
  2041  		ExternalDescriptors: externalDescriptors,
  2042  	}
  2043  	if pid > 0 {
  2044  		for _, ns := range c.config.Namespaces {
  2045  			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  2046  		}
  2047  		for _, nsType := range configs.NamespaceTypes() {
  2048  			if !configs.IsNamespaceSupported(nsType) {
  2049  				continue
  2050  			}
  2051  			if _, ok := state.NamespacePaths[nsType]; !ok {
  2052  				ns := configs.Namespace{Type: nsType}
  2053  				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  2054  			}
  2055  		}
  2056  	}
  2057  	return state, nil
  2058  }
  2059  
  2060  func (c *linuxContainer) currentOCIState() (*specs.State, error) {
  2061  	bundle, annotations := utils.Annotations(c.config.Labels)
  2062  	state := &specs.State{
  2063  		Version:     specs.Version,
  2064  		ID:          c.ID(),
  2065  		Bundle:      bundle,
  2066  		Annotations: annotations,
  2067  	}
  2068  	status, err := c.currentStatus()
  2069  	if err != nil {
  2070  		return nil, err
  2071  	}
  2072  	state.Status = specs.ContainerState(status.String())
  2073  	if status != Stopped {
  2074  		if c.initProcess != nil {
  2075  			state.Pid = c.initProcess.pid()
  2076  		}
  2077  	}
  2078  	return state, nil
  2079  }
  2080  
  2081  // orderNamespacePaths sorts namespace paths into a list of paths that we
  2082  // can setns in order.
  2083  func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
  2084  	paths := []string{}
  2085  	for _, ns := range configs.NamespaceTypes() {
  2086  
  2087  		// Remove namespaces that we don't need to join.
  2088  		if !c.config.Namespaces.Contains(ns) {
  2089  			continue
  2090  		}
  2091  
  2092  		if p, ok := namespaces[ns]; ok && p != "" {
  2093  			// check if the requested namespace is supported
  2094  			if !configs.IsNamespaceSupported(ns) {
  2095  				return nil, fmt.Errorf("namespace %s is not supported", ns)
  2096  			}
  2097  			// only set to join this namespace if it exists
  2098  			if _, err := os.Lstat(p); err != nil {
  2099  				return nil, fmt.Errorf("namespace path: %w", err)
  2100  			}
  2101  			// do not allow namespace path with comma as we use it to separate
  2102  			// the namespace paths
  2103  			if strings.ContainsRune(p, ',') {
  2104  				return nil, fmt.Errorf("invalid namespace path %s", p)
  2105  			}
  2106  			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
  2107  		}
  2108  
  2109  	}
  2110  
  2111  	return paths, nil
  2112  }
  2113  
  2114  func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
  2115  	data := bytes.NewBuffer(nil)
  2116  	for _, im := range idMap {
  2117  		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
  2118  		if _, err := data.WriteString(line); err != nil {
  2119  			return nil, err
  2120  		}
  2121  	}
  2122  	return data.Bytes(), nil
  2123  }
  2124  
  2125  // netlinkError is an error wrapper type for use by custom netlink message
  2126  // types. Panics with errors are wrapped in netlinkError so that the recover
  2127  // in bootstrapData can distinguish intentional panics.
  2128  type netlinkError struct{ error }
  2129  
  2130  // bootstrapData encodes the necessary data in netlink binary format
  2131  // as a io.Reader.
  2132  // Consumer can write the data to a bootstrap program
  2133  // such as one that uses nsenter package to bootstrap the container's
  2134  // init process correctly, i.e. with correct namespaces, uid/gid
  2135  // mapping etc.
  2136  func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
  2137  	// create the netlink message
  2138  	r := nl.NewNetlinkRequest(int(InitMsg), 0)
  2139  
  2140  	// Our custom messages cannot bubble up an error using returns, instead
  2141  	// they will panic with the specific error type, netlinkError. In that
  2142  	// case, recover from the panic and return that as an error.
  2143  	defer func() {
  2144  		if r := recover(); r != nil {
  2145  			if e, ok := r.(netlinkError); ok {
  2146  				Err = e.error
  2147  			} else {
  2148  				panic(r)
  2149  			}
  2150  		}
  2151  	}()
  2152  
  2153  	// write cloneFlags
  2154  	r.AddData(&Int32msg{
  2155  		Type:  CloneFlagsAttr,
  2156  		Value: uint32(cloneFlags),
  2157  	})
  2158  
  2159  	// write custom namespace paths
  2160  	if len(nsMaps) > 0 {
  2161  		nsPaths, err := c.orderNamespacePaths(nsMaps)
  2162  		if err != nil {
  2163  			return nil, err
  2164  		}
  2165  		r.AddData(&Bytemsg{
  2166  			Type:  NsPathsAttr,
  2167  			Value: []byte(strings.Join(nsPaths, ",")),
  2168  		})
  2169  	}
  2170  
  2171  	// write namespace paths only when we are not joining an existing user ns
  2172  	_, joinExistingUser := nsMaps[configs.NEWUSER]
  2173  	if !joinExistingUser {
  2174  		// write uid mappings
  2175  		if len(c.config.UidMappings) > 0 {
  2176  			if c.config.RootlessEUID && c.newuidmapPath != "" {
  2177  				r.AddData(&Bytemsg{
  2178  					Type:  UidmapPathAttr,
  2179  					Value: []byte(c.newuidmapPath),
  2180  				})
  2181  			}
  2182  			b, err := encodeIDMapping(c.config.UidMappings)
  2183  			if err != nil {
  2184  				return nil, err
  2185  			}
  2186  			r.AddData(&Bytemsg{
  2187  				Type:  UidmapAttr,
  2188  				Value: b,
  2189  			})
  2190  		}
  2191  
  2192  		// write gid mappings
  2193  		if len(c.config.GidMappings) > 0 {
  2194  			b, err := encodeIDMapping(c.config.GidMappings)
  2195  			if err != nil {
  2196  				return nil, err
  2197  			}
  2198  			r.AddData(&Bytemsg{
  2199  				Type:  GidmapAttr,
  2200  				Value: b,
  2201  			})
  2202  			if c.config.RootlessEUID && c.newgidmapPath != "" {
  2203  				r.AddData(&Bytemsg{
  2204  					Type:  GidmapPathAttr,
  2205  					Value: []byte(c.newgidmapPath),
  2206  				})
  2207  			}
  2208  			if requiresRootOrMappingTool(c.config) {
  2209  				r.AddData(&Boolmsg{
  2210  					Type:  SetgroupAttr,
  2211  					Value: true,
  2212  				})
  2213  			}
  2214  		}
  2215  	}
  2216  
  2217  	if c.config.OomScoreAdj != nil {
  2218  		// write oom_score_adj
  2219  		r.AddData(&Bytemsg{
  2220  			Type:  OomScoreAdjAttr,
  2221  			Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
  2222  		})
  2223  	}
  2224  
  2225  	// write rootless
  2226  	r.AddData(&Boolmsg{
  2227  		Type:  RootlessEUIDAttr,
  2228  		Value: c.config.RootlessEUID,
  2229  	})
  2230  
  2231  	// Bind mount source to open.
  2232  	if it == initStandard && c.shouldSendMountSources() {
  2233  		var mounts []byte
  2234  		for _, m := range c.config.Mounts {
  2235  			if m.IsBind() {
  2236  				if strings.IndexByte(m.Source, 0) >= 0 {
  2237  					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
  2238  				}
  2239  				mounts = append(mounts, []byte(m.Source)...)
  2240  			}
  2241  			mounts = append(mounts, byte(0))
  2242  		}
  2243  
  2244  		r.AddData(&Bytemsg{
  2245  			Type:  MountSourcesAttr,
  2246  			Value: mounts,
  2247  		})
  2248  	}
  2249  
  2250  	return bytes.NewReader(r.Serialize()), nil
  2251  }
  2252  
  2253  // ignoreTerminateErrors returns nil if the given err matches an error known
  2254  // to indicate that the terminate occurred successfully or err was nil, otherwise
  2255  // err is returned unaltered.
  2256  func ignoreTerminateErrors(err error) error {
  2257  	if err == nil {
  2258  		return nil
  2259  	}
  2260  	// terminate() might return an error from either Kill or Wait.
  2261  	// The (*Cmd).Wait documentation says: "If the command fails to run
  2262  	// or doesn't complete successfully, the error is of type *ExitError".
  2263  	// Filter out such errors (like "exit status 1" or "signal: killed").
  2264  	var exitErr *exec.ExitError
  2265  	if errors.As(err, &exitErr) {
  2266  		return nil
  2267  	}
  2268  	if errors.Is(err, os.ErrProcessDone) {
  2269  		return nil
  2270  	}
  2271  	s := err.Error()
  2272  	if strings.Contains(s, "Wait was already called") {
  2273  		return nil
  2274  	}
  2275  	return err
  2276  }
  2277  
  2278  func requiresRootOrMappingTool(c *configs.Config) bool {
  2279  	gidMap := []configs.IDMap{
  2280  		{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
  2281  	}
  2282  	return !reflect.DeepEqual(c.GidMappings, gidMap)
  2283  }
  2284  

View as plain text