rootfs_linux.go

Documentation: github.com/opencontainers/runc/libcontainer

     1  package libcontainer
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"path"
    10  	"path/filepath"
    11  	"strconv"
    12  	"strings"
    13  	"time"
    14  
    15  	securejoin "github.com/cyphar/filepath-securejoin"
    16  	"github.com/moby/sys/mountinfo"
    17  	"github.com/mrunalp/fileutils"
    18  	"github.com/opencontainers/runc/libcontainer/cgroups"
    19  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    20  	"github.com/opencontainers/runc/libcontainer/configs"
    21  	"github.com/opencontainers/runc/libcontainer/devices"
    22  	"github.com/opencontainers/runc/libcontainer/userns"
    23  	"github.com/opencontainers/runc/libcontainer/utils"
    24  	"github.com/opencontainers/runtime-spec/specs-go"
    25  	"github.com/opencontainers/selinux/go-selinux/label"
    26  	"github.com/sirupsen/logrus"
    27  	"golang.org/x/sys/unix"
    28  )
    29  
// defaultMountFlags is the baseline flag set (noexec, nosuid, nodev) that this
// file applies to the cgroup v1 tmpfs and to the per-subsystem cgroup mounts.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
    31  
// mountConfig carries the per-container settings that mountToRootfs and its
// helpers need while setting up a single mount.
type mountConfig struct {
	// root is the container rootfs path (taken from config.Rootfs).
	root string
	// label is the mount label (taken from config.MountLabel).
	label string
	// cgroup2Path is used as the bind source for the cgroup2 fallback mount
	// when cgroupns is set and the path is non-empty.
	cgroup2Path string
	// rootlessCgroups enables masking of the cgroup mountpoint when the
	// cgroup2 fallback bind mount fails with ENOENT.
	rootlessCgroups bool
	// cgroupns is true when the container config includes a cgroup namespace.
	cgroupns bool
	// fd, when non-nil, points at a file descriptor for the current mount; it
	// is reset for every mount and is nil when no descriptor was supplied.
	fd *int
}
    40  
    41  // needsSetupDev returns true if /dev needs to be set up.
    42  func needsSetupDev(config *configs.Config) bool {
    43  	for _, m := range config.Mounts {
    44  		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
    45  			return false
    46  		}
    47  	}
    48  	return true
    49  }
    50  
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
//
// pipe is handed to syncParentHooks once the mounts are in place. mountFds,
// when non-nil, must contain exactly one entry per config.Mounts element; an
// entry of -1 means that no file descriptor is available for that mount.
//
// The steps run in this order: prepare the root mount, perform every
// configured mount (with its pre/post commands), populate /dev if required,
// sync with the parent for hooks, chdir into the rootfs, run the
// CreateContainer hooks, jail the process (ms-move, pivot_root or chroot) and
// finally create the working directory inside the new root.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) {
	config := iConfig.Config
	if err := prepareRoot(config); err != nil {
		return fmt.Errorf("error preparing rootfs: %w", err)
	}

	// A non-nil mountFds must line up one-to-one with config.Mounts.
	if mountFds != nil && len(mountFds) != len(config.Mounts) {
		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds)
	}

	mountConfig := &mountConfig{
		root:            config.Rootfs,
		label:           config.MountLabel,
		cgroup2Path:     iConfig.Cgroup2Path,
		rootlessCgroups: iConfig.RootlessCgroups,
		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
	}
	// Decide up front whether /dev needs populating; the result is reused
	// after the jail step to decide whether /dev/null must be re-opened.
	setupDev := needsSetupDev(config)
	for i, m := range config.Mounts {
		for _, precmd := range m.PremountCmds {
			if err := mountCmd(precmd); err != nil {
				return fmt.Errorf("error running premount command: %w", err)
			}
		}

		// The length check above guarantees that, when mountFds is non-nil,
		// len(mountFds) == len(config.Mounts), so mountFds[i] is in range.
		// The fd is reset on every iteration so that a descriptor from a
		// previous mount is never reused.
		if mountFds != nil && mountFds[i] != -1 {
			mountConfig.fd = &mountFds[i]
		} else {
			mountConfig.fd = nil
		}

		if err := mountToRootfs(m, mountConfig); err != nil {
			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
		}

		for _, postcmd := range m.PostmountCmds {
			if err := mountCmd(postcmd); err != nil {
				return fmt.Errorf("error running postmount command: %w", err)
			}
		}
	}

	if setupDev {
		if err := createDevices(config); err != nil {
			return fmt.Errorf("error creating device nodes: %w", err)
		}
		if err := setupPtmx(config); err != nil {
			return fmt.Errorf("error setting up ptmx: %w", err)
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return fmt.Errorf("error setting up /dev symlinks: %w", err)
		}
	}

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	if err := unix.Chdir(config.Rootfs); err != nil {
		return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
	}

	// Run the CreateContainer hooks with the state marked as "creating" and
	// the pid set to this process.
	s := iConfig.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreating
	if err := iConfig.Config.Hooks[configs.CreateContainer].RunHooks(s); err != nil {
		return err
	}

	// Jail the process in the rootfs: ms-move when pivot_root is disabled,
	// pivot_root when a mount namespace is configured, chroot otherwise.
	if config.NoPivotRoot {
		err = msMoveRoot(config.Rootfs)
	} else if config.Namespaces.Contains(configs.NEWNS) {
		err = pivotRoot(config.Rootfs)
	} else {
		err = chroot()
	}
	if err != nil {
		return fmt.Errorf("error jailing process inside rootfs: %w", err)
	}

	if setupDev {
		if err := reOpenDevNull(); err != nil {
			return fmt.Errorf("error reopening /dev/null inside container: %w", err)
		}
	}

	if cwd := iConfig.Cwd; cwd != "" {
		// Note that spec.Process.Cwd can contain unclean value like  "../../../../foo/bar...".
		// However, we are safe to call MkdirAll directly because we are in the jail here.
		if err := os.MkdirAll(cwd, 0o755); err != nil {
			return err
		}
	}

	return nil
}
   165  
   166  // finalizeRootfs sets anything to ro if necessary. You must call
   167  // prepareRootfs first.
   168  func finalizeRootfs(config *configs.Config) (err error) {
   169  	// All tmpfs mounts and /dev were previously mounted as rw
   170  	// by mountPropagate. Remount them read-only as requested.
   171  	for _, m := range config.Mounts {
   172  		if m.Flags&unix.MS_RDONLY != unix.MS_RDONLY {
   173  			continue
   174  		}
   175  		if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
   176  			if err := remountReadonly(m); err != nil {
   177  				return err
   178  			}
   179  		}
   180  	}
   181  
   182  	// set rootfs ( / ) as readonly
   183  	if config.Readonlyfs {
   184  		if err := setReadonly(); err != nil {
   185  			return fmt.Errorf("error setting rootfs as readonly: %w", err)
   186  		}
   187  	}
   188  
   189  	if config.Umask != nil {
   190  		unix.Umask(int(*config.Umask))
   191  	} else {
   192  		unix.Umask(0o022)
   193  	}
   194  	return nil
   195  }
   196  
   197  // /tmp has to be mounted as private to allow MS_MOVE to work in all situations
   198  func prepareTmp(topTmpDir string) (string, error) {
   199  	tmpdir, err := os.MkdirTemp(topTmpDir, "runctop")
   200  	if err != nil {
   201  		return "", err
   202  	}
   203  	if err := mount(tmpdir, tmpdir, "", "bind", unix.MS_BIND, ""); err != nil {
   204  		return "", err
   205  	}
   206  	if err := mount("", tmpdir, "", "", uintptr(unix.MS_PRIVATE), ""); err != nil {
   207  		return "", err
   208  	}
   209  	return tmpdir, nil
   210  }
   211  
// cleanupTmp unmounts tmpdir and then removes it together with its contents.
// Both steps are best effort: their errors are deliberately discarded.
func cleanupTmp(tmpdir string) {
	// Unmount first so that the removal below acts on the underlying directory.
	_ = unix.Unmount(tmpdir, 0)
	_ = os.RemoveAll(tmpdir)
}
   216  
   217  func mountCmd(cmd configs.Command) error {
   218  	command := exec.Command(cmd.Path, cmd.Args[:]...)
   219  	command.Env = cmd.Env
   220  	command.Dir = cmd.Dir
   221  	if out, err := command.CombinedOutput(); err != nil {
   222  		return fmt.Errorf("%#v failed: %s: %w", cmd, string(out), err)
   223  	}
   224  	return nil
   225  }
   226  
   227  func prepareBindMount(m *configs.Mount, rootfs string, mountFd *int) error {
   228  	source := m.Source
   229  	if mountFd != nil {
   230  		source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
   231  	}
   232  
   233  	stat, err := os.Stat(source)
   234  	if err != nil {
   235  		// error out if the source of a bind mount does not exist as we will be
   236  		// unable to bind anything to it.
   237  		return err
   238  	}
   239  	// ensure that the destination of the bind mount is resolved of symlinks at mount time because
   240  	// any previous mounts can invalidate the next mount's destination.
   241  	// this can happen when a user specifies mounts within other mounts to cause breakouts or other
   242  	// evil stuff to try to escape the container's rootfs.
   243  	var dest string
   244  	if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
   245  		return err
   246  	}
   247  	if err := checkProcMount(rootfs, dest, source); err != nil {
   248  		return err
   249  	}
   250  	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
   251  		return err
   252  	}
   253  
   254  	return nil
   255  }
   256  
// mountCgroupV1 sets up a cgroup v1 hierarchy at m.Destination inside the
// container: a tmpfs at the destination, one mount per entry returned by
// getCgroupMounts, and symlinks for the individual names of comma-joined
// (merged) controller directories.
func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
	binds, err := getCgroupMounts(m)
	if err != nil {
		return err
	}
	// Remember every destination whose last path element contains a comma
	// (e.g. "a,b"); each of its parts gets a symlink at the end.
	var merged []string
	for _, b := range binds {
		ss := filepath.Base(b.Destination)
		if strings.Contains(ss, ",") {
			merged = append(merged, ss)
		}
	}
	// The tmpfs holds the per-controller mountpoints and the symlinks.
	tmpfs := &configs.Mount{
		Source:           "tmpfs",
		Device:           "tmpfs",
		Destination:      m.Destination,
		Flags:            defaultMountFlags,
		Data:             "mode=755",
		PropagationFlags: m.PropagationFlags,
	}

	if err := mountToRootfs(tmpfs, c); err != nil {
		return err
	}

	for _, b := range binds {
		if c.cgroupns {
			// With a cgroup namespace, mount a cgroup filesystem directly
			// instead of bind-mounting the existing one.
			subsystemPath := filepath.Join(c.root, b.Destination)
			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
				return err
			}
			if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error {
				flags := defaultMountFlags
				// Carry over a read-only request from the original mount.
				if m.Flags&unix.MS_RDONLY != 0 {
					flags = flags | unix.MS_RDONLY
				}
				var (
					source = "cgroup"
					data   = filepath.Base(subsystemPath)
				)
				// The "systemd" directory is special-cased: the mount data
				// gets cgroups.CgroupNamePrefix prepended and the source is
				// "systemd" rather than "cgroup".
				if data == "systemd" {
					data = cgroups.CgroupNamePrefix + data
					source = "systemd"
				}
				return mount(source, b.Destination, procfd, "cgroup", uintptr(flags), data)
			}); err != nil {
				return err
			}
		} else {
			if err := mountToRootfs(b, c); err != nil {
				return err
			}
		}
	}
	for _, mc := range merged {
		for _, ss := range strings.Split(mc, ",") {
			// symlink(2) is very dumb, it will just shove the path into
			// the link and doesn't do any checks or relative path
			// conversion. Also, don't error out if the cgroup already exists.
			if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) {
				return err
			}
		}
	}
	return nil
}
   323  
   324  func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
   325  	dest, err := securejoin.SecureJoin(c.root, m.Destination)
   326  	if err != nil {
   327  		return err
   328  	}
   329  	if err := os.MkdirAll(dest, 0o755); err != nil {
   330  		return err
   331  	}
   332  	err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
   333  		return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data)
   334  	})
   335  	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
   336  		return err
   337  	}
   338  
   339  	// When we are in UserNS but CgroupNS is not unshared, we cannot mount
   340  	// cgroup2 (#2158), so fall back to bind mount.
   341  	bindM := &configs.Mount{
   342  		Device:           "bind",
   343  		Source:           fs2.UnifiedMountpoint,
   344  		Destination:      m.Destination,
   345  		Flags:            unix.MS_BIND | m.Flags,
   346  		PropagationFlags: m.PropagationFlags,
   347  	}
   348  	if c.cgroupns && c.cgroup2Path != "" {
   349  		// Emulate cgroupns by bind-mounting the container cgroup path
   350  		// rather than the whole /sys/fs/cgroup.
   351  		bindM.Source = c.cgroup2Path
   352  	}
   353  	// mountToRootfs() handles remounting for MS_RDONLY.
   354  	// No need to set c.fd here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
   355  	err = mountToRootfs(bindM, c)
   356  	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
   357  		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
   358  		// outside the userns+mountns.
   359  		//
   360  		// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
   361  		// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
   362  		err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
   363  			return maskPath(procfd, c.label)
   364  		})
   365  	}
   366  	return err
   367  }
   368  
// doTmpfsCopyUp mounts the tmpfs described by m on a scratch directory on the
// host, copies the current contents of the container destination into it and
// then moves the mount onto the destination inside rootfs.
//
// The named result Err is inspected by a deferred function: when the function
// fails after the scratch mount was created, that mount is detached again.
func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
	// Set up a scratch dir for the tmpfs on the host.
	tmpdir, err := prepareTmp("/tmp")
	if err != nil {
		return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err)
	}
	defer cleanupTmp(tmpdir)
	tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir")
	if err != nil {
		return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	// Configure the *host* tmpdir as if it's the container mount. We change
	// m.Destination since we are going to mount *on the host*.
	// The original destination is restored right after the call, before the
	// error is examined, so m is left unchanged either way.
	oldDest := m.Destination
	m.Destination = tmpDir
	err = mountPropagate(m, "/", mountLabel, nil)
	m.Destination = oldDest
	if err != nil {
		return err
	}
	// Deferred functions run last-in-first-out, so on failure this unmount
	// happens before the directory removals registered above.
	defer func() {
		if Err != nil {
			if err := unmount(tmpDir, unix.MNT_DETACH); err != nil {
				logrus.Warnf("tmpcopyup: %v", err)
			}
		}
	}()

	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) {
		// Copy the container data to the host tmpdir. We append "/" to force
		// CopyDirectory to resolve the symlink rather than trying to copy the
		// symlink itself.
		if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil {
			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
		}
		// Now move the mount into the container.
		if err := mount(tmpDir, m.Destination, procfd, "", unix.MS_MOVE, ""); err != nil {
			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
		}
		return nil
	})
}
   413  
// mountToRootfs performs the mount described by m inside the rootfs given by
// c.root, dispatching on m.Device:
//
//   - "proc", "sysfs": the destination must be an ordinary directory (or not
//     exist yet); mounted without a label.
//   - "mqueue": mounted without a label, then the destination is labelled.
//   - "tmpfs": mounted directly or via doTmpfsCopyUp when EXT_COPYUP is set.
//   - "bind": validated, mounted, optionally remounted to apply flags and
//     relabelled, then passed to setRecAttr.
//   - "cgroup": delegated to mountCgroupV2 or mountCgroupV1.
//   - anything else: checked against /proc, then mounted as-is.
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
	rootfs := c.root

	// procfs and sysfs are special because we need to ensure they are actually
	// mounted on a specific path in a container without any funny business.
	switch m.Device {
	case "proc", "sysfs":
		// If the destination already exists and is not a directory, we bail
		// out. This is to avoid mounting through a symlink or similar -- which
		// has been a "fun" attack scenario in the past.
		// TODO: This won't be necessary once we switch to libpathrs and we can
		//       stop all of these symlink-exchange attacks.
		dest := filepath.Clean(m.Destination)
		if !strings.HasPrefix(dest, rootfs) {
			// Do not use securejoin as it resolves symlinks.
			dest = filepath.Join(rootfs, dest)
		}
		// Lstat (not Stat) so that a symlink at dest is seen as a
		// non-directory and rejected. A missing dest is fine: it is created
		// below.
		if fi, err := os.Lstat(dest); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
		} else if !fi.IsDir() {
			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
		}
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		// Selinux kernels do not support labeling of /proc or /sys.
		return mountPropagate(m, rootfs, "", nil)
	}

	mountLabel := c.label
	mountFd := c.fd
	// For every other device the destination is resolved within rootfs.
	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
	if err != nil {
		return err
	}

	switch m.Device {
	case "mqueue":
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		// Mount without a label, then label the mountpoint itself.
		if err := mountPropagate(m, rootfs, "", nil); err != nil {
			return err
		}
		return label.SetFileLabel(dest, mountLabel)
	case "tmpfs":
		if stat, err := os.Stat(dest); err != nil {
			// The destination cannot be stat'ed; try to create it.
			if err := os.MkdirAll(dest, 0o755); err != nil {
				return err
			}
		} else {
			// The destination exists: prepend its current mode to the mount
			// data. Note that this modifies m.Data in place.
			dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
			if m.Data != "" {
				dt = dt + "," + m.Data
			}
			m.Data = dt
		}

		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
			err = doTmpfsCopyUp(m, rootfs, mountLabel)
		} else {
			err = mountPropagate(m, rootfs, mountLabel, nil)
		}

		return err
	case "bind":
		if err := prepareBindMount(m, rootfs, mountFd); err != nil {
			return err
		}
		if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil {
			return err
		}
		// bind mount won't change mount options, we need remount to make mount options effective.
		// first check that we have non-default options required before attempting a remount
		if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
			// only remount if unique mount options are set
			if err := remount(m, rootfs, mountFd); err != nil {
				return err
			}
		}

		if m.Relabel != "" {
			if err := label.Validate(m.Relabel); err != nil {
				return err
			}
			shared := label.IsShared(m.Relabel)
			if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
				return err
			}
		}
		// "bind" is the only case that does not return here; it falls
		// through to setRecAttr at the end of the function.
	case "cgroup":
		if cgroups.IsCgroup2UnifiedMode() {
			return mountCgroupV2(m, c)
		}
		return mountCgroupV1(m, c)
	default:
		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
			return err
		}
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		return mountPropagate(m, rootfs, mountLabel, mountFd)
	}
	if err := setRecAttr(m, rootfs); err != nil {
		return err
	}
	return nil
}
   525  
   526  func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
   527  	mounts, err := cgroups.GetCgroupMounts(false)
   528  	if err != nil {
   529  		return nil, err
   530  	}
   531  
   532  	cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
   533  	if err != nil {
   534  		return nil, err
   535  	}
   536  
   537  	var binds []*configs.Mount
   538  
   539  	for _, mm := range mounts {
   540  		dir, err := mm.GetOwnCgroup(cgroupPaths)
   541  		if err != nil {
   542  			return nil, err
   543  		}
   544  		relDir, err := filepath.Rel(mm.Root, dir)
   545  		if err != nil {
   546  			return nil, err
   547  		}
   548  		binds = append(binds, &configs.Mount{
   549  			Device:           "bind",
   550  			Source:           filepath.Join(mm.Mountpoint, relDir),
   551  			Destination:      filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
   552  			Flags:            unix.MS_BIND | unix.MS_REC | m.Flags,
   553  			PropagationFlags: m.PropagationFlags,
   554  		})
   555  	}
   556  
   557  	return binds, nil
   558  }
   559  
   560  // checkProcMount checks to ensure that the mount destination is not over the top of /proc.
   561  // dest is required to be an abs path and have any symlinks resolved before calling this function.
   562  //
   563  // if source is nil, don't stat the filesystem.  This is used for restore of a checkpoint.
   564  func checkProcMount(rootfs, dest, source string) error {
   565  	const procPath = "/proc"
   566  	path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
   567  	if err != nil {
   568  		return err
   569  	}
   570  	// pass if the mount path is located outside of /proc
   571  	if strings.HasPrefix(path, "..") {
   572  		return nil
   573  	}
   574  	if path == "." {
   575  		// an empty source is pasted on restore
   576  		if source == "" {
   577  			return nil
   578  		}
   579  		// only allow a mount on-top of proc if it's source is "proc"
   580  		isproc, err := isProc(source)
   581  		if err != nil {
   582  			return err
   583  		}
   584  		// pass if the mount is happening on top of /proc and the source of
   585  		// the mount is a proc filesystem
   586  		if isproc {
   587  			return nil
   588  		}
   589  		return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
   590  	}
   591  
   592  	// Here dest is definitely under /proc. Do not allow those,
   593  	// except for a few specific entries emulated by lxcfs.
   594  	validProcMounts := []string{
   595  		"/proc/cpuinfo",
   596  		"/proc/diskstats",
   597  		"/proc/meminfo",
   598  		"/proc/stat",
   599  		"/proc/swaps",
   600  		"/proc/uptime",
   601  		"/proc/loadavg",
   602  		"/proc/slabinfo",
   603  		"/proc/net/dev",
   604  		"/proc/sys/kernel/ns_last_pid",
   605  	}
   606  	for _, valid := range validProcMounts {
   607  		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
   608  		if err != nil {
   609  			return err
   610  		}
   611  		if path == "." {
   612  			return nil
   613  		}
   614  	}
   615  
   616  	return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
   617  }
   618  
   619  func isProc(path string) (bool, error) {
   620  	var s unix.Statfs_t
   621  	if err := unix.Statfs(path, &s); err != nil {
   622  		return false, &os.PathError{Op: "statfs", Path: path, Err: err}
   623  	}
   624  	return s.Type == unix.PROC_SUPER_MAGIC, nil
   625  }
   626  
   627  func setupDevSymlinks(rootfs string) error {
   628  	links := [][2]string{
   629  		{"/proc/self/fd", "/dev/fd"},
   630  		{"/proc/self/fd/0", "/dev/stdin"},
   631  		{"/proc/self/fd/1", "/dev/stdout"},
   632  		{"/proc/self/fd/2", "/dev/stderr"},
   633  	}
   634  	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
   635  	// in /dev if it exists in /proc.
   636  	if _, err := os.Stat("/proc/kcore"); err == nil {
   637  		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
   638  	}
   639  	for _, link := range links {
   640  		var (
   641  			src = link[0]
   642  			dst = filepath.Join(rootfs, link[1])
   643  		)
   644  		if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
   645  			return err
   646  		}
   647  	}
   648  	return nil
   649  }
   650  
   651  // If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
   652  // this method will make them point to `/dev/null` in this container's rootfs.  This
   653  // needs to be called after we chroot/pivot into the container's rootfs so that any
   654  // symlinks are resolved locally.
   655  func reOpenDevNull() error {
   656  	var stat, devNullStat unix.Stat_t
   657  	file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
   658  	if err != nil {
   659  		return err
   660  	}
   661  	defer file.Close() //nolint: errcheck
   662  	if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
   663  		return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
   664  	}
   665  	for fd := 0; fd < 3; fd++ {
   666  		if err := unix.Fstat(fd, &stat); err != nil {
   667  			return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err}
   668  		}
   669  		if stat.Rdev == devNullStat.Rdev {
   670  			// Close and re-open the fd.
   671  			if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
   672  				return &os.PathError{
   673  					Op:   "dup3",
   674  					Path: "fd " + strconv.Itoa(int(file.Fd())),
   675  					Err:  err,
   676  				}
   677  			}
   678  		}
   679  	}
   680  	return nil
   681  }
   682  
   683  // Create the device nodes in the container.
   684  func createDevices(config *configs.Config) error {
   685  	useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
   686  	oldMask := unix.Umask(0o000)
   687  	for _, node := range config.Devices {
   688  
   689  		// The /dev/ptmx device is setup by setupPtmx()
   690  		if utils.CleanPath(node.Path) == "/dev/ptmx" {
   691  			continue
   692  		}
   693  
   694  		// containers running in a user namespace are not allowed to mknod
   695  		// devices so we can just bind mount it from the host.
   696  		if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
   697  			unix.Umask(oldMask)
   698  			return err
   699  		}
   700  	}
   701  	unix.Umask(oldMask)
   702  	return nil
   703  }
   704  
   705  func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
   706  	f, err := os.Create(dest)
   707  	if err != nil && !os.IsExist(err) {
   708  		return err
   709  	}
   710  	if f != nil {
   711  		_ = f.Close()
   712  	}
   713  	return utils.WithProcfd(rootfs, dest, func(procfd string) error {
   714  		return mount(node.Path, dest, procfd, "bind", unix.MS_BIND, "")
   715  	})
   716  }
   717  
   718  // Creates the device node in the rootfs of the container.
   719  func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
   720  	if node.Path == "" {
   721  		// The node only exists for cgroup reasons, ignore it here.
   722  		return nil
   723  	}
   724  	dest, err := securejoin.SecureJoin(rootfs, node.Path)
   725  	if err != nil {
   726  		return err
   727  	}
   728  	if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
   729  		return err
   730  	}
   731  	if bind {
   732  		return bindMountDeviceNode(rootfs, dest, node)
   733  	}
   734  	if err := mknodDevice(dest, node); err != nil {
   735  		if errors.Is(err, os.ErrExist) {
   736  			return nil
   737  		} else if errors.Is(err, os.ErrPermission) {
   738  			return bindMountDeviceNode(rootfs, dest, node)
   739  		}
   740  		return err
   741  	}
   742  	return nil
   743  }
   744  
   745  func mknodDevice(dest string, node *devices.Device) error {
   746  	fileMode := node.FileMode
   747  	switch node.Type {
   748  	case devices.BlockDevice:
   749  		fileMode |= unix.S_IFBLK
   750  	case devices.CharDevice:
   751  		fileMode |= unix.S_IFCHR
   752  	case devices.FifoDevice:
   753  		fileMode |= unix.S_IFIFO
   754  	default:
   755  		return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
   756  	}
   757  	dev, err := node.Mkdev()
   758  	if err != nil {
   759  		return err
   760  	}
   761  	if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil {
   762  		return &os.PathError{Op: "mknod", Path: dest, Err: err}
   763  	}
   764  	return os.Chown(dest, int(node.Uid), int(node.Gid))
   765  }
   766  
   767  // Get the parent mount point of directory passed in as argument. Also return
   768  // optional fields.
   769  func getParentMount(rootfs string) (string, string, error) {
   770  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs))
   771  	if err != nil {
   772  		return "", "", err
   773  	}
   774  	if len(mi) < 1 {
   775  		return "", "", fmt.Errorf("could not find parent mount of %s", rootfs)
   776  	}
   777  
   778  	// find the longest mount point
   779  	var idx, maxlen int
   780  	for i := range mi {
   781  		if len(mi[i].Mountpoint) > maxlen {
   782  			maxlen = len(mi[i].Mountpoint)
   783  			idx = i
   784  		}
   785  	}
   786  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   787  }
   788  
   789  // Make parent mount private if it was shared
   790  func rootfsParentMountPrivate(rootfs string) error {
   791  	sharedMount := false
   792  
   793  	parentMount, optionalOpts, err := getParentMount(rootfs)
   794  	if err != nil {
   795  		return err
   796  	}
   797  
   798  	optsSplit := strings.Split(optionalOpts, " ")
   799  	for _, opt := range optsSplit {
   800  		if strings.HasPrefix(opt, "shared:") {
   801  			sharedMount = true
   802  			break
   803  		}
   804  	}
   805  
   806  	// Make parent mount PRIVATE if it was shared. It is needed for two
   807  	// reasons. First of all pivot_root() will fail if parent mount is
   808  	// shared. Secondly when we bind mount rootfs it will propagate to
   809  	// parent namespace and we don't want that to happen.
   810  	if sharedMount {
   811  		return mount("", parentMount, "", "", unix.MS_PRIVATE, "")
   812  	}
   813  
   814  	return nil
   815  }
   816  
   817  func prepareRoot(config *configs.Config) error {
   818  	flag := unix.MS_SLAVE | unix.MS_REC
   819  	if config.RootPropagation != 0 {
   820  		flag = config.RootPropagation
   821  	}
   822  	if err := mount("", "/", "", "", uintptr(flag), ""); err != nil {
   823  		return err
   824  	}
   825  
   826  	// Make parent mount private to make sure following bind mount does
   827  	// not propagate in other namespaces. Also it will help with kernel
   828  	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
   829  	if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
   830  		return err
   831  	}
   832  
   833  	return mount(config.Rootfs, config.Rootfs, "", "bind", unix.MS_BIND|unix.MS_REC, "")
   834  }
   835  
   836  func setReadonly() error {
   837  	flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY)
   838  
   839  	err := mount("", "/", "", "", flags, "")
   840  	if err == nil {
   841  		return nil
   842  	}
   843  	var s unix.Statfs_t
   844  	if err := unix.Statfs("/", &s); err != nil {
   845  		return &os.PathError{Op: "statfs", Path: "/", Err: err}
   846  	}
   847  	flags |= uintptr(s.Flags)
   848  	return mount("", "/", "", "", flags, "")
   849  }
   850  
   851  func setupPtmx(config *configs.Config) error {
   852  	ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
   853  	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
   854  		return err
   855  	}
   856  	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
   857  		return err
   858  	}
   859  	return nil
   860  }
   861  
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
//
// rootfs is the path of the directory that becomes the new "/". On success
// the old root has been detached (MNT_DETACH) and the working directory is
// "/". Failures of open, fchdir, pivot_root and chdir are reported as
// *os.PathError; errors from mount and unmount are returned as-is.
func pivotRoot(rootfs string) error {
	// While the documentation may claim otherwise, pivot_root(".", ".") is
	// actually valid. What this results in is / being the new root but
	// /proc/self/cwd being the old root. Since we can play around with the cwd
	// with pivot_root this allows us to pivot without creating directories in
	// the rootfs. Shout-outs to the LXC developers for giving us this idea.

	// Keep a handle on the current root so we can get back to it after the
	// pivot in order to unmount it.
	oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return &os.PathError{Op: "open", Path: "/", Err: err}
	}
	defer unix.Close(oldroot) //nolint: errcheck

	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return &os.PathError{Op: "open", Path: rootfs, Err: err}
	}
	defer unix.Close(newroot) //nolint: errcheck

	// Change to the new root so that the pivot_root actually acts on it.
	if err := unix.Fchdir(newroot); err != nil {
		return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
	}

	if err := unix.PivotRoot(".", "."); err != nil {
		return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
	}

	// Currently our "." is oldroot (according to the current kernel code).
	// However, purely for safety, we will fchdir(oldroot) since there isn't
	// really any guarantee from the kernel what /proc/self/cwd will be after a
	// pivot_root(2).

	if err := unix.Fchdir(oldroot); err != nil {
		return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
	}

	// Make oldroot rslave to make sure our unmounts don't propagate to the
	// host (and thus bork the machine). We don't use rprivate because this is
	// known to cause issues due to races where we still have a reference to a
	// mount while a process in the host namespace is trying to operate on
	// something it thinks has no mounts (devicemapper in particular).
	if err := mount("", ".", "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
		return err
	}
	// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
	if err := unmount(".", unix.MNT_DETACH); err != nil {
		return err
	}

	// Switch back to our shiny new root.
	if err := unix.Chdir("/"); err != nil {
		return &os.PathError{Op: "chdir", Path: "/", Err: err}
	}
	return nil
}
   920  
   921  func msMoveRoot(rootfs string) error {
   922  	// Before we move the root and chroot we have to mask all "full" sysfs and
   923  	// procfs mounts which exist on the host. This is because while the kernel
   924  	// has protections against mounting procfs if it has masks, when using
   925  	// chroot(2) the *host* procfs mount is still reachable in the mount
   926  	// namespace and the kernel permits procfs mounts inside --no-pivot
   927  	// containers.
   928  	//
   929  	// Users shouldn't be using --no-pivot except in exceptional circumstances,
   930  	// but to avoid such a trivial security flaw we apply a best-effort
   931  	// protection here. The kernel only allows a mount of a pseudo-filesystem
   932  	// like procfs or sysfs if there is a *full* mount (the root of the
   933  	// filesystem is mounted) without any other locked mount points covering a
   934  	// subtree of the mount.
   935  	//
   936  	// So we try to unmount (or mount tmpfs on top of) any mountpoint which is
   937  	// a full mount of either sysfs or procfs (since those are the most
   938  	// concerning filesystems to us).
   939  	mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) {
   940  		// Collect every sysfs and procfs filesystem, except for those which
   941  		// are non-full mounts or are inside the rootfs of the container.
   942  		if info.Root != "/" ||
   943  			(info.FSType != "proc" && info.FSType != "sysfs") ||
   944  			strings.HasPrefix(info.Mountpoint, rootfs) {
   945  			skip = true
   946  		}
   947  		return
   948  	})
   949  	if err != nil {
   950  		return err
   951  	}
   952  	for _, info := range mountinfos {
   953  		p := info.Mountpoint
   954  		// Be sure umount events are not propagated to the host.
   955  		if err := mount("", p, "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
   956  			if errors.Is(err, unix.ENOENT) {
   957  				// If the mountpoint doesn't exist that means that we've
   958  				// already blasted away some parent directory of the mountpoint
   959  				// and so we don't care about this error.
   960  				continue
   961  			}
   962  			return err
   963  		}
   964  		if err := unmount(p, unix.MNT_DETACH); err != nil {
   965  			if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) {
   966  				return err
   967  			} else {
   968  				// If we have not privileges for umounting (e.g. rootless), then
   969  				// cover the path.
   970  				if err := mount("tmpfs", p, "", "tmpfs", 0, ""); err != nil {
   971  					return err
   972  				}
   973  			}
   974  		}
   975  	}
   976  
   977  	// Move the rootfs on top of "/" in our mount namespace.
   978  	if err := mount(rootfs, "/", "", "", unix.MS_MOVE, ""); err != nil {
   979  		return err
   980  	}
   981  	return chroot()
   982  }
   983  
   984  func chroot() error {
   985  	if err := unix.Chroot("."); err != nil {
   986  		return &os.PathError{Op: "chroot", Path: ".", Err: err}
   987  	}
   988  	if err := unix.Chdir("/"); err != nil {
   989  		return &os.PathError{Op: "chdir", Path: "/", Err: err}
   990  	}
   991  	return nil
   992  }
   993  
   994  // createIfNotExists creates a file or a directory only if it does not already exist.
   995  func createIfNotExists(path string, isDir bool) error {
   996  	if _, err := os.Stat(path); err != nil {
   997  		if os.IsNotExist(err) {
   998  			if isDir {
   999  				return os.MkdirAll(path, 0o755)
  1000  			}
  1001  			if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
  1002  				return err
  1003  			}
  1004  			f, err := os.OpenFile(path, os.O_CREATE, 0o755)
  1005  			if err != nil {
  1006  				return err
  1007  			}
  1008  			_ = f.Close()
  1009  		}
  1010  	}
  1011  	return nil
  1012  }
  1013  
  1014  // readonlyPath will make a path read only.
  1015  func readonlyPath(path string) error {
  1016  	if err := mount(path, path, "", "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
  1017  		if errors.Is(err, os.ErrNotExist) {
  1018  			return nil
  1019  		}
  1020  		return err
  1021  	}
  1022  
  1023  	var s unix.Statfs_t
  1024  	if err := unix.Statfs(path, &s); err != nil {
  1025  		return &os.PathError{Op: "statfs", Path: path, Err: err}
  1026  	}
  1027  	flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
  1028  
  1029  	if err := mount(path, path, "", "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
  1030  		return err
  1031  	}
  1032  
  1033  	return nil
  1034  }
  1035  
  1036  // remountReadonly will remount an existing mount point and ensure that it is read-only.
  1037  func remountReadonly(m *configs.Mount) error {
  1038  	var (
  1039  		dest  = m.Destination
  1040  		flags = m.Flags
  1041  	)
  1042  	for i := 0; i < 5; i++ {
  1043  		// There is a special case in the kernel for
  1044  		// MS_REMOUNT | MS_BIND, which allows us to change only the
  1045  		// flags even as an unprivileged user (i.e. user namespace)
  1046  		// assuming we don't drop any security related flags (nodev,
  1047  		// nosuid, etc.). So, let's use that case so that we can do
  1048  		// this re-mount without failing in a userns.
  1049  		flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
  1050  		if err := mount("", dest, "", "", uintptr(flags), ""); err != nil {
  1051  			if errors.Is(err, unix.EBUSY) {
  1052  				time.Sleep(100 * time.Millisecond)
  1053  				continue
  1054  			}
  1055  			return err
  1056  		}
  1057  		return nil
  1058  	}
  1059  	return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
  1060  }
  1061  
  1062  // maskPath masks the top of the specified path inside a container to avoid
  1063  // security issues from processes reading information from non-namespace aware
  1064  // mounts ( proc/kcore ).
  1065  // For files, maskPath bind mounts /dev/null over the top of the specified path.
  1066  // For directories, maskPath mounts read-only tmpfs over the top of the specified path.
  1067  func maskPath(path string, mountLabel string) error {
  1068  	if err := mount("/dev/null", path, "", "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) {
  1069  		if errors.Is(err, unix.ENOTDIR) {
  1070  			return mount("tmpfs", path, "", "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
  1071  		}
  1072  		return err
  1073  	}
  1074  	return nil
  1075  }
  1076  
  1077  // writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
  1078  // For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
  1079  func writeSystemProperty(key, value string) error {
  1080  	keyPath := strings.Replace(key, ".", "/", -1)
  1081  	return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
  1082  }
  1083  
  1084  func remount(m *configs.Mount, rootfs string, mountFd *int) error {
  1085  	source := m.Source
  1086  	if mountFd != nil {
  1087  		source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
  1088  	}
  1089  
  1090  	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
  1091  		flags := uintptr(m.Flags | unix.MS_REMOUNT)
  1092  		err := mount(source, m.Destination, procfd, m.Device, flags, "")
  1093  		if err == nil {
  1094  			return nil
  1095  		}
  1096  		// Check if the source has ro flag...
  1097  		var s unix.Statfs_t
  1098  		if err := unix.Statfs(source, &s); err != nil {
  1099  			return &os.PathError{Op: "statfs", Path: source, Err: err}
  1100  		}
  1101  		if s.Flags&unix.MS_RDONLY != unix.MS_RDONLY {
  1102  			return err
  1103  		}
  1104  		// ... and retry the mount with ro flag set.
  1105  		flags |= unix.MS_RDONLY
  1106  		return mount(source, m.Destination, procfd, m.Device, flags, "")
  1107  	})
  1108  }
  1109  
  1110  // Do the mount operation followed by additional mounts required to take care
  1111  // of propagation flags. This will always be scoped inside the container rootfs.
  1112  func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error {
  1113  	var (
  1114  		data  = label.FormatMountLabel(m.Data, mountLabel)
  1115  		flags = m.Flags
  1116  	)
  1117  	// Delay mounting the filesystem read-only if we need to do further
  1118  	// operations on it. We need to set up files in "/dev", and other tmpfs
  1119  	// mounts may need to be chmod-ed after mounting. These mounts will be
  1120  	// remounted ro later in finalizeRootfs(), if necessary.
  1121  	if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
  1122  		flags &= ^unix.MS_RDONLY
  1123  	}
  1124  
  1125  	// Because the destination is inside a container path which might be
  1126  	// mutating underneath us, we verify that we are actually going to mount
  1127  	// inside the container with WithProcfd() -- mounting through a procfd
  1128  	// mounts on the target.
  1129  	source := m.Source
  1130  	if mountFd != nil {
  1131  		source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
  1132  	}
  1133  
  1134  	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
  1135  		return mount(source, m.Destination, procfd, m.Device, uintptr(flags), data)
  1136  	}); err != nil {
  1137  		return err
  1138  	}
  1139  	// We have to apply mount propagation flags in a separate WithProcfd() call
  1140  	// because the previous call invalidates the passed procfd -- the mount
  1141  	// target needs to be re-opened.
  1142  	if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
  1143  		for _, pflag := range m.PropagationFlags {
  1144  			if err := mount("", m.Destination, procfd, "", uintptr(pflag), ""); err != nil {
  1145  				return err
  1146  			}
  1147  		}
  1148  		return nil
  1149  	}); err != nil {
  1150  		return fmt.Errorf("change mount propagation through procfd: %w", err)
  1151  	}
  1152  	return nil
  1153  }
  1154  
  1155  func setRecAttr(m *configs.Mount, rootfs string) error {
  1156  	if m.RecAttr == nil {
  1157  		return nil
  1158  	}
  1159  	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
  1160  		return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr)
  1161  	})
  1162  }
  1163
View as plain text