1 package libcontainer
2
3 import (
4 "bytes"
5 "encoding/json"
6 "errors"
7 "fmt"
8 "io"
9 "net"
10 "os"
11 "os/exec"
12 "path"
13 "path/filepath"
14 "reflect"
15 "strconv"
16 "strings"
17 "sync"
18 "time"
19
20 "github.com/checkpoint-restore/go-criu/v5"
21 criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
22 securejoin "github.com/cyphar/filepath-securejoin"
23 "github.com/opencontainers/runtime-spec/specs-go"
24 "github.com/sirupsen/logrus"
25 "github.com/vishvananda/netlink/nl"
26 "golang.org/x/sys/unix"
27 "google.golang.org/protobuf/proto"
28
29 "github.com/opencontainers/runc/libcontainer/cgroups"
30 "github.com/opencontainers/runc/libcontainer/configs"
31 "github.com/opencontainers/runc/libcontainer/intelrdt"
32 "github.com/opencontainers/runc/libcontainer/system"
33 "github.com/opencontainers/runc/libcontainer/utils"
34 )
35
// stdioFdCount is the number of standard descriptors (stdin, stdout, stderr)
// that come before cmd.ExtraFiles in a child process. It is the offset used
// in this file when computing the fd numbers exported through the
// _LIBCONTAINER_* environment variables.
const stdioFdCount = 3
37
// linuxContainer holds the per-container data used by the methods in this
// file. The mutex m is taken by the exported methods that read or change
// container state (Status, State, Set, Start, Signal, Pause, ...).
type linuxContainer struct {
	// id is the container ID returned by ID().
	id string
	// root is the container's state directory: the exec fifo lives here and
	// the path is exported to init as _LIBCONTAINER_STATEDIR.
	root string
	// config is the active configuration; Set replaces it on success.
	config *configs.Config
	// cgroupManager is used for pids, stats, resource updates and freezing.
	cgroupManager cgroups.Manager
	// intelRdtManager is optional; users in this file check it for nil.
	intelRdtManager *intelrdt.Manager
	// initPath and initArgs form the command line built by commandTemplate:
	// initPath is the binary to execute and initArgs[0] becomes argv[0].
	initPath string
	initArgs []string
	// initProcess is the container's init process, recorded by newInitProcess.
	initProcess parentProcess
	// NOTE(review): presumably the start time of init, used to detect PID
	// reuse; it is neither set nor read in this part of the file — confirm.
	initProcessStartTime uint64
	// criuPath is the CRIU binary path handed to the go-criu client.
	criuPath string
	// NOTE(review): presumably the paths of the newuidmap/newgidmap helpers;
	// they are not referenced in this part of the file — confirm.
	newuidmapPath string
	newgidmapPath string
	// m serializes the state-changing operations.
	m sync.Mutex
	// criuVersion caches the detected CRIU version; 0 means "not queried yet"
	// (see checkCriuVersion).
	criuVersion int
	// state is the current state-machine node; Pause and Resume transition it
	// and Destroy delegates to it.
	state containerState
	// NOTE(review): presumably the container creation time — confirm.
	created time.Time
	// fifo is the O_PATH handle on the exec fifo, opened by includeExecFifo
	// and closed by start once the init process has been started.
	fifo *os.File
}
57
58
// State is the serializable description of a container. It embeds BaseState
// and adds the Linux-specific fields below.
type State struct {
	BaseState

	// Platform specific fields below here

	// Rootless records whether the container runs in rootless mode.
	// NOTE(review): the exact derivation from the config is not visible in
	// this part of the file — confirm against currentState.
	Rootless bool `json:"rootless"`

	// CgroupPaths maps a cgroup controller name to the path of the
	// container's cgroup for that controller. newSetnsProcess treats the
	// empty key "" as "one path for all controllers" (the only form on
	// cgroup v2, per the handling there).
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths maps a namespace type to the path of the container's
	// namespace of that type; it is what setns processes use to join.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors lists the container's external descriptors.
	// NOTE(review): presumably the targets of init's stdio — confirm.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// IntelRdtPath is the Intel RDT path handed to setns processes.
	IntelRdtPath string `json:"intel_rdt_path"`
}
86
87
88
89
90
91
// Container is the Linux container interface: everything in BaseContainer
// plus checkpoint/restore, pause/resume and memory notifications.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint dumps (or pre-dumps) the container into
	// criuOpts.ImagesDirectory using CRIU.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the container from a CRIU checkpoint, using process
	// for the restored init.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause freezes the container's cgroup and moves the container to the
	// paused state. It is only allowed while the container is running or
	// created; otherwise ErrNotRunning is returned.
	Pause() error

	// Resume thaws the container's cgroup and moves the container back to
	// the running state. It returns ErrNotPaused when the container is not
	// paused.
	Resume() error

	// NotifyOOM returns a read-only channel for out-of-memory notifications
	// of the container's memory cgroup.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel for memory pressure
	// notifications at the given level.
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
120
121
// ID returns the container's ID.
func (c *linuxContainer) ID() string {
	return c.id
}
125
126
// Config returns a copy (by value) of the container's current configuration.
// It does not take the container mutex.
func (c *linuxContainer) Config() configs.Config {
	return *c.config
}
130
// Status returns the container's current status, computed under the
// container mutex.
func (c *linuxContainer) Status() (Status, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentStatus()
}
136
// State returns the container's current State, computed under the container
// mutex.
func (c *linuxContainer) State() (*State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentState()
}
142
// OCIState returns the container's state in the OCI runtime-spec format,
// computed under the container mutex.
func (c *linuxContainer) OCIState() (*specs.State, error) {
	c.m.Lock()
	defer c.m.Unlock()
	return c.currentOCIState()
}
148
149
150
151 func (c *linuxContainer) ignoreCgroupError(err error) error {
152 if err == nil {
153 return nil
154 }
155 if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
156 return nil
157 }
158 return err
159 }
160
161 func (c *linuxContainer) Processes() ([]int, error) {
162 pids, err := c.cgroupManager.GetAllPids()
163 if err = c.ignoreCgroupError(err); err != nil {
164 return nil, fmt.Errorf("unable to get all container pids: %w", err)
165 }
166 return pids, nil
167 }
168
169 func (c *linuxContainer) Stats() (*Stats, error) {
170 var (
171 err error
172 stats = &Stats{}
173 )
174 if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
175 return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
176 }
177 if c.intelRdtManager != nil {
178 if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
179 return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
180 }
181 }
182 for _, iface := range c.config.Networks {
183 switch iface.Type {
184 case "veth":
185 istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
186 if err != nil {
187 return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
188 }
189 stats.Interfaces = append(stats.Interfaces, istats)
190 }
191 }
192 return stats, nil
193 }
194
195 func (c *linuxContainer) Set(config configs.Config) error {
196 c.m.Lock()
197 defer c.m.Unlock()
198 status, err := c.currentStatus()
199 if err != nil {
200 return err
201 }
202 if status == Stopped {
203 return ErrNotRunning
204 }
205 if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
206
207 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
208 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
209 }
210 return err
211 }
212 if c.intelRdtManager != nil {
213 if err := c.intelRdtManager.Set(&config); err != nil {
214
215 if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
216 logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
217 }
218 if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
219 logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
220 }
221 return err
222 }
223 }
224
225 c.config = &config
226 _, err = c.updateState(nil)
227 return err
228 }
229
230 func (c *linuxContainer) Start(process *Process) error {
231 c.m.Lock()
232 defer c.m.Unlock()
233 if c.config.Cgroups.Resources.SkipDevices {
234 return errors.New("can't start container with SkipDevices set")
235 }
236 if process.Init {
237 if err := c.createExecFifo(); err != nil {
238 return err
239 }
240 }
241 if err := c.start(process); err != nil {
242 if process.Init {
243 c.deleteExecFifo()
244 }
245 return err
246 }
247 return nil
248 }
249
250 func (c *linuxContainer) Run(process *Process) error {
251 if err := c.Start(process); err != nil {
252 return err
253 }
254 if process.Init {
255 return c.exec()
256 }
257 return nil
258 }
259
// Exec runs exec() under the container mutex; see exec for the details of
// the exec-fifo handshake.
func (c *linuxContainer) Exec() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.exec()
}
265
266 func (c *linuxContainer) exec() error {
267 path := filepath.Join(c.root, execFifoFilename)
268 pid := c.initProcess.pid()
269 blockingFifoOpenCh := awaitFifoOpen(path)
270 for {
271 select {
272 case result := <-blockingFifoOpenCh:
273 return handleFifoResult(result)
274
275 case <-time.After(time.Millisecond * 100):
276 stat, err := system.Stat(pid)
277 if err != nil || stat.State == system.Zombie {
278
279
280 if err := handleFifoResult(fifoOpen(path, false)); err != nil {
281 return errors.New("container process is already dead")
282 }
283 return nil
284 }
285 }
286 }
287 }
288
// readFromExecFifo reads execFifo until EOF. A read error is returned as is;
// reading no data at all is reported as "cannot start an already running
// container". Any non-empty payload means success.
func readFromExecFifo(execFifo io.Reader) error {
	payload, err := io.ReadAll(execFifo)
	switch {
	case err != nil:
		return err
	case len(payload) == 0:
		return errors.New("cannot start an already running container")
	}
	return nil
}
299
300 func awaitFifoOpen(path string) <-chan openResult {
301 fifoOpened := make(chan openResult)
302 go func() {
303 result := fifoOpen(path, true)
304 fifoOpened <- result
305 }()
306 return fifoOpened
307 }
308
309 func fifoOpen(path string, block bool) openResult {
310 flags := os.O_RDONLY
311 if !block {
312 flags |= unix.O_NONBLOCK
313 }
314 f, err := os.OpenFile(path, flags, 0)
315 if err != nil {
316 return openResult{err: fmt.Errorf("exec fifo: %w", err)}
317 }
318 return openResult{file: f}
319 }
320
321 func handleFifoResult(result openResult) error {
322 if result.err != nil {
323 return result.err
324 }
325 f := result.file
326 defer f.Close()
327 if err := readFromExecFifo(f); err != nil {
328 return err
329 }
330 return os.Remove(f.Name())
331 }
332
// openResult is the outcome of fifoOpen: file is set on success, err on
// failure.
type openResult struct {
	file *os.File
	err  error
}
337
338 func (c *linuxContainer) start(process *Process) (retErr error) {
339 parent, err := c.newParentProcess(process)
340 if err != nil {
341 return fmt.Errorf("unable to create new parent process: %w", err)
342 }
343
344 logsDone := parent.forwardChildLogs()
345 if logsDone != nil {
346 defer func() {
347
348
349 err := <-logsDone
350 if err != nil && retErr == nil {
351 retErr = fmt.Errorf("unable to forward init logs: %w", err)
352 }
353 }()
354 }
355
356
357
358
359
360
361
362 if err := utils.CloseExecFrom(3); err != nil {
363 return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
364 }
365 if err := parent.start(); err != nil {
366 return fmt.Errorf("unable to start container process: %w", err)
367 }
368
369 if process.Init {
370 c.fifo.Close()
371 if c.config.Hooks != nil {
372 s, err := c.currentOCIState()
373 if err != nil {
374 return err
375 }
376
377 if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
378 if err := ignoreTerminateErrors(parent.terminate()); err != nil {
379 logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
380 }
381 return err
382 }
383 }
384 }
385 return nil
386 }
387
388 func (c *linuxContainer) Signal(s os.Signal, all bool) error {
389 c.m.Lock()
390 defer c.m.Unlock()
391 status, err := c.currentStatus()
392 if err != nil {
393 return err
394 }
395 if all {
396 if status == Stopped && !c.cgroupManager.Exists() {
397
398
399 return nil
400 }
401 return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s))
402 }
403
404 if status == Running || status == Created || status == Paused {
405 if err := c.initProcess.signal(s); err != nil {
406 return fmt.Errorf("unable to signal init: %w", err)
407 }
408 if status == Paused {
409
410
411
412 if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
413 _ = c.cgroupManager.Freeze(configs.Thawed)
414 }
415 }
416 return nil
417 }
418 return ErrNotRunning
419 }
420
421 func (c *linuxContainer) createExecFifo() error {
422 rootuid, err := c.Config().HostRootUID()
423 if err != nil {
424 return err
425 }
426 rootgid, err := c.Config().HostRootGID()
427 if err != nil {
428 return err
429 }
430
431 fifoName := filepath.Join(c.root, execFifoFilename)
432 if _, err := os.Stat(fifoName); err == nil {
433 return fmt.Errorf("exec fifo %s already exists", fifoName)
434 }
435 oldMask := unix.Umask(0o000)
436 if err := unix.Mkfifo(fifoName, 0o622); err != nil {
437 unix.Umask(oldMask)
438 return err
439 }
440 unix.Umask(oldMask)
441 return os.Chown(fifoName, rootuid, rootgid)
442 }
443
444 func (c *linuxContainer) deleteExecFifo() {
445 fifoName := filepath.Join(c.root, execFifoFilename)
446 os.Remove(fifoName)
447 }
448
449
450
451
452
453 func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
454 fifoName := filepath.Join(c.root, execFifoFilename)
455 fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
456 if err != nil {
457 return err
458 }
459 c.fifo = fifo
460
461 cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
462 cmd.Env = append(cmd.Env,
463 "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
464 return nil
465 }
466
467 func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
468 parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
469 if err != nil {
470 return nil, fmt.Errorf("unable to create init pipe: %w", err)
471 }
472 messageSockPair := filePair{parentInitPipe, childInitPipe}
473
474 parentLogPipe, childLogPipe, err := os.Pipe()
475 if err != nil {
476 return nil, fmt.Errorf("unable to create log pipe: %w", err)
477 }
478 logFilePair := filePair{parentLogPipe, childLogPipe}
479
480 cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
481 if !p.Init {
482 return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
483 }
484
485
486
487
488
489
490 if err := c.includeExecFifo(cmd); err != nil {
491 return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
492 }
493 return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
494 }
495
496 func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
497 cmd := exec.Command(c.initPath, c.initArgs[1:]...)
498 cmd.Args[0] = c.initArgs[0]
499 cmd.Stdin = p.Stdin
500 cmd.Stdout = p.Stdout
501 cmd.Stderr = p.Stderr
502 cmd.Dir = c.config.Rootfs
503 if cmd.SysProcAttr == nil {
504 cmd.SysProcAttr = &unix.SysProcAttr{}
505 }
506 cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
507 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
508 if p.ConsoleSocket != nil {
509 cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
510 cmd.Env = append(cmd.Env,
511 "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
512 )
513 }
514 cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
515 cmd.Env = append(cmd.Env,
516 "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
517 "_LIBCONTAINER_STATEDIR="+c.root,
518 )
519
520 cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
521 cmd.Env = append(cmd.Env,
522 "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
523 "_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
524 )
525
526
527
528
529 if c.config.ParentDeathSignal > 0 {
530 cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
531 }
532 return cmd
533 }
534
535
536
537
538 func (c *linuxContainer) shouldSendMountSources() bool {
539
540
541 if !c.config.Namespaces.Contains(configs.NEWUSER) ||
542 !c.config.Namespaces.Contains(configs.NEWNS) {
543 return false
544 }
545
546
547
548 if c.config.RootlessEUID {
549 return false
550 }
551
552
553 for _, m := range c.config.Mounts {
554 if m.IsBind() {
555 return true
556 }
557 }
558
559 return false
560 }
561
562 func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
563 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
564 nsMaps := make(map[configs.NamespaceType]string)
565 for _, ns := range c.config.Namespaces {
566 if ns.Path != "" {
567 nsMaps[ns.Type] = ns.Path
568 }
569 }
570 _, sharePidns := nsMaps[configs.NEWPID]
571 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
572 if err != nil {
573 return nil, err
574 }
575
576 if c.shouldSendMountSources() {
577
578
579 mountFds := make([]int, len(c.config.Mounts))
580 for i, m := range c.config.Mounts {
581 if !m.IsBind() {
582
583 mountFds[i] = -1
584 continue
585 }
586
587
588
589
590
591 cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
592 mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
593 }
594
595 mountFdsJson, err := json.Marshal(mountFds)
596 if err != nil {
597 return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
598 }
599
600 cmd.Env = append(cmd.Env,
601 "_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
602 )
603 }
604
605 init := &initProcess{
606 cmd: cmd,
607 messageSockPair: messageSockPair,
608 logFilePair: logFilePair,
609 manager: c.cgroupManager,
610 intelRdtManager: c.intelRdtManager,
611 config: c.newInitConfig(p),
612 container: c,
613 process: p,
614 bootstrapData: data,
615 sharePidns: sharePidns,
616 }
617 c.initProcess = init
618 return init, nil
619 }
620
621 func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
622 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
623 state, err := c.currentState()
624 if err != nil {
625 return nil, fmt.Errorf("unable to get container state: %w", err)
626 }
627
628
629 data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
630 if err != nil {
631 return nil, err
632 }
633 proc := &setnsProcess{
634 cmd: cmd,
635 cgroupPaths: state.CgroupPaths,
636 rootlessCgroups: c.config.RootlessCgroups,
637 intelRdtPath: state.IntelRdtPath,
638 messageSockPair: messageSockPair,
639 logFilePair: logFilePair,
640 manager: c.cgroupManager,
641 config: c.newInitConfig(p),
642 process: p,
643 bootstrapData: data,
644 initProcessPid: state.InitProcessPid,
645 }
646 if len(p.SubCgroupPaths) > 0 {
647 if add, ok := p.SubCgroupPaths[""]; ok {
648
649
650 for k := range proc.cgroupPaths {
651 subPath := path.Join(proc.cgroupPaths[k], add)
652 if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
653 return nil, fmt.Errorf("%s is not a sub cgroup path", add)
654 }
655 proc.cgroupPaths[k] = subPath
656 }
657
658
659 proc.initProcessPid = 0
660 } else {
661
662 for ctrl, add := range p.SubCgroupPaths {
663 if val, ok := proc.cgroupPaths[ctrl]; ok {
664 subPath := path.Join(val, add)
665 if !strings.HasPrefix(subPath, val) {
666 return nil, fmt.Errorf("%s is not a sub cgroup path", add)
667 }
668 proc.cgroupPaths[ctrl] = subPath
669 } else {
670 return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
671 }
672 }
673 }
674 }
675 return proc, nil
676 }
677
678 func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
679 cfg := &initConfig{
680 Config: c.config,
681 Args: process.Args,
682 Env: process.Env,
683 User: process.User,
684 AdditionalGroups: process.AdditionalGroups,
685 Cwd: process.Cwd,
686 Capabilities: process.Capabilities,
687 PassedFilesCount: len(process.ExtraFiles),
688 ContainerId: c.ID(),
689 NoNewPrivileges: c.config.NoNewPrivileges,
690 RootlessEUID: c.config.RootlessEUID,
691 RootlessCgroups: c.config.RootlessCgroups,
692 AppArmorProfile: c.config.AppArmorProfile,
693 ProcessLabel: c.config.ProcessLabel,
694 Rlimits: c.config.Rlimits,
695 CreateConsole: process.ConsoleSocket != nil,
696 ConsoleWidth: process.ConsoleWidth,
697 ConsoleHeight: process.ConsoleHeight,
698 }
699 if process.NoNewPrivileges != nil {
700 cfg.NoNewPrivileges = *process.NoNewPrivileges
701 }
702 if process.AppArmorProfile != "" {
703 cfg.AppArmorProfile = process.AppArmorProfile
704 }
705 if process.Label != "" {
706 cfg.ProcessLabel = process.Label
707 }
708 if len(process.Rlimits) > 0 {
709 cfg.Rlimits = process.Rlimits
710 }
711 if cgroups.IsCgroup2UnifiedMode() {
712 cfg.Cgroup2Path = c.cgroupManager.Path("")
713 }
714
715 return cfg
716 }
717
// Destroy destroys the container by delegating to the current state's
// destroy method, under the container mutex.
func (c *linuxContainer) Destroy() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.state.destroy()
}
723
724 func (c *linuxContainer) Pause() error {
725 c.m.Lock()
726 defer c.m.Unlock()
727 status, err := c.currentStatus()
728 if err != nil {
729 return err
730 }
731 switch status {
732 case Running, Created:
733 if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
734 return err
735 }
736 return c.state.transition(&pausedState{
737 c: c,
738 })
739 }
740 return ErrNotRunning
741 }
742
743 func (c *linuxContainer) Resume() error {
744 c.m.Lock()
745 defer c.m.Unlock()
746 status, err := c.currentStatus()
747 if err != nil {
748 return err
749 }
750 if status != Paused {
751 return ErrNotPaused
752 }
753 if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
754 return err
755 }
756 return c.state.transition(&runningState{
757 c: c,
758 })
759 }
760
761 func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
762
763 if c.config.RootlessCgroups {
764 logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
765 }
766 path := c.cgroupManager.Path("memory")
767 if cgroups.IsCgroup2UnifiedMode() {
768 return notifyOnOOMV2(path)
769 }
770 return notifyOnOOM(path)
771 }
772
773 func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
774
775 if c.config.RootlessCgroups {
776 logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
777 }
778 return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
779 }
780
// criuFeatures holds the feature set reported by CRIU. checkCriuFeatures
// resets it to nil before each feature check and reads it afterwards.
// NOTE(review): presumably filled in by criuSwrk while handling the
// FEATURE_CHECK response — the assignment is not visible here; confirm.
var criuFeatures *criurpc.CriuFeatures
782
783 func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
784 t := criurpc.CriuReqType_FEATURE_CHECK
785
786
787
788 criuFeatures = nil
789
790 req := &criurpc.CriuReq{
791 Type: &t,
792
793
794
795 Opts: rpcOpts,
796 Features: criuFeat,
797 }
798
799 err := c.criuSwrk(nil, req, criuOpts, nil)
800 if err != nil {
801 logrus.Debugf("%s", err)
802 return errors.New("CRIU feature check failed")
803 }
804
805 missingFeatures := false
806
807
808 if (criuFeat.MemTrack != nil) &&
809 (criuFeatures.MemTrack != nil) {
810
811 if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
812 missingFeatures = true
813 logrus.Debugf("CRIU does not support MemTrack")
814 }
815 }
816
817
818
819 if (criuFeat.LazyPages != nil) &&
820 (criuFeatures.LazyPages != nil) {
821 if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
822 missingFeatures = true
823 logrus.Debugf("CRIU does not support LazyPages")
824 }
825 }
826
827 if missingFeatures {
828 return errors.New("CRIU is missing features")
829 }
830
831 return nil
832 }
833
// compareCriuVersion returns nil when criuVersion is at least minVersion and
// a descriptive error otherwise.
func compareCriuVersion(criuVersion int, minVersion int) error {
	if criuVersion >= minVersion {
		return nil
	}
	return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
}
842
843
844 func (c *linuxContainer) checkCriuVersion(minVersion int) error {
845
846
847 if c.criuVersion != 0 {
848 return compareCriuVersion(c.criuVersion, minVersion)
849 }
850
851 criu := criu.MakeCriu()
852 criu.SetCriuPath(c.criuPath)
853 var err error
854 c.criuVersion, err = criu.GetCriuVersion()
855 if err != nil {
856 return fmt.Errorf("CRIU version check failed: %w", err)
857 }
858
859 return compareCriuVersion(c.criuVersion, minVersion)
860 }
861
// descriptorsFilename is the name of the file, inside the checkpoint images
// directory, to which Checkpoint writes init's external descriptors as JSON.
const descriptorsFilename = "descriptors.json"
863
864 func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
865 mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
866 if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
867 mountDest = dest[len(c.config.Rootfs):]
868 }
869 extMnt := &criurpc.ExtMountMap{
870 Key: proto.String(mountDest),
871 Val: proto.String(mountDest),
872 }
873 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
874 }
875
876 func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
877 for _, path := range c.config.MaskPaths {
878 fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
879 if err != nil {
880 if os.IsNotExist(err) {
881 continue
882 }
883 return err
884 }
885 if fi.IsDir() {
886 continue
887 }
888
889 extMnt := &criurpc.ExtMountMap{
890 Key: proto.String(path),
891 Val: proto.String("/dev/null"),
892 }
893 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
894 }
895 return nil
896 }
897
898 func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
899
900
901
902
903
904 _, annotations := utils.Annotations(c.config.Labels)
905 configFile, exists := annotations["org.criu.config"]
906 if exists {
907
908
909
910
911 if configFile != "" {
912 rpcOpts.ConfigFile = proto.String(configFile)
913 }
914
915
916
917 } else {
918
919
920 rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
921 }
922 }
923
924 func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
925 var minVersion int
926 switch t {
927 case configs.NEWNET:
928
929
930 minVersion = 31100
931 case configs.NEWPID:
932
933 minVersion = 31500
934 default:
935 return false
936 }
937 return c.checkCriuVersion(minVersion) == nil
938 }
939
940 func criuNsToKey(t configs.NamespaceType) string {
941 return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
942 }
943
944 func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
945 if !c.criuSupportsExtNS(t) {
946 return nil
947 }
948
949 nsPath := c.config.Namespaces.PathOf(t)
950 if nsPath == "" {
951 return nil
952 }
953
954
955
956 var ns unix.Stat_t
957 if err := unix.Stat(nsPath, &ns); err != nil {
958 return err
959 }
960 criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
961 rpcOpts.External = append(rpcOpts.External, criuExternal)
962
963 return nil
964 }
965
966 func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
967 for _, ns := range c.config.Namespaces {
968 switch ns.Type {
969 case configs.NEWNET, configs.NEWPID:
970
971
972
973
974
975
976 if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
977 return err
978 }
979 default:
980
981
982 nsPath := c.config.Namespaces.PathOf(ns.Type)
983 if nsPath == "" {
984 continue
985 }
986 if ns.Type == configs.NEWCGROUP {
987
988 return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
989 }
990
991
992
993
994 rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
995 Ns: proto.String(configs.NsName(ns.Type)),
996 NsFile: proto.String(nsPath),
997 })
998 }
999 }
1000
1001 return nil
1002 }
1003
1004 func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
1005 if !c.criuSupportsExtNS(t) {
1006 return nil
1007 }
1008
1009 nsPath := c.config.Namespaces.PathOf(t)
1010 if nsPath == "" {
1011 return nil
1012 }
1013
1014
1015
1016
1017 nsFd, err := os.Open(nsPath)
1018 if err != nil {
1019 logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
1020 return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
1021 }
1022 inheritFd := &criurpc.InheritFd{
1023 Key: proto.String(criuNsToKey(t)),
1024
1025
1026 Fd: proto.Int32(int32(4 + len(*extraFiles))),
1027 }
1028 rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
1029
1030 *extraFiles = append(*extraFiles, nsFd)
1031
1032 return nil
1033 }
1034
1035 func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
1036 c.m.Lock()
1037 defer c.m.Unlock()
1038
1039
1040
1041
1042
1043
1044
1045
1046 if err := c.checkCriuVersion(30000); err != nil {
1047 return err
1048 }
1049
1050 if criuOpts.ImagesDirectory == "" {
1051 return errors.New("invalid directory to save checkpoint")
1052 }
1053
1054
1055
1056 if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
1057 return err
1058 }
1059
1060 imageDir, err := os.Open(criuOpts.ImagesDirectory)
1061 if err != nil {
1062 return err
1063 }
1064 defer imageDir.Close()
1065
1066 rpcOpts := criurpc.CriuOpts{
1067 ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
1068 LogLevel: proto.Int32(4),
1069 LogFile: proto.String("dump.log"),
1070 Root: proto.String(c.config.Rootfs),
1071 ManageCgroups: proto.Bool(true),
1072 NotifyScripts: proto.Bool(true),
1073 Pid: proto.Int32(int32(c.initProcess.pid())),
1074 ShellJob: proto.Bool(criuOpts.ShellJob),
1075 LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
1076 TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
1077 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
1078 FileLocks: proto.Bool(criuOpts.FileLocks),
1079 EmptyNs: proto.Uint32(criuOpts.EmptyNs),
1080 OrphanPtsMaster: proto.Bool(true),
1081 AutoDedup: proto.Bool(criuOpts.AutoDedup),
1082 LazyPages: proto.Bool(criuOpts.LazyPages),
1083 }
1084
1085
1086 if criuOpts.WorkDirectory != "" {
1087 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
1088 return err
1089 }
1090 workDir, err := os.Open(criuOpts.WorkDirectory)
1091 if err != nil {
1092 return err
1093 }
1094 defer workDir.Close()
1095 rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
1096 }
1097
1098 c.handleCriuConfigurationFile(&rpcOpts)
1099
1100
1101
1102
1103
1104
1105
1106 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
1107 return err
1108 }
1109
1110
1111 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
1112 return err
1113 }
1114
1115
1116
1117
1118 if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
1119 if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
1120 rpcOpts.FreezeCgroup = proto.String(fcg)
1121 }
1122 }
1123
1124
1125 if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
1126 rpcOpts.Ps = &criurpc.CriuPageServerInfo{
1127 Address: proto.String(criuOpts.PageServer.Address),
1128 Port: proto.Int32(criuOpts.PageServer.Port),
1129 }
1130 }
1131
1132
1133 if criuOpts.ParentImage != "" {
1134 rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
1135 rpcOpts.TrackMem = proto.Bool(true)
1136 }
1137
1138
1139 if criuOpts.ManageCgroupsMode != 0 {
1140 mode := criuOpts.ManageCgroupsMode
1141 rpcOpts.ManageCgroupsMode = &mode
1142 }
1143
1144 var t criurpc.CriuReqType
1145 if criuOpts.PreDump {
1146 feat := criurpc.CriuFeatures{
1147 MemTrack: proto.Bool(true),
1148 }
1149
1150 if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
1151 return err
1152 }
1153
1154 t = criurpc.CriuReqType_PRE_DUMP
1155 } else {
1156 t = criurpc.CriuReqType_DUMP
1157 }
1158
1159 if criuOpts.LazyPages {
1160
1161 feat := criurpc.CriuFeatures{
1162 LazyPages: proto.Bool(true),
1163 }
1164 if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
1165 return err
1166 }
1167
1168 if fd := criuOpts.StatusFd; fd != -1 {
1169
1170 flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
1171 if err != nil {
1172 return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
1173 }
1174
1175 if flags&unix.O_WRONLY == 0 {
1176 return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
1177 }
1178
1179 if c.checkCriuVersion(31500) != nil {
1180
1181
1182 rpcOpts.StatusFd = proto.Int32(int32(fd))
1183 }
1184 }
1185 }
1186
1187 req := &criurpc.CriuReq{
1188 Type: &t,
1189 Opts: &rpcOpts,
1190 }
1191
1192
1193 if !criuOpts.PreDump {
1194 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
1195 for _, m := range c.config.Mounts {
1196 switch m.Device {
1197 case "bind":
1198 c.addCriuDumpMount(req, m)
1199 case "cgroup":
1200 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
1201
1202 continue
1203 }
1204
1205 binds, err := getCgroupMounts(m)
1206 if err != nil {
1207 return err
1208 }
1209 for _, b := range binds {
1210 c.addCriuDumpMount(req, b)
1211 }
1212 }
1213 }
1214
1215 if err := c.addMaskPaths(req); err != nil {
1216 return err
1217 }
1218
1219 for _, node := range c.config.Devices {
1220 m := &configs.Mount{Destination: node.Path, Source: node.Path}
1221 c.addCriuDumpMount(req, m)
1222 }
1223
1224
1225 fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
1226 if err != nil {
1227 return err
1228 }
1229
1230 err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
1231 if err != nil {
1232 return err
1233 }
1234 }
1235
1236 err = c.criuSwrk(nil, req, criuOpts, nil)
1237 if err != nil {
1238 return err
1239 }
1240 return nil
1241 }
1242
1243 func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
1244 mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
1245 if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
1246 mountDest = dest[len(c.config.Rootfs):]
1247 }
1248 extMnt := &criurpc.ExtMountMap{
1249 Key: proto.String(mountDest),
1250 Val: proto.String(m.Source),
1251 }
1252 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
1253 }
1254
1255 func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
1256 for _, iface := range c.config.Networks {
1257 switch iface.Type {
1258 case "veth":
1259 veth := new(criurpc.CriuVethPair)
1260 veth.IfOut = proto.String(iface.HostInterfaceName)
1261 veth.IfIn = proto.String(iface.Name)
1262 req.Opts.Veths = append(req.Opts.Veths, veth)
1263 case "loopback":
1264
1265 }
1266 }
1267 for _, i := range criuOpts.VethPairs {
1268 veth := new(criurpc.CriuVethPair)
1269 veth.IfOut = proto.String(i.HostInterfaceName)
1270 veth.IfIn = proto.String(i.ContainerInterfaceName)
1271 req.Opts.Veths = append(req.Opts.Veths, veth)
1272 }
1273 }
1274
1275
1276
1277
1278 func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
1279 switch m.Device {
1280 case "cgroup":
1281
1282
1283
1284
1285
1286
1287
1288 return nil
1289 case "bind":
1290
1291
1292
1293
1294 if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
1295 return err
1296 }
1297 default:
1298
1299 dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
1300 if err != nil {
1301 return err
1302 }
1303 if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
1304 return err
1305 }
1306 if err := os.MkdirAll(dest, 0o755); err != nil {
1307 return err
1308 }
1309 }
1310 return nil
1311 }
1312
1313
1314
// isPathInPrefixList reports whether path lies strictly below one of the
// directories listed in prefix, i.e. whether path starts with "<entry>/".
// A path equal to an entry itself does not match.
func isPathInPrefixList(path string, prefix []string) bool {
	found := false
	for i := 0; i < len(prefix) && !found; i++ {
		found = strings.HasPrefix(path, prefix[i]+"/")
	}
	return found
}
1323
1324
1325
1326
1327
1328
1329
1330 func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
1331
1332 tmpfs := []string{}
1333 for _, m := range mounts {
1334 switch m.Device {
1335 case "tmpfs":
1336 tmpfs = append(tmpfs, m.Destination)
1337 }
1338 }
1339
1340
1341
1342 umounts := []string{}
1343 defer func() {
1344 for _, u := range umounts {
1345 _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
1346 if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
1347 if e != unix.EINVAL {
1348
1349
1350 logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
1351 }
1352 }
1353 return nil
1354 })
1355 }
1356 }()
1357 for _, m := range mounts {
1358 if !isPathInPrefixList(m.Destination, tmpfs) {
1359 if err := c.makeCriuRestoreMountpoints(m); err != nil {
1360 return err
1361 }
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371 if m.Device == "bind" {
1372 if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
1373 if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
1374 return err
1375 }
1376 return nil
1377 }); err != nil {
1378 return err
1379 }
1380 umounts = append(umounts, m.Destination)
1381 }
1382 }
1383 }
1384 return nil
1385 }
1386
1387 func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
1388 c.m.Lock()
1389 defer c.m.Unlock()
1390
1391 var extraFiles []*os.File
1392
1393
1394
1395
1396
1397
1398
1399 if err := c.checkCriuVersion(30000); err != nil {
1400 return err
1401 }
1402 if criuOpts.ImagesDirectory == "" {
1403 return errors.New("invalid directory to restore checkpoint")
1404 }
1405 imageDir, err := os.Open(criuOpts.ImagesDirectory)
1406 if err != nil {
1407 return err
1408 }
1409 defer imageDir.Close()
1410
1411
1412
1413
1414
1415 root := filepath.Join(c.root, "criu-root")
1416 if err := os.Mkdir(root, 0o755); err != nil {
1417 return err
1418 }
1419 defer os.Remove(root)
1420 root, err = filepath.EvalSymlinks(root)
1421 if err != nil {
1422 return err
1423 }
1424 err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
1425 if err != nil {
1426 return err
1427 }
1428 defer unix.Unmount(root, unix.MNT_DETACH)
1429 t := criurpc.CriuReqType_RESTORE
1430 req := &criurpc.CriuReq{
1431 Type: &t,
1432 Opts: &criurpc.CriuOpts{
1433 ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
1434 EvasiveDevices: proto.Bool(true),
1435 LogLevel: proto.Int32(4),
1436 LogFile: proto.String("restore.log"),
1437 RstSibling: proto.Bool(true),
1438 Root: proto.String(root),
1439 ManageCgroups: proto.Bool(true),
1440 NotifyScripts: proto.Bool(true),
1441 ShellJob: proto.Bool(criuOpts.ShellJob),
1442 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
1443 TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
1444 FileLocks: proto.Bool(criuOpts.FileLocks),
1445 EmptyNs: proto.Uint32(criuOpts.EmptyNs),
1446 OrphanPtsMaster: proto.Bool(true),
1447 AutoDedup: proto.Bool(criuOpts.AutoDedup),
1448 LazyPages: proto.Bool(criuOpts.LazyPages),
1449 },
1450 }
1451
1452 if criuOpts.LsmProfile != "" {
1453
1454
1455 if err := c.checkCriuVersion(31600); err != nil {
1456 return errors.New("--lsm-profile requires at least CRIU 3.16")
1457 }
1458 req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
1459 }
1460 if criuOpts.LsmMountContext != "" {
1461 if err := c.checkCriuVersion(31600); err != nil {
1462 return errors.New("--lsm-mount-context requires at least CRIU 3.16")
1463 }
1464 req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
1465 }
1466
1467 if criuOpts.WorkDirectory != "" {
1468
1469
1470 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
1471 return err
1472 }
1473 workDir, err := os.Open(criuOpts.WorkDirectory)
1474 if err != nil {
1475 return err
1476 }
1477 defer workDir.Close()
1478 req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
1479 }
1480 c.handleCriuConfigurationFile(req.Opts)
1481
1482 if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
1483 return err
1484 }
1485
1486
1487
1488 if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
1489 return err
1490 }
1491
1492 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
1493 for _, m := range c.config.Mounts {
1494 switch m.Device {
1495 case "bind":
1496 c.addCriuRestoreMount(req, m)
1497 case "cgroup":
1498 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
1499 continue
1500 }
1501
1502 binds, err := getCgroupMounts(m)
1503 if err != nil {
1504 return err
1505 }
1506 for _, b := range binds {
1507 c.addCriuRestoreMount(req, b)
1508 }
1509 }
1510 }
1511
1512 if len(c.config.MaskPaths) > 0 {
1513 m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
1514 c.addCriuRestoreMount(req, m)
1515 }
1516
1517 for _, node := range c.config.Devices {
1518 m := &configs.Mount{Destination: node.Path, Source: node.Path}
1519 c.addCriuRestoreMount(req, m)
1520 }
1521
1522 if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
1523 c.restoreNetwork(req, criuOpts)
1524 }
1525
1526
1527 if criuOpts.ManageCgroupsMode != 0 {
1528 mode := criuOpts.ManageCgroupsMode
1529 req.Opts.ManageCgroupsMode = &mode
1530 }
1531
1532 var (
1533 fds []string
1534 fdJSON []byte
1535 )
1536 if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
1537 return err
1538 }
1539
1540 if err := json.Unmarshal(fdJSON, &fds); err != nil {
1541 return err
1542 }
1543 for i := range fds {
1544 if s := fds[i]; strings.Contains(s, "pipe:") {
1545 inheritFd := new(criurpc.InheritFd)
1546 inheritFd.Key = proto.String(s)
1547 inheritFd.Fd = proto.Int32(int32(i))
1548 req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
1549 }
1550 }
1551 err = c.criuSwrk(process, req, criuOpts, extraFiles)
1552
1553
1554 for _, fd := range extraFiles {
1555 fd.Close()
1556 }
1557
1558 return err
1559 }
1560
1561 func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
1562
1563 if req.GetType() != criurpc.CriuReqType_RESTORE {
1564 return nil
1565 }
1566
1567
1568 if err := c.cgroupManager.Apply(pid); err != nil {
1569 return err
1570 }
1571
1572 if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
1573 return err
1574 }
1575
1576 if cgroups.IsCgroup2UnifiedMode() {
1577 return nil
1578 }
1579
1580
1581 path := fmt.Sprintf("/proc/%d/cgroup", pid)
1582 cgroupsPaths, err := cgroups.ParseCgroupFile(path)
1583 if err != nil {
1584 return err
1585 }
1586
1587 for c, p := range cgroupsPaths {
1588 cgroupRoot := &criurpc.CgroupRoot{
1589 Ctrl: proto.String(c),
1590 Path: proto.String(p),
1591 }
1592 req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
1593 }
1594
1595 return nil
1596 }
1597
1598 func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
1599 fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
1600 if err != nil {
1601 return err
1602 }
1603
1604 var logPath string
1605 if opts != nil {
1606 logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
1607 } else {
1608
1609
1610 logPath = ""
1611 }
1612 criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
1613 criuClientFileCon, err := net.FileConn(criuClient)
1614 criuClient.Close()
1615 if err != nil {
1616 return err
1617 }
1618
1619 criuClientCon := criuClientFileCon.(*net.UnixConn)
1620 defer criuClientCon.Close()
1621
1622 criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
1623 defer criuServer.Close()
1624
1625 args := []string{"swrk", "3"}
1626 if c.criuVersion != 0 {
1627
1628
1629 logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
1630 }
1631 cmd := exec.Command(c.criuPath, args...)
1632 if process != nil {
1633 cmd.Stdin = process.Stdin
1634 cmd.Stdout = process.Stdout
1635 cmd.Stderr = process.Stderr
1636 }
1637 cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
1638 if extraFiles != nil {
1639 cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
1640 }
1641
1642 if err := cmd.Start(); err != nil {
1643 return err
1644 }
1645
1646 criuServer.Close()
1647
1648 criuProcess := cmd.Process
1649
1650 var criuProcessState *os.ProcessState
1651 defer func() {
1652 if criuProcessState == nil {
1653 criuClientCon.Close()
1654 _, err := criuProcess.Wait()
1655 if err != nil {
1656 logrus.Warnf("wait on criuProcess returned %v", err)
1657 }
1658 }
1659 }()
1660
1661 if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
1662 return err
1663 }
1664
1665 var extFds []string
1666 if process != nil {
1667 extFds, err = getPipeFds(criuProcess.Pid)
1668 if err != nil {
1669 return err
1670 }
1671 }
1672
1673 logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
1674
1675
1676
1677
1678 if logrus.GetLevel() >= logrus.DebugLevel &&
1679 !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
1680 req.GetType() == criurpc.CriuReqType_VERSION) {
1681
1682 val := reflect.ValueOf(req.GetOpts())
1683 v := reflect.Indirect(val)
1684 for i := 0; i < v.NumField(); i++ {
1685 st := v.Type()
1686 name := st.Field(i).Name
1687 if 'A' <= name[0] && name[0] <= 'Z' {
1688 value := val.MethodByName("Get" + name).Call([]reflect.Value{})
1689 logrus.Debugf("CRIU option %s with value %v", name, value[0])
1690 }
1691 }
1692 }
1693 data, err := proto.Marshal(req)
1694 if err != nil {
1695 return err
1696 }
1697 _, err = criuClientCon.Write(data)
1698 if err != nil {
1699 return err
1700 }
1701
1702 buf := make([]byte, 10*4096)
1703 oob := make([]byte, 4096)
1704 for {
1705 n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
1706 if req.Opts != nil && req.Opts.StatusFd != nil {
1707
1708
1709
1710
1711 fd := int(*req.Opts.StatusFd)
1712 _ = unix.Close(fd)
1713 req.Opts.StatusFd = nil
1714 }
1715 if err != nil {
1716 return err
1717 }
1718 if n == 0 {
1719 return errors.New("unexpected EOF")
1720 }
1721 if n == len(buf) {
1722 return errors.New("buffer is too small")
1723 }
1724
1725 resp := new(criurpc.CriuResp)
1726 err = proto.Unmarshal(buf[:n], resp)
1727 if err != nil {
1728 return err
1729 }
1730 if !resp.GetSuccess() {
1731 typeString := req.GetType().String()
1732 return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
1733 }
1734
1735 t := resp.GetType()
1736 switch {
1737 case t == criurpc.CriuReqType_FEATURE_CHECK:
1738 logrus.Debugf("Feature check says: %s", resp)
1739 criuFeatures = resp.GetFeatures()
1740 case t == criurpc.CriuReqType_NOTIFY:
1741 if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
1742 return err
1743 }
1744 t = criurpc.CriuReqType_NOTIFY
1745 req = &criurpc.CriuReq{
1746 Type: &t,
1747 NotifySuccess: proto.Bool(true),
1748 }
1749 data, err = proto.Marshal(req)
1750 if err != nil {
1751 return err
1752 }
1753 _, err = criuClientCon.Write(data)
1754 if err != nil {
1755 return err
1756 }
1757 continue
1758 case t == criurpc.CriuReqType_RESTORE:
1759 case t == criurpc.CriuReqType_DUMP:
1760 case t == criurpc.CriuReqType_PRE_DUMP:
1761 default:
1762 return fmt.Errorf("unable to parse the response %s", resp.String())
1763 }
1764
1765 break
1766 }
1767
1768 _ = criuClientCon.CloseWrite()
1769
1770
1771 criuProcessState, err = criuProcess.Wait()
1772 if err != nil {
1773 return err
1774 }
1775
1776
1777
1778
1779
1780
1781
1782
1783 if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
1784 return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
1785 }
1786 return nil
1787 }
1788
1789
1790 func lockNetwork(config *configs.Config) error {
1791 for _, config := range config.Networks {
1792 strategy, err := getStrategy(config.Type)
1793 if err != nil {
1794 return err
1795 }
1796
1797 if err := strategy.detach(config); err != nil {
1798 return err
1799 }
1800 }
1801 return nil
1802 }
1803
1804 func unlockNetwork(config *configs.Config) error {
1805 for _, config := range config.Networks {
1806 strategy, err := getStrategy(config.Type)
1807 if err != nil {
1808 return err
1809 }
1810 if err = strategy.attach(config); err != nil {
1811 return err
1812 }
1813 }
1814 return nil
1815 }
1816
1817 func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
1818 notify := resp.GetNotify()
1819 if notify == nil {
1820 return fmt.Errorf("invalid response: %s", resp.String())
1821 }
1822 script := notify.GetScript()
1823 logrus.Debugf("notify: %s\n", script)
1824 switch script {
1825 case "post-dump":
1826 f, err := os.Create(filepath.Join(c.root, "checkpoint"))
1827 if err != nil {
1828 return err
1829 }
1830 f.Close()
1831 case "network-unlock":
1832 if err := unlockNetwork(c.config); err != nil {
1833 return err
1834 }
1835 case "network-lock":
1836 if err := lockNetwork(c.config); err != nil {
1837 return err
1838 }
1839 case "setup-namespaces":
1840 if c.config.Hooks != nil {
1841 s, err := c.currentOCIState()
1842 if err != nil {
1843 return nil
1844 }
1845 s.Pid = int(notify.GetPid())
1846
1847 if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
1848 return err
1849 }
1850 if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
1851 return err
1852 }
1853 }
1854 case "post-restore":
1855 pid := notify.GetPid()
1856
1857 p, err := os.FindProcess(int(pid))
1858 if err != nil {
1859 return err
1860 }
1861 cmd.Process = p
1862
1863 r, err := newRestoredProcess(cmd, fds)
1864 if err != nil {
1865 return err
1866 }
1867 process.ops = r
1868 if err := c.state.transition(&restoredState{
1869 imageDir: opts.ImagesDirectory,
1870 c: c,
1871 }); err != nil {
1872 return err
1873 }
1874
1875 c.created = time.Now().UTC()
1876 if _, err := c.updateState(r); err != nil {
1877 return err
1878 }
1879 if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
1880 if !os.IsNotExist(err) {
1881 logrus.Error(err)
1882 }
1883 }
1884 case "orphan-pts-master":
1885 scm, err := unix.ParseSocketControlMessage(oob)
1886 if err != nil {
1887 return err
1888 }
1889 fds, err := unix.ParseUnixRights(&scm[0])
1890 if err != nil {
1891 return err
1892 }
1893
1894 master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
1895 defer master.Close()
1896
1897
1898 if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
1899 return err
1900 }
1901 case "status-ready":
1902 if opts.StatusFd != -1 {
1903
1904 _, err := unix.Write(opts.StatusFd, []byte{0})
1905 if err != nil {
1906 logrus.Warnf("can't write \\0 to status fd: %v", err)
1907 }
1908 _ = unix.Close(opts.StatusFd)
1909 opts.StatusFd = -1
1910 }
1911 }
1912 return nil
1913 }
1914
1915 func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
1916 if process != nil {
1917 c.initProcess = process
1918 }
1919 state, err := c.currentState()
1920 if err != nil {
1921 return nil, err
1922 }
1923 err = c.saveState(state)
1924 if err != nil {
1925 return nil, err
1926 }
1927 return state, nil
1928 }
1929
1930 func (c *linuxContainer) saveState(s *State) (retErr error) {
1931 tmpFile, err := os.CreateTemp(c.root, "state-")
1932 if err != nil {
1933 return err
1934 }
1935
1936 defer func() {
1937 if retErr != nil {
1938 tmpFile.Close()
1939 os.Remove(tmpFile.Name())
1940 }
1941 }()
1942
1943 err = utils.WriteJSON(tmpFile, s)
1944 if err != nil {
1945 return err
1946 }
1947 err = tmpFile.Close()
1948 if err != nil {
1949 return err
1950 }
1951
1952 stateFilePath := filepath.Join(c.root, stateFilename)
1953 return os.Rename(tmpFile.Name(), stateFilePath)
1954 }
1955
1956 func (c *linuxContainer) currentStatus() (Status, error) {
1957 if err := c.refreshState(); err != nil {
1958 return -1, err
1959 }
1960 return c.state.status(), nil
1961 }
1962
1963
1964
1965
1966
1967 func (c *linuxContainer) refreshState() error {
1968 paused, err := c.isPaused()
1969 if err != nil {
1970 return err
1971 }
1972 if paused {
1973 return c.state.transition(&pausedState{c: c})
1974 }
1975 t := c.runType()
1976 switch t {
1977 case Created:
1978 return c.state.transition(&createdState{c: c})
1979 case Running:
1980 return c.state.transition(&runningState{c: c})
1981 }
1982 return c.state.transition(&stoppedState{c: c})
1983 }
1984
1985 func (c *linuxContainer) runType() Status {
1986 if c.initProcess == nil {
1987 return Stopped
1988 }
1989 pid := c.initProcess.pid()
1990 stat, err := system.Stat(pid)
1991 if err != nil {
1992 return Stopped
1993 }
1994 if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
1995 return Stopped
1996 }
1997
1998
1999 if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
2000 return Created
2001 }
2002 return Running
2003 }
2004
2005 func (c *linuxContainer) isPaused() (bool, error) {
2006 state, err := c.cgroupManager.GetFreezerState()
2007 if err != nil {
2008 return false, err
2009 }
2010 return state == configs.Frozen, nil
2011 }
2012
2013 func (c *linuxContainer) currentState() (*State, error) {
2014 var (
2015 startTime uint64
2016 externalDescriptors []string
2017 pid = -1
2018 )
2019 if c.initProcess != nil {
2020 pid = c.initProcess.pid()
2021 startTime, _ = c.initProcess.startTime()
2022 externalDescriptors = c.initProcess.externalDescriptors()
2023 }
2024
2025 intelRdtPath := ""
2026 if c.intelRdtManager != nil {
2027 intelRdtPath = c.intelRdtManager.GetPath()
2028 }
2029 state := &State{
2030 BaseState: BaseState{
2031 ID: c.ID(),
2032 Config: *c.config,
2033 InitProcessPid: pid,
2034 InitProcessStartTime: startTime,
2035 Created: c.created,
2036 },
2037 Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
2038 CgroupPaths: c.cgroupManager.GetPaths(),
2039 IntelRdtPath: intelRdtPath,
2040 NamespacePaths: make(map[configs.NamespaceType]string),
2041 ExternalDescriptors: externalDescriptors,
2042 }
2043 if pid > 0 {
2044 for _, ns := range c.config.Namespaces {
2045 state.NamespacePaths[ns.Type] = ns.GetPath(pid)
2046 }
2047 for _, nsType := range configs.NamespaceTypes() {
2048 if !configs.IsNamespaceSupported(nsType) {
2049 continue
2050 }
2051 if _, ok := state.NamespacePaths[nsType]; !ok {
2052 ns := configs.Namespace{Type: nsType}
2053 state.NamespacePaths[ns.Type] = ns.GetPath(pid)
2054 }
2055 }
2056 }
2057 return state, nil
2058 }
2059
2060 func (c *linuxContainer) currentOCIState() (*specs.State, error) {
2061 bundle, annotations := utils.Annotations(c.config.Labels)
2062 state := &specs.State{
2063 Version: specs.Version,
2064 ID: c.ID(),
2065 Bundle: bundle,
2066 Annotations: annotations,
2067 }
2068 status, err := c.currentStatus()
2069 if err != nil {
2070 return nil, err
2071 }
2072 state.Status = specs.ContainerState(status.String())
2073 if status != Stopped {
2074 if c.initProcess != nil {
2075 state.Pid = c.initProcess.pid()
2076 }
2077 }
2078 return state, nil
2079 }
2080
2081
2082
2083 func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
2084 paths := []string{}
2085 for _, ns := range configs.NamespaceTypes() {
2086
2087
2088 if !c.config.Namespaces.Contains(ns) {
2089 continue
2090 }
2091
2092 if p, ok := namespaces[ns]; ok && p != "" {
2093
2094 if !configs.IsNamespaceSupported(ns) {
2095 return nil, fmt.Errorf("namespace %s is not supported", ns)
2096 }
2097
2098 if _, err := os.Lstat(p); err != nil {
2099 return nil, fmt.Errorf("namespace path: %w", err)
2100 }
2101
2102
2103 if strings.ContainsRune(p, ',') {
2104 return nil, fmt.Errorf("invalid namespace path %s", p)
2105 }
2106 paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
2107 }
2108
2109 }
2110
2111 return paths, nil
2112 }
2113
2114 func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
2115 data := bytes.NewBuffer(nil)
2116 for _, im := range idMap {
2117 line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
2118 if _, err := data.WriteString(line); err != nil {
2119 return nil, err
2120 }
2121 }
2122 return data.Bytes(), nil
2123 }
2124
2125
2126
2127
// netlinkError wraps an error so that it can be carried by a panic and told
// apart from other panic values: bootstrapData recovers panics whose value
// is a netlinkError and returns the wrapped error, while any other panic
// value is re-raised.
type netlinkError struct{ error }
2129
2130
2131
2132
2133
2134
2135
2136 func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
2137
2138 r := nl.NewNetlinkRequest(int(InitMsg), 0)
2139
2140
2141
2142
2143 defer func() {
2144 if r := recover(); r != nil {
2145 if e, ok := r.(netlinkError); ok {
2146 Err = e.error
2147 } else {
2148 panic(r)
2149 }
2150 }
2151 }()
2152
2153
2154 r.AddData(&Int32msg{
2155 Type: CloneFlagsAttr,
2156 Value: uint32(cloneFlags),
2157 })
2158
2159
2160 if len(nsMaps) > 0 {
2161 nsPaths, err := c.orderNamespacePaths(nsMaps)
2162 if err != nil {
2163 return nil, err
2164 }
2165 r.AddData(&Bytemsg{
2166 Type: NsPathsAttr,
2167 Value: []byte(strings.Join(nsPaths, ",")),
2168 })
2169 }
2170
2171
2172 _, joinExistingUser := nsMaps[configs.NEWUSER]
2173 if !joinExistingUser {
2174
2175 if len(c.config.UidMappings) > 0 {
2176 if c.config.RootlessEUID && c.newuidmapPath != "" {
2177 r.AddData(&Bytemsg{
2178 Type: UidmapPathAttr,
2179 Value: []byte(c.newuidmapPath),
2180 })
2181 }
2182 b, err := encodeIDMapping(c.config.UidMappings)
2183 if err != nil {
2184 return nil, err
2185 }
2186 r.AddData(&Bytemsg{
2187 Type: UidmapAttr,
2188 Value: b,
2189 })
2190 }
2191
2192
2193 if len(c.config.GidMappings) > 0 {
2194 b, err := encodeIDMapping(c.config.GidMappings)
2195 if err != nil {
2196 return nil, err
2197 }
2198 r.AddData(&Bytemsg{
2199 Type: GidmapAttr,
2200 Value: b,
2201 })
2202 if c.config.RootlessEUID && c.newgidmapPath != "" {
2203 r.AddData(&Bytemsg{
2204 Type: GidmapPathAttr,
2205 Value: []byte(c.newgidmapPath),
2206 })
2207 }
2208 if requiresRootOrMappingTool(c.config) {
2209 r.AddData(&Boolmsg{
2210 Type: SetgroupAttr,
2211 Value: true,
2212 })
2213 }
2214 }
2215 }
2216
2217 if c.config.OomScoreAdj != nil {
2218
2219 r.AddData(&Bytemsg{
2220 Type: OomScoreAdjAttr,
2221 Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
2222 })
2223 }
2224
2225
2226 r.AddData(&Boolmsg{
2227 Type: RootlessEUIDAttr,
2228 Value: c.config.RootlessEUID,
2229 })
2230
2231
2232 if it == initStandard && c.shouldSendMountSources() {
2233 var mounts []byte
2234 for _, m := range c.config.Mounts {
2235 if m.IsBind() {
2236 if strings.IndexByte(m.Source, 0) >= 0 {
2237 return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
2238 }
2239 mounts = append(mounts, []byte(m.Source)...)
2240 }
2241 mounts = append(mounts, byte(0))
2242 }
2243
2244 r.AddData(&Bytemsg{
2245 Type: MountSourcesAttr,
2246 Value: mounts,
2247 })
2248 }
2249
2250 return bytes.NewReader(r.Serialize()), nil
2251 }
2252
2253
2254
2255
// ignoreTerminateErrors filters out errors that merely indicate the process
// is already gone. It returns nil when err is nil, is (or wraps) an
// *exec.ExitError, is (or wraps) os.ErrProcessDone, or has a message
// containing "Wait was already called". Any other error is returned
// unchanged.
func ignoreTerminateErrors(err error) error {
	var exitErr *exec.ExitError
	switch {
	case err == nil:
		return nil
	case errors.As(err, &exitErr):
		return nil
	case errors.Is(err, os.ErrProcessDone):
		return nil
	case strings.Contains(err.Error(), "Wait was already called"):
		return nil
	}
	return err
}
2277
2278 func requiresRootOrMappingTool(c *configs.Config) bool {
2279 gidMap := []configs.IDMap{
2280 {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
2281 }
2282 return !reflect.DeepEqual(c.GidMappings, gidMap)
2283 }
2284
View as plain text