1 package libcontainer
2
3 import (
4 "encoding/json"
5 "errors"
6 "fmt"
7 "io"
8 "net"
9 "os"
10 "os/exec"
11 "path/filepath"
12 "strconv"
13 "time"
14
15 "github.com/opencontainers/runc/libcontainer/cgroups"
16 "github.com/opencontainers/runc/libcontainer/cgroups/fs2"
17 "github.com/opencontainers/runc/libcontainer/configs"
18 "github.com/opencontainers/runc/libcontainer/intelrdt"
19 "github.com/opencontainers/runc/libcontainer/logs"
20 "github.com/opencontainers/runc/libcontainer/system"
21 "github.com/opencontainers/runc/libcontainer/utils"
22 "github.com/opencontainers/runtime-spec/specs-go"
23 "github.com/sirupsen/logrus"
24 "golang.org/x/sys/unix"
25 )
26
// parentProcess is the set of operations this file implements for both
// setnsProcess and initProcess.
type parentProcess interface {
	// pid returns the pid of the process currently tracked by the
	// implementation.
	pid() int

	// start starts the process and performs the setup handshake with it.
	start() error

	// terminate kills the process and then waits for it to exit.
	terminate() error

	// wait waits for the process to exit and returns its process state.
	wait() (*os.ProcessState, error)

	// startTime returns the start time of the process.
	startTime() (uint64, error)

	// signal sends the given signal to the process.
	signal(os.Signal) error

	// externalDescriptors returns the descriptor list previously stored
	// with setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors stores fds as the descriptor list of the
	// process.
	setExternalDescriptors(fds []string)

	// forwardChildLogs starts forwarding the logs read from the parent's
	// end of the log pair and returns the channel produced by the
	// forwarder.
	forwardChildLogs() chan error
}
47
// filePair holds the two ends of a file pair shared between this process
// (parent) and the process it spawns (child).
type filePair struct {
	// parent is the end used by this process.
	parent *os.File
	// child is the end meant for the spawned process; the start methods
	// close it in this process right after cmd.Start().
	child *os.File
}
52
// setnsProcess is the parentProcess implementation whose start method
// launches cmd, moves the resulting process into the cgroups listed in
// cgroupPaths and sends it config.
type setnsProcess struct {
	// cmd is the command being run; execSetns replaces cmd.Process with the
	// final child reported over the message socket.
	cmd *exec.Cmd
	// messageSockPair carries bootstrap data, the config and sync messages.
	messageSockPair filePair
	// logFilePair is the pair whose parent end feeds forwardChildLogs.
	logFilePair filePair
	// cgroupPaths lists the cgroup directories the process is written into.
	cgroupPaths map[string]string
	// rootlessCgroups, when true, makes cgroup join failures non-fatal.
	rootlessCgroups bool
	// manager is queried for the OOM kill count in start.
	manager cgroups.Manager
	// intelRdtPath, if non-empty and existing, is the Intel RDT path the
	// process is added to.
	intelRdtPath string
	// config is written as JSON to the process over the message socket.
	config *initConfig
	// fds is the descriptor list exposed through externalDescriptors.
	fds []string
	// process is the Process whose ops field is pointed at this value once
	// the final child is known.
	process *Process
	// bootstrapData, if non-nil, is copied to the message socket right
	// after the command has been started.
	bootstrapData io.Reader
	// initProcessPid is the pid of the container's init process; when
	// non-zero it is used for the cgroup v2 fallback in start and reported
	// in the seccomp container state.
	initProcessPid int
}
67
68 func (p *setnsProcess) startTime() (uint64, error) {
69 stat, err := system.Stat(p.pid())
70 return stat.StartTime, err
71 }
72
73 func (p *setnsProcess) signal(sig os.Signal) error {
74 s, ok := sig.(unix.Signal)
75 if !ok {
76 return errors.New("os: unsupported signal type")
77 }
78 return unix.Kill(p.pid(), s)
79 }
80
81 func (p *setnsProcess) start() (retErr error) {
82 defer p.messageSockPair.parent.Close()
83
84 oom, _ := p.manager.OOMKillCount()
85 err := p.cmd.Start()
86
87 p.messageSockPair.child.Close()
88 p.logFilePair.child.Close()
89 if err != nil {
90 return fmt.Errorf("error starting setns process: %w", err)
91 }
92
93 waitInit := initWaiter(p.messageSockPair.parent)
94 defer func() {
95 if retErr != nil {
96 if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
97
98 retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
99 }
100 werr := <-waitInit
101 if werr != nil {
102 logrus.WithError(werr).Warn()
103 }
104 err := ignoreTerminateErrors(p.terminate())
105 if err != nil {
106 logrus.WithError(err).Warn("unable to terminate setnsProcess")
107 }
108 }
109 }()
110
111 if p.bootstrapData != nil {
112 if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
113 return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
114 }
115 }
116 err = <-waitInit
117 if err != nil {
118 return err
119 }
120 if err := p.execSetns(); err != nil {
121 return fmt.Errorf("error executing setns process: %w", err)
122 }
123 for _, path := range p.cgroupPaths {
124 if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
125
126
127
128 if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
129 initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
130 initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
131 if initCgErr == nil {
132 if initCgPath, ok := initCg[""]; ok {
133 initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
134 logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
135 p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
136
137 err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
138 }
139 }
140 }
141 if err != nil {
142 return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
143 }
144 }
145 }
146 if p.intelRdtPath != "" {
147
148 _, err := os.Stat(p.intelRdtPath)
149 if err == nil {
150 if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
151 return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
152 }
153 }
154 }
155
156
157 if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
158 return fmt.Errorf("error setting rlimits for process: %w", err)
159 }
160 if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
161 return fmt.Errorf("error writing config to pipe: %w", err)
162 }
163
164 ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
165 switch sync.Type {
166 case procReady:
167
168 panic("unexpected procReady in setns")
169 case procHooks:
170
171 panic("unexpected procHooks in setns")
172 case procSeccomp:
173 if p.config.Config.Seccomp.ListenerPath == "" {
174 return errors.New("listenerPath is not set")
175 }
176
177 seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
178 if err != nil {
179 return err
180 }
181 defer unix.Close(seccompFd)
182
183 bundle, annotations := utils.Annotations(p.config.Config.Labels)
184 containerProcessState := &specs.ContainerProcessState{
185 Version: specs.Version,
186 Fds: []string{specs.SeccompFdName},
187 Pid: p.cmd.Process.Pid,
188 Metadata: p.config.Config.Seccomp.ListenerMetadata,
189 State: specs.State{
190 Version: specs.Version,
191 ID: p.config.ContainerId,
192 Status: specs.StateRunning,
193 Pid: p.initProcessPid,
194 Bundle: bundle,
195 Annotations: annotations,
196 },
197 }
198 if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
199 containerProcessState, seccompFd); err != nil {
200 return err
201 }
202
203
204 if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
205 return err
206 }
207 return nil
208 default:
209 return errors.New("invalid JSON payload from child")
210 }
211 })
212
213 if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
214 return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
215 }
216
217 if ierr != nil {
218 _, _ = p.wait()
219 return ierr
220 }
221 return nil
222 }
223
224
225
226
227
228 func (p *setnsProcess) execSetns() error {
229 status, err := p.cmd.Process.Wait()
230 if err != nil {
231 _ = p.cmd.Wait()
232 return fmt.Errorf("error waiting on setns process to finish: %w", err)
233 }
234 if !status.Success() {
235 _ = p.cmd.Wait()
236 return &exec.ExitError{ProcessState: status}
237 }
238 var pid *pid
239 if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
240 _ = p.cmd.Wait()
241 return fmt.Errorf("error reading pid from init pipe: %w", err)
242 }
243
244
245
246 firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
247
248
249 _, _ = firstChildProcess.Wait()
250
251 process, err := os.FindProcess(pid.Pid)
252 if err != nil {
253 return err
254 }
255 p.cmd.Process = process
256 p.process.ops = p
257 return nil
258 }
259
260
261
262 func (p *setnsProcess) terminate() error {
263 if p.cmd.Process == nil {
264 return nil
265 }
266 err := p.cmd.Process.Kill()
267 if _, werr := p.wait(); err == nil {
268 err = werr
269 }
270 return err
271 }
272
273 func (p *setnsProcess) wait() (*os.ProcessState, error) {
274 err := p.cmd.Wait()
275
276
277 return p.cmd.ProcessState, err
278 }
279
280 func (p *setnsProcess) pid() int {
281 return p.cmd.Process.Pid
282 }
283
284 func (p *setnsProcess) externalDescriptors() []string {
285 return p.fds
286 }
287
288 func (p *setnsProcess) setExternalDescriptors(newFds []string) {
289 p.fds = newFds
290 }
291
292 func (p *setnsProcess) forwardChildLogs() chan error {
293 return logs.ForwardLogs(p.logFilePair.parent)
294 }
295
// initProcess is the parentProcess implementation whose start method
// launches cmd, applies cgroup and Intel RDT configuration to it and runs
// the sync protocol that leads to the container's created state.
type initProcess struct {
	// cmd is the command being run; waitForChildExit replaces cmd.Process
	// with the final child reported over the message socket.
	cmd *exec.Cmd
	// messageSockPair carries bootstrap data, the config and sync messages.
	messageSockPair filePair
	// logFilePair is the pair whose parent end feeds forwardChildLogs.
	logFilePair filePair
	// config is sent as JSON to the process by sendConfig.
	config *initConfig
	// manager applies, sets and destroys the cgroup configuration.
	manager cgroups.Manager
	// intelRdtManager, if non-nil, does the same for Intel RDT.
	intelRdtManager *intelrdt.Manager
	// container is the container whose state start updates.
	container *linuxContainer
	// fds is the descriptor list exposed through externalDescriptors.
	fds []string
	// process is the Process whose ops field is pointed at this value.
	process *Process
	// bootstrapData is copied to the message socket once cgroups have been
	// applied.
	bootstrapData io.Reader
	// sharePidns, when true, makes wait send SIGKILL to all processes
	// known to manager after the command has exited.
	sharePidns bool
}
309
310 func (p *initProcess) pid() int {
311 return p.cmd.Process.Pid
312 }
313
314 func (p *initProcess) externalDescriptors() []string {
315 return p.fds
316 }
317
318
319 func (p *initProcess) getChildPid() (int, error) {
320 var pid pid
321 if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
322 _ = p.cmd.Wait()
323 return -1, err
324 }
325
326
327
328 firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
329
330
331 _, _ = firstChildProcess.Wait()
332
333 return pid.Pid, nil
334 }
335
336 func (p *initProcess) waitForChildExit(childPid int) error {
337 status, err := p.cmd.Process.Wait()
338 if err != nil {
339 _ = p.cmd.Wait()
340 return err
341 }
342 if !status.Success() {
343 _ = p.cmd.Wait()
344 return &exec.ExitError{ProcessState: status}
345 }
346
347 process, err := os.FindProcess(childPid)
348 if err != nil {
349 return err
350 }
351 p.cmd.Process = process
352 p.process.ops = p
353 return nil
354 }
355
356 func (p *initProcess) start() (retErr error) {
357 defer p.messageSockPair.parent.Close()
358 err := p.cmd.Start()
359 p.process.ops = p
360
361 _ = p.messageSockPair.child.Close()
362 _ = p.logFilePair.child.Close()
363 if err != nil {
364 p.process.ops = nil
365 return fmt.Errorf("unable to start init: %w", err)
366 }
367
368 waitInit := initWaiter(p.messageSockPair.parent)
369 defer func() {
370 if retErr != nil {
371
372
373
374 oom, err := p.manager.OOMKillCount()
375 if err != nil {
376 logrus.WithError(err).Warn("unable to get oom kill count")
377 } else if oom > 0 {
378
379
380 const oomError = "container init was OOM-killed (memory limit too low?)"
381
382 if logrus.GetLevel() >= logrus.DebugLevel {
383
384
385 retErr = fmt.Errorf(oomError+": %w", retErr)
386 } else {
387 retErr = errors.New(oomError)
388 }
389 }
390
391 werr := <-waitInit
392 if werr != nil {
393 logrus.WithError(werr).Warn()
394 }
395
396
397 if err := ignoreTerminateErrors(p.terminate()); err != nil {
398 logrus.WithError(err).Warn("unable to terminate initProcess")
399 }
400
401 _ = p.manager.Destroy()
402 if p.intelRdtManager != nil {
403 _ = p.intelRdtManager.Destroy()
404 }
405 }
406 }()
407
408
409
410
411 if err := p.manager.Apply(p.pid()); err != nil {
412 return fmt.Errorf("unable to apply cgroup configuration: %w", err)
413 }
414 if p.intelRdtManager != nil {
415 if err := p.intelRdtManager.Apply(p.pid()); err != nil {
416 return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
417 }
418 }
419 if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
420 return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
421 }
422 err = <-waitInit
423 if err != nil {
424 return err
425 }
426
427 childPid, err := p.getChildPid()
428 if err != nil {
429 return fmt.Errorf("can't get final child's PID from pipe: %w", err)
430 }
431
432
433
434
435 fds, err := getPipeFds(childPid)
436 if err != nil {
437 return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
438 }
439 p.setExternalDescriptors(fds)
440
441
442 if err := p.waitForChildExit(childPid); err != nil {
443 return fmt.Errorf("error waiting for our first child to exit: %w", err)
444 }
445
446 if err := p.createNetworkInterfaces(); err != nil {
447 return fmt.Errorf("error creating network interfaces: %w", err)
448 }
449 if err := p.updateSpecState(); err != nil {
450 return fmt.Errorf("error updating spec state: %w", err)
451 }
452 if err := p.sendConfig(); err != nil {
453 return fmt.Errorf("error sending config to init process: %w", err)
454 }
455 var (
456 sentRun bool
457 sentResume bool
458 )
459
460 ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
461 switch sync.Type {
462 case procSeccomp:
463 if p.config.Config.Seccomp.ListenerPath == "" {
464 return errors.New("listenerPath is not set")
465 }
466
467 seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
468 if err != nil {
469 return err
470 }
471 defer unix.Close(seccompFd)
472
473 s, err := p.container.currentOCIState()
474 if err != nil {
475 return err
476 }
477
478
479 s.Pid = p.cmd.Process.Pid
480 s.Status = specs.StateCreating
481 containerProcessState := &specs.ContainerProcessState{
482 Version: specs.Version,
483 Fds: []string{specs.SeccompFdName},
484 Pid: s.Pid,
485 Metadata: p.config.Config.Seccomp.ListenerMetadata,
486 State: *s,
487 }
488 if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
489 containerProcessState, seccompFd); err != nil {
490 return err
491 }
492
493
494 if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
495 return err
496 }
497 case procReady:
498
499
500 if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
501 return fmt.Errorf("error setting rlimits for ready process: %w", err)
502 }
503
504 if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
505
506 if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
507 return fmt.Errorf("error setting cgroup config for ready process: %w", err)
508 }
509 if p.intelRdtManager != nil {
510 if err := p.intelRdtManager.Set(p.config.Config); err != nil {
511 return fmt.Errorf("error setting Intel RDT config for ready process: %w", err)
512 }
513 }
514
515 if len(p.config.Config.Hooks) != 0 {
516 s, err := p.container.currentOCIState()
517 if err != nil {
518 return err
519 }
520
521 s.Pid = p.cmd.Process.Pid
522 s.Status = specs.StateCreating
523 hooks := p.config.Config.Hooks
524
525 if err := hooks[configs.Prestart].RunHooks(s); err != nil {
526 return err
527 }
528 if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
529 return err
530 }
531 }
532 }
533
534
535 p.container.created = time.Now().UTC()
536 p.container.state = &createdState{
537 c: p.container,
538 }
539
540
541
542
543
544
545
546
547
548
549 state, uerr := p.container.updateState(p)
550 if uerr != nil {
551 return fmt.Errorf("unable to store init state: %w", err)
552 }
553 p.container.initProcessStartTime = state.InitProcessStartTime
554
555
556 if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
557 return err
558 }
559 sentRun = true
560 case procHooks:
561
562 if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
563 return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
564 }
565 if p.intelRdtManager != nil {
566 if err := p.intelRdtManager.Set(p.config.Config); err != nil {
567 return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
568 }
569 }
570 if len(p.config.Config.Hooks) != 0 {
571 s, err := p.container.currentOCIState()
572 if err != nil {
573 return err
574 }
575
576 s.Pid = p.cmd.Process.Pid
577 s.Status = specs.StateCreating
578 hooks := p.config.Config.Hooks
579
580 if err := hooks[configs.Prestart].RunHooks(s); err != nil {
581 return err
582 }
583 if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil {
584 return err
585 }
586 }
587
588 if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
589 return err
590 }
591 sentResume = true
592 default:
593 return errors.New("invalid JSON payload from child")
594 }
595
596 return nil
597 })
598
599 if !sentRun {
600 return fmt.Errorf("error during container init: %w", ierr)
601 }
602 if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
603 return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")
604 }
605 if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
606 return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
607 }
608
609
610 if ierr != nil {
611 _, _ = p.wait()
612 return ierr
613 }
614 return nil
615 }
616
617 func (p *initProcess) wait() (*os.ProcessState, error) {
618 err := p.cmd.Wait()
619
620 if p.sharePidns {
621 _ = signalAllProcesses(p.manager, unix.SIGKILL)
622 }
623 return p.cmd.ProcessState, err
624 }
625
626 func (p *initProcess) terminate() error {
627 if p.cmd.Process == nil {
628 return nil
629 }
630 err := p.cmd.Process.Kill()
631 if _, werr := p.wait(); err == nil {
632 err = werr
633 }
634 return err
635 }
636
637 func (p *initProcess) startTime() (uint64, error) {
638 stat, err := system.Stat(p.pid())
639 return stat.StartTime, err
640 }
641
642 func (p *initProcess) updateSpecState() error {
643 s, err := p.container.currentOCIState()
644 if err != nil {
645 return err
646 }
647
648 p.config.SpecState = s
649 return nil
650 }
651
652 func (p *initProcess) sendConfig() error {
653
654
655
656 return utils.WriteJSON(p.messageSockPair.parent, p.config)
657 }
658
659 func (p *initProcess) createNetworkInterfaces() error {
660 for _, config := range p.config.Config.Networks {
661 strategy, err := getStrategy(config.Type)
662 if err != nil {
663 return err
664 }
665 n := &network{
666 Network: *config,
667 }
668 if err := strategy.create(n, p.pid()); err != nil {
669 return err
670 }
671 p.config.Networks = append(p.config.Networks, n)
672 }
673 return nil
674 }
675
676 func (p *initProcess) signal(sig os.Signal) error {
677 s, ok := sig.(unix.Signal)
678 if !ok {
679 return errors.New("os: unsupported signal type")
680 }
681 return unix.Kill(p.pid(), s)
682 }
683
684 func (p *initProcess) setExternalDescriptors(newFds []string) {
685 p.fds = newFds
686 }
687
688 func (p *initProcess) forwardChildLogs() chan error {
689 return logs.ForwardLogs(p.logFilePair.parent)
690 }
691
692 func recvSeccompFd(childPid, childFd uintptr) (int, error) {
693 pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
694 if errno != 0 {
695 return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
696 }
697 defer unix.Close(int(pidfd))
698
699 seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
700 if errno != 0 {
701 return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
702 }
703
704 return int(seccompFd), nil
705 }
706
707 func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
708 conn, err := net.Dial("unix", listenerPath)
709 if err != nil {
710 return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
711 }
712
713 socket, err := conn.(*net.UnixConn).File()
714 if err != nil {
715 return fmt.Errorf("cannot get seccomp socket: %w", err)
716 }
717 defer socket.Close()
718
719 b, err := json.Marshal(state)
720 if err != nil {
721 return fmt.Errorf("cannot marshall seccomp state: %w", err)
722 }
723
724 err = utils.SendFds(socket, b, fd)
725 if err != nil {
726 return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
727 }
728
729 return nil
730 }
731
// getPipeFds returns the symlink targets of /proc/<pid>/fd/0, 1 and 2, in
// that order. An entry whose link cannot be read because of a permission
// error is left as the empty string. Any other error aborts the scan and is
// returned together with the slice filled in so far (always of length 3).
func getPipeFds(pid int) ([]string, error) {
	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")

	targets := make([]string, 3)
	for fd := range targets {
		link := filepath.Join(fdDir, strconv.Itoa(fd))
		target, err := os.Readlink(link)
		switch {
		case err == nil:
			targets[fd] = target
		case os.IsPermission(err):
			// Permission errors are tolerated; the entry stays empty.
		default:
			return targets, err
		}
	}
	return targets, nil
}
754
755
756
757
758
759 func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
760 var fds []uintptr
761 i = &IO{}
762
763 defer func() {
764 if err != nil {
765 for _, fd := range fds {
766 _ = unix.Close(int(fd))
767 }
768 }
769 }()
770
771 r, w, err := os.Pipe()
772 if err != nil {
773 return nil, err
774 }
775 fds = append(fds, r.Fd(), w.Fd())
776 p.Stdin, i.Stdin = r, w
777
778 if r, w, err = os.Pipe(); err != nil {
779 return nil, err
780 }
781 fds = append(fds, r.Fd(), w.Fd())
782 p.Stdout, i.Stdout = w, r
783
784 if r, w, err = os.Pipe(); err != nil {
785 return nil, err
786 }
787 fds = append(fds, r.Fd(), w.Fd())
788 p.Stderr, i.Stderr = w, r
789
790 for _, fd := range fds {
791 if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
792 return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
793 }
794 }
795 return i, nil
796 }
797
798
799
// initWaiter starts a goroutine that reads a single byte from r and reports
// the outcome on the returned channel, which is buffered (capacity 1) and
// closed once the result has been sent.
//
// A zero byte yields nil. Anything else yields an error prefixed with
// "waiting for init preliminary setup": a non-zero byte, a read error, or a
// read that returned neither data nor an error ("short read").
//
// Following the io.Reader contract, a byte that arrives together with a
// non-nil error (for example io.EOF) is examined before the error is
// considered, so a valid zero byte is not rejected in that case.
func initWaiter(r io.Reader) chan error {
	ch := make(chan error, 1)
	go func() {
		defer close(ch)

		inited := make([]byte, 1)
		n, err := r.Read(inited)
		switch {
		case n > 0 && inited[0] == 0:
			// Got the expected byte; any accompanying error is irrelevant.
			ch <- nil
			return
		case n > 0:
			err = fmt.Errorf("unexpected %d != 0", inited[0])
		case err == nil:
			err = errors.New("short read")
		}
		ch <- fmt.Errorf("waiting for init preliminary setup: %w", err)
	}()

	return ch
}
822
View as plain text