init.c

Documentation: github.com/Microsoft/hcsshim/init

     1#define _GNU_SOURCE
     2#include <errno.h>
     3#include <fcntl.h>
     4#include <getopt.h>
     5#include <net/if.h>
     6#include <netinet/ip.h>
     7#include <signal.h>
     8#include <stdio.h>
     9#include <stdlib.h>
    10#include <string.h>
    11#include <sys/mount.h>
    12#include <sys/resource.h>
    13#include <sys/socket.h>
    14#include <sys/stat.h>
    15#include <sys/sysmacros.h>
    16#include <sys/types.h>
    17#include <sys/wait.h>
    18#include <unistd.h>
    19#include "../vsockexec/vsock.h"
    20
    21// musl-gcc doesn't use headers in /usr/include, so it can't find
    22// linux/random.h which is where RNDADDENTROPY is defined. We only need this
    23// single definition from linux/random.h, so we just duplicate it here as a
    24// workaround.
    25#define RNDADDENTROPY _IOW( 'R', 0x03, int [2] )
    26
    27#define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin"
    28
    29const char *const default_envp[] = {
    30    DEFAULT_PATH_ENV,
    31    NULL,
    32};
    33
    34// When nothing is passed, default to the LCOWv1 behavior.
    35const char *const default_argv[] = { "/bin/gcs", "-loglevel", "debug", "-logfile=/run/gcs/gcs.log" };
    36const char *const default_shell = "/bin/sh";
    37
    38struct Mount {
    39    const char *source, *target, *type;
    40    unsigned long flags;
    41    const void *data;
    42};
    43
    44struct Mkdir {
    45    const char *path;
    46    mode_t mode;
    47};
    48
    49struct Mknod {
    50    const char *path;
    51    mode_t mode;
    52    int major, minor;
    53};
    54
    55struct Symlink {
    56    const char *linkpath, *target;
    57};
    58
    59enum OpType {
    60    OpMount,
    61    OpMkdir,
    62    OpMknod,
    63    OpSymlink,
    64};
    65
    66struct InitOp {
    67    enum OpType op;
    68    union {
    69        struct Mount mount;
    70        struct Mkdir mkdir;
    71        struct Mknod mknod;
    72        struct Symlink symlink;
    73    };
    74};
    75
    76const struct InitOp ops[] = {
    77    // mount /proc (which should already exist)
    78    { OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    79
    80    // add symlinks in /dev (which is already mounted)
    81    { OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } },
    82    { OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } },
    83    { OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } },
    84    { OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } },
    85
    86    // mount tmpfs on /run and /tmp (which should already exist)
    87    { OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
    88    { OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    89
    90    // mount shm and devpts
    91    { OpMkdir, .mkdir = { "/dev/shm", 0755 } },
    92    { OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    93    { OpMkdir, .mkdir = { "/dev/pts", 0755 } },
    94    { OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } },
    95
    96    // mount /sys (which should already exist)
    97    { OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    98    { OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
    99};
   100
   101void warn(const char *msg) {
   102    int error = errno;
   103    perror(msg);
   104    errno = error;
   105}
   106
   107void warn2(const char *msg1, const char *msg2) {
   108    int error = errno;
   109    fputs(msg1, stderr);
   110    fputs(": ", stderr);
   111    errno = error;
   112    warn(msg2);
   113}
   114
   115_Noreturn void dien() {
   116    exit(errno);
   117}
   118
   119_Noreturn void die(const char *msg) {
   120    warn(msg);
   121    dien();
   122}
   123
   124_Noreturn void die2(const char *msg1, const char *msg2) {
   125    warn2(msg1, msg2);
   126    dien();
   127}
   128
   129void init_rlimit() {
   130    // Set the hard limit for number of open fds much larger. The kernel sets
   131    // a limit of 4096 for historical reasons, and this limit is too low for
   132    // some software. According to the systemd developers, there is no downside
   133    // to a large hard limit in modern Linux kernels.
   134    //
   135    // Retain the small soft limit of 1024 for appcompat.
   136    struct rlimit rlim = {
   137        .rlim_cur = 1024,
   138        .rlim_max = 1024 * 1024,
   139    };
   140    if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
   141        die("setrlimit(RLIMIT_NOFILE)");
   142    }
   143}
   144
   145void init_dev() {
   146    if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) {
   147        warn2("mount", "/dev");
   148        // /dev will be already mounted if devtmpfs.mount = 1 on the kernel
   149        // command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this
   150        // an error.
   151        if (errno != EBUSY) {
   152            dien();
   153        }
   154    }
   155}
   156
   157void init_fs(const struct InitOp *ops, size_t count) {
   158    for (size_t i = 0; i < count; i++) {
   159        switch (ops[i].op) {
   160        case OpMount: {
   161            const struct Mount *m = &ops[i].mount;
   162            if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) {
   163                die2("mount", m->target);
   164            }
   165            break;
   166        }
   167        case OpMkdir: {
   168            const struct Mkdir *m = &ops[i].mkdir;
   169            if (mkdir(m->path, m->mode) < 0) {
   170                warn2("mkdir", m->path);
   171                if (errno != EEXIST) {
   172                    dien();
   173                }
   174            }
   175            break;
   176        }
   177        case OpMknod: {
   178            const struct Mknod *n = &ops[i].mknod;
   179            if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) {
   180                warn2("mknod", n->path);
   181                if (errno != EEXIST) {
   182                    dien();
   183                }
   184            }
   185            break;
   186        }
   187        case OpSymlink: {
   188            const struct Symlink *sl = &ops[i].symlink;
   189            if (symlink(sl->target, sl->linkpath) < 0) {
   190                warn2("symlink", sl->linkpath);
   191                if (errno != EEXIST) {
   192                    dien();
   193                }
   194            }
   195            break;
   196        }
   197        }
   198    }
   199}
   200
   201void init_cgroups() {
   202    const char *fpath = "/proc/cgroups";
   203    FILE *f = fopen(fpath, "r");
   204    if (f == NULL) {
   205        die2("fopen", fpath);
   206    }
   207    // Skip the first line.
   208    for (;;) {
   209        char c = fgetc(f);
   210        if (c == EOF || c == '\n') {
   211            break;
   212        }
   213    }
   214    for (;;) {
   215        static const char base_path[] = "/sys/fs/cgroup/";
   216        char path[sizeof(base_path) - 1 + 64];
   217        char* name = path + sizeof(base_path) - 1;
   218        int hier, groups, enabled;
   219        int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled);
   220        if (r == EOF) {
   221            break;
   222        }
   223        if (r != 4) {
   224            errno = errno ? : EINVAL;
   225            die2("fscanf", fpath);
   226        }
   227        if (enabled) {
   228            memcpy(path, base_path, sizeof(base_path) - 1);
   229            if (mkdir(path, 0755) < 0) {
   230                die2("mkdir", path);
   231            }
   232            if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) {
   233                die2("mount", path);
   234            }
   235        }
   236    }
   237    fclose(f);
   238}
   239
   240void init_network(const char *iface, int domain) {
   241    int s = socket(domain, SOCK_DGRAM, IPPROTO_IP);
   242    if (s < 0) {
   243        if (errno == EAFNOSUPPORT) {
   244            return;
   245        }
   246        die("socket");
   247    }
   248
   249    struct ifreq request = {0};
   250    strncpy(request.ifr_name, iface, sizeof(request.ifr_name));
   251    if (ioctl(s, SIOCGIFFLAGS, &request) < 0) {
   252        die2("ioctl(SIOCGIFFLAGS)", iface);
   253    }
   254
   255    request.ifr_flags |= IFF_UP | IFF_RUNNING;
   256    if (ioctl(s, SIOCSIFFLAGS, &request) < 0) {
   257        die2("ioctl(SIOCSIFFLAGS)", iface);
   258    }
   259
   260    close(s);
   261}
   262
   263// inject boot-time entropy after reading it from a vsock port
   264void init_entropy(int port) {
   265    int s = openvsock(VMADDR_CID_HOST, port);
   266    if (s < 0) {
   267        die("openvsock entropy");
   268    }
   269
   270    int e = open("/dev/random", O_RDWR);
   271    if (e < 0) {
   272        die("open /dev/random");
   273    }
   274
   275    struct {
   276        int entropy_count;
   277        int buf_size;
   278        char buf[4096];
   279    } buf;
   280
   281    for (;;) {
   282        ssize_t n = read(s, buf.buf, sizeof(buf.buf));
   283        if (n < 0) {
   284            die("read entropy");
   285        }
   286
   287        if (n == 0) {
   288            break;
   289        }
   290
   291        buf.entropy_count = n * 8; // in bits
   292        buf.buf_size = n; // in bytes
   293        if (ioctl(e, RNDADDENTROPY, &buf) < 0) {
   294            die("ioctl(RNDADDENTROPY)");
   295        }
   296    }
   297
   298    close(s);
   299    close(e);
   300}
   301
   302pid_t launch(int argc, char **argv) {
   303    int pid = fork();
   304    if (pid != 0) {
   305        if (pid < 0) {
   306            die("fork");
   307        }
   308
   309        return pid;
   310    }
   311
   312    // Unblock signals before execing.
   313    sigset_t set;
   314    sigfillset(&set);
   315    sigprocmask(SIG_UNBLOCK, &set, 0);
   316
   317    // Create a session and process group.
   318    setsid();
   319    setpgid(0, 0);
   320
   321    // Terminate the arguments and exec.
   322    char **argvn = alloca(sizeof(argv[0]) * (argc + 1));
   323    memcpy(argvn, argv, sizeof(argv[0]) * argc);
   324    argvn[argc] = NULL;
   325    if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe
   326        die("putenv");
   327    }
   328    execvpe(argvn[0], argvn, (char**)default_envp);
   329    die2("execvpe", argvn[0]);
   330}
   331
   332int reap_until(pid_t until_pid) {
   333    for (;;) {
   334        int status;
   335        pid_t pid = wait(&status);
   336        if (pid < 0) {
   337            die("wait");
   338        }
   339
   340        if (pid == until_pid) {
   341            // The initial child process died. Pass through the exit status.
   342            if (WIFEXITED(status)) {
   343                if (WEXITSTATUS(status) != 0) {
   344                    fputs("child exited with error\n", stderr);
   345                }
   346                return WEXITSTATUS(status);
   347            }
   348            fputs("child exited by signal: ", stderr);
   349            fputs(strsignal(WTERMSIG(status)), stderr);
   350            fputs("\n", stderr);
   351            return 128 + WTERMSIG(status);
   352        }
   353    }
   354}
   355
   356int main(int argc, char **argv) {
   357    char *debug_shell = NULL;
   358    int entropy_port = 0;
   359    if (argc <= 1) {
   360        argv = (char **)default_argv;
   361        argc = sizeof(default_argv) / sizeof(default_argv[0]);
   362        optind = 0;
   363        debug_shell = (char*)default_shell;
   364    } else {
   365        for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0; ) {
   366            switch (opt) {
   367            case 'd':
   368                debug_shell = optarg;
   369                break;
   370
   371            case 'e':
   372                entropy_port = atoi(optarg);
   373                if (entropy_port == 0) {
   374                    fputs("invalid entropy port\n", stderr);
   375                    exit(1);
   376                }
   377
   378                break;
   379
   380            default:
   381                exit(1);
   382            }
   383        }
   384    }
   385
   386    char **child_argv = argv + optind;
   387    int child_argc = argc - optind;
   388
   389    // Block all signals in init. SIGCHLD will still cause wait() to return.
   390    sigset_t set;
   391    sigfillset(&set);
   392    sigprocmask(SIG_BLOCK, &set, 0);
   393
   394    init_rlimit();
   395    init_dev();
   396    init_fs(ops, sizeof(ops) / sizeof(ops[0]));
   397    init_cgroups();
   398    init_network("lo", AF_INET);
   399    init_network("lo", AF_INET6);
   400    if (entropy_port != 0) {
   401        init_entropy(entropy_port);
   402    }
   403
   404    pid_t pid = launch(child_argc, child_argv);
   405    if (debug_shell != NULL) {
   406        // The debug shell takes over as the primary child.
   407        pid = launch(1, &debug_shell);
   408    }
   409
   410    // Reap until the initial child process dies.
   411    return reap_until(pid);
   412}
View as plain text