#define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../vsockexec/vsock.h" // musl-gcc doesn't use headers in /usr/include, so it can't find // linux/random.h which is where RNDADDENTROPY is defined. We only need this // single definition from linux/random.h, so we just duplicate it here as a // workaround. #define RNDADDENTROPY _IOW( 'R', 0x03, int [2] ) #define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin" const char *const default_envp[] = { DEFAULT_PATH_ENV, NULL, }; // When nothing is passed, default to the LCOWv1 behavior. const char *const default_argv[] = { "/bin/gcs", "-loglevel", "debug", "-logfile=/run/gcs/gcs.log" }; const char *const default_shell = "/bin/sh"; struct Mount { const char *source, *target, *type; unsigned long flags; const void *data; }; struct Mkdir { const char *path; mode_t mode; }; struct Mknod { const char *path; mode_t mode; int major, minor; }; struct Symlink { const char *linkpath, *target; }; enum OpType { OpMount, OpMkdir, OpMknod, OpSymlink, }; struct InitOp { enum OpType op; union { struct Mount mount; struct Mkdir mkdir; struct Mknod mknod; struct Symlink symlink; }; }; const struct InitOp ops[] = { // mount /proc (which should already exist) { OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, // add symlinks in /dev (which is already mounted) { OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } }, { OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } }, { OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } }, { OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } }, // mount tmpfs on /run and /tmp (which should already exist) { OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } }, { OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, // mount shm and devpts { OpMkdir, .mkdir = { "/dev/shm", 0755 } }, { OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, { OpMkdir, .mkdir = { "/dev/pts", 0755 } }, { OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } }, // mount /sys (which should already exist) { OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } }, { OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } }, }; void warn(const char *msg) { int error = errno; perror(msg); errno = error; } void warn2(const char *msg1, const char *msg2) { int error = errno; fputs(msg1, stderr); fputs(": ", stderr); errno = error; warn(msg2); } _Noreturn void dien() { exit(errno); } _Noreturn void die(const char *msg) { warn(msg); dien(); } _Noreturn void die2(const char *msg1, const char *msg2) { warn2(msg1, msg2); dien(); } void init_rlimit() { // Set the hard limit for number of open fds much larger. The kernel sets // a limit of 4096 for historical reasons, and this limit is too low for // some software. According to the systemd developers, there is no downside // to a large hard limit in modern Linux kernels. // // Retain the small soft limit of 1024 for appcompat. struct rlimit rlim = { .rlim_cur = 1024, .rlim_max = 1024 * 1024, }; if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { die("setrlimit(RLIMIT_NOFILE)"); } } void init_dev() { if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) { warn2("mount", "/dev"); // /dev will be already mounted if devtmpfs.mount = 1 on the kernel // command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this // an error. if (errno != EBUSY) { dien(); } } } void init_fs(const struct InitOp *ops, size_t count) { for (size_t i = 0; i < count; i++) { switch (ops[i].op) { case OpMount: { const struct Mount *m = &ops[i].mount; if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) { die2("mount", m->target); } break; } case OpMkdir: { const struct Mkdir *m = &ops[i].mkdir; if (mkdir(m->path, m->mode) < 0) { warn2("mkdir", m->path); if (errno != EEXIST) { dien(); } } break; } case OpMknod: { const struct Mknod *n = &ops[i].mknod; if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) { warn2("mknod", n->path); if (errno != EEXIST) { dien(); } } break; } case OpSymlink: { const struct Symlink *sl = &ops[i].symlink; if (symlink(sl->target, sl->linkpath) < 0) { warn2("symlink", sl->linkpath); if (errno != EEXIST) { dien(); } } break; } } } } void init_cgroups() { const char *fpath = "/proc/cgroups"; FILE *f = fopen(fpath, "r"); if (f == NULL) { die2("fopen", fpath); } // Skip the first line. for (;;) { char c = fgetc(f); if (c == EOF || c == '\n') { break; } } for (;;) { static const char base_path[] = "/sys/fs/cgroup/"; char path[sizeof(base_path) - 1 + 64]; char* name = path + sizeof(base_path) - 1; int hier, groups, enabled; int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled); if (r == EOF) { break; } if (r != 4) { errno = errno ? : EINVAL; die2("fscanf", fpath); } if (enabled) { memcpy(path, base_path, sizeof(base_path) - 1); if (mkdir(path, 0755) < 0) { die2("mkdir", path); } if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) { die2("mount", path); } } } fclose(f); } void init_network(const char *iface, int domain) { int s = socket(domain, SOCK_DGRAM, IPPROTO_IP); if (s < 0) { if (errno == EAFNOSUPPORT) { return; } die("socket"); } struct ifreq request = {0}; strncpy(request.ifr_name, iface, sizeof(request.ifr_name)); if (ioctl(s, SIOCGIFFLAGS, &request) < 0) { die2("ioctl(SIOCGIFFLAGS)", iface); } request.ifr_flags |= IFF_UP | IFF_RUNNING; if (ioctl(s, SIOCSIFFLAGS, &request) < 0) { die2("ioctl(SIOCSIFFLAGS)", iface); } close(s); } // inject boot-time entropy after reading it from a vsock port void init_entropy(int port) { int s = openvsock(VMADDR_CID_HOST, port); if (s < 0) { die("openvsock entropy"); } int e = open("/dev/random", O_RDWR); if (e < 0) { die("open /dev/random"); } struct { int entropy_count; int buf_size; char buf[4096]; } buf; for (;;) { ssize_t n = read(s, buf.buf, sizeof(buf.buf)); if (n < 0) { die("read entropy"); } if (n == 0) { break; } buf.entropy_count = n * 8; // in bits buf.buf_size = n; // in bytes if (ioctl(e, RNDADDENTROPY, &buf) < 0) { die("ioctl(RNDADDENTROPY)"); } } close(s); close(e); } pid_t launch(int argc, char **argv) { int pid = fork(); if (pid != 0) { if (pid < 0) { die("fork"); } return pid; } // Unblock signals before execing. sigset_t set; sigfillset(&set); sigprocmask(SIG_UNBLOCK, &set, 0); // Create a session and process group. setsid(); setpgid(0, 0); // Terminate the arguments and exec. char **argvn = alloca(sizeof(argv[0]) * (argc + 1)); memcpy(argvn, argv, sizeof(argv[0]) * argc); argvn[argc] = NULL; if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe die("putenv"); } execvpe(argvn[0], argvn, (char**)default_envp); die2("execvpe", argvn[0]); } int reap_until(pid_t until_pid) { for (;;) { int status; pid_t pid = wait(&status); if (pid < 0) { die("wait"); } if (pid == until_pid) { // The initial child process died. Pass through the exit status. if (WIFEXITED(status)) { if (WEXITSTATUS(status) != 0) { fputs("child exited with error\n", stderr); } return WEXITSTATUS(status); } fputs("child exited by signal: ", stderr); fputs(strsignal(WTERMSIG(status)), stderr); fputs("\n", stderr); return 128 + WTERMSIG(status); } } } int main(int argc, char **argv) { char *debug_shell = NULL; int entropy_port = 0; if (argc <= 1) { argv = (char **)default_argv; argc = sizeof(default_argv) / sizeof(default_argv[0]); optind = 0; debug_shell = (char*)default_shell; } else { for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0; ) { switch (opt) { case 'd': debug_shell = optarg; break; case 'e': entropy_port = atoi(optarg); if (entropy_port == 0) { fputs("invalid entropy port\n", stderr); exit(1); } break; default: exit(1); } } } char **child_argv = argv + optind; int child_argc = argc - optind; // Block all signals in init. SIGCHLD will still cause wait() to return. sigset_t set; sigfillset(&set); sigprocmask(SIG_BLOCK, &set, 0); init_rlimit(); init_dev(); init_fs(ops, sizeof(ops) / sizeof(ops[0])); init_cgroups(); init_network("lo", AF_INET); init_network("lo", AF_INET6); if (entropy_port != 0) { init_entropy(entropy_port); } pid_t pid = launch(child_argc, child_argv); if (debug_shell != NULL) { // The debug shell takes over as the primary child. pid = launch(1, &debug_shell); } // Reap until the initial child process dies. return reap_until(pid); }