#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "../vsockexec/vsock.h"

// musl-gcc doesn't use headers in /usr/include, so it can't find
// linux/random.h which is where RNDADDENTROPY is defined. We only need this
// single definition from linux/random.h, so we just duplicate it here as a
// workaround.
#define RNDADDENTROPY _IOW( 'R', 0x03, int [2] )

#define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin"

const char *const default_envp[] = {
    DEFAULT_PATH_ENV,
    NULL,
};

// When nothing is passed, default to the LCOWv1 behavior.
const char *const default_argv[] = { "/bin/gcs", "-loglevel", "debug", "-logfile=/run/gcs/gcs.log" };
const char *const default_shell = "/bin/sh";

struct Mount {
    const char *source, *target, *type;
    unsigned long flags;
    const void *data;
};

struct Mkdir {
    const char *path;
    mode_t mode;
};

struct Mknod {
    const char *path;
    mode_t mode;
    int major, minor;
};

struct Symlink {
    const char *linkpath, *target;
};

enum OpType {
    OpMount,
    OpMkdir,
    OpMknod,
    OpSymlink,
};

struct InitOp {
    enum OpType op;
    union {
        struct Mount mount;
        struct Mkdir mkdir;
        struct Mknod mknod;
        struct Symlink symlink;
    };
};

const struct InitOp ops[] = {
    // mount /proc (which should already exist)
    { OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } },

    // add symlinks in /dev (which is already mounted)
    { OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } },
    { OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } },
    { OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } },
    { OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } },

    // mount tmpfs on /run and /tmp (which should already exist)
    { OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
    { OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },

    // mount shm and devpts
    { OpMkdir, .mkdir = { "/dev/shm", 0755 } },
    { OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    { OpMkdir, .mkdir = { "/dev/pts", 0755 } },
    { OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } },

    // mount /sys (which should already exist)
    { OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
    { OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
};

void warn(const char *msg) {
    int error = errno;
    perror(msg);
    errno = error;
}

void warn2(const char *msg1, const char *msg2) {
    int error = errno;
    fputs(msg1, stderr);
    fputs(": ", stderr);
    errno = error;
    warn(msg2);
}

_Noreturn void dien() {
    exit(errno);
}

_Noreturn void die(const char *msg) {
    warn(msg);
    dien();
}

_Noreturn void die2(const char *msg1, const char *msg2) {
    warn2(msg1, msg2);
    dien();
}

void init_rlimit() {
    // Set the hard limit for number of open fds much larger. The kernel sets
    // a limit of 4096 for historical reasons, and this limit is too low for
    // some software. According to the systemd developers, there is no downside
    // to a large hard limit in modern Linux kernels.
    //
    // Retain the small soft limit of 1024 for appcompat.
    struct rlimit rlim = {
        .rlim_cur = 1024,
        .rlim_max = 1024 * 1024,
    };
    if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
        die("setrlimit(RLIMIT_NOFILE)");
    }
}

void init_dev() {
    if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) {
        warn2("mount", "/dev");
        // /dev will be already mounted if devtmpfs.mount = 1 on the kernel
        // command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this
        // an error.
        if (errno != EBUSY) {
            dien();
        }
    }
}

void init_fs(const struct InitOp *ops, size_t count) {
    for (size_t i = 0; i < count; i++) {
        switch (ops[i].op) {
        case OpMount: {
            const struct Mount *m = &ops[i].mount;
            if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) {
                die2("mount", m->target);
            }
            break;
        }
        case OpMkdir: {
            const struct Mkdir *m = &ops[i].mkdir;
            if (mkdir(m->path, m->mode) < 0) {
                warn2("mkdir", m->path);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        case OpMknod: {
            const struct Mknod *n = &ops[i].mknod;
            if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) {
                warn2("mknod", n->path);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        case OpSymlink: {
            const struct Symlink *sl = &ops[i].symlink;
            if (symlink(sl->target, sl->linkpath) < 0) {
                warn2("symlink", sl->linkpath);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        }
    }
}

void init_cgroups() {
    const char *fpath = "/proc/cgroups";
    FILE *f = fopen(fpath, "r");
    if (f == NULL) {
        die2("fopen", fpath);
    }
    // Skip the first line.
    for (;;) {
        char c = fgetc(f);
        if (c == EOF || c == '\n') {
            break;
        }
    }
    for (;;) {
        static const char base_path[] = "/sys/fs/cgroup/";
        char path[sizeof(base_path) - 1 + 64];
        char* name = path + sizeof(base_path) - 1;
        int hier, groups, enabled;
        int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled);
        if (r == EOF) {
            break;
        }
        if (r != 4) {
            errno = errno ? : EINVAL;
            die2("fscanf", fpath);
        }
        if (enabled) {
            memcpy(path, base_path, sizeof(base_path) - 1);
            if (mkdir(path, 0755) < 0) {
                die2("mkdir", path);
            }
            if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) {
                die2("mount", path);
            }
        }
    }
    fclose(f);
}

void init_network(const char *iface, int domain) {
    int s = socket(domain, SOCK_DGRAM, IPPROTO_IP);
    if (s < 0) {
        if (errno == EAFNOSUPPORT) {
            return;
        }
        die("socket");
    }

    struct ifreq request = {0};
    strncpy(request.ifr_name, iface, sizeof(request.ifr_name));
    if (ioctl(s, SIOCGIFFLAGS, &request) < 0) {
        die2("ioctl(SIOCGIFFLAGS)", iface);
    }

    request.ifr_flags |= IFF_UP | IFF_RUNNING;
    if (ioctl(s, SIOCSIFFLAGS, &request) < 0) {
        die2("ioctl(SIOCSIFFLAGS)", iface);
    }

    close(s);
}

// inject boot-time entropy after reading it from a vsock port
void init_entropy(int port) {
    int s = openvsock(VMADDR_CID_HOST, port);
    if (s < 0) {
        die("openvsock entropy");
    }

    int e = open("/dev/random", O_RDWR);
    if (e < 0) {
        die("open /dev/random");
    }

    struct {
        int entropy_count;
        int buf_size;
        char buf[4096];
    } buf;

    for (;;) {
        ssize_t n = read(s, buf.buf, sizeof(buf.buf));
        if (n < 0) {
            die("read entropy");
        }

        if (n == 0) {
            break;
        }

        buf.entropy_count = n * 8; // in bits
        buf.buf_size = n; // in bytes
        if (ioctl(e, RNDADDENTROPY, &buf) < 0) {
            die("ioctl(RNDADDENTROPY)");
        }
    }

    close(s);
    close(e);
}

pid_t launch(int argc, char **argv) {
    int pid = fork();
    if (pid != 0) {
        if (pid < 0) {
            die("fork");
        }

        return pid;
    }

    // Unblock signals before execing.
    sigset_t set;
    sigfillset(&set);
    sigprocmask(SIG_UNBLOCK, &set, 0);

    // Create a session and process group.
    setsid();
    setpgid(0, 0);

    // Terminate the arguments and exec.
    char **argvn = alloca(sizeof(argv[0]) * (argc + 1));
    memcpy(argvn, argv, sizeof(argv[0]) * argc);
    argvn[argc] = NULL;
    if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe
        die("putenv");
    }
    execvpe(argvn[0], argvn, (char**)default_envp);
    die2("execvpe", argvn[0]);
}

int reap_until(pid_t until_pid) {
    for (;;) {
        int status;
        pid_t pid = wait(&status);
        if (pid < 0) {
            die("wait");
        }

        if (pid == until_pid) {
            // The initial child process died. Pass through the exit status.
            if (WIFEXITED(status)) {
                if (WEXITSTATUS(status) != 0) {
                    fputs("child exited with error\n", stderr);
                }
                return WEXITSTATUS(status);
            }
            fputs("child exited by signal: ", stderr);
            fputs(strsignal(WTERMSIG(status)), stderr);
            fputs("\n", stderr);
            return 128 + WTERMSIG(status);
        }
    }
}

int main(int argc, char **argv) {
    char *debug_shell = NULL;
    int entropy_port = 0;
    if (argc <= 1) {
        argv = (char **)default_argv;
        argc = sizeof(default_argv) / sizeof(default_argv[0]);
        optind = 0;
        debug_shell = (char*)default_shell;
    } else {
        for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0; ) {
            switch (opt) {
            case 'd':
                debug_shell = optarg;
                break;

            case 'e':
                entropy_port = atoi(optarg);
                if (entropy_port == 0) {
                    fputs("invalid entropy port\n", stderr);
                    exit(1);
                }

                break;

            default:
                exit(1);
            }
        }
    }

    char **child_argv = argv + optind;
    int child_argc = argc - optind;

    // Block all signals in init. SIGCHLD will still cause wait() to return.
    sigset_t set;
    sigfillset(&set);
    sigprocmask(SIG_BLOCK, &set, 0);

    init_rlimit();
    init_dev();
    init_fs(ops, sizeof(ops) / sizeof(ops[0]));
    init_cgroups();
    init_network("lo", AF_INET);
    init_network("lo", AF_INET6);
    if (entropy_port != 0) {
        init_entropy(entropy_port);
    }

    pid_t pid = launch(child_argc, child_argv);
    if (debug_shell != NULL) {
        // The debug shell takes over as the primary child.
        pid = launch(1, &debug_shell);
    }

    // Reap until the initial child process dies.
    return reap_until(pid);
}