1#define _GNU_SOURCE
2#include <errno.h>
3#include <fcntl.h>
4#include <getopt.h>
5#include <net/if.h>
6#include <netinet/ip.h>
7#include <signal.h>
8#include <stdio.h>
9#include <stdlib.h>
10#include <string.h>
11#include <sys/mount.h>
12#include <sys/resource.h>
13#include <sys/socket.h>
14#include <sys/stat.h>
15#include <sys/sysmacros.h>
16#include <sys/types.h>
17#include <sys/wait.h>
18#include <unistd.h>
19#include "../vsockexec/vsock.h"
20
// RNDADDENTROPY normally comes from linux/random.h, but musl-gcc does not
// search /usr/include and therefore cannot find that header. It is the only
// definition needed from there, so it is reproduced here as a workaround.
#define RNDADDENTROPY _IOW('R', 0x03, int[2])
26
// Default PATH, written as a complete "NAME=value" environment entry.
#define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin"

// NULL-terminated environment block that contains only the default PATH.
const char *const default_envp[] = { DEFAULT_PATH_ENV, NULL };
33
// Command line used when init is started without arguments (the LCOWv1
// behavior). The array is deliberately not NULL-terminated: its element
// count is taken with sizeof where it is used.
const char *const default_argv[] = {
    "/bin/gcs",
    "-loglevel",
    "debug",
    "-logfile=/run/gcs/gcs.log",
};

// Shell that is started as the debug shell in that same no-argument case.
const char *const default_shell = "/bin/sh";
37
// Arguments for a single mount(2) call, in the same order as that call's
// parameters. Instances are built with positional initializers, so the
// member order must stay as it is.
struct Mount {
    const char *source;  // device or pseudo-filesystem name
    const char *target;  // mount point
    const char *type;    // filesystem type
    unsigned long flags; // MS_* flags
    const void *data;    // filesystem-specific options, or NULL
};
43
// Arguments for a single mkdir(2) call.
struct Mkdir {
    const char *path; // directory to create
    mode_t mode;      // permission bits for the new directory
};
48
// Arguments for a single mknod(2) call; the device number is assembled
// from the major/minor pair at the point of use.
struct Mknod {
    const char *path; // node to create
    mode_t mode;      // file type and permission bits
    int major;        // device major number
    int minor;        // device minor number
};
54
// Description of one symbolic link. Note the member order: the link's own
// path comes first and the path it points at second, which is the reverse
// of symlink(2)'s argument order.
struct Symlink {
    const char *linkpath; // where the link is created
    const char *target;   // what the link points to
};
58
// Discriminator for struct InitOp: selects which union member is valid.
enum OpType {
    OpMount,   // perform a mount
    OpMkdir,   // create a directory
    OpMknod,   // create a device node
    OpSymlink, // create a symbolic link
};
65
66struct InitOp {
67 enum OpType op;
68 union {
69 struct Mount mount;
70 struct Mkdir mkdir;
71 struct Mknod mknod;
72 struct Symlink symlink;
73 };
74};
75
76const struct InitOp ops[] = {
77 // mount /proc (which should already exist)
78 { OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
79
80 // add symlinks in /dev (which is already mounted)
81 { OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } },
82 { OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } },
83 { OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } },
84 { OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } },
85
86 // mount tmpfs on /run and /tmp (which should already exist)
87 { OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
88 { OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
89
90 // mount shm and devpts
91 { OpMkdir, .mkdir = { "/dev/shm", 0755 } },
92 { OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
93 { OpMkdir, .mkdir = { "/dev/pts", 0755 } },
94 { OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } },
95
96 // mount /sys (which should already exist)
97 { OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
98 { OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
99};
100
// Print msg followed by the description of the current errno to stderr
// (via perror). errno is the same on return as it was on entry, so callers
// can still inspect it afterwards.
void warn(const char *msg) {
    const int saved_errno = errno;
    perror(msg);
    errno = saved_errno;
}
106
// Like warn(), but with a two-part prefix: prints "msg1: " and then lets
// warn() print msg2 with the errno description. errno is captured before
// the prefix is written and put back before warn() runs, so the stderr
// writes cannot change which error gets reported.
void warn2(const char *msg1, const char *msg2) {
    const int saved_errno = errno;

    fputs(msg1, stderr);
    fputs(": ", stderr);

    errno = saved_errno;
    warn(msg2);
}
114
// Terminate the process, using the current errno value as the exit status.
// Never returns.
_Noreturn void dien() {
    const int status = errno;
    exit(status);
}
118
// Report msg together with the current errno description, then terminate
// the process with errno as the exit status. Never returns.
_Noreturn void die(const char *msg) {
    warn(msg); // warn() leaves errno intact for dien()
    dien();
}
123
// Two-part variant of die(): reports "msg1: msg2: <errno description>" and
// then terminates the process with errno as the exit status. Never returns.
_Noreturn void die2(const char *msg1, const char *msg2) {
    warn2(msg1, msg2); // warn2() leaves errno intact for dien()
    dien();
}
128
// Raise the hard limit on open file descriptors while keeping the
// traditional soft limit. Terminates the process if setrlimit fails.
//
// The kernel's historical hard limit of 4096 is too low for some software,
// and according to the systemd developers a large hard limit has no
// downside on modern Linux kernels. The soft limit stays at 1024 for
// application compatibility.
void init_rlimit() {
    struct rlimit nofile = {0};
    nofile.rlim_cur = 1024;        // soft limit
    nofile.rlim_max = 1024 * 1024; // hard limit

    if (setrlimit(RLIMIT_NOFILE, &nofile) >= 0) {
        return;
    }
    die("setrlimit(RLIMIT_NOFILE)");
}
144
// Mount devtmpfs on /dev. Any failure is reported on stderr; every failure
// except EBUSY also terminates the process.
void init_dev() {
    if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) >= 0) {
        return;
    }

    // warn2() leaves errno untouched, so it can still be examined below.
    warn2("mount", "/dev");

    // /dev is already mounted when devtmpfs.mount = 1 is on the kernel
    // command line or CONFIG_DEVTMPFS_MOUNT is set; that is not an error.
    if (errno == EBUSY) {
        return;
    }
    dien();
}
156
157void init_fs(const struct InitOp *ops, size_t count) {
158 for (size_t i = 0; i < count; i++) {
159 switch (ops[i].op) {
160 case OpMount: {
161 const struct Mount *m = &ops[i].mount;
162 if (mount(m->source, m->target, m->type, m->flags, m->data) < 0) {
163 die2("mount", m->target);
164 }
165 break;
166 }
167 case OpMkdir: {
168 const struct Mkdir *m = &ops[i].mkdir;
169 if (mkdir(m->path, m->mode) < 0) {
170 warn2("mkdir", m->path);
171 if (errno != EEXIST) {
172 dien();
173 }
174 }
175 break;
176 }
177 case OpMknod: {
178 const struct Mknod *n = &ops[i].mknod;
179 if (mknod(n->path, n->mode, makedev(n->major, n->minor)) < 0) {
180 warn2("mknod", n->path);
181 if (errno != EEXIST) {
182 dien();
183 }
184 }
185 break;
186 }
187 case OpSymlink: {
188 const struct Symlink *sl = &ops[i].symlink;
189 if (symlink(sl->target, sl->linkpath) < 0) {
190 warn2("symlink", sl->linkpath);
191 if (errno != EEXIST) {
192 dien();
193 }
194 }
195 break;
196 }
197 }
198 }
199}
200
// Mount every enabled cgroup controller listed in /proc/cgroups at
// /sys/fs/cgroup/<name>.
//
// The first line of /proc/cgroups is skipped as a header. Each remaining
// line is parsed as "<name> <hierarchy> <num_cgroups> <enabled>"; for every
// line whose last field is non-zero, a directory is created and a cgroup
// filesystem is mounted on it with the controller name as mount data.
// Any open, parse, mkdir or mount failure terminates the process.
void init_cgroups() {
    const char *fpath = "/proc/cgroups";
    FILE *f = fopen(fpath, "r");
    if (f == NULL) {
        die2("fopen", fpath);
    }

    // Skip the first line. fgetc() returns an int so that EOF can be told
    // apart from every possible byte value; storing the result in a char
    // breaks the EOF comparison where char is unsigned.
    for (;;) {
        int c = fgetc(f);
        if (c == EOF || c == '\n') {
            break;
        }
    }

    for (;;) {
        static const char base_path[] = "/sys/fs/cgroup/";
        // "%64s" stores up to 64 characters plus a terminating NUL, so the
        // name region after the prefix needs 65 bytes.
        char path[sizeof(base_path) - 1 + 64 + 1];
        char *name = path + sizeof(base_path) - 1;
        int hier, groups, enabled;

        // Clear errno so a malformed line is reported as EINVAL instead of
        // whatever an earlier call happened to leave behind.
        errno = 0;
        int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled);
        if (r == EOF) {
            break;
        }
        if (r != 4) {
            if (errno == 0) {
                errno = EINVAL;
            }
            die2("fscanf", fpath);
        }

        if (enabled) {
            // Put the prefix in front of the already-parsed name to form
            // the full mount point path.
            memcpy(path, base_path, sizeof(base_path) - 1);
            if (mkdir(path, 0755) < 0) {
                die2("mkdir", path);
            }
            if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) {
                die2("mount", path);
            }
        }
    }
    fclose(f);
}
239
// Bring the network interface `iface` up, using a datagram socket of
// address family `domain` as the ioctl handle.
//
// If the address family is not supported (EAFNOSUPPORT) the function
// returns without doing anything. Every other failure terminates the
// process.
void init_network(const char *iface, int domain) {
    int s = socket(domain, SOCK_DGRAM, IPPROTO_IP);
    if (s < 0) {
        if (errno == EAFNOSUPPORT) {
            return;
        }
        die("socket");
    }

    struct ifreq request = {0};
    // Copy at most sizeof - 1 bytes: strncpy() does not add a terminator
    // when the source fills the buffer, and the struct is zero-initialized,
    // so leaving the last byte alone keeps ifr_name NUL-terminated.
    strncpy(request.ifr_name, iface, sizeof(request.ifr_name) - 1);
    if (ioctl(s, SIOCGIFFLAGS, &request) < 0) {
        die2("ioctl(SIOCGIFFLAGS)", iface);
    }

    // Add the up/running bits to whatever flags are already set.
    request.ifr_flags |= IFF_UP | IFF_RUNNING;
    if (ioctl(s, SIOCSIFFLAGS, &request) < 0) {
        die2("ioctl(SIOCSIFFLAGS)", iface);
    }

    close(s);
}
262
263// inject boot-time entropy after reading it from a vsock port
264void init_entropy(int port) {
265 int s = openvsock(VMADDR_CID_HOST, port);
266 if (s < 0) {
267 die("openvsock entropy");
268 }
269
270 int e = open("/dev/random", O_RDWR);
271 if (e < 0) {
272 die("open /dev/random");
273 }
274
275 struct {
276 int entropy_count;
277 int buf_size;
278 char buf[4096];
279 } buf;
280
281 for (;;) {
282 ssize_t n = read(s, buf.buf, sizeof(buf.buf));
283 if (n < 0) {
284 die("read entropy");
285 }
286
287 if (n == 0) {
288 break;
289 }
290
291 buf.entropy_count = n * 8; // in bits
292 buf.buf_size = n; // in bytes
293 if (ioctl(e, RNDADDENTROPY, &buf) < 0) {
294 die("ioctl(RNDADDENTROPY)");
295 }
296 }
297
298 close(s);
299 close(e);
300}
301
302pid_t launch(int argc, char **argv) {
303 int pid = fork();
304 if (pid != 0) {
305 if (pid < 0) {
306 die("fork");
307 }
308
309 return pid;
310 }
311
312 // Unblock signals before execing.
313 sigset_t set;
314 sigfillset(&set);
315 sigprocmask(SIG_UNBLOCK, &set, 0);
316
317 // Create a session and process group.
318 setsid();
319 setpgid(0, 0);
320
321 // Terminate the arguments and exec.
322 char **argvn = alloca(sizeof(argv[0]) * (argc + 1));
323 memcpy(argvn, argv, sizeof(argv[0]) * argc);
324 argvn[argc] = NULL;
325 if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe
326 die("putenv");
327 }
328 execvpe(argvn[0], argvn, (char**)default_envp);
329 die2("execvpe", argvn[0]);
330}
331
// Reap terminated children until the one with pid `until_pid` has been
// collected, then translate its wait status into an exit code:
//   - normal exit: the child's own exit status (a note is printed to
//     stderr when it is non-zero);
//   - otherwise:   128 + the terminating signal number, after printing the
//     signal's description to stderr.
// A failing wait() terminates the process.
int reap_until(pid_t until_pid) {
    int status;
    pid_t reaped;

    // Keep collecting children; everything other than until_pid is simply
    // discarded.
    do {
        reaped = wait(&status);
        if (reaped < 0) {
            die("wait");
        }
    } while (reaped != until_pid);

    // The initial child process died. Pass through the exit status.
    if (!WIFEXITED(status)) {
        const int sig = WTERMSIG(status);
        fputs("child exited by signal: ", stderr);
        fputs(strsignal(sig), stderr);
        fputs("\n", stderr);
        return 128 + sig;
    }

    const int code = WEXITSTATUS(status);
    if (code != 0) {
        fputs("child exited with error\n", stderr);
    }
    return code;
}
355
356int main(int argc, char **argv) {
357 char *debug_shell = NULL;
358 int entropy_port = 0;
359 if (argc <= 1) {
360 argv = (char **)default_argv;
361 argc = sizeof(default_argv) / sizeof(default_argv[0]);
362 optind = 0;
363 debug_shell = (char*)default_shell;
364 } else {
365 for (int opt; (opt = getopt(argc, argv, "+d:e:")) >= 0; ) {
366 switch (opt) {
367 case 'd':
368 debug_shell = optarg;
369 break;
370
371 case 'e':
372 entropy_port = atoi(optarg);
373 if (entropy_port == 0) {
374 fputs("invalid entropy port\n", stderr);
375 exit(1);
376 }
377
378 break;
379
380 default:
381 exit(1);
382 }
383 }
384 }
385
386 char **child_argv = argv + optind;
387 int child_argc = argc - optind;
388
389 // Block all signals in init. SIGCHLD will still cause wait() to return.
390 sigset_t set;
391 sigfillset(&set);
392 sigprocmask(SIG_BLOCK, &set, 0);
393
394 init_rlimit();
395 init_dev();
396 init_fs(ops, sizeof(ops) / sizeof(ops[0]));
397 init_cgroups();
398 init_network("lo", AF_INET);
399 init_network("lo", AF_INET6);
400 if (entropy_port != 0) {
401 init_entropy(entropy_port);
402 }
403
404 pid_t pid = launch(child_argc, child_argv);
405 if (debug_shell != NULL) {
406 // The debug shell takes over as the primary child.
407 pid = launch(1, &debug_shell);
408 }
409
410 // Reap until the initial child process dies.
411 return reap_until(pid);
412}
View as plain text