mooa

Lua + libev + sandboxing
git clone https://code.literati.org/mooa.git
Log | Files | Refs | README | LICENSE

commit 7df84f224e8f9f6a59c5ec1e7594a6ccb839243f
parent eb667788c6800fa49600b3b35773f368ae60064a
Author: Sean Lynch <seanl@literati.org>
Date:   Mon, 23 Jun 2014 17:45:40 -0700

Add SUID jail

Diffstat:
M Makefile   |  33 ++++++++++++++++++++++++++-------
A suidjail.c | 522 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 548 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
@@ -1,15 +1,34 @@
-CC=gcc
-CFLAGS=-ggdb3 -Wall -Werror -pedantic -Wno-strict-aliasing -O2 $(shell pkg-config --cflags lua)
+CC = clang
+CFLAGS += -std=c99 -g -O0 \
+    -fPIE -fstack-protector-strong \
+    -DVERSION=\"$(shell git describe)\"
 LINT=splint
-LIBS=$(shell pkg-config --libs lua) -lev
+LUA_CFLAGS=$(shell pkg-config --cflags lua)
+LUA_LIBS=$(shell pkg-config --libs lua) -lev
+OBJECTS=mooa.o dns.o http.o socket.o task.o utils.o http_parser.o
+LDLIBS = -lseccomp
+LDFLAGS += -pie -Wl,--as-needed,-z,relro,-z,now
+ifeq ($(CC), clang)
+    CFLAGS += -Weverything \
+        -Wno-documentation \
+        -Wno-shift-sign-overflow \
+        -Wno-padded \
+        -Wno-disabled-macro-expansion \
+        -Wno-pedantic \
+        -Wno-assign-enum
+else
+    CFLAGS += -Wall -Wextra
+endif
 
-all: mooa
+all: mooa suidjail
 
-mooa: mooa.o dns.o http.o socket.o task.o utils.o http_parser.o
+suidjail: suidjail.c
+
+mooa: $(OBJECTS)
-	${CC} -o $@ ${LIBS} $^ -ludns
+	${CC} -o $@ ${LUA_LIBS} $^ -ludns
 
-.c.o:
-	${CC} ${CFLAGS} -c $<
+clean:
+	rm -f mooa suidjail $(OBJECTS)
 
 lint: mooa.c http.c socket.c task.c utils.c
 	splint -checks -posix-lib -mustfreeonly $^
diff --git a/suidjail.c b/suidjail.c
@@ -0,0 +1,575 @@
+#define _GNU_SOURCE
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <getopt.h>
+#include <dirent.h>
+#include <err.h>
+#include <errno.h>
+#include <linux/limits.h>
+#include <pwd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <sys/ptrace.h>
+#include <sys/signalfd.h>
+#include <sys/reg.h>
+#include <sys/wait.h>
+
+#include <seccomp.h>
+
+// Byte offset of the syscall-number register in the tracee's USER area.
+#if defined(__i386__)
+# define REG_SYSCALL (4 * ORIG_EAX)
+#elif defined(__x86_64__)
+# define REG_SYSCALL (8 * ORIG_RAX)
+#else
+# error "Platform does not support seccomp filter yet"
+#endif
+
+// Exit with the errno message when `rc` is negative. Every caller passes the
+// result of ptrace, waitid or syscall, which return -1 and report the cause
+// through errno rather than through the return value.
+static void check(long rc) {
+    if (rc < 0) err(EXIT_FAILURE, "system call failed");
+}
+
+// fopen that exits with a diagnostic instead of returning NULL.
+static FILE *fopenx(const char *path, const char *mode) {
+    FILE *f = fopen(path, mode);
+    if (!f) err(EXIT_FAILURE, "failed to open %s", path);
+    return f;
+}
+
+// mount that exits with a diagnostic on failure.
+static void mountx(const char *source, const char *target,
+                   const char *filesystemtype, unsigned long mountflags,
+                   const void *data) {
+    if (mount(source, target, filesystemtype, mountflags, data) < 0)
+        err(1, "mounting %s failed", target);
+}
+
+// Replace the contents of `path` with `string`, exiting if the write fails.
+static void write_to(const char *path, const char *string) {
+    FILE *fp = fopenx(path, "w");
+    if (fputs(string, fp) == EOF)
+        err(EXIT_FAILURE, "failed to write %s", path);
+    // stdio buffers the data, so a write the kernel rejects may only be
+    // reported when fclose flushes it.
+    if (fclose(fp) == EOF)
+        err(EXIT_FAILURE, "failed to write %s", path);
+}
+
+// Register `fd` with `epoll_fd` for edge-triggered readability events.
+static void epoll_watch(int epoll_fd, int fd) {
+    struct epoll_event event = {0};
+    event.data.fd = fd;
+    event.events = EPOLLIN | EPOLLET;
+
+    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) < 0)
+        err(1, "epoll_ctl");
+}
+
+// Copy everything currently readable on `in_fd` to `out_fd`. Returns at end
+// of file, or when a non-blocking `in_fd` has no more data (EAGAIN).
+// This could often use `splice`, but it will not always work with `stdout` and `stderr`.
+static void copy_pipe_to(int in_fd, int out_fd) {
+    uint8_t buffer[BUFSIZ];
+    for (;;) {
+        ssize_t n = read(in_fd, buffer, sizeof buffer);
+        if (n == -1) {
+            if (errno == EAGAIN) return;
+            err(EXIT_FAILURE, "read");
+        }
+        if (n == 0) return;
+
+        // write may accept fewer bytes than requested, so keep going until
+        // the whole chunk has been written.
+        for (ssize_t done = 0; done < n;) {
+            ssize_t w = write(out_fd, buffer + done, (size_t)(n - done));
+            if (w == -1) err(EXIT_FAILURE, "write");
+            done += w;
+        }
+    }
+}
+
+// Resolve a syscall name to its number, exiting if the name is unknown.
+static int get_syscall_nr(const char *name) {
+    int result = seccomp_syscall_resolve_name(name);
+    if (result == __NR_SCMP_ERROR) {
+        errx(EXIT_FAILURE, "non-existent syscall: %s", name);
+    }
+    return result;
+}
+
+// Print the usage text to `out` and exit; the exit status is a failure only
+// when the text goes to stderr.
+__attribute__((noreturn)) static void usage(FILE *out) {
+    fprintf(out, "usage: %s [options] [command ...]\n", program_invocation_short_name);
+    fputs("Options:\n"
+          " -h, --help display this help\n"
+          " -v, --version display version\n"
+          " -u, --user=USER the user to run the program as\n"
+          " -n, --hostname=NAME the hostname to set the container to\n"
+          " -m, --memory-limit=LIMIT the memory limit of the container\n"
+          " -s, --syscalls=LIST comma-separated whitelist of syscalls\n"
+          " --syscalls-file=PATH whitelist file containing one syscall name per line\n",
+          out);
+
+    exit(out == stderr ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+
+// Add O_NONBLOCK to the file status flags of `fd`.
+static void set_non_blocking(int fd) {
+    int flags = fcntl(fd, F_GETFL, 0);
+    if (flags == -1) err(EXIT_FAILURE, "fcntl");
+    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
+        err(EXIT_FAILURE, "fcntl");
+}
+
+// Mark any extra file descriptors `CLOEXEC`. Only `stdin`, `stdout` and `stderr` are left open.
+static void prevent_leaked_file_descriptors(void) {
+    DIR *dir = opendir("/proc/self/fd");
+    if (!dir) err(EXIT_FAILURE, "opendir");
+    struct dirent *dp;
+    while ((dp = readdir(dir))) {
+        char *end;
+        int fd = (int)strtol(dp->d_name, &end, 10);
+        // Skip "." and "..", the standard streams and the descriptor that
+        // backs this directory listing (closedir releases that one).
+        if (*end == '\0' && fd > 2 && fd != dirfd(dir)) {
+            if (ioctl(fd, FIOCLEX) == -1) err(EXIT_FAILURE, "ioctl");
+        }
+    }
+    closedir(dir);
+}
+
+// Parse `s` as a non-negative decimal integer, exiting with a message that
+// names `what` when it is empty, malformed, negative or out of range.
+static long strtolx_positive(const char *s, const char *what) {
+    char *end;
+    errno = 0;
+    long result = strtol(s, &end, 10);
+    if (errno) errx(EXIT_FAILURE, "%s is too large", what);
+    // end == s means no digits were consumed, e.g. an empty string
+    if (end == s || *end != '\0' || result < 0)
+        errx(EXIT_FAILURE, "%s must be a positive integer", what);
+    return result;
+}
+
+// Create a pipe whose read end is non-blocking.
+static void child_pipe(int pipefd[2]) {
+    if (pipe(pipefd) < 0) {
+        err(EXIT_FAILURE, "pipe");
+    }
+    set_non_blocking(pipefd[0]);
+}
+
+// Move the calling process into per-jail memory and devices cgroups named
+// after `ppid`, cap the memory at `memory_limit` and deny all devices. Uses
+// the cgroup v1 file layout (memory.limit_in_bytes, devices.deny).
+static void init_cgroup(pid_t ppid, const char *memory_limit) {
+    char path[PATH_MAX];
+
+    if (mkdir("/sys/fs/cgroup/memory/mooa", 0755) < 0 && errno != EEXIST) {
+        err(EXIT_FAILURE, "failed to create memory cgroup");
+    }
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/memory/mooa/%jd", (intmax_t)ppid);
+    if (mkdir(path, 0755) < 0 && errno != EEXIST) {
+        err(EXIT_FAILURE, "failed to create memory cgroup");
+    }
+
+    // Writing 0 to cgroup.procs moves the writing process into the group.
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/memory/mooa/%jd/cgroup.procs", (intmax_t)ppid);
+    write_to(path, "0");
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/memory/mooa/%jd/memory.limit_in_bytes", (intmax_t)ppid);
+    write_to(path, memory_limit);
+
+    if (mkdir("/sys/fs/cgroup/devices/mooa", 0755) < 0 && errno != EEXIST) {
+        err(EXIT_FAILURE, "failed to create device cgroup");
+    }
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/devices/mooa/%jd", (intmax_t)ppid);
+    if (mkdir(path, 0755) < 0 && errno != EEXIST) {
+        err(EXIT_FAILURE, "failed to create device cgroup");
+    }
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/devices/mooa/%jd/cgroup.procs", (intmax_t)ppid);
+    write_to(path, "0");
+
+    // "a" denies every device; nothing is added back to devices.allow.
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/devices/mooa/%jd/devices.deny", (intmax_t)ppid);
+    write_to(path, "a");
+}
+
+// atexit handler: kill every process left in this jail's cgroup and remove
+// the per-jail cgroup directories. It only warns on failure because calling
+// exit again from inside an atexit handler is undefined behaviour.
+static void kill_group(void) {
+    pid_t pid = getpid();
+    char path[PATH_MAX];
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/memory/mooa/%jd/cgroup.procs", (intmax_t)pid);
+
+    bool done = false;
+    do {
+        FILE *proc = fopen(path, "r");
+        if (!proc) {
+            // ENOENT: the child never got as far as creating the cgroup
+            if (errno != ENOENT) warn("failed to open %s", path);
+            break;
+        }
+        int p; // read into an int so the object type matches %d
+        done = true;
+        while (fscanf(proc, "%d", &p) == 1) {
+            kill((pid_t)p, SIGKILL);
+            done = false;
+        }
+        fclose(proc);
+    } while (!done);
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/memory/mooa/%jd", (intmax_t)pid);
+    if (rmdir(path) < 0 && errno != ENOENT) {
+        warn("failed to remove %s", path);
+    }
+
+    snprintf(path, PATH_MAX, "/sys/fs/cgroup/devices/mooa/%jd", (intmax_t)pid);
+    if (rmdir(path) < 0 && errno != ENOENT) {
+        warn("failed to remove %s", path);
+    }
+}
+
+// Called when a traced process reports a SIGSTOP: set the ptrace options on
+// it and let it continue.
+static void trace_child(pid_t pid) {
+    check(ptrace(PTRACE_SETOPTIONS, pid, NULL,
+                 PTRACE_O_EXITKILL | PTRACE_O_TRACECLONE |
+                 PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
+                 PTRACE_O_TRACEEXEC |
+                 PTRACE_O_TRACESECCOMP));
+    check(ptrace(PTRACE_CONT, pid, NULL, NULL));
+}
+
+// Handle a ptrace event stop of `pid`: resume it after clone/fork/vfork/exec
+// events, and exit when it made a syscall that seccomp flagged for tracing
+// or reported an event this program does not expect.
+static void trace_trap(pid_t pid) {
+    siginfo_t si;
+    check(ptrace(PTRACE_GETSIGINFO, pid, NULL, &si));
+    if (si.si_signo != SIGTRAP || (si.si_code & SIGTRAP) != SIGTRAP) {
+        errx(EXIT_FAILURE, "Expected SIGTRAP, got signal %d and code %d",
+             si.si_signo, si.si_code);
+    }
+
+    // ptrace event stops carry the event number above the low byte of si_code
+    int event = si.si_code >> 8;
+    long nr;
+    char *name;
+    unsigned long msg;
+    switch (event) {
+    case PTRACE_EVENT_CLONE:
+    case PTRACE_EVENT_FORK:
+    case PTRACE_EVENT_VFORK:
+        check(ptrace(PTRACE_GETEVENTMSG, pid, NULL, &msg));
+        check(ptrace(PTRACE_CONT, pid, NULL, NULL));
+        break;
+    case PTRACE_EVENT_EXEC:
+        check(ptrace(PTRACE_CONT, pid, NULL, NULL));
+        break;
+    case PTRACE_EVENT_SECCOMP:
+        // PEEKUSER returns the register value itself, so a failure can only
+        // be told apart through errno; 0 is a valid syscall number (read on
+        // x86_64) and must not be treated as an error.
+        errno = 0;
+        nr = ptrace(PTRACE_PEEKUSER, pid, REG_SYSCALL, NULL);
+        if (nr == -1 && errno != 0) {
+            err(EXIT_FAILURE, "Failed to get syscall from child %d", pid);
+        }
+        // the lookup returns NULL for numbers it cannot name
+        name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, (int)nr);
+        errx(EXIT_FAILURE, "Child %d called disallowed syscall %s(%ld)",
+             pid, name ? name : "unknown", nr);
+    default:
+        errx(EXIT_FAILURE, "Got trace event %d", event);
+    }
+}
+
+// Entry point: parse the options, clone the command into fresh IPC, mount,
+// PID, UTS and network namespaces, then supervise it through ptrace until
+// it exits.
+int main(int argc, char **argv) {
+    prevent_leaked_file_descriptors();
+
+    // NOTE(review): username, devices, syscalls and syscalls_from_file are
+    // filled in below but nothing in this file reads them yet, so no
+    // privilege drop or seccomp filter is applied -- confirm before relying
+    // on this as a jail.
+    const char *memory_limit = "128M";
+    const char *username = "nobody";
+    const char *hostname = "mooa";
+    char *devices = NULL;
+    char *syscalls = NULL;
+    const char *syscalls_file = NULL;
+    int syscalls_from_file[500]; // upper bound on the number of syscalls
+
+    static const struct option opts[] = {
+        { "help",          no_argument,       0, 'h' },
+        { "version",       no_argument,       0, 'v' },
+        { "user",          required_argument, 0, 'u' },
+        { "hostname",      required_argument, 0, 'n' },
+        { "memory-limit",  required_argument, 0, 'm' },
+        { "syscalls",      required_argument, 0, 's' },
+        { "syscalls-file", required_argument, 0, 0x100 },
+        { 0, 0, 0, 0 }
+    };
+
+    while (true) {
+        int opt = getopt_long(argc, argv, "hvpu:r:n:t:m:d:s:", opts, NULL);
+        if (opt == -1)
+            break;
+
+        switch (opt) {
+        case 'h':
+            usage(stdout);
+        case 'v':
+            printf("%s %s\n", program_invocation_short_name, VERSION);
+            return 0;
+        case 'u':
+            username = optarg;
+            break;
+        case 'n':
+            hostname = optarg;
+            break;
+        case 'm':
+            memory_limit = optarg;
+            break;
+        case 'd':
+            devices = optarg;
+            break;
+        case 's':
+            syscalls = optarg;
+            break;
+        case 0x100:
+            syscalls_file = optarg;
+            break;
+        default:
+            usage(stderr);
+        }
+    }
+
+    if (argc - optind < 1) {
+        usage(stderr);
+    }
+
+    if (syscalls_file) {
+        char name[30]; // longest syscall name
+        // one slot stays reserved for the terminating -1
+        const size_t max_names =
+            sizeof syscalls_from_file / sizeof syscalls_from_file[0] - 1;
+        FILE *file = fopen(syscalls_file, "r");
+        if (!file) err(EXIT_FAILURE, "failed to open syscalls file: %s",
+                       syscalls_file);
+        size_t i = 0;
+        while (fgets(name, sizeof name, file)) {
+            name[strcspn(name, "\n")] = '\0';
+            if (name[0] == '\0') continue; // tolerate blank lines
+            if (i >= max_names) {
+                errx(EXIT_FAILURE, "too many syscalls in %s", syscalls_file);
+            }
+            syscalls_from_file[i++] = get_syscall_nr(name);
+        }
+        syscalls_from_file[i] = -1;
+        fclose(file);
+    }
+
+    int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+    if (epoll_fd < 0) {
+        err(1, "epoll");
+    }
+
+    // SIGCHLD is blocked and collected through a signalfd so that child
+    // state changes arrive as epoll events.
+    sigset_t mask;
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGCHLD);
+
+    if (sigprocmask(SIG_BLOCK, &mask, NULL) < 0) {
+        err(1, "sigprocmask");
+    }
+
+    int sig_fd = signalfd(-1, &mask, SFD_CLOEXEC);
+    if (sig_fd < 0) {
+        err(1, "signalfd");
+    }
+
+    epoll_watch(epoll_fd, sig_fd);
+
+    // TODO: proxy stdin, stdout and stderr through pipes (child_pipe and
+    // copy_pipe_to); for now the child inherits the parent's descriptors.
+
+    // A pipe for signalling that the parent is ready and not dead
+    int pipe_ready[2];
+    if (pipe2(pipe_ready, O_CLOEXEC) < 0) {
+        err(1, "pipe");
+    }
+
+    pid_t ppid = getpid(); // getppid() in the child won't work
+
+    unsigned long flags = SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|CLONE_NEWNET;
+    pid_t pid = (pid_t)syscall(__NR_clone, flags, NULL);
+
+    if (pid == 0) {
+        close(pipe_ready[1]);
+
+        // The signal mask survives clone and exec; undo the parent's
+        // SIGCHLD block so the jailed program starts with it unblocked.
+        if (sigprocmask(SIG_UNBLOCK, &mask, NULL) < 0) {
+            err(1, "sigprocmask");
+        }
+
+        init_cgroup(ppid, memory_limit);
+
+        // Kill this process if the parent dies. This is not a
+        // replacement for killing the sandboxed processes via a
+        // control group as it is not inherited by child processes,
+        // but is more robust when the sandboxed process is not
+        // allowed to fork.
+        if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
+            err(EXIT_FAILURE, "prctl");
+        }
+
+        // Ensure that the parent didn't die before `prctl` was
+        // called. A dead parent leaves no writer on the pipe, so
+        // read reports end of file instead of the ready byte.
+        uint8_t ready;
+        ssize_t got = read(pipe_ready[0], &ready, sizeof ready);
+        if (got == -1) {
+            err(EXIT_FAILURE, "read");
+        }
+        if (got == 0) {
+            errx(EXIT_FAILURE, "parent exited before the jail was ready");
+        }
+        close(pipe_ready[0]);
+
+        if (sethostname(hostname, strlen(hostname)) < 0) {
+            err(1, "sethostname");
+        }
+
+        // avoid propagating mounts to or from the real root
+        mountx(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL);
+
+        // TODO: bind-mount a dedicated root directory and remount it
+        // read-only.
+
+        // create a new session
+        if (setsid() < 0) {
+            err(1, "setsid");
+        }
+
+        char path[] = "PATH=/usr/local/bin:/usr/bin:/bin";
+        char *env[] = {path, NULL};
+
+        check(ptrace(PTRACE_TRACEME, 0, 0, 0));
+        /* glibc caches getpid, but ours will always be 1 */
+        check(syscall(__NR_tgkill, 1, 1, SIGSTOP));
+
+        warnx("execing %s", argv[optind]);
+        if (execvpe(argv[optind], argv + optind, env) < 0) {
+            err(1, "execvpe");
+        }
+    } else if (pid < 0) {
+        err(1, "clone");
+    }
+
+    atexit(kill_group);
+
+    if (write(pipe_ready[1], &(uint8_t) { 0 }, 1) == -1) {
+        err(EXIT_FAILURE, "write");
+    }
+
+    struct epoll_event events[4];
+    while (true) {
+        int i, n = epoll_wait(epoll_fd, events, 4, -1);
+
+        if (n < 0) {
+            if (errno == EINTR)
+                continue;
+            err(1, "epoll_wait");
+        }
+
+        for (i = 0; i < n; ++i) {
+            struct epoll_event *evt = &events[i];
+            siginfo_t s;
+
+            if (evt->events & EPOLLERR || evt->events & EPOLLHUP) {
+                warnx("Closing fd %d", evt->data.fd);
+                close(evt->data.fd);
+            } else if (evt->data.fd == sig_fd) {
+                struct signalfd_siginfo si;
+                ssize_t bytes_r = read(sig_fd, &si, sizeof(si));
+
+                if (bytes_r < 0) {
+                    err(1, "read");
+                } else if (bytes_r != sizeof(si)) {
+                    errx(EXIT_FAILURE, "read the wrong amount of bytes");
+                } else if (si.ssi_signo != SIGCHLD) {
+                    errx(EXIT_FAILURE, "got an unexpected signal");
+                }
+
+                /* Because signals can be coalesced, we can't expect
+                   to receive an event for each one. So let's call
+                   waitid in a loop. */
+                while (1) {
+                    s.si_pid = 0;
+                    check(waitid(P_ALL, 0, &s, WEXITED|WSTOPPED|WNOHANG));
+                    if (s.si_pid == 0) {
+                        break;
+                    }
+
+                    switch (s.si_code) {
+                    case CLD_EXITED:
+                    case CLD_KILLED:
+                    case CLD_DUMPED:
+                        if (s.si_pid == pid) {
+                            errx(s.si_status, "Child exited with status %d",
+                                 s.si_status);
+                        }
+                        break;
+                    case CLD_TRAPPED:
+                        if (s.si_status == SIGSTOP) {
+                            trace_child(s.si_pid);
+                        } else {
+                            trace_trap(s.si_pid);
+                        }
+                        break;
+                    case CLD_STOPPED:
+                        errx(EXIT_FAILURE, "Child %d stopped with status %d",
+                             s.si_pid, s.si_status);
+                    default:
+                        errx(s.si_status,
+                             "Got SIGCHLD with code %d and status %d\n",
+                             s.si_code, s.si_status);
+                    }
+                }
+            } else {
+                // events is a uint32_t, so print it as unsigned
+                errx(EXIT_FAILURE, "Got event %u on fd %d",
+                     (unsigned)evt->events, evt->data.fd);
+            }
+        }
+    }
+}