about summary refs log tree commit diff
path: root/libminijail.c
diff options
context:
space:
mode:
Diffstat (limited to 'libminijail.c')
-rw-r--r-- libminijail.c 391
1 files changed, 305 insertions, 86 deletions
diff --git a/libminijail.c b/libminijail.c
index 7a82ccc..0820dbb 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -8,6 +8,7 @@
#define _GNU_SOURCE
#include <asm/unistd.h>
+#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
@@ -26,6 +27,7 @@
#include <sys/param.h>
#include <sys/prctl.h>
#include <sys/resource.h>
+#include <sys/select.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
@@ -48,27 +50,6 @@
# define PR_ALT_SYSCALL 0x43724f53
#endif
-/* Seccomp filter related flags. */
-#ifndef PR_SET_NO_NEW_PRIVS
-# define PR_SET_NO_NEW_PRIVS 38
-#endif
-
-#ifndef SECCOMP_MODE_FILTER
-#define SECCOMP_MODE_FILTER 2 /* Uses user-supplied filter. */
-#endif
-
-#ifndef SECCOMP_SET_MODE_STRICT
-# define SECCOMP_SET_MODE_STRICT 0
-#endif
-#ifndef SECCOMP_SET_MODE_FILTER
-# define SECCOMP_SET_MODE_FILTER 1
-#endif
-
-#ifndef SECCOMP_FILTER_FLAG_TSYNC
-# define SECCOMP_FILTER_FLAG_TSYNC 1
-#endif
-/* End seccomp filter related flags. */
-
/* New cgroup namespace might not be in linux-headers yet. */
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000
@@ -107,6 +88,12 @@ struct mountpoint {
struct mountpoint *next;
};
+struct minijail_remount { /* One node of the singly linked list of extra remount requests. */
+ unsigned long remount_mode; /* Mount flags; OR'ed with MS_REC when the path is remounted. */
+ char *mount_name; /* Heap-allocated path to remount; released by free_remounts_list(). */
+ struct minijail_remount *next; /* Following node, or NULL for the last one. */
+};
+
struct hook {
minijail_hook_t hook;
void *payload;
@@ -149,6 +136,7 @@ struct minijail {
int seccomp_filter : 1;
int seccomp_filter_tsync : 1;
int seccomp_filter_logging : 1;
+ int seccomp_filter_allow_speculation : 1;
int chroot : 1;
int pivot_root : 1;
int mount_dev : 1;
@@ -189,6 +177,8 @@ struct minijail {
struct mountpoint *mounts_tail;
size_t mounts_count;
unsigned long remount_mode;
+ struct minijail_remount *remounts_head;
+ struct minijail_remount *remounts_tail;
size_t tmpfs_size;
char *cgroups[MAX_CGROUPS];
size_t cgroup_count;
@@ -219,6 +209,18 @@ static void free_mounts_list(struct minijail *j)
j->mounts_tail = NULL;
}
+static void free_remounts_list(struct minijail *j) /* Releases every node on j's remount list, including each node's mount_name. */
+{
+ while (j->remounts_head) {
+ struct minijail_remount *m = j->remounts_head;
+ j->remounts_head = j->remounts_head->next; /* Unlink the node before freeing it. */
+ free(m->mount_name);
+ free(m);
+ }
+ // The loop only exits once remounts_head is NULL, so only the tail pointer is left to reset.
+ j->remounts_tail = NULL;
+}
+
/*
* Writes exactly n bytes from buf to file descriptor fd.
* Returns 0 on success or a negative error code on error.
@@ -271,6 +273,7 @@ void minijail_preenter(struct minijail *j)
j->flags.forward_signals = 0;
j->flags.setsid = 0;
j->remount_mode = 0;
+ free_remounts_list(j);
}
/*
@@ -313,7 +316,9 @@ void minijail_preexec(struct minijail *j)
struct minijail API *minijail_new(void) /* Allocates a zero-filled jail; returns NULL if calloc() fails. */
{
struct minijail *j = calloc(1, sizeof(struct minijail));
- j->remount_mode = MS_PRIVATE;
+ if (j) { /* Only write to the jail when the allocation succeeded. */
+ j->remount_mode = MS_PRIVATE; /* Initial remount mode of a new jail. */
+ }
return j;
}
@@ -427,6 +432,16 @@ void API minijail_set_seccomp_filter_tsync(struct minijail *j)
j->flags.seccomp_filter_tsync = 1;
}
+void API minijail_set_seccomp_filter_allow_speculation(struct minijail *j) /* Sets the seccomp_filter_allow_speculation flag on j. */
+{
+ if (j->filter_len > 0 && j->filter_prog != NULL) { /* A filter program is already present, so this call arrives out of order. */
+ die("minijail_set_seccomp_filter_allow_speculation() must be "
+ "called before minijail_parse_seccomp_filters()");
+ }
+
+ j->flags.seccomp_filter_allow_speculation = 1;
+}
+
void API minijail_log_seccomp_filter_failures(struct minijail *j)
{
if (j->filter_len > 0 && j->filter_prog != NULL) {
@@ -722,11 +737,6 @@ char API *minijail_get_original_path(struct minijail *j,
return strdup(path_inside_chroot);
}
-size_t minijail_get_tmpfs_size(const struct minijail *j)
-{
- return j->tmpfs_size;
-}
-
void API minijail_mount_dev(struct minijail *j)
{
j->flags.mount_dev = 1;
@@ -883,6 +893,33 @@ int API minijail_bind(struct minijail *j, const char *src, const char *dest,
return minijail_mount(j, src, dest, "", flags);
}
+int API minijail_add_remount(struct minijail *j, const char *mount_name,
+ unsigned long remount_mode) /* Queues a remount of mount_name with remount_mode on j; returns 0, -EINVAL or -ENOMEM. */
+{
+ struct minijail_remount *m;
+
+ if (*mount_name != '/') /* Only paths starting with '/' are accepted. NOTE(review): mount_name is dereferenced without a NULL check. */
+ return -EINVAL;
+ m = calloc(1, sizeof(*m)); /* Zero-filled, so m->next starts out NULL. */
+ if (!m)
+ return -ENOMEM;
+ m->mount_name = strdup(mount_name); /* The node keeps its own copy of the path. */
+ if (!m->mount_name) {
+ free(m);
+ return -ENOMEM;
+ }
+
+ m->remount_mode = remount_mode;
+
+ if (j->remounts_tail) /* Non-empty list: link the node after the current tail. */
+ j->remounts_tail->next = m;
+ else /* Empty list: the node also becomes the head. */
+ j->remounts_head = m;
+ j->remounts_tail = m;
+
+ return 0;
+}
+
int API minijail_add_hook(struct minijail *j, minijail_hook_t hook,
void *payload, minijail_hook_event_t event)
{
@@ -936,6 +973,7 @@ static void clear_seccomp_options(struct minijail *j)
j->flags.seccomp_filter = 0;
j->flags.seccomp_filter_tsync = 0;
j->flags.seccomp_filter_logging = 0;
+ j->flags.seccomp_filter_allow_speculation = 0;
j->filter_len = 0;
j->filter_prog = NULL;
j->flags.no_new_privs = 0;
@@ -986,6 +1024,15 @@ static int seccomp_should_use_filters(struct minijail *j)
*/
}
}
+ if (j->flags.seccomp_filter_allow_speculation) {
+ /* Is the SPEC_ALLOW flag supported? */
+ if (!seccomp_filter_flags_available(
+ SECCOMP_FILTER_FLAG_SPEC_ALLOW)) {
+ warn("allowing speculative execution on seccomp "
+ "processes not supported");
+ j->flags.seccomp_filter_allow_speculation = 0;
+ }
+ }
return 1;
}
@@ -1048,10 +1095,15 @@ static int parse_seccomp_filters(struct minijail *j, const char *filename,
else
filteropts.action = ACTION_RET_TRAP;
} else {
- if (j->flags.seccomp_filter_tsync)
- filteropts.action = ACTION_RET_TRAP;
- else
+ if (j->flags.seccomp_filter_tsync) {
+ if (seccomp_ret_kill_process_available()) {
+ filteropts.action = ACTION_RET_KILL_PROCESS;
+ } else {
+ filteropts.action = ACTION_RET_TRAP;
+ }
+ } else {
filteropts.action = ACTION_RET_KILL;
+ }
}
/*
@@ -1061,6 +1113,9 @@ static int parse_seccomp_filters(struct minijail *j, const char *filename,
filteropts.allow_syscalls_for_logging =
filteropts.allow_logging && !seccomp_ret_log_available();
+ /* Whether to fail on duplicate syscalls. */
+ filteropts.allow_duplicate_syscalls = allow_duplicate_syscalls();
+
if (compile_filter(filename, policy_file, fprog, &filteropts)) {
free(fprog);
return -1;
@@ -1150,15 +1205,16 @@ struct marshal_state {
char *buf;
};
-void marshal_state_init(struct marshal_state *state, char *buf,
- size_t available)
+static void marshal_state_init(struct marshal_state *state, char *buf,
+ size_t available) /* Points state at buf with `available` bytes of room and resets the running total to 0. */
{
state->available = available;
state->buf = buf;
state->total = 0;
}
-void marshal_append(struct marshal_state *state, void *src, size_t length)
+static void marshal_append(struct marshal_state *state, const void *src,
+ size_t length)
{
size_t copy_len = MIN(state->available, length);
@@ -1172,7 +1228,13 @@ void marshal_append(struct marshal_state *state, void *src, size_t length)
state->total += length;
}
-void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
+static void marshal_append_string(struct marshal_state *state, const char *src) /* Appends the NUL-terminated string src to state, terminator included. */
+{
+ marshal_append(state, src, strlen(src) + 1); /* +1 so the trailing NUL byte is appended as well. */
+}
+
+static void marshal_mount(struct marshal_state *state,
+ const struct mountpoint *m)
{
marshal_append(state, m->src, strlen(m->src) + 1);
marshal_append(state, m->dest, strlen(m->dest) + 1);
@@ -1183,23 +1245,23 @@ void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
marshal_append(state, (char *)&m->flags, sizeof(m->flags));
}
-void minijail_marshal_helper(struct marshal_state *state,
- const struct minijail *j)
+static void minijail_marshal_helper(struct marshal_state *state,
+ const struct minijail *j)
{
struct mountpoint *m = NULL;
size_t i;
marshal_append(state, (char *)j, sizeof(*j));
if (j->user)
- marshal_append(state, j->user, strlen(j->user) + 1);
+ marshal_append_string(state, j->user);
if (j->suppl_gid_list) {
marshal_append(state, j->suppl_gid_list,
j->suppl_gid_count * sizeof(gid_t));
}
if (j->chrootdir)
- marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
+ marshal_append_string(state, j->chrootdir);
if (j->hostname)
- marshal_append(state, j->hostname, strlen(j->hostname) + 1);
+ marshal_append_string(state, j->hostname);
if (j->alt_syscall_table) {
marshal_append(state, j->alt_syscall_table,
strlen(j->alt_syscall_table) + 1);
@@ -1213,7 +1275,7 @@ void minijail_marshal_helper(struct marshal_state *state,
marshal_mount(state, m);
}
for (i = 0; i < j->cgroup_count; ++i)
- marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
+ marshal_append_string(state, j->cgroups[i]);
}
size_t API minijail_size(const struct minijail *j)
@@ -1251,6 +1313,8 @@ int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
j->gidmap = NULL;
j->mounts_head = NULL;
j->mounts_tail = NULL;
+ j->remounts_head = NULL;
+ j->remounts_tail = NULL;
j->filter_prog = NULL;
j->hooks_head = NULL;
j->hooks_tail = NULL;
@@ -1380,6 +1444,7 @@ int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
bad_cgroups:
free_mounts_list(j);
+ free_remounts_list(j);
for (i = 0; i < j->cgroup_count; ++i)
free(j->cgroups[i]);
bad_mounts:
@@ -1521,6 +1586,11 @@ static int mount_dev(char **dev_path_ret)
goto done;
}
+ /* Create empty dir for glibc shared mem APIs. */
+ ret = mkdirat(dev_fd, "shm", 01777);
+ if (ret)
+ goto done;
+
/* Restore old mask. */
done:
close(dev_fd);
@@ -1591,7 +1661,7 @@ static int mount_one(const struct minijail *j, struct mountpoint *m,
setup_mount_destination(m->src, dest, j->uid, j->gid,
(m->flags & MS_BIND), &original_mnt_flags);
if (ret) {
- warn("creating mount target '%s' failed", dest);
+ warn("cannot create mount target '%s'", dest);
goto error;
}
@@ -1614,7 +1684,8 @@ static int mount_one(const struct minijail *j, struct mountpoint *m,
ret = mount(m->src, dest, m->type, m->flags, m->data);
if (ret) {
- pwarn("bind: %s -> %s flags=%#lx", m->src, dest, m->flags);
+ pwarn("cannot bind-mount '%s' as '%s' with flags %#lx", m->src,
+ dest, m->flags);
goto error;
}
@@ -1623,8 +1694,10 @@ static int mount_one(const struct minijail *j, struct mountpoint *m,
mount(m->src, dest, NULL,
m->flags | original_mnt_flags | MS_REMOUNT, m->data);
if (ret) {
- pwarn("bind remount: %s -> %s flags=%#lx", m->src, dest,
- m->flags | original_mnt_flags | MS_REMOUNT);
+ pwarn(
+ "cannot bind-remount '%s' as '%s' with flags %#lx",
+ m->src, dest,
+ m->flags | original_mnt_flags | MS_REMOUNT);
goto error;
}
}
@@ -1650,12 +1723,10 @@ static void process_mounts_or_die(const struct minijail *j)
pdie("mount_dev failed");
if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) {
- if (dev_path) {
- int saved_errno = errno;
+ if (dev_path)
mount_dev_cleanup(dev_path);
- errno = saved_errno;
- }
- pdie("mount_one failed");
+
+ _exit(MINIJAIL_ERR_MOUNT);
}
/*
@@ -1771,19 +1842,37 @@ static int remount_proc_readonly(const struct minijail *j)
* mutate our parent's mount as well, even though we're in a VFS
* namespace (!). Instead, remove their mount from our namespace lazily
* (MNT_DETACH) and make our own.
+ *
+ * However, we skip this in the user namespace case because it will
+ * invariably fail. Every mount namespace is "owned" by the
+ * user namespace of the process that creates it. Mount namespace A is
+ * "less privileged" than mount namespace B if A is created off of B,
+ * and B is owned by a different user namespace.
+ * When a less privileged mount namespace is created, the mounts used to
+ * initialize it (coming from the more privileged mount namespace) come
+ * as a unit, and are locked together. This means that code running in
+ * the new mount (and user) namespace cannot piecemeal unmount
+ * individual mounts inherited from a more privileged mount namespace.
+ * See https://man7.org/linux/man-pages/man7/mount_namespaces.7.html,
+ * "Restrictions on mount namespaces" for details.
+ *
+ * This happens in our use case because we first enter a new user
+ * namespace (on clone(2)) and then we unshare(2) a new mount namespace,
+ * which means the new mount namespace is less privileged than its
+ * parent mount namespace. This would also happen if we entered a new
+ * mount namespace on clone(2), since the user namespace is created
+ * first.
+ * In all other non-user-namespace cases the new mount namespace is
+ * similarly privileged as the parent mount namespace so unmounting a
+ * single mount is allowed.
+ *
+ * We still remount /proc as read-only in the user namespace case
+ * because while a process with CAP_SYS_ADMIN in the new user namespace
+ * can unmount the RO mount and get at the RW mount, an attacker with
+ * access only to a write primitive will not be able to modify /proc.
*/
- if (umount2(kProcPath, MNT_DETACH)) {
- /*
- * If we are in a new user namespace, umount(2) will fail.
- * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
- */
- if (j->flags.userns) {
- info("umount(/proc, MNT_DETACH) failed, "
- "this is expected when using user namespaces");
- } else {
- return -errno;
- }
- }
+ if (!j->flags.userns && umount2(kProcPath, MNT_DETACH))
+ return -errno;
if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
return -errno;
return 0;
@@ -2076,9 +2165,16 @@ static void set_seccomp_filter(const struct minijail *j)
* Install the syscall filter.
*/
if (j->flags.seccomp_filter) {
- if (j->flags.seccomp_filter_tsync) {
- if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
- SECCOMP_FILTER_FLAG_TSYNC,
+ if (j->flags.seccomp_filter_tsync ||
+ j->flags.seccomp_filter_allow_speculation) {
+ int filter_flags =
+ (j->flags.seccomp_filter_tsync
+ ? SECCOMP_FILTER_FLAG_TSYNC
+ : 0) |
+ (j->flags.seccomp_filter_allow_speculation
+ ? SECCOMP_FILTER_FLAG_SPEC_ALLOW
+ : 0);
+ if (sys_seccomp(SECCOMP_SET_MODE_FILTER, filter_flags,
j->filter_prog)) {
pdie("seccomp(tsync) failed");
}
@@ -2208,9 +2304,24 @@ void API minijail_enter(const struct minijail *j)
if (j->remount_mode) {
if (mount(NULL, "/", NULL, MS_REC | j->remount_mode,
NULL))
- pdie("mount(NULL, /, NULL, MS_REC | MS_PRIVATE,"
- " NULL) failed");
+ pdie("mount(NULL, /, NULL, "
+ "MS_REC | j->remount_mode, NULL) failed");
+
+ struct minijail_remount *temp = j->remounts_head;
+ while (temp) {
+ if (temp->remount_mode < j->remount_mode)
+ die("cannot remount %s as stricter "
+ "than the root dir",
+ temp->mount_name);
+ if (mount(NULL, temp->mount_name, NULL,
+ MS_REC | temp->remount_mode, NULL))
+ pdie("mount(NULL, %s, NULL, "
+ "MS_REC | temp->remount_mode, NULL) "
+ "failed", temp->mount_name);
+ temp = temp->next;
+ }
}
+
}
if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
@@ -2333,12 +2444,12 @@ void API minijail_enter(const struct minijail *j)
/* TODO(wad): will visibility affect this variable? */
static int init_exitstatus = 0;
-void init_term(int sig attribute_unused)
+static void init_term(int sig attribute_unused) /* Exits at once with init_exitstatus; the sig argument is not used. */
{
_exit(init_exitstatus);
}
-void init(pid_t rootpid)
+static void init(pid_t rootpid)
{
pid_t pid;
int status;
@@ -2407,6 +2518,26 @@ error:
return err;
}
+int API minijail_copy_jail(const struct minijail *from, struct minijail *out) /* Copies `from` into `out` by marshalling to a temporary buffer and unmarshalling that buffer. */
+{
+ size_t sz = minijail_size(from);
+ if (!sz) /* A reported size of 0 is treated as an invalid source jail. */
+ return -EINVAL;
+
+ char *buf = malloc(sz);
+ if (!buf)
+ return -ENOMEM;
+
+ int err = minijail_marshal(from, buf, sz);
+ if (err)
+ goto error;
+
+ err = minijail_unmarshal(out, buf, sz);
+error: /* Also reached on success: buf is always freed and err (the last result) is returned. */
+ free(buf);
+ return err;
+}
+
static int setup_preload(const struct minijail *j attribute_unused,
char ***child_env attribute_unused)
{
@@ -2483,14 +2614,94 @@ static int close_open_fds(int *inheritable_fds, size_t size)
return 0;
}
+/* Return true if fd is open: fcntl(F_GETFD) either succeeds or fails with an errno other than EBADF. */
+static int fd_is_open(int fd)
+{
+ return fcntl(fd, F_GETFD) != -1 || errno != EBADF;
+}
+
+static_assert(FD_SETSIZE >= MAX_PRESERVED_FDS * 2 - 1,
+ "If true, ensure_no_fd_conflict will always find an unused fd."); /* Compile-time check relied upon by the downward fd scan in ensure_no_fd_conflict(). */
+
+/* If p->parent_fd is already claimed as a child_fd, dup2() it to a free fd and store that fd in p->parent_fd; returns 0, or -1 if dup2() fails. */
+static int ensure_no_fd_conflict(const fd_set* child_fds,
+ struct preserved_fd* p)
+{
+ if (!FD_ISSET(p->parent_fd, child_fds)){ /* No child_fd recorded so far collides with parent_fd. */
+ return 0;
+ }
+
+ /*
+ * Prefer p->child_fd itself as the new home when that fd is not open;
+ * otherwise scan downward from FD_SETSIZE - 1 for a free one.
+ */
+ int fd = p->child_fd;
+ if (fd_is_open(fd)) {
+ fd = FD_SETSIZE - 1;
+ while (FD_ISSET(fd, child_fds) || fd_is_open(fd)) { /* Skip fds that are reserved as child fds or currently open. */
+ --fd;
+ if (fd < 0) { /* Every candidate was taken. */
+ die("failed to find an unused fd");
+ }
+ }
+ }
+
+ int ret = dup2(p->parent_fd, fd);
+ /*
+ * warn() opens a file descriptor so it needs to happen after dup2 to
+ * avoid unintended side effects. This can be avoided by reordering the
+ * mapping requests so that the source fds with overlap are mapped
+ * first (unless there are cycles).
+ */
+ warn("mapped fd overlap: moving %d to %d", p->parent_fd, fd); /* NOTE(review): logged before ret is checked, so it also fires when dup2() failed. */
+ if (ret == -1) {
+ return -1;
+ }
+
+ p->parent_fd = fd; /* Later passes read the relocated descriptor from here. */
+ return 0;
+}
+
static int redirect_fds(struct minijail *j) /* Maps each preserved parent_fd onto its child_fd; returns 0, or -1 if a dup2() fails. */
{
+ fd_set child_fds; /* Child fds claimed by the entries processed so far. NOTE(review): fds are not range-checked against FD_SETSIZE here - confirm callers bound them. */
+ FD_ZERO(&child_fds);
+
+ /* Relocate parent_fds that would be replaced by a child_fd. */
+ for (size_t i = 0; i < j->preserved_fd_count; i++) {
+ int child_fd = j->preserved_fds[i].child_fd;
+ if (FD_ISSET(child_fd, &child_fds)) { /* An earlier entry already targets this child fd. */
+ die("fd %d is mapped more than once", child_fd);
+ }
+
+ if (ensure_no_fd_conflict(&child_fds,
+ &j->preserved_fds[i]) == -1) {
+ return -1;
+ }
+
+ FD_SET(child_fd, &child_fds); /* Record it so later entries can detect a collision. */
+ }
+
for (size_t i = 0; i < j->preserved_fd_count; i++) {
+ if (j->preserved_fds[i].parent_fd ==
+ j->preserved_fds[i].child_fd) { /* Source and destination are the same fd: skip the dup2(). */
+ continue;
+ }
if (dup2(j->preserved_fds[i].parent_fd,
j->preserved_fds[i].child_fd) == -1) {
return -1;
}
}
+ /*
+ * After all fds have been duped, we are now free to close all parent
+ * fds that are *not* child fds.
+ */
+ for (size_t i = 0; i < j->preserved_fd_count; i++) {
+ int parent_fd = j->preserved_fds[i].parent_fd;
+ if (!FD_ISSET(parent_fd, &child_fds)) {
+ close(parent_fd); /* The return value of close() is not checked. */
+ }
+ }
return 0;
}
@@ -2537,10 +2748,10 @@ static void setup_child_std_fds(struct minijail *j,
};
for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
- if (fd_map[i].from != -1) {
- if (dup2(fd_map[i].from, fd_map[i].to) == -1)
- die("failed to set up %s pipe", fd_map[i].name);
- }
+ if (fd_map[i].from == -1 || fd_map[i].from == fd_map[i].to)
+ continue;
+ if (dup2(fd_map[i].from, fd_map[i].to) == -1)
+ die("failed to set up %s pipe", fd_map[i].name);
}
/* Close temporary pipe file descriptors. */
@@ -3158,18 +3369,7 @@ minijail_run_config_internal(struct minijail *j,
return ret;
}
-int API minijail_kill(struct minijail *j)
-{
- if (j->initpid <= 0)
- return -ECHILD;
-
- if (kill(j->initpid, SIGTERM))
- return -errno;
-
- return minijail_wait(j);
-}
-
-int API minijail_wait(struct minijail *j)
+static int minijail_wait_internal(struct minijail *j, int expected_signal)
{
if (j->initpid <= 0)
return -ECHILD;
@@ -3187,8 +3387,10 @@ int API minijail_wait(struct minijail *j)
int error_status = st;
if (WIFSIGNALED(st)) {
int signum = WTERMSIG(st);
- warn("child process %d received signal %d",
- j->initpid, signum);
+ if (signum != expected_signal) {
+ warn("child process %d received signal %d",
+ j->initpid, signum);
+ }
/*
* We return MINIJAIL_ERR_JAIL if the process received
* SIGSYS, which happens when a syscall is blocked by
@@ -3213,6 +3415,22 @@ int API minijail_wait(struct minijail *j)
return exit_status;
}
+int API minijail_kill(struct minijail *j) /* Sends SIGTERM to j->initpid, then waits for it via minijail_wait_internal(). */
+{
+ if (j->initpid <= 0) /* No recorded init pid to signal. */
+ return -ECHILD;
+
+ if (kill(j->initpid, SIGTERM))
+ return -errno;
+
+ return minijail_wait_internal(j, SIGTERM); /* SIGTERM is passed as the expected signal for the wait. */
+}
+
+int API minijail_wait(struct minijail *j) /* Public wait entry point: delegates to minijail_wait_internal() with expected_signal 0. */
+{
+ return minijail_wait_internal(j, 0);
+}
+
void API minijail_destroy(struct minijail *j)
{
size_t i;
@@ -3222,6 +3440,7 @@ void API minijail_destroy(struct minijail *j)
free(j->filter_prog);
}
free_mounts_list(j);
+ free_remounts_list(j);
while (j->hooks_head) {
struct hook *c = j->hooks_head;
j->hooks_head = c->next;