qemu-patch-raspberry4/tools/virtiofsd/passthrough_ll.c
Stefan Hajnoczi a59feb483b virtiofsd: only retain file system capabilities
virtiofsd runs as root but only needs a subset of root's Linux
capabilities(7).  As a file server its purpose is to create and access
files on behalf of a client.  It needs to be able to access files with
arbitrary uid/gid owners.  It also needs to be able to create device nodes.

Introduce a Linux capabilities(7) whitelist and drop all capabilities
that we don't need, making the virtiofsd process less powerful than a
regular uid root process.

  # cat /proc/PID/status
  ...
          Before           After
  CapInh: 0000000000000000 0000000000000000
  CapPrm: 0000003fffffffff 00000000880000df
  CapEff: 0000003fffffffff 00000000880000df
  CapBnd: 0000003fffffffff 0000000000000000
  CapAmb: 0000000000000000 0000000000000000

Note that file capabilities cannot be used to achieve the same effect on
the virtiofsd executable because mount is used during sandbox setup.
Therefore we drop capabilities programmatically at the right point
during startup.

This patch only affects the sandboxed child process.  The parent process
that sits in waitpid(2) still has full root capabilities and will be
addressed in the next patch.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20200416164907.244868-2-stefanha@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2020-05-01 18:57:31 +01:00

3059 lines
75 KiB
C

/*
* FUSE: Filesystem in Userspace
* Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
*
* This program can be distributed under the terms of the GNU GPLv2.
* See the file COPYING.
*/
/*
*
* This file system mirrors the existing file system hierarchy of the
* system, starting at the root file system. This is implemented by
* just "passing through" all requests to the corresponding user-space
* libc functions. In contrast to passthrough.c and passthrough_fh.c,
* this implementation uses the low-level API. Its performance should
* be the least bad among the three, but many operations are not
* implemented. In particular, it is not possible to remove files (or
* directories) because the code necessary to defer actual removal
* until the file is not opened anymore would make the example much
* more complicated.
*
* When writeback caching is enabled (-o writeback mount option), it
* is only possible to write to files for which the mounting user has
* read permissions. This is because the writeback cache requires the
* kernel to be able to issue read requests for all files (which the
* passthrough filesystem cannot satisfy if it can't read the file in
* the underlying filesystem).
*
* Compile with:
*
* gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
* passthrough_ll
*
* ## Source code ##
* \include passthrough_ll.c
*/
#include "qemu/osdep.h"
#include "qemu/timer.h"
#include "fuse_virtio.h"
#include "fuse_log.h"
#include "fuse_lowlevel.h"
#include <assert.h>
#include <cap-ng.h>
#include <dirent.h>
#include <errno.h>
#include <glib.h>
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/xattr.h>
#include <syslog.h>
#include <unistd.h>
#include "passthrough_helpers.h"
#include "seccomp.h"
/*
 * Keep track of inode posix locks for each owner. One of these is stored per
 * lock owner in lo_inode.posix_locks and is torn down by
 * posix_locks_value_destroy().
 */
struct lo_inode_plock {
uint64_t lock_owner; /* presumably the FUSE lock owner id - confirm at the users */
int fd; /* fd for OFD locks */
};
/*
 * One slot of a struct lo_map. While in_use is true the slot carries one of
 * the object members of the union; while it is false the slot sits on the
 * map's free list and `freelist` holds the index of the next free slot
 * (-1 terminates the list).
 */
struct lo_map_elem {
union {
struct lo_inode *inode; /* stored by lo_add_inode_mapping() */
struct lo_dirp *dirp; /* stored by lo_add_dirp_mapping() */
int fd; /* stored by lo_add_fd_mapping() */
ssize_t freelist; /* index of the next free slot, or -1 */
};
bool in_use; /* true while the slot is allocated */
};
/*
 * Maps FUSE fh or ino values to internal objects. The value handed out for
 * an object is the index of its slot in elems[].
 */
struct lo_map {
struct lo_map_elem *elems; /* slot array, (re)allocated by lo_map_grow() */
size_t nelems; /* number of slots in elems[] */
ssize_t freelist; /* index of the first free slot, or -1 when none is free */
};
/* Lookup key for lo_data.inodes: the st_ino/st_dev pair of a host file. */
struct lo_key {
ino_t ino; /* copied from st_ino */
dev_t dev; /* copied from st_dev */
};
/* In-memory state for one host file that has been looked up by the client. */
struct lo_inode {
int fd; /* descriptor for the file; closed by lo_inode_put() on the last put */
/*
 * Atomic reference count for this object. The nlookup field holds one
 * reference, which is released when nlookup reaches 0.
 */
gint refcount;
struct lo_key key; /* key under which this inode is stored in lo_data.inodes */
/*
 * This counter keeps the inode alive during the FUSE session.
 * Incremented when the FUSE inode number is sent in a reply
 * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
 * released by requests like FUSE_FORGET, FUSE_RMDIR, FUSE_RENAME, etc.
 *
 * Note that this value is untrusted because the client can manipulate
 * it arbitrarily using FUSE_FORGET requests.
 *
 * Protected by lo->mutex.
 */
uint64_t nlookup;
fuse_ino_t fuse_ino; /* slot index of this inode in lo_data.ino_map */
pthread_mutex_t plock_mutex;
GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
mode_t filetype; /* S_IFMT bits of st_mode, cached when the inode is created */
};
/*
 * Effective uid/gid recorded by lo_change_cred() so that lo_restore_cred()
 * can switch back to them.
 */
struct lo_cred {
uid_t euid;
gid_t egid;
};
/* Values of lo_data.cache, selected by the cache=none|auto|always options. */
enum {
CACHE_NONE,
CACHE_AUTO,
CACHE_ALWAYS,
};
/*
 * State of the passthrough file system. A pointer to it is registered as the
 * FUSE userdata and retrieved per request with lo_data().
 */
struct lo_data {
pthread_mutex_t mutex; /* guards inodes, the three maps and lo_inode.nlookup */
int debug;
int norace; /* "norace": fail with EPERM instead of using a racy path-based fallback */
int writeback; /* "writeback" / "no_writeback" */
int flock; /* "flock" / "no_flock" */
int posix_lock; /* "posix_lock" / "no_posix_lock" */
int xattr; /* "xattr" / "no_xattr" */
char *source; /* "source=%s" */
double timeout; /* "timeout=%lf"; used as attr and entry timeout in replies */
int cache; /* CACHE_NONE, CACHE_AUTO or CACHE_ALWAYS */
int timeout_set; /* set to 1 by the "timeout=" option entry */
int readdirplus_set; /* "readdirplus" */
int readdirplus_clear; /* "no_readdirplus" */
struct lo_inode root; /* inode of the exported root directory */
GHashTable *inodes; /* protected by lo->mutex */
struct lo_map ino_map; /* protected by lo->mutex */
struct lo_map dirp_map; /* protected by lo->mutex */
struct lo_map fd_map; /* protected by lo->mutex */
/* An O_PATH file descriptor to /proc/self/fd/ */
int proc_self_fd;
};
/*
 * Option table for struct lo_data. Each entry gives an option template, the
 * offset of the lo_data field it targets and the value associated with it
 * (see the fuse_opt documentation for the exact matching rules).
 */
static const struct fuse_opt lo_opts[] = {
{ "writeback", offsetof(struct lo_data, writeback), 1 },
{ "no_writeback", offsetof(struct lo_data, writeback), 0 },
{ "source=%s", offsetof(struct lo_data, source), 0 },
{ "flock", offsetof(struct lo_data, flock), 1 },
{ "no_flock", offsetof(struct lo_data, flock), 0 },
{ "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
{ "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
{ "xattr", offsetof(struct lo_data, xattr), 1 },
{ "no_xattr", offsetof(struct lo_data, xattr), 0 },
/* "timeout=" appears twice: once to parse the value, once to flag its presence */
{ "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
{ "timeout=", offsetof(struct lo_data, timeout_set), 1 },
{ "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
{ "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
{ "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
{ "norace", offsetof(struct lo_data, norace), 1 },
{ "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
{ "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
FUSE_OPT_END
};
static bool use_syslog = false;
static int current_log_level;
/* Forward declaration: used by helpers that appear before its definition. */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
uint64_t n);
/* libcap-ng state shared between threads; consumed by load_capng(). */
static struct {
pthread_mutex_t mutex; /* serialises access to saved */
void *saved; /* state handle as returned by capng_save_state() */
} cap;
/* True once the current thread has loaded its cap-ng state from cap.saved */
static __thread bool cap_loaded = 0;
/* Forward declaration: used by lo_parent_and_name() before the definition. */
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st);
/* Returns 1 if `name` is exactly "." or "..", and 0 for anything else. */
static int is_dot_or_dotdot(const char *name)
{
    if (name[0] != '.') {
        return 0;
    }
    if (name[1] == '\0') {
        return 1; /* "." */
    }
    /* Only ".." is left as a possible match */
    return name[1] == '.' && name[2] == '\0';
}
/*
 * Returns 1 when `path` is a single path component other than "." and "..",
 * and 0 when it contains a '/' or is one of those two special names.
 */
static int is_safe_path_component(const char *path)
{
    const int has_separator = strchr(path, '/') != NULL;

    return has_separator ? 0 : !is_dot_or_dotdot(path);
}
/* Fetch the struct lo_data that was registered as userdata for this request. */
static struct lo_data *lo_data(fuse_req_t req)
{
    struct lo_data *lo = (struct lo_data *)fuse_req_userdata(req);

    return lo;
}
/*
 * Give the calling thread its own libcap-ng state, taken from the copy in
 * cap.saved, unless that has already happened for this thread.
 *
 * Returns 0 on success and -EINVAL when a fresh copy of the state could not
 * be saved for the next thread.
 */
static int load_capng(void)
{
    void *fresh;

    if (cap_loaded) {
        return 0;
    }

    pthread_mutex_lock(&cap.mutex);
    capng_restore_state(&cap.saved);
    /* restore_state frees the saved copy, so take another one right away */
    fresh = capng_save_state();
    cap.saved = fresh;
    pthread_mutex_unlock(&cap.mutex);

    if (fresh == NULL) {
        fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
        return -EINVAL;
    }

    /* The loaded state must describe this thread, not the original one */
    capng_setpid(syscall(SYS_gettid));
    cap_loaded = true;
    return 0;
}
/*
 * Helpers for dropping and regaining effective capabilities. Returns 0
 * on success, error otherwise
 */

/*
 * Remove the capability called `cap_name` from the calling thread's
 * effective set.
 *
 * If `cap_dropped` is non-NULL it is set to true only when the capability
 * was present and has actually been dropped; it is left untouched otherwise.
 *
 * Returns 0 on success (including when the capability was not in the
 * effective set to begin with) and a positive errno-style value on failure.
 * The failing libcap-ng calls are not guaranteed to set errno, so a failure
 * is never reported as 0: EINVAL is used when errno carries no information.
 */
static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
{
    int cap_id;
    int ret;

    cap_id = capng_name_to_capability(cap_name);
    if (cap_id < 0) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
                 cap_name, strerror(ret));
        return ret;
    }

    ret = load_capng();
    if (ret) {
        /* load_capng() reports a negative errno value, not errno itself */
        ret = ret < 0 ? -ret : ret;
        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
        return ret;
    }

    /* We dont have this capability in effective set already. */
    if (!capng_have_capability(CAPNG_EFFECTIVE, cap_id)) {
        return 0;
    }

    if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap_id)) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
        return ret;
    }

    if (capng_apply(CAPNG_SELECT_CAPS)) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
        return ret;
    }

    if (cap_dropped) {
        *cap_dropped = true;
    }
    return 0;
}
/*
 * Add the capability called `cap_name` to the calling thread's effective
 * set.
 *
 * Returns 0 on success and a positive errno-style value on failure. The
 * failing libcap-ng calls are not guaranteed to set errno, so a failure is
 * never reported as 0: EINVAL is used when errno carries no information.
 */
static int gain_effective_cap(const char *cap_name)
{
    int cap_id;
    int ret;

    cap_id = capng_name_to_capability(cap_name);
    if (cap_id < 0) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
                 cap_name, strerror(ret));
        return ret;
    }

    ret = load_capng();
    if (ret) {
        /* load_capng() reports a negative errno value, not errno itself */
        ret = ret < 0 ? -ret : ret;
        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
        return ret;
    }

    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap_id)) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
        return ret;
    }

    if (capng_apply(CAPNG_SELECT_CAPS)) {
        ret = errno ? errno : EINVAL;
        fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
        return ret;
    }

    return 0;
}
static void lo_map_init(struct lo_map *map)
{
map->elems = NULL;
map->nelems = 0;
map->freelist = -1;
}
/*
 * Free the slot array of `map`. Objects referenced from the slots are not
 * released here.
 */
static void lo_map_destroy(struct lo_map *map)
{
    struct lo_map_elem *slots = map->elems;

    free(slots);
}
/*
 * Make sure `map` has at least `new_nelems` slots.
 *
 * Newly added slots are marked unused and pushed onto the front of the free
 * list in ascending index order. Slots that were already free stay on the
 * list behind the new ones, so nothing is lost when the map is grown while
 * free slots still exist (as lo_map_reserve() may do). This relies on the
 * free list being -1 terminated, as set up by lo_map_init().
 *
 * Returns 1 on success, including when the map is already large enough, and
 * 0 if the array could not be enlarged; the map is left untouched then.
 */
static int lo_map_grow(struct lo_map *map, size_t new_nelems)
{
    struct lo_map_elem *new_elems;
    size_t i;

    if (new_nelems <= map->nelems) {
        return 1;
    }

    /* Refuse element counts whose size in bytes would wrap around */
    if (new_nelems > SIZE_MAX / sizeof(map->elems[0])) {
        return 0;
    }

    new_elems = realloc(map->elems, sizeof(map->elems[0]) * new_nelems);
    if (!new_elems) {
        return 0;
    }

    for (i = map->nelems; i < new_nelems; i++) {
        new_elems[i].freelist = i + 1;
        new_elems[i].in_use = false;
    }
    /* Chain the previous free list (or -1) behind the last new slot */
    new_elems[new_nelems - 1].freelist = map->freelist;

    map->elems = new_elems;
    map->freelist = map->nelems;
    map->nelems = new_nelems;
    return 1;
}
/*
 * Take a slot off the free list, growing the map by 256 slots first when the
 * list is empty. Returns the slot marked as in use, or NULL if growing failed.
 */
static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
{
    struct lo_map_elem *slot;

    if (map->freelist == -1) {
        if (!lo_map_grow(map, map->nelems + 256)) {
            return NULL;
        }
    }

    slot = map->elems + map->freelist;
    map->freelist = slot->freelist;
    slot->in_use = true;
    return slot;
}
/*
 * Claim the slot with index `key`, growing the map if that index does not
 * exist yet. Returns the slot marked as in use, or NULL when the map could
 * not be grown or the slot is not on the free list.
 */
static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
{
    ssize_t *link;

    if (!lo_map_grow(map, key + 1)) {
        return NULL;
    }

    /* Walk the free list, keeping a pointer to the link that leads to each slot */
    link = &map->freelist;
    while (*link != -1) {
        if (*link == key) {
            struct lo_map_elem *slot = &map->elems[key];

            /* Unlink the slot from the free list */
            *link = slot->freelist;
            slot->in_use = true;
            return slot;
        }
        link = &map->elems[*link].freelist;
    }
    return NULL;
}
/*
 * Return the slot with index `key` if it exists and is in use, otherwise
 * NULL.
 */
static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
{
    if (key < map->nelems && map->elems[key].in_use) {
        return &map->elems[key];
    }
    return NULL;
}
/*
 * Return the slot with index `key` to the free list. Out-of-range keys and
 * slots that are not in use are ignored.
 */
static void lo_map_remove(struct lo_map *map, size_t key)
{
    struct lo_map_elem *slot;

    if (key >= map->nelems) {
        return;
    }

    slot = map->elems + key;
    if (slot->in_use) {
        slot->in_use = false;
        /* Push the slot onto the front of the free list */
        slot->freelist = map->freelist;
        map->freelist = key;
    }
}
/*
 * Store `fd` in a new slot of the fd map and return the slot index, or -1 if
 * no slot could be allocated. Assumes lo->mutex is held.
 */
static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
{
    struct lo_map *map = &lo_data(req)->fd_map;
    struct lo_map_elem *slot = lo_map_alloc_elem(map);

    if (slot == NULL) {
        return -1;
    }
    slot->fd = fd;
    return slot - map->elems;
}
/*
 * Store `dirp` in a new slot of the dirp map and return the slot index, or -1
 * if no slot could be allocated. Assumes lo->mutex is held.
 */
static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
{
    struct lo_map *map = &lo_data(req)->dirp_map;
    struct lo_map_elem *slot = lo_map_alloc_elem(map);

    if (slot == NULL) {
        return -1;
    }
    slot->dirp = dirp;
    return slot - map->elems;
}
/*
 * Store `inode` in a new slot of the inode map and return the slot index, or
 * -1 if no slot could be allocated. Assumes lo->mutex is held.
 */
static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
{
    struct lo_map *map = &lo_data(req)->ino_map;
    struct lo_map_elem *slot = lo_map_alloc_elem(map);

    if (slot == NULL) {
        return -1;
    }
    slot->inode = inode;
    return slot - map->elems;
}
/*
 * Drop one reference on *inodep and set *inodep to NULL. When the last
 * reference goes away the inode's fd is closed and the inode is freed.
 * A NULL *inodep is accepted and ignored.
 */
static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
{
    struct lo_inode *victim = *inodep;

    if (victim == NULL) {
        return;
    }
    *inodep = NULL;

    if (!g_atomic_int_dec_and_test(&victim->refcount)) {
        return; /* other references remain */
    }
    close(victim->fd);
    free(victim);
}
/*
 * Look up the inode registered under FUSE inode number `ino` and take a
 * reference on it. Returns NULL if `ino` is not mapped.
 *
 * Caller must release refcount using lo_inode_put()
 */
static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
{
    struct lo_data *lo = lo_data(req);
    struct lo_map_elem *elem;
    struct lo_inode *inode = NULL;

    pthread_mutex_lock(&lo->mutex);
    elem = lo_map_get(&lo->ino_map, ino);
    if (elem) {
        /*
         * Read the pointer while the lock is held: once it is released the
         * slot array may be reallocated or the slot reused by another thread,
         * so `elem` must not be dereferenced afterwards.
         */
        inode = elem->inode;
        g_atomic_int_inc(&inode->refcount);
    }
    pthread_mutex_unlock(&lo->mutex);

    return inode;
}
/*
 * Return the fd of the inode registered under `ino`, or -1 if `ino` is not
 * mapped. The temporary inode reference is dropped before returning, so the
 * fd is not protected against being closed while the caller uses it.
 *
 * TODO Remove this helper and force callers to hold an inode refcount until
 * they are done with the fd. This will be done in a later patch to make
 * review easier.
 */
static int lo_fd(fuse_req_t req, fuse_ino_t ino)
{
    struct lo_inode *inode = lo_inode(req, ino);
    int fd = -1;

    if (inode) {
        fd = inode->fd;
        lo_inode_put(lo_data(req), &inode);
    }
    return fd;
}
/*
 * FUSE init handler: pick the connection capabilities to request, based on
 * what the other side offers (conn->capable) and on the configured options.
 */
static void lo_init(void *userdata, struct fuse_conn_info *conn)
{
    struct lo_data *lo = (struct lo_data *)userdata;
    int drop_readdirplus;

    if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
        conn->want |= FUSE_CAP_EXPORT_SUPPORT;
    }

    if (lo->writeback && (conn->capable & FUSE_CAP_WRITEBACK_CACHE)) {
        fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
        conn->want |= FUSE_CAP_WRITEBACK_CACHE;
    }

    /* Lock capabilities are explicitly switched on or off when offered */
    if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
        if (!lo->flock) {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
            conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
        } else {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
            conn->want |= FUSE_CAP_FLOCK_LOCKS;
        }
    }

    if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
        if (!lo->posix_lock) {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
            conn->want &= ~FUSE_CAP_POSIX_LOCKS;
        } else {
            fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
            conn->want |= FUSE_CAP_POSIX_LOCKS;
        }
    }

    /*
     * readdirplus is turned off on request, and also with cache=none unless
     * it was asked for explicitly.
     */
    drop_readdirplus = lo->readdirplus_clear ||
                       (lo->cache == CACHE_NONE && !lo->readdirplus_set);
    if (drop_readdirplus) {
        fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
        conn->want &= ~FUSE_CAP_READDIRPLUS;
    }
}
/*
 * FUSE getattr handler: stat the inode through its fd and reply with the
 * attributes, or with errno if the stat fails. `fi` is not used.
 */
static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
                       struct fuse_file_info *fi)
{
    struct stat st;
    struct lo_data *lo = lo_data(req);

    (void)fi;

    if (fstatat(lo_fd(req, ino), "", &st,
                AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW) == -1) {
        fuse_reply_err(req, errno);
        return;
    }

    fuse_reply_attr(req, &st, lo->timeout);
}
/*
 * Work out the parent directory inode and the name of `inode` within it by
 * reading the /proc/self/fd link of inode->fd.
 *
 * On success returns 0, stores the parent in *parent and leaves the final
 * path component in `path`. Increments parent->nlookup and caller must
 * release refcount using lo_inode_put(&parent) (the callers in this file
 * also undo the nlookup increment with unref_inode_lolocked()).
 *
 * On failure returns -1 with errno set to EIO. Failures after the link has
 * been read are retried up to two more times before giving up; warnings for
 * those are only logged on the final attempt.
 */
static int lo_parent_and_name(struct lo_data *lo, struct lo_inode *inode,
char path[PATH_MAX], struct lo_inode **parent)
{
char procname[64];
char *last;
struct stat stat;
struct lo_inode *p;
int retries = 2;
int res;
retry:
/* Resolve the fd to a path through the /proc/self/fd directory */
sprintf(procname, "%i", inode->fd);
res = readlinkat(lo->proc_self_fd, procname, path, PATH_MAX);
if (res < 0) {
fuse_log(FUSE_LOG_WARNING, "%s: readlink failed: %m\n", __func__);
goto fail_noretry;
}
/* A result that fills the buffer leaves no room for the terminator */
if (res >= PATH_MAX) {
fuse_log(FUSE_LOG_WARNING, "%s: readlink overflowed\n", __func__);
goto fail_noretry;
}
path[res] = '\0';
last = strrchr(path, '/');
if (last == NULL) {
/* Shouldn't happen */
fuse_log(
FUSE_LOG_WARNING,
"%s: INTERNAL ERROR: bad path read from proc\n", __func__);
goto fail_noretry;
}
if (last == path) {
/* The only '/' is the leading one: the parent is the root inode */
p = &lo->root;
pthread_mutex_lock(&lo->mutex);
p->nlookup++;
g_atomic_int_inc(&p->refcount);
pthread_mutex_unlock(&lo->mutex);
} else {
/* Cut the path at the last '/' and find the inode of the directory part */
*last = '\0';
res = fstatat(AT_FDCWD, last == path ? "/" : path, &stat, 0);
if (res == -1) {
if (!retries) {
fuse_log(FUSE_LOG_WARNING,
"%s: failed to stat parent: %m\n", __func__);
}
goto fail;
}
/* lo_find() takes both an nlookup and a refcount reference on a match */
p = lo_find(lo, &stat);
if (p == NULL) {
if (!retries) {
fuse_log(FUSE_LOG_WARNING,
"%s: failed to find parent\n", __func__);
}
goto fail;
}
}
/* Step past the '/' so that `last` points at the final component */
last++;
res = fstatat(p->fd, last, &stat, AT_SYMLINK_NOFOLLOW);
if (res == -1) {
if (!retries) {
fuse_log(FUSE_LOG_WARNING,
"%s: failed to stat last\n", __func__);
}
goto fail_unref;
}
/* The name must still refer to the inode we started from */
if (stat.st_dev != inode->key.dev || stat.st_ino != inode->key.ino) {
if (!retries) {
fuse_log(FUSE_LOG_WARNING,
"%s: failed to match last\n", __func__);
}
goto fail_unref;
}
*parent = p;
/* Move the final component to the start of the caller's buffer */
memmove(path, last, strlen(last) + 1);
return 0;
fail_unref:
/* Give back both references taken on the parent candidate */
unref_inode_lolocked(lo, p, 1);
lo_inode_put(lo, &p);
fail:
if (retries) {
retries--;
goto retry;
}
fail_noretry:
errno = EIO;
return -1;
}
/*
 * Set the timestamps `tv` on `inode` without relying on a path supplied by
 * the client.
 *
 * Non-symlinks are updated through their /proc/self/fd entry. Symlinks are
 * tried with AT_EMPTY_PATH first; if that fails with EINVAL the call either
 * fails with EPERM (norace option) or falls back to a parent-directory plus
 * name lookup, which is not race free.
 *
 * Returns 0 on success, -1 with errno set on failure. errno from the failing
 * call is preserved across the release of the parent references.
 */
static int utimensat_empty(struct lo_data *lo, struct lo_inode *inode,
                           const struct timespec *tv)
{
    int res;
    int saverr;
    struct lo_inode *parent;
    char path[PATH_MAX];

    if (S_ISLNK(inode->filetype)) {
        res = utimensat(inode->fd, "", tv, AT_EMPTY_PATH);
        if (res == -1 && errno == EINVAL) {
            /* Sorry, no race free way to set times on symlink. */
            if (lo->norace) {
                errno = EPERM;
            } else {
                goto fallback;
            }
        }
        return res;
    }

    snprintf(path, sizeof(path), "%i", inode->fd);
    return utimensat(lo->proc_self_fd, path, tv, 0);

fallback:
    res = lo_parent_and_name(lo, inode, path, &parent);
    if (res != -1) {
        res = utimensat(parent->fd, path, tv, AT_SYMLINK_NOFOLLOW);
        /* Dropping the references may close/free and overwrite errno */
        saverr = errno;
        unref_inode_lolocked(lo, parent, 1);
        lo_inode_put(lo, &parent);
        errno = saverr;
    }
    return res;
}
/*
 * Translate the file handle in fi->fh into the host fd stored in the fd map.
 * Returns -1 if the handle is not mapped.
 */
static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    struct lo_map_elem *elem;
    int fd = -1;

    pthread_mutex_lock(&lo->mutex);
    elem = lo_map_get(&lo->fd_map, fi->fh);
    if (elem) {
        /*
         * Copy the fd while the lock is held: afterwards the slot array may
         * be reallocated by another thread and `elem` would dangle.
         */
        fd = elem->fd;
    }
    pthread_mutex_unlock(&lo->mutex);

    return fd;
}
/*
 * FUSE setattr handler. Applies, in this order, the mode, owner/group, size
 * and timestamp changes selected by the bits in `valid`, taking the new
 * values from `attr`. When `fi` is given the open file's fd is used where an
 * fd-based call exists; otherwise the inode is reached through its
 * /proc/self/fd entry.
 *
 * Replies with the refreshed attributes (via lo_getattr()) on success and
 * with the errno of the first failing step otherwise. EBADF is replied when
 * `ino` is not mapped.
 */
static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
int valid, struct fuse_file_info *fi)
{
int saverr;
char procname[64];
struct lo_data *lo = lo_data(req);
struct lo_inode *inode;
int ifd;
int res;
int fd;
inode = lo_inode(req, ino);
if (!inode) {
fuse_reply_err(req, EBADF);
return;
}
ifd = inode->fd;
/* If fi->fh is invalid we'll report EBADF later */
if (fi) {
fd = lo_fi_fd(req, fi);
}
/* Note: fd is only assigned, and only read, when fi is non-NULL */
if (valid & FUSE_SET_ATTR_MODE) {
if (fi) {
res = fchmod(fd, attr->st_mode);
} else {
sprintf(procname, "%i", ifd);
res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
}
if (res == -1) {
goto out_err;
}
}
if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
/* -1 leaves the corresponding id unchanged */
uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
if (res == -1) {
goto out_err;
}
}
if (valid & FUSE_SET_ATTR_SIZE) {
int truncfd;
if (fi) {
truncfd = fd;
} else {
/* No open file: open the inode read-write just for the truncate */
sprintf(procname, "%i", ifd);
truncfd = openat(lo->proc_self_fd, procname, O_RDWR);
if (truncfd < 0) {
goto out_err;
}
}
res = ftruncate(truncfd, attr->st_size);
if (!fi) {
/* Keep ftruncate()'s errno across the close of the temporary fd */
saverr = errno;
close(truncfd);
errno = saverr;
}
if (res == -1) {
goto out_err;
}
}
if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
struct timespec tv[2];
/* Start with both timestamps omitted; tv[0] is atime, tv[1] is mtime */
tv[0].tv_sec = 0;
tv[1].tv_sec = 0;
tv[0].tv_nsec = UTIME_OMIT;
tv[1].tv_nsec = UTIME_OMIT;
if (valid & FUSE_SET_ATTR_ATIME_NOW) {
tv[0].tv_nsec = UTIME_NOW;
} else if (valid & FUSE_SET_ATTR_ATIME) {
tv[0] = attr->st_atim;
}
if (valid & FUSE_SET_ATTR_MTIME_NOW) {
tv[1].tv_nsec = UTIME_NOW;
} else if (valid & FUSE_SET_ATTR_MTIME) {
tv[1] = attr->st_mtim;
}
if (fi) {
res = futimens(fd, tv);
} else {
res = utimensat_empty(lo, inode, tv);
}
if (res == -1) {
goto out_err;
}
}
lo_inode_put(lo, &inode);
/* Success: the reply is sent by lo_getattr() with the updated attributes */
return lo_getattr(req, ino, fi);
out_err:
/* Capture errno before releasing the inode */
saverr = errno;
lo_inode_put(lo, &inode);
fuse_reply_err(req, saverr);
}
/*
 * Find the known inode that matches the st_ino/st_dev pair in `st`.
 *
 * On a match both the inode's nlookup count and its refcount are incremented
 * before it is returned. Returns NULL when no such inode is known.
 */
static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st)
{
    struct lo_inode *found;
    struct lo_key key = {
        .ino = st->st_ino,
        .dev = st->st_dev,
    };

    pthread_mutex_lock(&lo->mutex);
    found = g_hash_table_lookup(lo->inodes, &key);
    if (found != NULL) {
        assert(found->nlookup > 0);
        found->nlookup += 1;
        g_atomic_int_inc(&found->refcount);
    }
    pthread_mutex_unlock(&lo->mutex);

    return found;
}
/*
 * value_destroy_func for posix_locks GHashTable: closes the entry's lock fd
 * and frees the entry.
 */
static void posix_locks_value_destroy(gpointer data)
{
    struct lo_inode_plock *entry = (struct lo_inode_plock *)data;
    int lock_fd = entry->fd;

    /*
     * We had used open() for locks and had only one fd. So
     * closing this fd should release all OFD locks.
     */
    close(lock_fd);
    free(entry);
}
/*
 * Look up `name` in the directory with FUSE inode number `parent` and fill
 * in `e` for the reply.
 *
 * On success returns 0 and the inode's nlookup count has been incremented
 * (a new inode starts with nlookup == 1). That lookup reference is dropped
 * later through unref_inode(). The refcount references taken here on the
 * inode and on the parent are released before returning.
 *
 * On failure returns an errno value: ENOENT when `parent` is not mapped,
 * ENOMEM when no inode number could be allocated, otherwise the errno of
 * the failing call.
 */
static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
                        struct fuse_entry_param *e)
{
    int newfd;
    int res;
    int saverr;
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode = NULL;
    struct lo_inode *dir = lo_inode(req, parent);

    /*
     * name_to_handle_at() and open_by_handle_at() can reach here with fuse
     * mount point in guest, but we don't have its inode info in the
     * ino_map.
     */
    if (!dir) {
        return ENOENT;
    }

    memset(e, 0, sizeof(*e));
    e->attr_timeout = lo->timeout;
    e->entry_timeout = lo->timeout;

    /* Do not allow escaping root directory */
    if (dir == &lo->root && strcmp(name, "..") == 0) {
        name = ".";
    }

    newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
    if (newfd == -1) {
        goto out_err;
    }

    res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
    if (res == -1) {
        goto out_err;
    }

    inode = lo_find(lo, &e->attr);
    if (inode) {
        /* Already known: the existing inode keeps its own fd */
        close(newfd);
    } else {
        ssize_t ino_idx;

        inode = calloc(1, sizeof(struct lo_inode));
        if (!inode) {
            goto out_err;
        }

        /* cache only filetype */
        inode->filetype = (e->attr.st_mode & S_IFMT);

        /*
         * One for the caller and one for nlookup (released in
         * unref_inode_lolocked())
         */
        g_atomic_int_set(&inode->refcount, 2);

        inode->nlookup = 1;
        inode->fd = newfd;
        inode->key.ino = e->attr.st_ino;
        inode->key.dev = e->attr.st_dev;
        pthread_mutex_init(&inode->plock_mutex, NULL);
        inode->posix_locks = g_hash_table_new_full(
            g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);

        pthread_mutex_lock(&lo->mutex);
        ino_idx = lo_add_inode_mapping(req, inode);
        if (ino_idx == -1) {
            /*
             * No inode number available. The inode has not been published
             * anywhere yet, so tear it down by hand; newfd is closed at
             * out_err.
             */
            pthread_mutex_unlock(&lo->mutex);
            g_hash_table_destroy(inode->posix_locks);
            pthread_mutex_destroy(&inode->plock_mutex);
            free(inode);
            inode = NULL;
            errno = ENOMEM;
            goto out_err;
        }
        inode->fuse_ino = ino_idx;
        g_hash_table_insert(lo->inodes, &inode->key, inode);
        pthread_mutex_unlock(&lo->mutex);
    }
    e->ino = inode->fuse_ino;
    lo_inode_put(lo, &inode);
    lo_inode_put(lo, &dir);

    fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
             name, (unsigned long long)e->ino);

    return 0;

out_err:
    saverr = errno;
    if (newfd != -1) {
        close(newfd);
    }
    lo_inode_put(lo, &inode);
    lo_inode_put(lo, &dir);
    return saverr;
}
/*
 * FUSE lookup handler: rejects names containing '/' with EINVAL, otherwise
 * replies with the entry found by lo_do_lookup() or with its error.
 */
static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
{
    struct fuse_entry_param entry;
    int err;

    fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
             name);

    /*
     * Don't use is_safe_path_component(), allow "." and ".." for NFS export
     * support.
     */
    if (strchr(name, '/') != NULL) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    err = lo_do_lookup(req, parent, name, &entry);
    if (err == 0) {
        fuse_reply_entry(req, &entry);
        return;
    }
    fuse_reply_err(req, err);
}
/*
 * On some archs, setres*id is limited to 2^16 but they
 * provide setres*id32 variants that allow 2^32.
 * Others just let setres*id do 2^32 anyway.
 *
 * OURSYS_setresgid / OURSYS_setresuid select the 32-bit variant of the
 * syscall number whenever the headers define one. They are passed to
 * syscall() by lo_change_cred() and lo_restore_cred().
 */
#ifdef SYS_setresgid32
#define OURSYS_setresgid SYS_setresgid32
#else
#define OURSYS_setresgid SYS_setresgid
#endif
#ifdef SYS_setresuid32
#define OURSYS_setresuid SYS_setresuid32
#else
#define OURSYS_setresuid SYS_setresuid
#endif
/*
 * Change to uid/gid of caller so that file is created with
 * ownership of caller.
 *
 * The current effective ids are stored in `old` first. The gid is switched
 * before the uid; if the uid switch fails the gid is switched back.
 * Returns 0 on success or the errno of the failing syscall.
 *
 * TODO: What about selinux context?
 */
static int lo_change_cred(fuse_req_t req, struct lo_cred *old)
{
    int saved_errno;

    old->euid = geteuid();
    old->egid = getegid();

    if (syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1) == -1) {
        return errno;
    }

    if (syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1) == -1) {
        saved_errno = errno;
        /* Undo the gid change; its result is deliberately not checked */
        syscall(OURSYS_setresgid, -1, old->egid, -1);
        return saved_errno;
    }

    return 0;
}
/*
 * Regain Privileges: switch the effective uid and then the effective gid
 * back to the values saved in `old`. A failure is logged and terminates the
 * process with exit status 1.
 */
static void lo_restore_cred(struct lo_cred *old)
{
    if (syscall(OURSYS_setresuid, -1, old->euid, -1) == -1) {
        fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
        exit(1);
    }

    if (syscall(OURSYS_setresgid, -1, old->egid, -1) == -1) {
        fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
        exit(1);
    }
}
/*
 * Common implementation of mknod, mkdir and symlink: create `name` in the
 * directory `parent` with the caller's credentials, then look the new entry
 * up and reply with it.
 *
 * Replies EINVAL for an unsafe name, EBADF when `parent` is not mapped, and
 * otherwise the error of the credential switch, the creation or the lookup.
 */
static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
                             const char *name, mode_t mode, dev_t rdev,
                             const char *link)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *dir;
    struct fuse_entry_param e;
    struct lo_cred old = {};
    int saverr;
    int res;

    if (!is_safe_path_component(name)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    dir = lo_inode(req, parent);
    if (dir == NULL) {
        fuse_reply_err(req, EBADF);
        return;
    }

    saverr = lo_change_cred(req, &old);
    if (saverr == 0) {
        res = mknod_wrapper(dir->fd, name, link, mode, rdev);
        /* Grab errno before the credentials are switched back */
        saverr = errno;
        lo_restore_cred(&old);

        if (res != -1) {
            saverr = lo_do_lookup(req, parent, name, &e);
            if (saverr == 0) {
                fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n",
                         (unsigned long long)parent, name,
                         (unsigned long long)e.ino);
                fuse_reply_entry(req, &e);
                lo_inode_put(lo, &dir);
                return;
            }
        }
    }

    lo_inode_put(lo, &dir);
    fuse_reply_err(req, saverr);
}
/* FUSE mknod handler: lo_mknod_symlink() without a link target. */
static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
                     mode_t mode, dev_t rdev)
{
    const char *no_link = NULL;

    lo_mknod_symlink(req, parent, name, mode, rdev, no_link);
}
/*
 * FUSE mkdir handler: lo_mknod_symlink() with S_IFDIR added to the mode, no
 * device number and no link target.
 */
static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
                     mode_t mode)
{
    const mode_t dir_mode = S_IFDIR | mode;

    lo_mknod_symlink(req, parent, name, dir_mode, 0, NULL);
}
/*
 * FUSE symlink handler: lo_mknod_symlink() with mode S_IFLNK and `link` as
 * the link target.
 */
static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
                       const char *name)
{
    const mode_t symlink_mode = S_IFLNK;

    lo_mknod_symlink(req, parent, name, symlink_mode, 0, link);
}
/*
 * Create the hard link `name` in directory fd `dfd` for `inode` without
 * relying on a path supplied by the client.
 *
 * Non-symlinks are linked through their /proc/self/fd entry. Symlinks are
 * tried with AT_EMPTY_PATH first; if that fails with ENOENT or EINVAL the
 * call either fails with EPERM (norace option) or falls back to a
 * parent-directory plus name lookup, which is not race free.
 *
 * Returns 0 on success, -1 with errno set on failure. errno from the failing
 * call is preserved across the release of the parent references.
 */
static int linkat_empty_nofollow(struct lo_data *lo, struct lo_inode *inode,
                                 int dfd, const char *name)
{
    int res;
    int saverr;
    struct lo_inode *parent;
    char path[PATH_MAX];

    if (S_ISLNK(inode->filetype)) {
        res = linkat(inode->fd, "", dfd, name, AT_EMPTY_PATH);
        if (res == -1 && (errno == ENOENT || errno == EINVAL)) {
            /* Sorry, no race free way to hard-link a symlink. */
            if (lo->norace) {
                errno = EPERM;
            } else {
                goto fallback;
            }
        }
        return res;
    }

    snprintf(path, sizeof(path), "%i", inode->fd);
    return linkat(lo->proc_self_fd, path, dfd, name, AT_SYMLINK_FOLLOW);

fallback:
    res = lo_parent_and_name(lo, inode, path, &parent);
    if (res != -1) {
        res = linkat(parent->fd, path, dfd, name, 0);
        /* Dropping the references may close/free and overwrite errno */
        saverr = errno;
        unref_inode_lolocked(lo, parent, 1);
        lo_inode_put(lo, &parent);
        errno = saverr;
    }
    return res;
}
/*
 * FUSE link handler: hard-link inode `ino` as `name` inside directory
 * `parent` and reply with the entry of the linked inode, whose nlookup count
 * is incremented for the reply.
 *
 * Replies EINVAL for an unsafe name, EBADF when either inode is not mapped,
 * otherwise the errno of the failing call.
 */
static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
                    const char *name)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *parent_inode;
    struct lo_inode *inode;
    struct fuse_entry_param e;
    int saverr;

    if (!is_safe_path_component(name)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    parent_inode = lo_inode(req, parent);
    inode = lo_inode(req, ino);
    if (parent_inode == NULL || inode == NULL) {
        errno = EBADF;
        goto fail;
    }

    memset(&e, 0, sizeof(struct fuse_entry_param));
    e.attr_timeout = lo->timeout;
    e.entry_timeout = lo->timeout;

    if (linkat_empty_nofollow(lo, inode, parent_inode->fd, name) == -1) {
        goto fail;
    }

    if (fstatat(inode->fd, "", &e.attr,
                AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW) == -1) {
        goto fail;
    }

    /* The reply hands the inode number to the client once more */
    pthread_mutex_lock(&lo->mutex);
    inode->nlookup++;
    pthread_mutex_unlock(&lo->mutex);
    e.ino = inode->fuse_ino;

    fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
             name, (unsigned long long)e.ino);

    fuse_reply_entry(req, &e);
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &inode);
    return;

fail:
    saverr = errno;
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &inode);
    fuse_reply_err(req, saverr);
}
/*
 * Stat `name` relative to directory `parent` and return the known inode that
 * matches it, or NULL if the stat fails or no such inode is known.
 *
 * Increments nlookup and caller must release refcount using lo_inode_put()
 */
static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
                                    const char *name)
{
    struct stat st;
    int parent_fd = lo_fd(req, parent);

    if (fstatat(parent_fd, name, &st,
                AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW) == -1) {
        return NULL;
    }

    return lo_find(lo_data(req), &st);
}
/*
 * FUSE rmdir handler: remove directory `name` from `parent` and reply with
 * the result. Replies EINVAL for an unsafe name and EIO when the entry does
 * not resolve to a known inode. The references taken by lookup_name() are
 * dropped after the reply.
 */
static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    int err = 0;

    if (!is_safe_path_component(name)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    inode = lookup_name(req, parent, name);
    if (inode == NULL) {
        fuse_reply_err(req, EIO);
        return;
    }

    if (unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR) == -1) {
        err = errno;
    }
    fuse_reply_err(req, err);

    unref_inode_lolocked(lo, inode, 1);
    lo_inode_put(lo, &inode);
}
/*
 * FUSE rename handler: rename `name` in `parent` to `newname` in
 * `newparent` and reply with the result.
 *
 * Without flags renameat() is used. With flags the renameat2 syscall is used
 * when its number is known at build time; EINVAL is replied when it is not,
 * or when the syscall reports ENOSYS. Replies EINVAL for unsafe names, EBADF
 * when a parent is not mapped and EIO when the source does not resolve to a
 * known inode. All references taken here are dropped before returning.
 */
static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
                      fuse_ino_t newparent, const char *newname,
                      unsigned int flags)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *parent_inode;
    struct lo_inode *newparent_inode;
    struct lo_inode *oldinode = NULL;
    struct lo_inode *newinode = NULL;
    int res;

    if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    parent_inode = lo_inode(req, parent);
    newparent_inode = lo_inode(req, newparent);
    if (parent_inode == NULL || newparent_inode == NULL) {
        fuse_reply_err(req, EBADF);
        goto out;
    }

    /* Both names are looked up; only the source has to be known */
    oldinode = lookup_name(req, parent, name);
    newinode = lookup_name(req, newparent, newname);
    if (oldinode == NULL) {
        fuse_reply_err(req, EIO);
        goto out;
    }

    if (flags == 0) {
        res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
        fuse_reply_err(req, res == -1 ? errno : 0);
        goto out;
    }

#ifdef SYS_renameat2
    res = syscall(SYS_renameat2, parent_inode->fd, name,
                  newparent_inode->fd, newname, flags);
    if (res == -1 && errno == ENOSYS) {
        fuse_reply_err(req, EINVAL);
    } else {
        fuse_reply_err(req, res == -1 ? errno : 0);
    }
#else
    fuse_reply_err(req, EINVAL);
#endif

out:
    unref_inode_lolocked(lo, oldinode, 1);
    unref_inode_lolocked(lo, newinode, 1);
    lo_inode_put(lo, &oldinode);
    lo_inode_put(lo, &newinode);
    lo_inode_put(lo, &parent_inode);
    lo_inode_put(lo, &newparent_inode);
}
/*
 * FUSE unlink handler: remove `name` from `parent` and reply with the
 * result. Replies EINVAL for an unsafe name and EIO when the entry does not
 * resolve to a known inode. The references taken by lookup_name() are
 * dropped after the reply.
 */
static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    int err = 0;

    if (!is_safe_path_component(name)) {
        fuse_reply_err(req, EINVAL);
        return;
    }

    inode = lookup_name(req, parent, name);
    if (inode == NULL) {
        fuse_reply_err(req, EIO);
        return;
    }

    if (unlinkat(lo_fd(req, parent), name, 0) == -1) {
        err = errno;
    }
    fuse_reply_err(req, err);

    unref_inode_lolocked(lo, inode, 1);
    lo_inode_put(lo, &inode);
}
/*
 * Subtract `n` from inode->nlookup. When the count reaches zero the inode is
 * removed from the inode map and the hash table, its posix lock table and
 * mutex are destroyed, and the reference held on behalf of nlookup is
 * dropped. A NULL inode is ignored.
 *
 * To be called with lo->mutex held
 */
static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
{
    if (inode == NULL) {
        return;
    }

    assert(inode->nlookup >= n);
    inode->nlookup -= n;
    if (inode->nlookup != 0) {
        return; /* still looked up by the client */
    }

    lo_map_remove(&lo->ino_map, inode->fuse_ino);
    g_hash_table_remove(lo->inodes, &inode->key);
    if (g_hash_table_size(inode->posix_locks) != 0) {
        fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
    }
    g_hash_table_destroy(inode->posix_locks);
    pthread_mutex_destroy(&inode->plock_mutex);

    /* Drop our refcount from lo_do_lookup() */
    lo_inode_put(lo, &inode);
}
/*
 * Same as unref_inode(), for callers that do not hold lo->mutex: the mutex
 * is taken around the call. A NULL inode is ignored without locking.
 */
static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
                                 uint64_t n)
{
    if (inode != NULL) {
        pthread_mutex_lock(&lo->mutex);
        unref_inode(lo, inode, n);
        pthread_mutex_unlock(&lo->mutex);
    }
}
/*
 * Drop `nlookup` lookup references from the inode registered under `ino`.
 * Unknown inode numbers are ignored. No reply is sent from here.
 */
static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode = lo_inode(req, ino);

    if (inode == NULL) {
        return;
    }

    fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
             (unsigned long long)ino, (unsigned long long)inode->nlookup,
             (unsigned long long)nlookup);

    unref_inode_lolocked(lo, inode, nlookup);
    /* Release the temporary reference taken by lo_inode() above */
    lo_inode_put(lo, &inode);
}
/*
 * FUSE forget handler: drop `nlookup` lookup references from `ino`, then
 * complete the request with fuse_reply_none().
 */
static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
{
    const uint64_t dropped = nlookup;

    lo_forget_one(req, ino, dropped);
    fuse_reply_none(req);
}
/*
 * FUSE batch forget handler: apply each of the `count` entries in `forgets`
 * through lo_forget_one(), then complete the request with fuse_reply_none().
 */
static void lo_forget_multi(fuse_req_t req, size_t count,
                            struct fuse_forget_data *forgets)
{
    size_t i; /* same type as count: no signed/unsigned comparison */

    for (i = 0; i < count; i++) {
        lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
    }
    fuse_reply_none(req);
}
/*
 * FUSE readlink handler: read the link target through the inode's fd and
 * reply with it. Replies with errno if the read fails and with ENAMETOOLONG
 * when the target fills the whole buffer.
 */
static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
{
    char target[PATH_MAX + 1];
    int len;

    len = readlinkat(lo_fd(req, ino), "", target, sizeof(target));
    if (len == -1) {
        fuse_reply_err(req, errno);
        return;
    }

    /* A completely filled buffer leaves no room for the terminator */
    if (len == sizeof(target)) {
        fuse_reply_err(req, ENAMETOOLONG);
        return;
    }

    target[len] = '\0';
    fuse_reply_readlink(req, target);
}
/* State of one open directory stream, referenced from lo_data.dirp_map. */
struct lo_dirp {
gint refcount; /* atomic; the stream is closed and freed by the last lo_dirp_put() */
DIR *dp; /* stream created with fdopendir() in lo_opendir() */
struct dirent *entry; /* entry read but not yet consumed, or NULL */
off_t offset; /* directory offset the stream is currently positioned at */
};
/*
 * Drop one reference on *dp and set *dp to NULL. When the last reference
 * goes away the directory stream is closed and the object is freed. A NULL
 * *dp is accepted and ignored.
 */
static void lo_dirp_put(struct lo_dirp **dp)
{
    struct lo_dirp *victim = *dp;

    if (victim == NULL) {
        return;
    }
    *dp = NULL;

    if (!g_atomic_int_dec_and_test(&victim->refcount)) {
        return; /* other references remain */
    }
    closedir(victim->dp);
    free(victim);
}
/*
 * Look up the directory stream registered under fi->fh and take a reference
 * on it. Returns NULL if the handle is not mapped.
 *
 * Call lo_dirp_put() on the return value when no longer needed
 */
static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    struct lo_map_elem *elem;
    struct lo_dirp *dirp = NULL;

    pthread_mutex_lock(&lo->mutex);
    elem = lo_map_get(&lo->dirp_map, fi->fh);
    if (elem) {
        /*
         * Read the pointer while the lock is held: once it is released the
         * slot array may be reallocated or the slot reused by another thread,
         * so `elem` must not be dereferenced afterwards.
         */
        dirp = elem->dirp;
        g_atomic_int_inc(&dirp->refcount);
    }
    pthread_mutex_unlock(&lo->mutex);

    return dirp;
}
/*
 * FUSE opendir handler.  Opens "." relative to the inode's fd, wraps it
 * in a directory stream, registers the resulting lo_dirp in the dirp map
 * and returns the map index to the client in fi->fh.
 *
 * Every failure path releases whatever was acquired so far and replies
 * with ENOMEM (allocation / mapping failure) or the errno of the failing
 * call.
 */
static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
                       struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    struct lo_dirp *dirp;
    ssize_t handle;
    int saved_errno;
    int dir_fd;

    dirp = calloc(1, sizeof(struct lo_dirp));
    if (dirp == NULL) {
        fuse_reply_err(req, ENOMEM);
        return;
    }

    dir_fd = openat(lo_fd(req, ino), ".", O_RDONLY);
    if (dir_fd == -1) {
        saved_errno = errno;
        free(dirp);
        fuse_reply_err(req, saved_errno);
        return;
    }

    dirp->dp = fdopendir(dir_fd);
    if (dirp->dp == NULL) {
        saved_errno = errno;
        /* No stream owns the descriptor yet, close it directly */
        close(dir_fd);
        free(dirp);
        fuse_reply_err(req, saved_errno);
        return;
    }

    dirp->offset = 0;
    dirp->entry = NULL;
    g_atomic_int_set(&dirp->refcount, 1); /* paired with lo_releasedir() */

    pthread_mutex_lock(&lo->mutex);
    handle = lo_add_dirp_mapping(req, dirp);
    pthread_mutex_unlock(&lo->mutex);
    if (handle == -1) {
        /* closedir() also closes the underlying descriptor */
        closedir(dirp->dp);
        free(dirp);
        fuse_reply_err(req, ENOMEM);
        return;
    }

    fi->fh = handle;
    if (lo->cache == CACHE_ALWAYS) {
        fi->cache_readdir = 1;
    }
    fuse_reply_open(req, fi);
}
/*
 * Common implementation of readdir (@plus == 0) and readdirplus
 * (@plus != 0).
 *
 * Fills a buffer of at most @size bytes with directory entries starting
 * at @offset and replies with it.  In "plus" mode every entry other than
 * "." and ".." is looked up via lo_do_lookup(); if such an entry then
 * does not fit in the buffer, the lookup is undone with lo_forget_one().
 */
static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
off_t offset, struct fuse_file_info *fi, int plus)
{
struct lo_data *lo = lo_data(req);
struct lo_dirp *d = NULL;
struct lo_inode *dinode;
char *buf = NULL;
char *p;
size_t rem = size;
int err = EBADF;
dinode = lo_inode(req, ino);
if (!dinode) {
goto error;
}
d = lo_dirp(req, fi);
if (!d) {
goto error;
}
err = ENOMEM;
buf = calloc(1, size);
if (!buf) {
goto error;
}
p = buf;
/* Reposition the stream if the client asks for a different offset */
if (offset != d->offset) {
seekdir(d->dp, offset);
d->entry = NULL;
d->offset = offset;
}
while (1) {
size_t entsize;
off_t nextoff;
const char *name;
/* d->entry may still hold an entry that did not fit last time */
if (!d->entry) {
/* readdir() returns NULL for both EOF and error; errno tells them apart */
errno = 0;
d->entry = readdir(d->dp);
if (!d->entry) {
if (errno) { /* Error */
err = errno;
goto error;
} else { /* End of stream */
break;
}
}
}
nextoff = d->entry->d_off;
name = d->entry->d_name;
fuse_ino_t entry_ino = 0;
struct fuse_entry_param e = (struct fuse_entry_param){
.attr.st_ino = d->entry->d_ino,
.attr.st_mode = d->entry->d_type << 12,
};
/* Hide root's parent directory */
if (dinode == &lo->root && strcmp(name, "..") == 0) {
e.attr.st_ino = lo->root.key.ino;
e.attr.st_mode = DT_DIR << 12;
}
if (plus) {
if (!is_dot_or_dotdot(name)) {
err = lo_do_lookup(req, ino, name, &e);
if (err) {
goto error;
}
/* Remember the inode so the lookup can be undone if it does not fit */
entry_ino = e.ino;
}
entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
} else {
entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
}
/* Entry does not fit: keep it in d->entry for the next call and stop */
if (entsize > rem) {
if (entry_ino != 0) {
lo_forget_one(req, entry_ino, 1);
}
break;
}
p += entsize;
rem -= entsize;
d->entry = NULL;
d->offset = nextoff;
}
err = 0;
error:
lo_dirp_put(&d);
lo_inode_put(lo, &dinode);
/*
* If there's an error, we can only signal it if we haven't stored
* any entries yet - otherwise we'd end up with wrong lookup
* counts for the entries that are already in the buffer. So we
* return what we've collected until that point.
*/
if (err && rem == size) {
fuse_reply_err(req, err);
} else {
fuse_reply_buf(req, buf, size - rem);
}
free(buf);
}
/* FUSE readdir handler: lo_do_readdir() without per-entry lookups */
static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
off_t offset, struct fuse_file_info *fi)
{
lo_do_readdir(req, ino, size, offset, fi, 0);
}
/* FUSE readdirplus handler: lo_do_readdir() with per-entry lookups */
static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
off_t offset, struct fuse_file_info *fi)
{
lo_do_readdir(req, ino, size, offset, fi, 1);
}
/*
 * FUSE releasedir handler.  Removes the handle in fi->fh from the dirp
 * map and drops the reference taken in lo_opendir().  Replies EBADF if
 * the handle is unknown, 0 otherwise.
 */
static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
                          struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    struct lo_dirp *dirp = NULL;
    struct lo_map_elem *elem;

    (void)ino;

    pthread_mutex_lock(&lo->mutex);
    elem = lo_map_get(&lo->dirp_map, fi->fh);
    if (elem) {
        dirp = elem->dirp;
        lo_map_remove(&lo->dirp_map, fi->fh);
    }
    pthread_mutex_unlock(&lo->mutex);

    if (!elem) {
        fuse_reply_err(req, EBADF);
        return;
    }

    lo_dirp_put(&dirp); /* paired with lo_opendir() */
    fuse_reply_err(req, 0);
}
/*
 * Adjust the client's open flags in fi->flags before they are used on
 * the host:
 *
 *  - With writeback caching the kernel may send read requests even when
 *    userspace opened write-only, so O_WRONLY is upgraded to O_RDWR.
 *  - With writeback caching O_APPEND is handled by the kernel and is
 *    therefore dropped here.  This breaks atomicity if the file changes
 *    in the underlying filesystem; that is accepted.
 *  - O_DIRECT in the guest should not necessarily bypass the host page
 *    cache, so it is always cleared.
 */
static void update_open_flags(int writeback, struct fuse_file_info *fi)
{
    if (writeback) {
        if ((fi->flags & O_ACCMODE) == O_WRONLY) {
            fi->flags = (fi->flags & ~O_ACCMODE) | O_RDWR;
        }
        fi->flags &= ~O_APPEND;
    }

    fi->flags &= ~O_DIRECT;
}
/*
 * FUSE create handler: create and open @name inside directory @parent.
 *
 * The file is created between lo_change_cred() and lo_restore_cred().
 * On success the new fd is registered in the fd map (index returned in
 * fi->fh) and the entry is looked up so that it can be returned together
 * with the open reply.
 */
static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode, struct fuse_file_info *fi)
{
int fd;
struct lo_data *lo = lo_data(req);
struct lo_inode *parent_inode;
struct fuse_entry_param e;
int err;
struct lo_cred old = {};
fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent,
name);
if (!is_safe_path_component(name)) {
fuse_reply_err(req, EINVAL);
return;
}
parent_inode = lo_inode(req, parent);
if (!parent_inode) {
fuse_reply_err(req, EBADF);
return;
}
err = lo_change_cred(req, &old);
if (err) {
goto out;
}
update_open_flags(lo->writeback, fi);
fd = openat(parent_inode->fd, name, (fi->flags | O_CREAT) & ~O_NOFOLLOW,
mode);
/* Capture errno before lo_restore_cred() can overwrite it */
err = fd == -1 ? errno : 0;
lo_restore_cred(&old);
if (!err) {
ssize_t fh;
pthread_mutex_lock(&lo->mutex);
fh = lo_add_fd_mapping(req, fd);
pthread_mutex_unlock(&lo->mutex);
if (fh == -1) {
close(fd);
err = ENOMEM;
goto out;
}
fi->fh = fh;
/* NOTE(review): if this lookup fails the fd mapping added above is not removed here - confirm */
err = lo_do_lookup(req, parent, name, &e);
}
if (lo->cache == CACHE_NONE) {
fi->direct_io = 1;
} else if (lo->cache == CACHE_ALWAYS) {
fi->keep_cache = 1;
}
out:
lo_inode_put(lo, &parent_inode);
if (err) {
fuse_reply_err(req, err);
} else {
fuse_reply_create(req, &e, fi);
}
}
/*
 * Return the posix lock context registered for @lock_owner on @inode,
 * creating it on first use.  A new context gets its own file descriptor,
 * opened through lo->proc_self_fd, to be used for OFD locks.
 *
 * On failure NULL is returned and *@err holds the error number.
 * @pid is currently unused.
 *
 * Should be called with inode->plock_mutex held.
 */
static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
                                                      struct lo_inode *inode,
                                                      uint64_t lock_owner,
                                                      pid_t pid, int *err)
{
    struct lo_inode_plock *ctx;
    char fd_name[64];
    int ofd;

    ctx = g_hash_table_lookup(inode->posix_locks,
                              GUINT_TO_POINTER(lock_owner));
    if (ctx != NULL) {
        return ctx;
    }

    ctx = malloc(sizeof(struct lo_inode_plock));
    if (ctx == NULL) {
        *err = ENOMEM;
        return NULL;
    }

    /* Open another instance of file which can be used for ofd locks. */
    sprintf(fd_name, "%i", inode->fd);

    /* TODO: What if file is not writable? */
    ofd = openat(lo->proc_self_fd, fd_name, O_RDWR);
    if (ofd == -1) {
        *err = errno;
        free(ctx);
        return NULL;
    }

    ctx->lock_owner = lock_owner;
    ctx->fd = ofd;
    g_hash_table_insert(inode->posix_locks,
                        GUINT_TO_POINTER(ctx->lock_owner), ctx);
    return ctx;
}
/*
 * FUSE getlk handler: query a conflicting lock for @lock using an OFD
 * lock on the per-owner descriptor from lookup_create_plock_ctx().
 *
 * Replies with the (possibly updated) @lock on success, or with an error
 * number on failure.
 */
static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
                     struct flock *lock)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    struct lo_inode_plock *plock;
    int ret, saverr = 0;

    /*
     * The owner and the lock range are 64-bit quantities; print them via
     * PRIx64 with explicit casts so the format matches the arguments on
     * platforms where long is 32 bits wide.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
             " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
             " l_len=0x%" PRIx64 "\n",
             ino, fi->flags, (uint64_t)fi->lock_owner, lock->l_type,
             (uint64_t)lock->l_start, (uint64_t)lock->l_len);

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    pthread_mutex_lock(&inode->plock_mutex);
    plock =
        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
    if (!plock) {
        saverr = ret;
        goto out;
    }

    ret = fcntl(plock->fd, F_OFD_GETLK, lock);
    if (ret == -1) {
        saverr = errno;
    }

out:
    pthread_mutex_unlock(&inode->plock_mutex);
    lo_inode_put(lo, &inode);

    if (saverr) {
        fuse_reply_err(req, saverr);
    } else {
        fuse_reply_lock(req, lock);
    }
}
/*
 * FUSE setlk handler: apply @lock as an OFD lock on the per-owner
 * descriptor from lookup_create_plock_ctx().
 *
 * Blocking requests (@sleep != 0) are rejected with EOPNOTSUPP.  The
 * reply carries 0 on success or an error number.
 */
static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
                     struct flock *lock, int sleep)
{
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    struct lo_inode_plock *plock;
    int ret, saverr = 0;

    /*
     * The owner and the lock range are 64-bit quantities; print them via
     * PRIx64 with explicit casts so the format matches the arguments on
     * platforms where long is 32 bits wide.
     */
    fuse_log(FUSE_LOG_DEBUG,
             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
             " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
             " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
             ino, fi->flags, lock->l_type, lock->l_pid,
             (uint64_t)fi->lock_owner, sleep, lock->l_whence,
             (uint64_t)lock->l_start, (uint64_t)lock->l_len);

    if (sleep) {
        fuse_reply_err(req, EOPNOTSUPP);
        return;
    }

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    pthread_mutex_lock(&inode->plock_mutex);
    plock =
        lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret);
    if (!plock) {
        saverr = ret;
        goto out;
    }

    /* TODO: Is it alright to modify flock? */
    lock->l_pid = 0;
    ret = fcntl(plock->fd, F_OFD_SETLK, lock);
    if (ret == -1) {
        saverr = errno;
    }

out:
    pthread_mutex_unlock(&inode->plock_mutex);
    lo_inode_put(lo, &inode);

    fuse_reply_err(req, saverr);
}
/*
 * FUSE fsyncdir handler: flush the directory behind the handle in fi->fh
 * with fdatasync() (@datasync != 0) or fsync().  Replies EBADF for an
 * unknown handle, otherwise 0 or the errno of the sync call.
 */
static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
                        struct fuse_file_info *fi)
{
    int res;
    int saverr;
    struct lo_dirp *d;
    int fd;

    (void)ino;

    d = lo_dirp(req, fi);
    if (!d) {
        fuse_reply_err(req, EBADF);
        return;
    }

    fd = dirfd(d->dp);
    if (datasync) {
        res = fdatasync(fd);
    } else {
        res = fsync(fd);
    }

    /*
     * Capture errno right away: lo_dirp_put() may end up in closedir()
     * and free(), either of which can overwrite it.
     */
    saverr = res == -1 ? errno : 0;

    lo_dirp_put(&d);

    fuse_reply_err(req, saverr);
}
/*
 * FUSE open handler.  Re-opens the inode's descriptor by name through
 * lo->proc_self_fd using the (adjusted) client flags, registers the new
 * fd in the fd map and returns the map index in fi->fh.
 */
static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    char fd_name[64];
    ssize_t handle;
    int new_fd;

    fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino,
             fi->flags);

    update_open_flags(lo->writeback, fi);

    sprintf(fd_name, "%i", lo_fd(req, ino));
    new_fd = openat(lo->proc_self_fd, fd_name, fi->flags & ~O_NOFOLLOW);
    if (new_fd == -1) {
        fuse_reply_err(req, errno);
        return;
    }

    pthread_mutex_lock(&lo->mutex);
    handle = lo_add_fd_mapping(req, new_fd);
    pthread_mutex_unlock(&lo->mutex);
    if (handle == -1) {
        close(new_fd);
        fuse_reply_err(req, ENOMEM);
        return;
    }
    fi->fh = handle;

    /* Translate the configured cache mode into per-open hints */
    if (lo->cache == CACHE_ALWAYS) {
        fi->keep_cache = 1;
    } else if (lo->cache == CACHE_NONE) {
        fi->direct_io = 1;
    }

    fuse_reply_open(req, fi);
}
/*
 * FUSE release handler.  Removes the handle in fi->fh from the fd map
 * and closes the associated descriptor.  Always replies with 0.
 */
static void lo_release(fuse_req_t req, fuse_ino_t ino,
                       struct fuse_file_info *fi)
{
    struct lo_data *lo = lo_data(req);
    struct lo_map_elem *entry;
    int host_fd = -1;

    (void)ino;

    pthread_mutex_lock(&lo->mutex);
    entry = lo_map_get(&lo->fd_map, fi->fh);
    if (entry != NULL) {
        host_fd = entry->fd;
        lo_map_remove(&lo->fd_map, fi->fh);
    }
    pthread_mutex_unlock(&lo->mutex);

    /* host_fd is still -1 here when the handle was not found */
    close(host_fd);
    fuse_reply_err(req, 0);
}
/*
 * FUSE flush handler.  Drops the posix lock context registered for
 * fi->lock_owner on the inode, then closes a duplicate of the file
 * descriptor and replies with 0 or the errno of that close.
 */
static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
{
    struct lo_inode *inode;
    int res;
    int saverr;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    /* An fd is going away. Cleanup associated posix locks */
    pthread_mutex_lock(&inode->plock_mutex);
    g_hash_table_remove(inode->posix_locks, GUINT_TO_POINTER(fi->lock_owner));
    pthread_mutex_unlock(&inode->plock_mutex);

    res = close(dup(lo_fi_fd(req, fi)));
    /* Save errno before lo_inode_put() gets a chance to overwrite it */
    saverr = res == -1 ? errno : 0;

    lo_inode_put(lo_data(req), &inode);
    fuse_reply_err(req, saverr);
}
/*
 * FUSE fsync handler.  Syncs the open file in @fi, or - when @fi is NULL -
 * a temporary descriptor opened for the inode through lo->proc_self_fd.
 * Uses fdatasync() when @datasync is non-zero, fsync() otherwise, and
 * replies with 0 or the errno of the failing call.
 */
static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
                     struct fuse_file_info *fi)
{
    int res;
    int saverr;
    int fd;
    char *buf;

    fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
             (void *)fi);

    if (!fi) {
        struct lo_data *lo = lo_data(req);

        res = asprintf(&buf, "%i", lo_fd(req, ino));
        if (res == -1) {
            return (void)fuse_reply_err(req, errno);
        }

        fd = openat(lo->proc_self_fd, buf, O_RDWR);
        free(buf);
        if (fd == -1) {
            return (void)fuse_reply_err(req, errno);
        }
    } else {
        fd = lo_fi_fd(req, fi);
    }

    if (datasync) {
        res = fdatasync(fd);
    } else {
        res = fsync(fd);
    }
    /* Save errno before close() below can overwrite it */
    saverr = res == -1 ? errno : 0;

    if (!fi) {
        close(fd);
    }
    fuse_reply_err(req, saverr);
}
/*
 * FUSE read handler.  Describes the requested range as an fd-backed
 * buffer (open file's descriptor at @offset, @size bytes) and hands it to
 * fuse_reply_data().
 */
static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
                    struct fuse_file_info *fi)
{
    struct fuse_bufvec src = FUSE_BUFVEC_INIT(size);

    fuse_log(FUSE_LOG_DEBUG,
             "lo_read(ino=%" PRIu64 ", size=%zd, "
             "off=%lu)\n",
             ino, size, (unsigned long)offset);

    src.buf[0].fd = lo_fi_fd(req, fi);
    src.buf[0].pos = offset;
    src.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;

    fuse_reply_data(req, &src);
}
/*
 * FUSE write_buf handler.  Copies @in_buf into the open file at @off via
 * fuse_buf_copy() and replies with the number of bytes written or an
 * error.
 *
 * If fi->kill_priv is set, CAP_FSETID is dropped for the duration of the
 * write, which should lead to the kernel clearing setuid/setgid on the
 * file; it is regained after the reply has been sent.
 */
static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
                         struct fuse_bufvec *in_buf, off_t off,
                         struct fuse_file_info *fi)
{
    struct fuse_bufvec dst = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
    bool fsetid_dropped = false;
    ssize_t rc;

    dst.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
    dst.buf[0].fd = lo_fi_fd(req, fi);
    dst.buf[0].pos = off;

    fuse_log(FUSE_LOG_DEBUG,
             "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino,
             dst.buf[0].size, (unsigned long)off);

    if (fi->kill_priv) {
        rc = drop_effective_cap("FSETID", &fsetid_dropped);
        if (rc != 0) {
            fuse_reply_err(req, rc);
            return;
        }
    }

    rc = fuse_buf_copy(&dst, in_buf);
    if (rc >= 0) {
        fuse_reply_write(req, (size_t)rc);
    } else {
        fuse_reply_err(req, -rc);
    }

    if (fsetid_dropped && gain_effective_cap("FSETID")) {
        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
    }
}
/*
 * FUSE statfs handler: reports fstatvfs() results for the inode's fd, or
 * the errno of the failed call.
 */
static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
{
    struct statvfs vfs;

    if (fstatvfs(lo_fd(req, ino), &vfs) == -1) {
        fuse_reply_err(req, errno);
        return;
    }
    fuse_reply_statfs(req, &vfs);
}
/*
 * FUSE fallocate handler.
 *
 * With CONFIG_FALLOCATE the request is forwarded to fallocate().  With
 * only CONFIG_POSIX_FALLOCATE, posix_fallocate() is used and any non-zero
 * @mode is rejected with EOPNOTSUPP.  Without either, the reply is always
 * EOPNOTSUPP.
 */
static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
                         off_t length, struct fuse_file_info *fi)
{
    int err = EOPNOTSUPP;

    (void)ino;

#ifdef CONFIG_FALLOCATE
    err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
    err = err < 0 ? errno : err;
#elif defined(CONFIG_POSIX_FALLOCATE)
    if (mode == 0) {
        /* posix_fallocate() returns the error number directly */
        err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
    } else {
        err = EOPNOTSUPP;
    }
#endif

    fuse_reply_err(req, err);
}
/*
 * FUSE flock handler: applies @op with flock() on the open file and
 * replies with 0 or the resulting errno.
 */
static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
                     int op)
{
    int err = 0;

    (void)ino;

    if (flock(lo_fi_fd(req, fi), op) == -1) {
        err = errno;
    }
    fuse_reply_err(req, err);
}
/*
 * FUSE getxattr handler.
 *
 * Replies ENOSYS when xattr support is disabled.  With @size == 0 the
 * reply carries the length reported by the host; otherwise up to @size
 * bytes of the attribute value are returned.
 */
static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
                        size_t size)
{
    struct lo_data *lo = lo_data(req);
    char *value = NULL;
    char procname[64];
    struct lo_inode *inode;
    ssize_t ret;
    int saverr;
    int fd = -1;
    int res;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    saverr = ENOSYS;
    if (!lo_data(req)->xattr) {
        goto out;
    }

    fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
             ino, name, size);

    if (size) {
        value = malloc(size);
        if (!value) {
            goto out_err;
        }
    }

    sprintf(procname, "%i", inode->fd);
    /*
     * It is not safe to open() non-regular/non-dir files in file server
     * unless O_PATH is used, so use that method for regular files/dir
     * only (as it seems giving less performance overhead).
     * Otherwise, call fchdir() to avoid open().
     */
    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
        if (fd < 0) {
            goto out_err;
        }
        ret = fgetxattr(fd, name, value, size);
    } else {
        /*
         * fchdir should not fail here.  The calls are kept outside of
         * assert() so that they are still executed when assertions are
         * compiled out with NDEBUG.
         */
        res = fchdir(lo->proc_self_fd);
        assert(res == 0);
        ret = getxattr(procname, name, value, size);
        res = fchdir(lo->root.fd);
        assert(res == 0);
        (void)res;
    }

    if (ret == -1) {
        goto out_err;
    }
    if (size) {
        saverr = 0;
        if (ret == 0) {
            /* Empty value: reply with error code 0 instead of a buffer */
            goto out;
        }
        fuse_reply_buf(req, value, ret);
    } else {
        fuse_reply_xattr(req, ret);
    }
out_free:
    free(value);

    if (fd >= 0) {
        close(fd);
    }

    lo_inode_put(lo, &inode);
    return;

out_err:
    saverr = errno;
out:
    fuse_reply_err(req, saverr);
    goto out_free;
}
/*
 * FUSE listxattr handler.
 *
 * Replies ENOSYS when xattr support is disabled.  With @size == 0 the
 * reply carries the length reported by the host; otherwise up to @size
 * bytes of the attribute name list are returned.
 */
static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
{
    struct lo_data *lo = lo_data(req);
    char *value = NULL;
    char procname[64];
    struct lo_inode *inode;
    ssize_t ret;
    int saverr;
    int fd = -1;
    int res;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    saverr = ENOSYS;
    if (!lo_data(req)->xattr) {
        goto out;
    }

    fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
             size);

    if (size) {
        value = malloc(size);
        if (!value) {
            goto out_err;
        }
    }

    sprintf(procname, "%i", inode->fd);
    /* Regular files and directories are opened; other types go via fchdir */
    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
        if (fd < 0) {
            goto out_err;
        }
        ret = flistxattr(fd, value, size);
    } else {
        /*
         * fchdir should not fail here.  The calls are kept outside of
         * assert() so that they are still executed when assertions are
         * compiled out with NDEBUG.
         */
        res = fchdir(lo->proc_self_fd);
        assert(res == 0);
        ret = listxattr(procname, value, size);
        res = fchdir(lo->root.fd);
        assert(res == 0);
        (void)res;
    }

    if (ret == -1) {
        goto out_err;
    }
    if (size) {
        saverr = 0;
        if (ret == 0) {
            /* Empty list: reply with error code 0 instead of a buffer */
            goto out;
        }
        fuse_reply_buf(req, value, ret);
    } else {
        fuse_reply_xattr(req, ret);
    }
out_free:
    free(value);

    if (fd >= 0) {
        close(fd);
    }

    lo_inode_put(lo, &inode);
    return;

out_err:
    saverr = errno;
out:
    fuse_reply_err(req, saverr);
    goto out_free;
}
/*
 * FUSE setxattr handler: sets attribute @name to the @size bytes at
 * @value with the given @flags.  Replies ENOSYS when xattr support is
 * disabled, otherwise 0 or the errno of the failing call.
 */
static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
                        const char *value, size_t size, int flags)
{
    char procname[64];
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    ssize_t ret;
    int saverr;
    int fd = -1;
    int res;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    saverr = ENOSYS;
    if (!lo_data(req)->xattr) {
        goto out;
    }

    /* NOTE(review): value is printed with %s; confirm it is NUL-terminated */
    fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
             ", name=%s value=%s size=%zd)\n", ino, name, value, size);

    sprintf(procname, "%i", inode->fd);
    /* Regular files and directories are opened; other types go via fchdir */
    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
        if (fd < 0) {
            saverr = errno;
            goto out;
        }
        ret = fsetxattr(fd, name, value, size, flags);
    } else {
        /*
         * fchdir should not fail here.  The calls are kept outside of
         * assert() so that they are still executed when assertions are
         * compiled out with NDEBUG.
         */
        res = fchdir(lo->proc_self_fd);
        assert(res == 0);
        ret = setxattr(procname, name, value, size, flags);
        res = fchdir(lo->root.fd);
        assert(res == 0);
        (void)res;
    }

    saverr = ret == -1 ? errno : 0;

out:
    if (fd >= 0) {
        close(fd);
    }

    lo_inode_put(lo, &inode);
    fuse_reply_err(req, saverr);
}
/*
 * FUSE removexattr handler: removes attribute @name from the inode.
 * Replies ENOSYS when xattr support is disabled, otherwise 0 or the errno
 * of the failing call.
 */
static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name)
{
    char procname[64];
    struct lo_data *lo = lo_data(req);
    struct lo_inode *inode;
    ssize_t ret;
    int saverr;
    int fd = -1;
    int res;

    inode = lo_inode(req, ino);
    if (!inode) {
        fuse_reply_err(req, EBADF);
        return;
    }

    saverr = ENOSYS;
    if (!lo_data(req)->xattr) {
        goto out;
    }

    fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
             name);

    sprintf(procname, "%i", inode->fd);
    /* Regular files and directories are opened; other types go via fchdir */
    if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
        fd = openat(lo->proc_self_fd, procname, O_RDONLY);
        if (fd < 0) {
            saverr = errno;
            goto out;
        }
        ret = fremovexattr(fd, name);
    } else {
        /*
         * fchdir should not fail here.  The calls are kept outside of
         * assert() so that they are still executed when assertions are
         * compiled out with NDEBUG.
         */
        res = fchdir(lo->proc_self_fd);
        assert(res == 0);
        ret = removexattr(procname, name);
        res = fchdir(lo->root.fd);
        assert(res == 0);
        (void)res;
    }

    saverr = ret == -1 ? errno : 0;

out:
    if (fd >= 0) {
        close(fd);
    }

    lo_inode_put(lo, &inode);
    fuse_reply_err(req, saverr);
}
#ifdef HAVE_COPY_FILE_RANGE
/*
 * FUSE copy_file_range handler: forwards the request to
 * copy_file_range() on the two open files and replies with the number of
 * bytes copied, or with errno on failure.
 */
static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
                               struct fuse_file_info *fi_in, fuse_ino_t ino_out,
                               off_t off_out, struct fuse_file_info *fi_out,
                               size_t len, int flags)
{
    int in_fd = lo_fi_fd(req, fi_in);
    int out_fd = lo_fi_fd(req, fi_out);
    ssize_t copied;

    fuse_log(FUSE_LOG_DEBUG,
             "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
             "off=%lu, ino=%" PRIu64 "/fd=%d, "
             "off=%lu, size=%zd, flags=0x%x)\n",
             ino_in, in_fd, off_in, ino_out, out_fd, off_out, len, flags);

    copied = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
    if (copied >= 0) {
        fuse_reply_write(req, copied);
    } else {
        fuse_reply_err(req, errno);
    }
}
#endif
/*
 * FUSE lseek handler: repositions the open file with lseek() and replies
 * with the resulting offset, or with errno on failure.
 */
static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
                     struct fuse_file_info *fi)
{
    off_t pos;

    (void)ino;

    pos = lseek(lo_fi_fd(req, fi), off, whence);
    if (pos == -1) {
        fuse_reply_err(req, errno);
        return;
    }
    fuse_reply_lseek(req, pos);
}
/*
 * FUSE destroy handler.  With lo->mutex held, repeatedly takes the first
 * entry of the inode hash table and drops all of its lookup references
 * until the table yields no more entries.
 */
static void lo_destroy(void *userdata)
{
    struct lo_data *lo = (struct lo_data *)userdata;
    GHashTableIter iter;
    gpointer key, value;

    pthread_mutex_lock(&lo->mutex);

    g_hash_table_iter_init(&iter, lo->inodes);
    while (g_hash_table_iter_next(&iter, &key, &value)) {
        struct lo_inode *inode = value;

        unref_inode(lo, inode, inode->nlookup);

        /*
         * Start over with a fresh iterator for the next round
         * (presumably because unref_inode() may modify the table).
         */
        g_hash_table_iter_init(&iter, lo->inodes);
    }

    pthread_mutex_unlock(&lo->mutex);
}
/*
 * Table of low-level FUSE operation handlers implemented in this file.
 * It is handed to fuse_session_new() in main().  The copy_file_range
 * handler is only present when HAVE_COPY_FILE_RANGE is defined.
 */
static struct fuse_lowlevel_ops lo_oper = {
.init = lo_init,
.lookup = lo_lookup,
.mkdir = lo_mkdir,
.mknod = lo_mknod,
.symlink = lo_symlink,
.link = lo_link,
.unlink = lo_unlink,
.rmdir = lo_rmdir,
.rename = lo_rename,
.forget = lo_forget,
.forget_multi = lo_forget_multi,
.getattr = lo_getattr,
.setattr = lo_setattr,
.readlink = lo_readlink,
.opendir = lo_opendir,
.readdir = lo_readdir,
.readdirplus = lo_readdirplus,
.releasedir = lo_releasedir,
.fsyncdir = lo_fsyncdir,
.create = lo_create,
.getlk = lo_getlk,
.setlk = lo_setlk,
.open = lo_open,
.release = lo_release,
.flush = lo_flush,
.fsync = lo_fsync,
.read = lo_read,
.write_buf = lo_write_buf,
.statfs = lo_statfs,
.fallocate = lo_fallocate,
.flock = lo_flock,
.getxattr = lo_getxattr,
.listxattr = lo_listxattr,
.setxattr = lo_setxattr,
.removexattr = lo_removexattr,
#ifdef HAVE_COPY_FILE_RANGE
.copy_file_range = lo_copy_file_range,
#endif
.lseek = lo_lseek,
.destroy = lo_destroy,
};
/*
 * Print the vhost-user.json backend program capabilities to stdout as a
 * small JSON object.
 */
static void print_capabilities(void)
{
    fputs("{\n", stdout);
    fputs(" \"type\": \"fs\"\n", stdout);
    fputs("}\n", stdout);
}
/*
* Move to a new mount, net, and pid namespaces to isolate this process.
*
* The calling process forks: the parent only waits for the child and
* exits with its status, while the child continues and returns from this
* function.  On return lo->proc_self_fd refers to the child's
* /proc/self/fd directory.  Any failure terminates the process with
* exit(1).
*/
static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
{
pid_t child;
char template[] = "virtiofsd-XXXXXX";
char *tmpdir;
/*
* Create a new pid namespace for *child* processes. We'll have to
* fork in order to enter the new pid namespace. A new mount namespace
* is also needed so that we can remount /proc for the new pid
* namespace.
*
* Our UNIX domain sockets have been created. Now we can move to
* an empty network namespace to prevent TCP/IP and other network
* activity in case this process is compromised.
*/
if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
exit(1);
}
child = fork();
if (child < 0) {
fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
exit(1);
}
if (child > 0) {
pid_t waited;
int wstatus;
/* The parent waits for the child */
do {
waited = waitpid(child, &wstatus, 0);
} while (waited < 0 && errno == EINTR && !se->exited);
/* We were terminated by a signal, see fuse_signals.c */
if (se->exited) {
exit(0);
}
/* Propagate the child's exit status; anything else counts as failure */
if (WIFEXITED(wstatus)) {
exit(WEXITSTATUS(wstatus));
}
exit(1);
}
/* Only the child gets here */
/* Send us SIGTERM when the parent thread terminates, see prctl(2) */
prctl(PR_SET_PDEATHSIG, SIGTERM);
/*
* If the mounts have shared propagation then we want to opt out so our
* mount changes don't affect the parent mount namespace.
*/
if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
exit(1);
}
/* The child must remount /proc to use the new pid namespace */
if (mount("proc", "/proc", "proc",
MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
exit(1);
}
/*
* Bind-mount /proc/self/fd on a temporary directory, open that
* directory, then detach the mount and remove the directory again.
*/
tmpdir = mkdtemp(template);
if (!tmpdir) {
fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template);
exit(1);
}
if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n",
tmpdir);
exit(1);
}
/* Now we can get our /proc/self/fd directory file descriptor */
lo->proc_self_fd = open(tmpdir, O_PATH);
if (lo->proc_self_fd == -1) {
fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir);
exit(1);
}
if (umount2(tmpdir, MNT_DETACH) < 0) {
fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir);
exit(1);
}
/* Failing to remove the temporary directory is logged but not fatal */
if (rmdir(tmpdir) < 0) {
fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir);
}
}
/*
 * Read the capability state of the process and stash a copy in cap.saved
 * so that it can be restored for individual threads later; see
 * load_capng.  Also initialises cap.mutex.  Exits on failure.
 */
static void setup_capng(void)
{
    /* Note this accesses /proc so has to happen before the sandbox */
    if (capng_get_caps_process() != 0) {
        fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
        exit(1);
    }

    pthread_mutex_init(&cap.mutex, NULL);

    pthread_mutex_lock(&cap.mutex);
    cap.saved = capng_save_state();
    if (cap.saved == NULL) {
        fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
        exit(1);
    }
    pthread_mutex_unlock(&cap.mutex);
}
/*
 * Release the capability state saved in cap.saved and destroy cap.mutex.
 */
static void cleanup_capng(void)
{
free(cap.saved);
cap.saved = NULL;
pthread_mutex_destroy(&cap.mutex);
}
/*
* Make the source directory our root so symlinks cannot escape and no other
* files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
*
* Any failure terminates the process with exit(1).
*/
static void setup_mounts(const char *source)
{
int oldroot;
int newroot;
/* Bind-mount the source directory onto itself */
if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
exit(1);
}
/* This magic is based on lxc's lxc_pivot_root() */
oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
if (oldroot < 0) {
fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
exit(1);
}
newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
if (newroot < 0) {
fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
exit(1);
}
if (fchdir(newroot) < 0) {
fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
exit(1);
}
/* pivot_root with the current directory as both new root and put_old */
if (syscall(__NR_pivot_root, ".", ".") < 0) {
fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
exit(1);
}
/* Go back to the old root through its saved fd so it can be detached */
if (fchdir(oldroot) < 0) {
fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
exit(1);
}
if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
exit(1);
}
if (umount2(".", MNT_DETACH) < 0) {
fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
exit(1);
}
/* Finish with the working directory inside the new root */
if (fchdir(newroot) < 0) {
fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
exit(1);
}
close(newroot);
close(oldroot);
}
/*
* Only keep whitelisted capabilities that are needed for file system operation
*/
static void setup_capabilities(void)
{
pthread_mutex_lock(&cap.mutex);
capng_restore_state(&cap.saved);
/*
* Whitelist file system-related capabilities that are needed for a file
* server to act like root. Drop everything else like networking and
* sysadmin capabilities.
*
* Exclusions:
* 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
* and we don't support that.
* 2. CAP_MAC_OVERRIDE is not included because it only seems to be
* used by the Smack LSM. Omit it until there is demand for it.
*/
capng_setpid(syscall(SYS_gettid));
capng_clear(CAPNG_SELECT_BOTH);
capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
CAP_CHOWN,
CAP_DAC_OVERRIDE,
CAP_DAC_READ_SEARCH,
CAP_FOWNER,
CAP_FSETID,
CAP_SETGID,
CAP_SETUID,
CAP_MKNOD,
CAP_SETFCAP);
capng_apply(CAPNG_SELECT_BOTH);
cap.saved = capng_save_state();
pthread_mutex_unlock(&cap.mutex);
}
/*
* Lock down this process to prevent access to other processes or files outside
* source directory. This reduces the impact of arbitrary code execution bugs.
*
* Runs, in this order: namespace setup, mount setup (using lo->source),
* seccomp setup and capability setup.
*/
static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
bool enable_syslog)
{
setup_namespaces(lo, se);
setup_mounts(lo->source);
setup_seccomp(enable_syslog);
setup_capabilities();
}
/*
 * Set both the soft and the hard limit on open file descriptors to
 * @rlimit_nofile.  A value of 0 leaves the limits untouched.  EPERM from
 * setrlimit() is tolerated; any other failure is fatal.
 */
static void setup_nofile_rlimit(unsigned long rlimit_nofile)
{
    struct rlimit limits;

    if (rlimit_nofile == 0) {
        return; /* nothing to do */
    }

    limits.rlim_cur = rlimit_nofile;
    limits.rlim_max = rlimit_nofile;

    if (setrlimit(RLIMIT_NOFILE, &limits) >= 0) {
        return;
    }

    /* Ignore SELinux denials */
    if (errno == EPERM) {
        return;
    }

    fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
    exit(1);
}
static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
{
g_autofree char *localfmt = NULL;
if (current_log_level < level) {
return;
}
if (current_log_level == FUSE_LOG_DEBUG) {
if (!use_syslog) {
localfmt = g_strdup_printf("[%" PRId64 "] [ID: %08ld] %s",
get_clock(), syscall(__NR_gettid), fmt);
} else {
localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
fmt);
}
fmt = localfmt;
}
if (use_syslog) {
int priority = LOG_ERR;
switch (level) {
case FUSE_LOG_EMERG:
priority = LOG_EMERG;
break;
case FUSE_LOG_ALERT:
priority = LOG_ALERT;
break;
case FUSE_LOG_CRIT:
priority = LOG_CRIT;
break;
case FUSE_LOG_ERR:
priority = LOG_ERR;
break;
case FUSE_LOG_WARNING:
priority = LOG_WARNING;
break;
case FUSE_LOG_NOTICE:
priority = LOG_NOTICE;
break;
case FUSE_LOG_INFO:
priority = LOG_INFO;
break;
case FUSE_LOG_DEBUG:
priority = LOG_DEBUG;
break;
}
vsyslog(priority, fmt, ap);
} else {
vfprintf(stderr, fmt, ap);
}
}
/*
 * Initialise the root inode: open "/" with O_PATH, record its inode and
 * device numbers as the lookup key and start with a lookup count and
 * reference count of 2.  Exits on failure.
 */
static void setup_root(struct lo_data *lo, struct lo_inode *root)
{
    struct stat st;
    int root_fd;

    root_fd = open("/", O_PATH);
    if (root_fd == -1) {
        fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
        exit(1);
    }

    if (fstatat(root_fd, "", &st, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW) == -1) {
        fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
        exit(1);
    }

    root->filetype = S_IFDIR;
    root->fd = root_fd;
    root->key.ino = st.st_ino;
    root->key.dev = st.st_dev;
    root->nlookup = 2;
    g_atomic_int_set(&root->refcount, 2);
}
/*
 * Hash function for the inode hash table: the sum of the key's inode and
 * device numbers, each truncated to guint.
 */
static guint lo_key_hash(gconstpointer key)
{
    const struct lo_key *k = key;
    guint hash = (guint)k->ino;

    hash += (guint)k->dev;
    return hash;
}
/*
 * Equality function for the inode hash table: two keys match when both
 * the inode number and the device number are equal.
 */
static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
{
    const struct lo_key *lhs = a;
    const struct lo_key *rhs = b;

    if (lhs->ino != rhs->ino) {
        return 0;
    }
    return lhs->dev == rhs->dev;
}
/*
 * Release the resources held in @lo: the inode hash table, the three
 * handle maps, the proc_self_fd and root descriptors (when open) and the
 * source path string.
 */
static void fuse_lo_data_cleanup(struct lo_data *lo)
{
    if (lo->inodes != NULL) {
        g_hash_table_destroy(lo->inodes);
    }

    lo_map_destroy(&lo->fd_map);
    lo_map_destroy(&lo->dirp_map);
    lo_map_destroy(&lo->ino_map);

    /* Descriptors are -1 while unset */
    if (!(lo->proc_self_fd < 0)) {
        close(lo->proc_self_fd);
    }
    if (!(lo->root.fd < 0)) {
        close(lo->root.fd);
    }

    free(lo->source);
}
/*
 * Program entry point.
 *
 * Initialises the lo_data state, parses the command line, validates the
 * source directory, creates and mounts the FUSE session, sets up resource
 * limits, capabilities and the sandbox, and then serves requests in
 * virtio_loop() until it returns.  Returns 0 on success and 1 on failure.
 */
int main(int argc, char *argv[])
{
struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
struct fuse_session *se;
struct fuse_cmdline_opts opts;
struct lo_data lo = {
.debug = 0,
.writeback = 0,
.posix_lock = 1,
.proc_self_fd = -1,
};
struct lo_map_elem *root_elem;
int ret = -1;
/* Don't mask creation mode, kernel already did that */
umask(0);
pthread_mutex_init(&lo.mutex, NULL);
lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
lo.root.fd = -1;
lo.root.fuse_ino = FUSE_ROOT_ID;
lo.cache = CACHE_AUTO;
/*
* Set up the ino map like this:
* [0] Reserved (will not be used)
* [1] Root inode
*/
lo_map_init(&lo.ino_map);
lo_map_reserve(&lo.ino_map, 0)->in_use = false;
root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
root_elem->inode = &lo.root;
lo_map_init(&lo.dirp_map);
lo_map_init(&lo.fd_map);
if (fuse_parse_cmdline(&args, &opts) != 0) {
goto err_out1;
}
fuse_set_log_func(log_func);
use_syslog = opts.syslog;
if (use_syslog) {
openlog("virtiofsd", LOG_PID, LOG_DAEMON);
}
/* Informational options: print the requested output and exit successfully */
if (opts.show_help) {
printf("usage: %s [options]\n\n", argv[0]);
fuse_cmdline_help();
printf(" -o source=PATH shared directory tree\n");
fuse_lowlevel_help();
ret = 0;
goto err_out1;
} else if (opts.show_version) {
fuse_lowlevel_version();
ret = 0;
goto err_out1;
} else if (opts.print_capabilities) {
print_capabilities();
ret = 0;
goto err_out1;
}
if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
goto err_out1;
}
/*
* log_level is 0 if not configured via cmd options (0 is LOG_EMERG,
* and we don't use this log level).
*/
if (opts.log_level != 0) {
current_log_level = opts.log_level;
}
/* The debug option takes precedence over an explicit log level */
lo.debug = opts.debug;
if (lo.debug) {
current_log_level = FUSE_LOG_DEBUG;
}
/* The source must be an existing directory; it defaults to "/" */
if (lo.source) {
struct stat stat;
int res;
res = lstat(lo.source, &stat);
if (res == -1) {
fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
lo.source);
exit(1);
}
if (!S_ISDIR(stat.st_mode)) {
fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
exit(1);
}
} else {
/* NOTE(review): the strdup() result is not checked for NULL here */
lo.source = strdup("/");
}
/* Without an explicit timeout, derive it from the cache mode */
if (!lo.timeout_set) {
switch (lo.cache) {
case CACHE_NONE:
lo.timeout = 0.0;
break;
case CACHE_AUTO:
lo.timeout = 1.0;
break;
case CACHE_ALWAYS:
lo.timeout = 86400.0;
break;
}
} else if (lo.timeout < 0) {
fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
exit(1);
}
se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
if (se == NULL) {
goto err_out1;
}
if (fuse_set_signal_handlers(se) != 0) {
goto err_out2;
}
if (fuse_session_mount(se) != 0) {
goto err_out3;
}
fuse_daemonize(opts.foreground);
setup_nofile_rlimit(opts.rlimit_nofile);
/* Must be before sandbox since it wants /proc */
setup_capng();
setup_sandbox(&lo, se, opts.syslog);
/* Runs after the sandbox is set up, so "/" is the sandboxed root */
setup_root(&lo, &lo.root);
/* Block until ctrl+c or fusermount -u */
ret = virtio_loop(se);
fuse_session_unmount(se);
cleanup_capng();
/* Error labels unwind in reverse order of the setup steps above */
err_out3:
fuse_remove_signal_handlers(se);
err_out2:
fuse_session_destroy(se);
err_out1:
fuse_opt_free_args(&args);
fuse_lo_data_cleanup(&lo);
return ret ? 1 : 0;
}