The most notable change is that we now detect cross-device setups in the

host since it may cause inode number collision and mayhem in the guest.
 A new fsdev property is added for the user to choose the appropriate
 policy to handle that: either remap all inode numbers or fail I/Os to
 another host device or just print out a warning (default behaviour).
 
 This is also my last PR as _active_ maintainer of 9pfs.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEtIKLr5QxQM7yo0kQcdTV5YIvc9YFAl2fEn8ACgkQcdTV5YIv
 c9bnTxAApYimbNUT+OjfNfPDjMHrezHCLnczuAWya3JcUCEkZC2E+qEwYdCzdwvq
 TGcdXPcbiUKUNY/3V3pEefuckPJ2+UVmqPpzYcuRjZNYrxqo7SzVPyxxMtG3f5Fh
 +dMu6Hx1s/vkoWf81HO1tnkTdL9aiOMQS7yUtEYidD8yoqJRLwbKGB+uGZrY6aDy
 65n9z/0uwwzOwJsFlRjLMeifkmMC4tA1DLIZHQxGLCUk9K0/xCcI2CbYITgt1T4m
 2xf/0t/+RQT/n6sXheskDpI8hf3A0rvEDETrvHp90zal3iDq93ZfvPd134LFRZIu
 tWsRYNKsaJE4ecIHa/wp535isb4uQa7PL10+oD075o+BF98Nk10ALyAQf7RTefkC
 90lkXeRAGfJaMCuDuTmxFVBmQPgUjXsfKvASG8V4yweqO7oUSl5D8m+aOu7t3+f4
 8n+DhEZp1ANQPgLv4raAxwFhlsVl+BImOZRv/SGKzqgf0jy+NT1/ebfTFyPttFff
 vn7kYfm1V/hPhQVVm7xqGwyRybP+V8td3mWo8hVsiqziZIN4x1wb/qFpJeuHuFSj
 IcJymcH7BgeBYWyjpmn+W94DdIoj20cLwcLHxU6d2L61oUrhKHd7R2g1Ow/aXh4L
 ohoK104GUqTBPbmxn0Dpal/Xz26X4k4l0JvVXzwPdBv99JkRF4I=
 =TqfQ
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/gkurz/tags/9p-next-2019-10-10' into staging

The most notable change is that we now detect cross-device setups in the
host since it may cause inode number collision and mayhem in the guest.
A new fsdev property is added for the user to choose the appropriate
policy to handle that: either remap all inode numbers or fail I/Os to
another host device or just print out a warning (default behaviour).

This is also my last PR as _active_ maintainer of 9pfs.

# gpg: Signature made Thu 10 Oct 2019 12:14:07 BST
# gpg:                using RSA key B4828BAF943140CEF2A3491071D4D5E5822F73D6
# gpg: Good signature from "Greg Kurz <groug@kaod.org>" [full]
# gpg:                 aka "Gregory Kurz <gregory.kurz@free.fr>" [full]
# gpg:                 aka "[jpeg image of size 3330]" [full]
# Primary key fingerprint: B482 8BAF 9431 40CE F2A3  4910 71D4 D5E5 822F 73D6

* remotes/gkurz/tags/9p-next-2019-10-10:
  MAINTAINERS: Downgrade status of virtio-9p to "Odd Fixes"
  9p: Use variable length suffixes for inode remapping
  9p: stat_to_qid: implement slow path
  9p: Added virtfs option 'multidevs=remap|forbid|warn'
  9p: Treat multiple devices on one export as an error
  fsdev: Add return value to fsdev_throttle_parse_opts()
  9p: Simplify error path of v9fs_device_realize_common()
  9p: unsigned type for type, version, path

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2019-10-14 13:34:39 +01:00
commit c8b2bc5185
14 changed files with 639 additions and 61 deletions

View file

@ -1517,7 +1517,7 @@ F: tests/virtio-balloon-test.c
virtio-9p
M: Greg Kurz <groug@kaod.org>
S: Supported
S: Odd Fixes
F: hw/9pfs/
X: hw/9pfs/xen-9p*
F: fsdev/

View file

@ -9,9 +9,9 @@ typedef struct V9fsString
typedef struct V9fsQID
{
int8_t type;
int32_t version;
int64_t path;
uint8_t type;
uint32_t version;
uint64_t path;
} V9fsQID;
typedef struct V9fsStat

View file

@ -59,6 +59,11 @@ typedef struct ExtendedOps {
#define V9FS_RDONLY 0x00000040
#define V9FS_PROXY_SOCK_FD 0x00000080
#define V9FS_PROXY_SOCK_NAME 0x00000100
/*
* multidevs option (either one of the two applies exclusively)
*/
#define V9FS_REMAP_INODES 0x00000200
#define V9FS_FORBID_MULTIDEVS 0x00000400
#define V9FS_SEC_MASK 0x0000003C

View file

@ -31,7 +31,9 @@ static QemuOptsList qemu_fsdev_opts = {
}, {
.name = "readonly",
.type = QEMU_OPT_BOOL,
}, {
.name = "multidevs",
.type = QEMU_OPT_STRING,
}, {
.name = "socket",
.type = QEMU_OPT_STRING,
@ -75,6 +77,9 @@ static QemuOptsList qemu_virtfs_opts = {
}, {
.name = "readonly",
.type = QEMU_OPT_BOOL,
}, {
.name = "multidevs",
.type = QEMU_OPT_STRING,
}, {
.name = "socket",
.type = QEMU_OPT_STRING,

View file

@ -31,7 +31,7 @@ static void fsdev_throttle_write_timer_cb(void *opaque)
qemu_co_enter_next(&fst->throttled_reqs[true], NULL);
}
void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp)
int fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp)
{
throttle_config_init(&fst->cfg);
fst->cfg.buckets[THROTTLE_BPS_TOTAL].avg =
@ -75,7 +75,7 @@ void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp)
fst->cfg.op_size =
qemu_opt_get_number(opts, "throttling.iops-size", 0);
throttle_is_valid(&fst->cfg, errp);
return throttle_is_valid(&fst->cfg, errp) ? 0 : -1;
}
void fsdev_throttle_init(FsThrottle *fst)

View file

@ -26,7 +26,7 @@ typedef struct FsThrottle {
CoQueue throttled_reqs[2];
} FsThrottle;
void fsdev_throttle_parse_opts(QemuOpts *, FsThrottle *, Error **);
int fsdev_throttle_parse_opts(QemuOpts *, FsThrottle *, Error **);
void fsdev_throttle_init(FsThrottle *);

View file

@ -58,6 +58,7 @@ static FsDriverTable FsDrivers[] = {
"writeout",
"fmode",
"dmode",
"multidevs",
"throttling.bps-total",
"throttling.bps-read",
"throttling.bps-write",

View file

@ -1465,6 +1465,10 @@ static void local_cleanup(FsContext *ctx)
{
LocalData *data = ctx->private;
if (!data) {
return;
}
close(data->mountfd);
g_free(data);
}
@ -1479,6 +1483,7 @@ static int local_parse_opts(QemuOpts *opts, FsDriverEntry *fse, Error **errp)
{
const char *sec_model = qemu_opt_get(opts, "security_model");
const char *path = qemu_opt_get(opts, "path");
const char *multidevs = qemu_opt_get(opts, "multidevs");
Error *local_err = NULL;
if (!sec_model) {
@ -1502,13 +1507,32 @@ static int local_parse_opts(QemuOpts *opts, FsDriverEntry *fse, Error **errp)
return -1;
}
if (multidevs) {
if (!strcmp(multidevs, "remap")) {
fse->export_flags &= ~V9FS_FORBID_MULTIDEVS;
fse->export_flags |= V9FS_REMAP_INODES;
} else if (!strcmp(multidevs, "forbid")) {
fse->export_flags &= ~V9FS_REMAP_INODES;
fse->export_flags |= V9FS_FORBID_MULTIDEVS;
} else if (!strcmp(multidevs, "warn")) {
fse->export_flags &= ~V9FS_FORBID_MULTIDEVS;
fse->export_flags &= ~V9FS_REMAP_INODES;
} else {
error_setg(&local_err, "invalid multidevs property '%s'",
multidevs);
error_append_hint(&local_err, "Valid options are: multidevs="
"[remap|forbid|warn]\n");
error_propagate(errp, local_err);
return -1;
}
}
if (!path) {
error_setg(errp, "path property not set");
return -1;
}
fsdev_throttle_parse_opts(opts, &fse->fst, &local_err);
if (local_err) {
if (fsdev_throttle_parse_opts(opts, &fse->fst, &local_err)) {
error_propagate_prepend(errp, local_err,
"invalid throttle configuration: ");
return -1;

View file

@ -1185,6 +1185,10 @@ static void proxy_cleanup(FsContext *ctx)
{
V9fsProxy *proxy = ctx->private;
if (!proxy) {
return;
}
g_free(proxy->out_iovec.iov_base);
g_free(proxy->in_iovec.iov_base);
if (ctx->export_flags & V9FS_PROXY_SOCK_NAME) {

View file

@ -26,6 +26,8 @@
#include "trace.h"
#include "migration/blocker.h"
#include "sysemu/qtest.h"
#include "qemu/xxhash.h"
#include <math.h>
int open_fd_hw;
int total_open_fd;
@ -572,14 +574,374 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu)
P9_STAT_MODE_NAMED_PIPE | \
P9_STAT_MODE_SOCKET)
/* This is the algorithm from ufs in spfs */
static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101. */
static inline uint8_t mirror8bit(uint8_t byte)
{
return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023;
}
/* Same as mirror8bit() just for a 64 bit data type instead for a byte. */
static inline uint64_t mirror64bit(uint64_t value)
{
return ((uint64_t)mirror8bit(value & 0xff) << 56) |
((uint64_t)mirror8bit((value >> 8) & 0xff) << 48) |
((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) |
((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) |
((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) |
((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) |
((uint64_t)mirror8bit((value >> 48) & 0xff) << 8) |
((uint64_t)mirror8bit((value >> 56) & 0xff));
}
/**
 * @brief Parameter k for the Exponential Golomb algorithm to be used.
*
* The smaller this value, the smaller the minimum bit count for the Exp.
* Golomb generated affixes will be (at lowest index) however for the
* price of having higher maximum bit count of generated affixes (at highest
 * index). Likewise increasing this parameter yields a smaller maximum bit
* count for the price of having higher minimum bit count.
*
* In practice that means: a good value for k depends on the expected amount
* of devices to be exposed by one export. For a small amount of devices k
* should be small, for a large amount of devices k might be increased
* instead. The default of k=0 should be fine for most users though.
*
* @b IMPORTANT: In case this ever becomes a runtime parameter; the value of
* k should not change as long as guest is still running! Because that would
* cause completely different inode numbers to be generated on guest.
*/
#define EXP_GOLOMB_K 0
/**
* @brief Exponential Golomb algorithm for arbitrary k (including k=0).
*
* The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!)
* with growing length and with the mathematical property of being
* "prefix-free". The latter means the generated prefixes can be prepended
* in front of arbitrary numbers and the resulting concatenated numbers are
* guaranteed to be always unique.
*
* This is a minor adjustment to the original Exp. Golomb algorithm in the
* sense that lowest allowed index (@param n) starts with 1, not with zero.
*
* @param n - natural number (or index) of the prefix to be generated
* (1, 2, 3, ...)
* @param k - parameter k of Exp. Golomb algorithm to be used
* (see comment on EXP_GOLOMB_K macro for details about k)
*/
static VariLenAffix expGolombEncode(uint64_t n, int k)
{
const uint64_t value = n + (1 << k) - 1;
const int bits = (int) log2(value) + 1;
return (VariLenAffix) {
.type = AffixType_Prefix,
.value = value,
.bits = bits + MAX((bits - 1 - k), 0)
};
}
/**
* @brief Converts a suffix into a prefix, or a prefix into a suffix.
*
* Simply mirror all bits of the affix value, for the purpose to preserve
* respectively the mathematical "prefix-free" or "suffix-free" property
* after the conversion.
*
* If a passed prefix is suitable to create unique numbers, then the
* returned suffix is suitable to create unique numbers as well (and vice
* versa).
*/
static VariLenAffix invertAffix(const VariLenAffix *affix)
{
return (VariLenAffix) {
.type =
(affix->type == AffixType_Suffix) ?
AffixType_Prefix : AffixType_Suffix,
.value =
mirror64bit(affix->value) >>
((sizeof(affix->value) * 8) - affix->bits),
.bits = affix->bits
};
}
/**
* @brief Generates suffix numbers with "suffix-free" property.
*
* This is just a wrapper function on top of the Exp. Golomb algorithm.
*
* Since the Exp. Golomb algorithm generates prefixes, but we need suffixes,
* this function converts the Exp. Golomb prefixes into appropriate suffixes
* which are still suitable for generating unique numbers.
*
 * @param index - natural number (i.e. index) of the suffix to be generated
 *                (1, 2, 3, ...)
*/
static VariLenAffix affixForIndex(uint64_t index)
{
VariLenAffix prefix;
prefix = expGolombEncode(index, EXP_GOLOMB_K);
return invertAffix(&prefix); /* convert prefix to suffix */
}
/* creative abuse of tb_hash_func7, which is based on xxhash */
static uint32_t qpp_hash(QppEntry e)
{
return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0);
}
static uint32_t qpf_hash(QpfEntry e)
{
return qemu_xxhash7(e.ino, e.dev, 0, 0, 0);
}
static bool qpd_cmp_func(const void *obj, const void *userp)
{
const QpdEntry *e1 = obj, *e2 = userp;
return e1->dev == e2->dev;
}
static bool qpp_cmp_func(const void *obj, const void *userp)
{
const QppEntry *e1 = obj, *e2 = userp;
return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix;
}
static bool qpf_cmp_func(const void *obj, const void *userp)
{
const QpfEntry *e1 = obj, *e2 = userp;
return e1->dev == e2->dev && e1->ino == e2->ino;
}
static void qp_table_remove(void *p, uint32_t h, void *up)
{
g_free(p);
}
static void qp_table_destroy(struct qht *ht)
{
if (!ht || !ht->map) {
return;
}
qht_iter(ht, qp_table_remove, NULL);
qht_destroy(ht);
}
static void qpd_table_init(struct qht *ht)
{
qht_init(ht, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
}
static void qpp_table_init(struct qht *ht)
{
qht_init(ht, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE);
}
static void qpf_table_init(struct qht *ht)
{
qht_init(ht, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE);
}
/*
* Returns how many (high end) bits of inode numbers of the passed fs
* device shall be used (in combination with the device number) to
* generate hash values for qpp_table entries.
*
* This function is required if variable length suffixes are used for inode
* number mapping on guest level. Since a device may end up having multiple
* entries in qpp_table, each entry most probably with a different suffix
* length, we thus need this function in conjunction with qpd_table to
 * "agree" about a fixed amount of bits (per device) to be always used for
* generating hash values for the purpose of accessing qpp_table in order
* get consistent behaviour when accessing qpp_table.
*/
static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev)
{
QpdEntry lookup = {
.dev = dev
}, *val;
uint32_t hash = dev;
VariLenAffix affix;
val = qht_lookup(&pdu->s->qpd_table, &lookup, hash);
if (!val) {
val = g_malloc0(sizeof(QpdEntry));
*val = lookup;
affix = affixForIndex(pdu->s->qp_affix_next);
val->prefix_bits = affix.bits;
qht_insert(&pdu->s->qpd_table, val, hash, NULL);
pdu->s->qp_ndevices++;
}
return val->prefix_bits;
}
/**
* @brief Slow / full mapping host inode nr -> guest inode nr.
*
* This function performs a slower and much more costly remapping of an
* original file inode number on host to an appropriate different inode
* number on guest. For every (dev, inode) combination on host a new
* sequential number is generated, cached and exposed as inode number on
* guest.
*
* This is just a "last resort" fallback solution if the much faster/cheaper
* qid_path_suffixmap() failed. In practice this slow / full mapping is not
* expected ever to be used at all though.
*
* @see qid_path_suffixmap() for details
*
*/
static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf,
uint64_t *path)
{
QpfEntry lookup = {
.dev = stbuf->st_dev,
.ino = stbuf->st_ino
}, *val;
uint32_t hash = qpf_hash(lookup);
VariLenAffix affix;
val = qht_lookup(&pdu->s->qpf_table, &lookup, hash);
if (!val) {
if (pdu->s->qp_fullpath_next == 0) {
/* no more files can be mapped :'( */
error_report_once(
"9p: No more prefixes available for remapping inodes from "
"host to guest."
);
return -ENFILE;
}
val = g_malloc0(sizeof(QppEntry));
*val = lookup;
/* new unique inode and device combo */
affix = affixForIndex(
1ULL << (sizeof(pdu->s->qp_affix_next) * 8)
);
val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value;
pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1);
qht_insert(&pdu->s->qpf_table, val, hash, NULL);
}
*path = val->path;
return 0;
}
/**
* @brief Quick mapping host inode nr -> guest inode nr.
*
* This function performs quick remapping of an original file inode number
* on host to an appropriate different inode number on guest. This remapping
* of inodes is required to avoid inode nr collisions on guest which would
* happen if the 9p export contains more than 1 exported file system (or
* more than 1 file system data set), because unlike on host level where the
* files would have different device nrs, all files exported by 9p would
* share the same device nr on guest (the device nr of the virtual 9p device
* that is).
*
* Inode remapping is performed by chopping off high end bits of the original
* inode number from host, shifting the result upwards and then assigning a
* generated suffix number for the low end bits, where the same suffix number
* will be shared by all inodes with the same device id AND the same high end
* bits that have been chopped off. That approach utilizes the fact that inode
* numbers very likely share the same high end bits (i.e. due to their common
* sequential generation by file systems) and hence we only have to generate
* and track a very limited amount of suffixes in practice due to that.
*
* We generate variable size suffixes for that purpose. The 1st generated
* suffix will only have 1 bit and hence we only need to chop off 1 bit from
* the original inode number. The subsequent suffixes being generated will
* grow in (bit) size subsequently, i.e. the 2nd and 3rd suffix being
* generated will have 3 bits and hence we have to chop off 3 bits from their
* original inodes, and so on. That approach of using variable length suffixes
* (i.e. over fixed size ones) utilizes the fact that in practice only a very
* limited amount of devices are shared by the same export (e.g. typically
* less than 2 dozen devices per 9p export), so in practice we need to chop
* off less bits than with fixed size prefixes and yet are flexible to add
* new devices at runtime below host's export directory at any time without
* having to reboot guest nor requiring to reconfigure guest for that. And due
* to the very limited amount of original high end bits that we chop off that
* way, the total amount of suffixes we need to generate is less than by using
* fixed size prefixes and hence it also improves performance of the inode
* remapping algorithm, and finally has the nice side effect that the inode
* numbers on guest will be much smaller & human friendly. ;-)
*/
static int qid_path_suffixmap(V9fsPDU *pdu, const struct stat *stbuf,
uint64_t *path)
{
const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev);
QppEntry lookup = {
.dev = stbuf->st_dev,
.ino_prefix = (uint16_t) (stbuf->st_ino >> (64 - ino_hash_bits))
}, *val;
uint32_t hash = qpp_hash(lookup);
val = qht_lookup(&pdu->s->qpp_table, &lookup, hash);
if (!val) {
if (pdu->s->qp_affix_next == 0) {
/* we ran out of affixes */
warn_report_once(
"9p: Potential degraded performance of inode remapping"
);
return -ENFILE;
}
val = g_malloc0(sizeof(QppEntry));
*val = lookup;
/* new unique inode affix and device combo */
val->qp_affix_index = pdu->s->qp_affix_next++;
val->qp_affix = affixForIndex(val->qp_affix_index);
qht_insert(&pdu->s->qpp_table, val, hash, NULL);
}
/* assuming generated affix to be suffix type, not prefix */
*path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value;
return 0;
}
static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp)
{
int err;
size_t size;
memset(&qidp->path, 0, sizeof(qidp->path));
size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
memcpy(&qidp->path, &stbuf->st_ino, size);
if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
/* map inode+device to qid path (fast path) */
err = qid_path_suffixmap(pdu, stbuf, &qidp->path);
if (err == -ENFILE) {
/* fast path didn't work, fall back to full map */
err = qid_path_fullmap(pdu, stbuf, &qidp->path);
}
if (err) {
return err;
}
} else {
if (pdu->s->dev_id != stbuf->st_dev) {
if (pdu->s->ctx.export_flags & V9FS_FORBID_MULTIDEVS) {
error_report_once(
"9p: Multiple devices detected in same VirtFS export. "
"Access of guest to additional devices is (partly) "
"denied due to virtfs option 'multidevs=forbid' being "
"effective."
);
return -ENODEV;
} else {
warn_report_once(
"9p: Multiple devices detected in same VirtFS export, "
"which might lead to file ID collisions and severe "
"misbehaviours on guest! You should either use a "
"separate export for each device shared from host or "
"use virtfs option 'multidevs=remap'!"
);
}
}
memset(&qidp->path, 0, sizeof(qidp->path));
size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path));
memcpy(&qidp->path, &stbuf->st_ino, size);
}
qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8);
qidp->type = 0;
if (S_ISDIR(stbuf->st_mode)) {
@ -588,6 +950,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp)
if (S_ISLNK(stbuf->st_mode)) {
qidp->type |= P9_QID_TYPE_SYMLINK;
}
return 0;
}
static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
@ -600,10 +964,37 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
if (err < 0) {
return err;
}
stat_to_qid(&stbuf, qidp);
err = stat_to_qid(pdu, &stbuf, qidp);
if (err < 0) {
return err;
}
return 0;
}
static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp,
struct dirent *dent, V9fsQID *qidp)
{
struct stat stbuf;
V9fsPath path;
int err;
v9fs_path_init(&path);
err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path);
if (err < 0) {
goto out;
}
err = v9fs_co_lstat(pdu, &path, &stbuf);
if (err < 0) {
goto out;
}
err = stat_to_qid(pdu, &stbuf, qidp);
out:
v9fs_path_free(&path);
return err;
}
V9fsPDU *pdu_alloc(V9fsState *s)
{
V9fsPDU *pdu = NULL;
@ -744,9 +1135,9 @@ static int donttouch_stat(V9fsStat *stat)
{
if (stat->type == -1 &&
stat->dev == -1 &&
stat->qid.type == -1 &&
stat->qid.version == -1 &&
stat->qid.path == -1 &&
stat->qid.type == 0xff &&
stat->qid.version == (uint32_t) -1 &&
stat->qid.path == (uint64_t) -1 &&
stat->mode == -1 &&
stat->atime == -1 &&
stat->mtime == -1 &&
@ -831,7 +1222,10 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
memset(v9stat, 0, sizeof(*v9stat));
stat_to_qid(stbuf, &v9stat->qid);
err = stat_to_qid(pdu, stbuf, &v9stat->qid);
if (err < 0) {
return err;
}
v9stat->mode = stat_to_v9mode(stbuf);
v9stat->atime = stbuf->st_atime;
v9stat->mtime = stbuf->st_mtime;
@ -892,7 +1286,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path,
#define P9_STATS_ALL 0x00003fffULL /* Mask for All fields above */
static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf,
V9fsStatDotl *v9lstat)
{
memset(v9lstat, 0, sizeof(*v9lstat));
@ -914,7 +1308,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf,
/* Currently we only support BASIC fields in stat */
v9lstat->st_result_mask = P9_STATS_BASIC;
stat_to_qid(stbuf, &v9lstat->qid);
return stat_to_qid(pdu, stbuf, &v9lstat->qid);
}
static void print_sg(struct iovec *sg, int cnt)
@ -1116,7 +1510,6 @@ static void coroutine_fn v9fs_getattr(void *opaque)
uint64_t request_mask;
V9fsStatDotl v9stat_dotl;
V9fsPDU *pdu = opaque;
V9fsState *s = pdu->s;
retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask);
if (retval < 0) {
@ -1137,7 +1530,10 @@ static void coroutine_fn v9fs_getattr(void *opaque)
if (retval < 0) {
goto out;
}
stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl);
retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl);
if (retval < 0) {
goto out;
}
/* fill st_gen if requested and supported by underlying fs */
if (request_mask & P9_STATS_GEN) {
@ -1382,7 +1778,10 @@ static void coroutine_fn v9fs_walk(void *opaque)
if (err < 0) {
goto out;
}
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
v9fs_path_copy(&dpath, &path);
}
memcpy(&qids[name_idx], &qid, sizeof(qid));
@ -1484,7 +1883,10 @@ static void coroutine_fn v9fs_open(void *opaque)
if (err < 0) {
goto out;
}
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
if (S_ISDIR(stbuf.st_mode)) {
err = v9fs_co_opendir(pdu, fidp);
if (err < 0) {
@ -1594,7 +1996,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque)
fidp->flags |= FID_NON_RECLAIMABLE;
}
iounit = get_iounit(pdu, &fidp->path);
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
if (err < 0) {
goto out;
@ -1938,16 +2343,39 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp,
v9fs_string_free(&name);
return count;
}
/*
* Fill up just the path field of qid because the client uses
* only that. To fill the entire qid structure we will have
* to stat each dirent found, which is expensive
*/
size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
memcpy(&qid.path, &dent->d_ino, size);
/* Fill the other fields with dummy values */
qid.type = 0;
qid.version = 0;
if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) {
/*
* dirent_to_qid() implies expensive stat call for each entry,
* we must do that here though since inode remapping requires
* the device id, which in turn might be different for
* different entries; we cannot make any assumption to avoid
* that here.
*/
err = dirent_to_qid(pdu, fidp, dent, &qid);
if (err < 0) {
v9fs_readdir_unlock(&fidp->fs.dir);
v9fs_co_seekdir(pdu, fidp, saved_dir_pos);
v9fs_string_free(&name);
return err;
}
} else {
/*
* Fill up just the path field of qid because the client uses
* only that. To fill the entire qid structure we will have
* to stat each dirent found, which is expensive. For the
* latter reason we don't call dirent_to_qid() here. Only drawback
* is that no multi-device export detection of stat_to_qid()
* would be done and provided as error to the user here. But
* user would get that error anyway when accessing those
* files/dirs through other ways.
*/
size = MIN(sizeof(dent->d_ino), sizeof(qid.path));
memcpy(&qid.path, &dent->d_ino, size);
/* Fill the other fields with dummy values */
qid.type = 0;
qid.version = 0;
}
/* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */
len = pdu_marshal(pdu, 11 + count, "Qqbs",
@ -2328,7 +2756,10 @@ static void coroutine_fn v9fs_create(void *opaque)
}
}
iounit = get_iounit(pdu, &fidp->path);
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
err = pdu_marshal(pdu, offset, "Qd", &qid, iounit);
if (err < 0) {
goto out;
@ -2385,7 +2816,10 @@ static void coroutine_fn v9fs_symlink(void *opaque)
if (err < 0) {
goto out;
}
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
err = pdu_marshal(pdu, offset, "Q", &qid);
if (err < 0) {
goto out;
@ -3065,7 +3499,10 @@ static void coroutine_fn v9fs_mknod(void *opaque)
if (err < 0) {
goto out;
}
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
err = pdu_marshal(pdu, offset, "Q", &qid);
if (err < 0) {
goto out;
@ -3223,7 +3660,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque)
if (err < 0) {
goto out;
}
stat_to_qid(&stbuf, &qid);
err = stat_to_qid(pdu, &stbuf, &qid);
if (err < 0) {
goto out;
}
err = pdu_marshal(pdu, offset, "Q", &qid);
if (err < 0) {
goto out;
@ -3634,31 +4074,43 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t,
goto out;
}
s->dev_id = stat.st_dev;
/* init inode remapping : */
/* hash table for variable length inode suffixes */
qpd_table_init(&s->qpd_table);
/* hash table for slow/full inode remapping (most users won't need it) */
qpf_table_init(&s->qpf_table);
/* hash table for quick inode remapping */
qpp_table_init(&s->qpp_table);
s->qp_ndevices = 0;
s->qp_affix_next = 1; /* reserve 0 to detect overflow */
s->qp_fullpath_next = 1;
s->ctx.fst = &fse->fst;
fsdev_throttle_init(s->ctx.fst);
v9fs_path_free(&path);
rc = 0;
out:
if (rc) {
if (s->ops && s->ops->cleanup && s->ctx.private) {
s->ops->cleanup(&s->ctx);
}
g_free(s->tag);
g_free(s->ctx.fs_root);
v9fs_path_free(&path);
v9fs_device_unrealize_common(s, NULL);
}
v9fs_path_free(&path);
return rc;
}
void v9fs_device_unrealize_common(V9fsState *s, Error **errp)
{
if (s->ops->cleanup) {
if (s->ops && s->ops->cleanup) {
s->ops->cleanup(&s->ctx);
}
fsdev_throttle_cleanup(s->ctx.fst);
if (s->ctx.fst) {
fsdev_throttle_cleanup(s->ctx.fst);
}
g_free(s->tag);
qp_table_destroy(&s->qpd_table);
qp_table_destroy(&s->qpp_table);
qp_table_destroy(&s->qpf_table);
g_free(s->ctx.fs_root);
}

View file

@ -8,6 +8,7 @@
#include "fsdev/9p-iov-marshal.h"
#include "qemu/thread.h"
#include "qemu/coroutine.h"
#include "qemu/qht.h"
enum {
P9_TLERROR = 6,
@ -235,6 +236,58 @@ struct V9fsFidState
V9fsFidState *rclm_lst;
};
typedef enum AffixType_t {
AffixType_Prefix,
AffixType_Suffix, /* A.k.a. postfix. */
} AffixType_t;
/**
* @brief Unique affix of variable length.
*
* An affix is (currently) either a suffix or a prefix, which is either
* going to be prepended (prefix) or appended (suffix) with some other
* number for the goal to generate unique numbers. Accordingly the
* suffixes (or prefixes) we generate @b must all have the mathematical
* property of being suffix-free (or prefix-free in case of prefixes)
* so that no matter what number we concatenate the affix with, that we
* always reliably get unique numbers as result after concatenation.
*/
typedef struct VariLenAffix {
AffixType_t type; /* Whether this affix is a suffix or a prefix. */
uint64_t value; /* Actual numerical value of this affix. */
/*
 * Length of the affix, that is how many (of the lowest) bits of @c value
* must be used for appending/prepending this affix to its final resulting,
* unique number.
*/
int bits;
} VariLenAffix;
/* See qid_inode_prefix_hash_bits(). */
typedef struct {
dev_t dev; /* FS device on host. */
/*
* How many (high) bits of the original inode number shall be used for
* hashing.
*/
int prefix_bits;
} QpdEntry;
/* QID path prefix entry, see stat_to_qid */
typedef struct {
dev_t dev;
uint16_t ino_prefix;
uint32_t qp_affix_index;
VariLenAffix qp_affix;
} QppEntry;
/* QID path full entry, as above */
typedef struct {
dev_t dev;
ino_t ino;
uint64_t path;
} QpfEntry;
struct V9fsState
{
QLIST_HEAD(, V9fsPDU) free_list;
@ -256,6 +309,13 @@ struct V9fsState
Error *migration_blocker;
V9fsConf fsconf;
V9fsQID root_qid;
dev_t dev_id;
struct qht qpd_table;
struct qht qpp_table;
struct qht qpf_table;
uint64_t qp_ndevices; /* Amount of entries in qpd_table. */
uint16_t qp_affix_next;
uint64_t qp_fullpath_next;
};
/* 9p2000.L open flags */

View file

@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d"
v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s"
v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s"
v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64
v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u type %u version %u path %"PRIu64
v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}"
v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64
@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod
v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d"
v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p"
v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d"
v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d"
v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u"
v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d"
v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d"
v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u"
@@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t
v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d"
v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd"
v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d"
v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d"
v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d"
v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid, char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u"
v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
v9fs_symlink_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u qid={type %u version %u path %"PRIu64"}"
v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d"
v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s"
v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d"
v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}"
v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d"
v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}"
v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u qid={type %u version %u path %"PRIu64"}"
v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d"
v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64
v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u"
v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u"
v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d"
v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %u version %u path %"PRIu64"} err %d"
v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s"
v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64
v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d"

View file

@@ -1339,7 +1339,7 @@ ETEXI
DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
"-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n"
" [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n"
" [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,multidevs=remap|forbid|warn]\n"
"-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n"
"-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n"
"-virtfs synth,mount_tag=tag[,id=id][,readonly]\n",
@@ -1347,7 +1347,7 @@ DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs,
STEXI
@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}]
@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,multidevs=@var{multidevs}]
@itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
@itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly]
@itemx -virtfs synth,mount_tag=@var{mount_tag}
@@ -1403,6 +1403,28 @@ Specifies the default mode for newly created directories on the host. Works
only with security models "mapped-xattr" and "mapped-file".
@item mount_tag=@var{mount_tag}
Specifies the tag name to be used by the guest to mount this export point.
@item multidevs=@var{multidevs}
Specifies how to deal with multiple devices being shared with a 9p export.
Supported behaviours are "remap", "forbid" and "warn". The latter is the
default behaviour, in which virtfs 9p expects only one device to be shared
per export; if more than one device is shared and accessed via the same 9p
export, then only a warning message is logged (once) by qemu on the host
side. In order to avoid file ID collisions on the guest, you should either
create a separate virtfs export for each device to be shared with guests
(the recommended way), or use "remap", which allows you to share multiple
devices through a single export; this is achieved by remapping the original
inode numbers from host to guest in a way that prevents such collisions.
Remapping inodes in such use cases
is required because the original device IDs from the host are never passed
and exposed on the guest. Instead, all files of an export shared with
virtfs always share the same device ID on the guest, so two files with
identical inode numbers but from actually different devices on the host
would otherwise cause a file ID collision and hence potential misbehaviour
on the guest. "forbid", on the other hand, assumes like "warn" that only
one device is shared per export; however, it will not only log a warning
message but also deny access to additional devices on the guest. Note,
though, that "forbid" currently does not block all possible file access
operations (e.g. readdir() would still return entries from other devices).
@end table
ETEXI

7
vl.c
View file

@@ -3335,7 +3335,7 @@ int main(int argc, char **argv, char **envp)
case QEMU_OPTION_virtfs: {
QemuOpts *fsdev;
QemuOpts *device;
const char *writeout, *sock_fd, *socket, *path, *security_model;
const char *writeout, *sock_fd, *socket, *path, *security_model,
*multidevs;
olist = qemu_find_opts("virtfs");
if (!olist) {
@ -3395,6 +3396,10 @@ int main(int argc, char **argv, char **envp)
qemu_opt_set_bool(fsdev, "readonly",
qemu_opt_get_bool(opts, "readonly", 0),
&error_abort);
multidevs = qemu_opt_get(opts, "multidevs");
if (multidevs) {
qemu_opt_set(fsdev, "multidevs", multidevs, &error_abort);
}
device = qemu_opts_create(qemu_find_opts("device"), NULL, 0,
&error_abort);
qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);