From 40a892b78c9a620bc5ace4164dd566764c35cb53 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 19 Feb 2011 22:18:11 +0100 Subject: [PATCH 1/9] block/vdi: Don't ignore immediate read/write failures This patch is similar to 171e3d6b9997c98a97d0c525867f7cd9b640cadd which fixed qcow2: Returning -EIO is far from optimal, but at least it's an error code. Cc: Kevin Wolf Signed-off-by: Stefan Weil Signed-off-by: Kevin Wolf --- block/vdi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/vdi.c b/block/vdi.c index 116b25bc9b..90540792d3 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -610,6 +610,7 @@ static void vdi_aio_read_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_readv(bs->file, offset, &acb->hd_qiov, n_sectors, vdi_aio_read_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } @@ -673,6 +674,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, 0, &acb->hd_qiov, 1, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } return; @@ -702,6 +704,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov, n_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } return; @@ -752,6 +755,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) &acb->hd_qiov, s->block_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } else { @@ -764,6 +768,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov, n_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } From 5614c188c65a8194ae499cb16400cab690a45299 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 19 Feb 2011 22:18:12 +0100 Subject: [PATCH 2/9] block/qcow: Don't ignore immediate read/write and other failures This patch is similar to 171e3d6b9997c98a97d0c525867f7cd9b640cadd which fixed qcow2: Returning -EIO is far from optimal, but at least it's an error code. In addition to read/write failures, -EIO is also returned when decompress_cluster failed. Cc: Kevin Wolf Signed-off-by: Stefan Weil Signed-off-by: Kevin Wolf --- block/qcow.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index f67d3d39f2..a26c88620f 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -589,8 +589,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); @@ -598,8 +600,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ - if (decompress_cluster(bs, acb->cluster_offset) < 0) + if (decompress_cluster(bs, acb->cluster_offset) < 0) { + ret = -EIO; goto done; + } memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); goto redo; @@ -614,8 +618,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } } return; @@ -700,8 +706,10 @@ static void qcow_aio_write_cb(void *opaque, int ret) (cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } return; done: From e11480db7ff15a9e878f6b3cc1199b439bf7c825 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 1 Mar 2011 10:48:12 +0100 Subject: [PATCH 3/9] Add error message for loading snapshot without VM state It already fails, but it didn't tell the user why. Signed-off-by: Kevin Wolf Reviewed-by: Juan Quintela --- savevm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/savevm.c b/savevm.c index 60d2f2a547..d1b9b4a413 100644 --- a/savevm.c +++ b/savevm.c @@ -2021,6 +2021,8 @@ int load_vmstate(const char *name) if (ret < 0) { return ret; } else if (sn.vm_state_size == 0) { + error_report("This is a disk-only snapshot. Revert to it offline " + "using qemu-img."); return -EINVAL; } From 4e59b545868a5ee5f59b346337f0c44209929334 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 22 Feb 2011 18:42:31 +0100 Subject: [PATCH 4/9] tools: Use real async.c instead of stubs It's wrong to call BHs directly, even in tools. The only operations that schedule BHs are called in a loop that (indirectly) contains a call to qemu_bh_poll anyway, so we're not losing the scheduled BHs: Tools either use synchronous functions, which are guaranteed to have completed (including any BHs) when they return; or if they use asynchronous functions, they need to call qemu_aio_wait() or similar functions already today. Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- Makefile.objs | 4 ++-- qemu-tool.c | 47 ++++------------------------------------------- 2 files changed, 6 insertions(+), 45 deletions(-) diff --git a/Makefile.objs b/Makefile.objs index a52f42fb72..167ccc2c6c 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -13,7 +13,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o ####################################################################### # block-obj-y is code used by both qemu system emulation and qemu-img -block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o +block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o block-obj-$(CONFIG_POSIX) += posix-aio-compat.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o @@ -63,7 +63,7 @@ common-obj-y = $(block-obj-y) blockdev.o common-obj-y += $(net-obj-y) common-obj-y += $(qobject-obj-y) common-obj-$(CONFIG_LINUX) += $(fsdev-obj-$(CONFIG_LINUX)) -common-obj-y += readline.o console.o cursor.o async.o qemu-error.o +common-obj-y += readline.o console.o cursor.o qemu-error.o common-obj-y += $(oslib-obj-y) common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o diff --git a/qemu-tool.c b/qemu-tool.c index 392e1c9505..d45840de28 100644 --- a/qemu-tool.c +++ b/qemu-tool.c @@ -56,53 +56,10 @@ void monitor_print_filename(Monitor *mon, const char *filename) { } -void async_context_push(void) -{ -} - -void async_context_pop(void) -{ -} - -int get_async_context_id(void) -{ - return 0; -} - void monitor_protocol_event(MonitorEvent event, QObject *data) { } -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) -{ - QEMUBH *bh; - - bh = qemu_malloc(sizeof(*bh)); - bh->cb = cb; - bh->opaque = opaque; - - return bh; -} - -int qemu_bh_poll(void) -{ - return 0; -} - -void qemu_bh_schedule(QEMUBH *bh) -{ - bh->cb(bh->opaque); -} - -void qemu_bh_cancel(QEMUBH *bh) -{ -} - -void qemu_bh_delete(QEMUBH *bh) -{ - qemu_free(bh); -} - int qemu_set_fd_handler2(int fd, IOCanReadHandler *fd_read_poll, IOHandler *fd_read, @@ -111,3 +68,7 @@ int qemu_set_fd_handler2(int fd, { return 0; } + +void qemu_notify_event(void) +{ +} From 301db7c2dd769d48e97c9a766520f8affff76cd7 Mon Sep 17 00:00:00 2001 From: Ryan Harper Date: Mon, 7 Mar 2011 10:01:04 -0600 Subject: [PATCH 5/9] Don't allow multiwrites against a block device without underlying medium If the block device has been closed, we no longer have a medium to submit IO against, check for this before submitting io. This prevents a segfault further in the code where we dereference elements of the block driver. Signed-off-by: Ryan Harper Reviewed-by: Stefan Hajnoczi Signed-off-by: Kevin Wolf --- block.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block.c b/block.c index 0559d835a2..c8e2f97614 100644 --- a/block.c +++ b/block.c @@ -2398,6 +2398,14 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) MultiwriteCB *mcb; int i; + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + if (num_reqs == 0) { return 0; } From b93af93d2bbacf034af1888c96291850aa849b59 Mon Sep 17 00:00:00 2001 From: Brian Wheeler Date: Tue, 1 Mar 2011 08:30:23 -0500 Subject: [PATCH 6/9] Fix ATA SMART and CHECK POWER MODE This patch fixes two things: 1) CHECK POWER MODE The error return value wasn't always zero, so it would show up as offline. Error is now explicitly set to zero. 2) SMART The smart values that were returned were invalid and tools like skdump would not recognize that the smart data was actually valid and would dump weird output. The data has been fixed up and raw value support was added. Tools like skdump and palimpsest work as expected. Signed-off-by: Brian Wheeler Acked-by: Ryan Harper Signed-off-by: Kevin Wolf --- hw/ide/core.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index 9c91a49767..1ffca56887 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -34,13 +34,26 @@ #include -static const int smart_attributes[][5] = { - /* id, flags, val, wrst, thrsh */ - { 0x01, 0x03, 0x64, 0x64, 0x06}, /* raw read */ - { 0x03, 0x03, 0x64, 0x64, 0x46}, /* spin up */ - { 0x04, 0x02, 0x64, 0x64, 0x14}, /* start stop count */ - { 0x05, 0x03, 0x64, 0x64, 0x36}, /* remapped sectors */ - { 0x00, 0x00, 0x00, 0x00, 0x00} +/* These values were based on a Seagate ST3500418AS but have been modified + to make more sense in QEMU */ +static const int smart_attributes[][12] = { + /* id, flags, hflags, val, wrst, raw (6 bytes), threshold */ + /* raw read error rate*/ + { 0x01, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06}, + /* spin up */ + { 0x03, 0x03, 0x00, 0x64, 0x64, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* start stop count */ + { 0x04, 0x02, 0x00, 0x64, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14}, + /* remapped sectors */ + { 0x05, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24}, + /* power on hours */ + { 0x09, 0x03, 0x00, 0x64, 0x64, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* power cycle count */ + { 0x0c, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* airflow-temperature-celsius */ + { 190, 0x03, 0x00, 0x45, 0x45, 0x1f, 0x00, 0x1f, 0x1f, 0x00, 0x00, 0x32}, + /* end of list */ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; /* XXX: DVDs that could fit on a CD will be reported as a CD */ @@ -1843,6 +1856,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) break; case WIN_CHECKPOWERMODE1: case WIN_CHECKPOWERMODE2: + s->error = 0; s->nsector = 0xff; /* device active or idle */ s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); @@ -2097,7 +2111,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) if (smart_attributes[n][0] == 0) break; s->io_buffer[2+0+(n*12)] = smart_attributes[n][0]; - s->io_buffer[2+1+(n*12)] = smart_attributes[n][4]; + s->io_buffer[2+1+(n*12)] = smart_attributes[n][11]; } for (n=0; n<511; n++) /* checksum */ s->io_buffer[511] += s->io_buffer[n]; @@ -2110,12 +2124,13 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) memset(s->io_buffer, 0, 0x200); s->io_buffer[0] = 0x01; /* smart struct version */ for (n=0; n<30; n++) { - if (smart_attributes[n][0] == 0) + if (smart_attributes[n][0] == 0) { break; - s->io_buffer[2+0+(n*12)] = smart_attributes[n][0]; - s->io_buffer[2+1+(n*12)] = smart_attributes[n][1]; - s->io_buffer[2+3+(n*12)] = smart_attributes[n][2]; - s->io_buffer[2+4+(n*12)] = smart_attributes[n][3]; + } + int i; + for(i = 0; i < 11; i++) { + s->io_buffer[2+i+(n*12)] = smart_attributes[n][i]; + } } s->io_buffer[362] = 0x02 | (s->smart_autosave?0x80:0x00); if (s->smart_selftest_count == 0) { From 52f9a172b6db89ba1f4389883be805d65dd3ca8c Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 9 Mar 2011 11:20:30 +0100 Subject: [PATCH 7/9] Improve error handling in do_snapshot_blkdev() In case we cannot open the newly created snapshot image, try to fall back to the original image file and continue running on that, which should prevent the guest from aborting. This is a corner case which can happen if the admin by mistake specifies the snapshot file on a virtual file system which does not support O_DIRECT. bdrv_create() does not use O_DIRECT, but the following open in bdrv_open() does and will then fail. Signed-off-by: Jes Sorensen Signed-off-by: Kevin Wolf --- blockdev.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/blockdev.c b/blockdev.c index 0690cc8bea..ecf2252d83 100644 --- a/blockdev.c +++ b/blockdev.c @@ -574,9 +574,10 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) const char *filename = qdict_get_try_str(qdict, "snapshot_file"); const char *format = qdict_get_try_str(qdict, "format"); BlockDriverState *bs; - BlockDriver *drv, *proto_drv; + BlockDriver *drv, *old_drv, *proto_drv; int ret = 0; int flags; + char old_filename[1024]; if (!filename) { qerror_report(QERR_MISSING_PARAMETER, "snapshot_file"); @@ -591,6 +592,11 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) goto out; } + pstrcpy(old_filename, sizeof(old_filename), bs->filename); + + old_drv = bs->drv; + flags = bs->open_flags; + if (!format) { format = "qcow2"; } @@ -610,7 +616,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) } ret = bdrv_img_create(filename, format, bs->filename, - bs->drv->format_name, NULL, -1, bs->open_flags); + bs->drv->format_name, NULL, -1, flags); if (ret) { goto out; } @@ -618,15 +624,20 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) qemu_aio_flush(); bdrv_flush(bs); - flags = bs->open_flags; bdrv_close(bs); ret = bdrv_open(bs, filename, flags, drv); /* - * If reopening the image file we just created fails, we really - * are in trouble :( + * If reopening the image file we just created fails, fall back + * and try to re-open the original image. If that fails too, we + * are in serious trouble. */ if (ret != 0) { - abort(); + ret = bdrv_open(bs, old_filename, flags, old_drv); + if (ret != 0) { + qerror_report(QERR_OPEN_FILE_FAILED, old_filename); + } else { + qerror_report(QERR_OPEN_FILE_FAILED, filename); + } } out: if (ret) { From 209bef3e014ba1613759575e2c10f0ef8d64eb84 Mon Sep 17 00:00:00 2001 From: Feiran Zheng Date: Wed, 9 Mar 2011 21:19:35 +0800 Subject: [PATCH 8/9] hw/xen_disk: aio_inflight not released in handling ioreq when nr_segments==0 In hw/xen_disk.c, async writing ioreq is leaked when ioreq->req.nr_segments==0, because `aio_inflight` flag is not released properly (skipped by misplaced "break"). Signed-off-by: Feiran Zheng Acked-by: Stefano Stabellini Signed-off-by: Kevin Wolf --- hw/xen_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/xen_disk.c b/hw/xen_disk.c index ed9e5eb4d7..445bf03aa0 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -408,9 +408,9 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) break; case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: - ioreq->aio_inflight++; if (!ioreq->req.nr_segments) break; + ioreq->aio_inflight++; bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, qemu_aio_complete, ioreq); From 03feae73056ba3223151c31871860e30630645ac Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Feb 2011 17:49:46 +0100 Subject: [PATCH 9/9] Add qcow2 documentation This adds a description of the qcow2 file format to the docs/ directory. Besides documenting what's there, which is never wrong, the document should provide a good basis for the discussion of format extensions (called "qcow3" in previous discussions) Signed-off-by: Kevin Wolf Reviewed-by: Stefan Hajnoczi --- docs/specs/qcow2.txt | 260 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 docs/specs/qcow2.txt diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt new file mode 100644 index 0000000000..8fc3cb2f1a --- /dev/null +++ b/docs/specs/qcow2.txt @@ -0,0 +1,260 @@ +== General == + +A qcow2 image file is organized in units of constant size, which are called +(host) clusters. A cluster is the unit in which all allocations are done, +both for actual guest data and for image metadata. + +Likewise, the virtual disk as seen by the guest is divided into (guest) +clusters of the same size. + +All numbers in qcow2 are stored in Big Endian byte order. + + +== Header == + +The first cluster of a qcow2 image contains the file header: + + Byte 0 - 3: magic + QCOW magic string ("QFI\xfb") + + 4 - 7: version + Version number (only valid value is 2) + + 8 - 15: backing_file_offset + Offset into the image file at which the backing file name + is stored (NB: The string is not null terminated). 0 if the + image doesn't have a backing file. + + 16 - 19: backing_file_size + Length of the backing file name in bytes. Must not be + longer than 1023 bytes. Undefined if the image doesn't have + a backing file. + + 20 - 23: cluster_bits + Number of bits that are used for addressing an offset + within a cluster (1 << cluster_bits is the cluster size). + Must not be less than 9 (i.e. 512 byte clusters). + + Note: qemu as of today has an implementation limit of 2 MB + as the maximum cluster size and won't be able to open images + with larger cluster sizes. + + 24 - 31: size + Virtual disk size in bytes + + 32 - 35: crypt_method + 0 for no encryption + 1 for AES encryption + + 36 - 39: l1_size + Number of entries in the active L1 table + + 40 - 47: l1_table_offset + Offset into the image file at which the active L1 table + starts. Must be aligned to a cluster boundary. + + 48 - 55: refcount_table_offset + Offset into the image file at which the refcount table + starts. Must be aligned to a cluster boundary. + + 56 - 59: refcount_table_clusters + Number of clusters that the refcount table occupies + + 60 - 63: nb_snapshots + Number of snapshots contained in the image + + 64 - 71: snapshots_offset + Offset into the image file at which the snapshot table + starts. Must be aligned to a cluster boundary. + +Directly after the image header, optional sections called header extensions can +be stored. Each extension has a structure like the following: + + Byte 0 - 3: Header extension type: + 0x00000000 - End of the header extension area + 0xE2792ACA - Backing file format name + other - Unknown header extension, can be safely + ignored + + 4 - 7: Length of the header extension data + + 8 - n: Header extension data + + n - m: Padding to round up the header extension size to the next + multiple of 8. + +The remaining space between the end of the header extension area and the end of +the first cluster can be used for other data. Usually, the backing file name is +stored there. + + +== Host cluster management == + +qcow2 manages the allocation of host clusters by maintaining a reference count +for each host cluster. A refcount of 0 means that the cluster is free, 1 means +that it is used, and >= 2 means that it is used and any write access must +perform a COW (copy on write) operation. + +The refcounts are managed in a two-level table. The first level is called +refcount table and has a variable size (which is stored in the header). The +refcount table can cover multiple clusters, however it needs to be contiguous +in the image file. + +It contains pointers to the second level structures which are called refcount +blocks and are exactly one cluster in size. + +Given a offset into the image file, the refcount of its cluster can be obtained +as follows: + + refcount_block_entries = (cluster_size / sizeof(uint16_t)) + + refcount_block_index = (offset / cluster_size) % refcount_table_entries + refcount_table_index = (offset / cluster_size) / refcount_table_entries + + refcount_block = load_cluster(refcount_table[refcount_table_index]); + return refcount_block[refcount_block_index]; + +Refcount table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 63: Bits 9-63 of the offset into the image file at which the + refcount block starts. Must be aligned to a cluster + boundary. + + If this is 0, the corresponding refcount block has not yet + been allocated. All refcounts managed by this refcount block + are 0. + +Refcount block entry: + + Bit 0 - 15: Reference count of the cluster + + +== Cluster mapping == + +Just as for refcounts, qcow2 uses a two-level structure for the mapping of +guest clusters to host clusters. They are called L1 and L2 table. + +The L1 table has a variable size (stored in the header) and may use multiple +clusters, however it must be contiguous in the image file. L2 tables are +exactly one cluster in size. + +Given a offset into the virtual disk, the offset into the image file can be +obtained as follows: + + l2_entries = (cluster_size / sizeof(uint64_t)) + + l2_index = (offset / cluster_size) % l2_entries + l1_index = (offset / cluster_size) / l2_entries + + l2_table = load_cluster(l1_table[l1_index]); + cluster_offset = l2_table[l2_index]; + + return cluster_offset + (offset % cluster_size) + +L1 table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of the offset into the image file at which the L2 + table starts. Must be aligned to a cluster boundary. If the + offset is 0, the L2 table and all clusters described by this + L2 table are unallocated. + + 56 - 62: Reserved (set to 0) + + 63: 0 for an L2 table that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in the active L1 table. + +L2 table entry (for normal clusters): + + Bit 0 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of host cluster offset. Must be aligned to a + cluster boundary. If the offset is 0, the cluster is + unallocated. + + 56 - 61: Reserved (set to 0) + + 62: 0 (this cluster is not compressed) + + 63: 0 for a cluster that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in L2 tables that are reachable from the the active L1 + table. + +L2 table entry (for compressed clusters; x = 62 - (cluster_size - 8)): + + Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a + cluster boundary! + + x+1 - 61: Compressed size of the images in sectors of 512 bytes + + 62: 1 (this cluster is compressed using zlib) + + 63: 0 for a cluster that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in L2 tables that are reachable from the the active L1 + table. + +If a cluster is unallocated, read requests shall read the data from the backing +file. If there is no backing file or the backing file is smaller than the image, +they shall read zeros for all parts that are not covered by the backing file. + + +== Snapshots == + +qcow2 supports internal snapshots. Their basic principle of operation is to +switch the active L1 table, so that a different set of host clusters are +exposed to the guest. + +When creating a snapshot, the L1 table should be copied and the refcount of all +L2 tables and clusters reachable form this L1 table must be increased, so that +a write causes a COW and isn't visible in other snapshots. + +When loading a snapshot, bit 63 of all entries in the new active L1 table and +all L2 tables referenced by it must be reconstructed from the refcount table +as it doesn't need to be accurate in inactive L1 tables. + +A directory of all snapshots is stored in the snapshot table, a contiguous area +in the image file, whose starting offset and length are given by the header +fields snapshots_offset and nb_snapshots. The entries of the snapshot table +have variable length, depending on the length of ID, name and extra data. + +Snapshot table entry: + + Byte 0 - 7: Offset into the image file at which the L1 table for the + snapshot starts. Must be aligned to a cluster boundary. + + 8 - 11: Number of entries in the L1 table of the snapshots + + 12 - 13: Length of the unique ID string describing the snapshot + + 14 - 15: Length of the name of the snapshot + + 16 - 19: Time at which the snapshot was taken in seconds since the + Epoch + + 20 - 23: Subsecond part of the time at which the snapshot was taken + in nanoseconds + + 24 - 31: Time that the guest was running until the snapshot was + taken in nanoseconds + + 32 - 35: Size of the VM state in bytes. 0 if no VM state is saved. + If there is VM state, it starts at the first cluster + described by first L1 table entry that doesn't describe a + regular guest cluster (i.e. VM state is stored like guest + disk content, except that it is stored at offsets that are + larger than the virtual disk presented to the guest) + + 36 - 39: Size of extra data in the table entry (used for future + extensions of the format) + + variable: Extra data for future extensions. Must be ignored. + + variable: Unique ID string for the snapshot (not null terminated) + + variable: Name of the snapshot (not null terminated)