From 40a892b78c9a620bc5ace4164dd566764c35cb53 Mon Sep 17 00:00:00 2001
From: Stefan Weil <weil@mail.berlios.de>
Date: Sat, 19 Feb 2011 22:18:11 +0100
Subject: [PATCH 1/9] block/vdi: Don't ignore immediate read/write failures

This patch is similar to 171e3d6b9997c98a97d0c525867f7cd9b640cadd
which fixed qcow2:

Returning -EIO is far from optimal, but at least it's an error code.

Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Weil <weil@mail.berlios.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/vdi.c b/block/vdi.c
index 116b25bc9b..90540792d3 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -610,6 +610,7 @@ static void vdi_aio_read_cb(void *opaque, int ret)
         acb->hd_aiocb = bdrv_aio_readv(bs->file, offset, &acb->hd_qiov,
                                        n_sectors, vdi_aio_read_cb, acb);
         if (acb->hd_aiocb == NULL) {
+            ret = -EIO;
             goto done;
         }
     }
@@ -673,6 +674,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
             acb->hd_aiocb = bdrv_aio_writev(bs->file, 0, &acb->hd_qiov, 1,
                                             vdi_aio_write_cb, acb);
             if (acb->hd_aiocb == NULL) {
+                ret = -EIO;
                 goto done;
             }
             return;
@@ -702,6 +704,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
             acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
                                             n_sectors, vdi_aio_write_cb, acb);
             if (acb->hd_aiocb == NULL) {
+                ret = -EIO;
                 goto done;
             }
             return;
@@ -752,6 +755,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
                                         &acb->hd_qiov, s->block_sectors,
                                         vdi_aio_write_cb, acb);
         if (acb->hd_aiocb == NULL) {
+            ret = -EIO;
             goto done;
         }
     } else {
@@ -764,6 +768,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
         acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
                                         n_sectors, vdi_aio_write_cb, acb);
         if (acb->hd_aiocb == NULL) {
+            ret = -EIO;
             goto done;
         }
     }

From 5614c188c65a8194ae499cb16400cab690a45299 Mon Sep 17 00:00:00 2001
From: Stefan Weil <weil@mail.berlios.de>
Date: Sat, 19 Feb 2011 22:18:12 +0100
Subject: [PATCH 2/9] block/qcow: Don't ignore immediate read/write and other
 failures

This patch is similar to 171e3d6b9997c98a97d0c525867f7cd9b640cadd
which fixed qcow2:

Returning -EIO is far from optimal, but at least it's an error code.

In addition to read/write failures, -EIO is also returned when
decompress_cluster failed.

Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Weil <weil@mail.berlios.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index f67d3d39f2..a26c88620f 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -589,8 +589,10 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
             acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
                 &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-            if (acb->hd_aiocb == NULL)
+            if (acb->hd_aiocb == NULL) {
+                ret = -EIO;
                 goto done;
+            }
         } else {
             /* Note: in this case, no need to wait */
             memset(acb->buf, 0, 512 * acb->n);
@@ -598,8 +600,10 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         }
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
-        if (decompress_cluster(bs, acb->cluster_offset) < 0)
+        if (decompress_cluster(bs, acb->cluster_offset) < 0) {
+            ret = -EIO;
             goto done;
+        }
         memcpy(acb->buf,
                s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
         goto redo;
@@ -614,8 +618,10 @@ static void qcow_aio_read_cb(void *opaque, int ret)
         acb->hd_aiocb = bdrv_aio_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
                             &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-        if (acb->hd_aiocb == NULL)
+        if (acb->hd_aiocb == NULL) {
+            ret = -EIO;
             goto done;
+        }
     }
 
     return;
@@ -700,8 +706,10 @@ static void qcow_aio_write_cb(void *opaque, int ret)
                                     (cluster_offset >> 9) + index_in_cluster,
                                     &acb->hd_qiov, acb->n,
                                     qcow_aio_write_cb, acb);
-    if (acb->hd_aiocb == NULL)
+    if (acb->hd_aiocb == NULL) {
+        ret = -EIO;
         goto done;
+    }
     return;
 
 done:

From e11480db7ff15a9e878f6b3cc1199b439bf7c825 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 1 Mar 2011 10:48:12 +0100
Subject: [PATCH 3/9] Add error message for loading snapshot without VM state

It already fails, but it didn't tell the user why.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
---
 savevm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/savevm.c b/savevm.c
index 60d2f2a547..d1b9b4a413 100644
--- a/savevm.c
+++ b/savevm.c
@@ -2021,6 +2021,8 @@ int load_vmstate(const char *name)
     if (ret < 0) {
         return ret;
     } else if (sn.vm_state_size == 0) {
+        error_report("This is a disk-only snapshot. Revert to it offline "
+            "using qemu-img.");
         return -EINVAL;
     }
 

From 4e59b545868a5ee5f59b346337f0c44209929334 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 22 Feb 2011 18:42:31 +0100
Subject: [PATCH 4/9] tools: Use real async.c instead of stubs

It's wrong to call BHs directly, even in tools. The only operations that
schedule BHs are called in a loop that (indirectly) contains a call to
qemu_bh_poll anyway, so we're not losing the scheduled BHs: Tools either use
synchronous functions, which are guaranteed to have completed (including any
BHs) when they return; or if they use asynchronous functions, they need to call
qemu_aio_wait() or similar functions already today.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs |  4 ++--
 qemu-tool.c   | 47 ++++-------------------------------------------
 2 files changed, 6 insertions(+), 45 deletions(-)

diff --git a/Makefile.objs b/Makefile.objs
index a52f42fb72..167ccc2c6c 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -13,7 +13,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
-block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
+block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o
 block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
@@ -63,7 +63,7 @@ common-obj-y = $(block-obj-y) blockdev.o
 common-obj-y += $(net-obj-y)
 common-obj-y += $(qobject-obj-y)
 common-obj-$(CONFIG_LINUX) += $(fsdev-obj-$(CONFIG_LINUX))
-common-obj-y += readline.o console.o cursor.o async.o qemu-error.o
+common-obj-y += readline.o console.o cursor.o qemu-error.o
 common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
diff --git a/qemu-tool.c b/qemu-tool.c
index 392e1c9505..d45840de28 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -56,53 +56,10 @@ void monitor_print_filename(Monitor *mon, const char *filename)
 {
 }
 
-void async_context_push(void)
-{
-}
-
-void async_context_pop(void)
-{
-}
-
-int get_async_context_id(void)
-{
-    return 0;
-}
-
 void monitor_protocol_event(MonitorEvent event, QObject *data)
 {
 }
 
-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
-{
-    QEMUBH *bh;
-
-    bh = qemu_malloc(sizeof(*bh));
-    bh->cb = cb;
-    bh->opaque = opaque;
-
-    return bh;
-}
-
-int qemu_bh_poll(void)
-{
-    return 0;
-}
-
-void qemu_bh_schedule(QEMUBH *bh)
-{
-    bh->cb(bh->opaque);
-}
-
-void qemu_bh_cancel(QEMUBH *bh)
-{
-}
-
-void qemu_bh_delete(QEMUBH *bh)
-{
-    qemu_free(bh);
-}
-
 int qemu_set_fd_handler2(int fd,
                          IOCanReadHandler *fd_read_poll,
                          IOHandler *fd_read,
@@ -111,3 +68,7 @@ int qemu_set_fd_handler2(int fd,
 {
     return 0;
 }
+
+void qemu_notify_event(void)
+{
+}

From 301db7c2dd769d48e97c9a766520f8affff76cd7 Mon Sep 17 00:00:00 2001
From: Ryan Harper <ryanh@us.ibm.com>
Date: Mon, 7 Mar 2011 10:01:04 -0600
Subject: [PATCH 5/9] Don't allow multiwrites against a block device without
 underlying medium

If the block device has been closed, we no longer have a medium to submit
IO against, check for this before submitting io.  This prevents a segfault
further in the code where we dereference elements of the block driver.

Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/block.c b/block.c
index 0559d835a2..c8e2f97614 100644
--- a/block.c
+++ b/block.c
@@ -2398,6 +2398,14 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
     MultiwriteCB *mcb;
     int i;
 
+    /* don't submit writes if we don't have a medium */
+    if (bs->drv == NULL) {
+        for (i = 0; i < num_reqs; i++) {
+            reqs[i].error = -ENOMEDIUM;
+        }
+        return -1;
+    }
+
     if (num_reqs == 0) {
         return 0;
     }

From b93af93d2bbacf034af1888c96291850aa849b59 Mon Sep 17 00:00:00 2001
From: Brian Wheeler <bdwheele@indiana.edu>
Date: Tue, 1 Mar 2011 08:30:23 -0500
Subject: [PATCH 6/9] Fix ATA SMART and CHECK POWER MODE

This patch fixes two things:

 1) CHECK POWER MODE

The error return value wasn't always zero, so it would show up as
offline.  Error is now explicitly set to zero.

 2) SMART

The smart values that were returned were invalid and tools like skdump
would not recognize that the smart data was actually valid and would
dump weird output.  The data has been fixed up and raw value support
was added.  Tools like skdump and palimpsest work as expected.

Signed-off-by: Brian Wheeler <bdwheele@indiana.edu>
Acked-by: Ryan Harper <ryanh@us.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 9c91a49767..1ffca56887 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -34,13 +34,26 @@
 
 #include <hw/ide/internal.h>
 
-static const int smart_attributes[][5] = {
-    /* id,  flags, val, wrst, thrsh */
-    { 0x01, 0x03, 0x64, 0x64, 0x06}, /* raw read */
-    { 0x03, 0x03, 0x64, 0x64, 0x46}, /* spin up */
-    { 0x04, 0x02, 0x64, 0x64, 0x14}, /* start stop count */
-    { 0x05, 0x03, 0x64, 0x64, 0x36}, /* remapped sectors */
-    { 0x00, 0x00, 0x00, 0x00, 0x00}
+/* These values were based on a Seagate ST3500418AS but have been modified
+   to make more sense in QEMU */
+static const int smart_attributes[][12] = {
+    /* id,  flags, hflags, val, wrst, raw (6 bytes), threshold */
+    /* raw read error rate*/
+    { 0x01, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06},
+    /* spin up */
+    { 0x03, 0x03, 0x00, 0x64, 0x64, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    /* start stop count */
+    { 0x04, 0x02, 0x00, 0x64, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14},
+    /* remapped sectors */
+    { 0x05, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24},
+    /* power on hours */
+    { 0x09, 0x03, 0x00, 0x64, 0x64, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    /* power cycle count */
+    { 0x0c, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    /* airflow-temperature-celsius */
+    { 190,  0x03, 0x00, 0x45, 0x45, 0x1f, 0x00, 0x1f, 0x1f, 0x00, 0x00, 0x32},
+    /* end of list */
+    { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
 };
 
 /* XXX: DVDs that could fit on a CD will be reported as a CD */
@@ -1843,6 +1856,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
         break;
     case WIN_CHECKPOWERMODE1:
     case WIN_CHECKPOWERMODE2:
+        s->error = 0;
         s->nsector = 0xff; /* device active or idle */
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s->bus);
@@ -2097,7 +2111,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
 		if (smart_attributes[n][0] == 0)
 			break;
 		s->io_buffer[2+0+(n*12)] = smart_attributes[n][0];
-		s->io_buffer[2+1+(n*12)] = smart_attributes[n][4];
+		s->io_buffer[2+1+(n*12)] = smart_attributes[n][11];
 		}
 		for (n=0; n<511; n++) /* checksum */
 		s->io_buffer[511] += s->io_buffer[n];
@@ -2110,12 +2124,13 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val)
 		memset(s->io_buffer, 0, 0x200);
 		s->io_buffer[0] = 0x01; /* smart struct version */
 		for (n=0; n<30; n++) {
-		if (smart_attributes[n][0] == 0)
+		    if (smart_attributes[n][0] == 0) {
 			break;
-		s->io_buffer[2+0+(n*12)] = smart_attributes[n][0];
-		s->io_buffer[2+1+(n*12)] = smart_attributes[n][1];
-		s->io_buffer[2+3+(n*12)] = smart_attributes[n][2];
-		s->io_buffer[2+4+(n*12)] = smart_attributes[n][3];
+		    }
+		    int i;
+		    for(i = 0; i < 11; i++) {
+			s->io_buffer[2+i+(n*12)] = smart_attributes[n][i];
+		    }
 		}
 		s->io_buffer[362] = 0x02 | (s->smart_autosave?0x80:0x00);
 		if (s->smart_selftest_count == 0) {

From 52f9a172b6db89ba1f4389883be805d65dd3ca8c Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Wed, 9 Mar 2011 11:20:30 +0100
Subject: [PATCH 7/9] Improve error handling in do_snapshot_blkdev()

In case we cannot open the newly created snapshot image, try to fall
back to the original image file and continue running on that, which
should prevent the guest from aborting.

This is a corner case which can happen if the admin by mistake
specifies the snapshot file on a virtual file system which does not
support O_DIRECT. bdrv_create() does not use O_DIRECT, but the
following open in bdrv_open() does and will then fail.

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 0690cc8bea..ecf2252d83 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -574,9 +574,10 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     const char *filename = qdict_get_try_str(qdict, "snapshot_file");
     const char *format = qdict_get_try_str(qdict, "format");
     BlockDriverState *bs;
-    BlockDriver *drv, *proto_drv;
+    BlockDriver *drv, *old_drv, *proto_drv;
     int ret = 0;
     int flags;
+    char old_filename[1024];
 
     if (!filename) {
         qerror_report(QERR_MISSING_PARAMETER, "snapshot_file");
@@ -591,6 +592,11 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
         goto out;
     }
 
+    pstrcpy(old_filename, sizeof(old_filename), bs->filename);
+
+    old_drv = bs->drv;
+    flags = bs->open_flags;
+
     if (!format) {
         format = "qcow2";
     }
@@ -610,7 +616,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     }
 
     ret = bdrv_img_create(filename, format, bs->filename,
-                          bs->drv->format_name, NULL, -1, bs->open_flags);
+                          bs->drv->format_name, NULL, -1, flags);
     if (ret) {
         goto out;
     }
@@ -618,15 +624,20 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data)
     qemu_aio_flush();
     bdrv_flush(bs);
 
-    flags = bs->open_flags;
     bdrv_close(bs);
     ret = bdrv_open(bs, filename, flags, drv);
     /*
-     * If reopening the image file we just created fails, we really
-     * are in trouble :(
+     * If reopening the image file we just created fails, fall back
+     * and try to re-open the original image. If that fails too, we
+     * are in serious trouble.
      */
     if (ret != 0) {
-        abort();
+        ret = bdrv_open(bs, old_filename, flags, old_drv);
+        if (ret != 0) {
+            qerror_report(QERR_OPEN_FILE_FAILED, old_filename);
+        } else {
+            qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        }
     }
 out:
     if (ret) {

From 209bef3e014ba1613759575e2c10f0ef8d64eb84 Mon Sep 17 00:00:00 2001
From: Feiran Zheng <famcool@gmail.com>
Date: Wed, 9 Mar 2011 21:19:35 +0800
Subject: [PATCH 8/9] hw/xen_disk: aio_inflight not released in handling ioreq
 when nr_segments==0

In hw/xen_disk.c, async writing ioreq is leaked when
ioreq->req.nr_segments==0, because `aio_inflight` flag is not released
properly (skipped by misplaced "break").

Signed-off-by: Feiran Zheng <famcool@gmail.com>
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/xen_disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index ed9e5eb4d7..445bf03aa0 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -408,9 +408,9 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
 	break;
     case BLKIF_OP_WRITE:
     case BLKIF_OP_WRITE_BARRIER:
-        ioreq->aio_inflight++;
         if (!ioreq->req.nr_segments)
             break;
+        ioreq->aio_inflight++;
         bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                         &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                         qemu_aio_complete, ioreq);

From 03feae73056ba3223151c31871860e30630645ac Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 14 Feb 2011 17:49:46 +0100
Subject: [PATCH 9/9] Add qcow2 documentation

This adds a description of the qcow2 file format to the docs/ directory.
Besides documenting what's there, which is never wrong, the document should
provide a good basis for the discussion of format extensions (called "qcow3"
in previous discussions)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 docs/specs/qcow2.txt | 260 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 docs/specs/qcow2.txt

diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
new file mode 100644
index 0000000000..8fc3cb2f1a
--- /dev/null
+++ b/docs/specs/qcow2.txt
@@ -0,0 +1,260 @@
+== General ==
+
+A qcow2 image file is organized in units of constant size, which are called
+(host) clusters. A cluster is the unit in which all allocations are done,
+both for actual guest data and for image metadata.
+
+Likewise, the virtual disk as seen by the guest is divided into (guest)
+clusters of the same size.
+
+All numbers in qcow2 are stored in Big Endian byte order.
+
+
+== Header ==
+
+The first cluster of a qcow2 image contains the file header:
+
+    Byte  0 -  3:   magic
+                    QCOW magic string ("QFI\xfb")
+
+          4 -  7:   version
+                    Version number (only valid value is 2)
+
+          8 - 15:   backing_file_offset
+                    Offset into the image file at which the backing file name
+                    is stored (NB: The string is not null terminated). 0 if the
+                    image doesn't have a backing file.
+
+         16 - 19:   backing_file_size
+                    Length of the backing file name in bytes. Must not be
+                    longer than 1023 bytes. Undefined if the image doesn't have
+                    a backing file.
+
+         20 - 23:   cluster_bits
+                    Number of bits that are used for addressing an offset
+                    within a cluster (1 << cluster_bits is the cluster size).
+                    Must not be less than 9 (i.e. 512 byte clusters).
+
+                    Note: qemu as of today has an implementation limit of 2 MB
+                    as the maximum cluster size and won't be able to open images
+                    with larger cluster sizes.
+
+         24 - 31:   size
+                    Virtual disk size in bytes
+
+         32 - 35:   crypt_method
+                    0 for no encryption
+                    1 for AES encryption
+
+         36 - 39:   l1_size
+                    Number of entries in the active L1 table
+
+         40 - 47:   l1_table_offset
+                    Offset into the image file at which the active L1 table
+                    starts. Must be aligned to a cluster boundary.
+
+         48 - 55:   refcount_table_offset
+                    Offset into the image file at which the refcount table
+                    starts. Must be aligned to a cluster boundary.
+
+         56 - 59:   refcount_table_clusters
+                    Number of clusters that the refcount table occupies
+
+         60 - 63:   nb_snapshots
+                    Number of snapshots contained in the image
+
+         64 - 71:   snapshots_offset
+                    Offset into the image file at which the snapshot table
+                    starts. Must be aligned to a cluster boundary.
+
+Directly after the image header, optional sections called header extensions can
+be stored. Each extension has a structure like the following:
+
+    Byte  0 -  3:   Header extension type:
+                        0x00000000 - End of the header extension area
+                        0xE2792ACA - Backing file format name
+                        other      - Unknown header extension, can be safely
+                                     ignored
+
+          4 -  7:   Length of the header extension data
+
+          8 -  n:   Header extension data
+
+          n -  m:   Padding to round up the header extension size to the next
+                    multiple of 8.
+
+The remaining space between the end of the header extension area and the end of
+the first cluster can be used for other data. Usually, the backing file name is
+stored there.
+
+
+== Host cluster management ==
+
+qcow2 manages the allocation of host clusters by maintaining a reference count
+for each host cluster. A refcount of 0 means that the cluster is free, 1 means
+that it is used, and >= 2 means that it is used and any write access must
+perform a COW (copy on write) operation.
+
+The refcounts are managed in a two-level table. The first level is called
+refcount table and has a variable size (which is stored in the header). The
+refcount table can cover multiple clusters, however it needs to be contiguous
+in the image file.
+
+It contains pointers to the second level structures which are called refcount
+blocks and are exactly one cluster in size.
+
+Given a offset into the image file, the refcount of its cluster can be obtained
+as follows:
+
+    refcount_block_entries = (cluster_size / sizeof(uint16_t))
+
+    refcount_block_index = (offset / cluster_size) % refcount_table_entries
+    refcount_table_index = (offset / cluster_size) / refcount_table_entries
+
+    refcount_block = load_cluster(refcount_table[refcount_table_index]);
+    return refcount_block[refcount_block_index];
+
+Refcount table entry:
+
+    Bit  0 -  8:    Reserved (set to 0)
+
+         9 - 63:    Bits 9-63 of the offset into the image file at which the
+                    refcount block starts. Must be aligned to a cluster
+                    boundary.
+
+                    If this is 0, the corresponding refcount block has not yet
+                    been allocated. All refcounts managed by this refcount block
+                    are 0.
+
+Refcount block entry:
+
+    Bit  0 - 15:    Reference count of the cluster
+
+
+== Cluster mapping ==
+
+Just as for refcounts, qcow2 uses a two-level structure for the mapping of
+guest clusters to host clusters. They are called L1 and L2 table.
+
+The L1 table has a variable size (stored in the header) and may use multiple
+clusters, however it must be contiguous in the image file. L2 tables are
+exactly one cluster in size.
+
+Given a offset into the virtual disk, the offset into the image file can be
+obtained as follows:
+
+    l2_entries = (cluster_size / sizeof(uint64_t))
+
+    l2_index = (offset / cluster_size) % l2_entries
+    l1_index = (offset / cluster_size) / l2_entries
+
+    l2_table = load_cluster(l1_table[l1_index]);
+    cluster_offset = l2_table[l2_index];
+
+    return cluster_offset + (offset % cluster_size)
+
+L1 table entry:
+
+    Bit  0 -  8:    Reserved (set to 0)
+
+         9 - 55:    Bits 9-55 of the offset into the image file at which the L2
+                    table starts. Must be aligned to a cluster boundary. If the
+                    offset is 0, the L2 table and all clusters described by this
+                    L2 table are unallocated.
+
+        56 - 62:    Reserved (set to 0)
+
+             63:    0 for an L2 table that is unused or requires COW, 1 if its
+                    refcount is exactly one. This information is only accurate
+                    in the active L1 table.
+
+L2 table entry (for normal clusters):
+
+    Bit  0 -  8:    Reserved (set to 0)
+
+         9 - 55:    Bits 9-55 of host cluster offset. Must be aligned to a
+                    cluster boundary. If the offset is 0, the cluster is
+                    unallocated.
+
+        56 - 61:    Reserved (set to 0)
+
+             62:    0 (this cluster is not compressed)
+
+             63:    0 for a cluster that is unused or requires COW, 1 if its
+                    refcount is exactly one. This information is only accurate
+                    in L2 tables that are reachable from the the active L1
+                    table.
+
+L2 table entry (for compressed clusters; x = 62 - (cluster_size - 8)):
+
+    Bit  0 -  x:    Host cluster offset. This is usually _not_ aligned to a
+                    cluster boundary!
+
+       x+1 - 61:    Compressed size of the images in sectors of 512 bytes
+
+             62:    1 (this cluster is compressed using zlib)
+
+             63:    0 for a cluster that is unused or requires COW, 1 if its
+                    refcount is exactly one. This information is only accurate
+                    in L2 tables that are reachable from the the active L1
+                    table.
+
+If a cluster is unallocated, read requests shall read the data from the backing
+file. If there is no backing file or the backing file is smaller than the image,
+they shall read zeros for all parts that are not covered by the backing file.
+
+
+== Snapshots ==
+
+qcow2 supports internal snapshots. Their basic principle of operation is to
+switch the active L1 table, so that a different set of host clusters are
+exposed to the guest.
+
+When creating a snapshot, the L1 table should be copied and the refcount of all
+L2 tables and clusters reachable form this L1 table must be increased, so that
+a write causes a COW and isn't visible in other snapshots.
+
+When loading a snapshot, bit 63 of all entries in the new active L1 table and
+all L2 tables referenced by it must be reconstructed from the refcount table
+as it doesn't need to be accurate in inactive L1 tables.
+
+A directory of all snapshots is stored in the snapshot table, a contiguous area
+in the image file, whose starting offset and length are given by the header
+fields snapshots_offset and nb_snapshots. The entries of the snapshot table
+have variable length, depending on the length of ID, name and extra data.
+
+Snapshot table entry:
+
+    Byte 0 -  7:    Offset into the image file at which the L1 table for the
+                    snapshot starts. Must be aligned to a cluster boundary.
+
+         8 - 11:    Number of entries in the L1 table of the snapshots
+
+        12 - 13:    Length of the unique ID string describing the snapshot
+
+        14 - 15:    Length of the name of the snapshot
+
+        16 - 19:    Time at which the snapshot was taken in seconds since the
+                    Epoch
+
+        20 - 23:    Subsecond part of the time at which the snapshot was taken
+                    in nanoseconds
+
+        24 - 31:    Time that the guest was running until the snapshot was
+                    taken in nanoseconds
+
+        32 - 35:    Size of the VM state in bytes. 0 if no VM state is saved.
+                    If there is VM state, it starts at the first cluster
+                    described by first L1 table entry that doesn't describe a
+                    regular guest cluster (i.e. VM state is stored like guest
+                    disk content, except that it is stored at offsets that are
+                    larger than the virtual disk presented to the guest)
+
+        36 - 39:    Size of extra data in the table entry (used for future
+                    extensions of the format)
+
+        variable:   Extra data for future extensions. Must be ignored.
+
+        variable:   Unique ID string for the snapshot (not null terminated)
+
+        variable:   Name of the snapshot (not null terminated)